## Individual Assignment 1 Task 2

### Name: Calaunan Alexander Jr Sumampong

### UOW ID: 7559161

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd

## IMPORT DATA

In [2]:
# Load the training and testing CSV files into separate dataframes
train_path = 'customer_churn_dataset-training-master.csv'
test_path = 'customer_churn_dataset-testing-master.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Stack both splits into one dataframe with a fresh 0..n-1 index
frames = [train_df, test_df]
df = pd.concat(frames, axis=0, ignore_index=True)

# Summarise columns, dtypes and non-null counts
df.info()

FileNotFoundError: [Errno 2] No such file or directory: 'customer_churn_dataset-training-master.csv'

### See head of the dataframe

In [None]:
# Preview the first five rows of the combined dataframe
df.head(n=5)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [None]:
# Count how many rows fall into each class of the target (Y) variable
churn_counts = df['Churn'].value_counts()
churn_counts

Churn
1.0    280492
0.0    224714
Name: count, dtype: int64

### Some information:
- Target Variable is <u>**binary**</u>
- 1 row of null values
- Some columns are object type
    - Gender, Subscription Type, Contract Length are nominal data
    - One-Hot-Encoding required

## PREPROCESS THE DATA

### Drop the row containing null values

In [None]:
# Remove every row that contains a null value (a single row in this dataset)
df = df.dropna()

# With the nulls gone, cast all float64 columns back to int64 in one vectorised step
float_columns = df.select_dtypes(include='float64').columns
df = df.astype({column: 'int64' for column in float_columns})

# Confirm that no missing values remain and that the dtypes are as expected
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505206 entries, 0 to 505206
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   CustomerID         505206 non-null  int64 
 1   Age                505206 non-null  int64 
 2   Gender             505206 non-null  object
 3   Tenure             505206 non-null  int64 
 4   Usage Frequency    505206 non-null  int64 
 5   Support Calls      505206 non-null  int64 
 6   Payment Delay      505206 non-null  int64 
 7   Subscription Type  505206 non-null  object
 8   Contract Length    505206 non-null  object
 9   Total Spend        505206 non-null  int64 
 10  Last Interaction   505206 non-null  int64 
 11  Churn              505206 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 50.1+ MB


### Look at the values of the nominal columns

In [None]:
# Show the category frequencies of each nominal column, separated by a blank line
nominal_columns = ['Gender', 'Subscription Type', 'Contract Length']
print(*(df[column].value_counts() for column in nominal_columns), sep='\n\n')

Gender
Male      280273
Female    224933
Name: count, dtype: int64

Subscription Type
Standard    170630
Premium     170099
Basic       164477
Name: count, dtype: int64

Contract Length
Annual       198608
Quarterly    197364
Monthly      109234
Name: count, dtype: int64


### One-Hot Encode the nominal columns on the dataframe

In [None]:
# One-hot encode every nominal feature with pd.get_dummies(), keeping the dummy columns as int64
nominal_columns = ['Gender', 'Subscription Type', 'Contract Length']
df_encoded = pd.get_dummies(df, columns=nominal_columns, dtype='int64')

# Preview the encoded frame, leave a blank line, then print its column summary
print(df_encoded.head(), end='\n\n')
df_encoded.info()


   CustomerID  Age  Tenure  Usage Frequency  Support Calls  Payment Delay  \
0           2   30      39               14              5             18   
1           3   65      49                1             10              8   
2           4   55      14                4              6             18   
3           5   58      38               21              7              7   
4           6   23      32               20              5              8   

   Total Spend  Last Interaction  Churn  Gender_Female  Gender_Male  \
0          932                17      1              1            0   
1          557                 6      1              1            0   
2          185                 3      1              1            0   
3          396                29      1              0            1   
4          617                20      1              0            1   

   Subscription Type_Basic  Subscription Type_Premium  \
0                        0                          0

## START DEFINING THE DECISION TREE MODEL

In [None]:
# DEFINE NODE CLASS
class Node:
    """One node of a binary decision tree.

    Internal nodes store the split (``feature`` index and threshold ``value``),
    the ``left``/``right`` children and the score (``info_gain``) of the split.
    Leaf nodes store only ``leaf_value``; a node is treated as a leaf whenever
    ``leaf_value`` is not ``None``.
    """

    def __init__(self, feature=None, value=None, left=None, right=None, info_gain=None, leaf_value=None):
        # internal (decision) nodes
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.info_gain = info_gain

        # leaf nodes
        self.leaf_value = leaf_value


# DEFINE DECISION TREE CLASS
class DecisionTree():
    """Binary decision-tree classifier built from ``feature <= value`` threshold splits.

    Parameters
    ----------
    min_samples_split : int
        Minimum number of samples a node must hold to be considered for splitting.
    max_depth : int or None
        Maximum depth of the tree. ``None`` (the default) means no depth limit.
    criterion : str
        Split-quality measure: ``'info_gain'``, ``'gini_index'`` or ``'gain_ratio'``.

    Notes
    -----
    Leaf predictions are computed with ``np.bincount``, so class labels must be
    non-negative integers (or values that convert to them).
    """

    def __init__(self, min_samples_split=2, max_depth=None, criterion='info_gain'):
        self.root = None
        # stopping conditions
        self.min_samples_split = min_samples_split  # minimum number of samples required to split an internal node
        self.max_depth = max_depth  # maximum depth of the tree (None = unlimited)
        self.criterion = criterion  # criterion to measure the quality of a split (gini_index, gain_ratio, info_gain)

    # TYPES OF CRITERION:
    # FOR INFO GAIN
    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels ``y``; 0.0 for an empty array."""
        if len(y) == 0:
            return 0.0
        # np.unique counts every class in a single vectorised pass
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return float(-np.sum(p * np.log2(p)))

    def information_gain(self, y, y_left, y_right):
        """Return the entropy reduction obtained by splitting ``y`` into ``y_left`` and ``y_right``."""
        if len(y) == 0:
            return 0.0
        entropy_parent = self.entropy(y)
        entropy_children = (len(y_left) / len(y)) * self.entropy(y_left) + (len(y_right) / len(y)) * self.entropy(y_right)
        return entropy_parent - entropy_children

    # FOR GINI INDEX
    def gini_index(self, y):
        """Return the Gini impurity of the labels ``y``; 0.0 for an empty array."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        return float(1 - np.sum(p ** 2))

    # FOR GAIN RATIO
    def gain_ratio(self, y, y_left, y_right):
        """Return information gain divided by the split information of the partition.

        Split information is the entropy of the branch sizes
        (``-sum(|S_i|/|S| * log2(|S_i|/|S|))``), not the class entropy of the parent.
        A degenerate split (one empty side) has zero split information and scores 0.0.
        """
        if len(y) == 0:
            return 0.0
        fractions = np.array([len(y_left), len(y_right)]) / len(y)
        fractions = fractions[fractions > 0]  # avoid log2(0) for an empty branch
        split_info = float(-np.sum(fractions * np.log2(fractions)))
        if split_info == 0:
            return 0.0
        return self.information_gain(y, y_left, y_right) / split_info

    def best_split(self, X, y):
        """Search every feature/threshold pair and return the best split.

        Returns
        -------
        tuple
            ``(feature index, split value, score)``. When no split achieves a
            strictly positive score the result is ``(None, None, 0)``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not one of the supported names.
        """
        # best_split = (feature index, split value, information gain)
        best_split = (None, None, 0)

        # Resolve the scoring function once instead of on every candidate threshold
        if self.criterion == 'gini_index':
            parent_gini = self.gini_index(y)  # loop-invariant

            def score(y_left, y_right):
                return parent_gini - (len(y_left) / len(y)) * self.gini_index(y_left) - (len(y_right) / len(y)) * self.gini_index(y_right)
        elif self.criterion == 'gain_ratio':
            def score(y_left, y_right):
                return self.gain_ratio(y, y_left, y_right)
        elif self.criterion == 'info_gain':
            def score(y_left, y_right):
                return self.information_gain(y, y_left, y_right)
        else:
            raise ValueError('Invalid criterion')

        for feature in range(X.shape[1]):
            X_feature = X[:, feature]
            # Sorted thresholds make tie-breaking deterministic. The largest value is
            # skipped because it would send every sample left and can never score > 0.
            for value in np.unique(X_feature)[:-1]:
                y_left = y[X_feature <= value]
                y_right = y[X_feature > value]
                info_gain = score(y_left, y_right)
                if info_gain > best_split[2]:
                    best_split = (feature, value, info_gain)
        return best_split

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (ties resolve to the smallest label)."""
        return np.bincount(y.astype(int)).argmax()  # Convert y to integers before using bincount

    def build_tree(self, X, y, depth=0):
        """Recursively grow the (sub)tree for samples ``X`` with labels ``y`` and return its root Node.

        Raises
        ------
        ValueError
            If ``y`` is empty, because no prediction can be derived from zero samples.
        """
        if len(y) == 0:
            raise ValueError('Cannot build a decision tree from an empty dataset')

        # Prevent further splitting if stopping conditions are met
        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if depth_reached or (len(y) < self.min_samples_split) or (len(np.unique(y)) == 1):
            return Node(leaf_value=self._majority_label(y))

        # Continue splitting the data if stopping conditions are not met
        feature, value, info_gain = self.best_split(X, y)  # find the best split point
        if feature is None:
            # No threshold improves purity (e.g. all feature values identical): stop here
            return Node(leaf_value=self._majority_label(y))

        left_mask = X[:, feature] <= value
        right_mask = X[:, feature] > value

        # Recursively build the subtrees
        left = self.build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self.build_tree(X[right_mask], y[right_mask], depth + 1)
        return Node(feature=feature, value=value, left=left, right=right, info_gain=info_gain)

    def fit(self, X, y):
        """Build the tree from a 2-D feature matrix ``X`` and a 1-D label vector ``y``."""
        # Accept lists / DataFrames as well as NumPy arrays
        X = np.asarray(X)
        y = np.asarray(y)
        self.root = self.build_tree(X, y)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        # np.asarray makes row iteration work for DataFrames too (which iterate column names)
        predictions = [self._predict(x) for x in np.asarray(X)]
        return predictions

    def _predict(self, x):
        """Walk from the root to a leaf for one sample ``x`` and return that leaf's label."""
        node = self.root
        while node.leaf_value is None:
            if x[node.feature] <= node.value:
                node = node.left
            else:
                node = node.right
        return node.leaf_value

    def print_tree(self, node=None, depth=0):
        """Print the tree rooted at ``node`` (default: the fitted root), indented two spaces per level."""
        if node is None:
            node = self.root
        if node.leaf_value is not None:
            print(depth * '  ' + 'Prediction', node.leaf_value)
            return
        print(depth * '  ' + 'Feature', node.feature, '<=', node.value, 'Info gain:', node.info_gain)
        self.print_tree(node.left, depth + 1)
        self.print_tree(node.right, depth + 1)

### Define train_test_split

In [None]:
def train_test_split(df, test_size=0.2, target='Churn', random_state=None):
    """Randomly split a dataframe into train/test feature matrices and label vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    test_size : float or int
        A float is interpreted as the fraction of rows placed in the test set
        (rounded to the nearest row); an int is the absolute number of test rows.
    target : str
        Name of the label column. Defaults to ``'Churn'``.
    random_state : int or None
        Seed for a reproducible split. With ``None`` the global NumPy random
        state is used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature matrices (all columns except ``target``) and label vectors.

    Raises
    ------
    ValueError
        If the requested test size is negative or exceeds the number of rows.
    """
    n_rows = len(df)
    if isinstance(test_size, float):
        test_size = round(test_size * n_rows)
    if not 0 <= test_size <= n_rows:
        raise ValueError(f'test_size must be between 0 and {n_rows} rows, got {test_size}')

    # A dedicated RandomState keeps seeded splits isolated from the global state
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # get the indices of the test and train set
    test_indices = rng.choice(n_rows, test_size, replace=False)
    # Boolean mask yields the remaining rows as sorted integer positions
    is_train = np.ones(n_rows, dtype=bool)
    is_train[test_indices] = False
    train_indices = np.flatnonzero(is_train)

    # split the dataframe into train and test sets
    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]

    # convert the train and test sets into numpy arrays
    X_train = train_df.drop(target, axis=1).values
    y_train = train_df[target].values
    X_test = test_df.drop(target, axis=1).values
    y_test = test_df[target].values

    return X_train, X_test, y_train, y_test

### Start splitting the data

In [None]:
# Fix the global NumPy seed so the random split is reproducible on every run
SEED = 42
np.random.seed(SEED)

# Split the one-hot encoded frame rather than the raw `df`, which still holds the
# string-valued nominal columns. CustomerID is a row identifier, not a customer
# attribute, so it is excluded from the features.
X_train, X_test, y_train, y_test = train_test_split(df_encoded.drop(columns='CustomerID'), test_size=0.2)

## **DECISION TREE IMPLEMENTATION**

### Information Gain tree:

In [None]:
# Decision tree that selects its splits by information gain
tree_info_gain = DecisionTree(criterion='info_gain', max_depth=3, min_samples_split=2)

### Gini Index tree:

In [None]:
# Decision tree that selects its splits by Gini index
tree_gini = DecisionTree(criterion='gini_index', max_depth=3, min_samples_split=2)

### Gain Ratio tree:

In [None]:
# Decision tree that selects its splits by gain ratio
tree_gain_ratio = DecisionTree(criterion='gain_ratio', max_depth=3, min_samples_split=2)