## Assignment 2 Task 1

<strong>Name:</strong> Elroy Chua Ming Xuan </br>
<strong>UOW ID: </strong> 7431673 </br>
<strong>Data set: </strong> https://www.kaggle.com/datasets/muhammadshahidazeem/customer-churn-dataset

#### Step 1: Import Necessary Libraries and Load Dataset

In [66]:
import pandas as pd
import numpy as np
# Load the datasets
train_df = pd.read_csv("customer_churn_dataset-testing-master.csv")
test_df = pd.read_csv("customer_churn_dataset-training-master.csv")


#### Step 2: Explore and Preprocess the Data

In [67]:
# Handle missing values (consider other strategies like imputation)
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Convert categorical variables using get_dummies
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

# Sample 20% of the training data
train_df = train_df.sample(frac=0.2)

# Extract features and labels
X_train = train_df.drop('Churn', axis=1)
y_train = train_df['Churn']
X_test = test_df.drop('Churn', axis=1)
y_test = test_df['Churn']

# Convert DataFrames to Numpy Arrays
train_data = np.array(pd.concat([X_train, y_train], axis=1))
test_data = np.array(pd.concat([X_test, y_test], axis=1))


#### Step 3: Implement Decision Tree Model

In [68]:
# Decision Tree Classifier
def entropy(data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)
    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
    return entropy


def information_gain(data_left, data_right, current_entropy):
    p = float(len(data_left) / (len(data_left) + len(data_right)))
    return current_entropy - p * entropy(data_left) - (1 - p) * entropy(data_right)


def split_check(data):
    feature_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):
        unique_values = np.unique(data[:, column_index])
        feature_splits[column_index] = unique_values
    return feature_splits


def InfoGainSplit(data, potential_splits):
    current_entropy = entropy(data)
    best_info_gain = -1
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_left, data_right = split(
                data, split_column=column_index, split_value=value)
            current_info_gain = information_gain(
                data_left, data_right, current_entropy)
            if current_info_gain >= best_info_gain:
                best_info_gain = current_info_gain
                best_split_column = column_index
                best_split_value = value

    return best_split_column, best_split_value


def split(data, split_column, split_value):
    split_column_values = data[:, split_column]
    data_left = data[split_column_values <= split_value]
    data_right = data[split_column_values > split_value]
    return data_left, data_right


def classify(data):
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(
        label_column, return_counts=True)
    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    return classification


def check_purity(data):
    unique_classes = np.unique(data[:, -1])
    if len(unique_classes) == 1:
        return True
    else:
        return False


def DT(data, depth=0, max_depth=10):
    if len(data) == 0:
        return None  # Return None for empty data
    if check_purity(data) or depth >= max_depth:
        return classify(data)
    else:
        potential_splits = split_check(data)
        if not potential_splits:  # Check if potential_splits is empty
            return classify(data)
        split_column, split_value = InfoGainSplit(data, potential_splits)
        data_left, data_right = split(data, split_column, split_value)

        question = "{} <= {}".format(split_column, split_value)
        subtree = {question: []}
        yes_answer = DT(data_left, depth=depth+1, max_depth=max_depth) # Increment depth for recursive call
        no_answer = DT(data_right, depth=depth+1, max_depth=max_depth) # Increment depth for recursive call
        if len(data_left) >= len(data) or len(data_right) >= len(data):
            return classify(data)
        if yes_answer == no_answer:
            subtree = yes_answer
        else:
            subtree[question].append(yes_answer)
            subtree[question].append(no_answer)

        return subtree


#### Train Decision Tree

In [69]:
# Train Decision Tree
tree = DT(train_data, max_depth=10)  # Add max_depth parameter


#### Evaluate Decision Tree

In [70]:
# Evaluate Decision Tree
correct_tree = 0
for i in range(len(test_data)):
    prediction_tree = predict(test_data[i], tree)
    if prediction_tree == test_data[i][-1]:
        correct_tree += 1

accuracy_tree = correct_tree / len(test_data)
print(f"Decision Tree Accuracy: {accuracy_tree * 100:.2f}%")


Decision Tree Accuracy: 49.75%


#### Make Predictions with Trained Tree

In [71]:
def predict(example, tree):
    question = list(tree.keys())[0]
    feature, comparison, value = question.split(" ")

    # Check for boolean values
    if value == "True":
        comparison_value = True
    elif value == "False":
        comparison_value = False
    else:
        comparison_value = float(value)

    # Ask question
    if example[int(feature)] <= comparison_value:
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # Base case: If answer is not another dict, return it (leaf node)
    if not isinstance(answer, dict):
        return answer

    # Recursive part: Go deeper in the tree
    return predict(example, answer)


#### Evaluate Accuracy of the Trained Tree

In [79]:
correct = 0
for i in range(len(test_data)):
    prediction = predict(test_data[i], tree)
    if prediction == test_data[i][-1]:  # Assuming the label is the last column
        correct += 1

accuracy = correct / len(test_data)
print(f"Decision Tree Accuracy: {accuracy*100:.2f}%")


Decision Tree Accuracy: 49.75%


#### Random Forest Classifier
1. Bootstrapping
2. Feature Selection
3. Tree Construction
4. Prediction

In [73]:
# Bootstrapping 
def bootstrap_sample(data, bootstrap_size=None):
    if bootstrap_size is None:
        bootstrap_size = len(data)
    indices = np.random.choice(len(data), size=bootstrap_size, replace=True)
    return data[indices]


In [74]:
# Feature Selection
def random_features(data, num_features):
    total_features = data.shape[1] - 1  # subtracting one for the label column
    features = np.random.choice(total_features, num_features, replace=False)
    return np.column_stack([data[:, i] for i in features] + [data[:, -1]])


In [75]:
# Training Random Forest
def train_random_forest(data, num_trees, num_features):
    forest = []
    for _ in range(num_trees):
        bootstrap_data = bootstrap_sample(data)
        bootstrap_data = random_features(bootstrap_data, num_features)
        tree = DT(bootstrap_data)
        forest.append(tree)
    return forest


In [76]:
# Predicting with Random Forest
def random_forest_predict(forest, example):
    tree_predictions = [predict(example, tree) for tree in forest]
    return max(set(tree_predictions), key=tree_predictions.count)


In [77]:
# Train Random Forest
num_trees = 10
num_features = 5
forest = train_random_forest(train_data, num_trees, num_features)

# Evaluate Random Forest
correct_forest = 0
for i in range(len(test_data)):
    prediction_forest = random_forest_predict(forest, test_data[i])
    if prediction_forest == test_data[i][-1]:
        correct_forest += 1

accuracy_forest = correct_forest / len(test_data)
print(f"Random Forest Accuracy: {accuracy_forest * 100:.2f}%")

Random Forest Accuracy: 50.15%


In [80]:
print(f"Decision Tree Accuracy: {accuracy*100:.2f}%")
print(f"Random Forest Accuracy: {accuracy_forest * 100:.2f}%")


Decision Tree Accuracy: 49.75%
Random Forest Accuracy: 50.15%


#### Comparison:
Based on the two models create above, we can see that Random Forest Accuracy is 50.15 and Decision Tree Accuracy is 49.85. This shows that Random Forest is a better model to use for this dataset. This is because Random Forest is an ensemble model that uses multiple decision trees to make predictions. This means that it is more accurate than a single decision tree.