In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import math
import pickle
import os

from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score

from scipy.stats import pointbiserialr, chi2_contingency

In [2]:
# Dataset column names grouped by value type.
# NOTE(review): none of the ATT_* lists is referenced in the visible cells —
# presumably they are used by analysis elsewhere; confirm before removing.
ATT_INT = ["Administrative", "Informational", "ProductRelated"]
# Integer-coded columns; the name suggests they are categorical — TODO confirm.
ATT_INT_CATEGORY = ["OperatingSystems", "Browser", "Region", "TrafficType"]
ATT_FLOAT = [
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
    "SpecialDay",
]
ATT_STRING = ["Month", "VisitorType"]
ATT_BOOL = ["Weekend", "Revenue"]
ATT_BOOL_NO_TARGET = ["Weekend"]

# Name of the column the models predict; it is dropped from the feature matrix.
TARGET = "Revenue"

# Seeds passed as `random_state` to train_test_split (one full experiment each).
RANDOM_STATES = [0, 1, 5, 7, 13, 23, 29, 32, 37, 42]

# Scaler classes (not instances) tried on the features; None means "no scaling".
SCALERS = [None, MinMaxScaler, StandardScaler, RobustScaler]
# Values of `max_depth` tried for every tree.
DEPTHS = [3, 4, 5, 6]

In [3]:
# The first CSV column is the row index written out when the file was saved
# (it loads as "Unnamed: 0" otherwise). Read it as the index so that it is not
# carried into the feature matrix as if it were a predictive attribute.
df = pd.read_csv("dataset.csv", index_col=0)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
def calculate_average_variance(
    average_accuracy, average_precision, average_recall, average_f1, scalers=None
):
    """Print and return the per-scaler mean and variance of each metric.

    Parameters
    ----------
    average_accuracy, average_precision, average_recall, average_f1 : sequence
        One sequence of scores per scaler, in the same order as ``scalers``
        (this is the layout returned by ``run_trees``).
    scalers : sequence, optional
        Scaler classes, or ``None`` entries meaning "no scaling", used to label
        each group of scores. When omitted, the notebook-level ``SCALERS`` list
        is used, which is the original behaviour.

    Returns
    -------
    list of tuple
        One tuple per scaler:
        ``(avg_accuracy, avg_precision, avg_recall, avg_f1,
        variance_accuracy, variance_precision, variance_recall, variance_f1)``.
    """
    if scalers is None:
        scalers = SCALERS

    # (label used in the printed report, per-scaler score lists)
    labelled_scores = (
        ("accuracy", average_accuracy),
        ("precision", average_precision),
        ("recall", average_recall),
        ("F1 score", average_f1),
    )

    avgs_variances = []
    for i, scaler in enumerate(scalers):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")

        variances = [np.var(scores[i]) for _, scores in labelled_scores]
        for (label, _), variance in zip(labelled_scores, variances):
            print(f"Variance of {label}: {variance}")
        print()  # blank line between the variance and the average report

        averages = [np.mean(scores[i]) for _, scores in labelled_scores]
        for (label, _), average in zip(labelled_scores, averages):
            print(f"Average {label}: {average}")
        print()

        # Averages first, then variances, matching the documented tuple layout.
        avgs_variances.append(tuple(averages) + tuple(variances))
    return avgs_variances

In [5]:
import numpy as np
from collections import Counter


class MyDecisionTree:
    """Binary decision-tree classifier grown from Gini-impurity threshold splits.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{"<attr index> <= <threshold>": [yes_branch, no_branch]}``. A branch
    is either another such dictionary or a class label (a leaf). The whole tree
    can be a single leaf label, e.g. when the training labels are pure.
    """

    def __init__(self, max_depth=6):
        """Create an unfitted tree that splits at most ``max_depth`` times per path."""
        self.max_depth = max_depth
        self.tree = None

    @staticmethod
    def _as_array(values):
        """Return ``values`` as a NumPy array (pandas objects or plain array-likes)."""
        if hasattr(values, "to_numpy"):
            return values.to_numpy()
        return np.asarray(values)

    def fit(self, X_train, y_train):
        """Grow the tree from the feature matrix ``X_train`` and labels ``y_train``.

        Both arguments may be pandas objects or array-likes. Features and labels
        are stacked into a single array whose last column is the label.

        Raises
        ------
        ValueError
            If the training set has no rows.
        """
        data = np.c_[self._as_array(X_train), self._as_array(y_train)]
        if data.shape[0] == 0:
            raise ValueError("Cannot fit MyDecisionTree on an empty training set")
        self.tree = self.id3(data, 0)

    def predict(self, X_test):
        """Return an array with the predicted label of every row of ``X_test``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("MyDecisionTree.predict called before fit")
        rows = self._as_array(X_test)
        return np.array([self.traverse_tree(row, self.tree) for row in rows])

    def find_best_split(self, data):
        """Find the ``(attribute index, threshold)`` with the lowest weighted Gini.

        Every distinct value of every attribute is tried as a ``<=`` threshold.
        Thresholds that would leave one side empty are skipped, because they do
        not partition the data. On ties the last candidate examined wins.

        Returns ``(-1, None)`` when no threshold separates the rows.
        """
        best_gini = float("inf")
        best_attr = -1
        best_value = None
        n_rows = len(data)
        for attr in range(data.shape[1] - 1):
            for value in np.unique(data[:, attr]):
                data_below, data_above = self.split_dataset(data, attr, value)
                if len(data_below) == 0 or len(data_above) == 0:
                    continue
                current_gini = (
                    self.gini_index(data_below) * len(data_below)
                    + self.gini_index(data_above) * len(data_above)
                ) / n_rows
                if current_gini <= best_gini:
                    best_gini = current_gini
                    best_attr = attr
                    best_value = value
        return best_attr, best_value

    def split_dataset(self, data, attr, value):
        """Partition ``data`` into rows with ``attr <= value`` and rows with ``attr > value``."""
        column = data[:, attr]
        return data[column <= value], data[column > value]

    def gini_index(self, data):
        """Return the Gini impurity of the labels in the last column of ``data``."""
        labels = data[:, -1]
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / counts.sum()
        gini = 1 - sum(probabilities**2)
        return gini

    def majority_class(self, data):
        """Return the most frequent label; on ties the label seen first wins."""
        labels = data[:, -1]
        return Counter(labels).most_common(1)[0][0]

    def is_pure(self, data):
        """Return True when every row of ``data`` carries the same label."""
        return len(np.unique(data[:, -1])) == 1

    def id3(self, data, depth):
        """Recursively build the (sub)tree for ``data`` starting at ``depth``.

        Despite the name, splits are binary thresholds chosen by Gini impurity
        (see ``find_best_split``). A leaf is produced when the node is pure,
        when ``max_depth`` is reached, or when no threshold separates the rows.
        """
        if self.is_pure(data) or depth == self.max_depth:
            return self.majority_class(data)

        attr, value = self.find_best_split(data)
        if value is None:
            # All remaining rows share identical feature values: nothing to split on.
            return self.majority_class(data)

        data_below, data_above = self.split_dataset(data, attr, value)
        question = "{} <= {}".format(attr, value)
        yes_answer = self.id3(data_below, depth + 1)
        no_answer = self.id3(data_above, depth + 1)
        if yes_answer == no_answer:
            # Both branches agree, so the question is redundant: collapse it.
            return yes_answer
        return {question: [yes_answer, no_answer]}

    def traverse_tree(self, example, tree):
        """Return the label that ``tree`` assigns to the feature vector ``example``.

        ``tree`` may be a nested question dictionary or a bare leaf label.
        """
        node = tree
        while isinstance(node, dict):
            question = next(iter(node))
            attr, _, value = question.split(" ")
            branch = 0 if example[int(attr)] <= float(value) else 1
            node = node[question][branch]
        return node

    def print_tree(self, tree=None, indent=" "):
        """Print ``tree`` (the fitted tree when omitted) as an indented T/F outline."""
        if tree is None:
            tree = self.tree
        if isinstance(tree, dict):
            question = next(iter(tree))
            yes_branch, no_branch = tree[question][0], tree[question][1]
            print("{}".format(question))
            print("{}T->".format(indent), end="")
            self.print_tree(yes_branch, indent + "  ")
            print("{}F->".format(indent), end="")
            self.print_tree(no_branch, indent + "  ")
        else:
            print(tree)

In [6]:
def create_my_tree(df, scaler, state, depth):
    """Train a ``MyDecisionTree`` on a label-encoded copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the feature columns and the ``TARGET`` column.
    scaler : class or None
        Scaler class (instantiated here with no arguments) applied to the
        features, or ``None`` to leave them unscaled.
    state : int
        Seed forwarded to ``train_test_split``.
    depth : int
        ``max_depth`` of the tree.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the fitted model and the held-out 20 %
        of the rows, with ``X_test`` as a DataFrame.
    """
    print("Creating model...")

    # Every column (numeric ones included) is replaced by integer codes.
    encoded = df.copy()
    for column in encoded.columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    # Split the dataset into features (X) and target variable (y)
    X = encoded.drop(TARGET, axis=1)
    y = encoded[TARGET]

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=state
    )

    if scaler is not None:
        fitted_scaler = scaler()
        # Fit on the training rows only, then apply the same transform to the
        # test rows. Results are wrapped back into DataFrames.
        X_train = pd.DataFrame(
            fitted_scaler.fit_transform(X_train),
            index=X_train.index,
            columns=X_train.columns,
        )
        X_test = pd.DataFrame(
            fitted_scaler.transform(X_test),
            index=X_test.index,
            columns=X_test.columns,
        )

    model = MyDecisionTree(max_depth=depth)
    model.fit(X_train, y_train)

    return model, X_test, y_test

In [7]:
import pickle


def run_trees(states, scalers, depths, constructor, filename):
    """Evaluate ``constructor`` over every (state, scaler, depth) combination.

    Parameters
    ----------
    states : sequence of int
        Random seeds passed to ``constructor``.
    scalers : sequence
        Scaler classes (or ``None``) passed to ``constructor``. Results are
        grouped by position in this sequence.
    depths : sequence of int
        Tree depths passed to ``constructor``.
    constructor : callable
        Called as ``constructor(df, scaler, state, depth)`` and expected to
        return ``(model, X_test, y_test)``. ``df`` is the notebook-level
        DataFrame.
    filename : str or None
        Pickle cache. If the file exists, its contents are returned as-is and
        nothing is trained; the other arguments are then ignored. Otherwise
        the results are written to it. ``None`` disables caching entirely.

    Returns
    -------
    tuple of list
        ``(average_accuracy, average_precision, average_recall, average_f1)``,
        each holding one list of scores per scaler.
    """
    if filename is not None and os.path.exists(filename):
        print(f"Loading data from {filename}")

        # NOTE: unpickling can execute arbitrary code; only point `filename`
        # at cache files written by this notebook.
        with open(filename, "rb") as f:
            (
                average_accuracy,
                average_precision,
                average_recall,
                average_f1,
            ) = pickle.load(f)
        return average_accuracy, average_precision, average_recall, average_f1

    # One result bucket per scaler, whatever the number of scalers.
    average_accuracy = [[] for _ in scalers]
    average_precision = [[] for _ in scalers]
    average_recall = [[] for _ in scalers]
    average_f1 = [[] for _ in scalers]
    for state in states:
        print(f"Random state: {state}")
        for i, scaler in enumerate(scalers):
            print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
            for depth in depths:
                print(f"Depth: {depth}")
                model, X_test, y_test = constructor(df, scaler, state, depth)
                # Predict the labels for the test set
                y_pred = model.predict(X_test)
                # Score the predictions
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)

                print(f"Test accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1 score: {f1}\n")

                average_accuracy[i].append(accuracy)
                average_precision[i].append(precision)
                average_recall[i].append(recall)
                average_f1[i].append(f1)

    # Persist the results only when a cache file was requested.
    if filename is not None:
        with open(filename, "wb") as f:
            pickle.dump(
                (average_accuracy, average_precision, average_recall, average_f1), f
            )
    return average_accuracy, average_precision, average_recall, average_f1

In [8]:
# Evaluate the hand-written MyDecisionTree for every random state, scaler and
# depth. run_trees returns the pickled metrics instead of re-training when
# "myTreesData.pkl" already exists.
average_accuracy, average_precision, average_recall, average_f1 = run_trees(
    RANDOM_STATES, SCALERS, DEPTHS, create_my_tree, "myTreesData.pkl"
)

Random state: 0
Scaler: None
Depth: 3
Creating model...
Test accuracy: 0.8852392538523925
Precision: 0.683377308707124
Recall: 0.6137440758293838
F1 score: 0.6466916354556804

Depth: 4
Creating model...
Test accuracy: 0.8884833738848338
Precision: 0.7194029850746269
Recall: 0.5710900473933649
F1 score: 0.6367239101717305

Depth: 5
Creating model...
Test accuracy: 0.8880778588807786
Precision: 0.7354838709677419
Recall: 0.5402843601895735
F1 score: 0.6229508196721312

Depth: 6
Creating model...
Test accuracy: 0.8917274939172749
Precision: 0.7540983606557377
Recall: 0.5450236966824644
F1 score: 0.6327372764786794

Scaler: MinMaxScaler
Depth: 3
Creating model...
Test accuracy: 0.8852392538523925
Precision: 0.683377308707124
Recall: 0.6137440758293838
F1 score: 0.6466916354556804

Depth: 4
Creating model...
Test accuracy: 0.8884833738848338
Precision: 0.7194029850746269
Recall: 0.5710900473933649
F1 score: 0.6367239101717305

Depth: 5
Creating model...
Test accuracy: 0.8880778588807786
Pre

In [9]:
# Print, for each scaler, the variance and the mean of every metric list
# returned by run_trees.
calculate_average_variance(
    average_accuracy, average_precision, average_recall, average_f1
)

Scaler: None
Variance of accuracy: 5.1552698204091504e-05
Variance of precision: 0.0013075710962759828
Variance of recall: 0.0018742745302232175
Variance of F1 score: 0.0005423811785374272

Average accuracy: 0.8957826439578265
Average precision: 0.6998701869684585
Average recall: 0.5869613735724173
Average F1 score: 0.6365834267323115

Scaler: MinMaxScaler
Variance of accuracy: 5.1552698204091504e-05
Variance of precision: 0.0013075710962759828
Variance of recall: 0.0018742745302232175
Variance of F1 score: 0.0005423811785374272

Average accuracy: 0.8957826439578265
Average precision: 0.6998701869684585
Average recall: 0.5869613735724173
Average F1 score: 0.6365834267323115

Scaler: StandardScaler
Variance of accuracy: 5.1552698204091504e-05
Variance of precision: 0.0013075710962759828
Variance of recall: 0.0018742745302232175
Variance of F1 score: 0.0005423811785374272

Average accuracy: 0.8957826439578265
Average precision: 0.6998701869684585
Average recall: 0.5869613735724173
Averag

In [10]:
from sklearn.tree import DecisionTreeClassifier


def create_decision_tree_model(df, scaler, state, depth):
    """Train a scikit-learn ``DecisionTreeClassifier`` on a label-encoded copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the feature columns and the ``TARGET`` column.
    scaler : class or None
        Scaler class (instantiated here with no arguments) applied to the
        features, or ``None`` to leave them unscaled.
    state : int
        Seed used for both ``train_test_split`` and the classifier.
    depth : int
        ``max_depth`` of the tree.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the fitted model and the held-out 20 %
        of the rows. ``X_test`` is an array when a scaler is used and a
        DataFrame otherwise.
    """
    print("Creating model...")

    # Every column (numeric ones included) is replaced by integer codes.
    encoded = df.copy()
    for column in encoded.columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    # Split the dataset into features (X) and target variable (y)
    X = encoded.drop(TARGET, axis=1)
    y = encoded[TARGET]

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=state
    )

    if scaler is not None:
        fitted_scaler = scaler()
        # Fit on the training rows only, then apply the same transform to the
        # test rows.
        X_train = fitted_scaler.fit_transform(X_train)
        X_test = fitted_scaler.transform(X_test)

    # Create and fit the decision tree classifier
    model = DecisionTreeClassifier(random_state=state, max_depth=depth)
    model.fit(X_train, y_train)

    return model, X_test, y_test

In [11]:
# Repeat the same experiment grid with scikit-learn's DecisionTreeClassifier
# (cached in "decisionTreeData.pkl"). This rebinds the average_* names, so the
# MyDecisionTree results held in them before this cell are replaced.
average_accuracy, average_precision, average_recall, average_f1 = run_trees(
    RANDOM_STATES, SCALERS, DEPTHS, create_decision_tree_model, "decisionTreeData.pkl"
)

Random state: 0
Scaler: None
Depth: 3
Creating model...
Test accuracy: 0.8852392538523925
Precision: 0.683377308707124
Recall: 0.6137440758293838
F1 score: 0.6466916354556804

Depth: 4
Creating model...
Test accuracy: 0.8888888888888888
Precision: 0.7215568862275449
Recall: 0.5710900473933649
F1 score: 0.6375661375661374

Depth: 5
Creating model...
Test accuracy: 0.8884833738848338
Precision: 0.7378640776699029
Recall: 0.5402843601895735
F1 score: 0.6238030095759234

Depth: 6
Creating model...
Test accuracy: 0.8925385239253852
Precision: 0.759075907590759
Recall: 0.5450236966824644
F1 score: 0.6344827586206897

Scaler: MinMaxScaler
Depth: 3
Creating model...
Test accuracy: 0.8852392538523925
Precision: 0.683377308707124
Recall: 0.6137440758293838
F1 score: 0.6466916354556804

Depth: 4
Creating model...
Test accuracy: 0.8888888888888888
Precision: 0.7215568862275449
Recall: 0.5710900473933649
F1 score: 0.6375661375661374

Depth: 5
Creating model...
Test accuracy: 0.8884833738848338
Prec

In [12]:
# Print the per-scaler variance and mean of each metric. When the cells are run
# in order, the average_* names hold the DecisionTreeClassifier results here.
calculate_average_variance(
    average_accuracy, average_precision, average_recall, average_f1
)

Scaler: None
Variance of accuracy: 5.2611707379320594e-05
Variance of precision: 0.001349573915335446
Variance of recall: 0.0018250765869248966
Variance of F1 score: 0.0005203417374181668

Average accuracy: 0.8958231954582321
Average precision: 0.7013975166698889
Average recall: 0.5844819826767265
Average F1 score: 0.6357358180621226

Scaler: MinMaxScaler
Variance of accuracy: 5.2611707379320594e-05
Variance of precision: 0.001349573915335446
Variance of recall: 0.0018250765869248966
Variance of F1 score: 0.0005203417374181668

Average accuracy: 0.8958231954582321
Average precision: 0.7013975166698889
Average recall: 0.5844819826767265
Average F1 score: 0.6357358180621226

Scaler: StandardScaler
Variance of accuracy: 5.2611707379320594e-05
Variance of precision: 0.001349573915335446
Variance of recall: 0.0018250765869248966
Variance of F1 score: 0.0005203417374181668

Average accuracy: 0.8958231954582321
Average precision: 0.7013975166698889
Average recall: 0.5844819826767265
Average F