In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import math

from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score

from scipy.stats import pointbiserialr, chi2_contingency


In [2]:
ATT_INT = ["Administrative", "Informational", "ProductRelated"]
ATT_INT_CATEGORY = ["OperatingSystems", "Browser", "Region", "TrafficType"]
ATT_FLOAT = ["Administrative_Duration", "Informational_Duration", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues", "SpecialDay"]
ATT_STRING = ["Month", "VisitorType"]
ATT_BOOL = ["Weekend", "Revenue"]
ATT_BOOL_NO_TARGET = ["Weekend"]

TARGET = "Revenue"

RANDOM_STATES = [0, 1, 5, 7, 13, 23, 29, 32, 37, 42]

SCALERS = [None, MinMaxScaler, StandardScaler, RobustScaler]


In [3]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


Overall Revenue

In [None]:

# Count the number of occurrences of each class
class_counts = df[TARGET].value_counts()

# Create a bar plot
sns.barplot(x=class_counts.index, y=class_counts.values)

# Set the labels and title
plt.xlabel('Revenue')
plt.ylabel('Count')
plt.title('Overall Class Distribution')

# Show the plot
plt.show()





In [None]:
_, _, y_train, y_test = train_test_split(df.drop(TARGET, axis=1), df[TARGET], test_size=0.2, random_state=42)
trues_train = y_train[y_train == True].count()
false_train = y_train[y_train == False].count()
trues_test = y_test[y_test == True].count()
false_test = y_test[y_test == False].count()

# Count the number of occurrences of each class
class_counts = [false_train, trues_train]

# Create a bar plot
sns.barplot(x=[False, True], y=class_counts)

# Set the labels and title
plt.xlabel('Revenue')
plt.ylabel('Count')
plt.title('Class Distribution Train state 42')

# Show the plot
plt.show()

# Count the number of occurrences of each class
class_counts_test = [false_test, trues_test]

# Create a bar plot
sns.barplot(x=[False, True], y=class_counts_test)

# Set the labels and title
plt.xlabel('Revenue')
plt.ylabel('Count')
plt.title('Class Distribution Test state 42')

# Show the plot
plt.show()


In [None]:
trues_train = 0
false_train = 0
trues_test = 0
false_test = 0

for state in RANDOM_STATES:
    _, _, y_train, y_test = train_test_split(df.drop(TARGET, axis=1), df[TARGET], test_size=0.2, random_state=state)

    trues_train += y_train[y_train == True].count()
    false_train += y_train[y_train == False].count()
    trues_test += y_test[y_test == True].count()
    false_test += y_test[y_test == False].count()

trues_train /= len(RANDOM_STATES)
false_train /= len(RANDOM_STATES)
trues_test /= len(RANDOM_STATES)
false_test /= len(RANDOM_STATES)

print("Train: ", trues_train, false_train)
print("Test: ", trues_test, false_test)

# Count the number of occurrences of each class
class_counts = [false_train, trues_train]

# Create a bar plot
sns.barplot(x=[False, True], y=class_counts)

# Set the labels and title
plt.xlabel('Revenue')
plt.ylabel('Count')
plt.title('Class Distribution Train average')

# Show the plot
plt.show()

# Count the number of occurrences of each class
class_counts_test = [false_test, trues_test]

# Create a bar plot
sns.barplot(x=[False, True], y=class_counts_test)

# Set the labels and title
plt.xlabel('Revenue')
plt.ylabel('Count')
plt.title('Class Distribution Test average')

# Show the plot
plt.show()



## Analiza date 2

### 1.a numerice

In [None]:


for x in ATT_INT + ATT_FLOAT:
    # Calculate the percentiles for each attribute
    x_min = df[x].min()
    x_max = df[x].max()
    intervals = np.linspace(x_min, x_max, 11)

    # Create a bar chart
    df[x + '_interval'] = pd.cut(df[x], intervals)
    interval_counts = df[x + '_interval'].value_counts().sort_index()

    # Plot the values for each interval with logarithmic count (base 2)
    interval_counts.plot(kind='bar', logy=True)
    plt.xlabel('Interval')
    plt.ylabel('Count (log scale)')
    plt.title(f'Values for {x} in Intervals')
    plt.show()


### 1.b categorice

In [None]:
for x in ATT_INT_CATEGORY + ATT_STRING + ATT_BOOL:
    # Create a bar chart
    interval_counts = df[x].value_counts().sort_index()

    # Plot the values for each interval with logarithmic count (base 2)
    interval_counts.plot(kind='bar',
                         )
    plt.xlabel('Interval')
    plt.ylabel('Count (log scale)')
    plt.title(f'Values for {x}')
    plt.show()

### 2 numerice


In [None]:
# Calculate Point-Biserial Correlation for each numerical attribute

correlation_p_df = pd.DataFrame(columns=['Attribute', 'Correlation', 'P-Value'])
correlation_p_df_category = pd.DataFrame(columns=['Attribute', 'Correlation', 'P-Value'])

for attribute in ATT_INT + ATT_FLOAT:
    correlation, p_value = pointbiserialr(df[attribute], df[TARGET])
    correlation_p_df.loc[len(correlation_p_df)] = [attribute, correlation, p_value]




for attribute in ATT_BOOL + ATT_INT_CATEGORY + ATT_STRING:
    if attribute == TARGET:
        continue
    # Create a contingency table
    contingency = pd.crosstab(df[attribute], df[TARGET])

    # Calculate the Chi-Square statistic and the associated p-value
    correlation, p_value, dof, expected = chi2_contingency(contingency)

    correlation_p_df_category.loc[len(correlation_p_df_category)] = [attribute, correlation, p_value]

# Print the correlation and p-value dictionaries
print("Point Biserial Correlation")
print(correlation_p_df)


print("\nChi-Square Correlation")
print(correlation_p_df_category)

In [None]:
from sklearn.preprocessing import LabelEncoder
df_copy = df.copy()

df_copy = df.copy()

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Encode labels for each column in the dataframe
for column in df_copy.columns:
    df_copy[column] = label_encoder.fit_transform(df_copy[column])

matrix = df_copy.corr()

plt.figure(figsize=(12, 12))
# Plotting correlation matrix with 2 decimal places
sns.heatmap(matrix.round(2), cmap="Greens", annot=True)



In [None]:
# Filter the attributes with p-value lower than 0.05
filtered_attributes = correlation_p_df[correlation_p_df['P-Value'] < 0.05]['Attribute']

plt.close()
plt.figure(figsize=(15, 5))
# Create a bar chart plot for each filtered attribute
for attribute in filtered_attributes:
    # Get the correlation value for the attribute
    correlation = correlation_p_df.loc[correlation_p_df['Attribute'] == attribute, 'Correlation'].values[0]

    # Create a bar plot
    plt.bar(attribute, correlation)

# Set the labels and title
plt.xlabel('Attribute')
plt.xticks(rotation=45)
plt.ylabel('Point-Biserial Correlation')
plt.title('Point-Biserial Correlation for Attributes with P-Value < 0.05')

# Show the plot
plt.show()


In [None]:
# Filter the attributes with p-value lower than 0.05
filtered_attributes = correlation_p_df_category[correlation_p_df_category['P-Value'] < 0.05]['Attribute']

plt.close()
plt.figure(figsize=(15, 5))
# Create a bar chart plot for each filtered attribute
for attribute in filtered_attributes:
    # Get the correlation value for the attribute
    correlation = correlation_p_df_category.loc[correlation_p_df_category['Attribute'] == attribute, 'Correlation'].values[0]

    # Create a bar plot
    plt.bar(attribute, correlation)

# Set the labels and title
plt.xlabel('Attribute')
plt.xticks(rotation=45)
plt.ylabel('Logarithmic Chi-squared Correlation')
plt.title('Logarithmic Chi-squared Correlation for Attributes with P-Value < 0.05')
plt.yscale('log')
# Show the plot
plt.show()


## Regresie Logistica

In [None]:
def create_model(df, scaler, state):
    print("Creating model...")
    # Create the logistic regression model
    model = LogisticRegression(max_iter=df.shape[0])
    # Create an instance of LabelEncoder
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Convert categorical variables to numerical labels
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])


    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # # Apply the scaler if provided
    if scaler is not None:
        scaler = scaler()
        X = scaler.fit_transform(X)


    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=state)

    # Fit the model to the data
    model.fit(X_train, y_train)

    return model, X_test, y_test


In [None]:

model_simple, X_test, y_test = create_model(df, None, 42)
# Predict the labels for the test set
y_pred = model_simple.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

print(f'Test accuracy: {accuracy}')

In [None]:


average_accuracy = [[], [], [], []]
average_precision = [[], [], [], []]
average_recall = [[], [], [], []]
average_f1 = [[], [], [], []]
for state in RANDOM_STATES:
    print(f"Random state: {state}")
    for i, scaler in enumerate(SCALERS):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
        model, X_test, y_test = create_model(df, scaler, state)
        # Predict the labels for the test set
        y_pred = model.predict(X_test)

        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f'Test accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1 score: {f1}\n')

        average_accuracy[i].append(accuracy)
        average_precision[i].append(precision)
        average_recall[i].append(recall)
        average_f1[i].append(f1)




In [None]:

for i, scaler in enumerate(SCALERS):
    print(f"Scaler: {scaler.__name__ if scaler is not None else None}")

    print(f'Variance of accuracy: {np.var(average_accuracy[i])}')
    print(f'Variance of precision: {np.var(average_precision[i])}')
    print(f'Variance of recall: {np.var(average_recall[i])}')
    print(f'Variance of F1 score: {np.var(average_f1[i])}\n')

    print(f'Average accuracy: {np.mean(average_accuracy[i])}')
    print(f'Average precision: {np.mean(average_precision[i])}')
    print(f'Average recall: {np.mean(average_recall[i])}')
    print(f'Average F1 score: {np.mean(average_f1[i])}\n')

In [None]:
from scipy.special import expit
class MyLogisticRegression:
    def __init__(self, lr=.01, epochs_no=100):
        self.lr = lr
        self.epochs_no = epochs_no
        self.W = None

    def nll(self, Y, T):
        return -np.sum(T * np.log(Y) + (1 - T) * np.log(1 - Y))

    def train(self, X, T):
        (N, D) = X.shape
        X_hat = np.concatenate([X, np.ones((N, 1))], axis=1)
        W = np.random.randn((D+1))

        for _ in range(self.epochs_no):
            W = W - X_hat.T @ (expit(X_hat @ W) - T) * self.lr / N
        self.W =  W


    def predict(self, X):
        y = expit(np.concatenate([X, np.ones((X.shape[0], 1))], axis=1) @ self.W)
        for i in range(len(y)):
            if y[i] >= 0.5:
                y[i] = 1
            else:
                y[i] = 0
        return y

In [None]:
def create_my_model(df, scaler, state):
    print("Creating model...")
    # Create the logistic regression model
    model = MyLogisticRegression(lr=.01, epochs_no=1000)
    # Create an instance of LabelEncoder
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Convert categorical variables to numerical labels
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])


    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # # Apply the scaler if provided
    if scaler is not None:
        scaler = scaler()
        X = scaler.fit_transform(X)


    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=state)

    # Fit the model to the data
    model.train(X_train, y_train)

    return model, X_test, y_test

In [None]:
average_accuracy = [[], [], [], []]
average_precision = [[], [], [], []]
average_recall = [[], [], [], []]
average_f1 = [[], [], [], []]
for state in [42]:
    print(f"Random state: {state}")
    for i, scaler in enumerate(SCALERS):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
        model, X_test, y_test = create_my_model(df, scaler, state)
        # Predict the labels for the test set
        y_pred = model.predict(X_test)
        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f'Test accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1 score: {f1}\n')

        average_accuracy[i].append(accuracy)
        average_precision[i].append(precision)
        average_recall[i].append(recall)
        average_f1[i].append(f1)




In [None]:

for i, scaler in enumerate(SCALERS):
    print(f"Scaler: {scaler.__name__ if scaler is not None else None}")

    print(f'Variance of accuracy: {np.var(average_accuracy[i])}')
    print(f'Variance of precision: {np.var(average_precision[i])}')
    print(f'Variance of recall: {np.var(average_recall[i])}')
    print(f'Variance of F1 score: {np.var(average_f1[i])}\n')

    print(f'Average accuracy: {np.mean(average_accuracy[i])}')
    print(f'Average precision: {np.mean(average_precision[i])}')
    print(f'Average recall: {np.mean(average_recall[i])}')
    print(f'Average F1 score: {np.mean(average_f1[i])}\n')

## Arbori de decizize

### Model sklearn

In [None]:
from sklearn.tree import DecisionTreeClassifier

def create_decision_tree_model(df, scaler, state, depth):
    print("Creating model...")
    # Create the logistic regression model



    # Create an instance of LabelEncoder
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Convert categorical variables to numerical labels
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])


    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # # Apply the scaler if provided
    if scaler is not None:
        scaler = scaler()
        X = scaler.fit_transform(X)


    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=state)

    # Create an instance of DecisionTreeClassifier
    clf = DecisionTreeClassifier(random_state=state, max_depth=depth)

    # Fit the model to the data
    clf.fit(X_train, y_train)

    return clf, X_test, y_test


In [None]:
model, X_test, y_test = create_decision_tree_model(df, None, 42, 6)
# Predict the labels for the test set
y_pred = model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

print(f'Test accuracy: {accuracy}')
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Test accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}\n')

In [None]:
average_accuracy = [[], [], [], []]
average_precision = [[], [], [], []]
average_recall = [[], [], [], []]
average_f1 = [[], [], [], []]
for state in [42]:
    print(f"Random state: {state}")
    for i, scaler in enumerate(SCALERS):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
        model, X_test, y_test = create_decision_tree_model(df, scaler, state)
        # Predict the labels for the test set
        y_pred = model.predict(X_test)
        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f'Test accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1 score: {f1}\n')

        average_accuracy[i].append(accuracy)
        average_precision[i].append(precision)
        average_recall[i].append(recall)
        average_f1[i].append(f1)

### My decision tree

In [28]:
from collections import Counter
from copy import deepcopy
import csv
import os

class Node:
    """ Representation for a node from the decision tree """
    def __init__(self, label):
        """
            for non-leafs it is the name of the attribute
            for leafs it is the class
        """
        self.label = label

        # Dictionary of (attribute value, nodes)
        self.children = {}

    def display(self, indent = ""):
        print(indent + (self.label + ":" if self.children else "<" + self.label + ">"))
        indent += "   "
        if self.children:
            for key, value in self.children.items():
                print(indent + ":" + key)
                value.display(indent + "   ")


def getDataSet():
    """ Reads a dataset

    Args:
        dataSetName (str): Name for the dataset

    Returns:
        A tuple containing (classes, attributes, examples):
        classes (set): the classes that are found in the dataset
        attributes (list of strings): the attributes for the dataset
        examples (list of dictionaries): one example contains an entry as
            (attribute name, attribute value)
    """

    dataset_file = "dataset.csv"
    from os import path


    f_in = open(dataset_file, 'r')
    csv_reader = csv.reader(f_in, delimiter=",")

    # Read the header row
    row = next(csv_reader)

    # The last element represents the class
    attributeNames = row[:-1]

    examples = []
    classes = set()

    for row in csv_reader:
        *attributes, label = row
        classes.add(label)
        example = dict(zip(attributeNames, attributes))
        example[TARGET] = label
        examples.append(example)

    f_in.close()
    return classes, attributeNames, examples

def mostFrequentClass(X):
    # TODO 1a
    rez = Counter(list(map(lambda x: x[TARGET], X)))
    max_ap = 0
    for i in rez:
        if rez[i] > max_ap:
            max_ap = rez[i]
            cls_max = i
    return cls_max


def entropy(X):
    entropy = 0
    rez = Counter(list(map(lambda x: x[TARGET], X)))
    total = 0
    for i in rez:
        total += rez[i]
    for c in rez:
        if(rez[c] > 0):
            entropy += rez[c]/total * math.log2(rez[c]/total)
    return -entropy

def gain(X, a):
    # TODO 2b
    rez = Counter(list(map(lambda x: x[TARGET], X)))
    total = 0
    for i in rez:
        total += rez[i]
    sum = 0
    rez = Counter(list(map(lambda x: x[a], X)))
    for i in rez:
        Xij = list(filter(lambda x: x[a] == i, X))
        sum += len(Xij)/total * entropy(Xij)

    return entropy(X) - sum

def get_max_atr(X, list_atr):
    scores = list(map(lambda x: gain(X, x), list_atr))
    index_max = scores.index(max(scores))
    return list_atr[index_max]


def id3(X, A, d = 6):
    # TODO 2c
    # rez = Counter(list(map(lambda x: x[TARGET], X)))
    # if len(rez) == 1:
    #     keys = list(X[0].keys())
    #     key = keys[0]
    #     return Node(key)
    if d == 0 or len(A) == 0:
        return Node(mostFrequentClass(X))
    elif d > 0:
        atr = get_max_atr(X, A)
        rez = Counter(list(map(lambda x: x[atr], X)))
        A_new =  list(filter(lambda x: x != atr, A))
        children = {}
        for i in rez:
            X_new = list(filter(lambda x: x[atr] == i, X))
            children[i] = id3(X_new, A_new, d - 1)
        node = Node(atr)
        node.children = children
        return node

    return Node("")

def evaluate(tree, example):
    '''
    Functia intoarce clasa prezisa de arborele `tree` pentru exemplul `example`
    '''
    # TODO 2d
    node = tree
    while len(node.children) > 0:
        node = node.children[example[node.label]]
    return node.label

In [29]:
classes, attributes, examples = getDataSet()
print(classes)
print(attributes)
tree = id3(examples, attributes, 6)


{'TRUE', 'FALSE'}
['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']


In [56]:
y_pred = []
tree.display()

for i in examples:
    y_pred.append(evaluate(tree, i))
y_values = list(map(lambda x: x[TARGET], examples))
print(y_values)
print(y_pred)
accuracy = accuracy_score(y_values, y_pred)
precision = precision_score(y_values, y_pred)
recall = recall_score(y_values, y_pred)
f1 = f1_score(y_values, y_pred)

print(f'Test accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}\n')

NameError: name 'tree' is not defined

In [4]:
import numpy as np

class MyDecisionTree:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.tree = None

    def gini_index(self, X):
        gini = 1
        total = len(X)
        classes = Counter(X)
        for c in classes:
            p = classes[c] / total
            gini -= p ** 2
        return gini

    def get_best_attribute(self, X, y):
        best_attribute = None
        best_gini_index = float('inf')
        for attribute in X.columns:
            gini_index = 0
            attribute_values = X[attribute].unique()
            for value in attribute_values:
                subset = y[X[attribute] == value]
                gini_index += len(subset) / len(y) * self.gini_index(subset)
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_attribute = attribute
        return best_attribute

    def id3(self, X, y, depth):
        if depth == 0 or len(y.unique()) == 1:
            return Counter(y).most_common(1)[0][0]
        if len(X.columns) == 0:
            return Counter(y).most_common(1)[0][0]
        if depth > self.max_depth:
            return Counter(y).most_common(1)[0][0]
        best_attribute = self.get_best_attribute(X, y)
        tree = {best_attribute: {}}
        attribute_values = X[best_attribute].unique()
        for value in attribute_values:
            subset_X = X[X[best_attribute] == value].drop(columns=best_attribute)
            subset_y = y[X[best_attribute] == value]
            tree[best_attribute][value] = self.id3(subset_X, subset_y, depth + 1)
        return tree

    def fit(self, X_train, y_train):
        self.tree = self.id3(X_train, y_train, 0)

    def predict(self, X_test):
        if isinstance(X_test, np.ndarray):
            X_test = pd.DataFrame(X_test)
        predictions = []
        for _, instance in X_test.iterrows():
            predictions.append(self.traverse_tree(instance, self.tree))
        return predictions

    def traverse_tree(self, instance, tree):
        if not isinstance(tree, dict):
            return tree
        attribute = next(iter(tree))
        value = instance[attribute]
        if value in tree[attribute]:
            subtree = tree[attribute][value]
        else:
            return None
        return self.traverse_tree(instance, subtree)


In [68]:
class MyDecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def gini_index(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        gini = 1 - np.sum(probabilities**2)
        return gini

    def get_best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None

        num_classes = len(np.unique(y))
        if num_classes == 1:
            return None, None

        current_gini = self.gini_index(y)
        best_gini = float('inf')
        best_index = None
        best_value = None

        for col in range(n):
            values = np.unique(X[:, col])
            for value in values:
                left_mask = X[:, col] <= value
                right_mask = ~left_mask

                left_gini = self.gini_index(y[left_mask])
                right_gini = self.gini_index(y[right_mask])

                weighted_gini = (len(y[left_mask]) / len(y)) * left_gini + (len(y[right_mask]) / len(y)) * right_gini

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_index = col
                    best_value = value

        return best_index, best_value

    def split(self, X, y, index, value):
        mask = X[:, index] <= value
        return X[mask], y[mask], X[~mask], y[~mask]

    def id3(self, X, y, depth):
        if depth == 0 or len(np.unique(y)) == 1:
            return np.argmax(np.bincount(y))

        index, value = self.get_best_split(X, y)

        if index is None:
            return np.argmax(np.bincount(y))

        X_left, y_left, X_right, y_right = self.split(X, y, index, value)

        node = {}
        node['index'] = index
        node['value'] = value
        node['left'] = self.id3(X_left, y_left, depth - 1)
        node['right'] = self.id3(X_right, y_right, depth - 1)

        return node

    def fit(self, X_train, y_train):
        if isinstance(X_train, pd.DataFrame):
            X = X_train.values
        else:
            X = X_train
        if isinstance(y_train, pd.DataFrame):
            y = y_train.values
        else:
            y = y_train
        self.tree = self.id3(X, y, self.max_depth)

    def predict_instance(self, instance, tree):
        if type(tree) != dict:
            return tree
        if 'value' not in tree:
            return tree

        value = instance[tree['index']]

        if value <= tree['value']:
            return self.predict_instance(instance, tree['left'])
        else:
            return self.predict_instance(instance, tree['right'])

    def predict(self, X_test):
        if isinstance(X_test, pd.DataFrame):
            X = X_test.values
        else:
            X = X_test
        predictions = [self.predict_instance(instance, self.tree) for instance in X]
        return np.array(predictions)




In [66]:
def create_my_tree(df, scaler, state, depth):
    print("Creating model...")

    # Create an instance of LabelEncoder
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Convert categorical variables to numerical labels
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])


    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # Apply the scaler if provided
    if scaler is not None:
        scaler = scaler()
        X = scaler.fit_transform(X)


    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=state)
    # Create the Decision Tree model
    model = MyDecisionTree(max_depth=depth)
    # Fit the model to the data
    model.fit(X_train, y_train)

    return model, X_test, y_test

In [67]:


average_accuracy = [[], [], [], []]
average_precision = [[], [], [], []]
average_recall = [[], [], [], []]
average_f1 = [[], [], [], []]
for state in RANDOM_STATES:
    print(f"Random state: {state}")
    for i, scaler in enumerate(SCALERS):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
        for depth in range(3,7):
            print(f"Depth: {depth}")
            model, X_test, y_test = create_my_tree(df, scaler, state, depth)
            # Predict the labels for the test set
            y_pred = model.predict(X_test)
            # Calculate the accuracy score
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            print(f'Test accuracy: {accuracy}')
            print(f'Precision: {precision}')
            print(f'Recall: {recall}')
            print(f'F1 score: {f1}\n')

            average_accuracy[i].append(accuracy)
            average_precision[i].append(precision)
            average_recall[i].append(recall)
            average_f1[i].append(f1)




Random state: 0
Scaler: None
Depth: 3
Creating model...
Test accuracy: 0.8852392538523925
Precision: 0.683377308707124
Recall: 0.6137440758293838
F1 score: 0.6466916354556804

Depth: 4
Creating model...
Test accuracy: 0.8884833738848338
Precision: 0.7194029850746269
Recall: 0.5710900473933649
F1 score: 0.6367239101717305

Depth: 5
Creating model...
Test accuracy: 0.8880778588807786
Precision: 0.7354838709677419
Recall: 0.5402843601895735
F1 score: 0.6229508196721312

Depth: 6
Creating model...
Test accuracy: 0.8917274939172749
Precision: 0.7540983606557377
Recall: 0.5450236966824644
F1 score: 0.6327372764786794

Scaler: MinMaxScaler
Depth: 3
Creating model...


AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [34]:
for i, scaler in enumerate(SCALERS):
    print(f"Scaler: {scaler.__name__ if scaler is not None else None}")

    print(f'Variance of accuracy: {np.var(average_accuracy[i])}')
    print(f'Variance of precision: {np.var(average_precision[i])}')
    print(f'Variance of recall: {np.var(average_recall[i])}')
    print(f'Variance of F1 score: {np.var(average_f1[i])}\n')

    print(f'Average accuracy: {np.mean(average_accuracy[i])}')
    print(f'Average precision: {np.mean(average_precision[i])}')
    print(f'Average recall: {np.mean(average_recall[i])}')
    print(f'Average F1 score: {np.mean(average_f1[i])}\n')

Scaler: None
Variance of accuracy: 8.060309585874805e-05
Variance of precision: 0.0
Variance of recall: 0.0
Variance of F1 score: 0.0

Average accuracy: 0.8439578264395783
Average precision: 0.0
Average recall: 0.0
Average F1 score: 0.0

Scaler: MinMaxScaler
Variance of accuracy: 8.060309585874805e-05
Variance of precision: 0.0
Variance of recall: 0.0
Variance of F1 score: 0.0

Average accuracy: 0.8439578264395783
Average precision: 0.0
Average recall: 0.0
Average F1 score: 0.0

Scaler: StandardScaler
Variance of accuracy: 8.060309585874805e-05
Variance of precision: 0.0
Variance of recall: 0.0
Variance of F1 score: 0.0

Average accuracy: 0.8439578264395783
Average precision: 0.0
Average recall: 0.0
Average F1 score: 0.0

Scaler: RobustScaler
Variance of accuracy: 8.060309585874805e-05
Variance of precision: 0.0
Variance of recall: 0.0
Variance of F1 score: 0.0

Average accuracy: 0.8439578264395783
Average precision: 0.0
Average recall: 0.0
Average F1 score: 0.0



## Final