In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import math
import os
import pickle
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score

from scipy.stats import pointbiserialr, chi2_contingency

In [2]:
# Label column that the classifiers in this notebook predict.
TARGET = "Revenue"

# Dataset columns grouped by the kind of value their names suggest.
ATT_INT = ["Administrative", "Informational", "ProductRelated"]
ATT_INT_CATEGORY = ["OperatingSystems", "Browser", "Region", "TrafficType"]
ATT_FLOAT = [
    "Administrative_Duration",
    "Informational_Duration",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
    "SpecialDay",
]
ATT_STRING = ["Month", "VisitorType"]
ATT_BOOL = ["Weekend", TARGET]
# Same as ATT_BOOL with the label column removed.
ATT_BOOL_NO_TARGET = [name for name in ATT_BOOL if name != TARGET]

# Seeds used for the repeated train/test splits.
RANDOM_STATES = [0, 1, 5, 7, 13, 23, 29, 32, 37, 42]

# Scaler classes to compare (not instances); None means "no scaling".
SCALERS = [
    None,
    MinMaxScaler,
    StandardScaler,
    RobustScaler,
]

In [3]:
# Load the dataset; `df` is read as a module-level name by run_logistic.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
def run_logistic(states, scalers, constructor, filename):
    """Score a model constructor for every (random state, scaler) pair.

    Results are cached with pickle: when ``filename`` names an existing file
    the scores are loaded from it and nothing is trained; otherwise the scores
    are computed and, if ``filename`` is not None, written to that file.

    Args:
        states: Iterable of random states, passed one by one to ``constructor``.
        scalers: Iterable of scaler classes (or None for "no scaling"); each
            entry is passed to ``constructor`` and gets its own result bucket.
        constructor: Callable ``(df, scaler, state) -> (model, X_test, y_test)``
            where ``model`` exposes ``predict``. It is called with the
            module-level ``df``.
        filename: Path of the pickle cache, or None to disable caching.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Each element is a list
        with one inner list per scaler, holding one score per random state
        (the lists are named ``average_*`` but contain the per-run scores).
    """
    if filename is not None and os.path.exists(filename):
        print(f"Loading data from {filename}")

        # NOTE: pickle.load can execute arbitrary code - only point this at
        # cache files written by this notebook.
        with open(filename, "rb") as f:
            (
                average_accuracy,
                average_precision,
                average_recall,
                average_f1,
            ) = pickle.load(f)
        return average_accuracy, average_precision, average_recall, average_f1

    # Materialise once so the number of buckets always matches the number of
    # scalers (the original hard-coded four buckets).
    scalers = list(scalers)
    average_accuracy = [[] for _ in scalers]
    average_precision = [[] for _ in scalers]
    average_recall = [[] for _ in scalers]
    average_f1 = [[] for _ in scalers]

    for state in states:
        print(f"Random state: {state}")
        for i, scaler in enumerate(scalers):
            print(f"Scaler: {scaler.__name__ if scaler is not None else None}")
            model, X_test, y_test = constructor(df, scaler, state)
            # Predict the labels for the test set
            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            print(f"Test accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 score: {f1}\n")

            average_accuracy[i].append(accuracy)
            average_precision[i].append(precision)
            average_recall[i].append(recall)
            average_f1[i].append(f1)

    # Only write the cache when a path was given; previously filename=None
    # skipped the cache read but then crashed here in open(None, "wb").
    if filename is not None:
        with open(filename, "wb") as f:
            pickle.dump(
                (average_accuracy, average_precision, average_recall, average_f1), f
            )
    return average_accuracy, average_precision, average_recall, average_f1

In [5]:
def calculate_average_variance(
    average_accuracy, average_precision, average_recall, average_f1, scalers=None
):
    """Print and return the mean and variance of each metric, per scaler.

    Args:
        average_accuracy: List indexed by scaler position; each entry is the
            list of accuracy scores collected for that scaler.
        average_precision: Same layout, precision scores.
        average_recall: Same layout, recall scores.
        average_f1: Same layout, F1 scores.
        scalers: Scaler classes (or None entries) in the same order as the
            score lists; used for the printed labels and to decide how many
            entries to summarise. Defaults to the module-level ``SCALERS``.

    Returns:
        A list with one 8-tuple per scaler:
        ``(mean_accuracy, mean_precision, mean_recall, mean_f1,
        var_accuracy, var_precision, var_recall, var_f1)``.
        Variances are ``np.var`` with its default arguments.
    """
    if scalers is None:
        scalers = SCALERS

    # (label used in the printed report, per-scaler score lists)
    metrics = (
        ("accuracy", average_accuracy),
        ("precision", average_precision),
        ("recall", average_recall),
        ("F1 score", average_f1),
    )

    avgs_variances = []
    for i, scaler in enumerate(scalers):
        print(f"Scaler: {scaler.__name__ if scaler is not None else None}")

        variances = [np.var(scores[i]) for _, scores in metrics]
        means = [np.mean(scores[i]) for _, scores in metrics]

        for (label, _), value in zip(metrics, variances):
            print(f"Variance of {label}: {value}")
        print()  # blank line between the variance and average sections

        for (label, _), value in zip(metrics, means):
            print(f"Average {label}: {value}")
        print()

        avgs_variances.append((*means, *variances))
    return avgs_variances

In [6]:
def create_model(df, scaler, state):
    """Fit scikit-learn's LogisticRegression on an 80/20 split of ``df``.

    Args:
        df: DataFrame containing the feature columns and the ``TARGET`` column.
            It is copied, not modified.
        scaler: Scaler class (not an instance) to apply to the features, or
            None for no scaling.
        state: ``random_state`` forwarded to ``train_test_split``.

    Returns:
        ``(model, X_test, y_test)``: the fitted model plus the held-out
        features and labels. ``X_test`` is already scaled when a scaler is
        given.
    """
    print("Creating model...")
    # Iteration cap is set to the number of rows in df.
    model = LogisticRegression(max_iter=df.shape[0])
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Every column is passed through LabelEncoder, not only the categorical ones.
    # NOTE(review): for numeric columns this replaces values with the rank of
    # each distinct value - confirm this is intended.
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])

    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=state
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows (previously it was fitted on all rows,
    # leaking test-set statistics into training).
    if scaler is not None:
        fitted_scaler = scaler()
        X_train = fitted_scaler.fit_transform(X_train)
        X_test = fitted_scaler.transform(X_test)

    # Fit the model to the data
    model.fit(X_train, y_train)

    return model, X_test, y_test

In [7]:
# Baseline: one unscaled scikit-learn logistic regression on the split seeded with 42.
model_simple, X_test, y_test = create_model(df=df, scaler=None, state=42)

# Score the predictions on the held-out rows returned by create_model.
y_pred = model_simple.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test accuracy: {accuracy}")

Creating model...
Test accuracy: 0.8807785888077859


In [8]:
# Score create_model for every seed and scaler; scores are cached in logistic.pkl.
(
    average_accuracy,
    average_precision,
    average_recall,
    average_f1,
) = run_logistic(
    states=RANDOM_STATES,
    scalers=SCALERS,
    constructor=create_model,
    filename="logistic.pkl",
)

Loading data from logistic.pkl


In [9]:
# Summarise the scores per scaler; the returned list is the cell's displayed value.
calculate_average_variance(
    average_accuracy=average_accuracy,
    average_precision=average_precision,
    average_recall=average_recall,
    average_f1=average_f1,
)

Scaler: None
Variance of accuracy: 4.5787347010995934e-05
Variance of precision: 0.0005192577084399036
Variance of recall: 0.00038491417123306997
Variance of F1 score: 0.00019789123622821324

Average accuracy: 0.8963503649635036
Average precision: 0.6924849532392865
Average recall: 0.6052713318709566
Average F1 score: 0.6455573528748052

Scaler: MinMaxScaler
Variance of accuracy: 4.314311292129322e-05
Variance of precision: 0.0004166265028214939
Variance of recall: 0.0002322291488033874
Variance of F1 score: 0.00013234754447677942

Average accuracy: 0.8952960259529602
Average precision: 0.6930091221882309
Average recall: 0.5916614299187041
Average F1 score: 0.6380548969181421

Scaler: StandardScaler
Variance of accuracy: 3.9933196911903204e-05
Variance of precision: 0.0003755323441311471
Variance of recall: 0.0002438391575655118
Variance of F1 score: 0.00010917547439038542

Average accuracy: 0.8952149229521492
Average precision: 0.6924595968063515
Average recall: 0.591951497715859
Aver

[(0.8963503649635036,
  0.6924849532392865,
  0.6052713318709566,
  0.6455573528748052,
  4.5787347010995934e-05,
  0.0005192577084399036,
  0.00038491417123306997,
  0.00019789123622821324),
 (0.8952960259529602,
  0.6930091221882309,
  0.5916614299187041,
  0.6380548969181421,
  4.314311292129322e-05,
  0.0004166265028214939,
  0.0002322291488033874,
  0.00013234754447677942),
 (0.8952149229521492,
  0.6924595968063515,
  0.591951497715859,
  0.6379686959479371,
  3.9933196911903204e-05,
  0.0003755323441311471,
  0.0002438391575655118,
  0.00010917547439038542),
 (0.8949310624493105,
  0.6909772230709689,
  0.5926844755277483,
  0.637648150286428,
  5.013684898068731e-05,
  0.0005074785469591962,
  0.00037943480175537227,
  0.00017410182285837458)]

In [10]:
from scipy.special import expit


class MyLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The bias is handled by appending a constant-one column to the inputs, so
    after training ``W`` holds the ``D`` feature weights followed by the bias.

    Args:
        lr: Learning rate of each gradient step.
        epochs_no: Number of full-batch gradient steps performed by ``train``.
        random_state: Optional seed for the initial weights. When None the
            weights are drawn from NumPy's global random state (the original
            behaviour), so ``np.random.seed`` still controls them.
    """

    def __init__(self, lr=0.01, epochs_no=100, random_state=None):
        self.lr = lr
        self.epochs_no = epochs_no
        self.random_state = random_state
        self.W = None  # set by train()

    @staticmethod
    def _with_bias_column(X):
        """Return ``X`` as a float array with a trailing column of ones."""
        X = np.asarray(X, dtype=float)
        return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

    def nll(self, Y, T):
        """Return the summed negative log-likelihood of targets ``T`` given probabilities ``Y``."""
        # Keep probabilities strictly inside (0, 1) so log() never yields
        # -inf / NaN when a prediction is exactly 0 or 1.
        eps = np.finfo(float).eps
        Y = np.clip(np.asarray(Y, dtype=float), eps, 1 - eps)
        T = np.asarray(T, dtype=float)
        return -np.sum(T * np.log(Y) + (1 - T) * np.log(1 - Y))

    def train(self, X, T):
        """Fit the weights on features ``X`` of shape (N, D) and 0/1 targets ``T`` of length N."""
        X_hat = self._with_bias_column(X)
        # Plain ndarray targets keep the update in pure NumPy even when the
        # caller passes a pandas Series.
        T = np.asarray(T, dtype=float).ravel()
        N = X_hat.shape[0]

        if self.random_state is None:
            rng = np.random  # legacy global state, as in the original code
        else:
            rng = np.random.RandomState(self.random_state)
        W = rng.randn(X_hat.shape[1])

        for _ in range(self.epochs_no):
            # One gradient step on the mean negative log-likelihood.
            W = W - X_hat.T @ (expit(X_hat @ W) - T) * self.lr / N
        self.W = W

    def predict(self, X):
        """Return an array of 0.0 / 1.0 labels, using a 0.5 probability threshold."""
        if self.W is None:
            raise RuntimeError("MyLogisticRegression.predict() called before train()")
        probabilities = expit(self._with_bias_column(X) @ self.W)
        # Vectorised threshold; float dtype matches what the loop version returned.
        return (probabilities >= 0.5).astype(float)

In [11]:
def create_my_model(df, scaler, state):
    """Train ``MyLogisticRegression`` on an 80/20 split of ``df``.

    Args:
        df: DataFrame containing the feature columns and the ``TARGET`` column.
            It is copied, not modified.
        scaler: Scaler class (not an instance) to apply to the features, or
            None for no scaling.
        state: ``random_state`` forwarded to ``train_test_split``. When it is
            an integer it also seeds NumPy's global random state so the
            model's random initial weights are reproducible.

    Returns:
        ``(model, X_test, y_test)``: the trained model plus the held-out
        features and labels. ``X_test`` is already scaled when a scaler is
        given.
    """
    print("Creating model...")
    model = MyLogisticRegression(lr=0.01, epochs_no=1000)
    label_encoder = LabelEncoder()

    df_copy = df.copy()

    # Every column is passed through LabelEncoder, not only the categorical ones.
    # NOTE(review): for numeric columns this replaces values with the rank of
    # each distinct value - confirm this is intended.
    for column in df_copy.columns:
        df_copy[column] = label_encoder.fit_transform(df_copy[column])

    # Split the dataset into features (X) and target variable (y)
    X = df_copy.drop(TARGET, axis=1)
    y = df_copy[TARGET]

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=state
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows (previously it was fitted on all rows,
    # leaking test-set statistics into training).
    if scaler is not None:
        fitted_scaler = scaler()
        X_train = fitted_scaler.fit_transform(X_train)
        X_test = fitted_scaler.transform(X_test)

    # The model draws its initial weights from NumPy's global random state;
    # seed it with the split seed so a given `state` always gives the same model.
    if isinstance(state, (int, np.integer)):
        np.random.seed(state)

    # Fit the model to the data
    model.train(X_train, y_train)

    return model, X_test, y_test

In [12]:
# Score create_my_model for every seed and scaler; scores are cached in my_logistic.pkl.
# The result names are reused, so the scikit-learn scores bound earlier are replaced.
(
    average_accuracy,
    average_precision,
    average_recall,
    average_f1,
) = run_logistic(
    states=RANDOM_STATES,
    scalers=SCALERS,
    constructor=create_my_model,
    filename="my_logistic.pkl",
)

Loading data from my_logistic.pkl


In [14]:
# Summarise the custom model's scores per scaler; the returned list is the cell's displayed value.
calculate_average_variance(
    average_accuracy=average_accuracy,
    average_precision=average_precision,
    average_recall=average_recall,
    average_f1=average_f1,
)

Scaler: None
Variance of accuracy: 0.006343966508999275
Variance of precision: 0.038790129625198115
Variance of recall: 0.13611836836165264
Variance of F1 score: 0.07257256014119179

Average accuracy: 0.8302919708029197
Average precision: 0.6453768587044486
Average recall: 0.4748927568689555
Average F1 score: 0.38064429543706707

Scaler: MinMaxScaler
Variance of accuracy: 0.00026777145661120987
Variance of precision: 0.01846004706894375
Variance of recall: 0.010870871790115366
Variance of F1 score: 0.014734140804535778

Average accuracy: 0.8393349553933496
Average precision: 0.444199342523863
Average recall: 0.17569543811027605
Average F1 score: 0.24192508808478247

Scaler: StandardScaler
Variance of accuracy: 0.0005511797427989018
Variance of precision: 0.0029041635466063895
Variance of recall: 0.0049871481737227925
Variance of F1 score: 0.002231048857600137

Average accuracy: 0.8356447688564476
Average precision: 0.484015719119987
Average recall: 0.5989587112832753
Average F1 score: 

[(0.8302919708029197,
  0.6453768587044486,
  0.4748927568689555,
  0.38064429543706707,
  0.006343966508999275,
  0.038790129625198115,
  0.13611836836165264,
  0.07257256014119179),
 (0.8393349553933496,
  0.444199342523863,
  0.17569543811027605,
  0.24192508808478247,
  0.00026777145661120987,
  0.01846004706894375,
  0.010870871790115366,
  0.014734140804535778),
 (0.8356447688564476,
  0.484015719119987,
  0.5989587112832753,
  0.5322357724744308,
  0.0005511797427989018,
  0.0029041635466063895,
  0.0049871481737227925,
  0.002231048857600137),
 (0.8325628548256285,
  0.4075246755810563,
  0.6432284577000448,
  0.4921691560189584,
  0.0009443089438915888,
  0.032889410977153345,
  0.09993290048435524,
  0.05646958680854919)]