# Classification

In [1]:
# Run this code to make Jupyter print every
# printable statement and not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# To visualize the data
import matplotlib.pyplot as plt

# Generic libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Regression models
import sklearn
import scipy
from scipy.stats import *
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV #split the data into training and test
from sklearn.linear_model import LinearRegression #linear regression
from sklearn.preprocessing import PolynomialFeatures #for polynomial regression
from sklearn.metrics import r2_score, mean_squared_error

# 5-folds crossvalidation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, classification_report
from sklearn.pipeline import Pipeline

#classification
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier 

## Preprocessing

In [2]:
training_set = pd.read_csv("training_set_complete.csv")
test_set = pd.read_csv("test_set_complete.csv")
training_set = training_set.drop(columns=['Unnamed: 0'])
test_set = test_set.drop(columns=['Unnamed: 0'])
training_set['Revenue'] = training_set['Revenue'].astype(int)
test_set['Revenue'] = test_set['Revenue'].astype(int)
training_set['Revenue'].value_counts()
test_set['Revenue'].value_counts()
categorical_features = ['Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
       'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
       'OperatingSystems_1', 'OperatingSystems_2', 'OperatingSystems_3',
       'OperatingSystems_4', 'OperatingSystems_5', 'OperatingSystems_6',
       'OperatingSystems_7', 'OperatingSystems_8', 'Browser_1', 'Browser_2',
       'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7',
       'Browser_8', 'Browser_9', 'Browser_10', 'Browser_11', 'Browser_12',
       'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4',
       'Region_5', 'Region_6', 'Region_7', 'Region_8', 'Region_9',
       'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4',
       'TrafficType_5', 'TrafficType_6', 'TrafficType_7', 'TrafficType_8',
       'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12',
       'TrafficType_13', 'TrafficType_14', 'TrafficType_15', 'TrafficType_16',
       'TrafficType_17', 'TrafficType_18', 'TrafficType_19', 'TrafficType_20',
       'VisitorType_New_Visitor', 'VisitorType_Other',
       'VisitorType_Returning_Visitor', 'Weekend_False', 'Weekend_True']
training_set[categorical_features] = training_set[categorical_features].astype('category')
test_set[categorical_features] = test_set[categorical_features].astype('category')

0    7807
1    1441
Name: Revenue, dtype: int64

0    2615
1     467
Name: Revenue, dtype: int64

In [3]:
X_train_full = training_set.drop(columns=['Revenue'])
X_test_full = test_set.drop(columns=['Revenue'])
y_train = training_set['Revenue']
y_test = test_set['Revenue']

### Variance selection
We apply variance selection to remove all the features with a very low variance.

In [4]:
from sklearn.feature_selection import VarianceThreshold

threshold = 0.001  # Soglia di varianza desiderata
variance_selector = VarianceThreshold(threshold=threshold)

# Applicazione della selezione della varianza sul dataset

X_train = X_train_full.loc[:, variance_selector.fit(X_train_full).get_support()]
columns_remaining = X_train_full.columns.tolist()

X_test = X_test_full[columns_remaining]
print(X_train.columns.tolist())

['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep', 'OperatingSystems_1', 'OperatingSystems_2', 'OperatingSystems_3', 'OperatingSystems_4', 'OperatingSystems_6', 'OperatingSystems_8', 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_5', 'Region_6', 'Region_7', 'Region_8', 'Region_9', 'TrafficType_1', 'TrafficType_2', 'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7', 'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_13', 'TrafficType_15', 'TrafficType_19', 'TrafficType_20', 'VisitorType_New_Visitor', 'VisitorType_Other'

### Functions and algorithms

#### Feature selection: Forward stepwise selection
We chose this algorithm because it's more scalable on a big dataset.

In [29]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def get_evaluator(scorer):
    def evaluator(model, X, y, trained=False):
        if not trained:
            model = model.fit(X, y)
        score = scorer(model, X, y)
        return model, score
    return evaluator   

def get_cv_evaluator(scorer, cv=3):
    def evaluator(model, X, y, trained=False):            
        scores = cross_val_score(model, X, y, scoring=scorer, cv=cv)
        if not trained:
            model = model.fit(X, y)
        return model, np.mean(scores)
    
    return evaluator

def get_val_evaluator(scorer, val_size=0.1):
    def evaluator(model, X, y, trained=False):
        X_train_small, X_val, y_train_small, y_val = train_test_split(X, y, 
                                                                      test_size=val_size,
                                                                      random_state=0)
        
        if not trained:
            model = model.fit(X_train_small, y_train_small)
        score = scorer(model, X_val, y_val) 
        
        return model, score
    
    return evaluator


possible_models = {
    "LogisticRegression":LogisticRegression(solver="newton-cg", penalty='none',max_iter=1000),
    "LDA":LDA(),
    "QDA":QDA(),
    #"KNN":KNeighborsClassifier(n_neighbors=6),
    "LinearRegression":LinearRegression(fit_intercept=True),
}

def forward_selection(Xtrain_pd, ytrain, Xtest_pd, ytest,model_chosen,
                      candidates_evaluator, candidates_argbest, # Metric to be used at 2.b
                      subsets_evaluator, subsets_argbest,       # Metric to be used at 3
                      test_evaluator=None, test_argbest=None,
                      candidates_scorer_name=None,  # Name of 2. figure
                      subsets_scorer_name=None,     # Name of 3. figure
                      verbose=True, weight_step3=0):   
    test_evaluator = subsets_evaluator if not test_evaluator else test_evaluator
    test_argbest = subsets_argbest if not test_argbest else test_argbest
    
    # Global variable init
    # ====================
    num_features = Xtrain_pd.shape[-1]
    best_candidate_metric = []
    # subsets_* are lists containing one value for each Mk model (the best of the Mk candidates)
    subsets_test = []
    subsets_metric = []        # The best metric of each subset of dimension 'dim'
    subsets_best_features = [] # The best features combination in each subset of dimension 'dim'
    # A figure to keep track of candidates scores in each Mk subset
    num_evaluations = 0        # A conter to keep track of the total number of trials
    
    selected_features = []
    all_features = Xtrain_pd.columns
    
    
    # 1. Train M0
    # ===========
    model = DummyRegressor()
    # Compute (2.b) metrics
    model, score = candidates_evaluator(model, Xtrain_pd[[]], ytrain)
    best_candidate_metric.append(score)
    subsets_best_features.append([])
    # Compute metric for step 3.
    _, score = subsets_evaluator(model, Xtrain_pd[[]], ytrain, trained=True)
    subsets_metric.append(score)
    _, score = test_evaluator(model, Xtrain_pd[[]], ytrain, trained=True)
    subsets_test.append(score)
    
    
    for dim in range(num_features):
        candidate_metrics = [] # metrics for all the models with dim features
        candidate_models = []  # models with dim features
        
        remaining_features = all_features.difference(selected_features)
        
        # fit all the models with k features
        for new_column in remaining_features:
            Xtrain_sub = Xtrain_pd[selected_features+[new_column]].to_numpy()
            model = possible_models[model_chosen]
            #print(new_column)
            model, score = candidates_evaluator(model, Xtrain_sub, ytrain)
            candidate_models.append(model)
            candidate_metrics.append(score)
            num_evaluations += 1
            
        
        idx_best_candidate = candidates_argbest(candidate_metrics) # select the best Mk model
        selected_features.append(remaining_features[idx_best_candidate]) # Update selected feature
        best_candidate_metric.append(candidate_metrics[idx_best_candidate]) # Save best candidate features
        best_features = selected_features.copy()
        subsets_best_features.append(best_features)
        
        
        # Compute metric for the final step -> comparison of all the best models
        best_subset_model = candidate_models[idx_best_candidate] # save the best model
        best_subset_Xtrain = Xtrain_pd[best_features].to_numpy()
        best_subset_Xtest = Xtest_pd[best_features].to_numpy()
        _, score = test_evaluator(best_subset_model, best_subset_Xtest, ytest, trained=True)
        subsets_test.append(score) # computing the metrics for the test set
        _, score = subsets_evaluator(best_subset_model, best_subset_Xtrain, ytrain, trained=True)
        subsets_metric.append(score) #computing the metrics for the training set
        num_evaluations += weight_step3 
        
        if verbose:
            print("............")
            print("Best model (M{}) with {} features: {}".format(dim+1, dim+1, best_features))
            print("M{} subset score (3.): {}".format(dim+1, score))
        
    # choose the best candidates
    best_subset_idx = subsets_argbest(subsets_metric)
    best_features = subsets_best_features[best_subset_idx]
    
    if verbose:
        print("\n Best configuration has {} features".format(best_subset_idx))
        print("Features: {}".format(subsets_best_features[best_subset_idx]))
        print("Total number of trained models:", num_evaluations)
    
    # Complete the subsets_fig figure by plotting
    # a line connecting all best candidate score
    best_candidate_score_idx = candidates_argbest(best_candidate_metric)
    
    best_test_score_idx = test_argbest(subsets_test)
    return best_features
    

#### Accuracy

In [54]:
def accuracy(y_pred, y_true):
    return (y_pred == y_true).mean()

def calculate_sensitivity_specificity(confusion_matrix):
    # Extract values from the confusion matrix
    TN, FP, FN, TP = confusion_matrix.ravel()

    # Calculate Sensitivity (Recall)
    sensitivity = TP / (TP + FN)

    # Calculate Specificity
    specificity = TN / (TN + FP)

    return sensitivity, specificity

#### Evaluation metrics

In [53]:
from sklearn.metrics import confusion_matrix

def classification_metrics(model, model_name, Xtrain, ytrain, Xtest, ytest,cv):
    #sensitivity
    sensitivity_train = np.mean(np.absolute(cross_val_score(model, Xtrain, ytrain, cv=cv, scoring='recall', n_jobs=-1)))
    
    #specificity
    specificity_train = np.mean(np.absolute(cross_val_score(model, Xtrain, ytrain, cv=cv, scoring='precision', n_jobs=-1)))
    
    #accuracy
    accuracy_train = np.mean(np.absolute(cross_val_score(model, Xtrain, ytrain, cv=cv, scoring='accuracy', n_jobs=-1)))
    
    # building the confusion matrix for test performances
    model = model.fit(Xtrain,ytrain)
    ypred = model.predict(Xtest)
    cm = confusion_matrix(ytest.to_numpy(), np.array(ypred))
    sensitivity_test, specificity_test = calculate_sensitivity_specificity(cm)
    
    accuracy_test = accuracy(ypred, ytest)
                         
    # building a dataframe
    data = {
        'Model': [model_name + "_cv", model_name + "_test"],
        'Accuracy': [accuracy_train,accuracy_test],
        'Sensitivity': [sensitivity_train,sensitivity_test],
        'Specificity': [specificity_train,specificity_test],
    }
    return pd.DataFrame(data)                    

In [72]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def classification_metrics_empirical_threshold(model, model_name, X_train, y_train, X_test, y_test):
    
    model = model.fit(X_train, y_train)
    
    # Get predicted probabilities
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of positive class (class 1)
    
    # the mean will give the fraction of ones that we use for the empirical threshold
    threshold = y_train.mean()

    # Convert probabilities to binary predictions
    y_pred = (y_pred_proba > threshold).astype(int)

    # Evaluate the performance using various metrics
    accuracy = accuracy_score(y_test, y_pred)
    specificity = precision_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    data = {
        'Model': [model_name],
        'Accuracy': [accuracy],
        'Sensitivity': [sensitivity],
        'Specificity': [specificity],
    }
    return pd.DataFrame(data) 


## Logistic regression

### Scaling data
We scale the numerical features to avoid convergence problem with logistic regression.

In [8]:
# FACCIAMO QUESTO STEP PER EVITARE PROBLEMI DI CONVERGENZA NEL MODELLO
from sklearn.preprocessing import StandardScaler

# Assuming you have your feature data X
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']
categorical_features = ['Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 
                        'Month_Nov', 'Month_Oct', 'Month_Sep', 'OperatingSystems_1', 'OperatingSystems_2', 'OperatingSystems_3', 'OperatingSystems_4', 'OperatingSystems_6', 'OperatingSystems_8', 'Browser_1', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5', 'Browser_6', 'Browser_7', 'Browser_8', 'Browser_10', 'Browser_13', 'Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_5',
                        'Region_6', 'Region_7', 'Region_8', 'Region_9', 'TrafficType_1', 'TrafficType_2', 
                        'TrafficType_3', 'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_7', 
                        'TrafficType_8', 'TrafficType_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_13', 
                        'TrafficType_15', 'TrafficType_19', 'TrafficType_20', 'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Weekend_False', 'Weekend_True']
scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(X_train[numeric_features]))
test_scaled = pd.DataFrame(scaler.fit_transform(X_test[numeric_features]))
train_scaled.columns = X_train[numeric_features].columns
test_scaled.columns = X_test[numeric_features].columns
X_train_scaled = pd.concat([train_scaled,X_train[categorical_features]],axis=1)
X_test_scaled = pd.concat([test_scaled,X_test[categorical_features]],axis=1)

### Feature selection

In [31]:
cv = 5
features_logistic = forward_selection(X_train_scaled, y_train, X_test_scaled, y_test,"LogisticRegression",
                  get_evaluator(make_scorer(accuracy)), np.argmax, # 2.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # 3.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # test
                  candidates_scorer_name="Accuracy",
                  subsets_scorer_name="Accuracy (CV)",
                  verbose=False, weight_step3=cv)
features_logistic

............
Best model (M1) with 1 features: ['PageValues']
M1 subset score (3.): 0.8796491894815313
............
Best model (M2) with 2 features: ['PageValues', 'VisitorType_New_Visitor']
M2 subset score (3.): 0.8822444856971629
............
Best model (M3) with 3 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3']
M3 subset score (3.): 0.8828932512826511
............
Best model (M4) with 4 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10']
M4 subset score (3.): 0.8828931928142312
............
Best model (M5) with 5 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7']
M5 subset score (3.): 0.8830013593907591
............
Best model (M6) with 6 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8']
M6 subset score (3.): 0.8828932512826511
............
Best model (M7) with 7 features: ['PageValues', 'VisitorType_New_Visitor', 'Traf

............
Best model (M28) with 28 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region_5', 'Month_Aug', 'TrafficType_7', 'TrafficType_9', 'Region_2', 'Browser_13', 'OperatingSystems_6', 'Browser_4', 'VisitorType_Other', 'Browser_5', 'Region_6', 'TrafficType_15', 'Month_June', 'TrafficType_19', 'VisitorType_Returning_Visitor', 'Browser_10']
M28 subset score (3.): 0.883974566237411
............
Best model (M29) with 29 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region_5', 'Month_Aug', 'TrafficType_7', 'TrafficType_9', 'Region_2', 'Browser_13', 'OperatingSystems_6', 'Browser_4', 'VisitorType_Other', 'Browser_5', 'Region_6', 'TrafficType_15', 'Month_June', 'TrafficType_1

............
Best model (M41) with 41 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region_5', 'Month_Aug', 'TrafficType_7', 'TrafficType_9', 'Region_2', 'Browser_13', 'OperatingSystems_6', 'Browser_4', 'VisitorType_Other', 'Browser_5', 'Region_6', 'TrafficType_15', 'Month_June', 'TrafficType_19', 'VisitorType_Returning_Visitor', 'Browser_10', 'Month_Jul', 'Region_3', 'Region_7', 'TrafficType_20', 'OperatingSystems_4', 'OperatingSystems_1', 'TrafficType_4', 'TrafficType_5', 'Month_Oct', 'TrafficType_8', 'BounceRates', 'Month_Dec', 'ExitRates']
M41 subset score (3.): 0.8840824404718403
............
Best model (M42) with 42 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region

............
Best model (M51) with 51 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region_5', 'Month_Aug', 'TrafficType_7', 'TrafficType_9', 'Region_2', 'Browser_13', 'OperatingSystems_6', 'Browser_4', 'VisitorType_Other', 'Browser_5', 'Region_6', 'TrafficType_15', 'Month_June', 'TrafficType_19', 'VisitorType_Returning_Visitor', 'Browser_10', 'Month_Jul', 'Region_3', 'Region_7', 'TrafficType_20', 'OperatingSystems_4', 'OperatingSystems_1', 'TrafficType_4', 'TrafficType_5', 'Month_Oct', 'TrafficType_8', 'BounceRates', 'Month_Dec', 'ExitRates', 'Weekend_False', 'Month_Sep', 'TrafficType_1', 'Region_4', 'Weekend_True', 'TrafficType_6', 'Browser_2', 'Browser_6', 'OperatingSystems_2', 'Administrative_Duration']
M51 subset score (3.): 0.8830010085802407
............
Best model (M52) with 52 features: ['PageValues', 'VisitorType_New_Vi

............
Best model (M60) with 60 features: ['PageValues', 'VisitorType_New_Visitor', 'TrafficType_3', 'TrafficType_10', 'Browser_7', 'Region_8', 'TrafficType_11', 'Browser_3', 'Browser_8', 'Month_Feb', 'Browser_1', 'OperatingSystems_8', 'Region_5', 'Month_Aug', 'TrafficType_7', 'TrafficType_9', 'Region_2', 'Browser_13', 'OperatingSystems_6', 'Browser_4', 'VisitorType_Other', 'Browser_5', 'Region_6', 'TrafficType_15', 'Month_June', 'TrafficType_19', 'VisitorType_Returning_Visitor', 'Browser_10', 'Month_Jul', 'Region_3', 'Region_7', 'TrafficType_20', 'OperatingSystems_4', 'OperatingSystems_1', 'TrafficType_4', 'TrafficType_5', 'Month_Oct', 'TrafficType_8', 'BounceRates', 'Month_Dec', 'ExitRates', 'Weekend_False', 'Month_Sep', 'TrafficType_1', 'Region_4', 'Weekend_True', 'TrafficType_6', 'Browser_2', 'Browser_6', 'OperatingSystems_2', 'Administrative_Duration', 'OperatingSystems_3', 'SpecialDay', 'Region_1', 'Administrative', 'Region_9', 'ProductRelated_Duration', 'ProductRelated', '

### Evaluation Metrics

In [64]:
model = LogisticRegression(solver="newton-cg",penalty="none",max_iter=1000)
ypred = model.fit(X_train_scaled,y_train)
type(ypred)

sklearn.linear_model._logistic.LogisticRegression

In [65]:
model = LogisticRegression(solver="newton-cg",penalty="none",max_iter=1000)
classification_metrics(model, "LogisticRegression", X_train_scaled[features_logistic], y_train, X_test_scaled[features_logistic], y_test,5)

Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,LogisticRegression_cv,0.884515,0.365686,0.773116
1,LogisticRegression_test,0.891629,0.391863,0.98088


The sensitivity is very low, we try to improve it using the empirical threshold

In [74]:
classification_metrics_empirical_threshold(model, 'LogisticRegression', X_train_scaled[features_logistic], y_train, X_test_scaled[features_logistic], y_test)

Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,LogisticRegression,0.893251,0.67666,0.639676


# LDA

### Feature selection

In [33]:
cv = 5
features_LDA = forward_selection(X_train_scaled, y_train, X_test_scaled, y_test,"LDA",
                  get_evaluator(make_scorer(accuracy)), np.argmax, # 2.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # 3.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # test
                  candidates_scorer_name="Accuracy",
                  subsets_scorer_name="Accuracy (CV)",
                  verbose=True, weight_step3=cv)
features_LDA

............
Best model (M1) with 1 features: ['PageValues']
M1 subset score (3.): 0.8765136450674579
............
Best model (M2) with 2 features: ['PageValues', 'ExitRates']
M2 subset score (3.): 0.8793249236256268
............
Best model (M3) with 3 features: ['PageValues', 'ExitRates', 'BounceRates']
M3 subset score (3.): 0.8788923157879351
............
Best model (M4) with 4 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct']
M4 subset score (3.): 0.8790004238960432
............
Best model (M5) with 5 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other']
M5 subset score (3.): 0.879324865157207
............
Best model (M6) with 6 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5']
M6 subset score (3.): 0.8790005408328826
............
Best model (M7) with 7 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1']
M7 s

............
Best model (M28) with 28 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb', 'Region_3', 'Browser_4', 'Region_7', 'TrafficType_11', 'TrafficType_10', 'OperatingSystems_6', 'OperatingSystems_8', 'Region_5', 'Region_6', 'TrafficType_15', 'TrafficType_19', 'TrafficType_4', 'Browser_5', 'TrafficType_6']
M28 subset score (3.): 0.8804061216435473
............
Best model (M29) with 29 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb', 'Region_3', 'Browser_4', 'Region_7', 'TrafficType_11', 'TrafficType_10', 'OperatingSystems_6', 'OperatingSystems_8', 'Region_5', 'Region_6', 'TrafficType_15', 'TrafficType_19', 'TrafficType_4', 'Browser_5',

............
Best model (M42) with 42 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb', 'Region_3', 'Browser_4', 'Region_7', 'TrafficType_11', 'TrafficType_10', 'OperatingSystems_6', 'OperatingSystems_8', 'Region_5', 'Region_6', 'TrafficType_15', 'TrafficType_19', 'TrafficType_4', 'Browser_5', 'TrafficType_6', 'Month_Jul', 'TrafficType_7', 'TrafficType_9', 'Browser_13', 'Browser_10', 'Month_Mar', 'TrafficType_8', 'Region_2', 'Browser_2', 'Month_June', 'TrafficType_13', 'Region_4', 'Weekend_False', 'Weekend_True']
M42 subset score (3.): 0.8805143466884949
............
Best model (M43) with 43 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb',

............
Best model (M52) with 52 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb', 'Region_3', 'Browser_4', 'Region_7', 'TrafficType_11', 'TrafficType_10', 'OperatingSystems_6', 'OperatingSystems_8', 'Region_5', 'Region_6', 'TrafficType_15', 'TrafficType_19', 'TrafficType_4', 'Browser_5', 'TrafficType_6', 'Month_Jul', 'TrafficType_7', 'TrafficType_9', 'Browser_13', 'Browser_10', 'Month_Mar', 'TrafficType_8', 'Region_2', 'Browser_2', 'Month_June', 'TrafficType_13', 'Region_4', 'Weekend_False', 'Weekend_True', 'Browser_1', 'Browser_6', 'Region_8', 'VisitorType_New_Visitor', 'Informational_Duration', 'VisitorType_Returning_Visitor', 'OperatingSystems_2', 'Month_Dec', 'TrafficType_20', 'TrafficType_1']
M52 subset score (3.): 0.8802978965985997
............
Best model (M53) with 53 features: ['PageValues', 'ExitR

............
Best model (M61) with 61 features: ['PageValues', 'ExitRates', 'BounceRates', 'Month_Oct', 'VisitorType_Other', 'TrafficType_5', 'OperatingSystems_1', 'Browser_3', 'Browser_7', 'Browser_8', 'Month_Aug', 'Month_Sep', 'OperatingSystems_4', 'Month_Feb', 'Region_3', 'Browser_4', 'Region_7', 'TrafficType_11', 'TrafficType_10', 'OperatingSystems_6', 'OperatingSystems_8', 'Region_5', 'Region_6', 'TrafficType_15', 'TrafficType_19', 'TrafficType_4', 'Browser_5', 'TrafficType_6', 'Month_Jul', 'TrafficType_7', 'TrafficType_9', 'Browser_13', 'Browser_10', 'Month_Mar', 'TrafficType_8', 'Region_2', 'Browser_2', 'Month_June', 'TrafficType_13', 'Region_4', 'Weekend_False', 'Weekend_True', 'Browser_1', 'Browser_6', 'Region_8', 'VisitorType_New_Visitor', 'Informational_Duration', 'VisitorType_Returning_Visitor', 'OperatingSystems_2', 'Month_Dec', 'TrafficType_20', 'TrafficType_1', 'OperatingSystems_3', 'Administrative_Duration', 'Informational', 'SpecialDay', 'Administrative', 'Month_May', 

### Evaluation metrics

In [75]:
model = LDA(store_covariance=True)
classification_metrics(model, "LDA", X_train[features], y_train, X_test[features], y_test,5)

Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,LDA_cv,0.880839,0.325442,0.782345
1,LDA_test,0.888384,0.35546,0.983556


We try to improve the sensitivity by setting the empirical threshold

In [76]:
classification_metrics_empirical_threshold(model, 'LDA', X_train_scaled[features_LDA], y_train, X_test_scaled[features_LDA], y_test)

Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,LDA,0.899091,0.603854,0.691176


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

model = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('lda', LDA(store_covariance=True))])
model = model.fit(X_train_full, y_train)

print("Train accuracy:", accuracy(y_train, model.predict(X_train_full)))
print("Test accuracy:", accuracy(y_test, model.predict(X_test_full)))

In [None]:
from sklearn.model_selection import GridSearchCV

model = Pipeline([('poly', PolynomialFeatures()),
                  ('lda', LDA(store_covariance=True))])
params = {'poly__degree': range(1, 6)}
cv = GridSearchCV(model, params, refit=True, cv=10, 
                  scoring=make_scorer(accuracy))
cv.fit(X_train_full, y_train)

## QDA

In [None]:
# APPLICO PCA -> non possiamo perché dobbiamo individuare l'importanza di exit rates
from sklearn.decomposition import PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
# Calcola la percentuale di varianza spiegata cumulativa
explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)

# Trova il numero di componenti che catturano il 95% della varianza
n_components = np.argmax(explained_variance_ratio_cumsum >= 0.95) + 1
n_components
# Riduci la dimensionalità del dataset utilizzando il numero selezionato di componenti principali
pca = PCA(n_components=n_components)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_scaled))
X_test_pca = pd.DataFrame(pca.transform(X_test_scaled))

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
model = QDA(store_covariance=True)
model = model.fit(X_train_pca, y_train)

print("Train accuracy:", accuracy(y_train, model.predict(X_train_pca)))
print("Test accuracy:", accuracy(y_test, model.predict(X_test_pca)))

### Feature selection

In [61]:
cv = 5
features_QDA = forward_selection(X_train_scaled, y_train, X_test_scaled, y_test,"QDA",
                  get_evaluator(make_scorer(accuracy)), np.argmax, # 2.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # 3.
                  get_cv_evaluator(make_scorer(accuracy), cv), np.argmax, # test
                  candidates_scorer_name="Accuracy",
                  subsets_scorer_name="Accuracy (CV)",
                  verbose=False, weight_step3=cv)
features_QDA

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return -0.5 * (norm2 + u) + np.log(self.priors_)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M1) with 1 features: ['PageValues']
M1 subset score (3.): 0.8820282694809466
............
Best model (M2) with 2 features: ['PageValues', 'BounceRates']
M2 subset score (3.): 0.8867861956060983


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M3) with 3 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor']
M3 subset score (3.): 0.8887324923625627
............
Best model (M4) with 4 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7']
M4 subset score (3.): 0.8897055822723752


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M5) with 5 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2']
M5 subset score (3.): 0.8897055238039554


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M6) with 6 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9']
M6 subset score (3.): 0.8872184526332715


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M7) with 7 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False']
M7 subset score (3.): 0.8876509435341235


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M8) with 8 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1']
M8 subset score (3.): 0.887759227047491


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M9) with 9 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6']
M9 subset score (3.): 0.8812691739874001


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M10) with 10 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4']
M10 subset score (3.): 0.8800793416455936


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M11) with 11 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9']
M11 subset score (3.): 0.8661318755207343


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M12) with 12 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7']
M12 subset score (3.): 0.8795393857892506


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M13) with 13 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6']
M13 subset score (3.): 0.8760799263297911


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M14) with 14 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8']
M14 subset score (3.): 0.8680778214666802


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M15) with 15 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2']
M15 subset score (3.): 0.8614814143510736


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M16) with 16 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4']
M16 subset score (3.): 0.8633203046204668


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M17) with 17 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13']
M17 subset score (3.): 0.8806232733544794


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M18) with 18 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7']
M18 subset score (3.): 0.8798663411924634


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M19) with 19 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May']
M19 subset score (3.): 0.8673207138994051


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M20) with 20 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6']
M20 subset score (3.): 0.8659153669624194


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M21) with 21 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20']
M21 subset score (3.): 0.8747837399324689


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M22) with 22 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec']
M22 subset score (3.): 0.873486559572011


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M23) with 23 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2']
M23 subset score (3.): 0.8692682384926842


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M24) with 24 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2']
M24 subset score (3.): 0.8682949731776123


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


............
Best model (M25) with 25 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1']
M25 subset score (3.): 0.866889333898528




............
Best model (M26) with 26 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1']
M26 subset score (3.): 0.8655915688538729




............
Best model (M27) with 27 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3']
M27 subset score (3.): 0.8601847017379738




............
Best model (M28) with 28 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4']
M28 subset score (3.): 0.8588867612880593




............
Best model (M29) with 29 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul']
M29 subset score (3.): 0.8536972797567713




............
Best model (M30) with 30 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1']
M30 subset score (3.): 0.8485064534518294




............
Best model (M31) with 31 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration']
M31 subset score (3.): 0.8496967535409936




............
Best model (M32) with 32 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated']
M32 subset score (3.): 0.8498045108385833




............
Best model (M33) with 33 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June']
M33 subset score (3.): 0.8446139184073201




............
Best model (M34) with 34 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov']
M34 subset score (3.): 0.8433165626416033




............
Best model (M35) with 35 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3']
M35 subset score (3.): 0.8356387821028168




............
Best model (M36) with 36 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8']
M36 subset score (3.): 0.8341243330945872




............
Best model (M37) with 37 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug']
M37 subset score (3.): 0.8319621124640054




............
Best model (M38) with 38 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3']
M38 subset score (3.): 0.82839401868066




............
Best model (M39) with 39 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration']
M39 subset score (3.): 0.8285026530045458




............
Best model (M40) with 40 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative']
M40 subset score (3.): 0.8260159326443806




............
Best model (M41) with 41 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration']
M41 subset score (3.): 0.8248259248973149




............
Best model (M42) with 42 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5']
M42 subset score (3.): 0.8291507169689971




............
Best model (M43) with 43 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8']
M43 subset score (3.): 0.823960592285092




............
Best model (M44) with 44 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4']
M44 subset score (3.): 0.8306669200298188




............
Best model (M45) with 45 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8']
M45 subset score (3.): 0.7065565901217604




............
Best model (M46) with 46 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6']
M46 subset score (3.): 0.6987721631853595




............
Best model (M47) with 47 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational']
M47 subset score (3.): 0.7163905690438952




............
Best model (M48) with 48 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19']
M48 subset score (3.): 0.6014180930524902




............
Best model (M49) with 49 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5']
M49 subset score (3.): 0.6353772528612983




............
Best model (M50) with 50 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10']
M50 subset score (3.): 0.6395922997091196




............
Best model (M51) with 51 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11']
M51 subset score (3.): 0.626937687281657




............
Best model (M52) with 52 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct']
M52 subset score (3.): 0.5840041512578018




............
Best model (M53) with 53 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay']
M53 subset score (3.): 0.5943848099045503




............
Best model (M54) with 54 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor']
M54 subset score (3.): 0.49006025170654705




............
Best model (M55) with 55 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5']
M55 subset score (3.): 0.51427798810



............
Best model (M56) with 56 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar']
M56 subset score (3.): 



............
Best model (M57) with 57 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates']
M57 subset



............
Best model (M58) with 58 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M59) with 59 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M60) with 60 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M61) with 61 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M62) with 62 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M63) with 63 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



............
Best model (M64) with 64 features: ['PageValues', 'BounceRates', 'VisitorType_New_Visitor', 'Browser_7', 'OperatingSystems_2', 'Region_9', 'Weekend_False', 'Region_1', 'TrafficType_6', 'Region_4', 'TrafficType_9', 'TrafficType_7', 'OperatingSystems_6', 'Region_8', 'TrafficType_2', 'TrafficType_4', 'Browser_13', 'Region_7', 'Month_May', 'Region_6', 'TrafficType_20', 'Month_Dec', 'Browser_2', 'Region_2', 'Browser_1', 'OperatingSystems_1', 'OperatingSystems_3', 'Browser_4', 'Month_Jul', 'TrafficType_1', 'ProductRelated_Duration', 'ProductRelated', 'Month_June', 'Month_Nov', 'TrafficType_3', 'Browser_8', 'Month_Aug', 'Region_3', 'Informational_Duration', 'Administrative', 'Administrative_Duration', 'Region_5', 'TrafficType_8', 'OperatingSystems_4', 'OperatingSystems_8', 'Browser_6', 'Informational', 'TrafficType_19', 'Browser_5', 'TrafficType_10', 'TrafficType_11', 'Month_Oct', 'SpecialDay', 'VisitorType_Returning_Visitor', 'TrafficType_5', 'Month_Mar', 'ExitRates', 'VisitorTy



### Evaluation

In [77]:
model = QDA(store_covariance=True)
classification_metrics(model, "QDA", X_train[features], y_train, X_test[features], y_test,5)



Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,QDA_cv,0.249675,0.988204,0.170654
1,QDA_test,0.242051,0.987152,0.108987


The sensitivity is very high with respect to the specificity, so we try to set the empirical threshold

In [78]:
classification_metrics_empirical_threshold(model, 'QDA', X_train_scaled[features_QDA], y_train, X_test_scaled[features_QDA], y_test)

Unnamed: 0,Model,Accuracy,Sensitivity,Specificity
0,QDA,0.81538,0.719486,0.434109


## KNN

### With all features

In [80]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [81]:
model = KNeighborsClassifier()
params = {'n_neighbors': range(1, 15)}
cv = GridSearchCV(model, params, refit=True, cv=10, 
                  scoring=make_scorer(accuracy))
cv.fit(X_train_full, y_train)
cv.best_params_[0]

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 15)},
             scoring=make_scorer(accuracy))

{'n_neighbors': 6}

#### Metrics

In [86]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=cv.best_params_['n_neighbors'])
classification_metrics(model, "KNN", X_train, y_train, X_test, y_test,5)

ValueError: X has 75 features, but KNeighborsClassifier is expecting 65 features as input.