In [125]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

import xgboost as xgb

import tensorflow as tf

import os
import re
import ast

In [95]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/'

def get_sample_df(directory=directory_dataframes):
    list_dataframes = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes = get_sample_df(directory_dataframes)

In [96]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        # print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    
     
    
    return accuracy_list, f1_score_list, auc_list  


# Mutual Information

## 10 Percentile

In [97]:
lr = LogisticRegression()
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10')

Average Accuracy 0.8062499999999998
Average F1 Score 0.8018535896077769
Average AUC 0.8062499999999998
Max Accuracy 0.8653846153846154
Max F1 Score 0.8541666666666667
Max AUC 0.8653846153846153
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


Average Accuracy 0.8572115384615383
Average F1

## 20 Percentile

In [98]:
lr = LogisticRegression()
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20')

Average Accuracy 0.8134615384615385
Average F1 Score 0.8105797948711706
Average AUC 0.8134615384615385
Max Accuracy 0.8605769230769231
Max F1 Score 0.8481675392670157
Max AUC 0.860576923076923
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(3, 2)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 1, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 4, 1)', '(3, 3, 3)', '(4, 1, 1)', '(3, 1, 4)', 'G5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(3, 2)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 1, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 4, 1)', '(3, 3,

## 30 Percentile

In [99]:
lr = LogisticRegression()
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30')

Average Accuracy 0.8149038461538461
Average F1 Score 0.812342708550494
Average AUC 0.8149038461538461
Max Accuracy 0.8509615384615384
Max F1 Score 0.8426395939086294
Max AUC 0.8509615384615384
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 1, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(2, 1, 4)', '(4, 1, 4)', '(3, 3, 2)', '(4, 1, 1)', '(1, 6, 2)', '(3, 1, 4)', '(2, 4, 1)', '(4, 2, 1)', 'E4', 'C4', 'O5', 'B5', 'G5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern

## 50 Percentile

In [100]:
lr = LogisticRegression()
accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50')

Average Accuracy 0.8173076923076923
Average F1 Score 0.8148611724708299
Average AUC 0.8173076923076923
Max Accuracy 0.8557692307692307
Max F1 Score 0.8469387755102041
Max AUC 0.8557692307692308
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 6)', '(6, 2)', '(1, 1, 1)', '(1, 2, 6)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(3, 3, 2)', '(3, 2, 1)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 1, 4)', '(1, 1, 

## 75 Percentile

In [101]:
lr = LogisticRegression()
accuracy_list_lr_75_mi, f1_score_list_lr_75_mi, auc_list_lr_75_mi = model_train_predict(lr, 'mi_feat_list_75')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_75_mi, f1_score_list_rfc_75_mi, auc_list_rfc_75_mi = model_train_predict(rfc, 'mi_feat_list_75')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_75_mi, f1_score_list_svm_75_mi, auc_list_svm_75_mi = model_train_predict(svc, 'mi_feat_list_75')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_75_mi, f1_score_list_xgb_75_mi, auc_list_xgb_75_mi = model_train_predict(xgbc, 'mi_feat_list_75')

Average Accuracy 0.8153846153846154
Average F1 Score 0.8131550077877471
Average AUC 0.8153846153846154
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(6, 2)', '(4, 4)', '(3, 4)', '(6, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 6)', '(2, 6, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 2)', '(3, 2, 2)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', 

## 90 Percentile

In [102]:
lr = LogisticRegression()
accuracy_list_lr_90_mi, f1_score_list_lr_90_mi, auc_list_lr_90_mi = model_train_predict(lr, 'mi_feat_list_90')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_90_mi, f1_score_list_rfc_90_mi, auc_list_rfc_90_mi = model_train_predict(rfc, 'mi_feat_list_90')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_90_mi, f1_score_list_svm_90_mi, auc_list_svm_90_mi = model_train_predict(svc, 'mi_feat_list_90')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_90_mi, f1_score_list_xgb_90_mi, auc_list_xgb_90_mi = model_train_predict(xgbc, 'mi_feat_list_90')

Average Accuracy 0.8173076923076923
Average F1 Score 0.8151089332692442
Average AUC 0.8173076923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 6)', '(6, 2)', '(4, 4)', '(3, 4)', '(2, 4)', '(4, 2)', '(6, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 6)', '(2, 6, 1)', '(6, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 2)', '(3, 2, 2)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1

# mRMR

## 10 Percentile

In [103]:
lr = LogisticRegression()
accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10')

Average Accuracy 0.8163461538461538
Average F1 Score 0.8136012031765117
Average AUC 0.8163461538461538
Max Accuracy 0.8461538461538461
Max F1 Score 0.8400000000000001
Max AUC 0.8461538461538461
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)']
Best Features based on Max AUC ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)']


Average Accuracy 0.8403846153846153
Average F1

## 20 Percentile

In [104]:
lr = LogisticRegression()
accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20')

Average Accuracy 0.8158653846153847
Average F1 Score 0.8135837083651871
Average AUC 0.8158653846153845
Max Accuracy 0.8557692307692307
Max F1 Score 0.8484848484848485
Max AUC 0.8557692307692307
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(1, 4, 1)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 3)', '(4,)', '(3, 3)', '(2, 1, 1)', '(3, 1, 2)', '(3, 1, 4)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entr

## 30 Percentile

In [105]:
lr = LogisticRegression()
accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30')

Average Accuracy 0.8158653846153845
Average F1 Score 0.8136041392365142
Average AUC 0.8158653846153845
Max Accuracy 0.8557692307692307
Max F1 Score 0.8484848484848485
Max AUC 0.8557692307692307
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(1, 4, 1)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 3)', '(4,)', '(3, 3)', '(2, 1, 1)', '(3, 1, 2)', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 3, 3)', '(4, 1, 2)', '(6, 3, 1)', '(3, 3, 1)', '(2, 1, 4)', '(1, 1, 4)', '(6, 2, 3)', '(1, 3, 3)', '(2, 2, 3)', '(4, 1, 4)', '(3, 2)', 'C4']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_en

## 50 Percentile

In [106]:
lr = LogisticRegression()
accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50')

Average Accuracy 0.8192307692307693
Average F1 Score 0.8166789794915215
Average AUC 0.8192307692307693
Max Accuracy 0.8509615384615384
Max F1 Score 0.8426395939086294
Max AUC 0.8509615384615384
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(1, 4, 1)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 3)', '(4,)', '(3, 3)', '(2, 1, 1)', '(3, 1, 2)', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 3, 3)', '(4, 1, 2)', '(6, 3, 1)', '(3, 3, 1)', '(2, 1, 4)', '(1, 1, 4)', '(6, 2, 3)', '(1, 3, 3)', '(2, 2, 3)', '(4, 1, 4)', '(3, 2)', 'C4', '(1, 2, 2)', '(1, 1, 3)', '(2, 2, 2)', '(3, 2, 3)', '(3, 3, 2)', '(2, 2)', '(2, 3, 2)', '(2, 3, 3)', '

## 75 Percentile

In [107]:
lr = LogisticRegression()
accuracy_list_lr_75_mrmr, f1_score_list_lr_75_mrmr, auc_list_lr_75_mrmr = model_train_predict(lr, 'mrmr_feat_list_75*')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_75_mrmr, f1_score_list_rfc_75_mrmr, auc_list_rfc_75_mrmr = model_train_predict(rfc, 'mrmr_feat_list_75')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_75_mrmr, f1_score_list_svm_75_mrmr, auc_list_svm_75_mrmr = model_train_predict(svc, 'mrmr_feat_list_75')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_75_mrmr, f1_score_list_xgb_75_mrmr, auc_list_xgb_75_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_75')

Average Accuracy 0.816826923076923
Average F1 Score 0.8146912950230121
Average AUC 0.816826923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(1, 4, 1)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 3)', '(4,)', '(3, 3)', '(2, 1, 1)', '(3, 1, 2)', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 3, 3)', '(4, 1, 2)', '(6, 3, 1)', '(3, 3, 1)', '(2, 1, 4)', '(1, 1, 4)', '(6, 2, 3)', '(1, 3, 3)', '(2, 2, 3)', '(4, 1, 4)', '(3, 2)', 'C4', '(1, 2, 2)', '(1, 1, 3)', '(2, 2, 2)', '(3, 2, 3)', '(3, 3, 2)', '(2, 2)', '(2, 3, 2)', '(2, 3, 3)', '(3,

## 90 Percentile

In [108]:
lr = LogisticRegression()
accuracy_list_lr_90_mrmr, f1_score_list_lr_90_mrmr, auc_list_lr_90_mrmr = model_train_predict(lr, 'mrmr_feat_list_90*')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_90_mrmr, f1_score_list_rfc_90_mrmr, auc_list_rfc_90_mrmr = model_train_predict(rfc, 'mrmr_feat_list_90')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_90_mrmr, f1_score_list_svm_90_mrmr, auc_list_svm_90_mrmr = model_train_predict(svc, 'mrmr_feat_list_90')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_90_mrmr, f1_score_list_xgb_90_mrmr, auc_list_xgb_90_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_90')

Average Accuracy 0.8182692307692309
Average F1 Score 0.8158907775917077
Average AUC 0.8182692307692309
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(2,)', '(2, 3)', '(1, 2, 1)', '(3,)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', '(1, 4)', 'bigram_entropy', '(2, 1, 2)', '(1, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(1, 4, 1)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 3)', '(4,)', '(3, 3)', '(2, 1, 1)', '(3, 1, 2)', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 3, 3)', '(4, 1, 2)', '(6, 3, 1)', '(3, 3, 1)', '(2, 1, 4)', '(1, 1, 4)', '(6, 2, 3)', '(1, 3, 3)', '(2, 2, 3)', '(4, 1, 4)', '(3, 2)', 'C4', '(1, 2, 2)', '(1, 1, 3)', '(2, 2, 2)', '(3, 2, 3)', '(3, 3, 2)', '(2, 2)', '(2, 3, 2)', '(2, 3, 3)', '(

# MI and mRMR

## 10 Percentile

In [109]:
lr = LogisticRegression()
accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10')

Average Accuracy 0.8033653846153845
Average F1 Score 0.7999406621530627
Average AUC 0.8033653846153845
Max Accuracy 0.8653846153846154
Max F1 Score 0.8541666666666667
Max AUC 0.8653846153846153
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 1)', '(2, 1)', '(1, 2)', '(3,)', 'trigram_entropy', '(2, 3, 1)', '(2, 3)', '(2,)', '(3, 1, 1)', '(3, 1)']
Best Features based on Max F1 Score ['(1, 2, 1)', '(2, 1)', '(1, 2)', '(3,)', 'trigram_entropy', '(2, 3, 1)', '(2, 3)', '(2,)', '(3, 1, 1)', '(3, 1)']
Best Features based on Max AUC ['(1, 2, 1)', '(2, 1)', '(1, 2)', '(3,)', 'trigram_entropy', '(2, 3, 1)', '(2, 3)', '(2,)', '(3, 1, 1)', '(3, 1)']


Average Accuracy 0.851923076923077
Average F1 Score 0.8548735070632851
Average AUC 0.851923076923077
Max Accuracy 0.8846153846153846
Max F1 Score 0.8823529411764707
Max AUC 0.8846153846153846
Best Sample Index based on Max Accuracy 4
B

## 20 Percentile

In [110]:
lr = LogisticRegression()
accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20')

Average Accuracy 0.8134615384615385
Average F1 Score 0.8110030854585389
Average AUC 0.8134615384615385
Max Accuracy 0.8605769230769231
Max F1 Score 0.8481675392670157
Max AUC 0.860576923076923
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 3)', '(4, 1, 1)', 'bigram_entropy', '(1, 4, 1)', '(3, 1, 2)', 'unigram_entropy', '(4,)', '(2, 3, 1)', '(3, 1, 1)', '(1, 2, 1)', '(3, 3)', '(3,)', '(2, 3)', '(2, 1, 1)', '(2, 1)', '(3, 1, 4)', '(1, 4)', '(1, 2)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(3, 1)', '(4, 1)']
Best Features based on Max F1 Score ['(1, 2, 3)', '(4, 1, 1)', 'bigram_entropy', '(1, 4, 1)', '(3, 1, 2)', 'unigram_entropy', '(4,)', '(2, 3, 1)', '(3, 1, 1)', '(1, 2, 1)', '(3, 3)', '(3,)', '(2, 3)', '(2, 1, 1)', '(2, 1)', '(3, 1, 4)', '(1, 4)', '(1, 2)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(3, 1)', '(4, 1)']
Best Features based on Max AUC ['(1, 2, 3)', '(4, 1,

## 30 Percentile

In [111]:
lr = LogisticRegression()
accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30')

Average Accuracy 0.8129807692307691
Average F1 Score 0.8107623534855938
Average AUC 0.8129807692307691
Max Accuracy 0.8557692307692307
Max F1 Score 0.8484848484848485
Max AUC 0.8557692307692307
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 3)', '(4, 1, 1)', 'pattern_hvg_5_node_entropy', 'bigram_entropy', '(1, 4, 1)', '(3, 1, 2)', 'unigram_entropy', '(4,)', '(2, 3, 1)', '(2, 1, 4)', '(3, 1, 1)', '(1, 2, 1)', '(3, 3)', '(3,)', 'pattern_hvg_4_nodes_entropy', '(2, 3)', '(1, 1, 4)', '(2, 1, 1)', '(2, 1)', '(3, 1, 4)', '(1, 4)', '(1, 2)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(3, 1)', 'C4', '(4, 1, 4)', '(4, 1)', '(6, 3, 1)']
Best Features based on Max F1 Score ['(1, 2, 3)', '(4, 1, 1)', 'pattern_hvg_5_node_entropy', 'bigram_entropy', '(1, 4, 1)', '(3, 1, 2)', 'unigram_entropy', '(4,)', '(2, 3, 1)', '(2, 1, 4)', '(3, 1, 1)', '(1, 2, 1)', '(3, 3)', '(3,)', 'pattern_hvg_4_

## 50 Percentile

In [112]:
lr = LogisticRegression()
accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50')

Average Accuracy 0.8168269230769232
Average F1 Score 0.8142057888591114
Average AUC 0.8168269230769232
Max Accuracy 0.8557692307692307
Max F1 Score 0.8469387755102041
Max AUC 0.8557692307692308
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 3)', '(4, 1, 1)', '(6, 3)', 'pattern_hvg_5_node_entropy', 'bigram_entropy', '(1, 4, 4)', '(3, 3, 1)', '(1, 4, 1)', '(3, 1, 2)', '(3, 2, 1)', '(1, 1)', 'unigram_entropy', '(4,)', '(2, 3, 1)', 'G5', '(2, 1, 4)', '(3, 1, 1)', '(1, 2, 2)', '(1, 2, 1)', 'F4', '(3, 3)', '(3,)', 'pattern_hvg_4_nodes_entropy', '(3, 3, 3)', '(2, 3)', '(1, 1, 4)', '(2, 2)', '(1, 3)', '(3, 2)', '(2, 2, 2)', '(2, 1, 1)', 'B4', '(3, 3, 2)', '(2, 1)', '(1, 3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 2)', '(2, 6, 3)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(3, 1)', '(1, 1, 3)', '(4, 1, 2)', '(2, 3, 2)', 'C4', '(4, 1, 4)', '(2, 2, 1)', '(4, 1)', '(6, 3, 1)']
Best Feature

## 75 Percentile

In [113]:
lr = LogisticRegression()
accuracy_list_lr_75_mi_mrmr, f1_score_list_lr_75_mi_mrmr, auc_list_lr_75_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_75')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_75_mi_mrmr, f1_score_list_rfc_75_mi_mrmr, auc_list_rfc_75_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_75')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_75_mi_mrmr, f1_score_list_svm_75_mi_mrmr, auc_list_svm_75_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_75')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_75_mi_mrmr, f1_score_list_xgb_75_mi_mrmr, auc_list_xgb_75_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_75')

Average Accuracy 0.8168269230769232
Average F1 Score 0.8146937174877305
Average AUC 0.8168269230769232
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 3)', '(2, 3, 4)', '(4,)', '(2, 3, 1)', 'G5', '(2, 1, 4)', '(3, 4)', '(3, 3, 3)', '(2, 3)', 'B4', '(2, 1, 1)', '(3, 3, 2)', '(3, 1, 4)', '(1, 2, 6)', '(1, 4)', '(2, 6, 3)', '(4, 4)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(1, 1, 3)', '(3, 1, 3)', '(4, 2, 2)', '(3, 3, 1)', '(3, 2, 1)', '(2, 1, 3)', '(4, 2, 1)', '(1, 2, 2)', '(2, 6)', '(6, 2, 2)', 'F4', '(3, 3)', '(3, 2, 2)', '(1, 1, 4)', 'C5', '(1, 1, 1)', '(2, 2, 2)', '(6,)', '(2, 2, 3)', '(3, 4, 1)', '(2, 4, 2)', '(3, 1)', '(2, 3, 3)', '(4, 1, 2)', '(2, 2, 1)', '(6, 3, 1)', '(4, 1, 1)', 'V5', 'unigram_entropy', '(1, 2, 1)', 'L5', 'P5', '(3,)', '(3, 2)', '(2, 1)', '(1, 3, 3)', '(6, 

## 90 Percentile

In [114]:
lr = LogisticRegression()
accuracy_list_lr_90_mi_mrmr, f1_score_list_lr_90_mi_mrmr, auc_list_lr_90_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_90')
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_90_mi_mrmr, f1_score_list_rfc_90_mi_mrmr, auc_list_rfc_90_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_90')
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_90_mi_mrmr, f1_score_list_svm_90_mi_mrmr, auc_list_svm_90_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_90')
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_90_mi_mrmr, f1_score_list_xgb_90_mi_mrmr, auc_list_xgb_90_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_90')

Average Accuracy 0.8187500000000002
Average F1 Score 0.8163063161907684
Average AUC 0.81875
Max Accuracy 0.8653846153846154
Max F1 Score 0.8571428571428572
Max AUC 0.8653846153846154
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(1, 2, 3)', '(6, 2, 1)', '(2, 4, 1)', '(1, 3, 2)', '(2, 3, 4)', '(4,)', '(2, 3, 1)', '(4, 1, 3)', 'G5', '(2, 1, 4)', '(2, 2, 6)', '(3, 4)', 'H5', '(3, 3, 3)', '(2, 3)', '(6, 3, 3)', '(2, 1, 1)', '(2, 2, 4)', '(3, 3, 2)', '(3, 1, 4)', '(1, 2, 6)', '(4, 4)', '(1, 4)', '(2, 6, 3)', 'trigram_entropy', '(2, 1, 2)', '(2,)', '(1, 1, 3)', '(3, 1, 3)', '(4, 2, 2)', '(3, 3, 1)', '(3, 2, 1)', '(2, 1, 3)', '(4, 2, 1)', '(1, 2, 2)', '(2, 6)', '(4, 4, 4)', '(6, 2, 2)', 'F4', '(3, 3)', '(3, 2, 2)', '(1,)', '(1, 1, 6)', '(1, 1, 4)', '(1, 1, 1)', '(2, 2, 2)', '(6,)', '(2, 2, 3)', '(2, 6, 2)', '(3, 4, 1)', '(6, 1, 3)', '(3, 1)', '(2, 3, 3)', '(4, 1, 2)', '(2, 2, 1)', 

# PCA

In [167]:
def model_train_predict_pca(model, k, dataframes=list_sample_dataframes):
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    
    for sample in dataframes:
        x = sample.drop(['Unnamed: 0', 'conversion_class'], axis=1)
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        
        pca = PCA(n_components=k)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)
        
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    return accuracy_list, f1_score_list, auc_list  

## 10 Percentile

In [168]:
lr = LogisticRegression()
accuracy_list_lr_10_pca, f1_score_list_lr_10_pca, auc_list_lr_10_pca = model_train_predict_pca(lr, 14)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_10_pca, f1_score_list_rfc_10_pca, auc_list_rfc_10_pca = model_train_predict_pca(rfc, 14)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_10_pca, f1_score_list_svm_10_pca, auc_list_svm_10_pca = model_train_predict_pca(svc, 14)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_pca, f1_score_list_xgb_10_pca, auc_list_xgb_10_pca = model_train_predict_pca(xgbc, 14)

Average Accuracy 0.795673076923077
Average F1 Score 0.7937299756345915
Average AUC 0.795673076923077
Max Accuracy 0.8461538461538461
Max F1 Score 0.8350515463917526
Max AUC 0.846153846153846
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.7971153846153847
Average F1 Score 0.7981551879150508
Average AUC 0.7971153846153847
Max Accuracy 0.8365384615384616
Max F1 Score 0.8300000000000001
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8139423076923077
Average F1 Score 0.8187233671665629
Average AUC 0.8139423076923077
Max Accuracy 0.8509615384615384
Max F1 Score 0.8502415458937198
Max AUC 0.8509615384615385
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.7927884615384615
Average F1 Score

## 20 Percentile

In [169]:
lr = LogisticRegression()
accuracy_list_lr_20_pca, f1_score_list_lr_20_pca, auc_list_lr_20_pca = model_train_predict_pca(lr, 28)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_20_pca, f1_score_list_rfc_20_pca, auc_list_rfc_20_pca = model_train_predict_pca(rfc, 28)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_20_pca, f1_score_list_svm_20_pca, auc_list_svm_20_pca = model_train_predict_pca(svc, 28)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_pca, f1_score_list_xgb_20_pca, auc_list_xgb_20_pca = model_train_predict_pca(xgbc, 28)

Average Accuracy 0.8149038461538461
Average F1 Score 0.81270616308118
Average AUC 0.8149038461538461
Max Accuracy 0.8509615384615384
Max F1 Score 0.8442211055276383
Max AUC 0.8509615384615385
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8158653846153847
Average F1 Score 0.8145127842437102
Average AUC 0.8158653846153847
Max Accuracy 0.8413461538461539
Max F1 Score 0.8358208955223881
Max AUC 0.8413461538461539
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8288461538461538
Average F1 Score 0.8313748856668669
Average AUC 0.8288461538461538
Max Accuracy 0.8605769230769231
Max F1 Score 0.8599033816425121
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.825
Average F1 Score 0.824420289

## 30 Percentile

In [170]:
lr = LogisticRegression()
accuracy_list_lr_30_pca, f1_score_list_lr_30_pca, auc_list_lr_30_pca = model_train_predict_pca(lr, 42)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_30_pca, f1_score_list_rfc_30_pca, auc_list_rfc_30_pca = model_train_predict_pca(rfc, 42)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_30_pca, f1_score_list_svm_30_pca, auc_list_svm_30_pca = model_train_predict_pca(svc, 42)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_pca, f1_score_list_xgb_30_pca, auc_list_xgb_30_pca = model_train_predict_pca(xgbc, 42)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8151089332692442
Average AUC 0.8173076923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8302884615384615
Average F1 Score 0.8313791770702634
Average AUC 0.8302884615384615
Max Accuracy 0.8701923076923077
Max F1 Score 0.8682926829268293
Max AUC 0.8701923076923077
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8346153846153845
Average F1 Score 0.8365598068604878
Average AUC 0.8346153846153846
Max Accuracy 0.8701923076923077
Max F1 Score 0.8695652173913043
Max AUC 0.8701923076923077
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8274038461538462
Average F1 Sco

## 50 Percentile

In [171]:
lr = LogisticRegression()
accuracy_list_lr_50_pca, f1_score_list_lr_50_pca, auc_list_lr_50_pca = model_train_predict_pca(lr, 69)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_50_pca, f1_score_list_rfc_50_pca, auc_list_rfc_50_pca = model_train_predict_pca(rfc, 69)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_50_pca, f1_score_list_svm_50_pca, auc_list_svm_50_pca = model_train_predict_pca(svc, 69)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_pca, f1_score_list_xgb_50_pca, auc_list_xgb_50_pca = model_train_predict_pca(xgbc, 69)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8151089332692442
Average AUC 0.8173076923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.84375
Average F1 Score 0.8458189053942815
Average AUC 0.84375
Max Accuracy 0.8653846153846154
Max F1 Score 0.8640776699029126
Max AUC 0.8653846153846154
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8360576923076923
Average F1 Score 0.8378810987779877
Average AUC 0.8360576923076923
Max Accuracy 0.8701923076923077
Max F1 Score 0.8695652173913043
Max AUC 0.8701923076923077
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8389423076923077
Average F1 Score 0.8395511261590812


## 75 Percentile

In [172]:
lr = LogisticRegression()
accuracy_list_lr_75_pca, f1_score_list_lr_75_pca, auc_list_lr_75_pca = model_train_predict_pca(lr, 104)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_75_pca, f1_score_list_rfc_75_pca, auc_list_rfc_75_pca = model_train_predict_pca(rfc, 104)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_75_pca, f1_score_list_svm_75_pca, auc_list_svm_75_pca = model_train_predict_pca(svc, 104)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_75_pca, f1_score_list_xgb_75_pca, auc_list_xgb_75_pca = model_train_predict_pca(xgbc, 104)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8151089332692442
Average AUC 0.8173076923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8375
Average F1 Score 0.8410858204587199
Average AUC 0.8375
Max Accuracy 0.8605769230769231
Max F1 Score 0.8599033816425121
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1


Average Accuracy 0.8360576923076923
Average F1 Score 0.8378810987779877
Average AUC 0.8360576923076923
Max Accuracy 0.8701923076923077
Max F1 Score 0.8695652173913043
Max AUC 0.8701923076923077
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8451923076923077
Average F1 Score 0.845098713687697
Ave

## 90 Percentile

In [173]:
lr = LogisticRegression()
accuracy_list_lr_90_pca, f1_score_list_lr_90_pca, auc_list_lr_90_pca = model_train_predict_pca(lr, 125)
print("\n================================================================\n")
rfc = RandomForestClassifier()
accuracy_list_rfc_90_pca, f1_score_list_rfc_90_pca, auc_list_rfc_90_pca = model_train_predict_pca(rfc, 125)
print("\n================================================================\n")
svc = SVC()
accuracy_list_svm_90_pca, f1_score_list_svm_90_pca, auc_list_svm_90_pca = model_train_predict_pca(svc, 125)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_90_pca, f1_score_list_xgb_90_pca, auc_list_xgb_90_pca = model_train_predict_pca(xgbc, 125)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8151089332692442
Average AUC 0.8173076923076923
Max Accuracy 0.8605769230769231
Max F1 Score 0.852791878172589
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8423076923076923
Average F1 Score 0.846175627063454
Average AUC 0.8423076923076923
Max Accuracy 0.8653846153846154
Max F1 Score 0.8666666666666667
Max AUC 0.8653846153846155
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7


Average Accuracy 0.8360576923076923
Average F1 Score 0.8378810987779877
Average AUC 0.8360576923076923
Max Accuracy 0.8701923076923077
Max F1 Score 0.8695652173913043
Max AUC 0.8701923076923077
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4


Average Accuracy 0.8427884615384615
Average F1 Scor

# Saving results

In [174]:
models = ['lr', 'rfc', 'svc', 'xgbc']
models = [value for value in models for _ in range(10)] * 6
percentiles = ['10', '20', '30', '50', '75', '90']
percentiles = [value for value in percentiles for _ in range(40)]
samples = [x for x in range(1, 11)]
samples = samples * 24

print(len(models))
print(len(percentiles))
print(len(samples))

240
240
240


In [116]:
overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi + accuracy_list_svm_10_mi + accuracy_list_xgb_10_mi +
                            accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi + accuracy_list_svm_20_mi + accuracy_list_xgb_20_mi +
                            accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi + accuracy_list_svm_30_mi + accuracy_list_xgb_30_mi +
                            accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi + accuracy_list_svm_50_mi + accuracy_list_xgb_50_mi +
                            accuracy_list_lr_75_mi + accuracy_list_rfc_75_mi + accuracy_list_svm_75_mi + accuracy_list_xgb_75_mi +
                            accuracy_list_lr_90_mi + accuracy_list_rfc_90_mi + accuracy_list_svm_90_mi + accuracy_list_xgb_90_mi)

overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_svm_10_mi + f1_score_list_xgb_10_mi +
                            f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_svm_20_mi + f1_score_list_xgb_20_mi +
                            f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_svm_30_mi + f1_score_list_xgb_30_mi +
                            f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_svm_50_mi + f1_score_list_xgb_50_mi +
                            f1_score_list_lr_75_mi + f1_score_list_rfc_75_mi + f1_score_list_svm_75_mi + f1_score_list_xgb_75_mi +
                            f1_score_list_lr_90_mi + f1_score_list_rfc_90_mi + f1_score_list_svm_90_mi + f1_score_list_xgb_90_mi)

overall_auc_list_mi = (auc_list_lr_10_mi + auc_list_rfc_10_mi + auc_list_svm_10_mi + auc_list_xgb_10_mi +
                            auc_list_lr_20_mi + auc_list_rfc_20_mi + auc_list_svm_20_mi + auc_list_xgb_20_mi +
                            auc_list_lr_30_mi + auc_list_rfc_30_mi + auc_list_svm_30_mi + auc_list_xgb_30_mi +
                            auc_list_lr_50_mi + auc_list_rfc_50_mi + auc_list_svm_50_mi + auc_list_xgb_50_mi +
                            auc_list_lr_75_mi + auc_list_rfc_75_mi + auc_list_svm_75_mi + auc_list_xgb_75_mi +
                            auc_list_lr_90_mi + auc_list_rfc_90_mi + auc_list_svm_90_mi + auc_list_xgb_90_mi)

In [117]:
overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_svm_10_mrmr + accuracy_list_xgb_10_mrmr +
                            accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_svm_20_mrmr + accuracy_list_xgb_20_mrmr +
                            accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_svm_30_mrmr + accuracy_list_xgb_30_mrmr +
                            accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_svm_50_mrmr + accuracy_list_xgb_50_mrmr +
                            accuracy_list_lr_75_mrmr + accuracy_list_rfc_75_mrmr + accuracy_list_svm_75_mrmr + accuracy_list_xgb_75_mrmr +
                            accuracy_list_lr_90_mrmr + accuracy_list_rfc_90_mrmr + accuracy_list_svm_90_mrmr + accuracy_list_xgb_90_mrmr)

overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr + f1_score_list_svm_10_mrmr + f1_score_list_xgb_10_mrmr +
                            f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr + f1_score_list_svm_20_mrmr + f1_score_list_xgb_20_mrmr +
                            f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr + f1_score_list_svm_30_mrmr + f1_score_list_xgb_30_mrmr +
                            f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr + f1_score_list_svm_50_mrmr + f1_score_list_xgb_50_mrmr +
                            f1_score_list_lr_75_mrmr + f1_score_list_rfc_75_mrmr + f1_score_list_svm_75_mrmr + f1_score_list_xgb_75_mrmr +
                            f1_score_list_lr_90_mrmr + f1_score_list_rfc_90_mrmr + f1_score_list_svm_90_mrmr + f1_score_list_xgb_90_mrmr)

overall_auc_list_mrmr = (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_svm_10_mrmr + auc_list_xgb_10_mrmr +
                            auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_svm_20_mrmr + auc_list_xgb_20_mrmr +
                            auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_svm_30_mrmr + auc_list_xgb_30_mrmr +
                            auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_svm_50_mrmr + auc_list_xgb_50_mrmr +
                            auc_list_lr_75_mrmr + auc_list_rfc_75_mrmr + auc_list_svm_75_mrmr + auc_list_xgb_75_mrmr +
                            auc_list_lr_90_mrmr + auc_list_rfc_90_mrmr + auc_list_svm_90_mrmr + auc_list_xgb_90_mrmr)

In [118]:
overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_svm_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr +
                            accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_svm_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr +
                            accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_svm_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr +
                            accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_svm_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr +
                            accuracy_list_lr_75_mi_mrmr + accuracy_list_rfc_75_mi_mrmr + accuracy_list_svm_75_mi_mrmr + accuracy_list_xgb_75_mi_mrmr +
                            accuracy_list_lr_90_mi_mrmr + accuracy_list_rfc_90_mi_mrmr + accuracy_list_svm_90_mi_mrmr + accuracy_list_xgb_90_mi_mrmr)

overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_svm_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr +
                            f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_svm_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr +
                            f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_svm_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr +
                            f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_svm_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr +
                            f1_score_list_lr_75_mi_mrmr + f1_score_list_rfc_75_mi_mrmr + f1_score_list_svm_75_mi_mrmr + f1_score_list_xgb_75_mi_mrmr +
                            f1_score_list_lr_90_mi_mrmr + f1_score_list_rfc_90_mi_mrmr + f1_score_list_svm_90_mi_mrmr + f1_score_list_xgb_90_mi_mrmr)

overall_auc_list_mi_mrmr = (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_svm_10_mi_mrmr + auc_list_xgb_10_mi_mrmr +
                            auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_svm_20_mi_mrmr + auc_list_xgb_20_mi_mrmr +
                            auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_svm_30_mi_mrmr + auc_list_xgb_30_mi_mrmr +
                            auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_svm_50_mi_mrmr + auc_list_xgb_50_mi_mrmr +
                            auc_list_lr_75_mi_mrmr + auc_list_rfc_75_mi_mrmr + auc_list_svm_75_mi_mrmr + auc_list_xgb_75_mi_mrmr +
                            auc_list_lr_90_mi_mrmr + auc_list_rfc_90_mi_mrmr + auc_list_svm_90_mi_mrmr + auc_list_xgb_90_mi_mrmr)

In [175]:
overall_accuracy_list_pca = (accuracy_list_lr_10_pca + accuracy_list_rfc_10_pca + accuracy_list_svm_10_pca + accuracy_list_xgb_10_pca +
                            accuracy_list_lr_20_pca + accuracy_list_rfc_20_pca + accuracy_list_svm_20_pca + accuracy_list_xgb_20_pca +
                            accuracy_list_lr_30_pca + accuracy_list_rfc_30_pca + accuracy_list_svm_30_pca + accuracy_list_xgb_30_pca +
                            accuracy_list_lr_50_pca + accuracy_list_rfc_50_pca + accuracy_list_svm_50_pca + accuracy_list_xgb_50_pca +
                            accuracy_list_lr_75_pca + accuracy_list_rfc_75_pca + accuracy_list_svm_75_pca + accuracy_list_xgb_75_pca +
                            accuracy_list_lr_90_pca + accuracy_list_rfc_90_pca + accuracy_list_svm_90_pca + accuracy_list_xgb_90_pca)

overall_f1_score_list_pca = (f1_score_list_lr_10_pca + f1_score_list_rfc_10_pca + f1_score_list_svm_10_pca + f1_score_list_xgb_10_pca +
                            f1_score_list_lr_20_pca + f1_score_list_rfc_20_pca + f1_score_list_svm_20_pca + f1_score_list_xgb_20_pca +
                            f1_score_list_lr_30_pca + f1_score_list_rfc_30_pca + f1_score_list_svm_30_pca + f1_score_list_xgb_30_pca +
                            f1_score_list_lr_50_pca + f1_score_list_rfc_50_pca + f1_score_list_svm_50_pca + f1_score_list_xgb_50_pca +
                            f1_score_list_lr_75_pca + f1_score_list_rfc_75_pca + f1_score_list_svm_75_pca + f1_score_list_xgb_75_pca +
                            f1_score_list_lr_90_pca + f1_score_list_rfc_90_pca + f1_score_list_svm_90_pca + f1_score_list_xgb_90_pca)

overall_auc_list_pca = (auc_list_lr_10_pca + auc_list_rfc_10_pca + auc_list_svm_10_pca + auc_list_xgb_10_pca +
                            auc_list_lr_20_pca + auc_list_rfc_20_pca + auc_list_svm_20_pca + auc_list_xgb_20_pca +
                            auc_list_lr_30_pca + auc_list_rfc_30_pca + auc_list_svm_30_pca + auc_list_xgb_30_pca +
                            auc_list_lr_50_pca + auc_list_rfc_50_pca + auc_list_svm_50_pca + auc_list_xgb_50_pca +
                            auc_list_lr_75_pca + auc_list_rfc_75_pca + auc_list_svm_75_pca + auc_list_xgb_75_pca +
                            auc_list_lr_90_pca + auc_list_rfc_90_pca + auc_list_svm_90_pca + auc_list_xgb_90_pca)

In [176]:
print(len(overall_accuracy_list_mi))
print(len(overall_f1_score_list_mi))
print(len(overall_auc_list_mi))

print(len(overall_accuracy_list_mrmr))
print(len(overall_f1_score_list_mrmr))
print(len(overall_auc_list_mrmr))

print(len(overall_accuracy_list_mi_mrmr))
print(len(overall_f1_score_list_mi_mrmr))
print(len(overall_auc_list_mi_mrmr))

print(len(overall_accuracy_list_pca))
print(len(overall_f1_score_list_pca))
print(len(overall_auc_list_pca))

240
240
240
240
240
240
240
240
240
240
240
240


In [177]:
results_dictionary = {
    'samples': samples,
    'models': models,
    'percentiles': percentiles,
    'mi_accuracy': overall_accuracy_list_mi,
    'mi_f1_score': overall_f1_score_list_mi,
    'mi_auc': overall_auc_list_mi,
    'mrmr_accuracy': overall_accuracy_list_mrmr,
    'mrmr_f1_score': overall_f1_score_list_mrmr,
    'mrmr_auc': overall_auc_list_mrmr,
    'mi_mrmr_accuracy': overall_accuracy_list_mi_mrmr,
    'mi_mrmr_f1_score': overall_f1_score_list_mi_mrmr,
    'mi_mrmr_auc': overall_auc_list_mi_mrmr,
    'pca_accuracy': overall_accuracy_list_pca,
    'pca_f1_score': overall_f1_score_list_pca,
    'pca_auc': overall_auc_list_pca,
}
results_df = pd.DataFrame(results_dictionary)

results_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/overall_results.csv', index=False)