In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

# import tensorflow as tf

import os
import re
import ast

In [4]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/'

def get_sample_df(directory=directory_dataframes):
    list_dataframes = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes = get_sample_df(directory_dataframes)

In [19]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes, params=None):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    best_params_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        # print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1, verbose=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        best_params_list.append(clf.best_params_)

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    print('Best Params based on Max Accuracy', best_params_list[best_accuracy_index])
    
     
    
    return accuracy_list, f1_score_list, auc_list, best_params_list  


# Mutual Information

## 10 Percentile

In [28]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi, param_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi, param_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi, param_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100, 200, 500], 'max_depth': [3, 5, 10, 20, 50, 100]}
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi, param_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10', params=params)

Average Accuracy 0.8512607830126078
Average F1 Score 0.8476152167841166
Average AUC 0.8512607830126078
Max Accuracy 0.8526874585268746
Max F1 Score 0.8491847826086957
Max AUC 0.8526874585268746
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(4, 1)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(4, 1)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(4, 1)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1,

## 20 Percentile

In [30]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi, param_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi, param_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi, param_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi, param_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20', params=params)





Average Accuracy 0.8918380889183808
Average F1 Score 0.8926152406167785
Average AUC 0.8918380889183808
Max Accuracy 0.8934970139349702
Max F1 Score 0.8945812807881773
Max AUC 0.8934970139349703
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', 'A4', 'E5', 'G5', 'P5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)',

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi, param_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi, param_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi, param_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi, param_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30', params=params)

Average Accuracy 0.8110576923076923
Average F1 Score 0.8085883465941157
Average AUC 0.8110576923076923
Max Accuracy 0.8173076923076923
Max F1 Score 0.8137254901960785
Max AUC 0.8173076923076924
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 4)', '(4, 1, 1)', '(3, 1, 4)', 'A4', 'D4', 'E4', 'C4', 'F4', 'E5', 'N5', 'Q5', 'G5', 'A5', 'L5', 'P5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)'

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi, param_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi, param_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi, param_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi, param_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50', params=params)

Average Accuracy 0.8158653846153845
Average F1 Score 0.8125385831372078
Average AUC 0.8158653846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 6)', '(2, 4)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(6, 2, 1)', '(3, 1, 4)', '(2, 3, 3)', '(1, 1, 3)', '(4, 4, 1)', '(3, 1, 3)', '(2

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi, f1_score_list_lr_75_mi, auc_list_lr_75_mi, param_list_lr_75_mi = model_train_predict(lr, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi, f1_score_list_rfc_75_mi, auc_list_rfc_75_mi, param_list_rfc_75_mi = model_train_predict(rfc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi, f1_score_list_svm_75_mi, auc_list_svm_75_mi, param_list_svm_75_mi = model_train_predict(svc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi, f1_score_list_xgb_75_mi, auc_list_xgb_75_mi, param_list_xgb_75_mi = model_train_predict(xgbc, 'mi_feat_list_75', params=params)

Average Accuracy 0.8216346153846154
Average F1 Score 0.8173331401526127
Average AUC 0.8216346153846155
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(4, 4)', '(2, 4)', '(4, 2)', '(6, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 6)', '(2, 6, 1)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 2)', '(3, 2, 2)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(3,

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi, f1_score_list_lr_90_mi, auc_list_lr_90_mi, param_list_lr_90_mi = model_train_predict(lr, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi, f1_score_list_rfc_90_mi, auc_list_rfc_90_mi, param_list_rfc_90_mi = model_train_predict(rfc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi, f1_score_list_svm_90_mi, auc_list_svm_90_mi, param_list_svm_90_mi = model_train_predict(svc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi, f1_score_list_xgb_90_mi, auc_list_xgb_90_mi, param_list_xgb_90_mi = model_train_predict(xgbc, 'mi_feat_list_90', params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 6)', '(6, 2)', '(4, 4)', '(3, 4)', '(2, 4)', '(4, 2)', '(6, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 6)', '(2, 6, 1)', '(6, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(2, 2, 1)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 2)', '(3, 2, 2)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 

# mRMR

## 10 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr, param_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr, param_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr, param_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr, param_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10', params=params)

Average Accuracy 0.7932692307692307
Average F1 Score 0.7981220657276997
Average AUC 0.7932692307692307
Max Accuracy 0.7932692307692307
Max F1 Score 0.7981220657276996
Max AUC 0.7932692307692307
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)']
Best Features based on Max AUC ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)']


Average Accuracy 0.8831730769230768
A

## 20 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr, param_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr, param_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr, param_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr, param_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20', params=params)

Average Accuracy 0.8125
Average F1 Score 0.8115942028985508
Average AUC 0.8125
Max Accuracy 0.8125
Max F1 Score 0.8115942028985508
Max AUC 0.8125
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(2, 1, 1)', '(1, 4)', '(3, 3, 1)', '(6, 3)', '(1, 4, 1)', '(3, 3)', 'A4']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr, param_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr, param_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr, param_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr, param_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30', params=params)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8155339805825242
Average AUC 0.8173076923076923
Max Accuracy 0.8173076923076923
Max F1 Score 0.8155339805825242
Max AUC 0.8173076923076923
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(2, 1, 1)', '(1, 4)', '(3, 3, 1)', '(6, 3)', '(1, 4, 1)', '(3, 3)', 'A4', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 1, 2)', '(1, 1, 4)', '(2, 1, 4)', '(1, 2, 2)', '(6, 2, 3)', '(2, 2, 3)', '(3, 3, 3)', '(6, 3, 1)', 'C4', '(2, 3, 3)', '(3, 2)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy',

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr, param_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr, param_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr, param_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr, param_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50', params=params)

Average Accuracy 0.8173076923076923
Average F1 Score 0.8155339805825242
Average AUC 0.8173076923076923
Max Accuracy 0.8173076923076923
Max F1 Score 0.8155339805825242
Max AUC 0.8173076923076923
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(2, 1, 1)', '(1, 4)', '(3, 3, 1)', '(6, 3)', '(1, 4, 1)', '(3, 3)', 'A4', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 1, 2)', '(1, 1, 4)', '(2, 1, 4)', '(1, 2, 2)', '(6, 2, 3)', '(2, 2, 3)', '(3, 3, 3)', '(6, 3, 1)', 'C4', '(2, 3, 3)', '(3, 2)', '(1, 3, 3)', 'A5', '(1, 1, 3)', '(2, 2)', '(4, 1, 2)', '(3, 3, 2)', '(2, 2, 1)', '(4, 2)', '(4, 1, 4)', '(6, 

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mrmr, f1_score_list_lr_75_mrmr, auc_list_lr_75_mrmr, param_list_lr_75_mrmr = model_train_predict(lr, 'mrmr_feat_list_75*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mrmr, f1_score_list_rfc_75_mrmr, auc_list_rfc_75_mrmr, param_list_rfc_75_mrmr = model_train_predict(rfc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mrmr, f1_score_list_svm_75_mrmr, auc_list_svm_75_mrmr, param_list_svm_75_mrmr = model_train_predict(svc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mrmr, f1_score_list_xgb_75_mrmr, auc_list_xgb_75_mrmr, param_list_xgb_75_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_75', params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(2, 1, 1)', '(1, 4)', '(3, 3, 1)', '(6, 3)', '(1, 4, 1)', '(3, 3)', 'A4', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 1, 2)', '(1, 1, 4)', '(2, 1, 4)', '(1, 2, 2)', '(6, 2, 3)', '(2, 2, 3)', '(3, 3, 3)', '(6, 3, 1)', 'C4', '(2, 3, 3)', '(3, 2)', '(1, 3, 3)', 'A5', '(1, 1, 3)', '(2, 2)', '(4, 1, 2)', '(3, 3, 2)', '(2, 2, 1)', '(4, 2)', '(4, 1, 4)', '(6, 

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mrmr, f1_score_list_lr_90_mrmr, auc_list_lr_90_mrmr, param_list_lr_90_mrmr = model_train_predict(lr, 'mrmr_feat_list_90*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mrmr, f1_score_list_rfc_90_mrmr, auc_list_rfc_90_mrmr, param_list_rfc_90_mrmr = model_train_predict(rfc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mrmr, f1_score_list_svm_90_mrmr, auc_list_svm_90_mrmr, param_list_svm_90_mrmr = model_train_predict(svc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mrmr, f1_score_list_xgb_90_mrmr, auc_list_xgb_90_mrmr, param_list_xgb_90_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_90', params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(2, 1, 3)', '(4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(3,)', '(2, 3, 1)', '(1, 2, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 1, 2)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(2, 1, 1)', '(1, 4)', '(3, 3, 1)', '(6, 3)', '(1, 4, 1)', '(3, 3)', 'A4', '(3, 1, 4)', '(2, 6, 3)', '(1, 3)', '(3, 1, 2)', '(1, 1, 4)', '(2, 1, 4)', '(1, 2, 2)', '(6, 2, 3)', '(2, 2, 3)', '(3, 3, 3)', '(6, 3, 1)', 'C4', '(2, 3, 3)', '(3, 2)', '(1, 3, 3)', 'A5', '(1, 1, 3)', '(2, 2)', '(4, 1, 2)', '(3, 3, 2)', '(2, 2, 1)', '(4, 2)', '(4, 1, 4)', '(6, 

# MI and mRMR

## 10 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr, param_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr, param_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr, param_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr, param_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10', params=params)

Average Accuracy 0.7841346153846154
Average F1 Score 0.7850662495399339
Average AUC 0.7841346153846153
Max Accuracy 0.7884615384615384
Max F1 Score 0.7884615384615384
Max AUC 0.7884615384615384
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['(3,)', '(3, 1, 1)', '(2, 1)', '(4, 1)', '(3, 1)', '(2,)', 'trigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)']
Best Features based on Max F1 Score ['(3,)', '(3, 1, 1)', '(2, 1)', '(4, 1)', '(3, 1)', '(2,)', 'trigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)']
Best Features based on Max AUC ['(3,)', '(3, 1, 1)', '(2, 1)', '(4, 1)', '(3, 1)', '(2,)', 'trigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)']


Average Accuracy 0.8778846153846154
Average F1 Score 0.8785500395612911
Average AUC 0.8778846153846154
Max Accuracy 0.8894230769230769
Max F1 Score 0.8909952606635071
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 3
Best Samp

## 20 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr, param_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr, param_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr, param_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr, param_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20', params=params)

Average Accuracy 0.8
Average F1 Score 0.8019211242249972
Average AUC 0.8
Max Accuracy 0.8076923076923077
Max F1 Score 0.8095238095238094
Max AUC 0.8076923076923077
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['A4', '(2, 1, 1)', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 1)', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(1, 2, 3)', '(2, 3)', '(2, 3, 1)', '(3,)', '(3, 1, 1)', '(4,)', '(2, 1)', 'trigram_entropy', '(4, 1, 1)', '(3, 3)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 4, 1)', '(2,)', '(4, 1)', 'bigram_entropy', '(1, 2)']
Best Features based on Max F1 Score ['(2, 1, 1)', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 1)', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(1, 2, 3)', '(2, 3)', '(2, 3, 1)', '(3,)', '(3, 1, 1)', '(4,)', '(2, 1)', 'trigram_entropy', '(4, 1, 1)', '(3, 3)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 4, 1)', '(2,)', '(4, 1)', 'bigram_entropy', '(1, 2)']
Best 

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr, param_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr, param_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr, param_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr, param_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30', params=params)

Average Accuracy 0.8139423076923077
Average F1 Score 0.8131433672634711
Average AUC 0.8139423076923077
Max Accuracy 0.8173076923076923
Max F1 Score 0.8155339805825242
Max AUC 0.8173076923076923
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['A4', '(1, 1, 4)', '(2, 1, 1)', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 1)', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(1, 2, 3)', '(2, 3)', '(2, 3, 1)', '(3,)', '(3, 1, 1)', '(4,)', '(2, 1)', 'trigram_entropy', '(4, 1, 1)', 'C4', '(1, 3)', '(2, 2, 3)', '(3, 3)', '(3, 1)', '(3, 1, 2)', 'pattern_hvg_5_node_entropy', '(1, 1, 2)', '(1, 4, 1)', '(2,)', '(4, 1)', '(3, 1, 4)', 'bigram_entropy', '(1, 2)']
Best Features based on Max F1 Score ['A4', '(1, 1, 4)', '(2, 1, 1)', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 1)', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(1, 2, 3)', '(2, 3)', '(2, 3, 1)', '(3,)', '(3, 1, 1)', '(4,)', '(2, 1)', 'trigram_entropy

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr, param_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr, param_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr, param_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr, param_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50', params=params)

Average Accuracy 0.816826923076923
Average F1 Score 0.8151400028141269
Average AUC 0.816826923076923
Max Accuracy 0.8173076923076923
Max F1 Score 0.8155339805825242
Max AUC 0.8173076923076923
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['A4', 'V5', '(2, 1, 4)', '(1, 1, 4)', '(2, 1, 1)', '(2, 1, 2)', '(3, 3, 1)', 'A5', 'unigram_entropy', '(1, 2, 1)', '(1, 4)', '(3, 3, 2)', '(4, 1, 4)', 'pattern_hvg_4_nodes_entropy', '(3, 2, 2)', '(1, 2, 3)', '(2, 3)', '(2, 3, 1)', '(3,)', '(3, 1, 1)', '(4,)', '(2, 1)', 'trigram_entropy', '(4, 1, 1)', 'C4', '(1, 3, 3)', '(1, 1, 3)', '(1, 3)', '(2, 3, 2)', '(2, 2, 3)', '(2, 2, 2)', '(2, 6, 3)', 'N5', '(3, 3)', '(3, 1)', '(3, 1, 2)', '(3, 2)', 'pattern_hvg_5_node_entropy', '(1, 1, 2)', '(2, 2, 1)', '(1, 4, 1)', '(2,)', '(4, 1)', '(2, 1, 3)', '(1, 2, 2)', '(3, 2, 1)', '(3, 1, 4)', 'bigram_entropy', '(1, 2)']
Best Features based on Max F1 Score ['

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi_mrmr, f1_score_list_lr_75_mi_mrmr, auc_list_lr_75_mi_mrmr, param_list_lr_75_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi_mrmr, f1_score_list_rfc_75_mi_mrmr, auc_list_rfc_75_mi_mrmr, param_list_rfc_75_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi_mrmr, f1_score_list_svm_75_mi_mrmr, auc_list_svm_75_mi_mrmr, param_list_svm_75_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi_mrmr, f1_score_list_xgb_75_mi_mrmr, auc_list_xgb_75_mi_mrmr, param_list_xgb_75_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_75', params=params)

Average Accuracy 0.8211538461538461
Average F1 Score 0.8169322901574423
Average AUC 0.8211538461538461
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(2, 6)', '(3, 3, 2)', '(4, 2)', '(2, 6, 1)', 'F4', '(1, 2, 3)', '(3, 1, 1)', '(6, 2, 3)', '(2, 1)', '(3, 3, 3)', '(1, 4, 4)', '(2, 2, 3)', '(2, 6, 6)', '(3, 1, 2)', '(4, 4, 1)', '(4, 2, 1)', '(2, 2, 1)', '(4, 1)', 'A4', '(6, 3, 1)', 'P5', '(1, 1, 4)', '(2, 1, 2)', 'A5', '(1, 1)', '(4,)', '(4, 1, 1)', '(6, 1, 6)', '(1, 1, 3)', '(2, 3, 2)', 'B4', 'N5', '(3, 3)', '(3, 1)', '(1, 1, 2)', '(4, 4, 4)', '(4, 4)', '(6, 3)', '(1, 4)', '(2, 1, 4)', '(6, 3, 2)', '(4, 1, 2)', '(2, 1, 1)', '(2, 2, 6)', '(3, 3, 1)', '(2, 2)', '(1, 2, 1)', '(4, 1, 4)', '(3, 2, 2)', '(2, 3)', '(3,)', '(1, 3, 3)', '(1, 2, 4)', '(6, 3, 3)', '(2, 3, 3)', '(3, 2)', '(3, 1, 6)'

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi_mrmr, f1_score_list_lr_90_mi_mrmr, auc_list_lr_90_mi_mrmr, param_list_lr_90_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi_mrmr, f1_score_list_rfc_90_mi_mrmr, auc_list_rfc_90_mi_mrmr, param_list_rfc_90_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi_mrmr, f1_score_list_svm_90_mi_mrmr, auc_list_svm_90_mi_mrmr, param_list_svm_90_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi_mrmr, f1_score_list_xgb_90_mi_mrmr, auc_list_xgb_90_mi_mrmr, param_list_xgb_90_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_90', params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(2, 6)', '(6,)', '(3, 3, 2)', '(4, 2)', '(2, 6, 1)', 'F4', '(1, 2, 3)', '(3, 1, 1)', '(6, 2, 3)', '(2, 1)', '(3, 3, 4)', 'U5', '(3, 3, 3)', '(6, 1, 4)', '(6, 1, 1)', '(1, 4, 4)', '(2, 2, 3)', '(4, 1, 3)', '(6, 1, 2)', '(2, 6, 6)', '(3, 1, 2)', '(2, 1, 6)', '(6, 2, 2)', '(4, 4, 1)', '(4, 2, 1)', '(2, 2, 1)', 'Q5', '(4, 1)', '(2, 1, 3)', 'B5', 'A4', '(6, 3, 1)', 'P5', '(1, 1, 4)', '(2, 1, 2)', 'A5', 'C5', '(1, 1)', '(2, 4, 1)', '(4, 2, 3)', '(4,)', '(1, 6)', '(4, 1, 1)', '(6, 1, 6)', '(1, 1, 3)', '(2, 3, 2)', '(2, 2, 2)', 'B4', 'N5', '(1, 1, 6)', '(3, 3)', '(3, 1)', '(1, 1, 2)', '(4, 4, 4)', 'E5', '(2, 3, 4)', '(4, 4)', '(6, 3)', '(1, 4)', '

# PCA

In [None]:
def model_train_predict_pca(model, k, dataframes=list_sample_dataframes, params=None):
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    param_list = []
    
    for sample in dataframes:
        x = sample.drop(['Unnamed: 0', 'conversion_class'], axis=1)
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        
        pca = PCA(n_components=k)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)
        
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1, verbose=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        param_list.append(clf.best_params_)

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    print('Best Parameters', param_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, param_list  

## 10 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_pca, f1_score_list_lr_10_pca, auc_list_lr_10_pca, param_list_lr_10_pca = model_train_predict_pca(lr, 14, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_pca, f1_score_list_rfc_10_pca, auc_list_rfc_10_pca, param_list_rfc_10_pca = model_train_predict_pca(rfc, 14, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_pca, f1_score_list_svm_10_pca, auc_list_svm_10_pca, param_list_svm_10_pca = model_train_predict_pca(svc, 14, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_pca, f1_score_list_xgb_10_pca, auc_list_xgb_10_pca, param_list_xgb_10_pca = model_train_predict_pca(xgbc, 14, params=params)

Average Accuracy 0.7980769230769231
Average F1 Score 0.7980769230769231
Average AUC 0.7980769230769231
Max Accuracy 0.7980769230769231
Max F1 Score 0.7980769230769231
Max AUC 0.7980769230769231
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8072115384615385
Average F1 Score 0.8132366127549174
Average AUC 0.8072115384615385
Max Accuracy 0.8173076923076923
Max F1 Score 0.824074074074074
Max AUC 0.8173076923076925
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 2


Average Accuracy 0.7932692307692307
Average F1 Score 0.8018433179723502
Average AUC 0.7932692307692308
Max Accuracy 0.7932692307692307
Max F1 Score 0.8018433179723502
Max AUC 0.7932692307692308
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8173076923076923
Average F1 Sco

## 20 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_pca, f1_score_list_lr_20_pca, auc_list_lr_20_pca, param_list_lr_20_pca = model_train_predict_pca(lr, 28, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_pca, f1_score_list_rfc_20_pca, auc_list_rfc_20_pca, param_list_rfc_20_pca = model_train_predict_pca(rfc, 28, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_pca, f1_score_list_svm_20_pca, auc_list_svm_20_pca, param_list_svm_20_pca = model_train_predict_pca(svc, 28, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_pca, f1_score_list_xgb_20_pca, auc_list_xgb_20_pca, param_list_xgb_20_pca = model_train_predict_pca(xgbc, 28, params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8288461538461538
Average F1 Score 0.8268312597478822
Average AUC 0.828846153846154
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3


Average Accuracy 0.8365384615384615
Average F1 Score 0.8365384615384615
Average AUC 0.8365384615384617
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8490384615384615
Average F1 Sco

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_pca, f1_score_list_lr_30_pca, auc_list_lr_30_pca, param_list_lr_30_pca = model_train_predict_pca(lr, 42, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_pca, f1_score_list_rfc_30_pca, auc_list_rfc_30_pca, param_list_rfc_30_pca = model_train_predict_pca(rfc, 42, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_pca, f1_score_list_svm_30_pca, auc_list_svm_30_pca, param_list_svm_30_pca = model_train_predict_pca(svc, 42, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_pca, f1_score_list_xgb_30_pca, auc_list_xgb_30_pca, param_list_xgb_30_pca = model_train_predict_pca(xgbc, 42, params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8399038461538462
Average F1 Score 0.8379669508604163
Average AUC 0.8399038461538462
Max Accuracy 0.8509615384615384
Max F1 Score 0.848780487804878
Max AUC 0.8509615384615385
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2


Average Accuracy 0.8365384615384615
Average F1 Score 0.8365384615384615
Average AUC 0.8365384615384617
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.858173076923077
Average F1 Scor

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_pca, f1_score_list_lr_50_pca, auc_list_lr_50_pca, param_list_lr_50_pca = model_train_predict_pca(lr, 69, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_pca, f1_score_list_rfc_50_pca, auc_list_rfc_50_pca, param_list_rfc_50_pca = model_train_predict_pca(rfc, 69, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_pca, f1_score_list_svm_50_pca, auc_list_svm_50_pca, param_list_svm_50_pca = model_train_predict_pca(svc, 69, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_pca, f1_score_list_xgb_50_pca, auc_list_xgb_50_pca, param_list_xgb_50_pca = model_train_predict_pca(xgbc, 69, params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8480769230769232
Average F1 Score 0.8481752226503296
Average AUC 0.8480769230769232
Max Accuracy 0.8653846153846154
Max F1 Score 0.8666666666666667
Max AUC 0.8653846153846155
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8


Average Accuracy 0.8365384615384615
Average F1 Score 0.8365384615384615
Average AUC 0.8365384615384617
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8624999999999998
Average F1 Sc

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_pca, f1_score_list_lr_75_pca, auc_list_lr_75_pca, param_list_lr_75_pca = model_train_predict_pca(lr, 104, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_pca, f1_score_list_rfc_75_pca, auc_list_rfc_75_pca, param_list_rfc_75_pca = model_train_predict_pca(rfc, 104, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_pca, f1_score_list_svm_75_pca, auc_list_svm_75_pca, param_list_svm_75_pca = model_train_predict_pca(svc, 104, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_pca, f1_score_list_xgb_75_pca, auc_list_xgb_75_pca, param_list_xgb_75_pca = model_train_predict_pca(xgbc, 104, params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8413461538461539
Average F1 Score 0.8424404903576278
Average AUC 0.8413461538461539
Max Accuracy 0.8605769230769231
Max F1 Score 0.8612440191387559
Max AUC 0.8605769230769231
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8


Average Accuracy 0.8365384615384615
Average F1 Score 0.8365384615384615
Average AUC 0.8365384615384617
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8605769230769231
Average F1 Sc

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_pca, f1_score_list_lr_90_pca, auc_list_lr_90_pca, param_list_lr_90_pca = model_train_predict_pca(lr, 125, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_pca, f1_score_list_rfc_90_pca, auc_list_rfc_90_pca, param_list_rfc_90_pca = model_train_predict_pca(rfc, 125, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_pca, f1_score_list_svm_90_pca, auc_list_svm_90_pca, param_list_svm_90_pca = model_train_predict_pca(svc, 125, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_pca, f1_score_list_xgb_90_pca, auc_list_xgb_90_pca, param_list_xgb_90_pca = model_train_predict_pca(xgbc, 125, params=params)

Average Accuracy 0.8221153846153847
Average F1 Score 0.8177339901477833
Average AUC 0.8221153846153847
Max Accuracy 0.8221153846153846
Max F1 Score 0.8177339901477833
Max AUC 0.8221153846153846
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8423076923076923
Average F1 Score 0.8417311719149062
Average AUC 0.8423076923076923
Max Accuracy 0.8557692307692307
Max F1 Score 0.8529411764705882
Max AUC 0.8557692307692308
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5


Average Accuracy 0.8365384615384615
Average F1 Score 0.8365384615384615
Average AUC 0.8365384615384617
Max Accuracy 0.8365384615384616
Max F1 Score 0.8365384615384616
Max AUC 0.8365384615384617
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0


Average Accuracy 0.8557692307692306
Average F1 Sc

# Saving results

In [None]:
models = ['lr', 'rfc', 'svc', 'xgbc']
models = [value for value in models for _ in range(10)] * 6
percentiles = ['10', '20', '30', '50', '75', '90']
percentiles = [value for value in percentiles for _ in range(40)]
samples = [x for x in range(1, 11)]
samples = samples * 24

print(len(models))
print(len(percentiles))
print(len(samples))

240
240
240


In [None]:
overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi + accuracy_list_svm_10_mi + accuracy_list_xgb_10_mi +
                            accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi + accuracy_list_svm_20_mi + accuracy_list_xgb_20_mi +
                            accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi + accuracy_list_svm_30_mi + accuracy_list_xgb_30_mi +
                            accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi + accuracy_list_svm_50_mi + accuracy_list_xgb_50_mi +
                            accuracy_list_lr_75_mi + accuracy_list_rfc_75_mi + accuracy_list_svm_75_mi + accuracy_list_xgb_75_mi +
                            accuracy_list_lr_90_mi + accuracy_list_rfc_90_mi + accuracy_list_svm_90_mi + accuracy_list_xgb_90_mi)

overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_svm_10_mi + f1_score_list_xgb_10_mi +
                            f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_svm_20_mi + f1_score_list_xgb_20_mi +
                            f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_svm_30_mi + f1_score_list_xgb_30_mi +
                            f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_svm_50_mi + f1_score_list_xgb_50_mi +
                            f1_score_list_lr_75_mi + f1_score_list_rfc_75_mi + f1_score_list_svm_75_mi + f1_score_list_xgb_75_mi +
                            f1_score_list_lr_90_mi + f1_score_list_rfc_90_mi + f1_score_list_svm_90_mi + f1_score_list_xgb_90_mi)

overall_auc_list_mi = (auc_list_lr_10_mi + auc_list_rfc_10_mi + auc_list_svm_10_mi + auc_list_xgb_10_mi +
                            auc_list_lr_20_mi + auc_list_rfc_20_mi + auc_list_svm_20_mi + auc_list_xgb_20_mi +
                            auc_list_lr_30_mi + auc_list_rfc_30_mi + auc_list_svm_30_mi + auc_list_xgb_30_mi +
                            auc_list_lr_50_mi + auc_list_rfc_50_mi + auc_list_svm_50_mi + auc_list_xgb_50_mi +
                            auc_list_lr_75_mi + auc_list_rfc_75_mi + auc_list_svm_75_mi + auc_list_xgb_75_mi +
                            auc_list_lr_90_mi + auc_list_rfc_90_mi + auc_list_svm_90_mi + auc_list_xgb_90_mi)

overall_param_list_mi = (param_list_lr_10_mi + param_list_rfc_10_mi + param_list_svm_10_mi + param_list_xgb_10_mi +
                            param_list_lr_20_mi + param_list_rfc_20_mi + param_list_svm_20_mi + param_list_xgb_20_mi +
                            param_list_lr_30_mi + param_list_rfc_30_mi + param_list_svm_30_mi + param_list_xgb_30_mi +
                            param_list_lr_50_mi + param_list_rfc_50_mi + param_list_svm_50_mi + param_list_xgb_50_mi +
                            param_list_lr_75_mi + param_list_rfc_75_mi + param_list_svm_75_mi + param_list_xgb_75_mi +
                            param_list_lr_90_mi + param_list_rfc_90_mi + param_list_svm_90_mi + param_list_xgb_90_mi)

In [None]:
overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_svm_10_mrmr + accuracy_list_xgb_10_mrmr +
                            accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_svm_20_mrmr + accuracy_list_xgb_20_mrmr +
                            accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_svm_30_mrmr + accuracy_list_xgb_30_mrmr +
                            accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_svm_50_mrmr + accuracy_list_xgb_50_mrmr +
                            accuracy_list_lr_75_mrmr + accuracy_list_rfc_75_mrmr + accuracy_list_svm_75_mrmr + accuracy_list_xgb_75_mrmr +
                            accuracy_list_lr_90_mrmr + accuracy_list_rfc_90_mrmr + accuracy_list_svm_90_mrmr + accuracy_list_xgb_90_mrmr)

overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr + f1_score_list_svm_10_mrmr + f1_score_list_xgb_10_mrmr +
                            f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr + f1_score_list_svm_20_mrmr + f1_score_list_xgb_20_mrmr +
                            f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr + f1_score_list_svm_30_mrmr + f1_score_list_xgb_30_mrmr +
                            f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr + f1_score_list_svm_50_mrmr + f1_score_list_xgb_50_mrmr +
                            f1_score_list_lr_75_mrmr + f1_score_list_rfc_75_mrmr + f1_score_list_svm_75_mrmr + f1_score_list_xgb_75_mrmr +
                            f1_score_list_lr_90_mrmr + f1_score_list_rfc_90_mrmr + f1_score_list_svm_90_mrmr + f1_score_list_xgb_90_mrmr)

overall_auc_list_mrmr = (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_svm_10_mrmr + auc_list_xgb_10_mrmr +
                            auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_svm_20_mrmr + auc_list_xgb_20_mrmr +
                            auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_svm_30_mrmr + auc_list_xgb_30_mrmr +
                            auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_svm_50_mrmr + auc_list_xgb_50_mrmr +
                            auc_list_lr_75_mrmr + auc_list_rfc_75_mrmr + auc_list_svm_75_mrmr + auc_list_xgb_75_mrmr +
                            auc_list_lr_90_mrmr + auc_list_rfc_90_mrmr + auc_list_svm_90_mrmr + auc_list_xgb_90_mrmr)

overall_param_list_mrmr = (param_list_lr_10_mrmr + param_list_rfc_10_mrmr + param_list_svm_10_mrmr + param_list_xgb_10_mrmr +
                            param_list_lr_20_mrmr + param_list_rfc_20_mrmr + param_list_svm_20_mrmr + param_list_xgb_20_mrmr +
                            param_list_lr_30_mrmr + param_list_rfc_30_mrmr + param_list_svm_30_mrmr + param_list_xgb_30_mrmr +
                            param_list_lr_50_mrmr + param_list_rfc_50_mrmr + param_list_svm_50_mrmr + param_list_xgb_50_mrmr +
                            param_list_lr_75_mrmr + param_list_rfc_75_mrmr + param_list_svm_75_mrmr + param_list_xgb_75_mrmr +
                            param_list_lr_90_mrmr + param_list_rfc_90_mrmr + param_list_svm_90_mrmr + param_list_xgb_90_mrmr)

In [None]:
overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_svm_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr +
                            accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_svm_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr +
                            accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_svm_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr +
                            accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_svm_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr +
                            accuracy_list_lr_75_mi_mrmr + accuracy_list_rfc_75_mi_mrmr + accuracy_list_svm_75_mi_mrmr + accuracy_list_xgb_75_mi_mrmr +
                            accuracy_list_lr_90_mi_mrmr + accuracy_list_rfc_90_mi_mrmr + accuracy_list_svm_90_mi_mrmr + accuracy_list_xgb_90_mi_mrmr)

overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_svm_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr +
                            f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_svm_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr +
                            f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_svm_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr +
                            f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_svm_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr +
                            f1_score_list_lr_75_mi_mrmr + f1_score_list_rfc_75_mi_mrmr + f1_score_list_svm_75_mi_mrmr + f1_score_list_xgb_75_mi_mrmr +
                            f1_score_list_lr_90_mi_mrmr + f1_score_list_rfc_90_mi_mrmr + f1_score_list_svm_90_mi_mrmr + f1_score_list_xgb_90_mi_mrmr)

overall_auc_list_mi_mrmr = (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_svm_10_mi_mrmr + auc_list_xgb_10_mi_mrmr +
                            auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_svm_20_mi_mrmr + auc_list_xgb_20_mi_mrmr +
                            auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_svm_30_mi_mrmr + auc_list_xgb_30_mi_mrmr +
                            auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_svm_50_mi_mrmr + auc_list_xgb_50_mi_mrmr +
                            auc_list_lr_75_mi_mrmr + auc_list_rfc_75_mi_mrmr + auc_list_svm_75_mi_mrmr + auc_list_xgb_75_mi_mrmr +
                            auc_list_lr_90_mi_mrmr + auc_list_rfc_90_mi_mrmr + auc_list_svm_90_mi_mrmr + auc_list_xgb_90_mi_mrmr)

overall_param_list_mi_mrmr = (param_list_lr_10_mi_mrmr + param_list_rfc_10_mi_mrmr + param_list_svm_10_mi_mrmr + param_list_xgb_10_mi_mrmr +
                            param_list_lr_20_mi_mrmr + param_list_rfc_20_mi_mrmr + param_list_svm_20_mi_mrmr + param_list_xgb_20_mi_mrmr +
                            param_list_lr_30_mi_mrmr + param_list_rfc_30_mi_mrmr + param_list_svm_30_mi_mrmr + param_list_xgb_30_mi_mrmr +
                            param_list_lr_50_mi_mrmr + param_list_rfc_50_mi_mrmr + param_list_svm_50_mi_mrmr + param_list_xgb_50_mi_mrmr +
                            param_list_lr_75_mi_mrmr + param_list_rfc_75_mi_mrmr + param_list_svm_75_mi_mrmr + param_list_xgb_75_mi_mrmr +
                            param_list_lr_90_mi_mrmr + param_list_rfc_90_mi_mrmr + param_list_svm_90_mi_mrmr + param_list_xgb_90_mi_mrmr)

In [None]:
overall_accuracy_list_pca = (accuracy_list_lr_10_pca + accuracy_list_rfc_10_pca + accuracy_list_svm_10_pca + accuracy_list_xgb_10_pca +
                            accuracy_list_lr_20_pca + accuracy_list_rfc_20_pca + accuracy_list_svm_20_pca + accuracy_list_xgb_20_pca +
                            accuracy_list_lr_30_pca + accuracy_list_rfc_30_pca + accuracy_list_svm_30_pca + accuracy_list_xgb_30_pca +
                            accuracy_list_lr_50_pca + accuracy_list_rfc_50_pca + accuracy_list_svm_50_pca + accuracy_list_xgb_50_pca +
                            accuracy_list_lr_75_pca + accuracy_list_rfc_75_pca + accuracy_list_svm_75_pca + accuracy_list_xgb_75_pca +
                            accuracy_list_lr_90_pca + accuracy_list_rfc_90_pca + accuracy_list_svm_90_pca + accuracy_list_xgb_90_pca)

overall_f1_score_list_pca = (f1_score_list_lr_10_pca + f1_score_list_rfc_10_pca + f1_score_list_svm_10_pca + f1_score_list_xgb_10_pca +
                            f1_score_list_lr_20_pca + f1_score_list_rfc_20_pca + f1_score_list_svm_20_pca + f1_score_list_xgb_20_pca +
                            f1_score_list_lr_30_pca + f1_score_list_rfc_30_pca + f1_score_list_svm_30_pca + f1_score_list_xgb_30_pca +
                            f1_score_list_lr_50_pca + f1_score_list_rfc_50_pca + f1_score_list_svm_50_pca + f1_score_list_xgb_50_pca +
                            f1_score_list_lr_75_pca + f1_score_list_rfc_75_pca + f1_score_list_svm_75_pca + f1_score_list_xgb_75_pca +
                            f1_score_list_lr_90_pca + f1_score_list_rfc_90_pca + f1_score_list_svm_90_pca + f1_score_list_xgb_90_pca)

overall_auc_list_pca = (auc_list_lr_10_pca + auc_list_rfc_10_pca + auc_list_svm_10_pca + auc_list_xgb_10_pca +
                            auc_list_lr_20_pca + auc_list_rfc_20_pca + auc_list_svm_20_pca + auc_list_xgb_20_pca +
                            auc_list_lr_30_pca + auc_list_rfc_30_pca + auc_list_svm_30_pca + auc_list_xgb_30_pca +
                            auc_list_lr_50_pca + auc_list_rfc_50_pca + auc_list_svm_50_pca + auc_list_xgb_50_pca +
                            auc_list_lr_75_pca + auc_list_rfc_75_pca + auc_list_svm_75_pca + auc_list_xgb_75_pca +
                            auc_list_lr_90_pca + auc_list_rfc_90_pca + auc_list_svm_90_pca + auc_list_xgb_90_pca)

overall_param_list_pca = (param_list_lr_10_pca + param_list_rfc_10_pca + param_list_svm_10_pca + param_list_xgb_10_pca +
                            param_list_lr_20_pca + param_list_rfc_20_pca + param_list_svm_20_pca + param_list_xgb_20_pca +
                            param_list_lr_30_pca + param_list_rfc_30_pca + param_list_svm_30_pca + param_list_xgb_30_pca +
                            param_list_lr_50_pca + param_list_rfc_50_pca + param_list_svm_50_pca + param_list_xgb_50_pca +
                            param_list_lr_75_pca + param_list_rfc_75_pca + param_list_svm_75_pca + param_list_xgb_75_pca +
                            param_list_lr_90_pca + param_list_rfc_90_pca + param_list_svm_90_pca + param_list_xgb_90_pca)

In [None]:
print(len(overall_accuracy_list_mi))
print(len(overall_f1_score_list_mi))
print(len(overall_auc_list_mi))
print(len(overall_param_list_mi))

print(len(overall_accuracy_list_mrmr))
print(len(overall_f1_score_list_mrmr))
print(len(overall_auc_list_mrmr))
print(len(overall_param_list_mrmr))

print(len(overall_accuracy_list_mi_mrmr))
print(len(overall_f1_score_list_mi_mrmr))
print(len(overall_auc_list_mi_mrmr))
print(len(overall_param_list_mi_mrmr))

print(len(overall_accuracy_list_pca))
print(len(overall_f1_score_list_pca))
print(len(overall_auc_list_pca))
print(len(overall_param_list_pca))

240
240
240
240
240
240
240
240
240
240
240
240


In [None]:
results_dictionary = {
    'samples': samples,
    'models': models,
    'percentiles': percentiles,
    'mi_accuracy': overall_accuracy_list_mi,
    'mi_f1_score': overall_f1_score_list_mi,
    'mi_auc': overall_auc_list_mi,
    'mi_params': overall_param_list_mi,
    'mrmr_accuracy': overall_accuracy_list_mrmr,
    'mrmr_f1_score': overall_f1_score_list_mrmr,
    'mrmr_auc': overall_auc_list_mrmr,
    'mrmr_params': overall_param_list_mrmr,
    'mi_mrmr_accuracy': overall_accuracy_list_mi_mrmr,
    'mi_mrmr_f1_score': overall_f1_score_list_mi_mrmr,
    'mi_mrmr_auc': overall_auc_list_mi_mrmr,
    'mi_mrmr_params': overall_param_list_mi_mrmr,
    'pca_accuracy': overall_accuracy_list_pca,
    'pca_f1_score': overall_f1_score_list_pca,
    'pca_auc': overall_auc_list_pca,
    'pca_params': overall_param_list_pca
}
results_df = pd.DataFrame(results_dictionary)

results_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/overall_results.csv', index=False)