In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

import xgboost as xgb
import lightgbm as lgb

# import tensorflow as tf

import os
import re
import ast

In [41]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/'

def get_sample_df(directory=directory_dataframes):
    filename_list = []
    list_dataframes = []
    for filename in os.listdir(directory):
        print(filename)
        filename_list.append(filename)
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes, filename_list

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes, filename_sample_list = get_sample_df(directory_dataframes)

subsample_8.csv
subsample_9.csv
subsample_7.csv
subsample_6.csv
subsample_10.csv
subsample_4.csv
subsample_5.csv
subsample_1.csv
subsample_2.csv
subsample_3.csv


In [7]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes, params=None):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    best_params_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        x = x.rename(columns = lambda a:re.sub('[^A-Za-z0-9_]+', '', a))
        
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        
        
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        best_params_list.append(clf.best_params_)

    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    
    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    print(accuracy_list)
    print(auc_list)
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    print('Best Params based on Max Accuracy', best_params_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, best_params_list  


# Mutual Information

## 10 Percentile

In [8]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi, param_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi, param_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi, param_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100, 200, 500], 'max_depth': [3, 5, 10, 20, 50, 100]}
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi, param_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mi, f1_score_list_lgb_10_mi, auc_list_lgb_10_mi, param_list_lgb_10_mi = model_train_predict(lgbc, 'mi_feat_list_10', params=params)

(12052, 16) (12052,) (3014, 16) (3014,)
Average Accuracy 0.8510617120106172
Average F1 Score 0.8474320438030037
Average AUC 0.8510617120106172
Max Accuracy 0.8526874585268746
Max F1 Score 0.8491847826086957
Max AUC 0.8526874585268746
[0.8526874585268746, 0.8526874585268746, 0.8506967485069675, 0.8506967485069675, 0.8500331785003318, 0.8506967485069675, 0.8506967485069675, 0.8500331785003318, 0.8506967485069675, 0.851692103516921]
[0.8526874585268746, 0.8526874585268746, 0.8506967485069675, 0.8506967485069675, 0.8500331785003318, 0.8506967485069675, 0.8506967485069675, 0.8500331785003318, 0.8506967485069675, 0.851692103516921]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(4, 1)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)']
Best Features

## 20 Percentile

In [9]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi, param_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi, param_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi, param_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi, param_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mi, f1_score_list_lgb_20_mi, auc_list_lgb_20_mi, param_list_lgb_20_mi = model_train_predict(lgbc, 'mi_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(12052, 32) (12052,) (3014, 32) (3014,)
Average Accuracy 0.8716323822163238
Average F1 Score 0.8701901462213879
Average AUC 0.8716323822163238
Max Accuracy 0.8749170537491705
Max F1 Score 0.8735323716873532
Max AUC 0.8749170537491705
[0.8749170537491705, 0.8709356337093563, 0.8722627737226277, 0.8712674187126742, 0.8702720637027206, 0.8719309887193099, 0.8692767086927671, 0.8729263437292635, 0.8709356337093563, 0.8715992037159921]
[0.8749170537491705, 0.8709356337093563, 0.8722627737226278, 0.8712674187126742, 0.8702720637027207, 0.8719309887193099, 0.8692767086927671, 0.8729263437292635, 0.8709356337093563, 0.8715992037159921]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 

## 30 Percentile

In [10]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi, param_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi, param_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi, param_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi, param_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mi, f1_score_list_lgb_30_mi, auc_list_lgb_30_mi, param_list_lgb_30_mi = model_train_predict(lgbc, 'mi_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 48) (12052,) (3014, 48) (3014,)
Average Accuracy 0.875149303251493
Average F1 Score 0.8740750275882423
Average AUC 0.875149303251493
Max Accuracy 0.8765759787657598
Max F1 Score 0.8756684491978609
Max AUC 0.8765759787657599
[0.8765759787657598, 0.8762441937624419, 0.8742534837425349, 0.8765759787657598, 0.8749170537491705, 0.8762441937624419, 0.8709356337093563, 0.8749170537491705, 0.8755806237558063, 0.8752488387524884]
[0.8765759787657598, 0.876244193762442, 0.8742534837425349, 0.8765759787657599, 0.8749170537491705, 0.8762441937624419, 0.8709356337093562, 0.8749170537491705, 0.8755806237558064, 0.8752488387524884]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(3, 2)', 

## 50 Percentile

In [11]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi, param_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi, param_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi, param_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi, param_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mi, f1_score_list_lgb_50_mi, auc_list_lgb_50_mi, param_list_lgb_50_mi = model_train_predict(lgbc, 'mi_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 80) (12052,) (3014, 80) (3014,)
Average Accuracy 0.8821831453218314
Average F1 Score 0.8815420703579988
Average AUC 0.8821831453218316
Max Accuracy 0.8832116788321168
Max F1 Score 0.8829008649367931
Max AUC 0.8832116788321167
[0.8822163238221632, 0.8818845388188454, 0.8815527538155276, 0.8825481088254811, 0.8818845388188454, 0.8818845388188454, 0.8832116788321168, 0.8822163238221632, 0.8825481088254811, 0.8818845388188454]
[0.8822163238221633, 0.8818845388188454, 0.8815527538155274, 0.8825481088254812, 0.8818845388188454, 0.8818845388188454, 0.8832116788321167, 0.8822163238221633, 0.8825481088254811, 0.8818845388188454]
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)',

## 75 Percentile

In [56]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi, f1_score_list_lr_75_mi, auc_list_lr_75_mi, param_list_lr_75_mi = model_train_predict(lr, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi, f1_score_list_rfc_75_mi, auc_list_rfc_75_mi, param_list_rfc_75_mi = model_train_predict(rfc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi, f1_score_list_svm_75_mi, auc_list_svm_75_mi, param_list_svm_75_mi = model_train_predict(svc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi, f1_score_list_xgb_75_mi, auc_list_xgb_75_mi, param_list_xgb_75_mi = model_train_predict(xgbc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mi, f1_score_list_lgb_75_mi, auc_list_lgb_75_mi, param_list_lgb_75_mi = model_train_predict(lgbc, 'mi_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 120) (12052,) (3014, 120) (3014,)
Average Accuracy 0.88364299933643
Average F1 Score 0.8832596355875824
Average AUC 0.88364299933643
Max Accuracy 0.8845388188453882
Max F1 Score 0.8839226150767178
Max AUC 0.8845388188453881
[0.882879893828799, 0.8835434638354346, 0.8835434638354346, 0.8838752488387525, 0.8832116788321168, 0.8838752488387525, 0.8845388188453882, 0.8838752488387525, 0.8835434638354346, 0.8835434638354346]
[0.8828798938287987, 0.8835434638354345, 0.8835434638354345, 0.8838752488387525, 0.8832116788321168, 0.8838752488387525, 0.8845388188453881, 0.8838752488387525, 0.8835434638354345, 0.8835434638354347]
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(

## 90 Percentile

In [57]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi, f1_score_list_lr_90_mi, auc_list_lr_90_mi, param_list_lr_90_mi = model_train_predict(lr, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi, f1_score_list_rfc_90_mi, auc_list_rfc_90_mi, param_list_rfc_90_mi = model_train_predict(rfc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi, f1_score_list_svm_90_mi, auc_list_svm_90_mi, param_list_svm_90_mi = model_train_predict(svc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi, f1_score_list_xgb_90_mi, auc_list_xgb_90_mi, param_list_xgb_90_mi = model_train_predict(xgbc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mi, f1_score_list_lgb_90_mi, auc_list_lgb_90_mi, param_list_lgb_90_mi = model_train_predict(lgbc, 'mi_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 144) (12052,) (3014, 144) (3014,)
Average Accuracy 0.8837425348374252
Average F1 Score 0.8834098141224281
Average AUC 0.8837425348374252
Max Accuracy 0.8838752488387525
Max F1 Score 0.8835662009314703
Max AUC 0.8838752488387525
[0.8838752488387525, 0.8835434638354346, 0.8835434638354346, 0.8838752488387525, 0.8832116788321168, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525]
[0.8838752488387525, 0.8835434638354345, 0.8835434638354345, 0.8838752488387525, 0.8832116788321168, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525, 0.8838752488387525]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)

# mRMR

## 10 Percentile

In [14]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr, param_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr, param_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr, param_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr, param_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mrmr, f1_score_list_lgb_10_mrmr, auc_list_lgb_10_mrmr, param_list_lgb_10_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_10', params=params)

(12052, 14) (12052,) (3014, 14) (3014,)
Average Accuracy 0.8523556735235568
Average F1 Score 0.8482782134333448
Average AUC 0.8523556735235568
Max Accuracy 0.8523556735235568
Max F1 Score 0.8482782134333448
Max AUC 0.8523556735235568
[0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568]
[0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568, 0.8523556735235568]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)']
Best Featur

## 20 Percentile

In [15]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr, param_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr, param_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr, param_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr, param_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mrmr, f1_score_list_lgb_20_mrmr, auc_list_lgb_20_mrmr, param_list_lgb_20_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 28) (12052,) (3014, 28) (3014,)
Average Accuracy 0.8646317186463172
Average F1 Score 0.8622552329507089
Average AUC 0.8646317186463172
Max Accuracy 0.8646317186463172
Max F1 Score 0.8622552329507089
Max AUC 0.8646317186463173
[0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172, 0.8646317186463172]
[0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173, 0.8646317186463173]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', '(3

## 30 Percentile

In [16]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr, param_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr, param_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr, param_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr, param_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mrmr, f1_score_list_lgb_30_mrmr, auc_list_lgb_30_mrmr, param_list_lgb_30_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 42) (12052,) (3014, 42) (3014,)
Average Accuracy 0.8732581287325812
Average F1 Score 0.8720696584058942
Average AUC 0.8732581287325815
Max Accuracy 0.8732581287325812
Max F1 Score 0.8720696584058941
Max AUC 0.8732581287325814
[0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812, 0.8732581287325812]
[0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814, 0.8732581287325814]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', '(3

## 50 Percentile

In [17]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr, param_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr, param_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr, param_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr, param_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mrmr, f1_score_list_lgb_50_mrmr, auc_list_lgb_50_mrmr, param_list_lgb_50_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 69) (12052,) (3014, 69) (3014,)
Average Accuracy 0.8825481088254812
Average F1 Score 0.8815261044176707
Average AUC 0.882548108825481
Max Accuracy 0.8825481088254811
Max F1 Score 0.8815261044176707
Max AUC 0.882548108825481
[0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811, 0.8825481088254811]
[0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481, 0.882548108825481]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', '(3,)', 'unigra

## 75 Percentile

In [58]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mrmr, f1_score_list_lr_75_mrmr, auc_list_lr_75_mrmr, param_list_lr_75_mrmr = model_train_predict(lr, 'mrmr_feat_list_75*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mrmr, f1_score_list_rfc_75_mrmr, auc_list_rfc_75_mrmr, param_list_rfc_75_mrmr = model_train_predict(rfc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mrmr, f1_score_list_svm_75_mrmr, auc_list_svm_75_mrmr, param_list_svm_75_mrmr = model_train_predict(svc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mrmr, f1_score_list_xgb_75_mrmr, auc_list_xgb_75_mrmr, param_list_xgb_75_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mrmr, f1_score_list_lgb_75_mrmr, auc_list_lgb_75_mrmr, param_list_lgb_75_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 104) (12052,) (3014, 104) (3014,)
Average Accuracy 0.8845388188453882
Average F1 Score 0.8836898395721924
Average AUC 0.8845388188453882
Max Accuracy 0.8845388188453882
Max F1 Score 0.8836898395721925
Max AUC 0.8845388188453882
[0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882]
[0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882, 0.8845388188453882]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', '

## 90 Percentile

In [59]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mrmr, f1_score_list_lr_90_mrmr, auc_list_lr_90_mrmr, param_list_lr_90_mrmr = model_train_predict(lr, 'mrmr_feat_list_90*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mrmr, f1_score_list_rfc_90_mrmr, auc_list_rfc_90_mrmr, param_list_rfc_90_mrmr = model_train_predict(rfc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mrmr, f1_score_list_svm_90_mrmr, auc_list_svm_90_mrmr, param_list_svm_90_mrmr = model_train_predict(svc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mrmr, f1_score_list_xgb_90_mrmr, auc_list_xgb_90_mrmr, param_list_xgb_90_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mrmr, f1_score_list_lgb_90_mrmr, auc_list_lgb_90_mrmr, param_list_lgb_90_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 125) (12052,) (3014, 125) (3014,)
Average Accuracy 0.8832116788321167
Average F1 Score 0.8824315297261188
Average AUC 0.8832116788321167
Max Accuracy 0.8832116788321168
Max F1 Score 0.8824315297261188
Max AUC 0.8832116788321168
[0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168]
[0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168, 0.8832116788321168]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', 'trigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1, 1)', '(2, 3)', '(2,)', '

KeyboardInterrupt: 

# MI and mRMR

## 10 Percentile

In [20]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr, param_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr, param_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr, param_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr, param_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mi_mrmr, f1_score_list_lgb_10_mi_mrmr, auc_list_lgb_10_mi_mrmr, param_list_lgb_10_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_10', params=params)

(12052, 10) (12052,) (3014, 10) (3014,)
Average Accuracy 0.8358327803583278
Average F1 Score 0.8306173543202673
Average AUC 0.8358327803583278
Max Accuracy 0.8364299933642999
Max F1 Score 0.830653438248375
Max AUC 0.8364299933642999
[0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8357664233576643, 0.8364299933642999]
[0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8357664233576642, 0.8364299933642999]
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['(3, 1)', '(2, 1)', '(4, 1, 1)', '(2, 3)', '(1, 2, 1)', 'trigram_entropy', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy']
Best Features based on Max F1 Score ['(3, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)'

## 20 Percentile

In [21]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr, param_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr, param_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr, param_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr, param_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mi_mrmr, f1_score_list_lgb_20_mi_mrmr, auc_list_lgb_20_mi_mrmr, param_list_lgb_20_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_20', params=params)

(12052, 23) (12052,) (3014, 23) (3014,)
Average Accuracy 0.863271400132714
Average F1 Score 0.8608294622416336
Average AUC 0.863271400132714
Max Accuracy 0.864963503649635
Max F1 Score 0.8622673434856176
Max AUC 0.864963503649635
[0.864963503649635, 0.8629727936297279, 0.8626410086264101, 0.8629727936297279, 0.8629727936297279, 0.8629727936297279, 0.8626410086264101, 0.864963503649635, 0.8626410086264101, 0.8629727936297279]
[0.864963503649635, 0.8629727936297279, 0.86264100862641, 0.8629727936297279, 0.8629727936297279, 0.8629727936297279, 0.86264100862641, 0.864963503649635, 0.86264100862641, 0.8629727936297279]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(1, 4, 1)', '(2, 3, 1)', '(3, 1)', '(2, 1, 2)', '(4,)', '(2,)', 'unigram_entropy', '(1, 2, 3)', '(2, 1)', '(4, 1)', '(1, 2, 1)', 'trigram_entropy', '(1, 1, 4)', '(1, 2)', '(1, 4)', '(4, 1, 1)', '(2, 3)', 'pattern_hvg_4_

## 30 Percentile

In [22]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr, param_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr, param_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr, param_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr, param_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mi_mrmr, f1_score_list_lgb_30_mi_mrmr, auc_list_lgb_30_mi_mrmr, param_list_lgb_30_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 31) (12052,) (3014, 31) (3014,)
Average Accuracy 0.8747179827471798
Average F1 Score 0.8734243732404383
Average AUC 0.8747179827471798
Max Accuracy 0.8765759787657598
Max F1 Score 0.8753351206434317
Max AUC 0.8765759787657598
[0.8752488387524884, 0.8759124087591241, 0.8735899137358991, 0.8765759787657598, 0.8752488387524884, 0.8732581287325812, 0.8732581287325812, 0.8752488387524884, 0.8745852687458527, 0.8742534837425349]
[0.8752488387524884, 0.875912408759124, 0.873589913735899, 0.8765759787657598, 0.8752488387524885, 0.8732581287325812, 0.8732581287325812, 0.8752488387524885, 0.8745852687458526, 0.8742534837425349]
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(1, 4, 1)', '(3, 3, 1)', '(2, 1, 4)', '(2, 3, 1)', '(4, 1, 4)', '(3, 1)', '(2, 1, 2)', '(4,)', '(2,)', 'unigram_entropy', '(3, 3)', '(1, 2, 3)', '(1, 3, 3)', '(3, 1, 4)', '(2, 1, 1)', '(2, 1)', '(4, 1)', '(1

## 50 Percentile

In [23]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr, param_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr, param_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr, param_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr, param_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mi_mrmr, f1_score_list_lgb_50_mi_mrmr, auc_list_lgb_50_mi_mrmr, param_list_lgb_50_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 54) (12052,) (3014, 54) (3014,)
Average Accuracy 0.8830457863304579
Average F1 Score 0.8819685307226349
Average AUC 0.8830457863304579
Max Accuracy 0.8838752488387525
Max F1 Score 0.8828647925033467
Max AUC 0.8838752488387523
[0.8825481088254811, 0.8838752488387525, 0.8818845388188454, 0.8832116788321168, 0.8835434638354346, 0.8825481088254811, 0.8832116788321168, 0.8835434638354346, 0.8832116788321168, 0.882879893828799]
[0.8825481088254812, 0.8838752488387523, 0.8818845388188453, 0.8832116788321168, 0.8835434638354346, 0.8825481088254812, 0.8832116788321166, 0.8835434638354346, 0.8832116788321166, 0.882879893828799]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(1, 4, 1)', '(3, 3, 1)', '(6, 2, 3)', '(2, 1, 4)', '(2, 3, 3)', '(2, 3, 1)', '(1, 3)', '(4, 1, 4)', '(2, 2, 2)', 'A4', '(3, 1)', '(4, 1, 2)', '(2, 1, 2)', '(4,)', '(2,)', '(1, 2, 2)', '(6, 3)', 'unigram_entr

## 75 Percentile

In [24]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi_mrmr, f1_score_list_lr_75_mi_mrmr, auc_list_lr_75_mi_mrmr, param_list_lr_75_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi_mrmr, f1_score_list_rfc_75_mi_mrmr, auc_list_rfc_75_mi_mrmr, param_list_rfc_75_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi_mrmr, f1_score_list_svm_75_mi_mrmr, auc_list_svm_75_mi_mrmr, param_list_svm_75_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi_mrmr, f1_score_list_xgb_75_mi_mrmr, auc_list_xgb_75_mi_mrmr, param_list_xgb_75_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mi_mrmr, f1_score_list_lgb_75_mi_mrmr, auc_list_lgb_75_mi_mrmr, param_list_lgb_75_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_75', params=params)

## 90 Percentile

In [25]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi_mrmr, f1_score_list_lr_90_mi_mrmr, auc_list_lr_90_mi_mrmr, param_list_lr_90_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi_mrmr, f1_score_list_rfc_90_mi_mrmr, auc_list_rfc_90_mi_mrmr, param_list_rfc_90_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi_mrmr, f1_score_list_svm_90_mi_mrmr, auc_list_svm_90_mi_mrmr, param_list_svm_90_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi_mrmr, f1_score_list_xgb_90_mi_mrmr, auc_list_xgb_90_mi_mrmr, param_list_xgb_90_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mi_mrmr, f1_score_list_lgb_90_mi_mrmr, auc_list_lgb_90_mi_mrmr, param_list_lgb_90_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_90', params=params)

# PCA

In [26]:
def model_train_predict_pca(model, k, dataframes=list_sample_dataframes, params=None):
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    param_list = []
    
    for sample in dataframes:
        x = sample.drop(['Unnamed: 0', 'conversion_class'], axis=1)
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
        
        pca = PCA(n_components=k)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)
        
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        param_list.append(clf.best_params_)

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    print('Best Parameters', param_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, param_list  

## 10 Percentile

In [27]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_pca, f1_score_list_lr_10_pca, auc_list_lr_10_pca, param_list_lr_10_pca = model_train_predict_pca(lr, 14, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_pca, f1_score_list_rfc_10_pca, auc_list_rfc_10_pca, param_list_rfc_10_pca = model_train_predict_pca(rfc, 14, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_pca, f1_score_list_svm_10_pca, auc_list_svm_10_pca, param_list_svm_10_pca = model_train_predict_pca(svc, 14, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_pca, f1_score_list_xgb_10_pca, auc_list_xgb_10_pca, param_list_xgb_10_pca = model_train_predict_pca(xgbc, 14, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_pca, f1_score_list_lgb_10_pca, auc_list_lgb_10_pca, param_list_lgb_10_pca = model_train_predict_pca(lgbc, 14, params=params)

Average Accuracy 0.8305309734513274
Average F1 Score 0.8303808680248006
Average AUC 0.8305309734513274
Max Accuracy 0.8305309734513274
Max F1 Score 0.8303808680248006
Max AUC 0.8305309734513274
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.8476548672566372
Average F1 Score 0.8514715195688846
Average AUC 0.8476548672566372
Max Accuracy 0.8502212389380531
Max F1 Score 0.8541891018737886
Max AUC 0.8502212389380531
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 7
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.8530973451327434
Average F1 Score 0.8581196581196581
Average AUC 0.8530973451327434
Max Accuracy 0.8530973451327434
Max F1 Score 0.8581196581196581
Max AUC 0.8530973451327433
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F

## 20 Percentile

In [28]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_pca, f1_score_list_lr_20_pca, auc_list_lr_20_pca, param_list_lr_20_pca = model_train_predict_pca(lr, 28, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_pca, f1_score_list_rfc_20_pca, auc_list_rfc_20_pca, param_list_rfc_20_pca = model_train_predict_pca(rfc, 28, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_pca, f1_score_list_svm_20_pca, auc_list_svm_20_pca, param_list_svm_20_pca = model_train_predict_pca(svc, 28, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_pca, f1_score_list_xgb_20_pca, auc_list_xgb_20_pca, param_list_xgb_20_pca = model_train_predict_pca(xgbc, 28, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_pca, f1_score_list_lgb_20_pca, auc_list_lgb_20_pca, param_list_lgb_20_pca = model_train_predict_pca(lgbc, 28, params=params)

Average Accuracy 0.8542035398230089
Average F1 Score 0.8526056810556921
Average AUC 0.8542035398230088
Max Accuracy 0.8542035398230089
Max F1 Score 0.8526056810556922
Max AUC 0.8542035398230087
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.8633849557522124
Average F1 Score 0.8657732657777031
Average AUC 0.8633849557522124
Max Accuracy 0.8657079646017699
Max F1 Score 0.868358273693342
Max AUC 0.8657079646017699
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 4
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.8780973451327434
Average F1 Score 0.8801392212312378
Average AUC 0.8780973451327434
Max Accuracy 0.8780973451327434
Max F1 Score 0.8801392212312378
Max AUC 0.8780973451327434
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1

## 30 Percentile

In [29]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_pca, f1_score_list_lr_30_pca, auc_list_lr_30_pca, param_list_lr_30_pca = model_train_predict_pca(lr, 42, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_pca, f1_score_list_rfc_30_pca, auc_list_rfc_30_pca, param_list_rfc_30_pca = model_train_predict_pca(rfc, 42, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_pca, f1_score_list_svm_30_pca, auc_list_svm_30_pca, param_list_svm_30_pca = model_train_predict_pca(svc, 42, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_pca, f1_score_list_xgb_30_pca, auc_list_xgb_30_pca, param_list_xgb_30_pca = model_train_predict_pca(xgbc, 42, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_pca, f1_score_list_lgb_30_pca, auc_list_lgb_30_pca, param_list_lgb_30_pca = model_train_predict_pca(lgbc, 42, params=params)

Average Accuracy 0.8744911504424777
Average F1 Score 0.8733281141979526
Average AUC 0.8744911504424777
Max Accuracy 0.8747787610619469
Max F1 Score 0.8736042876284056
Max AUC 0.8747787610619469
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Parameters {'C': 1000, 'max_iter': 1000}


Average Accuracy 0.8855530973451327
Average F1 Score 0.8876824688353461
Average AUC 0.8855530973451327
Max Accuracy 0.8884955752212389
Max F1 Score 0.8908145580589255
Max AUC 0.8884955752212389
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Parameters {'max_depth': 10, 'n_estimators': 50}


Average Accuracy 0.8939380530973452
Average F1 Score 0.8955601017240798
Average AUC 0.8939380530973453
Max Accuracy 0.8942477876106195
Max F1 Score 0.8958605664488017
Max AUC 0.8942477876106194
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max

## 50 Percentile

In [30]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_pca, f1_score_list_lr_50_pca, auc_list_lr_50_pca, param_list_lr_50_pca = model_train_predict_pca(lr, 69, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_pca, f1_score_list_rfc_50_pca, auc_list_rfc_50_pca, param_list_rfc_50_pca = model_train_predict_pca(rfc, 69, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_pca, f1_score_list_svm_50_pca, auc_list_svm_50_pca, param_list_svm_50_pca = model_train_predict_pca(svc, 69, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_pca, f1_score_list_xgb_50_pca, auc_list_xgb_50_pca, param_list_xgb_50_pca = model_train_predict_pca(xgbc, 69, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_pca, f1_score_list_lgb_50_pca, auc_list_lgb_50_pca, param_list_lgb_50_pca = model_train_predict_pca(lgbc, 69, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.8812389380530974
Average F1 Score 0.8807002397432576
Average AUC 0.8812389380530974
Max Accuracy 0.8820796460176992
Max F1 Score 0.8815818706954011
Max AUC 0.8820796460176992
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 3
Best Parameters {'C': 1000, 'max_iter': 1000}


Average Accuracy 0.8882300884955752
Average F1 Score 0.8905454759679673
Average AUC 0.8882300884955752
Max Accuracy 0.8896017699115044
Max F1 Score 0.8917805248319237
Max AUC 0.8896017699115044
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.896438053097345
Average F1 Score 0.8979574016937674
Average AUC 0.896438053097345
Max Accuracy 0.8966814159292036
Max F1 Score 0.8981905384783083
Max AUC 0.8966814159292035
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max 

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_pca, f1_score_list_lr_75_pca, auc_list_lr_75_pca, param_list_lr_75_pca = model_train_predict_pca(lr, 104, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_pca, f1_score_list_rfc_75_pca, auc_list_rfc_75_pca, param_list_rfc_75_pca = model_train_predict_pca(rfc, 104, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_pca, f1_score_list_svm_75_pca, auc_list_svm_75_pca, param_list_svm_75_pca = model_train_predict_pca(svc, 104, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_pca, f1_score_list_xgb_75_pca, auc_list_xgb_75_pca, param_list_xgb_75_pca = model_train_predict_pca(xgbc, 104, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_pca, f1_score_list_lgb_75_pca, auc_list_lgb_75_pca, param_list_lgb_75_pca = model_train_predict_pca(lgbc, 104, params=params)

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_pca, f1_score_list_lr_90_pca, auc_list_lr_90_pca, param_list_lr_90_pca = model_train_predict_pca(lr, 125, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_pca, f1_score_list_rfc_90_pca, auc_list_rfc_90_pca, param_list_rfc_90_pca = model_train_predict_pca(rfc, 125, params=params)
print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_pca, f1_score_list_svm_90_pca, auc_list_svm_90_pca, param_list_svm_90_pca = model_train_predict_pca(svc, 125, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_pca, f1_score_list_xgb_90_pca, auc_list_xgb_90_pca, param_list_xgb_90_pca = model_train_predict_pca(xgbc, 125, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_pca, f1_score_list_lgb_90_pca, auc_list_lgb_90_pca, param_list_lgb_90_pca = model_train_predict_pca(lgbc, 125, params=params)

# Saving results

In [49]:
models = ['lr', 'rfc', 'svc', 'xgbc', 'lgbm']
models = [value for value in models for _ in range(10)] * 5
percentiles = ['10', '20', '30', '50', '75', '90']
percentiles = [value for value in percentiles for _ in range(50)]
filename_sample_list_final = filename_sample_list * 30

print(len(models))
print(len(percentiles))
print(len(filename_sample_list_final))

200
200
200


In [50]:
overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi + accuracy_list_svm_10_mi + accuracy_list_xgb_10_mi + accuracy_list_lgb_10_mi +
                            accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi + accuracy_list_svm_20_mi + accuracy_list_xgb_20_mi + accuracy_list_lgb_20_mi +
                            accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi + accuracy_list_svm_30_mi + accuracy_list_xgb_30_mi + accuracy_list_lgb_30_mi +
                            accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi + accuracy_list_svm_50_mi + accuracy_list_xgb_50_mi + accuracy_list_lgb_50_mi +
                            accuracy_list_lr_75_mi + accuracy_list_rfc_75_mi + accuracy_list_svm_75_mi + accuracy_list_xgb_75_mi + accuracy_list_lgb_75_mi +
                            accuracy_list_lr_90_mi + accuracy_list_rfc_90_mi + accuracy_list_svm_90_mi + accuracy_list_xgb_90_mi + accuracy_list_lgb_90_mi)

overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_svm_10_mi + f1_score_list_xgb_10_mi + f1_score_list_lgb_10_mi +
                            f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_svm_20_mi + f1_score_list_xgb_20_mi + f1_score_list_lgb_20_mi +
                            f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_svm_30_mi + f1_score_list_xgb_30_mi + f1_score_list_lgb_30_mi +
                            f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_svm_50_mi + f1_score_list_xgb_50_mi + f1_score_list_lgb_50_mi +
                            f1_score_list_lr_75_mi + f1_score_list_rfc_75_mi + f1_score_list_svm_75_mi + f1_score_list_xgb_75_mi + f1_score_list_lgb_75_mi +
                            f1_score_list_lr_90_mi + f1_score_list_rfc_90_mi + f1_score_list_svm_90_mi + f1_score_list_xgb_90_mi + f1_score_list_lgb_90_mi)

overall_auc_list_mi =  (auc_list_lr_10_mi + auc_list_rfc_10_mi + auc_list_svm_10_mi + auc_list_xgb_10_mi + auc_list_lgb_10_mi +
                        auc_list_lr_20_mi + auc_list_rfc_20_mi + auc_list_svm_20_mi + auc_list_xgb_20_mi + auc_list_lgb_20_mi +
                        auc_list_lr_30_mi + auc_list_rfc_30_mi + auc_list_svm_30_mi + auc_list_xgb_30_mi + auc_list_lgb_30_mi +
                        auc_list_lr_50_mi + auc_list_rfc_50_mi + auc_list_svm_50_mi + auc_list_xgb_50_mi + auc_list_lgb_50_mi +
                        auc_list_lr_75_mi + auc_list_rfc_75_mi + auc_list_svm_75_mi + auc_list_xgb_75_mi + auc_list_lgb_75_mi +
                        auc_list_lr_90_mi + auc_list_rfc_90_mi + auc_list_svm_90_mi + auc_list_xgb_90_mi + auc_list_lgb_90_mi)

overall_param_list_mi = (param_list_lr_10_mi + param_list_rfc_10_mi + param_list_svm_10_mi + param_list_xgb_10_mi + param_list_lgb_10_mi +
                            param_list_lr_20_mi + param_list_rfc_20_mi + param_list_svm_20_mi + param_list_xgb_20_mi + param_list_lgb_20_mi +
                            param_list_lr_30_mi + param_list_rfc_30_mi + param_list_svm_30_mi + param_list_xgb_30_mi + param_list_lgb_30_mi +
                            param_list_lr_50_mi + param_list_rfc_50_mi + param_list_svm_50_mi + param_list_xgb_50_mi + param_list_lgb_50_mi +
                            param_list_lr_75_mi + param_list_rfc_75_mi + param_list_svm_75_mi + param_list_xgb_75_mi + param_list_lgb_75_mi +
                            param_list_lr_90_mi + param_list_rfc_90_mi + param_list_svm_90_mi + param_list_xgb_90_mi + param_list_lgb_90_mi)

In [51]:
overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_svm_10_mrmr + accuracy_list_xgb_10_mrmr + accuracy_list_lgb_10_mrmr +
                            accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_svm_20_mrmr + accuracy_list_xgb_20_mrmr + accuracy_list_lgb_20_mrmr +
                            accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_svm_30_mrmr + accuracy_list_xgb_30_mrmr + accuracy_list_lgb_30_mrmr +
                            accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_svm_50_mrmr + accuracy_list_xgb_50_mrmr + accuracy_list_lgb_50_mrmr +
                            accuracy_list_lr_75_mrmr + accuracy_list_rfc_75_mrmr + accuracy_list_svm_75_mrmr + accuracy_list_xgb_75_mrmr + accuracy_list_lgb_75_mrmr +
                            accuracy_list_lr_90_mrmr + accuracy_list_rfc_90_mrmr + accuracy_list_svm_90_mrmr + accuracy_list_xgb_90_mrmr + accuracy_list_lgb_90_mrmr)

overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr + f1_score_list_svm_10_mrmr + f1_score_list_xgb_10_mrmr + f1_score_list_lgb_10_mrmr +
                            f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr + f1_score_list_svm_20_mrmr + f1_score_list_xgb_20_mrmr + f1_score_list_lgb_20_mrmr +
                            f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr + f1_score_list_svm_30_mrmr + f1_score_list_xgb_30_mrmr + f1_score_list_lgb_30_mrmr +
                            f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr + f1_score_list_svm_50_mrmr + f1_score_list_xgb_50_mrmr + f1_score_list_lgb_50_mrmr +
                            f1_score_list_lr_75_mrmr + f1_score_list_rfc_75_mrmr + f1_score_list_svm_75_mrmr + f1_score_list_xgb_75_mrmr + f1_score_list_lgb_75_mrmr +
                            f1_score_list_lr_90_mrmr + f1_score_list_rfc_90_mrmr + f1_score_list_svm_90_mrmr + f1_score_list_xgb_90_mrmr + f1_score_list_lgb_90_mrmr)

overall_auc_list_mrmr =  (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_svm_10_mrmr + auc_list_xgb_10_mrmr + auc_list_lgb_10_mrmr +
                        auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_svm_20_mrmr + auc_list_xgb_20_mrmr + auc_list_lgb_20_mrmr +
                        auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_svm_30_mrmr + auc_list_xgb_30_mrmr + auc_list_lgb_30_mrmr +
                        auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_svm_50_mrmr + auc_list_xgb_50_mrmr + auc_list_lgb_50_mrmr +
                        auc_list_lr_75_mrmr + auc_list_rfc_75_mrmr + auc_list_svm_75_mrmr + auc_list_xgb_75_mrmr + auc_list_lgb_75_mrmr +
                        auc_list_lr_90_mrmr + auc_list_rfc_90_mrmr + auc_list_svm_90_mrmr + auc_list_xgb_90_mrmr + auc_list_lgb_90_mrmr)

overall_param_list_mrmr = (param_list_lr_10_mrmr + param_list_rfc_10_mrmr + param_list_svm_10_mrmr + param_list_xgb_10_mrmr + param_list_lgb_10_mrmr +
                            param_list_lr_20_mrmr + param_list_rfc_20_mrmr + param_list_svm_20_mrmr + param_list_xgb_20_mrmr + param_list_lgb_20_mrmr +
                            param_list_lr_30_mrmr + param_list_rfc_30_mrmr + param_list_svm_30_mrmr + param_list_xgb_30_mrmr + param_list_lgb_30_mrmr +
                            param_list_lr_50_mrmr + param_list_rfc_50_mrmr + param_list_svm_50_mrmr + param_list_xgb_50_mrmr + param_list_lgb_50_mrmr +
                            param_list_lr_75_mrmr + param_list_rfc_75_mrmr + param_list_svm_75_mrmr + param_list_xgb_75_mrmr + param_list_lgb_75_mrmr +
                            param_list_lr_90_mrmr + param_list_rfc_90_mrmr + param_list_svm_90_mrmr + param_list_xgb_90_mrmr + param_list_lgb_90_mrmr)

In [52]:
overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_svm_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr + accuracy_list_lgb_10_mi_mrmr +
                            accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_svm_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr + accuracy_list_lgb_20_mi_mrmr +
                            accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_svm_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr + accuracy_list_lgb_30_mi_mrmr +
                            accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_svm_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr + accuracy_list_lgb_50_mi_mrmr +
                            accuracy_list_lr_75_mi_mrmr + accuracy_list_rfc_75_mi_mrmr + accuracy_list_svm_75_mi_mrmr + accuracy_list_xgb_75_mi_mrmr + accuracy_list_lgb_75_mi_mrmr +
                            accuracy_list_lr_90_mi_mrmr + accuracy_list_rfc_90_mi_mrmr + accuracy_list_svm_90_mi_mrmr + accuracy_list_xgb_90_mi_mrmr + accuracy_list_lgb_90_mi_mrmr)

overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_svm_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr + f1_score_list_lgb_10_mi_mrmr +
                            f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_svm_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr + f1_score_list_lgb_20_mi_mrmr +
                            f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_svm_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr + f1_score_list_lgb_30_mi_mrmr +
                            f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_svm_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr + f1_score_list_lgb_50_mi_mrmr +
                            f1_score_list_lr_75_mi_mrmr + f1_score_list_rfc_75_mi_mrmr + f1_score_list_svm_75_mi_mrmr + f1_score_list_xgb_75_mi_mrmr + f1_score_list_lgb_75_mi_mrmr +
                            f1_score_list_lr_90_mi_mrmr + f1_score_list_rfc_90_mi_mrmr + f1_score_list_svm_90_mi_mrmr + f1_score_list_xgb_90_mi_mrmr + f1_score_list_lgb_90_mi_mrmr)

overall_auc_list_mi_mrmr =  (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_svm_10_mi_mrmr + auc_list_xgb_10_mi_mrmr + auc_list_lgb_10_mi_mrmr +
                        auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_svm_20_mi_mrmr + auc_list_xgb_20_mi_mrmr + auc_list_lgb_20_mi_mrmr +
                        auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_svm_30_mi_mrmr + auc_list_xgb_30_mi_mrmr + auc_list_lgb_30_mi_mrmr +
                        auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_svm_50_mi_mrmr + auc_list_xgb_50_mi_mrmr + auc_list_lgb_50_mi_mrmr +
                        auc_list_lr_75_mi_mrmr + auc_list_rfc_75_mi_mrmr + auc_list_svm_75_mi_mrmr + auc_list_xgb_75_mi_mrmr + auc_list_lgb_75_mi_mrmr +
                        auc_list_lr_90_mi_mrmr + auc_list_rfc_90_mi_mrmr + auc_list_svm_90_mi_mrmr + auc_list_xgb_90_mi_mrmr + auc_list_lgb_90_mi_mrmr)

overall_param_list_mi_mrmr = (param_list_lr_10_mi_mrmr + param_list_rfc_10_mi_mrmr + param_list_svm_10_mi_mrmr + param_list_xgb_10_mi_mrmr + param_list_lgb_10_mi_mrmr +
                            param_list_lr_20_mi_mrmr + param_list_rfc_20_mi_mrmr + param_list_svm_20_mi_mrmr + param_list_xgb_20_mi_mrmr + param_list_lgb_20_mi_mrmr +
                            param_list_lr_30_mi_mrmr + param_list_rfc_30_mi_mrmr + param_list_svm_30_mi_mrmr + param_list_xgb_30_mi_mrmr + param_list_lgb_30_mi_mrmr +
                            param_list_lr_50_mi_mrmr + param_list_rfc_50_mi_mrmr + param_list_svm_50_mi_mrmr + param_list_xgb_50_mi_mrmr + param_list_lgb_50_mi_mrmr +
                            param_list_lr_75_mi_mrmr + param_list_rfc_75_mi_mrmr + param_list_svm_75_mi_mrmr + param_list_xgb_75_mi_mrmr + param_list_lgb_75_mi_mrmr +
                            param_list_lr_90_mi_mrmr + param_list_rfc_90_mi_mrmr + param_list_svm_90_mi_mrmr + param_list_xgb_90_mi_mrmr + param_list_lgb_90_mi_mrmr)

In [53]:
overall_accuracy_list_pca = (accuracy_list_lr_10_pca + accuracy_list_rfc_10_pca + accuracy_list_svm_10_pca + accuracy_list_xgb_10_pca + accuracy_list_lgb_10_pca +
                            accuracy_list_lr_20_pca + accuracy_list_rfc_20_pca + accuracy_list_svm_20_pca + accuracy_list_xgb_20_pca + accuracy_list_lgb_20_pca +
                            accuracy_list_lr_30_pca + accuracy_list_rfc_30_pca + accuracy_list_svm_30_pca + accuracy_list_xgb_30_pca + accuracy_list_lgb_30_pca +
                            accuracy_list_lr_50_pca + accuracy_list_rfc_50_pca + accuracy_list_svm_50_pca + accuracy_list_xgb_50_pca + accuracy_list_lgb_50_pca +
                            accuracy_list_lr_75_pca + accuracy_list_rfc_75_pca + accuracy_list_svm_75_pca + accuracy_list_xgb_75_pca + accuracy_list_lgb_75_pca +
                            accuracy_list_lr_90_pca + accuracy_list_rfc_90_pca + accuracy_list_svm_90_pca + accuracy_list_xgb_90_pca + accuracy_list_lgb_90_pca)

overall_f1_score_list_pca = (f1_score_list_lr_10_pca + f1_score_list_rfc_10_pca + f1_score_list_svm_10_pca + f1_score_list_xgb_10_pca + f1_score_list_lgb_10_pca +
                            f1_score_list_lr_20_pca + f1_score_list_rfc_20_pca + f1_score_list_svm_20_pca + f1_score_list_xgb_20_pca + f1_score_list_lgb_20_pca +
                            f1_score_list_lr_30_pca + f1_score_list_rfc_30_pca + f1_score_list_svm_30_pca + f1_score_list_xgb_30_pca + f1_score_list_lgb_30_pca +
                            f1_score_list_lr_50_pca + f1_score_list_rfc_50_pca + f1_score_list_svm_50_pca + f1_score_list_xgb_50_pca + f1_score_list_lgb_50_pca +
                            f1_score_list_lr_75_pca + f1_score_list_rfc_75_pca + f1_score_list_svm_75_pca + f1_score_list_xgb_75_pca + f1_score_list_lgb_75_pca +
                            f1_score_list_lr_90_pca + f1_score_list_rfc_90_pca + f1_score_list_svm_90_pca + f1_score_list_xgb_90_pca + f1_score_list_lgb_90_pca)

overall_auc_list_pca =  (auc_list_lr_10_pca + auc_list_rfc_10_pca + auc_list_svm_10_pca + auc_list_xgb_10_pca + auc_list_lgb_10_pca +
                        auc_list_lr_20_pca + auc_list_rfc_20_pca + auc_list_svm_20_pca + auc_list_xgb_20_pca + auc_list_lgb_20_pca +
                        auc_list_lr_30_pca + auc_list_rfc_30_pca + auc_list_svm_30_pca + auc_list_xgb_30_pca + auc_list_lgb_30_pca +
                        auc_list_lr_50_pca + auc_list_rfc_50_pca + auc_list_svm_50_pca + auc_list_xgb_50_pca + auc_list_lgb_50_pca +
                        auc_list_lr_75_pca + auc_list_rfc_75_pca + auc_list_svm_75_pca + auc_list_xgb_75_pca + auc_list_lgb_75_pca +
                        auc_list_lr_90_pca + auc_list_rfc_90_pca + auc_list_svm_90_pca + auc_list_xgb_90_pca + auc_list_lgb_90_pca)

overall_param_list_pca = (param_list_lr_10_pca + param_list_rfc_10_pca + param_list_svm_10_pca + param_list_xgb_10_pca + param_list_lgb_10_pca +
                            param_list_lr_20_pca + param_list_rfc_20_pca + param_list_svm_20_pca + param_list_xgb_20_pca + param_list_lgb_20_pca +
                            param_list_lr_30_pca + param_list_rfc_30_pca + param_list_svm_30_pca + param_list_xgb_30_pca + param_list_lgb_30_pca +
                            param_list_lr_50_pca + param_list_rfc_50_pca + param_list_svm_50_pca + param_list_xgb_50_pca + param_list_lgb_50_pca +
                            param_list_lr_75_pca + param_list_rfc_75_pca + param_list_svm_75_pca + param_list_xgb_75_pca + param_list_lgb_75_pca +
                            param_list_lr_90_pca + param_list_rfc_90_pca + param_list_svm_90_pca + param_list_xgb_90_pca + param_list_lgb_90_pca)

In [54]:
print(len(overall_accuracy_list_mi))
print(len(overall_f1_score_list_mi))
print(len(overall_auc_list_mi))
print(len(overall_param_list_mi))

print(len(overall_accuracy_list_mrmr))
print(len(overall_f1_score_list_mrmr))
print(len(overall_auc_list_mrmr))
print(len(overall_param_list_mrmr))

print(len(overall_accuracy_list_mi_mrmr))
print(len(overall_f1_score_list_mi_mrmr))
print(len(overall_auc_list_mi_mrmr))
print(len(overall_param_list_mi_mrmr))

print(len(overall_accuracy_list_pca))
print(len(overall_f1_score_list_pca))
print(len(overall_auc_list_pca))
print(len(overall_param_list_pca))

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [55]:
results_dictionary = {
    'samples': filename_sample_list_final,
    'models': models,
    'percentiles': percentiles,
    'mi_accuracy': overall_accuracy_list_mi,
    'mi_f1_score': overall_f1_score_list_mi,
    'mi_auc': overall_auc_list_mi,
    'mrmr_accuracy': overall_accuracy_list_mrmr,
    'mrmr_f1_score': overall_f1_score_list_mrmr,
    'mrmr_auc': overall_auc_list_mrmr,
    'mi_mrmr_accuracy': overall_accuracy_list_mi_mrmr,
    'mi_mrmr_f1_score': overall_f1_score_list_mi_mrmr,
    'mi_mrmr_auc': overall_auc_list_mi_mrmr,
    'pca_accuracy': overall_accuracy_list_pca,
    'pca_f1_score': overall_f1_score_list_pca,
    'pca_auc': overall_auc_list_pca,
    'mi_params': overall_param_list_mi,
    'mrmr_params': overall_param_list_mrmr,
    'mi_mrmr_params': overall_param_list_mi_mrmr,
    'pca_params': overall_param_list_pca
}
results_df = pd.DataFrame(results_dictionary)

results_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/overall_results_20_v2.csv', index=False)