In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

import xgboost as xgb
import lightgbm as lgb

# import tensorflow as tf

import os
import re
import ast

In [3]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples_v2/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features_v2/'

def get_sample_df(directory=directory_dataframes):
    filename_list = []
    list_dataframes = []
    for filename in os.listdir(directory):
        print(filename)
        filename_list.append(filename)
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes, filename_list

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features_v2/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes, filename_sample_list = get_sample_df(directory_dataframes)

subsample_7_v2.csv
subsample_9_v2.csv
subsample_5_v2.csv
subsample_1_v2.csv
subsample_10_v2.csv
subsample_3_v2.csv
subsample_6_v2.csv
subsample_8_v2.csv
subsample_4_v2.csv
subsample_2_v2.csv


In [9]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes, params=None):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    best_params_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        x = x.rename(columns = lambda a:re.sub('[^A-Za-z0-9_]+', '', a))
        
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        
        
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        best_params_list.append(clf.best_params_)

    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    
    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    print(accuracy_list)
    print(auc_list)
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    print('Best Params based on Max Accuracy', best_params_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, best_params_list  


# Mutual Information

## 10 Percentile

In [10]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi, param_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi, param_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi, param_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi, param_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_10_mi, f1_score_list_lgb_10_mi, auc_list_lgb_10_mi, param_list_lgb_10_mi = model_train_predict(lgbc, 'mi_feat_list_10', params=params)

(12052, 16) (12052,) (3014, 16) (3014,)
Average Accuracy 0.8467153284671532
Average F1 Score 0.8433048901517444
Average AUC 0.8467153284671532
Max Accuracy 0.8526874585268746
Max F1 Score 0.8480492813141685
Max AUC 0.8526874585268747
[0.8513603185136032, 0.8526874585268746, 0.8394160583941606, 0.8506967485069675, 0.8433974784339748, 0.8453881884538819, 0.8437292634372926, 0.8437292634372926, 0.8503649635036497, 0.8463835434638355]
[0.8513603185136032, 0.8526874585268747, 0.8394160583941606, 0.8506967485069674, 0.8433974784339748, 0.8453881884538819, 0.8437292634372926, 0.8437292634372927, 0.8503649635036497, 0.8463835434638354]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']
Best Features 

## 20 Percentile

In [11]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi, param_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi, param_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi, param_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi, param_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_20_mi, f1_score_list_lgb_20_mi, auc_list_lgb_20_mi, param_list_lgb_20_mi = model_train_predict(lgbc, 'mi_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(12052, 32) (12052,) (3014, 32) (3014,)
Average Accuracy 0.8669542136695421
Average F1 Score 0.8662770348215437
Average AUC 0.8669542136695421
Max Accuracy 0.8745852687458527
Max F1 Score 0.8731543624161074
Max AUC 0.8745852687458527
[0.8639681486396815, 0.8745852687458527, 0.8576642335766423, 0.869608493696085, 0.8636363636363636, 0.8702720637027206, 0.8682813536828136, 0.8636363636363636, 0.8682813536828136, 0.869608493696085]
[0.8639681486396815, 0.8745852687458527, 0.8576642335766423, 0.869608493696085, 0.8636363636363634, 0.8702720637027207, 0.8682813536828135, 0.8636363636363636, 0.8682813536828136, 0.8696084936960848]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)'

## 30 Percentile

In [12]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi, param_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi, param_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi, param_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi, param_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_30_mi, f1_score_list_lgb_30_mi, auc_list_lgb_30_mi, param_list_lgb_30_mi = model_train_predict(lgbc, 'mi_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 48) (12052,) (3014, 48) (3014,)
Average Accuracy 0.8709024552090245
Average F1 Score 0.8702894779648881
Average AUC 0.8709024552090245
Max Accuracy 0.880557398805574
Max F1 Score 0.8786244099797706
Max AUC 0.880557398805574
[0.8682813536828136, 0.8745852687458527, 0.8672859986728599, 0.880557398805574, 0.8669542136695422, 0.8719309887193099, 0.8669542136695422, 0.8686131386861314, 0.8692767086927671, 0.8745852687458527]
[0.8682813536828136, 0.8745852687458527, 0.86728599867286, 0.880557398805574, 0.8669542136695421, 0.87193098871931, 0.8669542136695421, 0.8686131386861313, 0.869276708692767, 0.8745852687458527]
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(3, 2)', '(1, 4)', '(4,

## 50 Percentile

In [13]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi, param_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi, param_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi, param_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi, param_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_50_mi, f1_score_list_lgb_50_mi, auc_list_lgb_50_mi, param_list_lgb_50_mi = model_train_predict(lgbc, 'mi_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 80) (12052,) (3014, 80) (3014,)
Average Accuracy 0.8790311877903119
Average F1 Score 0.8788009729181969
Average AUC 0.8790311877903119
Max Accuracy 0.8858659588586596
Max F1 Score 0.8859416445623342
Max AUC 0.8858659588586596
[0.8822163238221632, 0.8832116788321168, 0.8725945587259456, 0.8832116788321168, 0.8702720637027206, 0.8858659588586596, 0.8749170537491705, 0.8788984737889848, 0.8788984737889848, 0.8802256138022562]
[0.8822163238221632, 0.8832116788321168, 0.8725945587259456, 0.8832116788321167, 0.8702720637027206, 0.8858659588586596, 0.8749170537491706, 0.8788984737889848, 0.8788984737889848, 0.8802256138022562]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(6, 1)', '(2, 1)', '(2, 2)', '(2, 3)

## 75 Percentile

In [14]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_75_mi, f1_score_list_lr_75_mi, auc_list_lr_75_mi, param_list_lr_75_mi = model_train_predict(lr, 'mi_feat_list_75', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_75_mi, f1_score_list_rfc_75_mi, auc_list_rfc_75_mi, param_list_rfc_75_mi = model_train_predict(rfc, 'mi_feat_list_75', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi, f1_score_list_svm_75_mi, auc_list_svm_75_mi, param_list_svm_75_mi = model_train_predict(svc, 'mi_feat_list_75', params=params)
# print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_75_mi, f1_score_list_xgb_75_mi, auc_list_xgb_75_mi, param_list_xgb_75_mi = model_train_predict(xgbc, 'mi_feat_list_75', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_75_mi, f1_score_list_lgb_75_mi, auc_list_lgb_75_mi, param_list_lgb_75_mi = model_train_predict(lgbc, 'mi_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 120) (12052,) (3014, 120) (3014,)
Average Accuracy 0.8820172528201725
Average F1 Score 0.8821286505542488
Average AUC 0.8820172528201725
Max Accuracy 0.887193098871931
Max F1 Score 0.8873426110006626
Max AUC 0.887193098871931
[0.8838752488387525, 0.8835434638354346, 0.8759124087591241, 0.8861977438619775, 0.8765759787657598, 0.887193098871931, 0.8762441937624419, 0.882879893828799, 0.8845388188453882, 0.8832116788321168]
[0.8838752488387526, 0.8835434638354346, 0.8759124087591241, 0.8861977438619776, 0.8765759787657597, 0.887193098871931, 0.8762441937624419, 0.882879893828799, 0.8845388188453881, 0.8832116788321167]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2

## 90 Percentile

In [15]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_90_mi, f1_score_list_lr_90_mi, auc_list_lr_90_mi, param_list_lr_90_mi = model_train_predict(lr, 'mi_feat_list_90', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_90_mi, f1_score_list_rfc_90_mi, auc_list_rfc_90_mi, param_list_rfc_90_mi = model_train_predict(rfc, 'mi_feat_list_90', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi, f1_score_list_svm_90_mi, auc_list_svm_90_mi, param_list_svm_90_mi = model_train_predict(svc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_90_mi, f1_score_list_xgb_90_mi, auc_list_xgb_90_mi, param_list_xgb_90_mi = model_train_predict(xgbc, 'mi_feat_list_90', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_90_mi, f1_score_list_lgb_90_mi, auc_list_lgb_90_mi, param_list_lgb_90_mi = model_train_predict(lgbc, 'mi_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 144) (12052,) (3014, 144) (3014,)
Average Accuracy 0.8826808228268082
Average F1 Score 0.882860251382972
Average AUC 0.8826808228268082
Max Accuracy 0.887193098871931
Max F1 Score 0.8874172185430464
Max AUC 0.887193098871931
[0.8825481088254811, 0.8865295288652952, 0.8762441937624419, 0.8845388188453882, 0.8772395487723955, 0.887193098871931, 0.8798938287989383, 0.8838752488387525, 0.8855341738553417, 0.8832116788321168]
[0.8825481088254812, 0.8865295288652953, 0.8762441937624419, 0.8845388188453881, 0.8772395487723955, 0.887193098871931, 0.8798938287989382, 0.8838752488387525, 0.8855341738553417, 0.8832116788321167]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(

# mRMR

## 10 Percentile

In [16]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr, param_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr, param_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr, param_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr, param_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_10_mrmr, f1_score_list_lgb_10_mrmr, auc_list_lgb_10_mrmr, param_list_lgb_10_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_10', params=params)

(12052, 14) (12052,) (3014, 14) (3014,)
Average Accuracy 0.841506303915063
Average F1 Score 0.8392111434268992
Average AUC 0.841506303915063
Max Accuracy 0.8520238885202389
Max F1 Score 0.8481960517358746
Max AUC 0.8520238885202389
[0.8400796284007963, 0.8520238885202389, 0.8311214333112144, 0.8467153284671532, 0.8397478433974784, 0.8470471134704711, 0.8354346383543464, 0.8374253483742535, 0.8433974784339748, 0.8420703384207033]
[0.8400796284007963, 0.8520238885202389, 0.8311214333112144, 0.8467153284671534, 0.8397478433974783, 0.8470471134704711, 0.8354346383543464, 0.8374253483742536, 0.8433974784339747, 0.8420703384207034]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)', 'trigram_entropy', '(2,)', 'unigram_entropy']
Best Feature

## 20 Percentile

In [17]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr, param_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr, param_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr, param_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr, param_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_20_mrmr, f1_score_list_lgb_20_mrmr, auc_list_lgb_20_mrmr, param_list_lgb_20_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 28) (12052,) (3014, 28) (3014,)
Average Accuracy 0.8640345056403451
Average F1 Score 0.8633897517949704
Average AUC 0.8640345056403451
Max Accuracy 0.8755806237558063
Max F1 Score 0.8741188318227593
Max AUC 0.8755806237558063
[0.858327803583278, 0.8755806237558063, 0.8520238885202389, 0.8712674187126742, 0.8586595885865959, 0.8689449236894492, 0.8613138686131386, 0.8642999336429993, 0.8609820836098209, 0.8689449236894492]
[0.858327803583278, 0.8755806237558063, 0.8520238885202389, 0.8712674187126742, 0.858659588586596, 0.8689449236894493, 0.8613138686131385, 0.8642999336429994, 0.8609820836098209, 0.8689449236894492]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)', 'trigram_entropy', '(2,)', 'unigram_entropy', '(1, 1, 2)', 

## 30 Percentile

In [18]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr, param_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr, param_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr, param_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr, param_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_30_mrmr, f1_score_list_lgb_30_mrmr, auc_list_lgb_30_mrmr, param_list_lgb_30_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 42) (12052,) (3014, 42) (3014,)
Average Accuracy 0.8724286662242866
Average F1 Score 0.8722390990983471
Average AUC 0.8724286662242866
Max Accuracy 0.8792302587923025
Max F1 Score 0.8779342723004695
Max AUC 0.8792302587923025
[0.8749170537491705, 0.8792302587923025, 0.8666224286662243, 0.8779031187790312, 0.8689449236894492, 0.8735899137358991, 0.8656270736562708, 0.8692767086927671, 0.8729263437292635, 0.8752488387524884]
[0.8749170537491705, 0.8792302587923025, 0.8666224286662241, 0.8779031187790312, 0.8689449236894492, 0.8735899137358993, 0.8656270736562707, 0.8692767086927671, 0.8729263437292634, 0.8752488387524884]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)', 'trigram_entropy', '(2,)', 'unigram_entropy', '(1, 1, 2)

## 50 Percentile

In [19]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr, param_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr, param_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr, param_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr, param_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_50_mrmr, f1_score_list_lgb_50_mrmr, auc_list_lgb_50_mrmr, param_list_lgb_50_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 69) (12052,) (3014, 69) (3014,)
Average Accuracy 0.8789648307896483
Average F1 Score 0.8788351085247154
Average AUC 0.8789648307896483
Max Accuracy 0.8842070338420703
Max F1 Score 0.8830995323981295
Max AUC 0.8842070338420703
[0.8798938287989383, 0.8838752488387525, 0.8725945587259456, 0.8842070338420703, 0.8735899137358991, 0.8795620437956204, 0.8749170537491705, 0.8779031187790312, 0.8798938287989383, 0.8832116788321168]
[0.8798938287989383, 0.8838752488387525, 0.8725945587259456, 0.8842070338420703, 0.8735899137358991, 0.8795620437956204, 0.8749170537491705, 0.8779031187790312, 0.8798938287989382, 0.8832116788321167]
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(1, 2)', 'trigram_entropy', '(2, 3)', '(2,)', '(3,)', '(4, 1, 1)', '(2, 3, 

## 75 Percentile

In [20]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_75_mrmr, f1_score_list_lr_75_mrmr, auc_list_lr_75_mrmr, param_list_lr_75_mrmr = model_train_predict(lr, 'mrmr_feat_list_75*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_75_mrmr, f1_score_list_rfc_75_mrmr, auc_list_rfc_75_mrmr, param_list_rfc_75_mrmr = model_train_predict(rfc, 'mrmr_feat_list_75', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mrmr, f1_score_list_svm_75_mrmr, auc_list_svm_75_mrmr, param_list_svm_75_mrmr = model_train_predict(svc, 'mrmr_feat_list_75', params=params)
# print("\n=============================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_75_mrmr, f1_score_list_xgb_75_mrmr, auc_list_xgb_75_mrmr, param_list_xgb_75_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_75', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_75_mrmr, f1_score_list_lgb_75_mrmr, auc_list_lgb_75_mrmr, param_list_lgb_75_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 104) (12052,) (3014, 104) (3014,)
Average Accuracy 0.8806569343065693
Average F1 Score 0.8804285333436386
Average AUC 0.8806569343065694
Max Accuracy 0.8881884538818845
Max F1 Score 0.8872532619605219
Max AUC 0.8881884538818846
[0.8825481088254811, 0.8881884538818845, 0.8788984737889848, 0.8838752488387525, 0.8735899137358991, 0.8818845388188454, 0.873921698739217, 0.8795620437956204, 0.8812209688122097, 0.882879893828799]
[0.882548108825481, 0.8881884538818846, 0.8788984737889847, 0.8838752488387525, 0.8735899137358992, 0.8818845388188454, 0.8739216987392171, 0.8795620437956204, 0.8812209688122097, 0.8828798938287991]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)', 'trigram_entropy', '(2,)', 'unigram_entropy', '(1, 1, 2)'

## 90 Percentile

In [21]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_90_mrmr, f1_score_list_lr_90_mrmr, auc_list_lr_90_mrmr, param_list_lr_90_mrmr = model_train_predict(lr, 'mrmr_feat_list_90*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_90_mrmr, f1_score_list_rfc_90_mrmr, auc_list_rfc_90_mrmr, param_list_rfc_90_mrmr = model_train_predict(rfc, 'mrmr_feat_list_90', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mrmr, f1_score_list_svm_90_mrmr, auc_list_svm_90_mrmr, param_list_svm_90_mrmr = model_train_predict(svc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_90_mrmr, f1_score_list_xgb_90_mrmr, auc_list_xgb_90_mrmr, param_list_xgb_90_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_90', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_90_mrmr, f1_score_list_lgb_90_mrmr, auc_list_lgb_90_mrmr, param_list_lgb_90_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 125) (12052,) (3014, 125) (3014,)
Average Accuracy 0.8815195753151958
Average F1 Score 0.8814839920331391
Average AUC 0.8815195753151958
Max Accuracy 0.8868613138686131
Max F1 Score 0.8862195528862195
Max AUC 0.8868613138686131
[0.8818845388188454, 0.8868613138686131, 0.8788984737889848, 0.8825481088254811, 0.8765759787657598, 0.8852023888520238, 0.8775713337757134, 0.8795620437956204, 0.8838752488387525, 0.8822163238221632]
[0.8818845388188453, 0.8868613138686131, 0.8788984737889848, 0.882548108825481, 0.8765759787657597, 0.8852023888520238, 0.8775713337757134, 0.8795620437956203, 0.8838752488387526, 0.8822163238221632]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3)', '(1, 2)', 'trigram_entropy', '(2,)', 'unigram_entropy', '(1, 1, 2

# MI and mRMR

## 10 Percentile

In [22]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr, param_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr, param_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr, param_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr, param_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_10_mi_mrmr, f1_score_list_lgb_10_mi_mrmr, auc_list_lgb_10_mi_mrmr, param_list_lgb_10_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_10', params=params)

(12052, 11) (12052,) (3014, 11) (3014,)
Average Accuracy 0.8352687458526875
Average F1 Score 0.8318000660055629
Average AUC 0.8352687458526875
Max Accuracy 0.8424021234240212
Max F1 Score 0.8378286104472517
Max AUC 0.8424021234240212
[0.8370935633709357, 0.8420703384207033, 0.8238221632382217, 0.8387524883875249, 0.8258128732581287, 0.836098208360982, 0.8347710683477106, 0.8324485733244857, 0.8424021234240212, 0.8394160583941606]
[0.8370935633709357, 0.8420703384207034, 0.8238221632382217, 0.8387524883875249, 0.8258128732581287, 0.836098208360982, 0.8347710683477108, 0.8324485733244856, 0.8424021234240212, 0.8394160583941604]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(1, 2)', '(3,)', '(1, 2, 1)', '(2, 3, 1)', '(2, 1)', '(3, 1, 1)', 'bigram_entropy', '(3, 1)', 'trigram_entropy', '(2,)', '(2, 3)']
Best Features based on Max F1 Score ['(1, 2)', '(3,)', '(1, 2, 1)', '(2, 3, 

## 20 Percentile

In [23]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr, param_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr, param_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr, param_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr, param_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_20_mi_mrmr, f1_score_list_lgb_20_mi_mrmr, auc_list_lgb_20_mi_mrmr, param_list_lgb_20_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_20', params=params)

(12052, 22) (12052,) (3014, 22) (3014,)
Average Accuracy 0.8593231585932315
Average F1 Score 0.8582986550531333
Average AUC 0.8593231585932315
Max Accuracy 0.8732581287325812
Max F1 Score 0.8715534633490248
Max AUC 0.8732581287325812
[0.8593231585932316, 0.8732581287325812, 0.8463835434638355, 0.8642999336429993, 0.8593231585932316, 0.8603185136031851, 0.8523556735235568, 0.8536828135368282, 0.8629727936297279, 0.8613138686131386]
[0.8593231585932315, 0.8732581287325812, 0.8463835434638355, 0.8642999336429993, 0.8593231585932315, 0.860318513603185, 0.8523556735235567, 0.8536828135368282, 0.8629727936297279, 0.8613138686131387]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(4,)', '(1, 4)', '(1, 2, 1)', '(4, 1)', 'pattern_hvg_5_node_entropy', '(2, 1)', 'bigram_entropy', '(2, 1, 2)', 'trigram_entropy', '(1, 1, 2)', '(2,)', '(3,)', 'pattern_hvg_4_nodes_entropy', '(2, 3, 1)', '(2

## 30 Percentile

In [24]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr, param_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr, param_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr, param_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr, param_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_30_mi_mrmr, f1_score_list_lgb_30_mi_mrmr, auc_list_lgb_30_mi_mrmr, param_list_lgb_30_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 31) (12052,) (3014, 31) (3014,)
Average Accuracy 0.8696084936960851
Average F1 Score 0.8692749324695587
Average AUC 0.8696084936960851
Max Accuracy 0.8765759787657598
Max F1 Score 0.8749999999999999
Max AUC 0.8765759787657599
[0.8692767086927671, 0.8765759787657598, 0.8666224286662243, 0.8755806237558063, 0.8629727936297279, 0.873921698739217, 0.8616456536164565, 0.8639681486396815, 0.8715992037159921, 0.873921698739217]
[0.8692767086927671, 0.8765759787657599, 0.8666224286662244, 0.8755806237558061, 0.8629727936297281, 0.873921698739217, 0.8616456536164565, 0.8639681486396815, 0.8715992037159921, 0.8739216987392169]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(4,)', '(1, 4)', '(1, 2, 1)', '(4, 1, 4)', 'pattern_hvg_5_node_entropy', '(2, 1)', '(4, 1)', '(1, 1, 4)', 'bigram_entropy', '(2, 1, 2)', 'trigram_entropy', '(1, 1, 2)', '(2,)', '(3, 1, 2)', '(3,)', '(3, 1, 4)

## 50 Percentile

In [25]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr, param_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50*', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr, param_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr, param_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr, param_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_50_mi_mrmr, f1_score_list_lgb_50_mi_mrmr, auc_list_lgb_50_mi_mrmr, param_list_lgb_50_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 56) (12052,) (3014, 56) (3014,)
Average Accuracy 0.8778699402786995
Average F1 Score 0.8776370522974231
Average AUC 0.8778699402786995
Max Accuracy 0.8832116788321168
Max F1 Score 0.8822742474916387
Max AUC 0.8832116788321167
[0.8788984737889848, 0.8832116788321168, 0.8715992037159921, 0.882879893828799, 0.8709356337093563, 0.8808891838088918, 0.8742534837425349, 0.8765759787657598, 0.8795620437956204, 0.8798938287989383]
[0.8788984737889848, 0.8832116788321167, 0.871599203715992, 0.882879893828799, 0.8709356337093562, 0.8808891838088919, 0.8742534837425349, 0.8765759787657599, 0.8795620437956204, 0.8798938287989383]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(4,)', '(6, 3)', '(2, 2, 1)', '(1, 1, 3)', '(3, 2, 1)', '(1, 4)', '(1, 2, 1)', '(3, 3, 3)', '(1, 4, 4)', '(1, 3, 3)', '(4, 1, 4)', 'pattern_hvg_5_node_entropy', '(2, 1)', '(4, 1)', '(1, 1, 4)', 'bigram_entrop

## 75 Percentile

In [26]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_75_mi_mrmr, f1_score_list_lr_75_mi_mrmr, auc_list_lr_75_mi_mrmr, param_list_lr_75_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_75', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_75_mi_mrmr, f1_score_list_rfc_75_mi_mrmr, auc_list_rfc_75_mi_mrmr, param_list_rfc_75_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_75', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_75_mi_mrmr, f1_score_list_svm_75_mi_mrmr, auc_list_svm_75_mi_mrmr, param_list_svm_75_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_75_mi_mrmr, f1_score_list_xgb_75_mi_mrmr, auc_list_xgb_75_mi_mrmr, param_list_xgb_75_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_75', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_75_mi_mrmr, f1_score_list_lgb_75_mi_mrmr, auc_list_lgb_75_mi_mrmr, param_list_lgb_75_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 93) (12052,) (3014, 93) (3014,)
Average Accuracy 0.8792966157929663
Average F1 Score 0.8790991954008074
Average AUC 0.879296615792966
Max Accuracy 0.8838752488387525
Max F1 Score 0.8840291583830351
Max AUC 0.8838752488387526
[0.8838752488387525, 0.8832116788321168, 0.8745852687458527, 0.8835434638354346, 0.8715992037159921, 0.8815527538155276, 0.873921698739217, 0.8798938287989383, 0.8785666887856669, 0.8822163238221632]
[0.8838752488387526, 0.8832116788321167, 0.8745852687458526, 0.8835434638354347, 0.871599203715992, 0.8815527538155274, 0.8739216987392171, 0.8798938287989383, 0.8785666887856669, 0.8822163238221633]
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Features based on Max Accuracy ['(2, 2, 1)', '(3, 2, 1)', '(3, 3, 3)', '(1, 4, 4)', '(2, 6, 3)', '(1, 1, 4)', '(3, 3, 2)', '(1, 1, 2)', '(1, 1)', 'pattern_hvg_4_nodes_entropy', 'I5', '(6,)', '(3, 3, 1)', '(6, 1, 3)', '(2, 1, 1)', '(1, 1, 1)'

## 90 Percentile

In [27]:
# lr = LogisticRegression()
# params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
# accuracy_list_lr_90_mi_mrmr, f1_score_list_lr_90_mi_mrmr, auc_list_lr_90_mi_mrmr, param_list_lr_90_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_90', params=params)
# print("\n================================================================\n")
# rfc = RandomForestClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_rfc_90_mi_mrmr, f1_score_list_rfc_90_mi_mrmr, auc_list_rfc_90_mi_mrmr, param_list_rfc_90_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_90', params=params)
# print("\n================================================================\n")
svc = SVC()
params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
accuracy_list_svm_90_mi_mrmr, f1_score_list_svm_90_mi_mrmr, auc_list_svm_90_mi_mrmr, param_list_svm_90_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
# xgbc = xgb.XGBClassifier()
# params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_xgb_90_mi_mrmr, f1_score_list_xgb_90_mi_mrmr, auc_list_xgb_90_mi_mrmr, param_list_xgb_90_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_90', params=params)
# print("\n================================================================\n")
# lgbc = lgb.LGBMClassifier()
# params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
# accuracy_list_lgb_90_mi_mrmr, f1_score_list_lgb_90_mi_mrmr, auc_list_lgb_90_mi_mrmr, param_list_lgb_90_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(12052, 116) (12052,) (3014, 116) (3014,)
Average Accuracy 0.8817518248175183
Average F1 Score 0.88173582847519
Average AUC 0.8817518248175183
Max Accuracy 0.8865295288652952
Max F1 Score 0.8859239492995329
Max AUC 0.8865295288652953
[0.8818845388188454, 0.8865295288652952, 0.8788984737889848, 0.882879893828799, 0.8762441937624419, 0.8858659588586596, 0.8788984737889848, 0.8802256138022562, 0.8832116788321168, 0.882879893828799]
[0.8818845388188453, 0.8865295288652953, 0.8788984737889848, 0.882879893828799, 0.8762441937624419, 0.8858659588586596, 0.8788984737889847, 0.8802256138022562, 0.8832116788321168, 0.882879893828799]
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(2, 2, 1)', '(3, 2, 1)', '(3, 3, 3)', '(1, 4, 4)', '(2, 6, 3)', '(1, 1, 4)', '(3, 3, 2)', '(1, 1, 2)', '(4, 2, 1)', '(1, 6, 1)', '(1, 1)', 'pattern_hvg_4_nodes_entropy', '(6, 1)', '(6,)', '(3, 3, 1)', '(6, 1, 

# PCA

In [4]:
def model_train_predict_pca(model, k, dataframes=list_sample_dataframes, params=None):
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    param_list = []
    
    for sample in dataframes:
        x = sample.drop(['Unnamed: 0', 'conversion_class'], axis=1)
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        
        pca = PCA(n_components=k)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)
        
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        param_list.append(clf.best_params_)

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    print('Best Parameters', param_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, param_list  

## 10 Percentile

In [5]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_pca, f1_score_list_lr_10_pca, auc_list_lr_10_pca, param_list_lr_10_pca = model_train_predict_pca(lr, 14, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_pca, f1_score_list_rfc_10_pca, auc_list_rfc_10_pca, param_list_rfc_10_pca = model_train_predict_pca(rfc, 14, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_10_pca, f1_score_list_svm_10_pca, auc_list_svm_10_pca, param_list_svm_10_pca = model_train_predict_pca(svc, 14, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_pca, f1_score_list_xgb_10_pca, auc_list_xgb_10_pca, param_list_xgb_10_pca = model_train_predict_pca(xgbc, 14, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_pca, f1_score_list_lgb_10_pca, auc_list_lgb_10_pca, param_list_lgb_10_pca = model_train_predict_pca(lgbc, 14, params=params)

Average Accuracy 0.8362309223623093
Average F1 Score 0.8357233091656106
Average AUC 0.8362309223623091
Max Accuracy 0.8467153284671532
Max F1 Score 0.8443396226415094
Max AUC 0.8467153284671534
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Parameters {'C': 100, 'max_iter': 1000}


Average Accuracy 0.8514930325149302
Average F1 Score 0.8548496882158823
Average AUC 0.8514930325149302
Max Accuracy 0.8589913735899137
Max F1 Score 0.8617886178861789
Max AUC 0.8589913735899137
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 1
Best Parameters {'max_depth': 10, 'n_estimators': 50}


Average Accuracy 0.8581950895819508
Average F1 Score 0.8600882238253748
Average AUC 0.858195089581951
Max Accuracy 0.8666224286662243
Max F1 Score 0.8673267326732672
Max AUC 0.8666224286662244
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F

## 20 Percentile

In [6]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_pca, f1_score_list_lr_20_pca, auc_list_lr_20_pca, param_list_lr_20_pca = model_train_predict_pca(lr, 28, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_pca, f1_score_list_rfc_20_pca, auc_list_rfc_20_pca, param_list_rfc_20_pca = model_train_predict_pca(rfc, 28, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_20_pca, f1_score_list_svm_20_pca, auc_list_svm_20_pca, param_list_svm_20_pca = model_train_predict_pca(svc, 28, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_pca, f1_score_list_xgb_20_pca, auc_list_xgb_20_pca, param_list_xgb_20_pca = model_train_predict_pca(xgbc, 28, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_pca, f1_score_list_lgb_20_pca, auc_list_lgb_20_pca, param_list_lgb_20_pca = model_train_predict_pca(lgbc, 28, params=params)

Average Accuracy 0.8482415394824153
Average F1 Score 0.8469689986732597
Average AUC 0.8482415394824153
Max Accuracy 0.856337093563371
Max F1 Score 0.8541596497137082
Max AUC 0.856337093563371
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Parameters {'C': 100, 'max_iter': 1000}


Average Accuracy 0.8570670205706703
Average F1 Score 0.8601125831174363
Average AUC 0.8570670205706703
Max Accuracy 0.8639681486396815
Max F1 Score 0.8665364583333333
Max AUC 0.8639681486396815
Best Sample Index based on Max Accuracy 0
Best Sample Index based on Max F1 Score 0
Best Sample Index based on Max AUC 0
Best Parameters {'max_depth': 10, 'n_estimators': 100}




KeyboardInterrupt: 

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_pca, f1_score_list_lr_30_pca, auc_list_lr_30_pca, param_list_lr_30_pca = model_train_predict_pca(lr, 42, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_pca, f1_score_list_rfc_30_pca, auc_list_rfc_30_pca, param_list_rfc_30_pca = model_train_predict_pca(rfc, 42, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_30_pca, f1_score_list_svm_30_pca, auc_list_svm_30_pca, param_list_svm_30_pca = model_train_predict_pca(svc, 42, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_pca, f1_score_list_xgb_30_pca, auc_list_xgb_30_pca, param_list_xgb_30_pca = model_train_predict_pca(xgbc, 42, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_pca, f1_score_list_lgb_30_pca, auc_list_lgb_30_pca, param_list_lgb_30_pca = model_train_predict_pca(lgbc, 42, params=params)

Average Accuracy 0.8688122096881221
Average F1 Score 0.8681383437608986
Average AUC 0.8688122096881221
Max Accuracy 0.8775713337757134
Max F1 Score 0.8759663865546218
Max AUC 0.8775713337757134
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Parameters {'C': 10, 'max_iter': 1000}


Average Accuracy 0.8782680822826808
Average F1 Score 0.8808136891992243
Average AUC 0.8782680822826808
Max Accuracy 0.8835434638354346
Max F1 Score 0.8852566198103956
Max AUC 0.8835434638354345
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.8878566688785667
Average F1 Score 0.8897711968114386
Average AUC 0.8878566688785667
Max Accuracy 0.9007962840079629
Max F1 Score 0.9023195034302516
Max AUC 0.9007962840079627
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max 

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_pca, f1_score_list_lr_50_pca, auc_list_lr_50_pca, param_list_lr_50_pca = model_train_predict_pca(lr, 69, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_pca, f1_score_list_rfc_50_pca, auc_list_rfc_50_pca, param_list_rfc_50_pca = model_train_predict_pca(rfc, 69, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_50_pca, f1_score_list_svm_50_pca, auc_list_svm_50_pca, param_list_svm_50_pca = model_train_predict_pca(svc, 69, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_pca, f1_score_list_xgb_50_pca, auc_list_xgb_50_pca, param_list_xgb_50_pca = model_train_predict_pca(xgbc, 69, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_pca, f1_score_list_lgb_50_pca, auc_list_lgb_50_pca, param_list_lgb_50_pca = model_train_predict_pca(lgbc, 69, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.879296615792966
Average F1 Score 0.8793960970941814
Average AUC 0.8792966157929663
Max Accuracy 0.8838752488387525
Max F1 Score 0.8827077747989276
Max AUC 0.8838752488387526
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Parameters {'C': 100, 'max_iter': 1000}


Average Accuracy 0.8822163238221632
Average F1 Score 0.8845763673673062
Average AUC 0.8822163238221632
Max Accuracy 0.8881884538818845
Max F1 Score 0.8897612037945699
Max AUC 0.8881884538818845
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.89907100199071
Average F1 Score 0.9008086365181598
Average AUC 0.89907100199071
Max Accuracy 0.9054412740544128
Max F1 Score 0.9071963529794854
Max AUC 0.9054412740544128
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 S

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_pca, f1_score_list_lr_75_pca, auc_list_lr_75_pca, param_list_lr_75_pca = model_train_predict_pca(lr, 104, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_pca, f1_score_list_rfc_75_pca, auc_list_rfc_75_pca, param_list_rfc_75_pca = model_train_predict_pca(rfc, 104, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_75_pca, f1_score_list_svm_75_pca, auc_list_svm_75_pca, param_list_svm_75_pca = model_train_predict_pca(svc, 104, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_pca, f1_score_list_xgb_75_pca, auc_list_xgb_75_pca, param_list_xgb_75_pca = model_train_predict_pca(xgbc, 104, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_pca, f1_score_list_lgb_75_pca, auc_list_lgb_75_pca, param_list_lgb_75_pca = model_train_predict_pca(lgbc, 104, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.8823158593231586
Average F1 Score 0.8824706683846953
Average AUC 0.8823158593231586
Max Accuracy 0.8875248838752489
Max F1 Score 0.8877111626366346
Max AUC 0.8875248838752489
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Parameters {'C': 1000, 'max_iter': 2000}


Average Accuracy 0.8861645653616457
Average F1 Score 0.8879786506339874
Average AUC 0.8861645653616457
Max Accuracy 0.8918380889183809
Max F1 Score 0.8936031331592691
Max AUC 0.8918380889183809
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.9012607830126077
Average F1 Score 0.9030922591978399
Average AUC 0.9012607830126077
Max Accuracy 0.9064366290643663
Max F1 Score 0.9080834419817471
Max AUC 0.9064366290643663
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Ma

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_pca, f1_score_list_lr_90_pca, auc_list_lr_90_pca, param_list_lr_90_pca = model_train_predict_pca(lr, 125, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_pca, f1_score_list_rfc_90_pca, auc_list_rfc_90_pca, param_list_rfc_90_pca = model_train_predict_pca(rfc, 125, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_90_pca, f1_score_list_svm_90_pca, auc_list_svm_90_pca, param_list_svm_90_pca = model_train_predict_pca(svc, 125, params=params)
# print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_pca, f1_score_list_xgb_90_pca, auc_list_xgb_90_pca, param_list_xgb_90_pca = model_train_predict_pca(xgbc, 125, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_pca, f1_score_list_lgb_90_pca, auc_list_lgb_90_pca, param_list_lgb_90_pca = model_train_predict_pca(lgbc, 125, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.8831453218314532
Average F1 Score 0.8833241457965452
Average AUC 0.8831453218314532
Max Accuracy 0.8878566688785667
Max F1 Score 0.8877111626366346
Max AUC 0.8878566688785667
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 1
Best Parameters {'C': 1000, 'max_iter': 1000}


Average Accuracy 0.8867285998672859
Average F1 Score 0.8884333071018791
Average AUC 0.8867285998672859
Max Accuracy 0.8931652289316523
Max F1 Score 0.8940092165898618
Max AUC 0.8931652289316523
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Parameters {'max_depth': 10, 'n_estimators': 100}


Average Accuracy 0.9005640345056405
Average F1 Score 0.902110545663912
Average AUC 0.9005640345056405
Max Accuracy 0.9051094890510949
Max F1 Score 0.9067188519243314
Max AUC 0.9051094890510949
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max

# Saving results

In [None]:
models = ['lr', 'rfc', 'xgbc', 'lgbm']
models = [value for value in models for _ in range(10)] * 6

percentiles = ['10', '20', '30', '50', '75', '90']
percentiles = [value for value in percentiles for _ in range(40)]

filename_sample_list_final = filename_sample_list * 24

print(len(models))
print(len(percentiles))
print(len(filename_sample_list_final))

240
240
240


In [None]:
# overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi + accuracy_list_xgb_10_mi + accuracy_list_lgb_10_mi +
#                             accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi + accuracy_list_xgb_20_mi + accuracy_list_lgb_20_mi +
#                             accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi + accuracy_list_xgb_30_mi + accuracy_list_lgb_30_mi +
#                             accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi + accuracy_list_xgb_50_mi + accuracy_list_lgb_50_mi +
#                             accuracy_list_lr_75_mi + accuracy_list_rfc_75_mi + accuracy_list_xgb_75_mi + accuracy_list_lgb_75_mi +
#                             accuracy_list_lr_90_mi + accuracy_list_rfc_90_mi + accuracy_list_xgb_90_mi + accuracy_list_lgb_90_mi)

# overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_xgb_10_mi + f1_score_list_lgb_10_mi +
#                             f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_xgb_20_mi + f1_score_list_lgb_20_mi +
#                             f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_xgb_30_mi + f1_score_list_lgb_30_mi +
#                             f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_xgb_50_mi + f1_score_list_lgb_50_mi +
#                             f1_score_list_lr_75_mi + f1_score_list_rfc_75_mi + f1_score_list_xgb_75_mi + f1_score_list_lgb_75_mi +
#                             f1_score_list_lr_90_mi + f1_score_list_rfc_90_mi + f1_score_list_xgb_90_mi + f1_score_list_lgb_90_mi)

# overall_auc_list_mi =  (auc_list_lr_10_mi + auc_list_rfc_10_mi  + auc_list_xgb_10_mi + auc_list_lgb_10_mi +
#                         auc_list_lr_20_mi + auc_list_rfc_20_mi  + auc_list_xgb_20_mi + auc_list_lgb_20_mi +
#                         auc_list_lr_30_mi + auc_list_rfc_30_mi  + auc_list_xgb_30_mi + auc_list_lgb_30_mi +
#                         auc_list_lr_50_mi + auc_list_rfc_50_mi  + auc_list_xgb_50_mi + auc_list_lgb_50_mi +
#                         auc_list_lr_75_mi + auc_list_rfc_75_mi  + auc_list_xgb_75_mi + auc_list_lgb_75_mi +
#                         auc_list_lr_90_mi + auc_list_rfc_90_mi  + auc_list_xgb_90_mi + auc_list_lgb_90_mi)

overall_accuracy_list_mi = (accuracy_list_svm_10_mi + accuracy_list_svm_20_mi + accuracy_list_svm_30_mi + accuracy_list_svm_50_mi + accuracy_list_svm_75_mi + accuracy_list_svm_90_mi)

# overall_param_list_mi = (param_list_lr_10_mi + param_list_rfc_10_mi + param_list_svm_10_mi + param_list_xgb_10_mi + param_list_lgb_10_mi +
#                             param_list_lr_20_mi + param_list_rfc_20_mi + param_list_svm_20_mi + param_list_xgb_20_mi + param_list_lgb_20_mi +
#                             param_list_lr_30_mi + param_list_rfc_30_mi + param_list_svm_30_mi + param_list_xgb_30_mi + param_list_lgb_30_mi +
#                             param_list_lr_50_mi + param_list_rfc_50_mi + param_list_svm_50_mi + param_list_xgb_50_mi + param_list_lgb_50_mi +
#                             param_list_lr_75_mi + param_list_rfc_75_mi + param_list_svm_75_mi + param_list_xgb_75_mi + param_list_lgb_75_mi +
#                             param_list_lr_90_mi + param_list_rfc_90_mi + param_list_svm_90_mi + param_list_xgb_90_mi + param_list_lgb_90_mi)

In [37]:
# overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_xgb_10_mrmr + accuracy_list_lgb_10_mrmr +
#                             accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_xgb_20_mrmr + accuracy_list_lgb_20_mrmr +
#                             accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_xgb_30_mrmr + accuracy_list_lgb_30_mrmr +
#                             accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_xgb_50_mrmr + accuracy_list_lgb_50_mrmr +
#                             accuracy_list_lr_75_mrmr + accuracy_list_rfc_75_mrmr + accuracy_list_xgb_75_mrmr + accuracy_list_lgb_75_mrmr +
#                             accuracy_list_lr_90_mrmr + accuracy_list_rfc_90_mrmr + accuracy_list_xgb_90_mrmr + accuracy_list_lgb_90_mrmr)

# overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr + f1_score_list_xgb_10_mrmr + f1_score_list_lgb_10_mrmr +
#                             f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr + f1_score_list_xgb_20_mrmr + f1_score_list_lgb_20_mrmr +
#                             f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr + f1_score_list_xgb_30_mrmr + f1_score_list_lgb_30_mrmr +
#                             f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr + f1_score_list_xgb_50_mrmr + f1_score_list_lgb_50_mrmr +
#                             f1_score_list_lr_75_mrmr + f1_score_list_rfc_75_mrmr + f1_score_list_xgb_75_mrmr + f1_score_list_lgb_75_mrmr +
#                             f1_score_list_lr_90_mrmr + f1_score_list_rfc_90_mrmr + f1_score_list_xgb_90_mrmr + f1_score_list_lgb_90_mrmr)

# overall_auc_list_mrmr =  (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_xgb_10_mrmr + auc_list_lgb_10_mrmr +
#                         auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_xgb_20_mrmr + auc_list_lgb_20_mrmr +
#                         auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_xgb_30_mrmr + auc_list_lgb_30_mrmr +
#                         auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_xgb_50_mrmr + auc_list_lgb_50_mrmr +
#                         auc_list_lr_75_mrmr + auc_list_rfc_75_mrmr + auc_list_xgb_75_mrmr + auc_list_lgb_75_mrmr +
#                         auc_list_lr_90_mrmr + auc_list_rfc_90_mrmr + auc_list_xgb_90_mrmr + auc_list_lgb_90_mrmr)

overall_accuracy_list_mrmr = (accuracy_list_svm_10_mrmr + accuracy_list_svm_20_mrmr + accuracy_list_svm_30_mrmr + accuracy_list_svm_50_mrmr + accuracy_list_svm_75_mrmr + accuracy_list_svm_90_mrmr)


# overall_param_list_mrmr = (param_list_lr_10_mrmr + param_list_rfc_10_mrmr + param_list_svm_10_mrmr + param_list_xgb_10_mrmr + param_list_lgb_10_mrmr +
#                             param_list_lr_20_mrmr + param_list_rfc_20_mrmr + param_list_svm_20_mrmr + param_list_xgb_20_mrmr + param_list_lgb_20_mrmr +
#                             param_list_lr_30_mrmr + param_list_rfc_30_mrmr + param_list_svm_30_mrmr + param_list_xgb_30_mrmr + param_list_lgb_30_mrmr +
#                             param_list_lr_50_mrmr + param_list_rfc_50_mrmr + param_list_svm_50_mrmr + param_list_xgb_50_mrmr + param_list_lgb_50_mrmr +
#                             param_list_lr_75_mrmr + param_list_rfc_75_mrmr + param_list_svm_75_mrmr + param_list_xgb_75_mrmr + param_list_lgb_75_mrmr +
#                             param_list_lr_90_mrmr + param_list_rfc_90_mrmr + param_list_svm_90_mrmr + param_list_xgb_90_mrmr + param_list_lgb_90_mrmr)

In [38]:
# overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr + accuracy_list_lgb_10_mi_mrmr +
#                             accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr + accuracy_list_lgb_20_mi_mrmr +
#                             accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr + accuracy_list_lgb_30_mi_mrmr +
#                             accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr + accuracy_list_lgb_50_mi_mrmr +
#                             accuracy_list_lr_75_mi_mrmr + accuracy_list_rfc_75_mi_mrmr + accuracy_list_xgb_75_mi_mrmr + accuracy_list_lgb_75_mi_mrmr +
#                             accuracy_list_lr_90_mi_mrmr + accuracy_list_rfc_90_mi_mrmr + accuracy_list_xgb_90_mi_mrmr + accuracy_list_lgb_90_mi_mrmr)

# overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr + f1_score_list_lgb_10_mi_mrmr +
#                             f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr + f1_score_list_lgb_20_mi_mrmr +
#                             f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr + f1_score_list_lgb_30_mi_mrmr +
#                             f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr + f1_score_list_lgb_50_mi_mrmr +
#                             f1_score_list_lr_75_mi_mrmr + f1_score_list_rfc_75_mi_mrmr + f1_score_list_xgb_75_mi_mrmr + f1_score_list_lgb_75_mi_mrmr +
#                             f1_score_list_lr_90_mi_mrmr + f1_score_list_rfc_90_mi_mrmr + f1_score_list_xgb_90_mi_mrmr + f1_score_list_lgb_90_mi_mrmr)

# overall_auc_list_mi_mrmr =  (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_xgb_10_mi_mrmr + auc_list_lgb_10_mi_mrmr +
#                         auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_xgb_20_mi_mrmr + auc_list_lgb_20_mi_mrmr +
#                         auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_xgb_30_mi_mrmr + auc_list_lgb_30_mi_mrmr +
#                         auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_xgb_50_mi_mrmr + auc_list_lgb_50_mi_mrmr +
#                         auc_list_lr_75_mi_mrmr + auc_list_rfc_75_mi_mrmr + auc_list_xgb_75_mi_mrmr + auc_list_lgb_75_mi_mrmr +
#                         auc_list_lr_90_mi_mrmr + auc_list_rfc_90_mi_mrmr + auc_list_xgb_90_mi_mrmr + auc_list_lgb_90_mi_mrmr)

overall_accuracy_list_mi_mrmr = (accuracy_list_svm_10_mi_mrmr + accuracy_list_svm_20_mi_mrmr + accuracy_list_svm_30_mi_mrmr + accuracy_list_svm_50_mi_mrmr + accuracy_list_svm_75_mi_mrmr + accuracy_list_svm_90_mi_mrmr)


# overall_param_list_mi_mrmr = (param_list_lr_10_mi_mrmr + param_list_rfc_10_mi_mrmr + param_list_svm_10_mi_mrmr + param_list_xgb_10_mi_mrmr + param_list_lgb_10_mi_mrmr +
#                             param_list_lr_20_mi_mrmr + param_list_rfc_20_mi_mrmr + param_list_svm_20_mi_mrmr + param_list_xgb_20_mi_mrmr + param_list_lgb_20_mi_mrmr +
#                             param_list_lr_30_mi_mrmr + param_list_rfc_30_mi_mrmr + param_list_svm_30_mi_mrmr + param_list_xgb_30_mi_mrmr + param_list_lgb_30_mi_mrmr +
#                             param_list_lr_50_mi_mrmr + param_list_rfc_50_mi_mrmr + param_list_svm_50_mi_mrmr + param_list_xgb_50_mi_mrmr + param_list_lgb_50_mi_mrmr +
#                             param_list_lr_75_mi_mrmr + param_list_rfc_75_mi_mrmr + param_list_svm_75_mi_mrmr + param_list_xgb_75_mi_mrmr + param_list_lgb_75_mi_mrmr +
#                             param_list_lr_90_mi_mrmr + param_list_rfc_90_mi_mrmr + param_list_svm_90_mi_mrmr + param_list_xgb_90_mi_mrmr + param_list_lgb_90_mi_mrmr)

In [39]:
overall_accuracy_list_pca = (accuracy_list_lr_10_pca + accuracy_list_rfc_10_pca + accuracy_list_xgb_10_pca + accuracy_list_lgb_10_pca +
                            accuracy_list_lr_20_pca + accuracy_list_rfc_20_pca + accuracy_list_xgb_20_pca + accuracy_list_lgb_20_pca +
                            accuracy_list_lr_30_pca + accuracy_list_rfc_30_pca + accuracy_list_xgb_30_pca + accuracy_list_lgb_30_pca +
                            accuracy_list_lr_50_pca + accuracy_list_rfc_50_pca + accuracy_list_xgb_50_pca + accuracy_list_lgb_50_pca +
                            accuracy_list_lr_75_pca + accuracy_list_rfc_75_pca + accuracy_list_xgb_75_pca + accuracy_list_lgb_75_pca +
                            accuracy_list_lr_90_pca + accuracy_list_rfc_90_pca + accuracy_list_xgb_90_pca + accuracy_list_lgb_90_pca)

overall_f1_score_list_pca = (f1_score_list_lr_10_pca + f1_score_list_rfc_10_pca + f1_score_list_xgb_10_pca + f1_score_list_lgb_10_pca +
                            f1_score_list_lr_20_pca + f1_score_list_rfc_20_pca + f1_score_list_xgb_20_pca + f1_score_list_lgb_20_pca +
                            f1_score_list_lr_30_pca + f1_score_list_rfc_30_pca + f1_score_list_xgb_30_pca + f1_score_list_lgb_30_pca +
                            f1_score_list_lr_50_pca + f1_score_list_rfc_50_pca + f1_score_list_xgb_50_pca + f1_score_list_lgb_50_pca +
                            f1_score_list_lr_75_pca + f1_score_list_rfc_75_pca + f1_score_list_xgb_75_pca + f1_score_list_lgb_75_pca +
                            f1_score_list_lr_90_pca + f1_score_list_rfc_90_pca + f1_score_list_xgb_90_pca + f1_score_list_lgb_90_pca)

overall_auc_list_pca = (auc_list_lr_10_pca + auc_list_rfc_10_pca + auc_list_xgb_10_pca + auc_list_lgb_10_pca +
                            auc_list_lr_20_pca + auc_list_rfc_20_pca + auc_list_xgb_20_pca + auc_list_lgb_20_pca +
                            auc_list_lr_30_pca + auc_list_rfc_30_pca + auc_list_xgb_30_pca + auc_list_lgb_30_pca +
                            auc_list_lr_50_pca + auc_list_rfc_50_pca + auc_list_xgb_50_pca + auc_list_lgb_50_pca +
                            auc_list_lr_75_pca + auc_list_rfc_75_pca + auc_list_xgb_75_pca + auc_list_lgb_75_pca +
                            auc_list_lr_90_pca + auc_list_rfc_90_pca + auc_list_xgb_90_pca + auc_list_lgb_90_pca)

# overall_accuracy_list_mi = (accuracy_list_svm_10_pca + accuracy_list_svm_20_pca + accuracy_list_svm_30_pca + accuracy_list_svm_50_pca + accuracy_list_svm_75_pca + accuracy_list_svm_90_pca)


# overall_param_list_pca = (param_list_lr_10_pca + param_list_rfc_10_pca + param_list_svm_10_pca + param_list_xgb_10_pca + param_list_lgb_10_pca +
#                             param_list_lr_20_pca + param_list_rfc_20_pca + param_list_svm_20_pca + param_list_xgb_20_pca + param_list_lgb_20_pca +
#                             param_list_lr_30_pca + param_list_rfc_30_pca + param_list_svm_30_pca + param_list_xgb_30_pca + param_list_lgb_30_pca +
#                             param_list_lr_50_pca + param_list_rfc_50_pca + param_list_svm_50_pca + param_list_xgb_50_pca + param_list_lgb_50_pca +
#                             param_list_lr_75_pca + param_list_rfc_75_pca + param_list_svm_75_pca + param_list_xgb_75_pca + param_list_lgb_75_pca +
#                             param_list_lr_90_pca + param_list_rfc_90_pca + param_list_svm_90_pca + param_list_xgb_90_pca + param_list_lgb_90_pca)


In [40]:
print(len(overall_accuracy_list_mi))
print(len(overall_f1_score_list_mi))
print(len(overall_auc_list_mi))
# print(len(overall_param_list_mi))

print(len(overall_accuracy_list_mrmr))
print(len(overall_f1_score_list_mrmr))
print(len(overall_auc_list_mrmr))
# print(len(overall_param_list_mrmr))

print(len(overall_accuracy_list_mi_mrmr))
print(len(overall_f1_score_list_mi_mrmr))
print(len(overall_auc_list_mi_mrmr))
# print(len(overall_param_list_mi_mrmr))

print(len(overall_accuracy_list_pca))
print(len(overall_f1_score_list_pca))
print(len(overall_auc_list_pca))
# print(len(overall_param_list_pca))

240
240
240
240
240
240
240
240
240
240
240
240


In [1]:
results_dictionary = {
    'samples': filename_sample_list_final,
    'models': models,
    'percentiles': percentiles,
    'mi_accuracy': overall_accuracy_list_mi,
    'mi_f1_score': overall_f1_score_list_mi,
    'mi_auc': overall_auc_list_mi,
    'mrmr_accuracy': overall_accuracy_list_mrmr,
    'mrmr_f1_score': overall_f1_score_list_mrmr,
    'mrmr_auc': overall_auc_list_mrmr,
    'mi_mrmr_accuracy': overall_accuracy_list_mi_mrmr,
    'mi_mrmr_f1_score': overall_f1_score_list_mi_mrmr,
    'mi_mrmr_auc': overall_auc_list_mi_mrmr,
    'pca_accuracy': overall_accuracy_list_pca,
    'pca_f1_score': overall_f1_score_list_pca,
    'pca_auc': overall_auc_list_pca,
}
results_df = pd.DataFrame(results_dictionary)

results_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results_v2/overall_results_20_v2.csv', index=False)

NameError: name 'filename_sample_list_final' is not defined