In [72]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.decomposition import *

import xgboost as xgb
import lightgbm as lgb

# import tensorflow as tf

import os
import re
import ast

In [73]:
length_text = 10
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/short_trajectory/subsamples/{}/'.format(length_text)
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/short_trajectory/features/{}/'.format(length_text)

def get_sample_df(directory=directory_dataframes):
    filename_list = []
    list_dataframes = []
    for filename in os.listdir(directory):
        print(filename)
        filename_list.append(filename)
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes, filename_list

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/short_trajectory/features/{}/{}'.format(length_text, regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes, filename_sample_list = get_sample_df(directory_dataframes)

subsample_8.csv


subsample_9.csv
subsample_7.csv
subsample_6.csv
subsample_10.csv
subsample_4.csv
subsample_5.csv
subsample_1.csv
subsample_2.csv
subsample_3.csv


In [74]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes, params=None):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    best_params_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        x = x.rename(columns = lambda a:re.sub('[^A-Za-z0-9_]+', '', a))
        
        y = sample['conversion_class']
        # print(y.value_counts())
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        
        
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        best_params_list.append(clf.best_params_)
        
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    print(accuracy_list)
    print(auc_list)
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    print('Best Params based on Max Accuracy', best_params_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, best_params_list  


# Mutual Information

## 10 Percentile

In [75]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi, param_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi, param_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi, param_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi, param_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mi, f1_score_list_lgb_10_mi, auc_list_lgb_10_mi, param_list_lgb_10_mi = model_train_predict(lgbc, 'mi_feat_list_10', params=params)

(11200, 15) (11200,) (2800, 15) (2800,)
Average Accuracy 0.7206785714285714
Average F1 Score 0.6883725766688009
Average AUC 0.7206785714285715
Max Accuracy 0.7260714285714286
Max F1 Score 0.6955140928940056
Max AUC 0.7260714285714286
[0.7135714285714285, 0.7175, 0.7242857142857143, 0.7260714285714286, 0.7192857142857143, 0.7260714285714286, 0.7185714285714285, 0.7135714285714285, 0.7257142857142858, 0.7221428571428572]
[0.7135714285714285, 0.7175, 0.7242857142857143, 0.7260714285714286, 0.7192857142857143, 0.7260714285714286, 0.7185714285714286, 0.7135714285714285, 0.7257142857142856, 0.7221428571428572]
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_5_node_entropy', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on 

## 20 Percentile

In [76]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi, param_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi, param_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi, param_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi, param_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mi, f1_score_list_lgb_20_mi, auc_list_lgb_20_mi, param_list_lgb_20_mi = model_train_predict(lgbc, 'mi_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(11200, 29) (11200,) (2800, 29) (2800,)
Average Accuracy 0.7248928571428571
Average F1 Score 0.6919722814437563
Average AUC 0.7248928571428571
Max Accuracy 0.7296428571428571
Max F1 Score 0.6970788315326131
Max AUC 0.7296428571428571
[0.715, 0.7278571428571429, 0.7282142857142857, 0.7264285714285714, 0.7232142857142857, 0.7296428571428571, 0.7253571428571428, 0.7210714285714286, 0.7285714285714285, 0.7235714285714285]
[0.715, 0.7278571428571429, 0.7282142857142857, 0.7264285714285714, 0.7232142857142856, 0.7296428571428571, 0.7253571428571429, 0.7210714285714285, 0.7285714285714285, 0.7235714285714285]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(6, 2)', '(1, 4)', '(4, 1)', '(3, 2)', '(1, 3)', '(3, 3)', '(1, 2, 1)', '(2, 1, 2

## 30 Percentile

In [77]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi, param_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi, param_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi, param_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi, param_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mi, f1_score_list_lgb_30_mi, auc_list_lgb_30_mi, param_list_lgb_30_mi = model_train_predict(lgbc, 'mi_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 43) (11200,) (2800, 43) (2800,)
Average Accuracy 0.7253928571428572
Average F1 Score 0.6935155699772809
Average AUC 0.7253928571428572
Max Accuracy 0.7328571428571429
Max F1 Score 0.6998011928429423
Max AUC 0.732857142857143
[0.7157142857142857, 0.7267857142857143, 0.7328571428571429, 0.7242857142857143, 0.7239285714285715, 0.7303571428571428, 0.7246428571428571, 0.7225, 0.7303571428571428, 0.7225]
[0.7157142857142856, 0.7267857142857143, 0.732857142857143, 0.7242857142857143, 0.7239285714285715, 0.7303571428571429, 0.7246428571428573, 0.7225, 0.7303571428571428, 0.7225]
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 2)', '(1, 3)', '(3, 3)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 2)', '(1, 2, 3)', '(2,

## 50 Percentile

In [78]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi, param_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi, param_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi, param_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi, param_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mi, f1_score_list_lgb_50_mi, auc_list_lgb_50_mi, param_list_lgb_50_mi = model_train_predict(lgbc, 'mi_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 71) (11200,) (2800, 71) (2800,)
Average Accuracy 0.7263214285714286
Average F1 Score 0.6945191041505876
Average AUC 0.7263214285714286
Max Accuracy 0.7310714285714286
Max F1 Score 0.7001194743130227
Max AUC 0.7310714285714286
[0.7185714285714285, 0.7264285714285714, 0.7278571428571429, 0.7289285714285715, 0.7275, 0.7246428571428571, 0.7260714285714286, 0.7235714285714285, 0.7285714285714285, 0.7310714285714286]
[0.7185714285714286, 0.7264285714285714, 0.7278571428571429, 0.7289285714285714, 0.7275, 0.7246428571428571, 0.7260714285714286, 0.7235714285714286, 0.7285714285714285, 0.7310714285714286]
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '

## 75 Percentile

In [79]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi, f1_score_list_lr_75_mi, auc_list_lr_75_mi, param_list_lr_75_mi = model_train_predict(lr, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi, f1_score_list_rfc_75_mi, auc_list_rfc_75_mi, param_list_rfc_75_mi = model_train_predict(rfc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_75_mi, f1_score_list_svm_75_mi, auc_list_svm_75_mi, param_list_svm_75_mi = model_train_predict(svc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi, f1_score_list_xgb_75_mi, auc_list_xgb_75_mi, param_list_xgb_75_mi = model_train_predict(xgbc, 'mi_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mi, f1_score_list_lgb_75_mi, auc_list_lgb_75_mi, param_list_lgb_75_mi = model_train_predict(lgbc, 'mi_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 107) (11200,) (2800, 107) (2800,)
Average Accuracy 0.7289285714285716
Average F1 Score 0.6975601277343039
Average AUC 0.7289285714285716
Max Accuracy 0.7339285714285714
Max F1 Score 0.7016419703644373
Max AUC 0.7339285714285714
[0.7275, 0.7292857142857143, 0.7296428571428571, 0.7267857142857143, 0.7257142857142858, 0.7282142857142857, 0.7264285714285714, 0.7307142857142858, 0.7310714285714286, 0.7339285714285714]
[0.7274999999999999, 0.7292857142857143, 0.7296428571428571, 0.7267857142857143, 0.7257142857142858, 0.7282142857142858, 0.7264285714285714, 0.7307142857142859, 0.7310714285714286, 0.7339285714285714]
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 2)',

## 90 Percentile

In [80]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi, f1_score_list_lr_90_mi, auc_list_lr_90_mi, param_list_lr_90_mi = model_train_predict(lr, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi, f1_score_list_rfc_90_mi, auc_list_rfc_90_mi, param_list_rfc_90_mi = model_train_predict(rfc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_90_mi, f1_score_list_svm_90_mi, auc_list_svm_90_mi, param_list_svm_90_mi = model_train_predict(svc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi, f1_score_list_xgb_90_mi, auc_list_xgb_90_mi, param_list_xgb_90_mi = model_train_predict(xgbc, 'mi_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mi, f1_score_list_lgb_90_mi, auc_list_lgb_90_mi, param_list_lgb_90_mi = model_train_predict(lgbc, 'mi_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 128) (11200,) (2800, 128) (2800,)
Average Accuracy 0.7300000000000001
Average F1 Score 0.6993716975556258
Average AUC 0.7300000000000001
Max Accuracy 0.7357142857142858
Max F1 Score 0.703588143525741
Max AUC 0.7357142857142858
[0.7253571428571428, 0.7285714285714285, 0.7321428571428571, 0.7289285714285715, 0.7310714285714286, 0.7282142857142857, 0.7282142857142857, 0.7317857142857143, 0.73, 0.7357142857142858]
[0.7253571428571429, 0.7285714285714285, 0.7321428571428572, 0.7289285714285715, 0.7310714285714286, 0.7282142857142857, 0.7282142857142857, 0.7317857142857143, 0.7300000000000001, 0.7357142857142858]
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(1,)', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 6)', '(6, 1)', '(2, 1)', '(2, 2)', '(

# mRMR

## 10 Percentile

In [81]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr, param_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr, param_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr, param_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr, param_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mrmr, f1_score_list_lgb_10_mrmr, auc_list_lgb_10_mrmr, param_list_lgb_10_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_10', params=params)

(11200, 15) (11200,) (2800, 15) (2800,)
Average Accuracy 0.7232142857142858
Average F1 Score 0.6924013677222073
Average AUC 0.7232142857142858
Max Accuracy 0.7289285714285715
Max F1 Score 0.6967895362663497
Max AUC 0.7289285714285714
[0.7157142857142857, 0.7253571428571428, 0.7242857142857143, 0.7285714285714285, 0.7210714285714286, 0.7267857142857143, 0.7225, 0.7171428571428572, 0.7289285714285715, 0.7217857142857143]
[0.7157142857142857, 0.7253571428571429, 0.7242857142857143, 0.7285714285714285, 0.7210714285714286, 0.7267857142857143, 0.7224999999999999, 0.7171428571428571, 0.7289285714285714, 0.7217857142857143]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(3,)', '(1, 2, 1)', '(3, 1)', '(2, 3)', '(1, 4)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 3)']
Best Features base

## 20 Percentile

In [82]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr, param_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr, param_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr, param_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr, param_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mrmr, f1_score_list_lgb_20_mrmr, auc_list_lgb_20_mrmr, param_list_lgb_20_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_20', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(11200, 29) (11200,) (2800, 29) (2800,)
Average Accuracy 0.7240714285714287
Average F1 Score 0.6891354899602842
Average AUC 0.7240714285714287
Max Accuracy 0.73
Max F1 Score 0.6936790923824959
Max AUC 0.73
[0.7128571428571429, 0.7239285714285715, 0.7275, 0.7278571428571429, 0.7239285714285715, 0.7267857142857143, 0.7217857142857143, 0.7203571428571428, 0.73, 0.7257142857142858]
[0.7128571428571429, 0.7239285714285714, 0.7274999999999999, 0.7278571428571428, 0.7239285714285715, 0.7267857142857143, 0.7217857142857143, 0.7203571428571428, 0.73, 0.7257142857142859]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(3,)', '(1, 2, 1)', '(3, 1)', '(2, 3)', '(1, 4)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 3)', '(2,)', '(6, 3)', '(4,)', '(2, 1, 1)', '(3, 2)', '(3, 3)', '(4, 1)', '(2, 

## 30 Percentile

In [83]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr, param_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr, param_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr, param_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr, param_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mrmr, f1_score_list_lgb_30_mrmr, auc_list_lgb_30_mrmr, param_list_lgb_30_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 43) (11200,) (2800, 43) (2800,)
Average Accuracy 0.7267142857142856
Average F1 Score 0.6928834299446874
Average AUC 0.7267142857142856
Max Accuracy 0.7321428571428571
Max F1 Score 0.6999606763664962
Max AUC 0.7321428571428572
[0.7164285714285714, 0.7275, 0.7285714285714285, 0.7303571428571428, 0.7239285714285715, 0.7310714285714286, 0.7239285714285715, 0.7232142857142857, 0.7321428571428571, 0.73]
[0.7164285714285714, 0.7275, 0.7285714285714285, 0.7303571428571428, 0.7239285714285714, 0.7310714285714286, 0.7239285714285715, 0.7232142857142857, 0.7321428571428572, 0.7299999999999999]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(3,)', '(1, 2, 1)', '(3, 1)', '(2, 3)', '(1, 4)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 3)', '(2,)', '(6, 3)', '(4,)', '(2, 1, 1)', '(3, 

## 50 Percentile

In [84]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr, param_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr, param_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr, param_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr, param_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mrmr, f1_score_list_lgb_50_mrmr, auc_list_lgb_50_mrmr, param_list_lgb_50_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 71) (11200,) (2800, 71) (2800,)
Average Accuracy 0.7269642857142856
Average F1 Score 0.69440059030329
Average AUC 0.7269642857142856
Max Accuracy 0.7332142857142857
Max F1 Score 0.6989117291414751
Max AUC 0.7332142857142858
[0.7171428571428572, 0.7253571428571428, 0.7296428571428571, 0.7282142857142857, 0.7271428571428571, 0.7332142857142857, 0.7232142857142857, 0.7239285714285715, 0.7314285714285714, 0.7303571428571428]
[0.7171428571428572, 0.7253571428571427, 0.729642857142857, 0.7282142857142858, 0.7271428571428571, 0.7332142857142858, 0.7232142857142857, 0.7239285714285715, 0.7314285714285714, 0.7303571428571428]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(3,)', '(3, 1)', '(1, 2, 1)', '(1, 4)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 3)', '(4,)', '

## 75 Percentile

In [85]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mrmr, f1_score_list_lr_75_mrmr, auc_list_lr_75_mrmr, param_list_lr_75_mrmr = model_train_predict(lr, 'mrmr_feat_list_75*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mrmr, f1_score_list_rfc_75_mrmr, auc_list_rfc_75_mrmr, param_list_rfc_75_mrmr = model_train_predict(rfc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_75_mrmr, f1_score_list_svm_75_mrmr, auc_list_svm_75_mrmr, param_list_svm_75_mrmr = model_train_predict(svc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mrmr, f1_score_list_xgb_75_mrmr, auc_list_xgb_75_mrmr, param_list_xgb_75_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mrmr, f1_score_list_lgb_75_mrmr, auc_list_lgb_75_mrmr, param_list_lgb_75_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 107) (11200,) (2800, 107) (2800,)
Average Accuracy 0.727107142857143
Average F1 Score 0.6971427458046742
Average AUC 0.727107142857143
Max Accuracy 0.7346428571428572
Max F1 Score 0.7056056448451589
Max AUC 0.7346428571428572
[0.72, 0.7257142857142858, 0.7278571428571429, 0.7235714285714285, 0.7275, 0.7346428571428572, 0.7207142857142858, 0.7317857142857143, 0.7303571428571428, 0.7289285714285715]
[0.72, 0.7257142857142858, 0.7278571428571428, 0.7235714285714286, 0.7275, 0.7346428571428572, 0.7207142857142858, 0.7317857142857143, 0.7303571428571429, 0.7289285714285715]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(3,)', '(3, 1)', '(1, 2, 1)', '(1, 4)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(3, 1, 1)', '(1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 3)', '(4,)', '(6, 3)', '(2,)', '(3, 2)', '(3, 3)', '(4, 1)', 't

## 90 Percentile

In [86]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mrmr, f1_score_list_lr_90_mrmr, auc_list_lr_90_mrmr, param_list_lr_90_mrmr = model_train_predict(lr, 'mrmr_feat_list_90*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mrmr, f1_score_list_rfc_90_mrmr, auc_list_rfc_90_mrmr, param_list_rfc_90_mrmr = model_train_predict(rfc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_90_mrmr, f1_score_list_svm_90_mrmr, auc_list_svm_90_mrmr, param_list_svm_90_mrmr = model_train_predict(svc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mrmr, f1_score_list_xgb_90_mrmr, auc_list_xgb_90_mrmr, param_list_xgb_90_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mrmr, f1_score_list_lgb_90_mrmr, auc_list_lgb_90_mrmr, param_list_lgb_90_mrmr = model_train_predict(lgbc, 'mrmr_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 128) (11200,) (2800, 128) (2800,)
Average Accuracy 0.7288571428571429
Average F1 Score 0.698219196289645
Average AUC 0.7288571428571429
Max Accuracy 0.7325
Max F1 Score 0.7013806706114398
Max AUC 0.7325
[0.725, 0.7285714285714285, 0.7325, 0.7282142857142857, 0.7267857142857143, 0.7296428571428571, 0.73, 0.73, 0.7307142857142858, 0.7271428571428571]
[0.725, 0.7285714285714286, 0.7325, 0.7282142857142857, 0.7267857142857143, 0.7296428571428571, 0.73, 0.7299999999999999, 0.7307142857142858, 0.7271428571428572]
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['(2, 1)', '(2, 1, 4)', 'unigram_entropy', '(3,)', '(2, 3)', '(1, 2, 1)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(2, 1, 2)', '(1, 2, 3)', '(1, 4)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', 'bigram_entropy', '(1, 3)', '(6, 3)', '(2,)', '(3, 2)', '(4,)', 'trigram_entropy', '(3, 3)', '(2, 6, 3)', '(4, 1, 1)', '(3, 1, 2)', '(2,

# MI and mRMR

## 10 Percentile

In [87]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr, param_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr, param_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr, param_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr, param_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_mi_mrmr, f1_score_list_lgb_10_mi_mrmr, auc_list_lgb_10_mi_mrmr, param_list_lgb_10_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_10', params=params)

(11200, 12) (11200,) (2800, 12) (2800,)
Average Accuracy 0.7149642857142857
Average F1 Score 0.6865294873391995
Average AUC 0.7149642857142857
Max Accuracy 0.7246428571428571
Max F1 Score 0.6956178444532175
Max AUC 0.7246428571428574
[0.7110714285714286, 0.7092857142857143, 0.7192857142857143, 0.7135714285714285, 0.7189285714285715, 0.7164285714285714, 0.7132142857142857, 0.71, 0.7246428571428571, 0.7132142857142857]
[0.7110714285714286, 0.7092857142857143, 0.7192857142857142, 0.7135714285714285, 0.7189285714285714, 0.7164285714285714, 0.7132142857142856, 0.71, 0.7246428571428574, 0.7132142857142857]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 3, 1)', '(2, 1)', '(3, 1)', '(3,)', '(1, 4)', 'bigram_entropy', '(3, 1, 1)', '(2, 1, 2)', '(1, 2, 1)', '(1, 2)', 'unigram_entropy', '(2, 3)', '(1, 2, 3)']
Best Features based on Max F1 Score ['(2, 3, 1)', '(2, 1)', '(3, 1)', '(3,

## 20 Percentile

In [88]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr, param_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr, param_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr, param_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr, param_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_mi_mrmr, f1_score_list_lgb_20_mi_mrmr, auc_list_lgb_20_mi_mrmr, param_list_lgb_20_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_20', params=params)

(11200, 25) (11200,) (2800, 25) (2800,)
Average Accuracy 0.7241785714285714
Average F1 Score 0.6903865939213955
Average AUC 0.7241785714285714
Max Accuracy 0.73
Max F1 Score 0.6956172094893446
Max AUC 0.7300000000000002
[0.715, 0.7242857142857143, 0.7296428571428571, 0.7267857142857143, 0.7217857142857143, 0.7267857142857143, 0.7214285714285714, 0.7207142857142858, 0.73, 0.7253571428571428]
[0.715, 0.7242857142857142, 0.7296428571428571, 0.7267857142857144, 0.7217857142857143, 0.7267857142857144, 0.7214285714285714, 0.7207142857142856, 0.7300000000000002, 0.7253571428571428]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(1, 3)', '(4,)', 'pattern_hvg_5_node_entropy', '(3, 1, 2)', '(2, 3, 1)', 'bigram_entropy', '(1, 2, 1)', '(2, 3, 2)', '(4, 1, 1)', 'unigram_entropy', '(1, 2, 3)', '(3,)', '(1, 4)', '(4, 1)', '(2, 3)', '(3, 3)', '(2,)', '(2, 1)', 'trigram_entropy', '(3, 1)', '(

## 30 Percentile

In [89]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr, param_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr, param_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr, param_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr, param_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_mi_mrmr, f1_score_list_lgb_30_mi_mrmr, auc_list_lgb_30_mi_mrmr, param_list_lgb_30_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_30', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(11200, 31) (11200,) (2800, 31) (2800,)
Average Accuracy 0.7239285714285714
Average F1 Score 0.6890963386333072
Average AUC 0.7239285714285714
Max Accuracy 0.7303571428571428
Max F1 Score 0.6947027901334412
Max AUC 0.7303571428571428
[0.7135714285714285, 0.7246428571428571, 0.7278571428571429, 0.7260714285714286, 0.7225, 0.7278571428571429, 0.7207142857142858, 0.7207142857142858, 0.7303571428571428, 0.725]
[0.7135714285714285, 0.7246428571428571, 0.7278571428571428, 0.7260714285714286, 0.7225000000000001, 0.7278571428571429, 0.7207142857142856, 0.7207142857142856, 0.7303571428571428, 0.725]
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 1, 1)', '(1, 3)', '(4,)', 'pattern_hvg_5_node_entropy', '(3, 3, 3)', '(6, 3)', '(3, 2)', '(3, 1, 2)', '(2, 3, 1)', '(6, 2, 3)', 'bigram_entropy', '(1, 2, 1)', '(4, 1, 1)', 'unigram_entropy', '(2, 1, 4)', '(1, 2, 3)', '(3,)', '(1, 4)', '(4,

## 50 Percentile

In [90]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr, param_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50*', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr, param_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr, param_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr, param_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_mi_mrmr, f1_score_list_lgb_50_mi_mrmr, auc_list_lgb_50_mi_mrmr, param_list_lgb_50_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_50', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 51) (11200,) (2800, 51) (2800,)
Average Accuracy 0.72675
Average F1 Score 0.6943169543496983
Average AUC 0.72675
Max Accuracy 0.7325
Max F1 Score 0.6994492525570418
Max AUC 0.7324999999999999
[0.7164285714285714, 0.7264285714285714, 0.73, 0.7271428571428571, 0.725, 0.7325, 0.7221428571428572, 0.7271428571428571, 0.7303571428571428, 0.7303571428571428]
[0.7164285714285714, 0.7264285714285714, 0.73, 0.7271428571428571, 0.725, 0.7324999999999999, 0.7221428571428572, 0.7271428571428572, 0.7303571428571428, 0.7303571428571429]
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['(2, 1, 1)', '(1, 3)', '(4,)', '(3, 4)', 'N5', 'pattern_hvg_5_node_entropy', '(1, 1, 2)', '(2, 2, 3)', '(3, 3, 3)', '(2, 6)', '(2, 3, 3)', '(1,)', '(6, 3)', '(3, 2)', '(3, 1, 2)', '(2, 3, 1)', '(6, 2, 3)', 'bigram_entropy', '(1, 2, 1)', '(2, 3, 2)', '(4, 1, 1)', '(2, 4)', '(4, 1, 4)', '(3, 3, 2)', 'unigra

## 75 Percentile

In [91]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_mi_mrmr, f1_score_list_lr_75_mi_mrmr, auc_list_lr_75_mi_mrmr, param_list_lr_75_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_mi_mrmr, f1_score_list_rfc_75_mi_mrmr, auc_list_rfc_75_mi_mrmr, param_list_rfc_75_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_75_mi_mrmr, f1_score_list_svm_75_mi_mrmr, auc_list_svm_75_mi_mrmr, param_list_svm_75_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_mi_mrmr, f1_score_list_xgb_75_mi_mrmr, auc_list_xgb_75_mi_mrmr, param_list_xgb_75_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_75', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_mi_mrmr, f1_score_list_lgb_75_mi_mrmr, auc_list_lgb_75_mi_mrmr, param_list_lgb_75_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_75', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11200, 81) (11200,) (2800, 81) (2800,)
Average Accuracy 0.7269642857142858
Average F1 Score 0.6965453592741321
Average AUC 0.7269642857142858
Max Accuracy 0.7314285714285714
Max F1 Score 0.7020602218700475
Max AUC 0.7314285714285715
[0.7207142857142858, 0.7292857142857143, 0.7314285714285714, 0.7264285714285714, 0.7275, 0.7285714285714285, 0.7207142857142858, 0.7275, 0.7289285714285715, 0.7285714285714285]
[0.7207142857142856, 0.7292857142857142, 0.7314285714285715, 0.7264285714285714, 0.7275, 0.7285714285714285, 0.7207142857142856, 0.7275, 0.7289285714285714, 0.7285714285714285]
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['(2, 1, 1)', 'pattern_hvg_5_node_entropy', '(1, 6, 2)', '(6, 3)', '(2, 3, 2)', 'E4', '(2, 4, 1)', '(4, 1, 1)', '(1, 4, 2)', 'unigram_entropy', '(6, 1, 6)', '(2, 1, 3)', 'T5', '(1, 1, 3)', '(1, 6, 3)', 'Q5', '(3, 3)', '(3, 1, 1)', '(1, 2)', '(3, 4, 4)', '

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_mi_mrmr, f1_score_list_lr_90_mi_mrmr, auc_list_lr_90_mi_mrmr, param_list_lr_90_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_mi_mrmr, f1_score_list_rfc_90_mi_mrmr, auc_list_rfc_90_mi_mrmr, param_list_rfc_90_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_90_mi_mrmr, f1_score_list_svm_90_mi_mrmr, auc_list_svm_90_mi_mrmr, param_list_svm_90_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_mi_mrmr, f1_score_list_xgb_90_mi_mrmr, auc_list_xgb_90_mi_mrmr, param_list_xgb_90_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_90', params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_mi_mrmr, f1_score_list_lgb_90_mi_mrmr, auc_list_lgb_90_mi_mrmr, param_list_lgb_90_mi_mrmr = model_train_predict(lgbc, 'mi_mrmr_feat_list_90', params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(11577, 119) (11577,) (2895, 119) (2895,)
Average Accuracy 0.7267702936096718
Average F1 Score 0.6951194888474277
Average AUC 0.7267345374503164
Max Accuracy 0.7322970639032815
Max F1 Score 0.707877874104787
Max AUC 0.7322682765256371
[0.7291882556131261, 0.7212435233160622, 0.7319516407599309, 0.7291882556131261, 0.7322970639032815, 0.730915371329879, 0.7295336787564767, 0.7177892918825561, 0.7291882556131261, 0.7164075993091538]
[0.7291517122490044, 0.7212068596868354, 0.7319155749941773, 0.7291471781968409, 0.7322682765256371, 0.7308765611457502, 0.7294970161164078, 0.7177576391619925, 0.7291502804430581, 0.7163742759834598]
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(2, 1, 1)', 'pattern_hvg_5_node_entropy', 'O5', '(3, 4, 1)', '(1, 6, 2)', '(6, 3)', 'K5', '(4, 4, 4)', '(2, 3, 2)', '(2, 4, 1)', '(4, 1, 1)', '(1, 4, 2)', 'unigram_entropy', '(6, 1, 6)', '(1, 2, 4)', '(2, 

# PCA

In [None]:
def model_train_predict_pca(model, k, dataframes=list_sample_dataframes, params=None):
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    param_list = []
    
    for sample in dataframes:
        x = sample.drop(['Unnamed: 0', 'conversion_class'], axis=1)
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
        
        pca = PCA(n_components=k)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)
        
        clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        # model.fit(x_train, y_train)
        # y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))
        param_list.append(clf.best_params_)

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    print('Best Parameters', param_list[best_accuracy_index])
    
    return accuracy_list, f1_score_list, auc_list, param_list  

## 10 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_10_pca, f1_score_list_lr_10_pca, auc_list_lr_10_pca, param_list_lr_10_pca = model_train_predict_pca(lr, 14, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_10_pca, f1_score_list_rfc_10_pca, auc_list_rfc_10_pca, param_list_rfc_10_pca = model_train_predict_pca(rfc, 14, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_10_pca, f1_score_list_svm_10_pca, auc_list_svm_10_pca, param_list_svm_10_pca = model_train_predict_pca(svc, 14, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_10_pca, f1_score_list_xgb_10_pca, auc_list_xgb_10_pca, param_list_xgb_10_pca = model_train_predict_pca(xgbc, 14, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_10_pca, f1_score_list_lgb_10_pca, auc_list_lgb_10_pca, param_list_lgb_10_pca = model_train_predict_pca(lgbc, 14, params=params)

Average Accuracy 0.7082220175034546
Average F1 Score 0.6869494123746714
Average AUC 0.7082220175034546
Max Accuracy 0.7167204053431598
Max F1 Score 0.6926536731634183
Max AUC 0.7167204053431598
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.7134039613081529
Average F1 Score 0.6993223047507815
Average AUC 0.7134039613081529
Max Accuracy 0.7240902809765085
Max F1 Score 0.7093643862202813
Max AUC 0.7240902809765086
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Parameters {'max_depth': 10, 'n_estimators': 100}




Average Accuracy 0.7120221096269
Average F1 Score 0.6938674254539988
Average AUC 0.7120221096269
Max Accuracy 0.7245508982035929
Max F1 Score 0.7064310260186548
Max AUC 0.7245508982035928
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Sc

## 20 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_20_pca, f1_score_list_lr_20_pca, auc_list_lr_20_pca, param_list_lr_20_pca = model_train_predict_pca(lr, 28, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_20_pca, f1_score_list_rfc_20_pca, auc_list_rfc_20_pca, param_list_rfc_20_pca = model_train_predict_pca(rfc, 28, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_20_pca, f1_score_list_svm_20_pca, auc_list_svm_20_pca, param_list_svm_20_pca = model_train_predict_pca(svc, 28, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_20_pca, f1_score_list_xgb_20_pca, auc_list_xgb_20_pca, param_list_xgb_20_pca = model_train_predict_pca(xgbc, 28, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_20_pca, f1_score_list_lgb_20_pca, auc_list_lgb_20_pca, param_list_lgb_20_pca = model_train_predict_pca(lgbc, 28, params=params)

Average Accuracy 0.718770152003685
Average F1 Score 0.6913630638007733
Average AUC 0.718770152003685
Max Accuracy 0.7252418240442192
Max F1 Score 0.6961740435108777
Max AUC 0.7252418240442192
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 3
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.7198295716259788
Average F1 Score 0.6937269405967577
Average AUC 0.7198295716259788
Max Accuracy 0.728005527406725
Max F1 Score 0.6990242682011508
Max AUC 0.7280055274067251
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 3
Best Parameters {'max_depth': 10, 'n_estimators': 100}




Average Accuracy 0.7180792261630584
Average F1 Score 0.691778897203769
Average AUC 0.7180792261630585
Max Accuracy 0.7305389221556886
Max F1 Score 0.7030456852791878
Max AUC 0.7305389221556886
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 

## 30 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_30_pca, f1_score_list_lr_30_pca, auc_list_lr_30_pca, param_list_lr_30_pca = model_train_predict_pca(lr, 42, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_30_pca, f1_score_list_rfc_30_pca, auc_list_rfc_30_pca, param_list_rfc_30_pca = model_train_predict_pca(rfc, 42, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_30_pca, f1_score_list_svm_30_pca, auc_list_svm_30_pca, param_list_svm_30_pca = model_train_predict_pca(svc, 42, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_30_pca, f1_score_list_xgb_30_pca, auc_list_xgb_30_pca, param_list_xgb_30_pca = model_train_predict_pca(xgbc, 42, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_30_pca, f1_score_list_lgb_30_pca, auc_list_lgb_30_pca, param_list_lgb_30_pca = model_train_predict_pca(lgbc, 42, params=params)

Average Accuracy 0.7238599723629664
Average F1 Score 0.6927581680958473
Average AUC 0.7238599723629664
Max Accuracy 0.7284661446338093
Max F1 Score 0.697459584295612
Max AUC 0.7284661446338092
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 3
Best Parameters {'C': 10, 'max_iter': 1000}


Average Accuracy 0.7204514048825426
Average F1 Score 0.6899453064401337
Average AUC 0.7204514048825426
Max Accuracy 0.7270842929525564
Max F1 Score 0.6964420893262678
Max AUC 0.7270842929525565
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 3
Best Parameters {'max_depth': 10, 'n_estimators': 50}




Average Accuracy 0.7215108245048365
Average F1 Score 0.6936305842427877
Average AUC 0.7215108245048365
Max Accuracy 0.7275449101796407
Max F1 Score 0.7006018054162488
Max AUC 0.7275449101796407
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max 

## 50 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_50_pca, f1_score_list_lr_50_pca, auc_list_lr_50_pca, param_list_lr_50_pca = model_train_predict_pca(lr, 69, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_50_pca, f1_score_list_rfc_50_pca, auc_list_rfc_50_pca, param_list_rfc_50_pca = model_train_predict_pca(rfc, 69, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_50_pca, f1_score_list_svm_50_pca, auc_list_svm_50_pca, param_list_svm_50_pca = model_train_predict_pca(svc, 69, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_50_pca, f1_score_list_xgb_50_pca, auc_list_xgb_50_pca, param_list_xgb_50_pca = model_train_predict_pca(xgbc, 69, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_50_pca, f1_score_list_lgb_50_pca, auc_list_lgb_50_pca, param_list_lgb_50_pca = model_train_predict_pca(lgbc, 69, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Average Accuracy 0.7247121142330724
Average F1 Score 0.6917009877742535
Average AUC 0.7247121142330724
Max Accuracy 0.7291570704744358
Max F1 Score 0.6996430392656808
Max AUC 0.7291570704744358
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 3
Best Parameters {'C': 1000, 'max_iter': 1000}


Average Accuracy 0.720059880239521
Average F1 Score 0.6872230363847164
Average AUC 0.720059880239521
Max Accuracy 0.7270842929525564
Max F1 Score 0.6927663987555095
Max AUC 0.7270842929525565
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Parameters {'max_depth': 10, 'n_estimators': 50}




Average Accuracy 0.7198295716259788
Average F1 Score 0.6903777117052374
Average AUC 0.7198295716259788
Max Accuracy 0.7254721326577614
Max F1 Score 0.6983565107458912
Max AUC 0.7254721326577614
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max

## 75 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_75_pca, f1_score_list_lr_75_pca, auc_list_lr_75_pca, param_list_lr_75_pca = model_train_predict_pca(lr, 104, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_75_pca, f1_score_list_rfc_75_pca, auc_list_rfc_75_pca, param_list_rfc_75_pca = model_train_predict_pca(rfc, 104, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_75_pca, f1_score_list_svm_75_pca, auc_list_svm_75_pca, param_list_svm_75_pca = model_train_predict_pca(svc, 104, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_75_pca, f1_score_list_xgb_75_pca, auc_list_xgb_75_pca, param_list_xgb_75_pca = model_train_predict_pca(xgbc, 104, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_75_pca, f1_score_list_lgb_75_pca, auc_list_lgb_75_pca, param_list_lgb_75_pca = model_train_predict_pca(lgbc, 104, params=params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.7243666513127589
Average F1 Score 0.6908610683180784
Average AUC 0.7243666513127591
Max Accuracy 0.7286964532473514
Max F1 Score 0.6994897959183672
Max AUC 0.7286964532473515
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.7195071395670198
Average F1 Score 0.6850853156046011
Average AUC 0.7195071395670197
Max Accuracy 0.728005527406725
Max F1 Score 0.6917253980683894
Max AUC 0.728005527406725
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Parameters {'max_depth': 10, 'n_estimators': 100}




Average Accuracy 0.7204514048825426
Average F1 Score 0.69111402252016
Average AUC 0.7204514048825426
Max Accuracy 0.7259327498848457
Max F1 Score 0.6993183539510224
Max AUC 0.7259327498848457
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 

## 90 Percentile

In [None]:
lr = LogisticRegression()
params = {'C': [1, 10, 100, 1000], 'max_iter': [1000, 2000, 5000, 10000]}
accuracy_list_lr_90_pca, f1_score_list_lr_90_pca, auc_list_lr_90_pca, param_list_lr_90_pca = model_train_predict_pca(lr, 125, params=params)
print("\n================================================================\n")
rfc = RandomForestClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_rfc_90_pca, f1_score_list_rfc_90_pca, auc_list_rfc_90_pca, param_list_rfc_90_pca = model_train_predict_pca(rfc, 125, params=params)
print("\n================================================================\n")
# svc = SVC()
# params = {'C': [ 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale'],}
# accuracy_list_svm_90_pca, f1_score_list_svm_90_pca, auc_list_svm_90_pca, param_list_svm_90_pca = model_train_predict_pca(svc, 125, params=params)
print("\n================================================================\n")
xgbc = xgb.XGBClassifier()
params = {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_xgb_90_pca, f1_score_list_xgb_90_pca, auc_list_xgb_90_pca, param_list_xgb_90_pca = model_train_predict_pca(xgbc, 125, params=params)
print("\n================================================================\n")
lgbc = lgb.LGBMClassifier()
params = {'learning_rate': [0.1, 0.05, 0.01], 'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10]}
accuracy_list_lgb_90_pca, f1_score_list_lgb_90_pca, auc_list_lgb_90_pca, param_list_lgb_90_pca = model_train_predict_pca(lgbc, 125, params=params)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy 0.7243896821741133
Average F1 Score 0.690894872590538
Average AUC 0.7243896821741134
Max Accuracy 0.7286964532473514
Max F1 Score 0.6994897959183672
Max AUC 0.7286964532473515
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Parameters {'C': 1, 'max_iter': 1000}


Average Accuracy 0.7198986642100416
Average F1 Score 0.6859037866699136
Average AUC 0.7198986642100415
Max Accuracy 0.7268539843390143
Max F1 Score 0.6928004099410711
Max AUC 0.7268539843390143
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 3
Best Parameters {'max_depth': 10, 'n_estimators': 50}




Average Accuracy 0.7172270842929526
Average F1 Score 0.6925283746892279
Average AUC 0.7172270842929526
Max Accuracy 0.7236296637494243
Max F1 Score 0.702463054187192
Max AUC 0.7236296637494242
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1

# Saving results

In [None]:
length_text_list = [length_text] * 240
models = ['lr', 'rfc', 'xgbc', 'lgbm']
models = [value for value in models for _ in range(10)] * 6
percentiles = ['10', '20', '30', '50', '75', '90']
percentiles = [value for value in percentiles for _ in range(40)]
filename_sample_list_final = filename_sample_list * 24

print(len(models))
print(len(length_text_list))
print(len(percentiles))
print(len(filename_sample_list_final))

240
240
240
240


In [None]:
overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi  + accuracy_list_xgb_10_mi + accuracy_list_lgb_10_mi +
                            accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi  + accuracy_list_xgb_20_mi + accuracy_list_lgb_20_mi +
                            accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi  + accuracy_list_xgb_30_mi + accuracy_list_lgb_30_mi +
                            accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi  + accuracy_list_xgb_50_mi + accuracy_list_lgb_50_mi +
                            accuracy_list_lr_75_mi + accuracy_list_rfc_75_mi  + accuracy_list_xgb_75_mi + accuracy_list_lgb_75_mi +
                            accuracy_list_lr_90_mi + accuracy_list_rfc_90_mi  + accuracy_list_xgb_90_mi + accuracy_list_lgb_90_mi)

overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_xgb_10_mi + f1_score_list_lgb_10_mi +
                            f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_xgb_20_mi + f1_score_list_lgb_20_mi +
                            f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_xgb_30_mi + f1_score_list_lgb_30_mi +
                            f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_xgb_50_mi + f1_score_list_lgb_50_mi +
                            f1_score_list_lr_75_mi + f1_score_list_rfc_75_mi + f1_score_list_xgb_75_mi + f1_score_list_lgb_75_mi +
                            f1_score_list_lr_90_mi + f1_score_list_rfc_90_mi + f1_score_list_xgb_90_mi + f1_score_list_lgb_90_mi)

overall_auc_list_mi =  (auc_list_lr_10_mi + auc_list_rfc_10_mi + auc_list_xgb_10_mi + auc_list_lgb_10_mi +
                        auc_list_lr_20_mi + auc_list_rfc_20_mi + auc_list_xgb_20_mi + auc_list_lgb_20_mi +
                        auc_list_lr_30_mi + auc_list_rfc_30_mi + auc_list_xgb_30_mi + auc_list_lgb_30_mi +
                        auc_list_lr_50_mi + auc_list_rfc_50_mi + auc_list_xgb_50_mi + auc_list_lgb_50_mi +
                        auc_list_lr_75_mi + auc_list_rfc_75_mi + auc_list_xgb_75_mi + auc_list_lgb_75_mi +
                        auc_list_lr_90_mi + auc_list_rfc_90_mi + auc_list_xgb_90_mi + auc_list_lgb_90_mi)

overall_param_list_mi = (param_list_lr_10_mi + param_list_rfc_10_mi + param_list_xgb_10_mi + param_list_lgb_10_mi +
                            param_list_lr_20_mi + param_list_rfc_20_mi + param_list_xgb_20_mi + param_list_lgb_20_mi +
                            param_list_lr_30_mi + param_list_rfc_30_mi + param_list_xgb_30_mi + param_list_lgb_30_mi +
                            param_list_lr_50_mi + param_list_rfc_50_mi + param_list_xgb_50_mi + param_list_lgb_50_mi +
                            param_list_lr_75_mi + param_list_rfc_75_mi + param_list_xgb_75_mi + param_list_lgb_75_mi +
                            param_list_lr_90_mi + param_list_rfc_90_mi + param_list_xgb_90_mi + param_list_lgb_90_mi)

In [None]:
overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_xgb_10_mrmr + accuracy_list_lgb_10_mrmr +
                            accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_xgb_20_mrmr + accuracy_list_lgb_20_mrmr +
                            accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_xgb_30_mrmr + accuracy_list_lgb_30_mrmr +
                            accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_xgb_50_mrmr + accuracy_list_lgb_50_mrmr +
                            accuracy_list_lr_75_mrmr + accuracy_list_rfc_75_mrmr + accuracy_list_xgb_75_mrmr + accuracy_list_lgb_75_mrmr +
                            accuracy_list_lr_90_mrmr + accuracy_list_rfc_90_mrmr + accuracy_list_xgb_90_mrmr + accuracy_list_lgb_90_mrmr)

overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr  + f1_score_list_xgb_10_mrmr + f1_score_list_lgb_10_mrmr +
                            f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr  + f1_score_list_xgb_20_mrmr + f1_score_list_lgb_20_mrmr +
                            f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr  + f1_score_list_xgb_30_mrmr + f1_score_list_lgb_30_mrmr +
                            f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr  + f1_score_list_xgb_50_mrmr + f1_score_list_lgb_50_mrmr +
                            f1_score_list_lr_75_mrmr + f1_score_list_rfc_75_mrmr  + f1_score_list_xgb_75_mrmr + f1_score_list_lgb_75_mrmr +
                            f1_score_list_lr_90_mrmr + f1_score_list_rfc_90_mrmr  + f1_score_list_xgb_90_mrmr + f1_score_list_lgb_90_mrmr)

overall_auc_list_mrmr =  (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_xgb_10_mrmr + auc_list_lgb_10_mrmr +
                        auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_xgb_20_mrmr + auc_list_lgb_20_mrmr +
                        auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_xgb_30_mrmr + auc_list_lgb_30_mrmr +
                        auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_xgb_50_mrmr + auc_list_lgb_50_mrmr +
                        auc_list_lr_75_mrmr + auc_list_rfc_75_mrmr + auc_list_xgb_75_mrmr + auc_list_lgb_75_mrmr +
                        auc_list_lr_90_mrmr + auc_list_rfc_90_mrmr + auc_list_xgb_90_mrmr + auc_list_lgb_90_mrmr)

overall_param_list_mrmr = (param_list_lr_10_mrmr + param_list_rfc_10_mrmr + param_list_xgb_10_mrmr + param_list_lgb_10_mrmr +
                            param_list_lr_20_mrmr + param_list_rfc_20_mrmr + param_list_xgb_20_mrmr + param_list_lgb_20_mrmr +
                            param_list_lr_30_mrmr + param_list_rfc_30_mrmr + param_list_xgb_30_mrmr + param_list_lgb_30_mrmr +
                            param_list_lr_50_mrmr + param_list_rfc_50_mrmr + param_list_xgb_50_mrmr + param_list_lgb_50_mrmr +
                            param_list_lr_75_mrmr + param_list_rfc_75_mrmr + param_list_xgb_75_mrmr + param_list_lgb_75_mrmr +
                            param_list_lr_90_mrmr + param_list_rfc_90_mrmr + param_list_xgb_90_mrmr + param_list_lgb_90_mrmr)

In [None]:
overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr + accuracy_list_lgb_10_mi_mrmr +
                            accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr + accuracy_list_lgb_20_mi_mrmr +
                            accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr + accuracy_list_lgb_30_mi_mrmr +
                            accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr + accuracy_list_lgb_50_mi_mrmr +
                            accuracy_list_lr_75_mi_mrmr + accuracy_list_rfc_75_mi_mrmr + accuracy_list_xgb_75_mi_mrmr + accuracy_list_lgb_75_mi_mrmr +
                            accuracy_list_lr_90_mi_mrmr + accuracy_list_rfc_90_mi_mrmr + accuracy_list_xgb_90_mi_mrmr + accuracy_list_lgb_90_mi_mrmr)

overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr + f1_score_list_lgb_10_mi_mrmr +
                            f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr + f1_score_list_lgb_20_mi_mrmr +
                            f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr + f1_score_list_lgb_30_mi_mrmr +
                            f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr + f1_score_list_lgb_50_mi_mrmr +
                            f1_score_list_lr_75_mi_mrmr + f1_score_list_rfc_75_mi_mrmr + f1_score_list_xgb_75_mi_mrmr + f1_score_list_lgb_75_mi_mrmr +
                            f1_score_list_lr_90_mi_mrmr + f1_score_list_rfc_90_mi_mrmr + f1_score_list_xgb_90_mi_mrmr + f1_score_list_lgb_90_mi_mrmr)

overall_auc_list_mi_mrmr =  (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_xgb_10_mi_mrmr + auc_list_lgb_10_mi_mrmr +
                        auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_xgb_20_mi_mrmr + auc_list_lgb_20_mi_mrmr +
                        auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_xgb_30_mi_mrmr + auc_list_lgb_30_mi_mrmr +
                        auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_xgb_50_mi_mrmr + auc_list_lgb_50_mi_mrmr +
                        auc_list_lr_75_mi_mrmr + auc_list_rfc_75_mi_mrmr + auc_list_xgb_75_mi_mrmr + auc_list_lgb_75_mi_mrmr +
                        auc_list_lr_90_mi_mrmr + auc_list_rfc_90_mi_mrmr + auc_list_xgb_90_mi_mrmr + auc_list_lgb_90_mi_mrmr)

overall_param_list_mi_mrmr = (param_list_lr_10_mi_mrmr + param_list_rfc_10_mi_mrmr + param_list_xgb_10_mi_mrmr + param_list_lgb_10_mi_mrmr +
                            param_list_lr_20_mi_mrmr + param_list_rfc_20_mi_mrmr + param_list_xgb_20_mi_mrmr + param_list_lgb_20_mi_mrmr +
                            param_list_lr_30_mi_mrmr + param_list_rfc_30_mi_mrmr + param_list_xgb_30_mi_mrmr + param_list_lgb_30_mi_mrmr +
                            param_list_lr_50_mi_mrmr + param_list_rfc_50_mi_mrmr + param_list_xgb_50_mi_mrmr + param_list_lgb_50_mi_mrmr +
                            param_list_lr_75_mi_mrmr + param_list_rfc_75_mi_mrmr + param_list_xgb_75_mi_mrmr + param_list_lgb_75_mi_mrmr +
                            param_list_lr_90_mi_mrmr + param_list_rfc_90_mi_mrmr + param_list_xgb_90_mi_mrmr + param_list_lgb_90_mi_mrmr)

In [None]:
overall_accuracy_list_pca = (accuracy_list_lr_10_pca + accuracy_list_rfc_10_pca + accuracy_list_xgb_10_pca + accuracy_list_lgb_10_pca +
                            accuracy_list_lr_20_pca + accuracy_list_rfc_20_pca + accuracy_list_xgb_20_pca + accuracy_list_lgb_20_pca +
                            accuracy_list_lr_30_pca + accuracy_list_rfc_30_pca + accuracy_list_xgb_30_pca + accuracy_list_lgb_30_pca +
                            accuracy_list_lr_50_pca + accuracy_list_rfc_50_pca + accuracy_list_xgb_50_pca + accuracy_list_lgb_50_pca +
                            accuracy_list_lr_75_pca + accuracy_list_rfc_75_pca + accuracy_list_xgb_75_pca + accuracy_list_lgb_75_pca +
                            accuracy_list_lr_90_pca + accuracy_list_rfc_90_pca + accuracy_list_xgb_90_pca + accuracy_list_lgb_90_pca)

overall_f1_score_list_pca = (f1_score_list_lr_10_pca + f1_score_list_rfc_10_pca + f1_score_list_xgb_10_pca + f1_score_list_lgb_10_pca +
                            f1_score_list_lr_20_pca + f1_score_list_rfc_20_pca + f1_score_list_xgb_20_pca + f1_score_list_lgb_20_pca +
                            f1_score_list_lr_30_pca + f1_score_list_rfc_30_pca + f1_score_list_xgb_30_pca + f1_score_list_lgb_30_pca +
                            f1_score_list_lr_50_pca + f1_score_list_rfc_50_pca + f1_score_list_xgb_50_pca + f1_score_list_lgb_50_pca +
                            f1_score_list_lr_75_pca + f1_score_list_rfc_75_pca + f1_score_list_xgb_75_pca + f1_score_list_lgb_75_pca +
                            f1_score_list_lr_90_pca + f1_score_list_rfc_90_pca + f1_score_list_xgb_90_pca + f1_score_list_lgb_90_pca)

overall_auc_list_pca =  (auc_list_lr_10_pca + auc_list_rfc_10_pca + auc_list_xgb_10_pca + auc_list_lgb_10_pca +
                        auc_list_lr_20_pca + auc_list_rfc_20_pca + auc_list_xgb_20_pca + auc_list_lgb_20_pca +
                        auc_list_lr_30_pca + auc_list_rfc_30_pca + auc_list_xgb_30_pca + auc_list_lgb_30_pca +
                        auc_list_lr_50_pca + auc_list_rfc_50_pca + auc_list_xgb_50_pca + auc_list_lgb_50_pca +
                        auc_list_lr_75_pca + auc_list_rfc_75_pca + auc_list_xgb_75_pca + auc_list_lgb_75_pca +
                        auc_list_lr_90_pca + auc_list_rfc_90_pca + auc_list_xgb_90_pca + auc_list_lgb_90_pca)

overall_param_list_pca = (param_list_lr_10_pca + param_list_rfc_10_pca + param_list_xgb_10_pca + param_list_lgb_10_pca +
                            param_list_lr_20_pca + param_list_rfc_20_pca + param_list_xgb_20_pca + param_list_lgb_20_pca +
                            param_list_lr_30_pca + param_list_rfc_30_pca + param_list_xgb_30_pca + param_list_lgb_30_pca +
                            param_list_lr_50_pca + param_list_rfc_50_pca + param_list_xgb_50_pca + param_list_lgb_50_pca +
                            param_list_lr_75_pca + param_list_rfc_75_pca + param_list_xgb_75_pca + param_list_lgb_75_pca +
                            param_list_lr_90_pca + param_list_rfc_90_pca + param_list_xgb_90_pca + param_list_lgb_90_pca)

In [None]:
print(len(overall_accuracy_list_mi))
print(len(overall_f1_score_list_mi))
print(len(overall_auc_list_mi))
print(len(overall_param_list_mi))

print(len(overall_accuracy_list_mrmr))
print(len(overall_f1_score_list_mrmr))
print(len(overall_auc_list_mrmr))
print(len(overall_param_list_mrmr))

print(len(overall_accuracy_list_mi_mrmr))
print(len(overall_f1_score_list_mi_mrmr))
print(len(overall_auc_list_mi_mrmr))
print(len(overall_param_list_mi_mrmr))

print(len(overall_accuracy_list_pca))
print(len(overall_f1_score_list_pca))
print(len(overall_auc_list_pca))
print(len(overall_param_list_pca))

240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240


In [None]:
results_dictionary = {
    'length_text': length_text_list,
    'samples': filename_sample_list_final,
    'models': models,
    'percentiles': percentiles,
    'mi_accuracy': overall_accuracy_list_mi,
    'mi_f1_score': overall_f1_score_list_mi,
    'mi_auc': overall_auc_list_mi,
    'mrmr_accuracy': overall_accuracy_list_mrmr,
    'mrmr_f1_score': overall_f1_score_list_mrmr,
    'mrmr_auc': overall_auc_list_mrmr,
    'mi_mrmr_accuracy': overall_accuracy_list_mi_mrmr,
    'mi_mrmr_f1_score': overall_f1_score_list_mi_mrmr,
    'mi_mrmr_auc': overall_auc_list_mi_mrmr,
    'pca_accuracy': overall_accuracy_list_pca,
    'pca_f1_score': overall_f1_score_list_pca,
    'pca_auc': overall_auc_list_pca,
    'mi_params': overall_param_list_mi,
    'mrmr_params': overall_param_list_mrmr,
    'mi_mrmr_params': overall_param_list_mi_mrmr,
    'pca_params': overall_param_list_pca
}
results_df = pd.DataFrame(results_dictionary)

results_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/short_trajectory/results/overall_results_{}_20.csv'.format(length_text), index=False)