In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *

import xgboost as xgb

import tensorflow as tf

import os
import re
import ast

In [46]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/'

def get_sample_df(directory=directory_dataframes):
    list_dataframes = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for i in range(len(feat_list)):
                #adding ; to be used a separator for list
                if i<len(feat_list):
                    new_val = feat_list[i].replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                    feat_list[i] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes = get_sample_df(directory_dataframes)

In [47]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        feat[len(feat)-1] = feat[len(feat)-1].replace('y;', 'y')
        x = sample[feat]
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        # print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    
     
    
    return accuracy_list, f1_score_list, auc_list  


# Mutual Information

## 10 Percentile

In [48]:
lr = LogisticRegression()
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10*')

Average Accuracy 0.8086538461538462
Average F1 Score 0.8034728121749242
Average AUC 0.8086538461538462
Max Accuracy 0.8269230769230769
Max F1 Score 0.8181818181818181
Max AUC 0.8269230769230769
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']


In [49]:
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10')

Average Accuracy 0.8634615384615383
Average F1 Score 0.8680182852509262
Average AUC 0.8634615384615383
Max Accuracy 0.8942307692307693
Max F1 Score 0.8942307692307693
Max AUC 0.8942307692307693
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


In [50]:
svc = SVC()
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10')

Average Accuracy 0.8197115384615385
Average F1 Score 0.8240373202310808
Average AUC 0.8197115384615385
Max Accuracy 0.8413461538461539
Max F1 Score 0.8411214953271028
Max AUC 0.8413461538461539
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(4, 1, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


In [51]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10')

Average Accuracy 0.8528846153846154
Average F1 Score 0.8551270069315375
Average AUC 0.8528846153846155
Max Accuracy 0.8894230769230769
Max F1 Score 0.8878048780487806
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


## 20 Percentile

In [52]:
lr = LogisticRegression()
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20')

Average Accuracy 0.8192307692307692
Average F1 Score 0.815936348894392
Average AUC 0.8192307692307693
Max Accuracy 0.8365384615384616
Max F1 Score 0.8316831683168318
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', '(3, 3, 1)', 'E5', 'A5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(1, 2, 1)', '(2, 1, 2)', '(

In [53]:
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20')

Average Accuracy 0.8692307692307694
Average F1 Score 0.8737568302170541
Average AUC 0.8692307692307691
Max Accuracy 0.9086538461538461
Max F1 Score 0.9107981220657276
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)', '(4, 1, 1)', 'C4', 'E5', 'B5', 'G5', 'A5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2

In [54]:
svc = SVC()
accuracy_list_svm_20_mi, f1_score_list_svm_20_mi, auc_list_svm_20_mi = model_train_predict(svc, 'mi_feat_list_20')

Average Accuracy 0.8211538461538461
Average F1 Score 0.8252295644895268
Average AUC 0.8211538461538461
Max Accuracy 0.8509615384615384
Max F1 Score 0.8502415458937198
Max AUC 0.8509615384615385
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(4, 4)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)', '(4, 1, 1)', 'E4', 'C4', 'B5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(4, 4)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)',

In [55]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mi, f1_score_list_xgb_20_mi, auc_list_xgb_20_mi = model_train_predict(xgbc, 'mi_feat_list_20')

Average Accuracy 0.860096153846154
Average F1 Score 0.8615263218649828
Average AUC 0.860096153846154
Max Accuracy 0.8990384615384616
Max F1 Score 0.8985507246376813
Max AUC 0.8990384615384615
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)', '(4, 1, 1)', 'C4', 'E5', 'B5', 'G5', 'A5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 

## 30 Percentile

In [56]:
lr = LogisticRegression()
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30')

Average Accuracy 0.8134615384615385
Average F1 Score 0.8105844002059858
Average AUC 0.8134615384615385
Max Accuracy 0.8317307692307693
Max F1 Score 0.8258706467661692
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(6, 3)', '(1, 4)', '(4, 1)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', '(1, 3, 3)', '(3, 2, 1)', '(4, 1, 1)', '(3, 1, 4)', 'A4', 'E4', 'C4', 'F4', 'E5', 'B5', 'A5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_e

In [57]:
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30')

Average Accuracy 0.8663461538461537
Average F1 Score 0.8708026810450251
Average AUC 0.8663461538461537
Max Accuracy 0.9038461538461539
Max F1 Score 0.9056603773584906
Max AUC 0.9038461538461539
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 1, 4)', 'C4', 'F4', 'E5', 'B5', 'N5', 'Q5', 'V5', 'G5', 'A5', 'P5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', 

In [58]:
svc = SVC()
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30')

Average Accuracy 0.81875
Average F1 Score 0.8221220116526453
Average AUC 0.81875
Max Accuracy 0.8317307692307693
Max F1 Score 0.835680751173709
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 4)', '(3, 3, 3)', '(4, 1, 1)', '(2, 3, 2)', '(3, 1, 4)', '(4, 4, 2)', 'A4', 'D4', 'C4', 'E5', 'B5', 'N5', 'A5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '

In [59]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mi, f1_score_list_xgb_30_mi, auc_list_xgb_30_mi = model_train_predict(xgbc, 'mi_feat_list_30')

Average Accuracy 0.8605769230769231
Average F1 Score 0.8618692224323132
Average AUC 0.8605769230769231
Max Accuracy 0.8990384615384616
Max F1 Score 0.8995215311004785
Max AUC 0.8990384615384616
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 1, 4)', 'C4', 'F4', 'E5', 'B5', 'N5', 'Q5', 'V5', 'G5', 'A5', 'P5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', 

## 50 Percentile

In [60]:
lr = LogisticRegression()
accuracy_list_lr_50_mi, f1_score_list_lr_50_mi, auc_list_lr_50_mi = model_train_predict(lr, 'mi_feat_list_50')

Average Accuracy 0.8139423076923077
Average F1 Score 0.8109564246820504
Average AUC 0.8139423076923077
Max Accuracy 0.8317307692307693
Max F1 Score 0.8275862068965517
Max AUC 0.8317307692307694
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(2, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 2, 3)', '(1, 6, 2)', '(3, 1, 4)', '(2, 3, 3)', '(2, 1, 6)', '(1, 4, 4)', '(6, 2, 3)', '(

In [61]:
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mi, f1_score_list_rfc_50_mi, auc_list_rfc_50_mi = model_train_predict(rfc, 'mi_feat_list_50')

Average Accuracy 0.8735576923076923
Average F1 Score 0.8781486173160058
Average AUC 0.8735576923076923
Max Accuracy 0.9086538461538461
Max F1 Score 0.9107981220657276
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(2, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 2, 3)', '(1, 6, 2)', '(3, 1, 4)', '(2, 3, 3)', '(2, 1, 6)', '(1, 4, 4)', '(6, 2, 3)', '(

In [62]:
svc = SVC()
accuracy_list_svm_50_mi, f1_score_list_svm_50_mi, auc_list_svm_50_mi = model_train_predict(svc, 'mi_feat_list_50')

Average Accuracy 0.8182692307692309
Average F1 Score 0.8212250583763681
Average AUC 0.8182692307692309
Max Accuracy 0.8365384615384616
Max F1 Score 0.8349514563106797
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(4, 4)', '(6, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(2, 6, 3)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', '(2, 1, 3)', '(1, 3, 3)', '(3, 3, 3)', '(3, 2, 1)', '(4, 1, 1)', '(3, 3, 1)', '(3, 2, 3)', '(3, 1, 4)', '(2, 3, 3)', '(1, 1, 3)', '(4, 4, 1)', '(6, 2, 3)', '(1, 4, 2)', '(4, 

In [63]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50')

Average Accuracy 0.8615384615384617
Average F1 Score 0.862320791887624
Average AUC 0.8615384615384617
Max Accuracy 0.9134615384615384
Max F1 Score 0.9126213592233009
Max AUC 0.9134615384615385
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(2, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 2, 3)', '(1, 6, 2)', '(3, 1, 4)', '(2, 3, 3)', '(2, 1, 6)', '(1, 4, 4)', '(6, 2, 3)', '(2

# mRMR

## 10 Percentile

In [64]:
lr = LogisticRegression()
accuracy_list_lr_10_mrmr, f1_score_list_lr_10_mrmr, auc_list_lr_10_mrmr = model_train_predict(lr, 'mrmr_feat_list_10*')

Average Accuracy 0.8125
Average F1 Score 0.8103343383923104
Average AUC 0.8125
Max Accuracy 0.8365384615384616
Max F1 Score 0.8316831683168318
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)']
Best Features based on Max F1 Score ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)']
Best Features based on Max AUC ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)']


In [65]:
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mrmr, f1_score_list_rfc_10_mrmr, auc_list_rfc_10_mrmr = model_train_predict(rfc, 'mrmr_feat_list_10')

Average Accuracy 0.8567307692307693
Average F1 Score 0.8598598362488181
Average AUC 0.8567307692307693
Max Accuracy 0.8846153846153846
Max F1 Score 0.8823529411764707
Max AUC 0.8846153846153846
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']
Best Features based on Max F1 Score ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']
Best Features based on Max AUC ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']


In [66]:
svc = SVC()
accuracy_list_svm_10_mrmr, f1_score_list_svm_10_mrmr, auc_list_svm_10_mrmr = model_train_predict(svc, 'mrmr_feat_list_10')

Average Accuracy 0.814423076923077
Average F1 Score 0.8161212209205118
Average AUC 0.814423076923077
Max Accuracy 0.8461538461538461
Max F1 Score 0.8446601941747574
Max AUC 0.8461538461538463
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(1, 2, 2)', '(1, 1, 2)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 2)', '(3, 1, 1)', '(4, 1, 1)', '(1, 2, 1)', '(3,)', '(2, 3)', '(2,)', 'trigram_entropy']
Best Features based on Max F1 Score ['(2, 1)', 'unigram_entropy', '(1, 2, 2)', '(1, 1, 2)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 2)', '(3, 1, 1)', '(4, 1, 1)', '(1, 2, 1)', '(3,)', '(2, 3)', '(2,)', 'trigram_entropy']
Best Features based on Max AUC ['(2, 1)', 'unigram_entropy', '(1, 2, 2)', '(1, 1, 2)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 2)', '(3, 1, 1)', '(4, 1, 1)', '(1, 2, 1)', '(3,)', '(2, 3)', '(2,)', 'trigram_entropy']


In [67]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mrmr, f1_score_list_xgb_10_mrmr, auc_list_xgb_10_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_10')

Average Accuracy 0.8490384615384613
Average F1 Score 0.8508873020658629
Average AUC 0.8490384615384616
Max Accuracy 0.8990384615384616
Max F1 Score 0.8985507246376813
Max AUC 0.8990384615384615
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']
Best Features based on Max F1 Score ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']
Best Features based on Max AUC ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)']


## 20 Percentile

In [68]:
lr = LogisticRegression()
accuracy_list_lr_20_mrmr, f1_score_list_lr_20_mrmr, auc_list_lr_20_mrmr = model_train_predict(lr, 'mrmr_feat_list_20*')

Average Accuracy 0.820673076923077
Average F1 Score 0.8178363568507718
Average AUC 0.820673076923077
Max Accuracy 0.8317307692307693
Max F1 Score 0.8275862068965517
Max AUC 0.8317307692307694
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(3, 1)', '(1, 4, 1)', '(3, 3)', '(1, 2, 1)', 'pattern_hvg_5_node_entropy', '(2, 1)', 'bigram_entropy', '(3, 1, 1)', '(1, 1, 2)', '(2,)', '(2, 3)', '(1, 2)', 'unigram_entropy', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'trigram_entropy', '(2, 1, 2)', '(2, 1, 1)', '(4, 1)', '(1, 2, 3)', 'pattern_hvg_4_nodes_entropy', '(4,)', '(6, 3)', '(1, 4)', '(3, 3, 1)', '(2, 1, 4)', '(2, 6, 3)']
Best Features based on Max F1 Score ['(3, 1)', '(1, 4, 1)', '(3, 3)', '(1, 2, 1)', 'pattern_hvg_5_node_entropy', '(2, 1)', 'bigram_entropy', '(3, 1, 1)', '(1, 1, 2)', '(2,)', '(2, 3)', '(1, 2)', 'unigram_entropy', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'trigram_entropy', '(2, 1

In [69]:
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mrmr, f1_score_list_rfc_20_mrmr, auc_list_rfc_20_mrmr = model_train_predict(rfc, 'mrmr_feat_list_20')

Average Accuracy 0.8596153846153847
Average F1 Score 0.8636702785162379
Average AUC 0.8596153846153847
Max Accuracy 0.9038461538461539
Max F1 Score 0.9056603773584906
Max AUC 0.9038461538461539
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(2, 1, 1)', '(1, 4)', '(6, 3)', '(4,)', '(3, 3, 1)', '(1, 3)', '(2, 6, 3)']
Best Features based on Max F1 Score ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigra

In [70]:
svc = SVC()
accuracy_list_svm_20_mrmr, f1_score_list_svm_20_mrmr, auc_list_svm_20_mrmr = model_train_predict(svc, 'mrmr_feat_list_20')

Average Accuracy 0.8177884615384615
Average F1 Score 0.8204059840295683
Average AUC 0.8177884615384615
Max Accuracy 0.8413461538461539
Max F1 Score 0.8436018957345972
Max AUC 0.8413461538461537
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)', '(1, 1, 2)', '(2, 1, 2)', 'unigram_entropy', '(4, 1)', '(2, 1, 1)', '(1, 2, 3)', 'bigram_entropy', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(3, 3)', '(1, 4, 1)', '(1, 3)', '(6, 3, 1)', '(3, 1, 4)']
Best Features based on Max F1 Score ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)', '(1, 1, 2)', '(2, 1, 2)', 'unigram_entropy', '(4, 1)', '

In [71]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mrmr, f1_score_list_xgb_20_mrmr, auc_list_xgb_20_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_20')

Average Accuracy 0.8567307692307692
Average F1 Score 0.8577591926777736
Average AUC 0.8567307692307692
Max Accuracy 0.8846153846153846
Max F1 Score 0.8846153846153846
Max AUC 0.8846153846153845
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(2, 1, 1)', '(1, 4)', '(6, 3)', '(4,)', '(3, 3, 1)', '(1, 3)', '(2, 6, 3)']
Best Features based on Max F1 Score ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigra

## 30 Percentile

In [72]:
lr = LogisticRegression()
accuracy_list_lr_30_mrmr, f1_score_list_lr_30_mrmr, auc_list_lr_30_mrmr = model_train_predict(lr, 'mrmr_feat_list_30*')

Average Accuracy 0.8197115384615385
Average F1 Score 0.8172184202769394
Average AUC 0.8197115384615385
Max Accuracy 0.8317307692307693
Max F1 Score 0.8275862068965517
Max AUC 0.8317307692307694
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(1, 2, 1)', 'unigram_entropy', '(2,)', '(2, 3)', '(1, 1, 2)', 'bigram_entropy', '(1, 2)', '(2, 3, 1)', '(4, 1)', '(3,)', 'trigram_entropy', '(2, 1, 2)', '(4, 1, 1)', 'pattern_hvg_4_nodes_entropy', '(1, 2, 3)', '(1, 4)', '(6, 3)', '(2, 1, 1)', '(1, 4, 1)', '(1, 3)', '(4,)', '(2, 6, 3)', '(3, 1, 2)', '(1, 3, 3)', '(6, 3, 1)', '(3, 3, 1)', '(4, 1, 2)', '(3, 2)', '(1, 1, 3)', '(1, 2, 2)', '(3, 1, 4)', '(4, 1, 4)', '(2, 2, 3)', 'C4', '(2, 3, 2)']
Best Features based on Max F1 Score ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_

In [73]:
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mrmr, f1_score_list_rfc_30_mrmr, auc_list_rfc_30_mrmr = model_train_predict(rfc, 'mrmr_feat_list_30')

Average Accuracy 0.8663461538461539
Average F1 Score 0.8704999868537907
Average AUC 0.8663461538461539
Max Accuracy 0.9134615384615384
Max F1 Score 0.9150943396226415
Max AUC 0.9134615384615385
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(2, 1, 1)', '(1, 4)', '(6, 3)', '(4,)', '(3, 3, 1)', '(1, 3)', '(2, 6, 3)', '(1, 1, 4)', '(2, 2, 3)', '(2, 1, 4)', '(3, 1, 4)', '(1, 3, 3)', '(6, 3, 1)', '(3, 2)', '(6, 2, 3)', '(3, 1, 2)', '(3, 3)', '(4, 1, 2)', '(1, 1, 3)', '(2, 2, 2)', '(2, 3, 2)']
Best Features based on Max F1 Score ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg

In [74]:
svc = SVC()
accuracy_list_svm_30_mrmr, f1_score_list_svm_30_mrmr, auc_list_svm_30_mrmr = model_train_predict(svc, 'mrmr_feat_list_30')

Average Accuracy 0.8177884615384615
Average F1 Score 0.8201286084130646
Average AUC 0.8177884615384615
Max Accuracy 0.8413461538461539
Max F1 Score 0.8421052631578948
Max AUC 0.841346153846154
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1)', '(2,)', '(3, 1, 1)', '(4, 1, 2)', '(3,)', '(1, 2)', '(2, 3, 1)', '(1, 2, 1)', 'trigram_entropy', '(4,)', '(2, 3)', '(1, 1, 2)', '(2, 1, 2)', 'unigram_entropy', '(4, 1)', '(2, 1, 1)', '(1, 2, 3)', 'bigram_entropy', '(1, 4)', 'pattern_hvg_4_nodes_entropy', '(3, 3)', '(1, 4, 1)', '(1, 3)', '(6, 3, 1)', '(3, 1, 4)', '(1, 2, 2)', '(1, 1, 4)', '(3, 3, 3)', '(3, 3, 2)', '(6, 3)', '(2, 1, 4)', '(1, 1, 3)', '(4, 4, 1)', '(3, 3, 1)', '(2, 2)', '(6, 2, 3)', '(2, 6, 3)', '(1, 3, 3)', '(1, 4, 4)']
Best Features based on Max F1 Score ['(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1)', '(4, 1, 1

In [75]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mrmr, f1_score_list_xgb_30_mrmr, auc_list_xgb_30_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_30')

Average Accuracy 0.8591346153846153
Average F1 Score 0.8596171802205553
Average AUC 0.8591346153846153
Max Accuracy 0.8894230769230769
Max F1 Score 0.8909952606635071
Max AUC 0.8894230769230769
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(1, 2, 1)', 'trigram_entropy', '(2, 3)', '(1, 2)', '(3,)', '(4, 1, 1)', 'unigram_entropy', '(2,)', '(2, 3, 1)', '(1, 1, 2)', '(2, 1, 2)', 'bigram_entropy', '(1, 2, 3)', '(4,)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(1, 4)', '(2, 1, 1)', '(1, 3)', '(6, 3)', '(1, 4, 1)', '(3, 3, 1)', '(3, 1, 2)', '(2, 1, 4)', '(2, 6, 3)', '(3, 2)', '(3, 3, 3)', '(3, 1, 4)', '(6, 3, 1)', '(1, 1, 3)', '(1, 2, 2)', '(1, 3, 3)', '(2, 3, 2)', '(2, 3, 3)', '(4, 1, 2)']
Best Features based on Max F1 Score ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'p

## 50 Percentile

In [76]:
lr = LogisticRegression()
accuracy_list_lr_50_mrmr, f1_score_list_lr_50_mrmr, auc_list_lr_50_mrmr = model_train_predict(lr, 'mrmr_feat_list_50*')

Average Accuracy 0.816826923076923
Average F1 Score 0.8142847054835001
Average AUC 0.816826923076923
Max Accuracy 0.8365384615384616
Max F1 Score 0.8300000000000001
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['(2, 1)', 'unigram_entropy', '(1, 2, 2)', '(1, 1, 2)', '(3, 1)', 'pattern_hvg_5_node_entropy', '(1, 2)', '(3, 1, 1)', '(4, 1, 1)', '(1, 2, 1)', '(3,)', '(2, 3)', '(2,)', 'trigram_entropy', '(2, 1, 2)', 'bigram_entropy', '(2, 3, 1)', 'pattern_hvg_4_nodes_entropy', '(4, 1)', '(2, 1, 1)', '(1, 4)', '(1, 2, 3)', '(4,)', '(1, 3)', '(6, 3)', '(2, 1, 4)', '(1, 4, 1)', '(3, 3)', '(3, 2)', '(6, 3, 1)', '(3, 3, 1)', '(3, 1, 4)', '(6, 2, 3)', '(1, 1, 4)', '(2, 6, 3)', '(3, 3, 3)', '(3, 1, 2)', '(3, 2, 1)', '(2, 3, 2)', '(4, 1, 2)', '(2, 1, 3)', '(2, 2, 3)', '(1, 1, 3)', '(2, 2, 2)', '(1, 3, 3)', '(2, 2)', '(2, 3, 3)', 'P5', '(4, 1, 4)', '(3, 2, 3)', '(2

In [77]:
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mrmr, f1_score_list_rfc_50_mrmr, auc_list_rfc_50_mrmr = model_train_predict(rfc, 'mrmr_feat_list_50')

Average Accuracy 0.8677884615384617
Average F1 Score 0.8727882918872252
Average AUC 0.8677884615384617
Max Accuracy 0.9134615384615384
Max F1 Score 0.9142857142857143
Max AUC 0.9134615384615385
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(4, 4, 4)', '(1, 4, 1)', '(1, 2)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)', '(1, 1, 2)', '(4, 1, 1)', '(2, 3, 1)', '(3,)', 'bigram_entropy', '(2, 1, 2)', 'unigram_entropy', '(1, 2, 3)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(2, 1, 1)', '(1, 4)', '(6, 3)', '(4,)', '(3, 3, 1)', '(1, 3)', '(2, 6, 3)', '(1, 1, 4)', '(2, 2, 3)', '(2, 1, 4)', '(3, 1, 4)', '(1, 3, 3)', '(6, 3, 1)', '(3, 2)', '(6, 2, 3)', '(3, 1, 2)', '(3, 3)', '(4, 1, 2)', '(1, 1, 3)', '(2, 2, 2)', '(2, 3, 2)', '(2, 3, 3)', '(3, 3, 3)', '(2, 1, 3)', '(3, 2, 2)', '(1, 2, 2)', '(3, 3, 2)', '(3, 2, 1)', '(4,

In [78]:
svc = SVC()
accuracy_list_svm_50_mrmr, f1_score_list_svm_50_mrmr, auc_list_svm_50_mrmr = model_train_predict(svc, 'mrmr_feat_list_50')

Average Accuracy 0.8158653846153847
Average F1 Score 0.8188765174772387
Average AUC 0.8158653846153847
Max Accuracy 0.8317307692307693
Max F1 Score 0.8341232227488151
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(2, 1, 4)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(2,)', '(1, 2, 1)', 'unigram_entropy', '(1, 2)', '(2, 3)', '(2, 3, 1)', '(4, 1, 1)', '(1, 1, 2)', '(2, 1, 2)', 'bigram_entropy', '(3,)', 'trigram_entropy', '(1, 2, 3)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(1, 4)', '(1, 3)', '(4,)', '(6, 3)', '(2, 1, 1)', '(1, 4, 1)', '(2, 6, 3)', '(1, 3, 3)', '(3, 1, 4)', '(3, 1, 2)', '(4, 1, 2)', '(1, 1, 3)', '(6, 3, 1)', '(1, 2, 2)', '(3, 2)', '(3, 3, 1)', '(2, 2)', '(3, 3, 3)', '(1, 1)', '(2, 2, 3)', '(6, 2, 3)', '(2, 2, 2)', '(2, 1, 3)', '(2, 3, 2)', '(4, 1, 4)', '(3, 3, 2)', '(1, 1, 1)', '(2, 2, 1)

In [79]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mrmr, f1_score_list_xgb_50_mrmr, auc_list_xgb_50_mrmr = model_train_predict(xgbc, 'mrmr_feat_list_50')

Average Accuracy 0.8533653846153847
Average F1 Score 0.8538792566502618
Average AUC 0.8533653846153847
Max Accuracy 0.8894230769230769
Max F1 Score 0.8888888888888888
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['(3, 1)', '(1, 1, 4)', '(3, 3)', '(3, 2, 2)', '(2, 1)', 'pattern_hvg_5_node_entropy', '(3, 1, 1)', '(1, 2, 1)', 'trigram_entropy', '(2, 3)', '(1, 2)', '(3,)', '(4, 1, 1)', 'unigram_entropy', '(2,)', '(2, 3, 1)', '(1, 1, 2)', '(2, 1, 2)', 'bigram_entropy', '(1, 2, 3)', '(4,)', '(4, 1)', 'pattern_hvg_4_nodes_entropy', '(1, 4)', '(2, 1, 1)', '(1, 3)', '(6, 3)', '(1, 4, 1)', '(3, 3, 1)', '(3, 1, 2)', '(2, 1, 4)', '(2, 6, 3)', '(3, 2)', '(3, 3, 3)', '(3, 1, 4)', '(6, 3, 1)', '(1, 1, 3)', '(1, 2, 2)', '(1, 3, 3)', '(2, 3, 2)', '(2, 3, 3)', '(4, 1, 2)', '(4, 4, 1)', '(4, 1, 4)', '(3, 2, 3)', '(3, 3, 2)', '(2, 2)', '(3, 2, 1)', '(2, 1, 3)', '(1, 4, 

# MI and mRMR

## 10 Percentile

In [80]:
lr = LogisticRegression()
accuracy_list_lr_10_mi_mrmr, f1_score_list_lr_10_mi_mrmr, auc_list_lr_10_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_10*')

Average Accuracy 0.7995192307692308
Average F1 Score 0.7954003026136258
Average AUC 0.7995192307692307
Max Accuracy 0.8269230769230769
Max F1 Score 0.8181818181818181
Max AUC 0.8269230769230769
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 3)', 'trigram_entropy', '(3,)', '(2, 1)', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 3)', 'trigram_entropy', '(3,)', '(2, 1)', '(2,)', '(1, 2, 1)']
Best Features based on Max AUC ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 3)', 'trigram_entropy', '(3,)', '(2, 1)', '(2,)', '(1, 2, 1)']


In [81]:
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mi_mrmr, f1_score_list_rfc_10_mi_mrmr, auc_list_rfc_10_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_10')

Average Accuracy 0.8528846153846154
Average F1 Score 0.8569882448093468
Average AUC 0.8528846153846154
Max Accuracy 0.8942307692307693
Max F1 Score 0.8962264150943395
Max AUC 0.8942307692307692
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']
Best Features based on Max AUC ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']


In [82]:
svc = SVC()
accuracy_list_svm_10_mi_mrmr, f1_score_list_svm_10_mi_mrmr, auc_list_svm_10_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_10')

Average Accuracy 0.8197115384615385
Average F1 Score 0.8246724239098426
Average AUC 0.8197115384615385
Max Accuracy 0.8461538461538461
Max F1 Score 0.8446601941747574
Max AUC 0.8461538461538463
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']
Best Features based on Max AUC ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(3, 1, 1)', '(2, 1)', '(2, 3)', '(1, 2, 1)', '(4,)', '(2,)', '(3,)']


In [83]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mi_mrmr, f1_score_list_xgb_10_mi_mrmr, auc_list_xgb_10_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_10')

Average Accuracy 0.8451923076923077
Average F1 Score 0.8467768712220918
Average AUC 0.8451923076923077
Max Accuracy 0.8846153846153846
Max F1 Score 0.8823529411764707
Max AUC 0.8846153846153846
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(1, 4, 1)', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(1, 4, 1)', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)']
Best Features based on Max AUC ['(3, 1)', '(2, 3, 1)', '(1, 2)', '(1, 4, 1)', '(3, 1, 1)', '(2, 1)', 'trigram_entropy', '(2, 3)', '(2,)', '(1, 2, 1)']


## 20 Percentile

In [84]:
lr = LogisticRegression()
accuracy_list_lr_20_mi_mrmr, f1_score_list_lr_20_mi_mrmr, auc_list_lr_20_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_20*')

Average Accuracy 0.8201923076923077
Average F1 Score 0.8176057324267239
Average AUC 0.8201923076923077
Max Accuracy 0.8413461538461539
Max F1 Score 0.8374384236453203
Max AUC 0.8413461538461539
Best Sample Index based on Max Accuracy 8
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 8
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max AUC ['(2, 1)', '

In [85]:
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mi_mrmr, f1_score_list_rfc_20_mi_mrmr, auc_list_rfc_20_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_20')

Average Accuracy 0.860096153846154
Average F1 Score 0.8646508853489978
Average AUC 0.8600961538461538
Max Accuracy 0.9086538461538461
Max F1 Score 0.9107981220657276
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Be

In [86]:
svc = SVC()
accuracy_list_svm_20_mi_mrmr, f1_score_list_svm_20_mi_mrmr, auc_list_svm_20_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_20')

Average Accuracy 0.8211538461538461
Average F1 Score 0.8241498277081535
Average AUC 0.8211538461538461
Max Accuracy 0.8461538461538461
Max F1 Score 0.8446601941747574
Max AUC 0.8461538461538463
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max AUC ['(2, 

In [87]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_20_mi_mrmr, f1_score_list_xgb_20_mi_mrmr, auc_list_xgb_20_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_20')

Average Accuracy 0.8528846153846154
Average F1 Score 0.853972259130431
Average AUC 0.8528846153846155
Max Accuracy 0.8894230769230769
Max F1 Score 0.8878048780487806
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 2)', '(3,)', '(1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Bes

## 30 Percentile

In [88]:
lr = LogisticRegression()
accuracy_list_lr_30_mi_mrmr, f1_score_list_lr_30_mi_mrmr, auc_list_lr_30_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_30*')

Average Accuracy 0.8192307692307692
Average F1 Score 0.8165049800250719
Average AUC 0.8192307692307692
Max Accuracy 0.8317307692307693
Max F1 Score 0.8275862068965517
Max AUC 0.8317307692307694
Best Sample Index based on Max Accuracy 4
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 4
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'Q5', 'unigram_entropy', '(2, 3, 1)', '(6, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(6, 3)', '(2, 1, 2)', '(3,)', '(1, 2)', '(1, 1, 4)', '(3, 1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'Q5', 'unigram_entropy', '(2, 3, 1)', '(6, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1,

In [89]:
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mi_mrmr, f1_score_list_rfc_30_mi_mrmr, auc_list_rfc_30_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_30')

Average Accuracy 0.8649038461538462
Average F1 Score 0.8688989149437802
Average AUC 0.8649038461538462
Max Accuracy 0.9086538461538461
Max F1 Score 0.9099526066350712
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(3, 1, 4)', '(3, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(2, 3, 2)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 2, 3)', '(2, 1, 2)', '(3,)', '(1, 2)', '(3, 3, 1)', '(1, 1, 4)', '(3, 1, 2)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(3, 1, 4)', '(3, 3)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(2, 3, 2)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 2, 3)', '(2, 1, 2)', '(3,)', '(1, 2)'

In [90]:
svc = SVC()
accuracy_list_svm_30_mi_mrmr, f1_score_list_svm_30_mi_mrmr, auc_list_svm_30_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_30')

Average Accuracy 0.8173076923076923
Average F1 Score 0.8202637577614196
Average AUC 0.8173076923076923
Max Accuracy 0.8413461538461539
Max F1 Score 0.8436018957345972
Max AUC 0.8413461538461537
Best Sample Index based on Max Accuracy 6
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(4, 1, 2)', '(4, 1)', '(1, 4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 1, 2)', '(3,)', '(1, 2)', '(1, 1, 4)', '(1, 3, 3)', '(2, 1, 4)', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(4, 1, 2)', '(4, 1)', '(1, 4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 1, 2)', '(3,)', '(1, 2)'

In [91]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_30_mi_mrmr, f1_score_list_xgb_30_mi_mrmr, auc_list_xgb_30_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_30')

Average Accuracy 0.8600961538461538
Average F1 Score 0.8610567404905171
Average AUC 0.8600961538461538
Max Accuracy 0.8846153846153846
Max F1 Score 0.8837209302325582
Max AUC 0.8846153846153848
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 4
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['(1, 3)', '(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(2, 3, 2)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 1, 2)', '(3,)', '(1, 2)', '(1, 1, 4)', '(3, 1, 2)', '(4, 1, 4)', 'C4', 'trigram_entropy', '(3, 1)', '(3, 1, 1)', '(2, 3)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(2, 1)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(1, 1, 2)', 'Q5', 'unigram_entropy', '(2, 3, 1)', '(6, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1,

## 50 Percentile

In [92]:
lr = LogisticRegression()
accuracy_list_lr_50_mi_mrmr, f1_score_list_lr_50_mi_mrmr, auc_list_lr_50_mi_mrmr = model_train_predict(lr, 'mi_mrmr_feat_list_50*')

Average Accuracy 0.8182692307692309
Average F1 Score 0.8154903420904123
Average AUC 0.8182692307692309
Max Accuracy 0.8365384615384616
Max F1 Score 0.8300000000000001
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['P5', '(1, 3)', '(3, 3, 3)', '(2, 1, 3)', '(2, 1)', '(1, 1, 3)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(2, 2)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 2)', '(4, 1, 1)', '(3, 2, 3)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(6, 2, 3)', '(2, 1, 2)', '(3,)', '(1, 2)', '(3, 3, 1)', '(1, 1, 4)', '(2, 1, 4)', '(1, 3, 3)', '(4, 1, 4)', 'trigram_entropy', '(1, 2, 2)', '(3, 1)', 'V5', '(4, 4, 1)', '(3, 1, 1)', '(2, 3)', '(3, 2)', 'pattern_hvg_5_node_entropy', '(2, 3, 3)', '(3, 2, 1)', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['P5', '(1, 3)', '(3,

In [93]:
rfc = RandomForestClassifier()
accuracy_list_rfc_50_mi_mrmr, f1_score_list_rfc_50_mi_mrmr, auc_list_rfc_50_mi_mrmr = model_train_predict(rfc, 'mi_mrmr_feat_list_50')

Average Accuracy 0.8634615384615385
Average F1 Score 0.8686397019420695
Average AUC 0.8634615384615385
Max Accuracy 0.9086538461538461
Max F1 Score 0.9099526066350712
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['(1, 3)', '(3, 3, 3)', '(2, 1)', '(1, 2, 3)', '(3, 1, 4)', '(3, 3)', '(1, 4)', '(2, 2)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(2, 3, 2)', '(6, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(3, 2, 3)', '(4, 1, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(6, 2, 3)', '(6, 3)', '(2, 2, 3)', '(2, 1, 2)', '(3,)', '(1, 2)', '(3, 3, 1)', '(1, 1, 4)', '(2, 1, 4)', '(1, 3, 3)', '(3, 1, 2)', '(2, 2, 2)', 'trigram_entropy', '(1, 2, 2)', '(3, 1)', 'A5', '(3, 1, 1)', '(2, 3)', '(3, 2)', 'pattern_hvg_5_node_entropy', '(2, 3, 3)', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(1, 3)', '(3, 3, 3)', '(2, 1

In [94]:
svc = SVC()
accuracy_list_svm_50_mi_mrmr, f1_score_list_svm_50_mi_mrmr, auc_list_svm_50_mi_mrmr = model_train_predict(svc, 'mi_mrmr_feat_list_50')

Average Accuracy 0.8158653846153847
Average F1 Score 0.8190845215474718
Average AUC 0.8158653846153847
Max Accuracy 0.8317307692307693
Max F1 Score 0.8341232227488151
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 1
Best Sample Index based on Max F1 Score 1
Best Sample Index based on Max AUC 1
Best Features based on Max Accuracy ['(1, 3)', '(3, 3, 3)', '(2, 1, 3)', '(1, 3, 1)', '(2, 1)', '(1, 1, 3)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(2, 2)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(6, 3, 1)', '(3, 2, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 2)', '(4, 1, 1)', '(1, 1)', '(2, 2, 1)', '(4,)', 'bigram_entropy', '(2, 1, 1)', '(2, 2, 3)', '(2, 1, 2)', '(3,)', '(1, 2)', '(1, 1, 4)', '(1, 3, 3)', '(2, 1, 4)', '(3, 1, 2)', '(3, 3, 1)', '(4, 1, 4)', 'trigram_entropy', '(1, 2, 2)', '(3, 1)', '(1, 1, 1)', '(4, 4, 1)', '(3, 1, 1)', '(2, 3)', '(3, 2)', 'pattern_hvg_5_node_entropy', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Sco

In [95]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mi_mrmr, f1_score_list_xgb_50_mi_mrmr, auc_list_xgb_50_mi_mrmr = model_train_predict(xgbc, 'mi_mrmr_feat_list_50')

Average Accuracy 0.8605769230769231
Average F1 Score 0.8624837657338604
Average AUC 0.860576923076923
Max Accuracy 0.8942307692307693
Max F1 Score 0.8962264150943395
Max AUC 0.8942307692307692
Best Sample Index based on Max Accuracy 9
Best Sample Index based on Max F1 Score 9
Best Sample Index based on Max AUC 9
Best Features based on Max Accuracy ['(1, 3)', '(2, 1, 3)', '(2, 1)', '(1, 1, 3)', '(1, 2, 3)', '(3, 3)', '(3, 1, 4)', '(1, 4)', '(2, 2)', '(1, 1, 2)', 'unigram_entropy', '(2, 3, 1)', '(6, 3, 1)', '(2, 6, 3)', '(1, 4, 1)', '(4, 1)', '(4, 1, 2)', '(4, 1, 1)', '(3, 2, 3)', '(4,)', 'bigram_entropy', '(3, 2, 2)', '(2, 1, 1)', '(6, 2, 3)', '(6, 3)', '(2, 1, 2)', '(3,)', '(4, 4)', '(1, 2)', '(1, 1, 4)', '(3, 3, 1)', '(2, 1, 4)', '(3, 1, 2)', '(4, 1, 4)', 'trigram_entropy', '(1, 2, 2)', '(3, 1)', '(4, 4, 1)', '(3, 1, 1)', '(2, 3)', '(3, 2)', 'pattern_hvg_5_node_entropy', '(2, 3, 3)', 'pattern_hvg_4_nodes_entropy', '(2,)', '(1, 2, 1)']
Best Features based on Max F1 Score ['(1, 3)', '(2

# Saving results

In [119]:
models = ['lr', 'rfc', 'svc', 'xgbc']
models = [value for value in models for _ in range(10)] * 4
percentiles = ['10', '20', '30', '50']
percentiles = [value for value in percentiles for _ in range(40)]
samples = [x for x in range(1, 11)]
samples = samples * 16

print(len(models))
print(len(percentiles))
print(len(samples))

160
160
160


In [97]:
overall_accuracy_list_mi = (accuracy_list_lr_10_mi + accuracy_list_rfc_10_mi + accuracy_list_svm_10_mi + accuracy_list_xgb_10_mi +
                            accuracy_list_lr_20_mi + accuracy_list_rfc_20_mi + accuracy_list_svm_20_mi + accuracy_list_xgb_20_mi +
                            accuracy_list_lr_30_mi + accuracy_list_rfc_30_mi + accuracy_list_svm_30_mi + accuracy_list_xgb_30_mi +
                            accuracy_list_lr_50_mi + accuracy_list_rfc_50_mi + accuracy_list_svm_50_mi + accuracy_list_xgb_50_mi)

overall_f1_score_list_mi = (f1_score_list_lr_10_mi + f1_score_list_rfc_10_mi + f1_score_list_svm_10_mi + f1_score_list_xgb_10_mi +
                            f1_score_list_lr_20_mi + f1_score_list_rfc_20_mi + f1_score_list_svm_20_mi + f1_score_list_xgb_20_mi +
                            f1_score_list_lr_30_mi + f1_score_list_rfc_30_mi + f1_score_list_svm_30_mi + f1_score_list_xgb_30_mi +
                            f1_score_list_lr_50_mi + f1_score_list_rfc_50_mi + f1_score_list_svm_50_mi + f1_score_list_xgb_50_mi)

overall_auc_list_mi = (auc_list_lr_10_mi + auc_list_rfc_10_mi + auc_list_svm_10_mi + auc_list_xgb_10_mi +
                            auc_list_lr_20_mi + auc_list_rfc_20_mi + auc_list_svm_20_mi + auc_list_xgb_20_mi +
                            auc_list_lr_30_mi + auc_list_rfc_30_mi + auc_list_svm_30_mi + auc_list_xgb_30_mi +
                            auc_list_lr_50_mi + auc_list_rfc_50_mi + auc_list_svm_50_mi + auc_list_xgb_50_mi)


In [100]:
overall_accuracy_list_mrmr = (accuracy_list_lr_10_mrmr + accuracy_list_rfc_10_mrmr + accuracy_list_svm_10_mrmr + accuracy_list_xgb_10_mrmr +
                            accuracy_list_lr_20_mrmr + accuracy_list_rfc_20_mrmr + accuracy_list_svm_20_mrmr + accuracy_list_xgb_20_mrmr +
                            accuracy_list_lr_30_mrmr + accuracy_list_rfc_30_mrmr + accuracy_list_svm_30_mrmr + accuracy_list_xgb_30_mrmr +
                            accuracy_list_lr_50_mrmr + accuracy_list_rfc_50_mrmr + accuracy_list_svm_50_mrmr + accuracy_list_xgb_50_mrmr)

overall_f1_score_list_mrmr = (f1_score_list_lr_10_mrmr + f1_score_list_rfc_10_mrmr + f1_score_list_svm_10_mrmr + f1_score_list_xgb_10_mrmr +
                            f1_score_list_lr_20_mrmr + f1_score_list_rfc_20_mrmr + f1_score_list_svm_20_mrmr + f1_score_list_xgb_20_mrmr +
                            f1_score_list_lr_30_mrmr + f1_score_list_rfc_30_mrmr + f1_score_list_svm_30_mrmr + f1_score_list_xgb_30_mrmr +
                            f1_score_list_lr_50_mrmr + f1_score_list_rfc_50_mrmr + f1_score_list_svm_50_mrmr + f1_score_list_xgb_50_mrmr)

overall_auc_list_mrmr = (auc_list_lr_10_mrmr + auc_list_rfc_10_mrmr + auc_list_svm_10_mrmr + auc_list_xgb_10_mrmr +
                            auc_list_lr_20_mrmr + auc_list_rfc_20_mrmr + auc_list_svm_20_mrmr + auc_list_xgb_20_mrmr +
                            auc_list_lr_30_mrmr + auc_list_rfc_30_mrmr + auc_list_svm_30_mrmr + auc_list_xgb_30_mrmr +
                            auc_list_lr_50_mrmr + auc_list_rfc_50_mrmr + auc_list_svm_50_mrmr + auc_list_xgb_50_mrmr)

In [101]:
overall_accuracy_list_mi_mrmr = (accuracy_list_lr_10_mi_mrmr + accuracy_list_rfc_10_mi_mrmr + accuracy_list_svm_10_mi_mrmr + accuracy_list_xgb_10_mi_mrmr +
                            accuracy_list_lr_20_mi_mrmr + accuracy_list_rfc_20_mi_mrmr + accuracy_list_svm_20_mi_mrmr + accuracy_list_xgb_20_mi_mrmr +
                            accuracy_list_lr_30_mi_mrmr + accuracy_list_rfc_30_mi_mrmr + accuracy_list_svm_30_mi_mrmr + accuracy_list_xgb_30_mi_mrmr +
                            accuracy_list_lr_50_mi_mrmr + accuracy_list_rfc_50_mi_mrmr + accuracy_list_svm_50_mi_mrmr + accuracy_list_xgb_50_mi_mrmr)

overall_f1_score_list_mi_mrmr = (f1_score_list_lr_10_mi_mrmr + f1_score_list_rfc_10_mi_mrmr + f1_score_list_svm_10_mi_mrmr + f1_score_list_xgb_10_mi_mrmr +
                            f1_score_list_lr_20_mi_mrmr + f1_score_list_rfc_20_mi_mrmr + f1_score_list_svm_20_mi_mrmr + f1_score_list_xgb_20_mi_mrmr +
                            f1_score_list_lr_30_mi_mrmr + f1_score_list_rfc_30_mi_mrmr + f1_score_list_svm_30_mi_mrmr + f1_score_list_xgb_30_mi_mrmr +
                            f1_score_list_lr_50_mi_mrmr + f1_score_list_rfc_50_mi_mrmr + f1_score_list_svm_50_mi_mrmr + f1_score_list_xgb_50_mi_mrmr)

overall_auc_list_mi_mrmr = (auc_list_lr_10_mi_mrmr + auc_list_rfc_10_mi_mrmr + auc_list_svm_10_mi_mrmr + auc_list_xgb_10_mi_mrmr +
                            auc_list_lr_20_mi_mrmr + auc_list_rfc_20_mi_mrmr + auc_list_svm_20_mi_mrmr + auc_list_xgb_20_mi_mrmr +
                            auc_list_lr_30_mi_mrmr + auc_list_rfc_30_mi_mrmr + auc_list_svm_30_mi_mrmr + auc_list_xgb_30_mi_mrmr +
                            auc_list_lr_50_mi_mrmr + auc_list_rfc_50_mi_mrmr + auc_list_svm_50_mi_mrmr + auc_list_xgb_50_mi_mrmr)

In [130]:
mi_dictionary = {
    'samples': samples,
    'models': models,
    'percentiles': percentiles,
    'accuracy': overall_accuracy_list_mi,
    'f1_score': overall_f1_score_list_mi,
    'auc': overall_auc_list_mi,
}
mi_df = pd.DataFrame(mi_dictionary)

mrmr_dictionary = {
    'samples': samples,
    'models': models,
    'percentiles': percentiles,
    'accuracy': overall_accuracy_list_mrmr,
    'f1_score': overall_f1_score_list_mrmr,
    'auc': overall_auc_list_mrmr,
}
mrmr_df = pd.DataFrame(mrmr_dictionary)

mi_mrmr_dictionary = {
    'samples': samples,
    'models': models,
    'percentiles': percentiles,
    'accuracy': overall_accuracy_list_mi_mrmr,
    'f1_score': overall_f1_score_list_mi_mrmr,
    'auc': overall_auc_list_mi_mrmr,
}
mi_mrmr_df = pd.DataFrame(mi_mrmr_dictionary)

mi_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/mutual_info_results.csv')
mrmr_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/mrmr_results.csv')
mi_mrmr_df.to_csv('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/results/mutual_info_mrmr_results.csv')


