In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.svm import *

import xgboost as xgb

import tensorflow as tf

import os
import re
import ast

In [36]:
directory_dataframes = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples/'
directory_features = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/'

def get_sample_df(directory=directory_dataframes):
    list_dataframes = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            list_dataframes.append(pd.read_csv(f))
            
    return list_dataframes

def get_features(regex_str, directory=directory_features):
    regex = re.compile('/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features/{}'.format(regex_str))
    
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if regex.match(f):
            file1 = open(f,"r+")
            feat_list = file1.read().splitlines()
            
            #txt file converts everything to string, so we need to convert it back to list
            for val in feat_list:
                #adding ; to be used a separator for list
                new_val = val.replace('y','y;').replace(') ','); ').replace('4 ', '4; ').replace('5 ', '5; ')
                feat_list[feat_list.index(val)] = new_val
                
    for val in feat_list:
        #separating the string into a list of features
        new_val = val.split('; ')
        feat_list[feat_list.index(val)] = new_val
        
    return feat_list

list_sample_dataframes = get_sample_df(directory_dataframes)

In [37]:
def model_train_predict(model, regex_str, dataframes=list_sample_dataframes):
    
    feat_list = get_features(regex_str)
    
    accuracy_list = []
    f1_score_list = []
    auc_list = []
    
    for sample, feat in zip(dataframes, feat_list):
        x = sample[feat]
        y = sample['conversion_class']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
        # print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_score_list.append(f1_score(y_test, y_pred))
        auc_list.append(roc_auc_score(y_test, y_pred))

    print('Average Accuracy', np.mean(accuracy_list))
    print('Average F1 Score', np.mean(f1_score_list))
    print('Average AUC', np.mean(auc_list)) 
    
    print('Max Accuracy', max(accuracy_list))
    print('Max F1 Score', max(f1_score_list))
    print('Max AUC', max(auc_list))  
    
    best_accuracy_index = accuracy_list.index(max(accuracy_list))
    best_f1_score_index = f1_score_list.index(max(f1_score_list))
    best_auc_index = auc_list.index(max(auc_list))
    
    print('Best Sample Index based on Max Accuracy', best_accuracy_index)
    print('Best Sample Index based on Max F1 Score', best_f1_score_index)
    print('Best Sample Index based on Max AUC', best_auc_index)
    
    print('Best Features based on Max Accuracy', feat_list[best_accuracy_index])
    print('Best Features based on Max F1 Score', feat_list[best_f1_score_index])
    print('Best Features based on Max AUC', feat_list[best_auc_index]) 
    
     
    
    return accuracy_list, f1_score_list, auc_list  


Models to be used for baseline
<li>Logistic Regression</li>
<li>Random Forest</li>
<li>Support Vector Machine</li>
<li>XGB</li>
<li>Neural Network</li>

# Mutual Information

## 10 Percentile

In [38]:
lr = LogisticRegression()
accuracy_list_lr_10_mi, f1_score_list_lr_10_mi, auc_list_lr_10_mi = model_train_predict(lr, 'mi_feat_list_10')

Average Accuracy 0.8086538461538462
Average F1 Score 0.8034728121749242
Average AUC 0.8086538461538462
Max Accuracy 0.8269230769230769
Max F1 Score 0.8181818181818181
Max AUC 0.8269230769230769
Best Sample Index based on Max Accuracy 2
Best Sample Index based on Max F1 Score 2
Best Sample Index based on Max AUC 2
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(1, 2, 1)', '(2, 1, 2)', '(2, 3, 1)', '(3, 1, 1)']


In [39]:
rfc = RandomForestClassifier()
accuracy_list_rfc_10_mi, f1_score_list_rfc_10_mi, auc_list_rfc_10_mi = model_train_predict(rfc, 'mi_feat_list_10')

Average Accuracy 0.8596153846153844
Average F1 Score 0.8636717825660881
Average AUC 0.8596153846153844
Max Accuracy 0.8894230769230769
Max F1 Score 0.8888888888888888
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


In [40]:
svc = SVC()
accuracy_list_svm_10_mi, f1_score_list_svm_10_mi, auc_list_svm_10_mi = model_train_predict(svc, 'mi_feat_list_10')

Average Accuracy 0.8197115384615385
Average F1 Score 0.8240373202310808
Average AUC 0.8197115384615385
Max Accuracy 0.8413461538461539
Max F1 Score 0.8411214953271028
Max AUC 0.8413461538461539
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 8
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(4, 1, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


In [41]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_10_mi, f1_score_list_xgb_10_mi, auc_list_xgb_10_mi = model_train_predict(xgbc, 'mi_feat_list_10')

Average Accuracy 0.8528846153846154
Average F1 Score 0.8551270069315375
Average AUC 0.8528846153846155
Max Accuracy 0.8894230769230769
Max F1 Score 0.8878048780487806
Max AUC 0.889423076923077
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']
Best Features based on Max AUC ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', '(2,)', '(3,)', '(4,)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 2, 1)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)']


## 20 Percentile

In [43]:
lr = LogisticRegression()
accuracy_list_lr_20_mi, f1_score_list_lr_20_mi, auc_list_lr_20_mi = model_train_predict(lr, 'mi_feat_list_20')

Average Accuracy 0.8192307692307692
Average F1 Score 0.815936348894392
Average AUC 0.8192307692307693
Max Accuracy 0.8365384615384616
Max F1 Score 0.8316831683168318
Max AUC 0.8365384615384616
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 5
Best Sample Index based on Max AUC 5
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', '(3, 3, 1)', 'E5', 'A5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(1, 2, 1)', '(2, 1, 2)', '(

In [44]:
rfc = RandomForestClassifier()
accuracy_list_rfc_20_mi, f1_score_list_rfc_20_mi, auc_list_rfc_20_mi = model_train_predict(rfc, 'mi_feat_list_20')

Average Accuracy 0.8682692307692308
Average F1 Score 0.8729923989398609
Average AUC 0.8682692307692308
Max Accuracy 0.9086538461538461
Max F1 Score 0.9107981220657276
Max AUC 0.9086538461538461
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(1, 4, 1)', '(4, 1, 1)', 'C4', 'E5', 'B5', 'G5', 'A5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2

In [45]:
svc = SVC()
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30')

Average Accuracy 0.81875
Average F1 Score 0.8221220116526453
Average AUC 0.81875
Max Accuracy 0.8317307692307693
Max F1 Score 0.835680751173709
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 5
Best Sample Index based on Max F1 Score 6
Best Sample Index based on Max AUC 6
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 4)', '(3, 3, 3)', '(4, 1, 1)', '(2, 3, 2)', '(3, 1, 4)', '(4, 4, 2)', 'A4', 'D4', 'C4', 'E5', 'B5', 'N5', 'A5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '

In [46]:
xgbc = xgb.XGBClassifier()
accuracy_list_xgb_50_mi, f1_score_list_xgb_50_mi, auc_list_xgb_50_mi = model_train_predict(xgbc, 'mi_feat_list_50')

Average Accuracy 0.8615384615384617
Average F1 Score 0.862320791887624
Average AUC 0.8615384615384617
Max Accuracy 0.9134615384615384
Max F1 Score 0.9126213592233009
Max AUC 0.9134615384615385
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(6,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 2)', '(2, 3)', '(3, 1)', '(6, 3)', '(3, 2)', '(1, 4)', '(4, 1)', '(1, 3)', '(3, 3)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 2)', '(2, 2, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 1, 4)', '(1, 4, 1)', '(2, 1, 4)', '(1, 3, 3)', '(3, 3, 3)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 2, 3)', '(1, 6, 2)', '(3, 1, 4)', '(2, 3, 3)', '(2, 1, 6)', '(1, 4, 4)', '(6, 2, 3)', '(2

## 30 Percentile

In [47]:
lr = LogisticRegression()
accuracy_list_lr_30_mi, f1_score_list_lr_30_mi, auc_list_lr_30_mi = model_train_predict(lr, 'mi_feat_list_30')

Average Accuracy 0.8134615384615385
Average F1 Score 0.8105844002059858
Average AUC 0.8134615384615385
Max Accuracy 0.8317307692307693
Max F1 Score 0.8258706467661692
Max AUC 0.8317307692307693
Best Sample Index based on Max Accuracy 7
Best Sample Index based on Max F1 Score 7
Best Sample Index based on Max AUC 7
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(6, 3)', '(1, 4)', '(4, 1)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 1)', '(2, 6, 3)', '(6, 3, 1)', '(1, 4, 1)', '(4, 1, 2)', '(2, 1, 4)', '(4, 1, 4)', '(1, 3, 3)', '(3, 2, 1)', '(4, 1, 1)', '(3, 1, 4)', 'A4', 'E4', 'C4', 'F4', 'E5', 'B5', 'A5', 'L5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_e

In [48]:
rfc = RandomForestClassifier()
accuracy_list_rfc_30_mi, f1_score_list_rfc_30_mi, auc_list_rfc_30_mi = model_train_predict(rfc, 'mi_feat_list_30')

Average Accuracy 0.8663461538461539
Average F1 Score 0.8702828746201291
Average AUC 0.8663461538461539
Max Accuracy 0.9038461538461539
Max F1 Score 0.9056603773584906
Max AUC 0.9038461538461539
Best Sample Index based on Max Accuracy 3
Best Sample Index based on Max F1 Score 3
Best Sample Index based on Max AUC 3
Best Features based on Max Accuracy ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', '(3,)', '(4,)', '(1, 1)', '(1, 2)', '(2, 1)', '(2, 3)', '(3, 1)', '(1, 4)', '(4, 1)', '(3, 3)', '(1, 6)', '(1, 1, 1)', '(1, 1, 2)', '(1, 2, 1)', '(2, 1, 2)', '(1, 2, 3)', '(2, 3, 1)', '(3, 1, 2)', '(2, 2, 3)', '(3, 1, 1)', '(1, 1, 4)', '(1, 4, 1)', '(4, 1, 1)', '(3, 3, 1)', '(2, 3, 2)', '(3, 1, 4)', 'C4', 'F4', 'E5', 'B5', 'N5', 'Q5', 'V5', 'G5', 'A5', 'P5']
Best Features based on Max F1 Score ['unigram_entropy', 'bigram_entropy', 'trigram_entropy', 'pattern_hvg_4_nodes_entropy', 'pattern_hvg_5_node_entropy', '(2,)', 

In [None]:
svc = SVC()
accuracy_list_svm_30_mi, f1_score_list_svm_30_mi, auc_list_svm_30_mi = model_train_predict(svc, 'mi_feat_list_30')