In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import *
from sklearn.feature_selection import *

from imblearn.under_sampling import RandomUnderSampler


In [2]:
# Read the engineered shopper features, discard the columns that are not
# used in this notebook, and binarise the target.
raw_csv_path = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/data/long_trajectory/shopper_data_slm_feat_v2.csv'
unused_columns = [
    'session_id_hash',
    'product_action',
    'reduced_time',
    'HVGms_edges',
    'pattern_hvg_4_nodes',
    'pattern_hvg_5_nodes',
    'unigram',
    'bigram',
    'trigram',
    'unigram_prob_freq',
    'bigram_prob_freq',
    'trigram_prob_freq',
    'pattern_hvg_4_nodes_prob_freq',
    'pattern_hvg_5_nodes_prob_freq',
]
shop_df = pd.read_csv(raw_csv_path)
shop_df = shop_df.drop(unused_columns, axis=1)

# Target encoding: the label 'NC' becomes 0, every other label becomes 1.
is_nc = shop_df['conversion_class'] == 'NC'
shop_df['conversion_class'] = np.where(is_nc, 0, 1)

print(shop_df.shape)
shop_df.head()

(152514, 161)


Unnamed: 0,conversion_class,unigram_entropy,bigram_entropy,trigram_entropy,pattern_hvg_4_nodes_entropy,pattern_hvg_5_node_entropy,"(1,)","(2,)","(6,)","(3,)",...,S5,R5,M5,F5,C5,D5,K5,J5,I5,H5
0,0,0.601924,1.080574,1.450805,1.286057,1.84622,0.8125,0.125,0.0625,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.661563,1.004242,1.011404,1.039721,1.098612,0.625,0.375,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.679193,1.036199,1.19355,0.693147,1.004242,0.583333,0.416667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.673012,0.964963,0.974315,1.329661,1.609438,0.6,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.678209,1.183133,1.625267,1.322965,1.968677,0.413793,0.586207,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Split the frame into predictors (x) and the 0/1 target (y).
y = shop_df['conversion_class']
x = shop_df.drop(columns='conversion_class')

# Draw ten undersampled copies of the data, one per seed 0..9, and keep each
# as an (x, y) pair for the feature-selection cells below.
subsamples = []
for seed in range(10):
    x_resampled, y_resampled = RandomUnderSampler(random_state=seed).fit_resample(x, y)
    subsamples.append((x_resampled, y_resampled))
    # Show the first row of every subsample as a quick sanity check.
    print(x_resampled.head(1))

        unigram_entropy  bigram_entropy  trigram_entropy  \
145644         0.585953        1.029653         1.273028   

        pattern_hvg_4_nodes_entropy  pattern_hvg_5_node_entropy      (1,)  \
145644                     0.682908                    1.011404  0.727273   

            (2,)  (6,)  (3,)  (4,)  ...   S5   R5   M5   F5   C5   D5   K5  \
145644  0.272727   0.0   0.0   0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

         J5   I5   H5  
145644  0.0  0.0  0.0  

[1 rows x 160 columns]
        unigram_entropy  bigram_entropy  trigram_entropy  \
111689         0.682908        1.012331         1.236685   

        pattern_hvg_4_nodes_entropy  pattern_hvg_5_node_entropy      (1,)  \
111689                     1.366159                    1.889159  0.571429   

            (2,)  (6,)  (3,)  (4,)  ...   S5        R5   M5   F5   C5   D5  \
111689  0.428571   0.0   0.0   0.0  ...  0.0  0.111111  0.0  0.0  0.0  0.0   

         K5   J5   I5   H5  
111689  0.0  0.0  0.0  0.0  

[1 

In [4]:
# Hold out 20% of the full (non-resampled) data with a fixed seed.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(*(part.shape for part in split_parts))

(122011, 160) (30503, 160) (122011,) (30503,)


# MI (Mutual Information)

In [5]:
# One list per percentile; each list receives one entry (an Index of selected
# column names) per subsample.
mi_feat_list_10_percentile = []
mi_feat_list_20_percentile = []
mi_feat_list_30_percentile = []
mi_feat_list_50_percentile = []
mi_feat_list_75_percentile = []
mi_feat_list_90_percentile = []

mi_lists_by_percentile = {
    10: mi_feat_list_10_percentile,
    20: mi_feat_list_20_percentile,
    30: mi_feat_list_30_percentile,
    50: mi_feat_list_50_percentile,
    75: mi_feat_list_75_percentile,
    90: mi_feat_list_90_percentile,
}

# mutual_info_classif adds random noise to continuous features; without a
# seed the selected feature sets change on every run.
MI_RANDOM_STATE = 42

for x_sample, y_sample in subsamples:
    # The MI estimate is the expensive part, so compute it once per subsample
    # and reuse the same scores for every percentile cut-off.
    mi_scores = mutual_info_classif(x_sample, y_sample, random_state=MI_RANDOM_STATE)

    for percentile, feat_lists in mi_lists_by_percentile.items():
        # SelectPercentile still does the thresholding; its score function
        # simply hands back the cached scores.
        selector = SelectPercentile(
            lambda X, y, cached=mi_scores: cached,
            percentile=percentile,
        )
        selector.fit(x_sample, y_sample)
        feat_lists.append(x_sample.columns[selector.get_support()])

# Number of features kept for the first subsample, per percentile.
for feat_lists in mi_lists_by_percentile.values():
    print(len(feat_lists[0]))

# Number of subsamples processed, per percentile.
for feat_lists in mi_lists_by_percentile.values():
    print(len(feat_lists))

16
32
48
80
120
144
10
10
10
10
10
10


# mRMR

In [6]:
def mrmr_feat_selected(x, y, f, correlation, k):
    """Greedily rank features by relevance divided by mean redundancy (mRMR).

    At each round the remaining feature with the highest
    ``f / mean(|corr| with already selected features)`` is picked.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; its columns are the candidate features.
    y : unused
        Kept only so existing callers keep working; the relevance scores are
        passed in through ``f``.
    f : pandas.Series
        Relevance score per feature, indexed by the column names of ``x``.
    correlation : pandas.DataFrame
        Square frame indexed and labelled by the column names of ``x``.
        It is used as scratch space and is MODIFIED IN PLACE: the column of
        every selected feature is overwritten with absolute correlations.
    k : int
        Number of features to select. Values larger than the number of
        columns in ``x`` are capped, so at most every column is returned.

    Returns
    -------
    list[list]
        A one-element list whose single item is the list of selected column
        names, in selection order.
    """
    selected = []
    not_selected = x.columns.to_list()

    # Never run more rounds than there are candidates; otherwise the score
    # Series becomes empty and argmax raises.
    n_rounds = min(k, len(not_selected))

    for i in range(n_rounds):

        if i > 0:
            # Refresh the redundancy column for the feature picked last round.
            # The lower clip keeps the later division away from zero.
            last_selected = selected[-1]
            correlation.loc[not_selected, last_selected] = (
                x[not_selected].corrwith(x[last_selected]).abs().clip(.00001)
            )

        # In the first round nothing is selected yet, so the mean is NaN and
        # the fill value makes the ranking depend on relevance alone.
        redundancy = correlation.loc[not_selected, selected].mean(axis=1).fillna(.00001)
        score = f.loc[not_selected] / redundancy

        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

    return [selected]

In [7]:
# One list per target size; each list receives one entry (a list of selected
# column names) per subsample.
mrmr_feat_list_10_percentile = []
mrmr_feat_list_20_percentile = []
mrmr_feat_list_30_percentile = []
mrmr_feat_list_50_percentile = []
mrmr_feat_list_75_percentile = []
mrmr_feat_list_90_percentile = []

# Number of features to keep -> list that collects the result.
# NOTE(review): 14/28/42/69/104/125 look like 10/20/30/50/75/90% of 139
# features, while the subsamples printed earlier have 160 columns and the MI
# cell keeps 16/32/48/80/120/144 - confirm whether these counts are stale.
mrmr_lists_by_k = {
    14: mrmr_feat_list_10_percentile,
    28: mrmr_feat_list_20_percentile,
    42: mrmr_feat_list_30_percentile,
    69: mrmr_feat_list_50_percentile,
    104: mrmr_feat_list_75_percentile,
    125: mrmr_feat_list_90_percentile,
}
largest_k = max(mrmr_lists_by_k)

for x_sample, y_sample in subsamples:
    F = pd.Series(f_regression(x_sample, y_sample)[0], index = x_sample.columns)
    corr = pd.DataFrame(.001, index = x_sample.columns, columns = x_sample.columns)

    # The selection is greedy and deterministic, so the first k picks of a
    # longer run equal a separate run with that k. Rank once at the largest
    # size and slice prefixes instead of re-running for every size.
    # The result is stored under its own name so that the feature frame `x`
    # defined earlier in the notebook is not overwritten.
    ranked_feats = mrmr_feat_selected(x_sample, y_sample, F, corr, largest_k)[0]

    for k, feat_lists in mrmr_lists_by_k.items():
        feat_lists.append(ranked_feats[:k])

# Number of features kept for the first subsample, per target size.
for feat_lists in mrmr_lists_by_k.values():
    print(len(feat_lists[0]))

# Number of subsamples processed, per target size.
for feat_lists in mrmr_lists_by_k.values():
    print(len(feat_lists))

14
28
42
69
104
125
10
10
10
10
10
10


# MI & mRMR

In [8]:
def _common_features_per_subsample(mi_lists, mrmr_lists):
    """Intersect the MI and mRMR selections subsample by subsample.

    Returns one list per subsample containing the features present in both
    selections. Features keep the order they have in the MI selection, so the
    result (and the text files written from it) is identical on every run
    instead of depending on set iteration order.
    """
    common_per_subsample = []
    for mi_feats, mrmr_feats in zip(mi_lists, mrmr_lists):
        mrmr_lookup = set(mrmr_feats)
        common_per_subsample.append([feat for feat in mi_feats if feat in mrmr_lookup])
    return common_per_subsample


mi_mrmr_feat_list_10_percentile = _common_features_per_subsample(
    mi_feat_list_10_percentile, mrmr_feat_list_10_percentile)
mi_mrmr_feat_list_20_percentile = _common_features_per_subsample(
    mi_feat_list_20_percentile, mrmr_feat_list_20_percentile)
mi_mrmr_feat_list_30_percentile = _common_features_per_subsample(
    mi_feat_list_30_percentile, mrmr_feat_list_30_percentile)
mi_mrmr_feat_list_50_percentile = _common_features_per_subsample(
    mi_feat_list_50_percentile, mrmr_feat_list_50_percentile)
mi_mrmr_feat_list_75_percentile = _common_features_per_subsample(
    mi_feat_list_75_percentile, mrmr_feat_list_75_percentile)
mi_mrmr_feat_list_90_percentile = _common_features_per_subsample(
    mi_feat_list_90_percentile, mrmr_feat_list_90_percentile)

# Size of the intersection for the first subsample, per percentile.
print(len(mi_mrmr_feat_list_10_percentile[0]))
print(len(mi_mrmr_feat_list_20_percentile[0]))
print(len(mi_mrmr_feat_list_30_percentile[0]))
print(len(mi_mrmr_feat_list_50_percentile[0]))
print(len(mi_mrmr_feat_list_75_percentile[0]))
print(len(mi_mrmr_feat_list_90_percentile[0]))

10
21
30
59
94
118


# Saving in CSV and Text Files

In [9]:
# Write every undersampled (features, target) pair to its own CSV file,
# numbering the files from 1.
subsample_dir = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/subsamples_v2/'
for i, (x_part, y_part) in enumerate(subsamples, start=1):
    sample_df = pd.concat([x_part, y_part], axis = 1)
    sample_df.to_csv(subsample_dir + 'subsample_' + str(i) + '_v2.csv')

In [10]:
def write_list_of_lists_to_file(list_of_lists, file_path):
    """Write one text line per inner list to ``file_path``.

    Every element is converted with ``str`` and the elements of a line are
    separated by a single space. The file is opened in ``'w'`` mode, so any
    existing content is replaced. An empty inner list produces an empty line.
    """
    with open(file_path, 'w') as out:
        out.writelines(' '.join(map(str, row)) + '\n' for row in list_of_lists)

In [11]:
# Every feature-selection result is written to
# <features_dir><variable name>_v2.txt, one line per subsample.
features_dir = '/Users/nitanshjain/Documents/Projects/Shopper_Intent_Prediction/shopper-intent-prediction/long_trajectory/features_v2/'

feature_lists_to_save = {
    # Mutual information
    'mi_feat_list_10_percentile': mi_feat_list_10_percentile,
    'mi_feat_list_20_percentile': mi_feat_list_20_percentile,
    'mi_feat_list_30_percentile': mi_feat_list_30_percentile,
    'mi_feat_list_50_percentile': mi_feat_list_50_percentile,
    'mi_feat_list_75_percentile': mi_feat_list_75_percentile,
    'mi_feat_list_90_percentile': mi_feat_list_90_percentile,
    # mRMR
    'mrmr_feat_list_10_percentile': mrmr_feat_list_10_percentile,
    'mrmr_feat_list_20_percentile': mrmr_feat_list_20_percentile,
    'mrmr_feat_list_30_percentile': mrmr_feat_list_30_percentile,
    'mrmr_feat_list_50_percentile': mrmr_feat_list_50_percentile,
    'mrmr_feat_list_75_percentile': mrmr_feat_list_75_percentile,
    'mrmr_feat_list_90_percentile': mrmr_feat_list_90_percentile,
    # Intersection of both
    'mi_mrmr_feat_list_10_percentile': mi_mrmr_feat_list_10_percentile,
    'mi_mrmr_feat_list_20_percentile': mi_mrmr_feat_list_20_percentile,
    'mi_mrmr_feat_list_30_percentile': mi_mrmr_feat_list_30_percentile,
    'mi_mrmr_feat_list_50_percentile': mi_mrmr_feat_list_50_percentile,
    'mi_mrmr_feat_list_75_percentile': mi_mrmr_feat_list_75_percentile,
    'mi_mrmr_feat_list_90_percentile': mi_mrmr_feat_list_90_percentile,
}

for list_name, feat_lists in feature_lists_to_save.items():
    write_list_of_lists_to_file(feat_lists, features_dir + list_name + '_v2.txt')

In [12]:

# Reference for the mRMR selection used in this notebook:
# https://towardsdatascience.com/mrmr-explained-exactly-how-you-wished-someone-explained-to-you-9cf4ed27458b