# Feature Selection

In this section, we will focus on the session information, in which action and details are in form of many categorical variables. It is difficult to visual them one by one, hence we will apply feature reduction techniques such as LassoCV and RFE

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LassoCV, LogisticRegression

In [2]:
session = pd.read_csv('./data/id_book.csv', index_col='id')
session.head()

Unnamed: 0_level_0,user_id,action,action_type,action_detail,device_type,secs_elapsed,book
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
d1mm9tcy42,d1mm9tcy42,lookup,,,Windows Desktop,319.0,1
d1mm9tcy42,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,1
d1mm9tcy42,d1mm9tcy42,lookup,,,Windows Desktop,301.0,1
d1mm9tcy42,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,1
d1mm9tcy42,d1mm9tcy42,lookup,,,Windows Desktop,435.0,1


## Last Action

Assume last row of each user session to be the last action of the session.

In [3]:
last_action = session.groupby(['user_id']).tail(1)

In [4]:
last_action.head()

Unnamed: 0_level_0,user_id,action,action_type,action_detail,device_type,secs_elapsed,book
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
d1mm9tcy42,d1mm9tcy42,show,view,p3,Windows Desktop,76511.0,1
yo8nz8bqcq,yo8nz8bqcq,show,,,Mac Desktop,4080.0,0
4grx6yxeby,4grx6yxeby,personalize,data,wishlist_content_update,Windows Desktop,61374.0,0
ncf87guaf0,ncf87guaf0,personalize,data,wishlist_content_update,Windows Desktop,1254.0,0
4rvqpxoh3h,4rvqpxoh3h,index,-unknown-,-unknown-,iPhone,886.0,1


In [5]:
df_fs = last_action.drop(['user_id', 'secs_elapsed'], axis=1)

In [6]:
df_fs_en = pd.get_dummies(df_fs, drop_first=True, prefix=df_fs.columns[:-1])

In [18]:
df_fs_en.shape

(73815, 359)

In [8]:
X = df_fs_en.drop(['book'], axis=1)
y = df_fs_en['book']

In [9]:
lcv = LassoCV()
lcv.fit(X, y)

LassoCV()

In [10]:
lcv_mask = lcv.coef_ != 0

var = sum(lcv_mask)

In [11]:
rfe_lr = RFE(estimator=LogisticRegression(multi_class='multinomial',
                                          solver='lbfgs', max_iter=1000), n_features_to_select=var, step=5)
rfe_lr.fit(X, y)

RFE(estimator=LogisticRegression(max_iter=1000, multi_class='multinomial'),
    n_features_to_select=185, step=5)

In [12]:
rfe_rf = RFE(estimator=RandomForestClassifier(),
             n_features_to_select=var, step=10)
rfe_rf.fit(X, y)

RFE(estimator=RandomForestClassifier(), n_features_to_select=185, step=10)

In [13]:
lr_mask = rfe_lr.support_
rf_mask = rfe_rf.support_

In [14]:
votes = np.sum([lcv_mask, lr_mask, rf_mask], axis=0)
mask = votes == 3

In [15]:
X_select = X.loc[:, mask]

In [16]:
X_select.shape

(73815, 80)

In [17]:
# df = pd.merge(X_select, y, left_index=True, right_index=True)

X_select.to_csv('./data/session_df.csv')