In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Load session data

In [2]:
session_data = pd.read_csv('./input/sessions.csv')

print("session (Shape):", session_data.shape)

session (Shape): (10567737, 6)


In [3]:
print("null (Shape)",session_data[pd.isnull(session_data.user_id)].shape)

null (Shape) (34496, 6)


In [4]:
#fill NaN for other columns
session_data['action'].fillna('nan', inplace=True)
session_data['action_detail'].fillna('nan', inplace=True)
session_data['action_type'].fillna('nan', inplace=True)
session_data['secs_elapsed'].fillna(0, inplace=True)

In [5]:
session_data.head(10)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831.0
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0


##  Remove hiphens

In [6]:
for col in ['action', 'action_type', 'action_detail']:
    l = session_data[col].unique()
    l = [s for s in l if not pd.isnull(s)]
    
    print([s for s in l if '-' in s])
    
    # replace hiphens with space
    session_data[col] = [ s.replace('-', '') for s in session_data[col]]
    
    # check
    l = session_data[col].unique()
    l = [s for s in l if not pd.isnull(s)]
    print([s for s in l if '-' in s])

['southern-europe', 'social-media', 'united-states', 'south-america', 'rest-of-world']
[]
['-unknown-']
[]
['-unknown-']
[]


In [7]:
# remove spaces from device_type
session_data['device_type'] = [ s.replace(' ', "_") for s in session_data.device_type]

In [8]:
# adding spaces for CountVectorizer split
session_data['device_type']   = ["%s "%w for w in session_data['device_type']]
session_data['action_type']   = ["%s "%w for w in session_data['action_type']]
session_data['action']        = ["%s "%w for w in session_data['action']]
session_data['action_detail'] = ["%s "%w for w in session_data['action_detail']]

In [9]:
# converting secs_elapsed to int 
session_data['secs_elapsed'] = session_data.secs_elapsed.astype(int)

In [10]:
session_data.head(5)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows_Desktop,319
1,d1mm9tcy42,search_results,click,view_search_results,Windows_Desktop,67753
2,d1mm9tcy42,lookup,,,Windows_Desktop,301
3,d1mm9tcy42,search_results,click,view_search_results,Windows_Desktop,22141
4,d1mm9tcy42,lookup,,,Windows_Desktop,435


## Create Session Id based on secs_elapsed

In [13]:
nw_se_th = 25   # nw_se_th : new session threshold

def generate_session_id(df, th):  
    s_id = 1
    s_ve = []
    
    for se in df['secs_elapsed']:
        if se < th * 60:
            s_ve.append(s_id)
        else:
            s_ve.append(s_id)
            s_id += 1
            
    df['session_id' + '_' + str(th)] = s_ve
    return df

In [14]:
session_data = session_data.groupby('user_id').apply(lambda x: generate_session_id(x, nw_se_th))

In [15]:
session_data[session_data.user_id == 'd1mm9tcy42']

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,session_id_25
0,d1mm9tcy42,lookup,,,Windows_Desktop,319.0,1.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows_Desktop,67753.0,1.0
2,d1mm9tcy42,lookup,,,Windows_Desktop,301.0,2.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows_Desktop,22141.0,2.0
4,d1mm9tcy42,lookup,,,Windows_Desktop,435.0,3.0
...,...,...,...,...,...,...,...
122,d1mm9tcy42,similar_listings,data,similar_listings,Windows_Desktop,137.0,51.0
123,d1mm9tcy42,ajax_refresh_subtotal,click,change_trip_characteristics,Windows_Desktop,791.0,51.0
124,d1mm9tcy42,personalize,data,wishlist_content_update,Windows_Desktop,73.0,51.0
125,d1mm9tcy42,show,,,Windows_Desktop,947.0,51.0


In [16]:
session_data[session_data.user_id == 'yo8nz8bqcq']

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,session_id_25
127,yo8nz8bqcq,dashboard,view,dashboard,Mac_Desktop,2739.0,1.0
128,yo8nz8bqcq,create,submit,create_user,Mac_Desktop,0.0,2.0
129,yo8nz8bqcq,confirm_email,click,confirm_email_link,Mac_Desktop,115983.0,2.0
130,yo8nz8bqcq,show,view,p3,Mac_Desktop,20285.0,3.0
131,yo8nz8bqcq,show_personalize,data,user_profile_content_update,Mac_Desktop,3255.0,4.0
132,yo8nz8bqcq,show,view,user_profile,Mac_Desktop,47308.0,5.0
133,yo8nz8bqcq,header_userpic,data,header_userpic,Mac_Desktop,14156.0,6.0
134,yo8nz8bqcq,personalize,data,wishlist_content_update,Mac_Desktop,36.0,7.0
135,yo8nz8bqcq,show,,,Mac_Desktop,4080.0,7.0


In [17]:
grouped = session_data.groupby('user_id', sort = False)[['action', 'action_type', 'action_detail']]

In [18]:
session_data_a = grouped.sum()

In [19]:
session_data_a.head(5)

Unnamed: 0_level_0,action,action_type,action_detail
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d1mm9tcy42,lookup search_results lookup search_results lo...,nan click nan click nan click nan data view na...,nan view_search_results nan view_search_result...
yo8nz8bqcq,dashboard create confirm_email show show_perso...,view submit click view data view data data nan,dashboard create_user confirm_email_link p3 us...
4grx6yxeby,verify create nan pending requested header_use...,unknown submit message_post booking_request vi...,unknown create_user message_post pending p5 he...
ncf87guaf0,lookup show search_results search_results show...,nan view click click view view click data data...,nan p3 view_search_results view_search_results...
4rvqpxoh3h,campaigns active create notifications listings...,unknown unknown unknown unknown unknown unknow...,unknown unknown unknown unknown unknown unknow...


In [20]:
session_data_a.iloc[0][0]

'lookup search_results lookup search_results lookup search_results lookup personalize index lookup search_results lookup personalize index similar_listings ajax_refresh_subtotal similar_listings ajax_refresh_subtotal show personalize show lookup search_results lookup search_results personalize lookup index show personalize show ajax_refresh_subtotal show personalize show ajax_refresh_subtotal show personalize show ajax_refresh_subtotal show personalize header_userpic show ask_question nan personalize similar_listings personalize show show lookup personalize index similar_listings personalize show show lookup personalize index similar_listings show personalize show lookup search_results lookup search_results lookup personalize index show personalize show personalize lookup index other_hosting_reviews_first show show personalize lookup index show personalize show lookup personalize index similar_listings ajax_refresh_subtotal similar_listings ajax_refresh_subtotal similar_listings show p

In [21]:
session_data_se = session_data.groupby('user_id', sort = False)['secs_elapsed'].agg({'sum': np.sum})

In [22]:
session_data_se.head(5)

Unnamed: 0_level_0,sum
user_id,Unnamed: 1_level_1
d1mm9tcy42,3427529.0
yo8nz8bqcq,207842.0
4grx6yxeby,1135444.0
ncf87guaf0,3755100.0
4rvqpxoh3h,2555.0


In [23]:
# selecting columns with session_id counts
session_id_cols = [c for c in session_data.columns if 'session_id' in c]
session_data_sc = session_data.groupby('user_id', sort = False)[session_id_cols].max()

# creating device type datagram
session_data_dt = session_data.groupby('user_id', sort = False)['device_type'].sum()

In [24]:
session_data_dt

user_id
d1mm9tcy42    Windows_Desktop Windows_Desktop Windows_Deskto...
yo8nz8bqcq    Mac_Desktop Mac_Desktop Mac_Desktop Mac_Deskto...
4grx6yxeby    Windows_Desktop Windows_Desktop Windows_Deskto...
ncf87guaf0    Windows_Desktop Windows_Desktop Windows_Deskto...
4rvqpxoh3h    iPhone iPhone iPhone iPhone iPhone iPhone iPho...
                                    ...                        
cv0na2lf5a    iPhone iPhone iPhone iPhone Windows_Desktop Wi...
zp8xfonng8    Android_Phone Android_Phone Android_Phone Andr...
fa6260ziny    Windows_Desktop Windows_Desktop Windows_Deskto...
87k0fy4ugm    Mac_Desktop Mac_Desktop Mac_Desktop Mac_Deskto...
9uqfg8txu3    Windows_Desktop Windows_Desktop Windows_Deskto...
Name: device_type, Length: 135483, dtype: object

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
vec_dt = CountVectorizer(binary=True)
vec_aa = CountVectorizer(ngram_range=(1, 2))
vec_at = CountVectorizer(ngram_range=(1, 2))
vec_ad = CountVectorizer(ngram_range=(1, 2))

In [27]:
# need labels to select train data
train_data = pd.read_csv('./input/train_users_2.csv', index_col=0)

In [28]:
# only choose id exist in train data
session_train_id = session_data_a.index.intersection(train_data.index)
print('Intersection Shape:',session_train_id.shape)

Intersection Shape: (73815,)


In [29]:
vec_dt  = vec_dt.fit(session_data_dt.ix[session_train_id].values)
vec_aa  = vec_aa.fit(session_data_a.ix[session_train_id].action.values)
vec_at  = vec_at.fit(session_data_a.ix[session_train_id].action_type.values)
vec_ad  = vec_ad.fit(session_data_a.ix[session_train_id].action_detail.values)

In [30]:
session_all_dt  = vec_dt.transform(session_data_dt.values)
session_all_aa  = vec_aa.transform(session_data_a.action.values)
session_all_at  = vec_at.transform(session_data_a.action_type.values)
session_all_ad  = vec_ad.transform(session_data_a.action_detail.values)

In [31]:
print(session_all_dt.shape)
print(session_all_aa.shape)
print(session_all_at.shape)
print(session_all_ad.shape)

(135483, 14)
(135483, 17426)
(135483, 95)
(135483, 6673)


## Stakc features

In [32]:
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix

In [33]:
session_all_a = hstack([session_all_at, session_all_aa, session_all_ad])
print('stack shape:', session_all_a.shape)
session_all_a = session_all_a.tocsr()

stack shape: (135483, 24194)


In [34]:
# add column 'unique_devices' number of unique devices
session_all = pd.DataFrame(session_all_dt.sum(axis = 1), index=session_data_se.index, columns=['unique_devices'])
session_all = pd.concat([session_all, session_data_se], axis = 1)
session_all = pd.concat([session_all, session_data_sc], axis = 1)
session_all.head()

Unnamed: 0_level_0,unique_devices,sum,session_id_25
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d1mm9tcy42,2,3427529.0,51.0
yo8nz8bqcq,1,207842.0,7.0
4grx6yxeby,2,1135444.0,11.0
ncf87guaf0,3,3755100.0,81.0
4rvqpxoh3h,1,2555.0,1.0


In [35]:
from sklearn.preprocessing import LabelEncoder

labels = train_data.ix[session_train_id].country_destination
le = LabelEncoder()
y = le.fit_transform(labels.values) 

## Select best K features with K highest scores.

In [36]:
from sklearn.feature_selection import SelectKBest, chi2

X = session_all_a[:session_train_id.shape[0]].copy()
sel = SelectKBest(chi2, k=500)
sel.fit(X,y)

SelectKBest(k=500, score_func=<function chi2 at 0x000001E3A8304AE8>)

In [37]:
X_new = sel.transform(session_all_a)
X_new  = pd.DataFrame(X_new.toarray(), index=session_data_a.index)

In [38]:
X_new

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d1mm9tcy42,0,0,0,0,0,20,0,0,6,0,...,0,8,8,1,7,0,0,0,0,0
yo8nz8bqcq,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4grx6yxeby,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ncf87guaf0,0,0,0,0,0,64,0,25,17,0,...,0,4,2,1,0,0,0,0,0,0
4rvqpxoh3h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cv0na2lf5a,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,7,0,0,0,0
zp8xfonng8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fa6260ziny,0,0,0,0,0,20,0,13,3,0,...,0,4,1,0,1,0,1,0,0,0
87k0fy4ugm,0,0,0,0,0,2,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [39]:
session_all = pd.concat([session_all, X_new], axis = 1)

In [40]:
session_all.shape

(135483, 503)

In [41]:
session_all

Unnamed: 0_level_0,unique_devices,sum,session_id_25,0,1,2,3,4,5,6,...,490,491,492,493,494,495,496,497,498,499
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d1mm9tcy42,2,3427529.0,51.0,0,0,0,0,0,20,0,...,0,8,8,1,7,0,0,0,0,0
yo8nz8bqcq,1,207842.0,7.0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4grx6yxeby,2,1135444.0,11.0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
ncf87guaf0,3,3755100.0,81.0,0,0,0,0,0,64,0,...,0,4,2,1,0,0,0,0,0,0
4rvqpxoh3h,1,2555.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cv0na2lf5a,2,2534232.0,66.0,0,0,0,0,0,1,0,...,0,1,0,0,0,7,0,0,0,0
zp8xfonng8,1,51618.0,8.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fa6260ziny,1,823297.0,46.0,0,0,0,0,0,20,0,...,0,4,1,0,1,0,1,0,0,0
87k0fy4ugm,2,352068.0,7.0,0,0,0,0,0,2,0,...,1,0,0,0,0,0,0,0,0,0


## Save preprocess session data

In [42]:
session_all.to_csv('session_featrues_500.csv')