# Merge user and session data
- For each user, we count the occurrences of each `action_detail` value and join those counts to the user information table.
- Because iterating over DataFrame rows is slow, we work on the underlying NumPy arrays and use hash tables (dicts) to look up the matrix row and column indices.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the user table (`df`) and the session log (`df_s`) from the working
# directory, in that order.
df, df_s = (
    pd.read_csv(csv_path)
    for csv_path in ('train_users_2.csv', 'sessions.csv')
)

In [3]:
# Keep only users whose destination is 'NDF' or 'US'. The explicit copy makes
# `df` an independent frame, so the column assignments below modify it directly
# instead of writing into a view of the unfiltered table.
df = df[df.country_destination.isin(['NDF', 'US'])].copy()

# Missing ages get the sentinel 1000. Assign the result back rather than
# calling `fillna(..., inplace=True)` on `df.age`: an in-place call on a
# column accessor may operate on a temporary and leave `df` unchanged.
df['age'] = df['age'].fillna(1000)

# Bucket ages into decades; the last bin (100, 10000] collects the sentinel
# and any other implausibly large ages.
df['ageCat'] = pd.cut(df['age'],
                      bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 10000])

# Session rows without an action detail/type are labelled '-unknown-' so they
# are still counted as their own category later on.
df_s['action_detail'] = df_s['action_detail'].fillna('-unknown-')
df_s['action_type'] = df_s['action_type'].fillna('-unknown-')

# Combine User and Session Data

In [4]:
# Ids of users that appear at least once in the session log.
ids_with_session = df.loc[df['id'].isin(df_s['user_id']), 'id']

# Restrict both tables to those users so every remaining session row has a
# matching user row.
df = df.loc[df['id'].isin(ids_with_session)]
df_s = df_s.loc[df_s['user_id'].isin(ids_with_session)]

In [5]:
# Distinct `action_detail` values, in order of first appearance; each one
# becomes a count column in the merged table.
actions = list(df_s['action_detail'].unique())

In [6]:
import pyprind

# Build a 'bag-of-words' style count vector per user: one column for every
# distinct `action_detail` value, plus
#   n_action        - total number of session rows of the user
#   session_device  - device on the user's last session row
#                     (TODO: extend to users with multiple devices)
#
# For speed we iterate over plain NumPy arrays instead of `df.iterrows()`.
# The counts live in a dedicated integer matrix: pulling `df.values` out of a
# mixed-type frame yields one object array, and rebuilding the frame from it
# would leave every column (ages, counts, ...) with dtype `object`.

# Drop columns produced by a previous run of this cell so that re-running it
# starts from the plain user table instead of duplicating columns.
generated_cols = list(actions) + ['n_action', 'session_device']
df_users = df.drop(columns=generated_cols, errors='ignore').reset_index(drop=True)

# Hash tables: user id -> row, action name -> column of the count matrix.
hash_id_idx = {user_id: idx for idx, user_id in enumerate(df_users['id'].values)}
action_col_idx = {action: idx for idx, action in enumerate(actions)}

action_counts = np.zeros((len(df_users), len(actions)), dtype=np.int64)
last_device = np.full(len(df_users), '', dtype=object)

# Session columns are resolved by name where the name is known; the device
# column is still addressed by position (5th column of sessions.csv).
session_user_ids = df_s['user_id'].values
session_actions = df_s['action_detail'].values
session_devices = df_s.iloc[:, 4].values

n = len(df_s)
pbar = pyprind.ProgBar(n)
for user_id, user_action, user_device in zip(session_user_ids,
                                             session_actions,
                                             session_devices):
    # A session row whose user is missing from `df` raises KeyError here;
    # the earlier filtering cell is expected to have removed such rows.
    user_idx = hash_id_idx[user_id]
    action_counts[user_idx, action_col_idx[user_action]] += 1
    last_device[user_idx] = user_device  # later rows overwrite earlier ones

    pbar.update()

# Column order: user columns, one column per action, n_action, session_device.
# Every session row increments exactly one action counter, so the row sums of
# the count matrix are the per-user totals.
df = pd.concat(
    [
        df_users,
        pd.DataFrame(action_counts, columns=actions),
        pd.DataFrame({'n_action': action_counts.sum(axis=1),
                      'session_device': last_device},
                     columns=['n_action', 'session_device']),
    ],
    axis=1,
)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:20


In [7]:
# Sanity check: (number of users, number of columns) of the merged table.
df.shape

(65136, 147)

In [8]:
# Count missing values in the last two columns (n_action, session_device).
df.iloc[:, -2:].isna().sum()

n_action          0
session_device    0
dtype: int64

In [9]:
# Preview the last two columns (n_action, session_device) for the first ten users.
df.iloc[:, -2:].head(10)

Unnamed: 0,n_action,session_device
0,9,Mac Desktop
1,16,Windows Desktop
2,152,Windows Desktop
3,3,iPad Tablet
4,7,iPhone
5,46,Windows Desktop
6,364,Mac Desktop
7,8,iPhone
8,83,Windows Desktop
9,9,iPad Tablet


In [10]:
# Preview columns 17-24 (action-count columns in the recorded run) for the
# first ten users.
df.iloc[:, 17:25].head(10)

Unnamed: 0,dashboard,create_user,confirm_email_link,p3,user_profile_content_update,user_profile,header_userpic,wishlist_content_update
0,1,1,1,1,1,1,1,1
1,0,1,0,0,0,0,1,1
2,0,1,1,8,0,0,1,10
3,0,1,1,0,0,0,1,0
4,1,0,1,0,0,0,1,0
5,0,1,0,8,0,0,1,7
6,1,0,2,38,5,5,2,43
7,1,0,1,0,0,0,1,0
8,0,1,0,8,0,0,1,12
9,0,1,0,0,0,0,1,0


In [11]:
# Persist the merged per-user table (without the row index) ...
df.to_csv('train_user_session_merged.csv', index=False)
# ... and the list of action names that were turned into count columns.
pd.Series(actions).to_csv('actions.csv', index=False)