# Merge user and session data
- We apply a feature engineering to user information based on the previous exploratory analysis. 
- We add 'action' activities using word vectors by counting the number of each action. 
  (like word counting vector from text analysis)
- We also keep the total second for each action by creating another 360-dim word vector of total seconds for each action. 
- Therefore, we add 720-dim feature vector for each user. 
- Finally, we have 650Kx720-dim DataFrame and save it for modeling. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('train_users_2.csv')
df_s = pd.read_csv('sessions.csv')

In [5]:
def get_signup_device(s):
    if s in ['Mac Desktop', 'Windows Desktop']:
        return s.split(' ')[0]
    elif s in ['iPhone', 'iPad']:
        return 'iOS'
    else:
        return 'Else'

def preprocess(df_origin):
    df = df_origin.copy()
    df = df[df.country_destination.isin(['NDF', 'US'])]
    
    df['signup_device'] = df['first_device_type'].apply(get_signup_device)
    
    df.age.fillna(1000, inplace=True)
    df['ageCat'] = pd.cut(df['age'], 
                            bins=[0,10,20,30,40,50,60,70,80,90,100,10000])

    return df

In [6]:
df = preprocess(df)

In [7]:
df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,signup_device,ageCat
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,1000.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,Mac,"(100, 10000]"
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF,Mac,"(30, 40]"
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US,Windows,"(50, 60]"
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US,Mac,"(40, 50]"
5,osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,1000.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US,Mac,"(100, 10000]"


## ID analysis

In [8]:
df.shape

(186919, 18)

In [9]:
ids_with_session = df[df.id.isin(df_s.user_id)].id
ids_with_session.shape

(65136,)

- 35% of users in train data are having session data. 
- This is because sesseion data are collected for users for 2014. 
- We first only use user data with session data. 

In [10]:
df_test = pd.read_csv('test_users.csv')

In [11]:
df_test.shape

(62096, 15)

In [12]:
ids_test_with_session = df_test[df_test.id.isin(df_s.user_id)].id
ids_test_with_session.shape

(61668,)

- 99.3% of test users have session data
- Therefore, we are safe to model using session data. 

In [13]:
df = df[df.id.isin(ids_with_session)]

In [14]:
df.shape

(65136, 18)

In [15]:
df_s = df_s[df_s.user_id.isin(ids_with_session)]

In [16]:
df_s.shape

(4733234, 6)

# Combine User and Session Data

In [17]:
import pyprind

def combineData(df_origin, df_s):
    
    df = df_origin.copy()
    
    col_name='action'
    col_idx = 1
    
    # extract unique action values 
    actions = [x if pd.notnull(x) else 'missing' for x in df_s[col_name].unique()]
    
    # add columns for each action values
    # 'bag-of-word' type model, with counting vectorization. 
    for x in actions:
        df[x] = 0
        df[x+'_sec'] = 0
        
    # for speed-up purpose, we use numpy values. 
    # compare to df.iterrows()
    df_v = df.values
    df_s_v = df_s.values

    hash_id_idx = {user_id:idx for idx, user_id in enumerate(df_v[:,0])}
    hash_action_idx = {col:idx for idx, col in enumerate(df.columns) if col in actions}
    
    n = df_s_v.shape[0]
    pbar = pyprind.ProgBar(n)
    for i in range(n):
        user_id = df_s_v[i,0]
        user_action = df_s_v[i,col_idx]
        user_action_sec = df_s_v[i,5]

        if pd.isnull(user_action):
            user_action = 'missing'
        if pd.isnull(user_action_sec):
            user_action_sec = 0

        user_idx = hash_id_idx[user_id]
        action_idx = hash_action_idx[user_action]
        df_v[user_idx, action_idx] += 1
        df_v[user_idx, action_idx+1] += user_action_sec

        pbar.update()
    
    df = pd.DataFrame(df_v, columns=df.columns)
    return df, actions

In [18]:
df2, actions = combineData(df, df_s)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:29


In [19]:
df2.shape

(65136, 676)

In [21]:
df2.iloc[0:10,20:28]

Unnamed: 0,create,create_sec,confirm_email,confirm_email_sec,show,show_sec,show_personalize,show_personalize_sec
0,1,0,1,115983.0,3,71673,1,3255
1,4,617933,0,0.0,0,0,0,0
2,2,12952,1,274002.0,15,562785,0,0
3,1,0,1,1371620.0,0,0,0,0
4,0,0,1,46262.0,0,0,0,0
5,1,0,0,0.0,14,242969,0,0
6,1,8834,2,11658.0,81,337983,5,3273
7,0,0,1,342501.0,0,0,0,0
8,1,0,0,0.0,16,190245,0,0
9,1,0,0,0.0,0,0,0,0


In [22]:
df2.to_csv('train_user_session_merged.csv', index=False)
pd.Series(actions).to_csv('actions.csv', index=False)