# Data Preparation

### Preparing Train Set

In [1]:
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns

# Apply the 'ggplot' stylesheet to every matplotlib figure drawn after this point.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# Train / test user tables.
train = pd.read_csv('./data/train_users_2.csv')
test = pd.read_csv('./data/test_users.csv')

# Session records (grouped by user_id further down in the notebook).
session = pd.read_csv('./data/sessions.csv')

# Reference tables.
# NOTE(review): `age_gender` and `gender` are both loaded from the same
# age_gender_bkts.csv file, so they start out as two independent frames holding
# identical data; both names are kept in case other cells rely on either one.
age_gender = pd.read_csv('./data/age_gender_bkts.csv')
gender = pd.read_csv('./data/age_gender_bkts.csv')
country = pd.read_csv('./data/countries.csv')

In [3]:
# Distinct age-bucket labels, in the order they first appear in age_gender.
age_cat = age_gender['age_bucket'].unique().tolist()
# Labels handed to pd.cut below: a leading 'miss' label followed by the buckets
# in reversed order (presumably the file lists them oldest-first, so the
# reversal yields ascending ages — confirm against the CSV).
age_sort = ['miss', *reversed(age_cat)]

In [4]:
def _train_age_or_sentinel(age):
    """Return -1 for ages above 1000, otherwise the age unchanged (NaN passes through)."""
    return -1 if age > 1000 else age


# Ages above 1000 and missing ages both end up as the sentinel -1.
train['age_fill'] = train['age'].apply(_train_age_or_sentinel).fillna(-1)

# Bucket the cleaned ages into 5-wide bins labelled by age_sort.
# NOTE(review): pd.cut closes bins on the right by default, so these edges give
# (-5, 0], (0, 5], ..., (100, 105]. An age of exactly 40 therefore falls in
# (35, 40], and any age_fill above 105 gets no bucket (NaN) — confirm intended.
train_age_edges = np.arange(-5, 106, 5)
train['age_gp'] = pd.cut(train['age_fill'], bins=train_age_edges, labels=age_sort)

In [5]:
# Replace missing first_affiliate_tracked values with the 'miss' label.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on train[...]: that chained form operates on an
# intermediate Series and does not update `train` under pandas Copy-on-Write.
train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna('miss')

In [6]:
# Language code -> coarse language group. Codes that are not listed here are
# handled by the cells that apply this mapping.
lang = {
    code: group
    for group, codes in (
        ('eng', ('en',)),
        ('asia', ('zh', 'ko', 'ja')),
        ('eu', ('fr', 'es', 'de', 'ru', 'pt')),
    )
    for code in codes
}

In [7]:
# Map each language code to its group from `lang`; codes without an entry
# (and missing values) come back as NaN and are labelled 'other'.
train_lang_group = train['language'].map(lang)
train['lang'] = train_lang_group.fillna('other')

In [8]:
# first_browser value -> browser family. Desktop and mobile variants share a
# family; values not listed here are handled by the cells that apply this mapping.
browser = {
    variant: family
    for family, variants in (
        ('chrome', ('Chrome', 'Chrome Mobile')),
        ('safari', ('Safari', 'Mobile Safari')),
        ('firefox', ('Firefox', 'Mobile Firefox')),
        ('ie', ('IE', 'IE Mobile')),
        ('unknown', ('-unknown-',)),
    )
    for variant in variants
}

In [9]:
# Map each first_browser value to its family from `browser`; values without an
# entry (and missing values) come back as NaN and are labelled 'other'.
train_browser_family = train['first_browser'].map(browser)
train['browser'] = train_browser_family.fillna('other')

In [10]:
# 'no' when gender is the '-unknown-' placeholder, 'yes' for every other value
# (a missing gender also compares unequal to the placeholder and yields 'yes').
train['gender_stated'] = train['gender'].apply(
    lambda value: 'yes' if value != '-unknown-' else 'no'
)

In [11]:
# Labels for the signup_flow buckets; reused when the test set is bucketed.
flow_cat = ['0', '1-10', '11-20', '20+']

# NOTE(review): with include_lowest=True these edges produce the intervals
# [0, 1], (1, 10], (10, 20], (20, 100], so signup_flow == 1 is labelled '0'
# rather than '1-10' — confirm this is intended.
train_flow_edges = [0, 1, 10, 20, 100]
train['flow_cat'] = pd.cut(
    train['signup_flow'],
    bins=train_flow_edges,
    labels=flow_cat,
    include_lowest=True,
)

In [12]:
# Preview the first rows of train, including the derived columns added above.
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,signup_app,first_device_type,first_browser,country_destination,age_fill,age_gp,lang,browser,gender_stated,flow_cat
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,...,Web,Mac Desktop,Chrome,NDF,-1.0,miss,eng,chrome,no,0
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,...,Web,Mac Desktop,Chrome,NDF,38.0,35-39,eng,chrome,yes,0
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,...,Web,Windows Desktop,IE,US,56.0,55-59,eng,ie,yes,1-10
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,...,Web,Mac Desktop,Firefox,other,42.0,40-44,eng,firefox,yes,0
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,...,Web,Mac Desktop,Chrome,US,41.0,40-44,eng,chrome,no,0


In [13]:
# Columns carried into the modelling frames. 'id' becomes the index, and
# 'country_destination' must stay last: later cells use [:-1] slices to
# leave it out of the one-hot encoding and out of the test frame.
col_to_keep = [
    'id',
    'signup_method',
    'affiliate_channel',
    'signup_app',
    'age_gp',
    'lang',
    'browser',
    'gender_stated',
    'flow_cat',
    'country_destination',
]

In [14]:
# Select the modelling columns and index the frame by user id.
train_subset = train[col_to_keep].set_index('id')

# Every column except the last one (country_destination) is one-hot encoded;
# drop_first=True removes the first level of each encoded column.
train_dummy_cols = train_subset.columns[:-1]
df_train = pd.get_dummies(
    train_subset,
    prefix=train_dummy_cols,
    columns=train_dummy_cols,
    drop_first=True,
)

In [15]:
# Preview the encoded training frame (target column plus one-hot features).
df_train.head()

Unnamed: 0_level_0,country_destination,signup_method_facebook,signup_method_google,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,...,lang_other,browser_firefox,browser_ie,browser_other,browser_safari,browser_unknown,gender_stated_yes,flow_cat_1-10,flow_cat_11-20,flow_cat_20+
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gxn3p5htnn,NDF,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
820tgsjxq7,NDF,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4ft3gnwmtx,US,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
bjjt8pjhuk,other,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
87mebub9p4,US,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Preparing Test Set

In [16]:
def _test_age_or_sentinel(age):
    """Return -1 for ages above 1000, otherwise the age unchanged (NaN passes through)."""
    return -1 if age > 1000 else age


# Ages above 1000 and missing ages both end up as the sentinel -1.
test['age_fill'] = test['age'].apply(_test_age_or_sentinel).fillna(-1)

# Bucket the cleaned ages with the same edges and labels used for the train set.
# NOTE(review): pd.cut closes bins on the right by default, so these edges give
# (-5, 0], (0, 5], ..., (100, 105]. An age of exactly 40 therefore falls in
# (35, 40], and any age_fill above 105 gets no bucket (NaN) — confirm intended.
test_age_edges = np.arange(-5, 106, 5)
test['age_gp'] = pd.cut(test['age_fill'], bins=test_age_edges, labels=age_sort)

In [17]:
# Replace missing first_affiliate_tracked values with the 'miss' label.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on test[...]: that chained form operates on an
# intermediate Series and does not update `test` under pandas Copy-on-Write.
test['first_affiliate_tracked'] = test['first_affiliate_tracked'].fillna('miss')

In [18]:
# Map each language code to its group from `lang`; codes without an entry
# (and missing values) come back as NaN and are labelled 'other'.
test_lang_group = test['language'].map(lang)
test['lang'] = test_lang_group.fillna('other')

In [19]:
# Map each first_browser value to its family from `browser`; values without an
# entry (and missing values) come back as NaN and are labelled 'other'.
test_browser_family = test['first_browser'].map(browser)
test['browser'] = test_browser_family.fillna('other')

In [20]:
# 'no' when gender is the '-unknown-' placeholder, 'yes' for every other value
# (a missing gender also compares unequal to the placeholder and yields 'yes').
test['gender_stated'] = test['gender'].apply(
    lambda value: 'yes' if value != '-unknown-' else 'no'
)

In [21]:
# Bucket signup_flow with the same edges and labels (flow_cat) used for train.
# NOTE(review): with include_lowest=True these edges produce the intervals
# [0, 1], (1, 10], (10, 20], (20, 100], so signup_flow == 1 is labelled '0'
# rather than '1-10' — confirm this is intended.
test_flow_edges = [0, 1, 10, 20, 100]
test['flow_cat'] = pd.cut(
    test['signup_flow'],
    bins=test_flow_edges,
    labels=flow_cat,
    include_lowest=True,
)

In [22]:
# Same feature columns as the train frame, minus the trailing target column,
# indexed by user id.
test_subset = test[col_to_keep[:-1]].set_index('id')

# Encode every level (drop_first=False); the alignment step below keeps only
# the levels that survived drop_first=True on the train side.
df_test = pd.get_dummies(
    test_subset,
    prefix=test_subset.columns,
    columns=test_subset.columns,
    drop_first=False,
)

# Align to the train feature columns, in train order. The target is excluded by
# name rather than by position, and reindex adds an all-zero column for any
# train dummy that never occurs in the test data, where plain label indexing
# (df_test[...]) would raise KeyError.
train_feature_cols = [c for c in df_train.columns if c != 'country_destination']
df_test = df_test.reindex(columns=train_feature_cols, fill_value=0)

In [23]:
# Column counts of the test and train frames; df_train additionally carries the
# country_destination column.
print(df_test.shape[1], df_train.shape[1])

45 46


### Combining session data

In [24]:
# Load ./data/session_df.csv, using its first column as the index.
session_df = pd.read_csv('./data/session_df.csv', index_col=0)
# Only the column names are taken from this frame; `session_df` itself is
# reassigned a few cells below.
encoded_col = session_df.columns

In [25]:
# For each user_id keep only the final row of `session` (in file order), with
# secs_elapsed removed, and index the result by user_id.
last_action = (
    session
    .drop(columns=['secs_elapsed'])
    .groupby(['user_id'])
    .tail(1)
    .set_index('user_id')
)

In [26]:
# One-hot encode last_action, keeping every level (drop_first=False) and
# prefixing each dummy with its source column name.
# NOTE(review): no `columns` argument is given, so pandas picks the columns to
# encode by dtype; passing all of last_action.columns as prefixes presumably
# relies on every remaining column being string/categorical — confirm.
last_act_df = pd.get_dummies(last_action, drop_first=False, prefix=last_action.columns)

In [27]:
# Keep only the columns named in encoded_col, in that order. This rebinds
# `session_df`, replacing the frame that was read from session_df.csv.
# NOTE(review): .loc with a list of labels raises KeyError if any encoded_col
# entry is missing from last_act_df — confirm the two always agree.
session_df = last_act_df.loc[:, encoded_col]

In [43]:
# Left-join the session features onto each user frame by index (user id).
# Users with no matching session row get NaN in the session columns, which
# fillna(0) then turns into 0.
index_left_join = dict(left_index=True, right_index=True, how='left')
train_fin = df_train.merge(session_df, **index_left_join).fillna(0)
test_fin = df_test.merge(session_df, **index_left_join).fillna(0)

In [29]:
# (rows, columns) of the final train and test frames, printed side by side.
print(*(frame.shape for frame in (train_fin, test_fin)))

(213451, 126) (62096, 125)


In [44]:
# Write the final train and test frames to CSV (index included).
for final_frame, out_path in (
    (train_fin, './data/train_df.csv'),
    (test_fin, './data/test_df.csv'),
):
    final_frame.to_csv(out_path)