In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime, timedelta, date
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_users = pd.read_csv('./input/train_users_2.csv')
test_users = pd.read_csv('./input/test_users.csv')

train_users_labels = train_users.loc[:,'country_destination']
train_users_attrs = train_users.iloc[:,0:15]
train_users = train_users_attrs

In [3]:
train_users

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014-06-30,20140630235636,,MALE,32.0,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari
213447,mhewnxesx9,2014-06-30,20140630235719,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
213448,6o3arsjbb4,2014-06-30,20140630235754,,-unknown-,32.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
213449,jh95kwisub,2014-06-30,20140630235822,,-unknown-,,basic,25,en,other,other,tracked-other,iOS,iPhone,Mobile Safari


In [4]:
#Creating and Encoding y
le_y = LabelEncoder()
y = le_y.fit_transform(train_users_labels)
print(y)

[ 7  7 10 ...  7  7  7]


In [5]:
train_users = train_users.drop(['date_first_booking'], axis=1)
test_users = test_users.drop(['date_first_booking'], axis=1)

In [6]:
# Date is split into 3 parts as year, month and day in both test and train. These are added as
# new features in both test and train

date_acc_created = np.vstack(train_users.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
train_users['created_year'] = date_acc_created[:,0]
train_users['created_month'] = date_acc_created[:,1]
train_users['created_day'] = date_acc_created[:,2]
train_users = train_users.drop(['date_account_created'], axis=1)

date_acc_created_test = np.vstack(test_users.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
test_users['created_year'] = date_acc_created_test[:,0]
test_users['created_month'] = date_acc_created_test[:,1]
test_users['created_day'] = date_acc_created_test[:,2]
test_users = test_users.drop(['date_account_created'], axis=1)

In [7]:
# Replacing unknown values in gender with -1 and null values with -1
train_users.loc[ train_users['gender'] == '-unknown-', 'gender'] = -1
train_users.loc[ train_users['gender'].isnull(), 'gender' ] = -1
test_users.loc[ test_users['gender'] == '-unknown-', 'gender'] = -1
test_users.loc[ test_users['gender'].isnull(), 'gender'] = -1

In [8]:
gender_translation = {field: i for i, field in enumerate(train_users.gender.unique()) }
gender_translation

{-1: 0, 'MALE': 1, 'FEMALE': 2, 'OTHER': 3}

In [9]:
for data in [train_users, test_users]:
    data['gender'] = data['gender'].apply(lambda x: gender_translation[x])

In [10]:
# Finding valid values for gender and invalid values for gender
nan_gender_count = len(train_users.loc[train_users['gender'] == -1, 'gender'])
valid_gender_count = len(train_users.gender.values) - nan_gender_count

# Creating a map with the gender distribution
count_map = pd.value_counts(train_users['gender'].values)
print ("Existing gender value distribution")
for k, v in count_map.iteritems():
    if k == -1:
        continue
    print (k, ":", float(v)/float(valid_gender_count))
    
for k, v in count_map.iteritems():
    if k == -1:
        continue
    c = int ( nan_gender_count * float(v)/float(valid_gender_count) )
    for i in range(len(train_users.gender.values)):
        if train_users.gender.values[i] == -1:
            train_users.gender.values[i] = k
            c -= 1
        if c == 0:
            break
train_users.gender.values[213450] = 0

Existing gender value distribution
0 : 0.44829023991454714
2 : 0.2953417880450314
1 : 0.25504682573518045
3 : 0.0013211463052410154


In [11]:
nan_gender_count = len(test_users.loc[test_users['gender'] == -1, 'gender'])
valid_gender_count = len(test_users.gender.values) - nan_gender_count
count_map = pd.value_counts(test_users['gender'].values)
print ("Existing gender value distribution")
for k, v in count_map.iteritems():
    if k == -1:
        continue
    print (k, ":", float(v)/float(valid_gender_count))

for k, v in count_map.iteritems():
    if k == -1:
        continue
    c = int ( nan_gender_count * float(v)/float(valid_gender_count) )
    for i in range(len(test_users.gender.values)):
        if test_users.gender.values[i] == -1:
            test_users.gender.values[i] = k
            c -= 1
        if c == 0:
            break
test_users.gender.values[62094] = 0

Existing gender value distribution
0 : 0.5441896418448854
2 : 0.23323563514558104
1 : 0.22173730997165678
3 : 0.0008374130378768359


In [12]:
train_users.loc[train_users['age'] > 95, 'age'] = np.nan
train_users.loc[train_users['age'] < 16, 'age'] = np.nan
test_users.loc[test_users['age'] > 95, 'age'] = np.nan
test_users.loc[test_users['age'] < 16, 'age'] = np.nan

In [13]:
# Replace missing age with median
print (train_users.age.median())
print (test_users.age.median())
train_users.loc[ train_users['age'].isnull(), 'age' ] = train_users.age.median()
test_users.loc[ test_users['age'].isnull(), 'age' ] = test_users.age.median()

34.0
31.0


In [14]:
# Encoding for first_affiliate_tracked
train_users.loc[ train_users['first_affiliate_tracked'].isnull(), 'first_affiliate_tracked'] = "untracked"
test_users.loc[ test_users['first_affiliate_tracked'].isnull(), 'first_affiliate_tracked'] = "untracked"

In [15]:
# Creating a DataFrame with train+test data
df_all = pd.concat((train_users, test_users), axis=0, ignore_index=True)

In [16]:
df_all['datehour_timestamp_first_active'] = df_all['timestamp_first_active'].astype(str)

df_all['date_timestamp_first_active'] = [ d[:8] for d in df_all['datehour_timestamp_first_active'] ]
df_all['hour_timestamp_first_active'] = [ d[8:] for d in df_all['datehour_timestamp_first_active'] ]

df_all['date_timestamp_first_active'] = [ datetime.strptime(d, '%Y%m%d') for d in df_all['date_timestamp_first_active']]
df_all['hour_timestamp_first_active'] = [ datetime.strptime(h, '%H%M%S') for h in df_all['hour_timestamp_first_active']]

df_all['weekday_first_active'] = [ datetime.strftime(d,'%a') if not pd.isnull(d) else d for d in df_all['date_timestamp_first_active']]

# create mo_first_active: month of the first active
df_all['mo_first_active'] = [ d.month if not pd.isnull(d) else 0 for d in df_all['date_timestamp_first_active'] ]
df_all['ho_first_active'] = [ d.hour if not pd.isnull(d) else 0 for d in df_all['hour_timestamp_first_active'] ]

In [17]:
from workalendar.usa import UnitedStates

country_dict = { 'NDF': 'None', 'US': UnitedStates(), 'other': 'None', 'FR': 'None', 
 'IT': 'None', 'GB': 'None', 'ES': 'None', 'CA': 'None', 'DE': 'None', 
 'NL' : 'None', 'AU' : 'None', 'PT' : 'None' }

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)
        
        
for c in country_dict:
    if country_dict[c] != 'None':
        hol = country_dict[c].holidays(2008)
        for year in range(2009, 2016, 1):
            hol += country_dict[c].holidays(year)
        
        hol = pd.Series([h[0] for h in hol ])
        print(hol)
        
        hol_dict = {}
        for d in daterange( date(2009, 1, 1), date(2014, 12, 31) ):
            delta_list = hol - d
            delta_list = [dd.days for dd in delta_list]
            next_holiday_delta = min([n for n in delta_list if n >= 0])
            nex2_holiday_delta = min([n for n in delta_list if n > next_holiday_delta])
            hol_dict[d] = (next_holiday_delta, nex2_holiday_delta)
            
        column_name = 'days_to_next_' + c + '_hol'
        df_all[column_name] = [ hol_dict[d.date()][0] for d in df_all['date_timestamp_first_active'] ]

0     2008-01-01
1     2008-01-21
2     2008-02-18
3     2008-05-26
4     2008-07-04
         ...    
84    2015-09-07
85    2015-10-12
86    2015-11-11
87    2015-11-26
88    2015-12-25
Length: 89, dtype: object


In [18]:
df_all

Unnamed: 0,id,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,...,created_year,created_month,created_day,datehour_timestamp_first_active,date_timestamp_first_active,hour_timestamp_first_active,weekday_first_active,mo_first_active,ho_first_active,days_to_next_US_hol
0,gxn3p5htnn,20090319043255,0,34.0,facebook,0,en,direct,direct,untracked,...,2010,6,28,20090319043255,2009-03-19,1900-01-01 04:32:55,Thu,3,4,67
1,820tgsjxq7,20090523174809,1,38.0,facebook,0,en,seo,google,untracked,...,2011,5,25,20090523174809,2009-05-23,1900-01-01 17:48:09,Sat,5,17,2
2,4ft3gnwmtx,20090609231247,2,56.0,basic,3,en,direct,direct,untracked,...,2010,9,28,20090609231247,2009-06-09,1900-01-01 23:12:47,Tue,6,23,24
3,bjjt8pjhuk,20091031060129,2,42.0,facebook,0,en,direct,direct,untracked,...,2011,12,5,20091031060129,2009-10-31,1900-01-01 06:01:29,Sat,10,6,11
4,87mebub9p4,20091208061105,0,41.0,basic,0,en,direct,direct,untracked,...,2010,9,14,20091208061105,2009-12-08,1900-01-01 06:11:05,Tue,12,6,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,20140930235232,0,31.0,basic,0,en,direct,direct,untracked,...,2014,9,30,20140930235232,2014-09-30,1900-01-01 23:52:32,Tue,9,23,13
275543,zp8xfonng8,20140930235306,0,31.0,basic,23,ko,direct,direct,untracked,...,2014,9,30,20140930235306,2014-09-30,1900-01-01 23:53:06,Tue,9,23,13
275544,fa6260ziny,20140930235408,0,31.0,basic,0,de,direct,direct,linked,...,2014,9,30,20140930235408,2014-09-30,1900-01-01 23:54:08,Tue,9,23,13
275545,87k0fy4ugm,20140930235430,0,31.0,basic,0,en,sem-brand,google,omg,...,2014,9,30,20140930235430,2014-09-30,1900-01-01 23:54:30,Tue,9,23,13


In [19]:
features = list(df_all.columns)

remove = ['timestamp_first_active',
         'datehour_timestamp_first_active',
         'date_timestamp_first_active',
         'hour_timestamp_first_active']

for rf in remove:
    features.remove(rf)

df_all = df_all[features]

In [20]:
df_all

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,created_year,created_month,created_day,weekday_first_active,mo_first_active,ho_first_active,days_to_next_US_hol
0,gxn3p5htnn,0,34.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28,Thu,3,4,67
1,820tgsjxq7,1,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25,Sat,5,17,2
2,4ft3gnwmtx,2,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28,Tue,6,23,24
3,bjjt8pjhuk,2,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5,Sat,10,6,11
4,87mebub9p4,0,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14,Tue,12,6,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,0,31.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,IE,2014,9,30,Tue,9,23,13
275543,zp8xfonng8,0,31.0,basic,23,ko,direct,direct,untracked,Android,Android Phone,-unknown-,2014,9,30,Tue,9,23,13
275544,fa6260ziny,0,31.0,basic,0,de,direct,direct,linked,Web,Windows Desktop,Firefox,2014,9,30,Tue,9,23,13
275545,87k0fy4ugm,0,31.0,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,2014,9,30,Tue,9,23,13


In [21]:
to_le = [ i for i, t in zip(df_all.dtypes.index, df_all.dtypes.values)  if t == 'O']
to_le.remove('id')
to_le

['signup_method',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser',
 'weekday_first_active']

In [22]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
for ri in to_le: # ri: replaced by integer
    le.fit(df_all[ri])   # takes labels and assigns each to a integer,
    df_all[ri] = le.transform(df_all[ri])
    print(le.classes_)
    print(le.transform(le.classes_))

['basic' 'facebook' 'google' 'weibo']
[0 1 2 3]
['-unknown-' 'ca' 'cs' 'da' 'de' 'el' 'en' 'es' 'fi' 'fr' 'hr' 'hu' 'id'
 'is' 'it' 'ja' 'ko' 'nl' 'no' 'pl' 'pt' 'ru' 'sv' 'th' 'tr' 'zh']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
['api' 'content' 'direct' 'other' 'remarketing' 'sem-brand'
 'sem-non-brand' 'seo']
[0 1 2 3 4 5 6 7]
['baidu' 'bing' 'craigslist' 'daum' 'direct' 'email-marketing' 'facebook'
 'facebook-open-graph' 'google' 'gsp' 'meetup' 'naver' 'other' 'padmapper'
 'vast' 'wayn' 'yahoo' 'yandex']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
['linked' 'local ops' 'marketing' 'omg' 'product' 'tracked-other'
 'untracked']
[0 1 2 3 4 5 6]
['Android' 'Moweb' 'Web' 'iOS']
[0 1 2 3]
['Android Phone' 'Android Tablet' 'Desktop (Other)' 'Mac Desktop'
 'Other/Unknown' 'SmartPhone (Other)' 'Windows Desktop' 'iPad' 'iPhone']
[0 1 2 3 4 5 6 7 8]
['-unknown-' 'AOL Explorer' 'Android Browser' 'Apple Mail' 'Arora'
 'Avant Browser' 'BlackBerr

In [24]:
## save file with id
df_all.to_csv('./input/TrainTest_Preprocess_w_id.csv', index=False)

In [23]:
df_all = df_all.drop('id', axis=1)
vals = df_all.values
Numtrain = len(train_users)
X_train = vals[:Numtrain]
X_test = vals[Numtrain:]
print('X train shape',X_train.shape)
print('X test shape', X_test.shape)

X train shape (213451, 18)
X test shape (62096, 18)


In [24]:
X_train

array([[ 0., 34.,  1., ...,  3.,  4., 67.],
       [ 1., 38.,  1., ...,  5., 17.,  2.],
       [ 2., 56.,  0., ...,  6., 23., 24.],
       ...,
       [ 0., 32.,  0., ...,  6., 23.,  4.],
       [ 0., 34.,  0., ...,  6., 23.,  4.],
       [ 0., 34.,  0., ...,  6., 23.,  4.]])

## Save Preprocessed data

In [27]:
df_all.to_csv('./input/TrainTest_Preprocess.csv', index=False)
pd.DataFrame(y).to_csv('./input/label.csv', index = False)

## Evaluation

In [25]:
# Reference Kaggle
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(te_labels, predict, k):
    
    lb = LabelBinarizer()
    lb.fit(range(len(predict) + 1))
    T = lb.transform(te_labels)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predict):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        if best == 0:
            best = 0.000000001
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

## Model: Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB


def folds_to_split(data,targets,train,test):
    data_tr = pd.DataFrame(data).iloc[train]
    data_te = pd.DataFrame(data).iloc[test]
    labels_tr = pd.DataFrame(targets).iloc[train]
    labels_te = pd.DataFrame(targets).iloc[test]
    return [data_tr, data_te, labels_tr, labels_te]



# Naive Bayes with 5-fold cross-validation
foldnum = 0
fold_results = pd.DataFrame()
kf = KFold(n_splits=5, random_state=1)


for train, test in kf.split(X_train):   
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train,y,train,test)
    
    
    gnb1 = GaussianNB()
    gnb1.fit(tr_data, tr_labels.values.ravel())
    prob_arr_gnb1 = gnb1.predict_proba(te_data)
    score_gnb1 = ndcg_score(te_labels.as_matrix(), prob_arr_gnb1, k=5)
    
    foldnum+=1
    print(foldnum, "-fold")
    fold_results.loc[foldnum, 'Ndcg_GNb'] = score_gnb1
    print(score_gnb1)
    
print(fold_results.mean())

1 -fold
0.7240732732766846
2 -fold
0.7563446874115004
3 -fold
0.7975030447600175
4 -fold
0.7981573131348827
5 -fold
0.811251560507997
Ndcg_GNb    0.777466
dtype: float64


## Model: Logistic Regression

In [31]:
from sklearn import linear_model 


solvers = 'newton-cg'
foldnum = 0
fold_results = pd.DataFrame()

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, y , train, test)
    
    LR = linear_model.LogisticRegression(solver=solvers)
    
    
    LR.fit(tr_data, tr_labels.values.ravel())
    prob_arr_LR = LR.predict_proba(te_data)
    score_LR = ndcg_score(te_labels.as_matrix(), prob_arr_LR, k=5)
    
    foldnum+=1
    print(foldnum, "-fold")
    fold_results.loc[foldnum, 'nDCG_LR_newton'] = score_LR
    print(score_LR)

print("5-fold avg nDCG:",fold_results.mean())

1 -fold
0.7665057610601491
2 -fold
0.8007105073684946
3 -fold
0.8210259537672099
4 -fold
0.8327920859182466
5 -fold
0.8240652968803793
5-fold avg nDCG: nDCG_LR_newton    0.80902
dtype: float64


In [32]:
solvers = 'lbfgs'
foldnum = 0
fold_results = pd.DataFrame()

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, y, train, test)
    
    LR = linear_model.LogisticRegression(solver=solvers)
    
    
    LR.fit(tr_data, tr_labels.values.ravel())
    prob_arr_LR = LR.predict_proba(te_data)
    score_LR = ndcg_score(te_labels.as_matrix(), prob_arr_LR, k=5)
    
    foldnum+=1
    print(foldnum, "-fold")
    fold_results.loc[foldnum, 'nDCG_LR_lbfgs'] = score_LR
    print(score_LR)

print("5-fold avg nDCG:",fold_results.mean())

1 -fold
0.7663364944587071
2 -fold
0.7981556051868107
3 -fold
0.8179344185356227
4 -fold
0.8287542752381806
5 -fold
0.8216857666022349
5-fold avg nDCG: nDCG_LR_lbfgs    0.806573
dtype: float64
