In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost.sklearn import XGBClassifier
from datetime import datetime, timedelta, date
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Load the raw train and test user tables.
train_users = pd.read_csv('./input/train_users_2.csv')
test_users = pd.read_csv('./input/test_users.csv')

# Separate the target from the attributes by column *name* rather than by
# position, so the split does not silently depend on the CSV column order.
train_users_labels = train_users['country_destination']
train_users_attrs = train_users.drop('country_destination', axis=1)
train_users = train_users_attrs

In [3]:
# Encode the destination-country labels as integers; `le_y` is kept so the
# predicted class indices can be mapped back to country codes later.
le_y = LabelEncoder().fit(train_users_labels)
y = le_y.transform(train_users_labels)
print(y)

[ 7  7 10 ...  7  7  7]


In [4]:
# Stack train and test rows (train first) so that feature engineering and
# label encoding are applied identically to both.
df_all = pd.concat([train_users, test_users], axis=0, ignore_index=True)
print('Concat dataframe shape:', df_all.shape)

# The booking date is not used as a feature; drop it.
df_all = df_all.drop(columns=['date_first_booking'])

Concat dataframe shape: (275547, 15)


## Feature Engineering

In [5]:
# Split `timestamp_first_active` into a date part (first 8 characters,
# parsed as %Y%m%d) and a time-of-day part (remaining characters, %H%M%S).
df_all['datehour_timestamp_first_active'] = df_all['timestamp_first_active'].astype(str)
first_active_str = df_all['datehour_timestamp_first_active']

# Vectorised parsing replaces the per-row datetime.strptime loops.
df_all['date_timestamp_first_active'] = pd.to_datetime(first_active_str.str[:8], format='%Y%m%d')
# Only the clock time is parsed here; the date component of these values is
# a placeholder and is never used downstream (only `.hour` is read).
df_all['hour_timestamp_first_active'] = pd.to_datetime(first_active_str.str[8:], format='%H%M%S')

# Abbreviated weekday name (e.g. 'Thu') of the first-active date.
df_all['weekday_first_active'] = df_all['date_timestamp_first_active'].dt.strftime('%a')

In [6]:
# create d_ac-fa: number of days between account creation and first access
# pd.to_datetime is a no-op on an already-parsed column, so this cell can be
# re-run safely (the former strptime loop failed on the second run).
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')
account_to_active = df_all['date_account_created'] - df_all['date_timestamp_first_active']
df_all['d_ac-fa'] = account_to_active / np.timedelta64(1, 'D')

# create mo_first_active: month of the first active
# create ho_first_active: hour of day of the first active
# Missing timestamps map to 0, as before.
df_all['mo_first_active'] = df_all['date_timestamp_first_active'].dt.month.fillna(0).astype('int64')
df_all['ho_first_active'] = df_all['hour_timestamp_first_active'].dt.hour.fillna(0).astype('int64')

In [7]:
# Day-of-month features (missing dates map to 0):
#   d_ac : day of the month of account creation
#   d_fa : day of the month of first access
df_all['d_ac'] = df_all['date_account_created'].dt.day.fillna(0).astype('int64')
df_all['d_fa'] = df_all['date_timestamp_first_active'].dt.day.fillna(0).astype('int64')

In [8]:
from workalendar.usa import UnitedStates

# Destination code -> holiday calendar. The string 'None' marks destinations
# for which no calendar is configured; only 'US' has one.
country_dict = {
    code: 'None'
    for code in ('NDF', 'US', 'other', 'FR', 'IT', 'GB', 'ES', 'CA', 'DE', 'NL', 'AU', 'PT')
}
country_dict['US'] = UnitedStates()

def daterange(start_date, end_date):
    """Yield consecutive days from `start_date` up to, but excluding, `end_date`.

    The number of items equals the whole-day difference between the two
    arguments; nothing is yielded when that difference is zero or negative.
    """
    span_days = int((end_date - start_date).days)
    yield from (start_date + timedelta(days=offset) for offset in range(span_days))
        
        
# For every destination that has a holiday calendar, add a feature with the
# number of days from the user's first-active date to the next holiday.
for c in country_dict:
    cal = country_dict[c]
    if cal == 'None':
        continue

    # Collect the holidays of 2008-2015 into a fresh list, so the list objects
    # returned by holidays() are never extended in place.
    hol = []
    for year in range(2008, 2016):
        hol.extend(cal.holidays(year))

    # Each holiday entry is indexed with [0] to obtain its date.
    hol = pd.Series([h[0] for h in hol])
    print(hol)

    # date -> (days to the next holiday, days to the holiday after that)
    hol_dict = {}
    # daterange() excludes its end date, so stop at 2015-01-01 to make sure
    # 2014-12-31 is covered too (previously it was missing from the lookup).
    for d in daterange(date(2009, 1, 1), date(2015, 1, 1)):
        delta_list = [(h - d).days for h in hol]
        next_holiday_delta = min(n for n in delta_list if n >= 0)
        nex2_holiday_delta = min(n for n in delta_list if n > next_holiday_delta)
        hol_dict[d] = (next_holiday_delta, nex2_holiday_delta)

    column_name = 'days_to_next_' + c + '_hol'
    df_all[column_name] = [hol_dict[d.date()][0] for d in df_all['date_timestamp_first_active']]

0     2008-01-01
1     2008-01-21
2     2008-02-18
3     2008-05-26
4     2008-07-04
         ...    
84    2015-09-07
85    2015-10-12
86    2015-11-11
87    2015-11-26
88    2015-12-25
Length: 89, dtype: object


In [9]:
# Raw / intermediate date columns that have been replaced by derived features.
remove = ['date_account_created',
          'timestamp_first_active',
          'datehour_timestamp_first_active',
          'date_timestamp_first_active',
          'hour_timestamp_first_active']

# Filter instead of calling list.remove(): the cell then stays re-runnable
# after the columns have already been dropped (list.remove raised ValueError).
features = [col for col in df_all.columns if col not in remove]

df_all = df_all[features]

In [10]:
# Missing ages get the sentinel -999. Assign the result back instead of
# calling fillna(inplace=True) on a column selection, which may operate on a
# temporary copy and leave df_all unchanged.
df_all['age'] = df_all['age'].fillna(-999)

# Remove age outliers: anything below 14 or above 104 is treated as missing.
av = df_all.age.values
df_all['age'] = np.where((av < 14) | (av > 104), -999, av)

In [11]:
# Inspect the combined train+test frame after feature engineering and age clean-up.
df_all

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,weekday_first_active,d_ac-fa,mo_first_active,ho_first_active,d_ac,d_fa,days_to_next_US_hol
0,gxn3p5htnn,-unknown-,-999.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,Thu,466.0,3,4,28,19,67
1,820tgsjxq7,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,Sat,732.0,5,17,25,23,2
2,4ft3gnwmtx,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,Tue,476.0,6,23,28,9,24
3,bjjt8pjhuk,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,Sat,765.0,10,6,5,31,11
4,87mebub9p4,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,Tue,280.0,12,6,14,8,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,-unknown-,31.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,IE,Tue,0.0,9,23,30,30,13
275543,zp8xfonng8,-unknown-,-999.0,basic,23,ko,direct,direct,untracked,Android,Android Phone,-unknown-,Tue,0.0,9,23,30,30,13
275544,fa6260ziny,-unknown-,-999.0,basic,0,de,direct,direct,linked,Web,Windows Desktop,Firefox,Tue,0.0,9,23,30,30,13
275545,87k0fy4ugm,-unknown-,-999.0,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,Tue,0.0,9,23,30,30,13


In [12]:
# Impute missing first_affiliate_tracked values with the most frequent
# category ('untracked' in the recorded run).
most_frequent_tracked = df_all['first_affiliate_tracked'].value_counts().index[0]
print(most_frequent_tracked)
# Assign back rather than fillna(inplace=True) on a column selection, which
# may act on a temporary copy.
df_all['first_affiliate_tracked'] = df_all['first_affiliate_tracked'].fillna(most_frequent_tracked)

untracked


In [13]:
# Columns with object dtype need label encoding; `id` is an identifier, not a
# feature, so it is excluded.
to_le = [name for name, kind in df_all.dtypes.items() if kind == 'O']
to_le.remove('id')
to_le

['gender',
 'signup_method',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'first_affiliate_tracked',
 'signup_app',
 'first_device_type',
 'first_browser',
 'weekday_first_active']

In [14]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Replace every categorical column by integer codes. The encoder is re-fitted
# per column; its classes and their codes are printed for reference.
le = LabelEncoder()
for column in to_le:
    df_all[column] = le.fit_transform(df_all[column])
    print(le.classes_)
    print(le.transform(le.classes_))

['-unknown-' 'FEMALE' 'MALE' 'OTHER']
[0 1 2 3]
['basic' 'facebook' 'google' 'weibo']
[0 1 2 3]
['-unknown-' 'ca' 'cs' 'da' 'de' 'el' 'en' 'es' 'fi' 'fr' 'hr' 'hu' 'id'
 'is' 'it' 'ja' 'ko' 'nl' 'no' 'pl' 'pt' 'ru' 'sv' 'th' 'tr' 'zh']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
['api' 'content' 'direct' 'other' 'remarketing' 'sem-brand'
 'sem-non-brand' 'seo']
[0 1 2 3 4 5 6 7]
['baidu' 'bing' 'craigslist' 'daum' 'direct' 'email-marketing' 'facebook'
 'facebook-open-graph' 'google' 'gsp' 'meetup' 'naver' 'other' 'padmapper'
 'vast' 'wayn' 'yahoo' 'yandex']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
['linked' 'local ops' 'marketing' 'omg' 'product' 'tracked-other'
 'untracked']
[0 1 2 3 4 5 6]
['Android' 'Moweb' 'Web' 'iOS']
[0 1 2 3]
['Android Phone' 'Android Tablet' 'Desktop (Other)' 'Mac Desktop'
 'Other/Unknown' 'SmartPhone (Other)' 'Windows Desktop' 'iPad' 'iPhone']
[0 1 2 3 4 5 6 7 8]
['-unknown-' 'AOL Explorer' 'Android Browser' 

## Save the final preprocessed CSV

In [16]:
## save file with id
# Persist the encoded train+test frame (the `id` column is kept) so the merge
# step further down can reload it; index=False leaves the row index out.
df_all.to_csv('./input/TrainTest_Final_Process.csv', index=False)

## Evaluation

In [18]:
# Reference Kaggle
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain of the `k` highest-scored items.

    Parameters
    ----------
    y_true : array-like
        Relevance of every item (here a one-hot row marking the true class).
    y_score : array-like
        Predicted score of every item; higher means ranked earlier.
    k : int, default 5
        Number of top-ranked items taken into account.

    Returns
    -------
    float
        Sum over the top-`k` positions of (2**relevance - 1) / log2(rank + 1),
        with rank starting at 1.
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    # Position i (0-based) is discounted by log2(i + 2).
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(te_labels, predict, k):
    """Mean normalised DCG@k over all samples.

    Parameters
    ----------
    te_labels : array-like
        Integer class label of every sample; 1-D or a single column.
    predict : array-like of shape (n_samples, n_classes)
        Predicted score/probability of every class for every sample.
    k : int
        Number of top-ranked classes taken into account.

    Returns
    -------
    float
        Average of DCG@k / ideal DCG@k across samples. A sample whose label
        is not among the columns of `predict` contributes 0.
    """
    predict = np.asarray(predict)
    labels = np.asarray(te_labels).ravel()

    # One-hot encode against the number of *classes* (columns of `predict`).
    # The width used to be n_samples + 1, which allocated an O(n_samples**2)
    # matrix and mis-encoded labels when there were fewer samples than classes.
    n_classes = predict.shape[1]
    T = (labels[:, None] == np.arange(n_classes)).astype(int)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predict):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        if best == 0:
            # Avoid division by zero for rows without any relevant class.
            best = 0.000000001
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)


# NDCG Scorer function
# Wraps ndcg_score with k=5 via make_scorer so it can be handed to
# scikit-learn model-selection utilities.
# NOTE(review): newer scikit-learn releases reportedly replace `needs_proba`
# with `response_method` — confirm against the installed version.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [19]:
def folds_to_split(data,targets,train,test):
    """Slice features and targets into the train/test parts of one CV fold.

    `data` and `targets` are wrapped in DataFrames and indexed positionally
    with the `train` and `test` index arrays.

    Returns a list: [train features, test features, train targets, test targets].
    """
    feature_frame = pd.DataFrame(data)
    target_frame = pd.DataFrame(targets)
    return [
        feature_frame.iloc[train],
        feature_frame.iloc[test],
        target_frame.iloc[train],
        target_frame.iloc[test],
    ]

## Merge data with Session data

In [20]:
# Load the per-user session features and align the key column name with the
# user table (`user_id` -> `id`).
df_s = pd.read_csv('./session_featrues_500.csv').rename(columns={'user_id': 'id'})
df_s.shape

(135483, 504)

In [21]:
# Reload the preprocessed users and attach the session features. A left join
# keeps every user; users without session data get NaN in the session columns.
df_all = pd.read_csv('./input/TrainTest_Final_Process.csv')
df_merge = pd.merge(df_all, df_s, how = 'left', on = 'id')

# The train/test split further down is purely positional, so the merge must
# not add rows (which duplicate ids in the session table would cause).
assert len(df_merge) == len(df_all), "merge changed the number of user rows"

df_merge.head(5)

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,490,491,492,493,494,495,496,497,498,499
0,gxn3p5htnn,0,-999.0,1,0,6,2,4,6,2,...,,,,,,,,,,
1,820tgsjxq7,2,38.0,1,0,6,7,8,6,2,...,,,,,,,,,,
2,4ft3gnwmtx,1,56.0,0,3,6,2,4,6,2,...,,,,,,,,,,
3,bjjt8pjhuk,1,42.0,1,0,6,2,4,6,2,...,,,,,,,,,,
4,87mebub9p4,0,41.0,0,0,6,2,4,6,2,...,,,,,,,,,,


In [22]:
# The identifier is not a feature: drop it, then replace every remaining NaN
# (e.g. users without session data) with the sentinel -999.
df_merge = df_merge.drop(columns=['id']).fillna(-999)

In [23]:
# Preview the first five rows of the final feature matrix.
df_merge.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,...,490,491,492,493,494,495,496,497,498,499
0,0,-999.0,1,0,6,2,4,6,2,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,2,38.0,1,0,6,7,8,6,2,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,1,56.0,0,3,6,2,4,6,2,6,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,1,42.0,1,0,6,2,4,6,2,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,0,41.0,0,0,6,2,4,6,2,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [24]:
# The first len(labels) rows of the merged frame are the training users
# (train rows were concatenated first); the remainder is the test set.
labels = pd.read_csv('./input/label.csv')
Numtrain = len(labels)
vals = df_merge.values
X_train, X_test = vals[:Numtrain], vals[Numtrain:]
print('X train shape', X_train.shape)
print('X test shape', X_test.shape)

X train shape (213451, 521)
X test shape (62096, 521)


In [26]:
# 5-fold CV on contiguous, unshuffled folds. `random_state` is left out on
# purpose: it has no effect without shuffle=True, and recent scikit-learn
# versions raise an error for that combination.
kf = KFold(n_splits=5)

# Small grid search over n_estimators, subsample and learning_rate; every
# configuration is scored with NDCG@5 on each fold.
for n_estimators in [20, 25, 30]:
    for subsample in [0.75, 0.5]:
        for learning_rate in [0.15, 0.2]:

            ndcg_scores = []
            for train, test in kf.split(X_train):
                [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

                clf = XGBClassifier(max_depth=6,
                                    learning_rate=learning_rate,
                                    n_estimators=n_estimators,
                                    reg_lambda=10,
                                    objective='multi:softprob',
                                    subsample=subsample,
                                    colsample_bytree=0.5,
                                    seed=0)

                clf.fit(tr_data, tr_labels.values.ravel())

                prob_arr_XGB = clf.predict_proba(te_data)
                # `.values` replaces DataFrame.as_matrix(), which no longer
                # exists in current pandas; ravel() passes a flat label vector.
                score_XGB = ndcg_score(te_labels.values.ravel(), prob_arr_XGB, k=5)

                ndcg_scores.append(score_XGB)

            print("learning rate: %f  n_estimators: %d subsample: %f " % (learning_rate, n_estimators, subsample))
            print("nDCG scores: ", [ "%.4f" % s for s in ndcg_scores])
            print("mean: ", np.mean(ndcg_scores))
            print("******************************************************************************")

learning rate: 0.150000  n_estimators: 20 subsample: 0.750000 
nDCG scores:  ['0.7816', '0.8141', '0.8350', '0.8577', '0.8509']
mean:  0.8278540065488726
******************************************************************************
learning rate: 0.200000  n_estimators: 20 subsample: 0.750000 
nDCG scores:  ['0.7841', '0.8146', '0.8356', '0.8588', '0.8521']
mean:  0.8290439493145195
******************************************************************************
learning rate: 0.150000  n_estimators: 20 subsample: 0.500000 
nDCG scores:  ['0.7818', '0.8140', '0.8345', '0.8575', '0.8503']
mean:  0.827608249574147
******************************************************************************
learning rate: 0.200000  n_estimators: 20 subsample: 0.500000 
nDCG scores:  ['0.7846', '0.8143', '0.8348', '0.8582', '0.8521']
mean:  0.828797100513517
******************************************************************************
learning rate: 0.150000  n_estimators: 25 subsample: 0.750000 
nDCG sc

## Choose the highest-scoring setting to produce the test submission file.

In [27]:
# Final model with the selected hyper-parameters. reg_lambda=10 is included
# so the submitted model matches the configuration that was cross-validated
# above (it was missing here, so the default regularisation would be used).
clf = XGBClassifier(max_depth=6, learning_rate=0.2,
                    n_estimators=30,
                    reg_lambda=10,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)

# Fit on the full training set (labels as a flat vector, as in the CV loop)
# and predict class probabilities for the test users.
clf.fit(X_train, labels.values.ravel())
y_pred = clf.predict_proba(X_test)

In [28]:
# Reload the test users to recover their ids in the original row order.
df_test = pd.read_csv('./input/test_users.csv')
id_test = df_test.loc[:, 'id']

In [29]:
# For every test user keep the 5 classes with the highest predicted
# probability (best first), decoded back to country codes.
ids = []  # user id, repeated once per predicted country
cts = []  # predicted countries, aligned with `ids`
for row, user_id in enumerate(id_test):
    ranked_classes = np.argsort(y_pred[row])[::-1]
    ids.extend([user_id] * 5)
    cts.extend(le_y.inverse_transform(ranked_classes)[:5].tolist())

In [30]:
# Assemble the (id, country) pairs and write them to a time-stamped CSV.
submission_rows = np.column_stack((ids, cts))
sub = pd.DataFrame(submission_rows, columns=['id', 'country'])
timestamp = datetime.now().strftime("%d%m%y_%H%M%S")
f = 'sub.' + timestamp + '.csv'
sub.to_csv(f, index=False)