### Kaggle Datascience Challenge - AIRBNB NEW USER BOOKING

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
np.random.seed(0)

#Load train and test  data
df_train = pd.read_csv('train_users_2.csv')

df_test = pd.read_csv('test_users.csv')

# Saving target values in to labels
labels = df_train['country_destination'].values

# dropping the target values from dataframe 
df_train = df_train.drop(['country_destination'], axis=1)

id_test = df_test['id']

In [4]:
# View the head 
df_train.head

<bound method DataFrame.head of                 id date_account_created  timestamp_first_active  \
0       gxn3p5htnn           2010-06-28          20090319043255   
1       820tgsjxq7           2011-05-25          20090523174809   
2       4ft3gnwmtx           2010-09-28          20090609231247   
3       bjjt8pjhuk           2011-12-05          20091031060129   
4       87mebub9p4           2010-09-14          20091208061105   
5       osr2jwljor           2010-01-01          20100101215619   
6       lsw9q7uk0j           2010-01-02          20100102012558   
7       0d01nltbrs           2010-01-03          20100103191905   
8       a1vcnhxeij           2010-01-04          20100104004211   
9       6uh8zyj2gn           2010-01-04          20100104023758   
10      yuuqmid2rp           2010-01-04          20100104194251   
11      om1ss59ys8           2010-01-05          20100105051812   
12      k6np330cm1           2010-01-05          20100105060859   
13      dy3rgx56cu           2

In [5]:
piv_train = df_train.shape[0]

# Creating a DataFrame with train+test data : to create feature engineering 
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)


# Removing id and date_first_booking
df_all = df_all.drop(['id', 'date_first_booking'], axis=1)

# Filling nan
df_all = df_all.fillna(-1)

In [6]:
# date_account_created
dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]
df_all = df_all.drop(['date_account_created'], axis=1)

In [7]:
# Timestamp_first_active
tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

In [8]:
#Age
# Override the improper values of age to -1
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)


# One-hot-encoding features
# Dummy Encoding for categorical features, one feature at a time
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

In [9]:
#Splitting train and test
vals = df_all.values

X = vals[:piv_train]

# Encode labels to 0,1,2....n
# Inverse transform gets back the old labels
le = LabelEncoder()
y = le.fit_transform(labels) 

X_test = vals[piv_train:]


In [10]:
X.shape

(213451L, 161L)

In [25]:
# choose one of randomForest or Gradient Boosting
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=150,
                             max_features=25, # num of variables to chose at each split
                            random_state=1,
                            verbose=2)  #controls the verbosity of the output from the model

#njobs=1
rf.fit(X,y)
y_pred = rf.predict_proba(X_test)

building tree 1 of 150
building tree 2 of 150
building tree 3 of 150
building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150
building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150
building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150
building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150
building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
b

[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:   37.4s
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    5.2s finished





In [22]:
# Taking a lot of time,try it at home
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, 
                                     n_estimators=100, subsample=0.9,
                                       max_depth=6, init=None, random_state=None,
                                       max_features=20, verbose=2, max_leaf_nodes=None, warm_start=False, presort='auto')
 
gbm.fit(X,y)
y_pred=gbm.predict_prob(X_test)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1      362622.2617        5730.4351           51.34m
         2      329750.4083        3655.8483           50.93m
         3      306121.2140        2617.3908           49.88m
         4      286913.0288        2126.0226           49.09m
         5      272401.8690        1611.6500           48.41m
         6      260920.1372        1278.9531           47.82m
         7      251194.2546        1066.6782           47.34m
         8      243606.7519         842.6688           46.87m
         9      236862.9282         738.3897           46.38m
        10      231656.2811         576.6103           45.81m
        11      226753.7160         482.9087           45.32m


KeyboardInterrupt: 

In [13]:
print y_pred.shape

(62096L, 12L)


In [26]:
#Taking the 5 classes with highest probabilities
ids = []  #list of ids
countryList = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    countryList += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

In [27]:
# Generating  submission
sub = pd.DataFrame(np.column_stack((ids, countryList)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)