# Kaggle 2016 AirBnB Recruiting Competition: New User Bookings

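This notebook builds a baseline for the competition: it loads the user data, engineers date, age and one-hot categorical features, trains an XGBoost classifier, and writes a submission listing the five most likely destination countries for each test user.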

In [4]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Pin NumPy's global RNG so the shuffle of the training rows is repeatable.
np.random.seed(seed=0)

## Load data into dataframes

In [10]:
# Load the main train / test user tables
train_df = pd.read_csv('data/train_users.csv')
test_df = pd.read_csv('data/test_users.csv')

# Shuffle the training rows before carving off a validation slice
# (np.random is seeded in the first cell, so the order is reproducible)
train_df = train_df.reindex(np.random.permutation(train_df.index))

# Pull the target column out into a plain NumPy array. Plain column selection
# is used instead of the `.ix` indexer, which was removed in pandas 1.0.
train_labels = train_df['country_destination'].values
train_df = train_df.drop(['country_destination'], axis=1)

# Keep the first 90% of the shuffled rows for training and hold out the
# remaining rows (and their labels) as a validation set
split_n = int(math.floor(len(train_df) * 0.9))
train_df, validation_df = train_df[:split_n], train_df[split_n:]
train_labels, validation_labels = train_labels[:split_n], train_labels[split_n:]

# Load sample submission file
submission_df = pd.read_csv('data/sample_submission_NDF.csv')

In [None]:
# Load the additional data files: countries, age/gender buckets and sessions
country_df = pd.read_csv('data/countries.csv')
demographic_df = pd.read_csv('data/age_gender_bkts.csv')
session_df = pd.read_csv('data/sessions.csv')

## Data Exploration

In [None]:
# Preview the first five rows of the training features
train_df.head(n=5)

In [None]:
# train_labels is a NumPy array (it was taken from `.values`), which has no
# .head() method; slice it to preview the first five labels instead
train_labels[:5]

In [None]:
# Preview the first five rows of the sample submission
submission_df.head(n=5)

In [None]:
# Preview the first five rows of the age/gender bucket table
demographic_df.head(n=5)

## Feature Engineering

In [None]:
# Stack the train and test rows so both receive identical engineered columns.
# The cells below rely on `labels`, `piv_train` and `id_test`, so they are
# set up here together with `df_all`.
# NOTE: validation_df is not part of df_all and is therefore not transformed.
labels = train_labels
id_test = test_df['id']          # captured before the id column is dropped
piv_train = train_df.shape[0]    # df_all rows [0, piv_train) are training rows
df_all = pd.concat((train_df, test_df), axis=0, ignore_index=True)

# Remove id and date_first_booking
df_all = df_all.drop(['id', 'date_first_booking'], axis=1)
# Fill missing values with a -1 sentinel
df_all = df_all.fillna(-1)

##### Feature engineering #####
# date_account_created: 'YYYY-MM-DD' -> separate year / month / day columns
dac = pd.to_datetime(df_all['date_account_created'].astype(str), format='%Y-%m-%d')
df_all['dac_year'] = dac.dt.year
df_all['dac_month'] = dac.dt.month
df_all['dac_day'] = dac.dt.day
df_all = df_all.drop(['date_account_created'], axis=1)

# timestamp_first_active: 'YYYYMMDDhhmmss' -> year / month / day columns
# (only the date part is kept as features)
tfa = pd.to_datetime(df_all['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
df_all['tfa_year'] = tfa.dt.year
df_all['tfa_month'] = tfa.dt.month
df_all['tfa_day'] = tfa.dt.day
df_all = df_all.drop(['timestamp_first_active'], axis=1)

# Age: values outside 14..100 are replaced by the same -1 sentinel that
# marks missing values
age = df_all['age'].values
df_all['age'] = np.where((age < 14) | (age > 100), -1, age)

# One-hot-encoding features
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
# Cast the dummies to uint8: pandas 2.0+ returns bool dummies by default, and
# bool columns mixed with numeric ones would turn df_all.values into an
# object array.
dummies = [pd.get_dummies(df_all[f], prefix=f).astype(np.uint8) for f in ohe_feats]
# Concatenate once: remaining columns first, then each feature's dummies in
# ohe_feats order
df_all = pd.concat([df_all.drop(ohe_feats, axis=1)] + dummies, axis=1)

In [None]:
#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

## Modeling

In [None]:
# Classifier: fit on the training matrix, then score the test matrix
xgb = XGBClassifier(
    max_depth=6,
    learning_rate=0.25,
    n_estimators=43,
    objective='multi:softprob',
    subsample=0.6,
    colsample_bytree=0.6,
    seed=0,
)
xgb.fit(X, y)

# Class probabilities for every test row
y_pred = xgb.predict_proba(X_test)

In [None]:
# Take the 5 classes with the highest predicted probability for every test row.
# The ranking is computed for the whole matrix at once and the labels are
# decoded with a single inverse_transform call instead of one call per row.
# Assumes y_pred has exactly one row per entry of id_test.
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]  # most probable class first

# Each id is repeated five times so it lines up with its five ranked countries.
# Iterating id_test walks its values in order, regardless of its index labels.
ids = [uid for uid in id_test for _ in range(5)]   # list of ids
cts = le.inverse_transform(top5.ravel()).tolist()  # list of countries

In [None]:
# Generate submission: one (id, country) row per ranked guess, written
# without the DataFrame index
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)