### Airbnb Recruiting XGBoost Script

In [1]:
# Import Python modules

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import operator

import xgboost as xgb

In [2]:
# Read the raw train / test user tables from the working directory
df_test = pd.read_csv('test_users.csv')
df_train = pd.read_csv('train_users_2.csv')

# Separate the target column from the training features
labels = df_train['country_destination'].values
df_train = df_train.drop(columns=['country_destination'])

# Keep the test ids for the submission file, and the number of training rows
# so the combined frame can be split back apart by position later
id_test = df_test['id']
piv_train = len(df_train)

In [3]:
# Stack train on top of test (fresh 0..n-1 index) so both receive the same
# feature engineering, then:
#   - drop 'id' and 'date_first_booking', which are not used as model features
#   - replace every NaN with the sentinel value -1
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .drop(columns=['id', 'date_first_booking'])
    .fillna(-1)
)

In [4]:
# Split date_account_created on '-' into three integer parts and expose them
# as year / month / day columns; the original string column is then dropped
day_acc = (
    df_all['date_account_created']
    .astype(str)
    .str.split('-', expand=True)
    .astype(int)
    .to_numpy()
)
df_all = df_all.assign(
    dac_year=day_acc[:, 0],
    dac_month=day_acc[:, 1],
    dac_day=day_acc[:, 2],
).drop(columns=['date_account_created'])

In [5]:
# timestamp_first_active is read as a digit string and cut into six integer
# fields of widths 4, 2, 2, 2, 2, 2. Only the first three fields are kept as
# features (year, month, day); the remaining three stay available in `tfa`.
tfa_text = df_all['timestamp_first_active'].astype(str)
tfa_bounds = [(0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14)]
tfa = np.column_stack(
    [tfa_text.str[start:stop].astype(int).to_numpy() for start, stop in tfa_bounds]
)

df_all = df_all.assign(
    tfa_year=tfa[:, 0],
    tfa_month=tfa[:, 1],
    tfa_day=tfa[:, 2],
).drop(columns=['timestamp_first_active'])

In [6]:
# One-hot encode the categorical features.
# A single get_dummies call removes each listed column and appends its
# indicator columns (named '<column>_<value>') after the remaining columns,
# in the order the columns are listed here.
ohe_feats = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]
df_all = pd.get_dummies(df_all, columns=ohe_feats)

In [7]:
## Split the combined frame back into train and test, and encode the targets

# Map the country labels to integer class ids; `le` is kept so predictions
# can be decoded back to country codes later
le = LabelEncoder()
le.fit(labels)
y = le.transform(labels)

# The first piv_train rows are the training users, the rest are test users
X, X_test = df_all.iloc[:piv_train], df_all.iloc[piv_train:]

In [8]:
# Set parameter values for XGBoost and train the first model on all features.
#
# Notes on the parameter dict:
#   - The number of boosting rounds is controlled solely by the
#     `num_boost_round` argument of xgb.train. A 'nround' key is not an
#     XGBoost parameter (it is silently ignored or triggers an "unused
#     parameter" warning), so it is intentionally not listed here.
#   - 'num_class' is derived from the fitted label encoder so it always
#     matches the number of distinct target labels.
#   - 'eval_metric' is only consulted when an `evals` list is passed to
#     xgb.train; 'mlogloss' is the multi-class metric that matches the
#     'multi:softprob' objective (XGBoost's 'ndcg' is a ranking metric).
params = {'eta': 0.1,
          'max_depth': 8,
          'subsample': 0.7,
          'colsample_bytree': 0.8,
          'seed': 1,
          'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          'num_class': len(le.classes_),
          'nthread': 3}

# Number of boosting iterations actually run by xgb.train
num_boost_round = 10

dtrain = xgb.DMatrix(X, label=y)
clf1 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)

In [9]:
# Collect the per-feature scores reported by the first model and tabulate
# them in ascending score order (ties keep the order returned by the booster)
importance = clf1.get_fscore()
ranked_scores = sorted(importance.items(), key=lambda pair: pair[1])
importance_df = pd.DataFrame({
    'feature': [name for name, _ in ranked_scores],
    'fscore': [score for _, score in ranked_scores],
})

In [10]:
# Plot the 20 highest-scoring features. importance_df is sorted ascending by
# fscore, so its last 20 rows are the top 20.
ax = importance_df.iloc[-20:, :].plot(x='feature', y='fscore', kind='barh')
ax.set_title('Top 20 features by fscore')
ax.set_xlabel('fscore')
ax.set_ylabel('feature')

# Only keep features that received a feature score (a minimum fscore could
# also be applied here), then retrain the model on the reduced feature set.
df_all = df_all[importance_df.feature.values]
X = df_all.iloc[:piv_train, :]
X_test = df_all.iloc[piv_train:, :]
dtrain = xgb.DMatrix(X, label=y)
clf2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)

# One row of class probabilities per test user. The column count is taken
# from the class count the model was configured with, rather than repeating
# a hard-coded number that could drift out of sync with params.
y_pred = clf2.predict(xgb.DMatrix(X_test)).reshape(-1, params['num_class'])

In [11]:
# Take the classes with the highest predicted probabilities for each test user.
# This is done with one vectorised argsort and a single inverse_transform call
# instead of a Python loop that decodes labels once per user.
top_k = 5  # number of ranked country guesses written per user

# Per-row class indices ordered by descending probability, truncated to top_k
top_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :top_k]

# Each id is repeated once per guess; positional access (to_numpy) avoids
# depending on id_test having a default 0..n-1 index. top_idx.shape[1] is used
# so ids and countries stay aligned even if there are fewer than top_k classes.
ids = np.repeat(id_test.to_numpy(), top_idx.shape[1]).tolist()  # list of ids

# Row-major ravel keeps each user's guesses contiguous and in rank order
cts = le.inverse_transform(top_idx.ravel()).tolist()  # list of countries

# Generate submission
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('submission.csv', index=False)