# Kaggle 2016 AirBnB Recruiting Competition: New User Bookings

Lorem ipsum delorum...

In [47]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

np.random.seed(0)

## Load data into dataframes

In [65]:
# Load main data files
train_df = pd.read_csv('data/train_users.csv')
test_df = pd.read_csv('data/test_users.csv')

# Shuffle training data
train_df = train_df.reindex(np.random.permutation(train_df.index))

# Split out training data labels and create a validation set
train_labels = train_df.ix[:, 'country_destination'].values
train_df = train_df.drop(['country_destination'], axis=1)

split_n = int(math.floor(len(train_df) * 0.9))
train_df, validation_df = train_df[:split_n], train_df[split_n:]
train_labels, validation_labels = train_labels[:split_n], train_labels[split_n:]

# Test ids
test_ids = test_df['id']

# Load sample submission file
submission_df = pd.read_csv('data/sample_submission_NDF.csv')

In [66]:
# Load additional 
demographic_df = pd.read_csv('data/age_gender_bkts.csv')
session_df = pd.read_csv('data/sessions.csv')
country_df = pd.read_csv('data/countries.csv')

## Data Exploration

In [67]:
train_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
110400,9m8z2ybvf8,2013-09-23,20130923213711,2014-05-17,FEMALE,28.0,basic,0,en,direct,direct,linked,Web,Windows Desktop,Firefox
110856,0ivw33qioo,2013-09-24,20130924204815,,MALE,40.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
181021,ulllrr16ly,2014-04-27,20140427024631,,FEMALE,47.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Firefox
169036,n2b04swsgm,2014-03-29,20140329012818,,-unknown-,,basic,23,en,direct,direct,untracked,Android,Other/Unknown,-unknown-
30990,e9viyhzbkk,2012-07-08,20120708201555,,-unknown-,,basic,3,en,other,craigslist,tracked-other,Web,Mac Desktop,Safari


In [68]:
train_df.describe()

Unnamed: 0,timestamp_first_active,age,signup_flow
count,192105.0,112993.0,192105.0
mean,20130850000000.0,49.788491,3.270763
std,9248814000.0,156.470615,7.642851
min,20090320000000.0,1.0,0.0
25%,20121230000000.0,28.0,0.0
50%,20130910000000.0,34.0,0.0
75%,20140310000000.0,43.0,0.0
max,20140630000000.0,2014.0,25.0


In [69]:
submission_df.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,NDF


In [70]:
demographic_df.head()

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
0,100+,AU,male,1,2015
1,95-99,AU,male,9,2015
2,90-94,AU,male,47,2015
3,85-89,AU,male,118,2015
4,80-84,AU,male,199,2015


In [71]:
session_df.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753
2,d1mm9tcy42,lookup,,,Windows Desktop,301
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141
4,d1mm9tcy42,lookup,,,Windows Desktop,435


In [72]:
country_df.head()

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801,fra,92.06


## Feature Engineering

In [73]:
# Removing id and date_first_booking
train_df = train_df.drop(['id', 'date_first_booking'], axis=1)
test_df = test_df.drop(['id', 'date_first_booking'], axis=1)
validation_df = validation_df.drop(['id', 'date_first_booking'], axis=1)
    
# Fill NaN
train_df = train_df.fillna(-1)
test_df = test_df.fillna(-1)
validation_df = validation_df.fillna(-1)

In [74]:
# Date account created
for dataset in (train_df, test_df, validation_df):
    dac = np.vstack(dataset.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
    dataset['dac_year'] = dac[:,0]
    dataset['dac_month'] = dac[:,1]
    dataset['dac_day'] = dac[:,2]
    
train_df = train_df.drop(['date_account_created'], axis=1)
test_df = test_df.drop(['date_account_created'], axis=1)
validation_df = validation_df.drop(['date_account_created'], axis=1)

In [75]:
# Timestamp first active
for dataset in (train_df, test_df, validation_df):
    tfa = np.vstack(dataset.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
    dataset['tfa_year'] = tfa[:,0]
    dataset['tfa_month'] = tfa[:,1]
    dataset['tfa_day'] = tfa[:,2]
    
train_df = train_df.drop(['timestamp_first_active'], axis=1)
test_df = test_df.drop(['timestamp_first_active'], axis=1)
validation_df = validation_df.drop(['timestamp_first_active'], axis=1)

In [76]:
# Age
for dataset in (train_df, test_df, validation_df):
    av = dataset.age.values
    dataset['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

In [77]:
# One-hot-encoding features
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    train_dummy = pd.get_dummies(train_df[f], prefix=f)
    train_df = train_df.drop([f], axis=1)
    train_df = pd.concat((train_df, train_dummy), axis=1)
    validation_dummy = pd.get_dummies(validation_df[f], prefix=f)
    validation_df = validation_df.drop([f], axis=1)
    validation_df = pd.concat((validation_df, validation_dummy), axis=1)
    test_dummy = pd.get_dummies(test_df[f], prefix=f)
    test_df = test_df.drop([f], axis=1)
    test_df = pd.concat((test_df, test_dummy), axis=1)

In [78]:
# Finalization training, validation, and testing data
X_train = train_df.values
X_val = validation_df.values
X_test = test_df.values

le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_val = le.fit_transform(validation_labels)

## Modeling

In [62]:
# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)                  
xgb.fit(X_train, y_train)
y_pred = xgb.predict_proba(X_test)

In [79]:
# Taking the 5 classes with highest probabilities
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(test_ids)):
    idx = test_ids[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

In [80]:
# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('./data/output/submission.csv',index=False)

IOError: [Errno 2] No such file or directory: './data/output/submission.csv'