# Airbnb New User Booking Predictions

In [43]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
users = pd.read_csv('../data/users_sessions.csv')

In [45]:
users.head()

Unnamed: 0,date_account_created,year_created,month_created,timestamp_first_active,active_created_duration,gender,age,age_bucket,signup_method,signup_flow,...,iPodtouch,view,click,data,submit,booking_request,click_through_rate,cvr1,cvr2,country_destination
0,2010-06-28,2010,6,2009-03-19,466,,34.0,30-35,facebook,0,...,,,,,,,,,,NDF
1,2011-05-25,2011,5,2009-05-23,732,Male,38.0,35-40,facebook,0,...,,,,,,,,,,NDF
2,2010-09-28,2010,9,2009-06-09,476,Female,56.0,55-60,basic,3,...,,,,,,,,,,US
3,2011-12-05,2011,12,2009-10-31,765,Female,42.0,40-45,facebook,0,...,,,,,,,,,,other
4,2010-09-14,2010,9,2009-12-08,280,,41.0,40-45,basic,0,...,,,,,,,,,,US


### Preprocessing the data

In [46]:
df = users[['year_created', 
            'month_created', 
            'gender', 
            'age_bucket', 
            'signup_method',
            'first_affiliate_tracked',
            'affiliate_channel', 
            'affiliate_provider', 
            'signup_app', 
            'first_device_type',
            'first_browser',
            'Android Phone', 
            'Blackberry', 'Chromebook',
            'Linux Desktop', 
            'Mac Desktop', 
            'Opera Phone', 
            'Tablet',
            'Windows Desktop', 
            'Windows Phone', 
            'iPad Tablet', 
            'iPhone',
            'iPodtouch', 
            'view', 
            'click', 
            'data', 
            'submit', 
            'booking_request',
            'click_through_rate', 
            'cvr1', 
            'cvr2',
            'country_destination']]

In [47]:
df.head()

Unnamed: 0,year_created,month_created,gender,age_bucket,signup_method,first_affiliate_tracked,affiliate_channel,affiliate_provider,signup_app,first_device_type,...,iPodtouch,view,click,data,submit,booking_request,click_through_rate,cvr1,cvr2,country_destination
0,2010,6,,30-35,facebook,untracked,direct,direct,Web,Mac Desktop,...,,,,,,,,,,NDF
1,2011,5,Male,35-40,facebook,untracked,seo,google,Web,Mac Desktop,...,,,,,,,,,,NDF
2,2010,9,Female,55-60,basic,untracked,direct,direct,Web,Windows Desktop,...,,,,,,,,,,US
3,2011,12,Female,40-45,facebook,untracked,direct,direct,Web,Mac Desktop,...,,,,,,,,,,other
4,2010,9,,40-45,basic,untracked,direct,direct,Web,Mac Desktop,...,,,,,,,,,,US


In [48]:
# Import preprocessing tools from sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

X = df.loc[:, df.columns != 'country_destination']

y = df['country_destination']

In [49]:
# One-hot encoding of categorical features using pandas dummies method
to_be_encoded = ['year_created',
                 'month_created',
                 'gender',
                 'age_bucket',
                 'signup_method',
                 'affiliate_channel', 
                 'affiliate_provider',
                 'first_affiliate_tracked',
                 'signup_app', 
                 'first_device_type',
                 'first_browser']

for feature in to_be_encoded:
    X_encoded = pd.get_dummies(X[feature], prefix= feature)
    X = X.drop([feature], axis= 1)
    X = pd.concat((X, X_encoded), axis= 1)
    
X.head()

Unnamed: 0,Android Phone,Blackberry,Chromebook,Linux Desktop,Mac Desktop,Opera Phone,Tablet,Windows Desktop,Windows Phone,iPad Tablet,...,first_browser_SeaMonkey,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [50]:
X = X.fillna(-1)
X = X.replace(np.inf, 0)

In [51]:
# Encoding the target variable
le = LabelEncoder()
y = le.fit_transform(y)
y

array([ 7,  7, 10, ...,  7,  7,  7])

#### Select best features

In [103]:
from sklearn.feature_selection import SelectKBest, chi2

X_new = SelectKBest(chi2, k= 20).fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size= 0.3, random_state= 42)

#### Split data into training-testing sets

In [52]:
# Import cross validation
from sklearn.model_selection import train_test_split

# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42)

#### Scale features (min-max)

In [53]:
# Function to get features scaled b/w 0-1 for models that are sensitive to unscaled data, such as KNN
def get_scaled_values(data):
    
    scaler = MinMaxScaler()
    scaled_vals = scaler.fit_transform(data)
    
    return scaled_vals

In [54]:
Xs_train = get_scaled_values(X_train)
Xs_test = get_scaled_values(X_test)

### Creating evaluation metric function

In [55]:
import math

def ndcg(pred, actu):
    sum = 0
    for i in range(len(pred)):
        idx = np.argsort(pred[i])[::-1].tolist().index(actu[i]) + 1
        if idx <= 5:
            sum += 1/math.log(1+idx, 2)
    return sum/len(pred)

def scoring(model, X_test, y_test):
    return ndcg(model.predict_proba(X_test), y_test)

### Model Building & Evaluation

In [56]:
# Import predictive models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV

# Import classification report
from sklearn.metrics import classification_report, f1_score

#### Naive Bayes Classifier

In [57]:
nb = GaussianNB()

In [58]:
nb.fit(X_train, y_train)

GaussianNB(priors=None)

In [59]:
scoring(nb, X_test, y_test)

0.03573118167622339

In [60]:
f1_score(y_test, nb.predict(X_test), average= 'weighted')

0.011075596431405

#### Logistic Regression

In [61]:
lr = LogisticRegression(penalty='l2', C=0.1,
                            multi_class='ovr',
                            # max_iter=300,
                            solver='lbfgs',
                            n_jobs=-1, 
                            random_state=1)

In [62]:
lr.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
scoring(lr, X_test, y_test)

0.8174608113347719

In [64]:
f1_score(y_test, lr.predict(X_test), average= 'weighted')

  'precision', 'predicted', average, warn_for)


0.5487368621612928

#### Decision Tree Classifier

In [65]:
tree = DecisionTreeClassifier(min_samples_split= 40)

In [66]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=40, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [67]:
scoring(tree, X_test, y_test)

0.7900719602568396

In [68]:
f1_score(y_test, tree.predict(X_test), average= 'weighted')

0.55623410840676779

#### Ensemble - Random Forest Classifier

In [69]:
forest = RandomForestClassifier(random_state= 1, n_estimators= 45, min_samples_split= 3, min_samples_leaf= 2)

In [70]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=45, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [72]:
scoring(forest, X_test, y_test)

0.8198350156079728

In [73]:
f1_score(y_test, forest.predict(X_test), average= 'weighted')

  'precision', 'predicted', average, warn_for)


0.57576029169257481

#### Ensemble - Boosted Trees (XGB)

In [74]:
xgb = XGBClassifier(max_depth= 4, learning_rate= 0.2, n_estimators= 20, 
              objective= 'multi:softprob', subsample= 0.5, colsample_bytree= 0.5,
              seed= 0, nthread= 4)

In [75]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=20, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [76]:
scoring(xgb, X_test, y_test)

0.8220200816218997

In [77]:
f1_score(y_test, xgb.predict(X_test), average= 'weighted')

  'precision', 'predicted', average, warn_for)


0.56162731893556617

#### K-Nearest Neighbors

In [78]:
knn = KNeighborsClassifier(n_neighbors= 5)

In [80]:
from sklearn.feature_selection import SelectKBest, chi2

X_scaled = get_scaled_values(X)
X_new = SelectKBest(chi2, k= 20).fit_transform(X_scaled, y)
Xs_train, Xs_test, y_train, y_test = train_test_split(X_new, y, test_size= 0.3, random_state= 42)

In [81]:
knn.fit(Xs_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [82]:
scoring(knn, Xs_test, y_test)

0.7599588539692758

In [83]:
f1_score(y_test, knn.predict(Xs_test), average= 'weighted')

  'precision', 'predicted', average, warn_for)


0.51791548585367397

#### Final Model using Grid Search

In [84]:
params = {'max_depth': [4, 5, 6],
         'n_estimators': [20, 25, 30],
         'learning_rate': [0.2, 0.3]
         }

clf = GridSearchCV(XGBClassifier(objective= 'multi:softprob', seed= 0, nthread= 4),
                  param_grid= params,
                  n_jobs= 2,
                  scoring= scoring,
                  cv= 3,
                  verbose= 1)

In [85]:
clf.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 75.8min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 91.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'max_depth': [4, 5, 6], 'n_estimators': [20, 25, 30], 'learning_rate': [0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function scoring at 0x11428d158>, verbose=1)

In [86]:
clf.grid_scores_

[mean: 0.82577, std: 0.00009, params: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 20},
 mean: 0.82609, std: 0.00014, params: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 25},
 mean: 0.82616, std: 0.00028, params: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 30},
 mean: 0.82627, std: 0.00019, params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 20},
 mean: 0.82655, std: 0.00027, params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 25},
 mean: 0.82660, std: 0.00014, params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 30},
 mean: 0.82649, std: 0.00018, params: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 20},
 mean: 0.82666, std: 0.00024, params: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 25},
 mean: 0.82677, std: 0.00030, params: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 30},
 mean: 0.82603, std: 0.00039, params: {'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 20},
 mean: 0.82620, std:

In [87]:
clf.best_score_

0.8267723308921581

In [89]:
len(xgb.feature_importances_)

156

In [95]:
top_features = []
for i, col in enumerate(X_train.columns):
   top_features.append((col, xgb.feature_importances_[i]))

top_features = sorted(top_features, key= lambda x: x[1], reverse= True)
top_features[:20]

[('cvr1', 0.045500506),
 ('click_through_rate', 0.044826426),
 ('Mac Desktop', 0.042130098),
 ('click', 0.035389282),
 ('age_bucket_30-35', 0.034715202),
 ('data', 0.032355916),
 ('gender_Female', 0.032018874),
 ('gender_Male', 0.032018874),
 ('submit', 0.031344794),
 ('cvr2', 0.031344794),
 ('signup_method_basic', 0.029322548),
 ('view', 0.027974386),
 ('booking_request', 0.027974386),
 ('iPhone', 0.024941018),
 ('Windows Desktop', 0.023592854),
 ('signup_method_facebook', 0.022581732),
 ('signup_app_Web', 0.020896528),
 ('iPad Tablet', 0.019211324),
 ('year_created_2011', 0.018200202),
 ('first_device_type_Mac Desktop', 0.016514998)]

In [101]:
import operator
ig = xgb.booster().get_score(importance_type='gain')
ig = sorted(ig.items(), key= operator.itemgetter(1), reverse= True)
ig[:20]

[('Blackberry', 153.98508125),
 ('age_bucket_30-35', 78.85397367961167),
 ('cvr2', 60.24663583806453),
 ('signup_method_facebook', 51.943872343283566),
 ('age_bucket_25-30', 46.138335008108086),
 ('year_created_2014', 39.26702261666667),
 ('booking_request', 39.224275791084345),
 ('affiliate_channel_content', 36.69130148148148),
 ('gender_Male', 35.56441898210525),
 ('gender_Female', 34.215170052631564),
 ('signup_method_basic', 28.908176728735647),
 ('first_device_type_Mac Desktop', 27.143512510204086),
 ('first_device_type_Other/Unknown', 19.389172496666667),
 ('year_created_2011', 19.100022537037038),
 ('age_bucket_35-40', 19.043809888888887),
 ('signup_app_Web', 17.410547708064517),
 ('year_created_2013', 13.296360116666666),
 ('first_affiliate_tracked_omg', 12.441151642857148),
 ('age_bucket_45-50', 12.399080777777776),
 ('year_created_2010', 12.164442749999997)]