# Multiclass classification of Expedia hotels: application of Gradient Boosting Machines

Goal:

to predict 3 classes being

0 - no action

1 - click

2 - booking
        

1) Import all the necessary libraries:

In [33]:
import pandas as pd
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score, StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, log_loss

2) Define multiclass labels:

In [4]:
def multi_class(row):
    click = row['click_bool']
    book = row['booking_bool']
    if int(book) == 1:
        return 2
    elif int(book) == 0 and int(click) == 1:
        return 1
    elif int(book) == 0 and int(click) == 0:
        return 0

3) Extract balanced sample from train.csv:

In [5]:
data = pd.read_csv('data/train.csv')

# Click + booking samples
part_1 = data[data['click_bool'] == 1]

# No action samples
select_rows = int(200000)
part_2 = data[data['click_bool'] == 0].iloc[:select_rows, :]

result = pd.concat([part_1, part_2])
result['class'] = result.apply(multi_class, axis=1)
result.to_csv('data/sample_1.csv', index=False)

Check the new data:

In [7]:
result.shape

(643672, 55)

In [8]:
result.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,class
12,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,,,,0.0,0.0,11.0,1,114.29,1,2
59,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,,,,,,,1,,0,1
63,6,2013-06-05 12:27:51,14,100,,,100,104251,3,4.0,...,,,,,,,1,162.38,1,2
68,8,2013-03-20 17:50:44,5,219,,,219,27669,3,3.5,...,,,,0.0,0.0,,1,96.41,1,2
90,11,2013-02-25 08:39:33,5,219,,,219,20499,2,3.5,...,,,,0.0,0.0,,1,,0,1


4) Load balanced data: target and features. Drop unnecessary variables to form a valid feature set:

In [9]:
train = pd.read_csv('data/sample_1.csv')
target = np.ravel(train['class'].values)
train = train.drop(['date_time', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'class'], axis=1)

5) Impute missing values with a negative value:

In [10]:
features = train.fillna(-100).values

6) Split data into train and test with test being 30% of the train:

In [11]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, 
                                                                     target, test_size = 0.3, random_state = 3456)

In [12]:
X_train.shape, y_train.shape
X_test.shape, y_test.shape

((193102, 50), (193102,))

7) Fit Gradient Boosting Machines with default settings:

In [15]:
clf = GradientBoostingClassifier(random_state=3421)
clf.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=3421, subsample=1.0, verbose=0,
              warm_start=False)

8) Compute predictions:

In [21]:
predict_train = clf.predict_proba(X_train)  
predict_test = clf.predict_proba(X_test)

8) Evaluate on train vs on test using logloss:

In [24]:
print log_loss(y_train, predict_train)
print log_loss(y_test, predict_test)

0.311835821174
0.315164311592


9) Use Cross Validation to see the robustness of the approach:

In [39]:
cv = cross_validation.StratifiedShuffleSplit(y=target, n_iter=3, test_size=0.3, random_state=3456)
cross_validation.cross_val_score(clf, features, target, cv=cv)
print -scores

[ 3.59210471  1.11607273  2.70271332]
