# Multiclass classification of Expedia hotels: application of Gradient Boosting Machines

Goal:

to predict 3 classes being

0 - no action

1 - click

2 - booking
        

1) Import all the necessary libraries:

In [40]:
import pandas as pd
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score, StratifiedShuffleSplit
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, make_scorer

Define multiclass labels:

In [4]:
def multi_class(row):
    click = row['click_bool']
    book = row['booking_bool']
    if int(book) == 1:
        return 2
    elif int(book) == 0 and int(click) == 1:
        return 1
    elif int(book) == 0 and int(click) == 0:
        return 0

3) Extract balanced sample from train.csv:

In [5]:
data = pd.read_csv('data/train.csv')

# Click + booking samples
part_1 = data[data['click_bool'] == 1]

# No action samples
select_rows = int(200000)
part_2 = data[data['click_bool'] == 0].iloc[:select_rows, :]

result = pd.concat([part_1, part_2])
result['class'] = result.apply(multi_class, axis=1)
result.to_csv('data/sample_1.csv', index=False)

Check the new data:

In [7]:
result.shape

(643672, 55)

In [8]:
result.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,class
12,1,2013-04-04 08:32:15,12,187,,,219,68914,2,3.0,...,,,,0.0,0.0,11.0,1,114.29,1,2
59,4,2012-12-31 08:59:22,5,219,,,219,139893,2,3.0,...,,,,,,,1,,0,1
63,6,2013-06-05 12:27:51,14,100,,,100,104251,3,4.0,...,,,,,,,1,162.38,1,2
68,8,2013-03-20 17:50:44,5,219,,,219,27669,3,3.5,...,,,,0.0,0.0,,1,96.41,1,2
90,11,2013-02-25 08:39:33,5,219,,,219,20499,2,3.5,...,,,,0.0,0.0,,1,,0,1


Load balanced data: target and features. Drop unnecessary variables to form a valid feature set:

In [2]:
train = pd.read_csv('data/sample_1.csv')
target = np.ravel(train['class'].values)
train = train.drop(['date_time', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'class'], axis=1)

Impute missing values with a negative value:

In [3]:
features = train.fillna(-100).values

Split data into train and test with test being 30% of the train:

In [4]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, 
                                                                     target, test_size = 0.3, random_state = 3456)

In [44]:
X_train.shape, y_train.shape
X_test.shape, y_test.shape

((193102, 50), (193102,))

In [15]:
clf = GradientBoostingClassifier(random_state=3421)
clf.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=3421, subsample=1.0, verbose=0,
              warm_start=False)

7) Finding best loss and optimizing # of estimators:

In [47]:
print "N-estimators\tLogLoss Score on Train\tLogLoss Score on Test\tAccuracy Score Train\tAccuracy Score Test"
for i in xrange(30,301,30):
    clf = GradientBoostingClassifier(n_estimators=i, random_state=3421)
    clf.fit(X_train, y_train)
    train_predictions = clf.predict_proba(X_train)
    test_predictions = clf.predict_proba(X_test)
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    print i,"\t", log_loss(y_train, train_predictions), "\t", log_loss(y_test, test_predictions), "\t", accuracy_score(y_train, train_pred), "\t", accuracy_score(y_test, test_pred)

 N-estimators	LogLoss Score on Train	LogLoss Score on Test	Accuracy Score Train	Accuracy Score Test
30 	0.336983368152 	0.339102023468 	0.891559580087 	0.890741680563
60 	0.316209709266 	0.318845499504 	0.891617284773 	0.890788288055
90 	0.312489707631 	0.315623761489 	0.891723816499 	0.890907396091
120 	0.310883365518 	0.314551549051 	0.891856981157 	0.890954003584
150 	0.309965571941 	0.314149819156 	0.891945757596 	0.890959182194
180 	0.309210668759 	0.313903822382 	0.892054508733 	0.891026504127
210 	0.308542716215 	0.313706592239 	0.89213884635 	0.891104183281
240 	0.30787271815 	0.313522727441 	0.892216525734 	0.891145612164
270 	0.307333253548 	0.313396145631 	0.892311960406 	0.891145612164
300 	0.306815746578 	0.313318782964 	0.892374103913 	0.891171505215


In [43]:
for learning_rate in [0.05, 0.01]:
    print "Learning Rate =", learning_rate
    print "Min Samples Split\tLogLoss Score Train\tLogLoss Score Test\tAccuracy Score Train\tAccuracy Score Test"
    for i in xrange(2,5):
        clf = GradientBoostingClassifier(min_samples_split=i, learning_rate=learning_rate, n_estimators=100, random_state=3421)
        clf.fit(X_train, y_train)
        train_predictions = clf.predict_proba(X_train)
        test_predictions = clf.predict_proba(X_test)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print i,"\t", log_loss(y_train, train_predictions), "\t", log_loss(y_test, test_predictions), "\t", accuracy_score(y_train, train_pred), "\t", accuracy_score(y_test, test_pred)

Learning Rate = 0.05
Min Samples Split	LogLoss Score Train	LogLoss Score Test	Accuracy Score Train	Accuracy Score Test
2 	0.319348850761 	0.321817284157 	0.891568457731 	0.890752037783
3 	0.319348850761 	0.321817284157 	0.891568457731 	0.890752037783
4 	0.319348850761 	0.321817284157 	0.891568457731 	0.890752037783
Learning Rate = 0.01
Min Samples Split	LogLoss Score Train	LogLoss Score Test	Accuracy Score Train	Accuracy Score Test
2 	0.478601996728 	0.479818044743 	0.891559580087 	0.890741680563
3 	0.478601996728 	0.479818044743 	0.891559580087 	0.890741680563
4 	0.478601996728 	0.479818044743 	0.891559580087 	0.890741680563
