# Report 2: Evaluation Using Gradient Boosting Machines 
# For random sample

Goal:
predict which of the following 3 classes each row belongs to:
0 - no action
1 - click
2 - booking

Import all the necessary libraries:

In [1]:
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support

Load the random sample (one of the 3 samples produced by the earlier data processing):

In [2]:
# Read the pre-processed random sample; the path is relative to the working directory.
# NOTE(review): the name `random` would shadow the stdlib module if that were ever imported here.
random = pd.read_csv('pig_export/processed_r.csv')

Check the data:

In [3]:
# (rows, columns) of the loaded sample
random.shape

(600000, 33)

In [4]:
# Preview the first rows of the loaded sample
random.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp2_rate,comp2_inv,comp5_rate,comp5_inv,comp8_rate,comp8_inv,class,year,month,day
0,623365,5,219,219,11440,2.0,2.5,0,1.1,,...,0.0,0.0,0.0,0.0,0.0,0.0,0,2013,1,2
1,177303,15,55,215,79374,4.0,4.0,0,3.47,0.0113,...,,,0.0,0.0,,,0,2013,5,25
2,186217,7,100,100,71023,3.0,4.0,0,3.43,0.2975,...,0.0,0.0,,,0.0,0.0,2,2013,6,28
3,377107,5,219,219,50605,3.0,4.0,1,1.79,0.1222,...,0.0,0.0,,,0.0,0.0,0,2012,11,29
4,324498,5,219,219,75716,4.0,4.0,1,1.79,0.0211,...,0.0,0.0,0.0,0.0,0.0,0.0,0,2013,3,11


Target and features:

In [5]:
# 1-D label vector from the 'class' column (0 = no action, 1 = click, 2 = booking)
target_random = random['class'].values.ravel()

In [6]:
# Feature matrix: every column of the sample except the label
features_random = random.drop('class', axis=1)

In [7]:
# Percentage of missing values per column: count() only counts non-null entries
100.0 * (1.0 - features_random.count() / len(features_random))

srch_id                         0.000000
site_id                         0.000000
visitor_location_country_id     0.000000
prop_country_id                 0.000000
prop_id                         0.000000
prop_starrating                 0.000000
prop_review_score               0.145667
prop_brand_bool                 0.000000
prop_location_score1            0.000000
prop_location_score2           21.962833
prop_log_historical_price       0.000000
position                        0.000000
price_usd                       0.000000
promotion_flag                  0.000000
srch_destination_id             0.000000
srch_length_of_stay             0.000000
srch_booking_window             0.000000
srch_adults_count               0.000000
srch_children_count             0.000000
srch_room_count                 0.000000
srch_saturday_night_bool        0.000000
orig_destination_distance      32.357167
random_bool                     0.000000
comp2_rate                     59.113167
comp2_inv       

Substitute missing values: the mean for continuous features (median is a noted alternative) and the mode for categorical features:

In [8]:
# Continuous features: replace each missing value with the column mean
# (strategy='median' is the alternative noted by the original author).
# NOTE(review): the imputer is fitted on the full sample, i.e. before the
# train/test split performed later in this notebook.
random_df = features_random[[
    'prop_review_score', 'prop_location_score1', 'prop_location_score2',
    'prop_log_historical_price', 'price_usd', 'orig_destination_distance',
]]
imp = preprocessing.Imputer(strategy='mean', axis=0)
# transform() returns a bare array, so the column labels are re-attached here.
random_df1 = pd.DataFrame(imp.fit_transform(random_df), columns=random_df.columns)

In [9]:
# Categorical competitor features: replace each missing value with the
# most frequent value (mode) of its column.
random_df2 = features_random[[
    'comp2_rate', 'comp2_inv',
    'comp5_rate', 'comp5_inv',
    'comp8_rate', 'comp8_inv',
]]
imp = preprocessing.Imputer(strategy='most_frequent', axis=0)
# transform() returns a bare array, so the column labels are re-attached here.
random_df3 = pd.DataFrame(imp.fit_transform(random_df2), columns=random_df2.columns)

In [10]:
# Columns that are passed through without imputation.
random_recent_rest = features_random[[
    'srch_id', 'site_id', 'visitor_location_country_id', 'prop_country_id',
    'prop_id', 'prop_starrating', 'prop_brand_bool', 'position', 'promotion_flag',
    'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
    'srch_adults_count', 'srch_children_count', 'srch_room_count',
    'srch_saturday_night_bool', 'random_bool', 'year', 'month', 'day',
]]
# Put the pass-through, mean-imputed and mode-imputed pieces side by side.
# np.concatenate yields a bare array without labels, so the column names are
# taken from the three source frames in the same left-to-right order.
random_recent = pd.DataFrame(
    np.concatenate((random_recent_rest, random_df1, random_df3), axis=1),
    columns=(list(random_recent_rest.columns)
             + list(random_df1.columns)
             + list(random_df3.columns)),
)

In [11]:
# Shape after imputation and re-assembly of the feature columns
random_recent.shape


(600000, 32)

Normalize/scale:

In [12]:
# Standardize the continuous features (StandardScaler: zero mean, unit
# variance per column); every other column is passed through unchanged.
# NOTE(review): the scaler is fitted on the full sample, i.e. before the
# train/test split performed later in this notebook.
random_df = random_recent[[
    'prop_review_score', 'prop_location_score1', 'prop_location_score2',
    'prop_log_historical_price', 'price_usd', 'orig_destination_distance',
]]
random_df_rest = random_recent[[
    'srch_id', 'site_id', 'visitor_location_country_id', 'prop_country_id',
    'prop_id', 'prop_starrating', 'prop_brand_bool', 'position', 'promotion_flag',
    'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
    'srch_adults_count', 'srch_children_count', 'srch_room_count',
    'srch_saturday_night_bool', 'random_bool', 'comp2_rate', 'comp2_inv',
    'comp5_rate', 'comp5_inv', 'comp8_rate', 'comp8_inv', 'year', 'month', 'day',
]]
standard_scaler0 = preprocessing.StandardScaler(copy=False)
x_standardized0 = standard_scaler0.fit_transform(random_df)
# Pass-through columns first, standardized columns last; labels are taken
# from the two source frames in that same order.
random_most_recent = pd.DataFrame(
    np.concatenate((random_df_rest, x_standardized0), axis=1),
    columns=list(random_df_rest.columns) + list(random_df.columns),
)

# Author's notes on scalers for other samples:
#   random: MinMaxScaler()
#   first:  the original note read StandardScaler(feature_range=(-1, 1)), but
#           feature_range is a MinMaxScaler argument, not a StandardScaler one.
#           NOTE(review): confirm which scaler was intended.

In [13]:
# Shape after scaling: the column count should match the imputed frame
random_most_recent.shape

(600000, 32)

In [14]:
# Preview the scaled frame; the standardized columns are the last six
random_most_recent.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_brand_bool,position,promotion_flag,srch_destination_id,...,comp8_inv,year,month,day,prop_review_score,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,orig_destination_distance
0,623365.0,5.0,219.0,219.0,11440.0,2.0,0.0,27.0,0.0,11621.0,...,0.0,2013.0,1.0,2.0,-1.218106,-1.159607,-5.900217e-16,-0.071217,-0.013644,0.0
1,177303.0,15.0,55.0,215.0,79374.0,4.0,0.0,7.0,1.0,25564.0,...,0.0,2013.0,5.0,25.0,0.211336,0.387823,-0.8454672,0.458058,-0.011721,0.0
2,186217.0,7.0,100.0,100.0,71023.0,3.0,0.0,7.0,0.0,1996.0,...,0.0,2013.0,6.0,28.0,0.211336,0.361706,1.182524,-2.357468,-0.00785,-0.718722
3,377107.0,5.0,219.0,219.0,50605.0,3.0,1.0,14.0,0.0,15712.0,...,0.0,2012.0,11.0,29.0,0.211336,-0.70909,-0.05963813,0.441689,-0.008983,-0.707327
4,324498.0,5.0,219.0,219.0,75716.0,4.0,1.0,11.0,0.0,2502.0,...,0.0,2013.0,3.0,11.0,0.211336,-0.70909,-0.7760251,0.338016,-0.009874,-0.549108


Split the data into train (70%) and test (30% of the full sample):

In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    random_most_recent,
    target_random,
    test_size=0.3,
    random_state=3456,
)

In [16]:
# Training split: feature matrix shape and label vector shape
X_train.shape, y_train.shape

((420000, 32), (420000,))

In [17]:
# Test split: feature matrix shape and label vector shape
X_test.shape, y_test.shape

((180000, 32), (180000,))

Run GBM:

Evaluate using accuracy, precision, recall:

In [39]:
# Gradient boosting with 150 estimators; fit() returns the estimator itself,
# so construction and training are chained. The seed is fixed for repeatability.
clf = GradientBoostingClassifier(n_estimators=150, random_state=3421).fit(X_train, y_train)
# Predict on both splits so train and test metrics can be compared.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

In [60]:
# Report GBM accuracy on both splits.
# A parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 call; the "%s \t%s" format reproduces the spacing that the
# former comma-separated Python 2 print produced.
print("Accuracy Train\tAccuracy Test")
print("%s \t%s" % (accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)))

Accuracy Train	Accuracy Test
0.955016666667 	0.955466666667


Precision: of all predicted as "positive" how many are actually "positive"?
Recall: of all "positive" in the sample how many were predicted as "positive"?

In [61]:
# Per-class precision, recall, F-score and support for GBM on the training split.
# labels pins the order of the returned arrays to: no action (0), click (1), book (2).
# The function is imported with the other metrics in the notebook's import cell,
# so this cell also runs on a fresh kernel.
precision_recall_fscore_support(y_train, train_pred, labels=[0, 1, 2])

(array([ 0.95502616,  1.        ,  0.86538462]),
 array([ 0.99998255,  0.00206811,  0.00384846]),
 array([ 0.97698746,  0.00412768,  0.00766284]),
 array([401054,   7253,  11693]))

In [64]:
# Per-class precision, recall, F-score and support for GBM on the test split.
# labels pins the order of the returned arrays to: no action (0), click (1), book (2).
# The function is imported with the other metrics in the notebook's import cell,
# so this cell also runs on a fresh kernel.
precision_recall_fscore_support(y_test, test_pred, labels=[0, 1, 2])

(array([ 0.95557185,  0.22222222,  0.13333333]),
 array([  9.99883721e-01,   6.70016750e-04,   3.98803589e-04]),
 array([  9.77225720e-01,   1.33600534e-03,   7.95228628e-04]),
 array([172000,   2985,   5015]))

# Applying Linear Models

In [27]:
# Re-check the pre-processed random sample before fitting the linear models
random_most_recent.shape # random

(600000, 32)

In [37]:
X_train.shape, y_train.shape # training split: 70% of the rows

((420000, 32), (420000,))

In [38]:
X_test.shape, y_test.shape # test split: 30% of the rows

((180000, 32), (180000,))

In [41]:
from sklearn.metrics import precision_recall_fscore_support

Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

In [66]:
# Logistic regression (L-BFGS solver) with class weights balanced to offset
# the heavy skew toward class 0; fit() returns the estimator, so it is chained.
clf1 = LogisticRegression(
    solver='lbfgs', class_weight='balanced', random_state=3421
).fit(X_train, y_train)
# Predict on both splits so train and test metrics can be compared.
train_pred1 = clf1.predict(X_train)
test_pred1 = clf1.predict(X_test)

In [67]:
# Report logistic-regression accuracy on both splits.
# A parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 call; the "%s \t%s" format reproduces the spacing that the
# former comma-separated Python 2 print produced.
print("Accuracy Train\tAccuracy Test")
print("%s \t%s" % (accuracy_score(y_train, train_pred1), accuracy_score(y_test, test_pred1)))

Accuracy Train	Accuracy Test
0.520788095238 	0.524044444444


In [68]:
# Logistic regression, training split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_train, y_pred=train_pred1)

(array([ 0.98165714,  0.02893359,  0.05362645]),
 array([ 0.52175518,  0.26058183,  0.64902078]),
 array([ 0.68136291,  0.05208405,  0.09906729]),
 array([401054,   7253,  11693]))

In [69]:
# Logistic regression, test split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_test, y_pred=test_pred1)

(array([ 0.98261097,  0.02816343,  0.05556566]),
 array([ 0.52433721,  0.26164154,  0.67018943]),
 array([ 0.68379192,  0.05085298,  0.10262282]),
 array([172000,   2985,   5015]))

SGD Classifier

In [33]:
from sklearn.linear_model import SGDClassifier

In [46]:
# Linear classifier trained with stochastic gradient descent, using balanced
# class weights; fit() returns the estimator, so it is chained.
clf2 = SGDClassifier(class_weight='balanced', random_state=3421).fit(X_train, y_train)
# Predict on both splits so train and test metrics can be compared.
train_pred2 = clf2.predict(X_train)
test_pred2 = clf2.predict(X_test)

In [47]:
# Report SGD-classifier accuracy on both splits.
# A parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 call; the "%s \t%s" format reproduces the spacing that the
# former comma-separated Python 2 print produced.
print("Accuracy Train\tAccuracy Test")
print("%s \t%s" % (accuracy_score(y_train, train_pred2), accuracy_score(y_test, test_pred2)))

Accuracy Train	Accuracy Test
0.944761904762 	0.94555


In [57]:
# SGD classifier, training split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_train, y_pred=train_pred2)

(array([ 0.95487661,  0.01408771,  0.02586207]),
 array([  9.89230877e-01,   8.54818696e-03,   2.56563756e-04]),
 array([  9.71750209e-01,   1.06401236e-02,   5.08087052e-04]),
 array([401054,   7253,  11693]))

In [56]:
# SGD classifier, test split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_test, y_pred=test_pred2)

(array([ 0.95552324,  0.01445396,  0.        ]),
 array([ 0.98937209,  0.00904523,  0.        ]),
 array([ 0.97215311,  0.01112714,  0.        ]),
 array([172000,   2985,   5015]))

Ridge Classifier

In [36]:
from sklearn.linear_model import RidgeClassifier

In [50]:
# Ridge classifier with balanced class weights; fit() returns the estimator,
# so construction and training are chained.
clf3 = RidgeClassifier(class_weight='balanced', random_state=3421).fit(X_train, y_train)
# Predict on both splits so train and test metrics can be compared.
train_pred3 = clf3.predict(X_train)
test_pred3 = clf3.predict(X_test)

In [51]:
# Report ridge-classifier accuracy on both splits.
# A parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 call; the "%s \t%s" format reproduces the spacing that the
# former comma-separated Python 2 print produced.
print("Accuracy Train\tAccuracy Test")
print("%s \t%s" % (accuracy_score(y_train, train_pred3), accuracy_score(y_test, test_pred3)))

Accuracy Train	Accuracy Test
0.435571428571 	0.437888888889


In [55]:
# Ridge classifier, training split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_train, y_pred=train_pred3)

(array([ 0.98505153,  0.04583264,  0.06610504]),
 array([ 0.42062914,  0.6852337 ,  0.79320961]),
 array([ 0.58952419,  0.08591852,  0.12203947]),
 array([401054,   7253,  11693]))

In [54]:
# Ridge classifier, test split: per-class precision, recall, F-score, support
# (array order: no action, click, book)
precision_recall_fscore_support(y_true=y_test, y_pred=test_pred3)

(array([ 0.98566245,  0.04443869,  0.06617414]),
 array([ 0.42327326,  0.68944724,  0.7894317 ]),
 array([ 0.59222653,  0.08349562,  0.12211221]),
 array([172000,   2985,   5015]))