# Evaluation
(after report 2)

Goal:
to predict 3 classes being
0 - no action
1 - click
2 - booking

Import all the necessary libraries:

In [27]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

Load 3 samples based on earlier data processing:

In [2]:
random = pd.read_csv('pig_export/processed_r.csv') # import random data
balanced = pd.read_csv('pig_export/processed_b.csv') # import balanced data

Check the data:

In [9]:
random.shape

(600000, 33)

In [8]:
balanced = balanced.drop('gross_bookings_usd', axis=1)
balanced.shape

(643672, 33)

Target and features:

In [10]:
target_random = np.ravel(random['class'].values)
target_balanced = np.ravel(balanced['class'].values)

In [11]:
features_random = random.drop(['class'], axis=1)
features_balanced = balanced.drop(['class'], axis=1)

Substitute missing values by mean/median/mode:

In [12]:
# continuous
# random
random_df = features_random[['prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']]
imp = preprocessing.Imputer(strategy='mean', axis=0) # strategy='median'
imp.fit(random_df)
random_df1 = pd.DataFrame(imp.transform(random_df))
random_df1.columns = random_df.columns
# balanced
balanced_df = features_balanced[['prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']]
imp1 = preprocessing.Imputer(strategy='mean', axis=0) # strategy='median'
imp1.fit(balanced_df)
balanced_df1 = pd.DataFrame(imp1.transform(balanced_df))
balanced_df1.columns = balanced_df.columns

In [13]:
# categorical
# random
random_df2 = features_random[['comp2_rate', 'comp2_inv', 'comp5_rate', 
                                 'comp5_inv', 'comp8_rate', 'comp8_inv']]
imp = preprocessing.Imputer(strategy='most_frequent', axis=0)
imp.fit(random_df2)
random_df3 = pd.DataFrame(imp.transform(random_df2))
random_df3.columns = random_df2.columns
# balanced
balanced_df2 = features_balanced[['comp2_rate', 'comp2_inv', 'comp5_rate', 
                                 'comp5_inv', 'comp8_rate', 'comp8_inv']]
imp1 = preprocessing.Imputer(strategy='most_frequent', axis=0)
imp1.fit(balanced_df2)
balanced_df3 = pd.DataFrame(imp1.transform(balanced_df2))
balanced_df3.columns = balanced_df2.columns

In [14]:
# random
random_recent_rest = features_random[['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','year','month','day']]
random_recent = pd.DataFrame(np.concatenate((random_recent_rest, random_df1, random_df3), axis=1))
random_recent.columns = ['srch_id','site_id','visitor_location_country_id','prop_country_id',
                           'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                           'srch_destination_id','srch_length_of_stay','srch_booking_window',
                           'srch_adults_count','srch_children_count','srch_room_count',
                           'srch_saturday_night_bool','random_bool','year','month','day',
                            'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                            'prop_log_historical_price', 'price_usd', 'orig_destination_distance', 
                             'comp2_rate', 'comp2_inv', 'comp5_rate', 
                             'comp5_inv', 'comp8_rate', 'comp8_inv']
# balanced
balanced_recent_rest = features_balanced[['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','year','month','day']]
balanced_recent = pd.DataFrame(np.concatenate((balanced_recent_rest, balanced_df1, balanced_df3), axis=1))
balanced_recent.columns = ['srch_id','site_id','visitor_location_country_id','prop_country_id',
                           'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                           'srch_destination_id','srch_length_of_stay','srch_booking_window',
                           'srch_adults_count','srch_children_count','srch_room_count',
                           'srch_saturday_night_bool','random_bool','year','month','day',
                            'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                            'prop_log_historical_price', 'price_usd', 'orig_destination_distance', 
                             'comp2_rate', 'comp2_inv', 'comp5_rate', 
                             'comp5_inv', 'comp8_rate', 'comp8_inv']

In [11]:
random_recent.shape

(600000, 32)

In [15]:
balanced_recent.shape

(643672, 32)

Normalize/scale:

In [16]:
# random
random_df = random_recent[['prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']]
random_df_rest = random_recent[['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','comp2_rate','comp2_inv',
                                     'comp5_rate','comp5_inv','comp8_rate','comp8_inv','year','month','day']]
standard_scaler0 = preprocessing.StandardScaler(copy=False)
x_standardized0 = standard_scaler0.fit_transform(random_df)
random_most_recent = pd.DataFrame(np.concatenate((random_df_rest, x_standardized0), axis=1))
random_most_recent.columns = ['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','comp2_rate','comp2_inv',
                                     'comp5_rate','comp5_inv','comp8_rate','comp8_inv','year','month','day',
                               'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']

# balanced
balanced_df = balanced_recent[['prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']]
balanced_df_rest = balanced_recent[['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','comp2_rate','comp2_inv',
                                     'comp5_rate','comp5_inv','comp8_rate','comp8_inv','year','month','day']]
standard_scaler1 = preprocessing.StandardScaler(copy=False)
x_standardized1 = standard_scaler1.fit_transform(balanced_df)
balanced_most_recent = pd.DataFrame(np.concatenate((balanced_df_rest, x_standardized1), axis=1))
balanced_most_recent.columns = ['srch_id','site_id','visitor_location_country_id','prop_country_id',
                                     'prop_id','prop_starrating','prop_brand_bool','position','promotion_flag',
                                     'srch_destination_id','srch_length_of_stay','srch_booking_window',
                                     'srch_adults_count','srch_children_count','srch_room_count',
                                     'srch_saturday_night_bool','random_bool','comp2_rate','comp2_inv',
                                     'comp5_rate','comp5_inv','comp8_rate','comp8_inv','year','month','day',
                               'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                                 'prop_log_historical_price', 'price_usd', 'orig_destination_distance']

In [19]:
random_most_recent.shape, balanced_most_recent.shape

((600000, 32), (643672, 32))

In [14]:
random_most_recent.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_brand_bool,position,promotion_flag,srch_destination_id,...,comp8_inv,year,month,day,prop_review_score,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,orig_destination_distance
0,623365.0,5.0,219.0,219.0,11440.0,2.0,0.0,27.0,0.0,11621.0,...,0.0,2013.0,1.0,2.0,-1.218106,-1.159607,-5.900217e-16,-0.071217,-0.013644,0.0
1,177303.0,15.0,55.0,215.0,79374.0,4.0,0.0,7.0,1.0,25564.0,...,0.0,2013.0,5.0,25.0,0.211336,0.387823,-0.8454672,0.458058,-0.011721,0.0
2,186217.0,7.0,100.0,100.0,71023.0,3.0,0.0,7.0,0.0,1996.0,...,0.0,2013.0,6.0,28.0,0.211336,0.361706,1.182524,-2.357468,-0.00785,-0.718722
3,377107.0,5.0,219.0,219.0,50605.0,3.0,1.0,14.0,0.0,15712.0,...,0.0,2012.0,11.0,29.0,0.211336,-0.70909,-0.05963813,0.441689,-0.008983,-0.707327
4,324498.0,5.0,219.0,219.0,75716.0,4.0,1.0,11.0,0.0,2502.0,...,0.0,2013.0,3.0,11.0,0.211336,-0.70909,-0.7760251,0.338016,-0.009874,-0.549108


In [20]:
balanced_most_recent.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_brand_bool,position,promotion_flag,srch_destination_id,...,comp8_inv,year,month,day,prop_review_score,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,orig_destination_distance
0,1.0,12.0,187.0,219.0,893.0,3.0,1.0,27.0,0.0,23246.0,...,0.0,2013.0,4.0,4.0,-0.38428,-0.042875,-0.73729,0.331697,-0.010989,-1.359045e-16
1,1.0,12.0,187.0,219.0,10404.0,4.0,1.0,26.0,0.0,23246.0,...,0.0,2013.0,4.0,4.0,0.148029,-0.458841,-0.910807,0.37534,-0.00542,-1.359045e-16
2,1.0,12.0,187.0,219.0,21315.0,3.0,1.0,21.0,0.0,23246.0,...,0.0,2013.0,4.0,4.0,0.680338,-0.458841,-0.853168,0.315331,-0.004656,-1.359045e-16
3,1.0,12.0,187.0,219.0,27348.0,2.0,1.0,34.0,0.0,23246.0,...,0.0,2013.0,4.0,4.0,0.148029,-0.042875,-0.925216,0.026196,0.031047,-1.359045e-16
4,1.0,12.0,187.0,219.0,29604.0,4.0,1.0,4.0,0.0,23246.0,...,0.0,2013.0,4.0,4.0,-0.38428,-0.168325,-0.255165,0.320786,-0.007713,-1.359045e-16


Split data into train and test with test being 30% of the train:

In [21]:
X_train, X_test, y_train, y_test = balanced_most_recent, random_most_recent, target_balanced, target_random

In [22]:
X_train.shape, y_train.shape

((643672, 32), (643672,))

In [23]:
X_test.shape, y_test.shape

((600000, 32), (600000,))

GBM

Evaluate using accuracy, precision, recall:

In [24]:
clf = GradientBoostingClassifier(n_estimators=150, random_state=3421)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

In [25]:
print "Accuracy Train\tAccuracy Test"
print accuracy_score(y_train, train_pred), "\t", accuracy_score(y_test, test_pred)

Accuracy Train	Accuracy Test
0.891640462844 	0.0582733333333


Precision: of all predicted as "positive" how many are actually "positive"?
Recall: of all "positive" in the sample how many were predicted as "positive"?

In [28]:
precision_recall_fscore_support(y_train, train_pred) # no action, click, book

(array([ 0.9552643 ,  0.88336058,  0.8509186 ]),
 array([ 0.999985  ,  0.71142992,  0.92215638]),
 array([ 0.97711322,  0.78812753,  0.88510641]),
 array([200000, 167079, 276593]))

In [29]:
precision_recall_fscore_support(y_test, test_pred) # no action, click, book

(array([ 0.95513367,  0.04142059,  0.03734515]),
 array([ 0.02150932,  0.70765775,  0.92129519]),
 array([ 0.0420712 ,  0.07826045,  0.07178064]),
 array([573054,  10238,  16708]))

Logistic Regression

In [30]:
clf1 = LogisticRegression(solver='lbfgs', class_weight='balanced', random_state=3421)
clf1.fit(X_train, y_train)
train_pred1 = clf1.predict(X_train)
test_pred1 = clf1.predict(X_test)

In [31]:
print "Accuracy Train\tAccuracy Test"
print accuracy_score(y_train, train_pred1), "\t", accuracy_score(y_test, test_pred1)

Accuracy Train	Accuracy Test
0.726665133795 	0.0599183333333


In [32]:
precision_recall_fscore_support(y_train, train_pred1) # no action, click, book

(array([ 0.93574256,  0.49793906,  0.66369456]),
 array([ 1.        ,  0.31668851,  0.77667186]),
 array([ 0.96680476,  0.38715012,  0.71575244]),
 array([200000, 167079, 276593]))

In [33]:
precision_recall_fscore_support(y_test, test_pred1) # no action, click, book

(array([ 0.95942408,  0.02158592,  0.03007259]),
 array([ 0.03453601,  0.31383083,  0.77489825]),
 array([ 0.06667206,  0.0403935 ,  0.05789823]),
 array([573054,  10238,  16708]))

SGD Classifier

In [34]:
clf2 = SGDClassifier(class_weight='balanced', random_state=3421)
clf2.fit(X_train, y_train)
train_pred2 = clf2.predict(X_train)
test_pred2 = clf2.predict(X_test)

In [35]:
print "Accuracy Train\tAccuracy Test"
print accuracy_score(y_train, train_pred2), "\t", accuracy_score(y_test, test_pred2)

Accuracy Train	Accuracy Test
0.692969400564 	0.0508183333333


In [36]:
precision_recall_fscore_support(y_train, train_pred2) # no action, click, book

(array([ 0.94450134,  0.38236127,  0.62038734]),
 array([ 0.9869    ,  0.20203616,  0.7769864 ]),
 array([ 0.96523529,  0.26437763,  0.6899121 ]),
 array([200000, 167079, 276593]))

In [37]:
precision_recall_fscore_support(y_test, test_pred2) # no action, click, book

(array([ 0.95528531,  0.01728961,  0.02820016]),
 array([ 0.02673047,  0.20042977,  0.78531243]),
 array([ 0.05200572,  0.0318332 ,  0.05444523]),
 array([573054,  10238,  16708]))

Ridge Classifier

In [38]:
clf3 = RidgeClassifier(class_weight='balanced', random_state=3421)
clf3.fit(X_train, y_train)
train_pred3 = clf3.predict(X_train)
test_pred3 = clf3.predict(X_test)

In [39]:
print "Accuracy Train\tAccuracy Test"
print accuracy_score(y_train, train_pred3), "\t", accuracy_score(y_test, test_pred3)

Accuracy Train	Accuracy Test
0.806865919288 	0.272071666667


In [40]:
precision_recall_fscore_support(y_train, train_pred3) # no action, click, book

(array([ 0.74708886,  0.8276321 ,  0.85642338]),
 array([ 0.95531   ,  0.67741607,  0.77772395]),
 array([ 0.83846562,  0.7450277 ,  0.81517861]),
 array([200000, 167079, 276593]))

In [41]:
precision_recall_fscore_support(y_test, test_pred3) # no action, click, book

(array([ 0.97396205,  0.04357478,  0.04440602]),
 array([ 0.24999913,  0.67483884,  0.78231985]),
 array([ 0.39787155,  0.08186358,  0.08404166]),
 array([573054,  10238,  16708]))

SVM

In [42]:
clf4 = LinearSVC(random_state=3421)
clf4.fit(X_train, y_train)
train_pred4 = clf4.predict(X_train)
test_pred4 = clf4.predict(X_test)

In [43]:
print "Accuracy Train\tAccuracy Test"
print accuracy_score(y_train, train_pred4), "\t", accuracy_score(y_test, test_pred4)

Accuracy Train	Accuracy Test
0.712718589592 	0.0964083333333


In [44]:
precision_recall_fscore_support(y_train, train_pred4) # no action, click, book

(array([ 0.87255467,  0.875     ,  0.62431354]),
 array([  1.00000000e+00,   8.37926969e-05,   9.35464744e-01]),
 array([  9.31940393e-01,   1.67569347e-04,   7.48854258e-01]),
 array([200000, 167079, 276593]))

In [45]:
precision_recall_fscore_support(y_test, test_pred4) # no action, click, book

(array([ 0.95910257,  0.05128205,  0.02807398]),
 array([  7.37033508e-02,   1.95350654e-04,   9.34103424e-01]),
 array([ 0.13688743,  0.00038922,  0.0545097 ]),
 array([573054,  10238,  16708]))