In [55]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Load the training set, pinning the dtype of the columns listed below.
# Builtin `float` is used instead of `np.float`: the latter was only an alias
# for it, was deprecated in NumPy 1.20 and removed in NumPy 1.24.
# pandas opens the path itself, so no explicit open() wrapper is needed.
column_dtypes = {
    'booking_bool': bool,
    'click_bool': bool,
    'prop_country_id': np.int32,
    'prop_starrating': np.int32,
    'position': np.int32,
    'prop_review_score': float,
    'prop_location_score1': float,
    'prop_location_score2': float,
    'price_usd': float,
    'srch_query_affinity_score': float,
    'orig_destination_distance': float,
}
df = pd.read_csv('new_train.csv', dtype=column_dtypes)


In [4]:
# Label columns to predict. The order matters: multi-output predictions come
# back with one column per entry, in this order (booking first, click second).
target = [
    'booking_bool',
    'click_bool',
]

In [5]:
# Feature columns fed to every classifier below.
data = [
    'year',
    'month',
    'prop_score',
    'prop_location_score1',
    'prop_location_score2',
    'diff_hist_price',
    'usd_diff',
    'star_diff',
    'srch_query_affinity_score',
    'orig_destination_distance',
    'prop_review_score',
    'price_rank',
    'stars_rank',
    'score_rank',
]

### Take data from two countries with approximately the same number of entries and the same percentage of clicks/bookings

In [6]:
# Training rows: every entry whose prop_country_id is 55.
df_sample = df.loc[df['prop_country_id'] == 55]

In [7]:
# Held-out rows: every entry whose prop_country_id is 31.
df_test = df.loc[df['prop_country_id'] == 31]

In [8]:
print float(len(df_sample[df_sample['click_bool'] == True]))/len(df_sample)*100
print float(len(df_sample[df_sample['booking_bool'] == True]))/len(df_sample)*100

4.68400605335
3.01368780069


In [9]:
print float(len(df_test[df_test['click_bool'] == True]))/len(df_test)*100
print float(len(df_test[df_test['booking_bool'] == True]))/len(df_test)*100

4.66477630323
2.5898881347


### Train data and test data:

In [10]:
# Features and labels used for fitting (from df_sample) ...
X, y = df_sample[data], df_sample[target]
# ... and the features to predict on plus the true labels (from df_test).
pred, verif = df_test[data], df_test[target]

### K Neighbors Classifier - most of the clicked/booked predictions are false positives

In [49]:
# Fit a 5-nearest-neighbour model on (X, y) and predict every target column
# for the rows in `pred`. fit() returns the estimator, so the calls chain.
knn = KNeighborsClassifier(n_neighbors=5)
prediction = knn.fit(X, y).predict(pred)
#print cross_val_score(knn, X, y, cv = 10, scoring = 'accuracy').mean()

In [50]:
from collections import Counter
Counter(x for xs in prediction for x in set(xs))
head = ['click_bool','booking_bool']
prediction_df = pd.DataFrame(prediction, columns=head)

diff =  np.subtract(prediction_df.click_bool, np.int32(df_test.click_bool))
print "Total number of clicked predictions: ", len(prediction_df[prediction_df.click_bool == 1])
print "Number of correct clicked/non-clicked predictions: ", np.count_nonzero(diff == 0)
print "Number of false positives clicked", np.count_nonzero(diff == 1)
print "Number of false negatives clicked", np.count_nonzero(diff == -1)

diff2 =  np.subtract(prediction_df.booking_bool, np.int32(df_test.booking_bool))
print "Total number of booked predictions: ", len(prediction_df[prediction_df.booking_bool == 1])
print "Number of correct booked/non-booked predictions: ", np.count_nonzero(diff2 == 0)
print "Number of false positives booked", np.count_nonzero(diff2 == 1)
print "Number of false negatives booked", np.count_nonzero(diff2 == -1)

Total number of clicked predictions:  88
Number of correct clicked/non-clicked predictions:  140806
Number of false positives clicked 78
Number of false negatives clicked 6883
Total number of booked predictions:  351
Number of correct booked/non-booked predictions:  143623
Number of false positives booked 334
Number of false negatives booked 3810


### Random Forest Classifier - always predicts non-clicked/non-booked

In [17]:
clf = RandomForestClassifier(max_depth = 2, random_state = 0)
clf.fit(X, y)
prediction = clf.predict(pred)
search = 1
print any(e[0] == search for e in prediction)
print cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean()


False
0.9531599311524774


### Decision tree - predicts many clicked/booked, the vast majority of which are false alarms

In [33]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
prediction = clf.predict(pred)
search = 1
print any(e[0] == search for e in prediction)
#print cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean()

True


In [48]:
from collections import Counter
Counter(x for xs in prediction for x in set(xs))
head = ['click_bool','booking_bool']
prediction_df = pd.DataFrame(prediction, columns=head)

diff =  np.subtract(prediction_df.click_bool, np.int32(df_test.click_bool))
print "Total number of clicked predictions: ", len(prediction_df[prediction_df.click_bool == 1])
print "Number of correct clicked/non-clicked predictions: ", np.count_nonzero(diff == 0)
print "Number of false positives clicked", np.count_nonzero(diff == 1)
print "Number of false negatives clicked", np.count_nonzero(diff == -1)

diff2 =  np.subtract(prediction_df.booking_bool, np.int32(df_test.booking_bool))
print "Total number of booked predictions: ", len(prediction_df[prediction_df.booking_bool == 1])
print "Number of correct booked/non-booked predictions: ", np.count_nonzero(diff2 == 0)
print "Number of false positives booked", np.count_nonzero(diff2 == 1)
print "Number of false negatives booked", np.count_nonzero(diff2 == -1)


Total number of clicked predictions:  12480
Number of correct clicked/non-clicked predictions:  130070
Number of false positives clicked 11642
Number of false negatives clicked 6055
Total number of booked predictions:  17756
Number of correct booked/non-booked predictions:  127516
Number of false positives booked 17090
Number of false negatives booked 3161


### Gradient Boosting - predicts very few clicked/booked, but most of them are correct

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
# Gradient boosting handles one target at a time; this cell models clicks.
y_GBC = y.click_bool
# random_state is pinned so that re-running the notebook reproduces the same
# model (the Random Forest cell is seeded the same way).
clf = GradientBoostingClassifier(max_depth = 2, random_state = 0)
clf.fit(X, y_GBC)
prediction = clf.predict(pred)
#print cross_val_score(clf, X,y_GBC, cv=10, scoring='accuracy').mean()

In [52]:
print "Total number of clicked predictions:", np.count_nonzero(prediction == 1)
diff =  np.subtract(prediction, np.int32(df_test.click_bool))
print "Total number of correct predictions:", np.count_nonzero(diff == 0)
print "Number of false positives:", np.count_nonzero(diff == 1)
print "Number of false negatives:", np.count_nonzero(diff == -1)


Total number of clicked predictions: 40
Total number of correct predictions: 140886
Number of false positives: 14
Number of false negatives: 6867


In [53]:
# Gradient boosting handles one target at a time; this cell models bookings.
# random_state is pinned so that re-running the notebook reproduces the same
# model (the Random Forest cell is seeded the same way).
clf = GradientBoostingClassifier(max_depth = 2, random_state = 0)
clf.fit(X, y.booking_bool)
prediction = clf.predict(pred)
#print cross_val_score(clf, X, y.booking_bool, cv=10, scoring='accuracy').mean()

In [54]:
print "Total number of booked predictions:", np.count_nonzero(prediction == 1)
diff =  np.subtract(prediction, np.int32(df_test.booking_bool))
print "Total number of correct predictions:", np.count_nonzero(diff == 0)
print "Number of false positives:", np.count_nonzero(diff == 1)
print "Number of false negatives:", np.count_nonzero(diff == -1)


Total number of booked predictions: 35
Total number of correct predictions: 143929
Number of false positives: 23
Number of false negatives: 3815
