# K Nearest Neighbor 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
testPath = "~/sampledTestset2014.csv"
trainPath = "~/sampledTrainSet2013.csv"
train = pd.read_csv(trainPath)
test = pd.read_csv(testPath)

In [6]:
train.shape

(619172, 55)

In [7]:
test.shape

(51481, 55)

Applying KNN on large Dataset always create problem. So , 
we will try to reduce the size of dataset by removing any duplicate rows

In [8]:
print("We have", train.shape[0], "rows in our trainings set")

('We have', 619172, 'rows in our trainings set')


In [9]:
trainSample = train.drop_duplicates()
print("We have", trainSample.shape[0], "rows after dropping duplicate enteries")

('We have', 619172, 'rows after dropping duplicate enteries')


In [11]:
#looks likes we do not have any duplicates. So we will use train as it
del trainSample

In [10]:
train.apply(lambda x: sum(x.isnull()),axis=0) 

Unnamed: 0                        0
Unnamed: 0.1                      0
user_location_country             0
hotel_cluster                     0
user_location_region              0
user_location_city                0
hotel_country                     0
hotel_market                      0
srch_destination_id               0
date_time                         0
site_name                         0
posa_continent                    0
orig_destination_distance    223367
user_id                           0
is_mobile                         0
is_package                        0
channel                           0
srch_ci                        1492
srch_co                        1492
srch_adults_cnt                   0
srch_children_cnt                 0
srch_rm_cnt                       0
srch_destination_type_id          0
is_booking                        0
cnt                               0
hotel_continent                   0
year                              0
month                       

In [12]:
rf_dat = train

We will deal with missing as we build new model based on KNN

# MODEL 1: Similarity based on user locations

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import DistanceMetric
from sklearn import cross_validation
import ml_metrics as metrics # for map@5 accuracy 

predictor_set1 = [c for c in rf_dat.columns if c in 
                  ['user_location_city', 'user_location_country', 'user_location_region', 'season', 'hotel_cluster']]
train_noDup  = rf_dat[predictor_set1] 
print("size of train set before removal of duplicates:", train_noDup.shape)
# We are dropping duplicate rows from training set 
train_noDup = train_noDup.drop_duplicates(subset=predictor_set1)
print("size of train set after removal of duplicates:", train_noDup.shape)
le = LabelEncoder()
train_noDup['season'] = le.fit_transform(train_noDup['season'])
predictors = [c for c in train_noDup.columns if c in ['user_location_city', 'user_location_country','user_location_region', 'season']]
train_dat = train_noDup[predictors]
train_label = train_noDup['hotel_cluster']

model1 = KNeighborsClassifier(n_neighbors=10, weights='uniform',algorithm='auto', leaf_size=50, metric = 'sokalsneath')
#fit the model
model1.fit(train_dat, train_label) 

from sklearn import cross_validation
scores = cross_validation.cross_val_score(model1, train_dat, train_label, cv=3)
print("Overall accuracy using 3 fold Cross validation", scores*100)

test['season'] = le.fit_transform(test['season'])
predictors = [c for c in train_noDup.columns if c in ['user_location_city', 'user_location_country','user_location_region', 'season']]
test_dat = test[predictors]
test_label = test['hotel_cluster']
ind_score  = model1.score(test_dat,test['hotel_cluster'])
print("Individual accuracy score: ", ind_score*100)

prediction_m1 = model1.predict(test_dat)

('size of train set before removal of duplicates:', (619172, 5))
('size of train set after removal of duplicates:', (271027, 5))
('Overall accuracy using 3 fold Cross validation', array([ 0.98040301,  0.98399486,  0.8371166 ]))
('Individual accuracy score: ', 0.58662419144927258)


# Model 2: Similarity based on hotel parameters

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import DistanceMetric
import ml_metrics as metrics
from sklearn import cross_validation

predictor_set2 = [c for c in rf_dat.columns if c in 
                  ['hotel_continent', 'hotel_country', 'hotel_cluster','season', 'day']]
train_noDup  = rf_dat[predictor_set2] 
print("size of train set before removal of duplicates:", train_noDup.shape)

# We are dropping duplicate rows from training set 
train_noDup = train_noDup.drop_duplicates(subset=predictor_set2)
print("size of train set after removal of duplicates:", train_noDup.shape)

le = LabelEncoder()
train_noDup['season'] = le.fit_transform(train_noDup['season'])

predictors = [c for c in train_noDup.columns if c in ['hotel_continent', 'hotel_country','season', 'day']]
train_dat = train_noDup[predictors]
train_label = train_noDup['hotel_cluster']

model2 = KNeighborsClassifier(n_neighbors=10, weights='uniform',algorithm='auto', leaf_size=60, metric = 'sokalsneath')
model2.fit(train_dat, train_label) 
scores = cross_validation.cross_val_score(model2, train_dat, train_label, cv=3)
print("Overall accuracy using 3 fold Cross validation-Model2", scores*100)

test_dat = test[predictors]
test_dat['season'] = le.fit_transform(test_dat['season'])
test_label = test['hotel_cluster']

#individual accuracy score
ind_score  = model2.score(test_dat,test['hotel_cluster'])
print("Individual accuracy score for model 2: ", ind_score*100)
prediction_m2 = model2.predict(test_dat)

('size of train set before removal of duplicates:', (619172, 5))
('size of train set after removal of duplicates:', (133853, 5))
('Overall accuracy using 3 fold Cross validation-Model2', array([ 1.30550642,  1.93876636,  1.72274563]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


('Individual accuracy score for model 2: ', 0.80612264719022542)


# Model 3: based on hotel_market

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import DistanceMetric
import ml_metrics as metrics

predictor_set3 = [c for c in rf_dat.columns if c in 
                  ['hotel_continent', 'hotel_market', 'srch_destination_id', 'hotel_cluster']]
train_noDup  = rf_dat[predictor_set3] 
print("size of train set before removal of duplicates:", train_noDup.shape)

# We are dropping duplicate rows from training set 
train_noDup = train_noDup.drop_duplicates(subset=predictor_set3)
print("size of train set after removal of duplicates:", train_noDup.shape)

predictors = [c for c in train_noDup.columns if c in ['hotel_continent', 'hotel_market', 'srch_destination_id']]
train_dat = train_noDup[predictors]
train_label = train_noDup['hotel_cluster']

test_dat = test[predictors]
test_label = test['hotel_cluster']


model3 = KNeighborsClassifier(n_neighbors=10, weights='uniform',algorithm='auto', leaf_size=60, metric = 'sokalsneath')
model3.fit(train_dat, train_label) 

scores = cross_validation.cross_val_score(model3, train_dat, train_label, cv=3)
print("Overall accuracy using 3 fold Cross validation-Model 3", scores*100)

#individual accuracy score
ind_score  = model3.score(test_dat,test["hotel_cluster"])
print("Individual accuracy score: ", ind_score*100)
prediction_m3 = model3.predict(test_dat)

('size of train set before removal of duplicates:', (619172, 4))
('size of train set after removal of duplicates:', (83395, 4))
('Overall accuracy using 3 fold Cross validation-Model 3', array([ 1.17139777,  1.00363322,  1.61708564]))
('Individual accuracy score: ', 2.1541928089974944)


# Model 4 : based on user and hotel srch parameters

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import DistanceMetric
import ml_metrics as metrics

predictor_set4 = [c for c in rf_dat.columns if c in 
                  ['hotel_continent', 'hotel_market', 'hotel_cluster','user_location_city', 'user_location_country', 'user_location_region']]
train_noDup  = rf_dat[predictor_set4] 
print("size of train set before removal of duplicates:", train_noDup.shape)

# We are dropping duplicate rows from training set 
train_noDup = train_noDup.drop_duplicates(subset=predictor_set4)
print("size of train set after removal of duplicates:", train_noDup.shape)

predictors = [c for c in train_noDup.columns if c in ['hotel_continent', 'hotel_market',
                                                      'user_location_city', 'user_location_country', 'user_location_region']]
train_dat = train_noDup[predictors]
train_label = train_noDup['hotel_cluster']
test_dat = test[predictors]
test_label = test['hotel_cluster']


model4 = KNeighborsClassifier(n_neighbors=10, weights='uniform',algorithm='auto', leaf_size=60, metric = 'sokalsneath')
model4.fit(train_dat, train_label) 
scores = cross_validation.cross_val_score(model4, train_dat, train_label, cv=3)
print("Overall accuracy using 3 fold Cross validation-Model 4", scores*100)

('size of train set before removal of duplicates:', (619172, 6))
('size of train set after removal of duplicates:', (373613, 6))


In [23]:
#individual accuracy score
ind_score  = model4.score(test_dat,test["hotel_cluster"])
print("Individual accuracy score: ", ind_score*100)

prediction_m4 = model4.predict(test_dat)

Individual accuracy score:  1.65217391304


# Model 5 : based on user and hotel locations

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import DistanceMetric
import ml_metrics as metrics

predictor_set5 = [c for c in rf_dat.columns if c in 
                  ['hotel_continent', 'hotel_country', 'hotel_cluster','user_location_city', 'user_location_country', 'user_location_region']]
train_noDup  = rf_dat[predictor_set5] 
print("size of train set before removal of duplicates:", train_noDup.shape)

# We are dropping duplicate rows from training set 
train_noDup = train_noDup.drop_duplicates(subset=predictor_set5)
print("size of train set after removal of duplicates:", train_noDup.shape)

predictors = [c for c in train_noDup.columns if c in ['hotel_continent', 'hotel_country',
                                                      'user_location_city', 'user_location_country', 'user_location_region']]
train_dat = train_noDup[predictors]
train_label = train_noDup['hotel_cluster']
test_dat = test[predictors]
test_label = test['hotel_cluster']


model5 = KNeighborsClassifier(n_neighbors=10, weights='uniform',algorithm='auto', leaf_size=60, metric = 'sokalsneath')
model5.fit(train_dat, train_label) 
scores = cross_validation.cross_val_score(model5, train_dat, train_label, cv=3)
print("Overall accuracy using 3 fold Cross validation-Model 5", scores*100)

#individual accuracy score
ind_score  = model5.score(test_dat,test["hotel_cluster"])
print("Individual accuracy score: ", ind_score*100)

prediction_m5 = model5.predict(test_dat)

size of train set before removal of duplicates: (91113, 6)
size of train set after removal of duplicates: (42975, 6)
Overall accuracy using 3 fold Cross validation-Model 5 [ 1.38589038  1.4586823   1.5537514 ]
Individual accuracy score:  1.52173913043


### Ensembling 1
We combine the output from five models and generate test prediction

In [25]:
import itertools
from operator import itemgetter
final_predction = []
for testrow in range(len(test)):
        pred_final = [prediction_m1[testrow],
                                  prediction_m2[testrow], 
                                  prediction_m3[testrow], 
                                  prediction_m4[testrow],
                                  prediction_m5[testrow]]
        final_predction.append(pred_final)  

### map@5 accuracy for second approach

In [26]:
import ml_metrics as metrics
target = [[l] for l in test['hotel_cluster']]
score = metrics.mapk(target, final_predction, k=5) 
accuracy = score*100
print ("Mean accuracy Precison Score is (in %): ",accuracy) 

Mean accuracy Precison Score is (in %):  2.35289855072


### KNN Model 6 -using Clicks and Book features 

Its hard to apply KNN on the categorical data. So We created some new features using Books and clicks rate 
based on different parameters as described in the script ~/Final/features-from-booksClicks.R

It contains features generated using books and click rate. 

below file already contains features created mentioed R script

In [3]:
train.columns.values

array(['Unnamed: 0', 'Unnamed: 0.1', 'user_location_country',
       'hotel_cluster', 'user_location_region', 'user_location_city',
       'hotel_country', 'hotel_market', 'srch_destination_id', 'date_time',
       'site_name', 'posa_continent', 'orig_destination_distance',
       'user_id', 'is_mobile', 'is_package', 'channel', 'srch_ci',
       'srch_co', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_destination_type_id', 'is_booking', 'cnt', 'hotel_continent',
       'year', 'month', 'day', 'hour', 'part_of_day', 'type_of_day',
       'season', 'is_alone', 'dest_feature_pc1', 'dest_feature_pc2',
       'dest_feature_pc3', 'clickRate', 'bookRate', 'clickMRate',
       'bookMRate', 'clickHCRate', 'bookHCRate', 'clickURate', 'bookURate',
       'clickUCRRate', 'bookUCRRate', 'clickUCRate', 'bookUCRate',
       'popScoreHmc_Count', 'popScoreU_Count', 'popScoreSite_Count',
       'popScoreDest_Count', 'popScoreCityDest_Count',
       'popScoreCityH_Count'], dtype=ob

In [4]:
train.isnull().values.any()

True

In [8]:
train.iloc[1:3, 37:49]

Unnamed: 0,clickRate,bookRate,clickMRate,bookMRate,clickHCRate,bookHCRate,clickURate,bookURate,clickUCRRate,bookUCRRate,clickUCRate,bookUCRate
1,0.015,0.014,0.016,0.013,0.017,0.01,0.2,0.0,0.001,0.002,0.001,0.002
2,0.223,0.278,0.248,0.298,0.025,0.028,0.0,0.154,0.0,0.028,0.003,0.006


In [9]:
train = train.replace([np.inf, -np.inf], np.nan)

In [56]:
train['bookRate'].fillna(77 ,inplace=True)

In [61]:
test.describe()

Unnamed: 0.1,Unnamed: 0,clickRate,bookRate,clickMRate,bookMRate,clickHCRate,bookHCRate,clickURate,bookURate,clickUCRRate,bookUCRRate,clickUCRate,bookUCRate,hotel_cluster
count,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0
mean,1150.5,34.048719,33.982101,16.385627,17.725911,1.579967,1.755049,49.868321,51.38179,11.167537,11.217202,2.942194,2.945085,47.323478
std,664.097131,46.945136,46.993153,36.703514,37.862634,12.287351,12.940181,49.485457,49.444152,31.295232,31.34562,16.770058,16.769562,28.229712
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0
25%,575.75,0.048,0.0,0.02675,0.0,0.013,0.011,0.022,0.0,0.009,0.002,0.009,0.008,25.0
50%,1150.5,0.164,0.011,0.067,0.059,0.019,0.021,99.0,99.0,0.015,0.017,0.013,0.015,46.0
75%,1725.25,99.0,99.0,0.2,0.271,0.033,0.038,99.0,99.0,0.03,0.038,0.01725,0.025,70.0
max,2300.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [11]:
Predset_var = ['clickRate', 'bookRate', 'clickMRate', 'bookMRate',
       'clickHCRate','bookHCRate', 'clickURate', 'bookURate',
       'clickUCRRate', 'bookUCRRate', 'clickUCRate', 'bookUCRate']

for col in Predset_var:
    train[col].fillna(77 ,inplace=True)
    test[col].fillna(77 ,inplace=True)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
Knn8_model = KNeighborsClassifier(n_neighbors=3)

Knn8_model.fit(train[Predset_var], train['hotel_cluster']) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

### Result on the validation set

In [15]:
testScore = Knn8_model.score(test[Predset_var], test['hotel_cluster'])
print("Accuracy in % is: using KNN model with features using book clicks rate is:", testScore*100)

('Accuracy in % is: using KNN model with features using book clicks rate is:', 2.1852722363590451)


In [16]:
Predset_var = ['popScoreHmc_Count', 'popScoreU_Count', 'popScoreSite_Count',
       'popScoreDest_Count', 'popScoreCityDest_Count',
       'popScoreCityH_Count']
for col in Predset_var:
    train[col].fillna(0.0 ,inplace=True)
    test[col].fillna(0.8 ,inplace=True)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
Knn9_model = KNeighborsClassifier(n_neighbors=3)

Knn9_model.fit(train[Predset_var], train['hotel_cluster']) 
testScore = Knn9_model.score(test[Predset_var], test['hotel_cluster'])
print("Accuracy in % is: using KNN model with features using book clicks rate is:", testScore*100)

('Accuracy in % is: using KNN model with features using book clicks rate is:', 6.3149511470251163)


In [19]:
Predset_var1 = ['clickRate', 'bookRate', 'clickMRate', 'bookMRate',
       'clickHCRate','bookHCRate', 'clickURate', 'bookURate',
       'clickUCRRate', 'bookUCRRate', 'clickUCRate', 'bookUCRate']

In [20]:
prediction_m1 = Knn8_model.predict(test[Predset_var1])
prediction_m3 = Knn9_model.predict(test[Predset_var])

In [22]:
import itertools
from operator import itemgetter
final_predction = []
for testrow in range(len(test)):
        pred_final = [prediction_m3[testrow],
                                  prediction_m1[testrow]]
        final_predction.append(pred_final)  

In [23]:
import ml_metrics as metrics
target = [[l] for l in test['hotel_cluster']]
score = metrics.mapk(target, final_predction, k=2) 
accuracy = score*100
print ("Mean accuracy Precison Score is (in %): ",accuracy) 

('Mean accuracy Precison Score is (in %): ', 7.3425147141663913)
