<h3>Kaggle Challenge</h3>
<h1>Expedia Hotel Recommendations</h1>
<hr style="height:2px;border:none;color:#333;background-color:#333;"/>
<b>Part II - Data Processing</b>

### Imports

In [1]:
%%time
# Core numerical / tabular stack.
import numpy as np
import pandas as pd
# Let wide frames display up to 500 columns instead of being truncated.
pd.set_option('display.max_columns', 500)

# Plotting and statistics helpers.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
sns.set(style="whitegrid")
%matplotlib inline
# Default size applied to every matplotlib figure created in this notebook.
matplotlib.rcParams['figure.figsize'] = (10.0, 7.0)


# Standard library; `datetime` is used for the wall-clock prints in later cells.
import time
import datetime

CPU times: user 1.43 s, sys: 419 ms, total: 1.85 s
Wall time: 9.79 s


In [2]:
%%time
# ROWS = 1000000
ROWS = None

print(datetime.datetime.now())
expediaDF = pd.read_csv('data/train.csv.gz', 
                        nrows=ROWS,
                        compression='gzip',
                        error_bad_lines=False)
print(datetime.datetime.now())

2019-04-10 20:20:08.666376
2019-04-10 20:25:26.242976
CPU times: user 3min 39s, sys: 1min 2s, total: 4min 41s
Wall time: 5min 17s


In [3]:
# Report the dimensions of the raw training frame, then preview its first rows.
print("Shape:", expediaDF.shape)
expediaDF.head()

Shape: (37670293, 24)


Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,9,2014-08-27,2014-08-31,2,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,9,2014-08-29,2014-09-02,2,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,9,2014-08-29,2014-09-02,2,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,3,2014-11-23,2014-11-28,2,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,3,2014-11-23,2014-11-28,2,0,1,14984,1,0,1,2,50,1457,21


In [4]:
print(datetime.datetime.now())
# `on_bad_lines='skip'` silently drops malformed rows. It replaces the
# `error_bad_lines=False` flag, which was deprecated in pandas 1.3 and removed
# in pandas 2.0 (where passing it raises a TypeError).
testingDF = pd.read_csv('data/test.csv.gz',
#                         nrows=100000,
                        compression='gzip',
                        on_bad_lines='skip')
print(datetime.datetime.now())

2019-04-10 20:25:26.666470
2019-04-10 20:25:44.413969


In [5]:
# Report the dimensions of the raw test frame, then preview its first rows.
print("Shape:", testingDF.shape)
testingDF.head()

Shape: (2528243, 22)


Unnamed: 0,id,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,hotel_continent,hotel_country,hotel_market
0,0,2015-09-03 17:09:54,2,3,66,174,37449,5539.0567,1,1,0,3,2016-05-19,2016-05-23,2,0,1,12243,6,6,204,27
1,1,2015-09-24 17:38:35,2,3,66,174,37449,5873.2923,1,1,0,10,2016-05-12,2016-05-15,2,0,1,14474,7,6,204,1540
2,2,2015-06-07 15:53:02,2,3,66,142,17440,3975.9776,20,0,0,1,2015-07-26,2015-07-27,4,0,1,11353,1,2,50,699
3,3,2015-09-14 14:49:10,2,3,66,258,34156,1508.5975,28,0,1,10,2015-09-14,2015-09-16,2,0,1,8250,1,2,50,628
4,4,2015-07-17 09:32:04,2,3,66,467,36345,66.7913,50,0,0,0,2015-07-22,2015-07-23,2,0,1,11812,1,2,50,538


### Sampling the data

The strategy here is to build the model on the last 6 months of 2014 only (events from '2014-07-01' onwards), in order to mirror the situation in the test set.

In [6]:
%%time
expediaDF['date_time'] = expediaDF['date_time'].astype('datetime64[ns]')
expediaDF = expediaDF[expediaDF['date_time'] > datetime.date(2014,7,1)]
print("Shape: ",expediaDF.shape)

Shape:  (16571429, 24)
CPU times: user 15.1 s, sys: 27.6 s, total: 42.7 s
Wall time: 1min 17s


### Handle missing Values

#### - Training Dataset

In [7]:
# Per-column null counts of the training frame, largest first, together with
# the fraction of rows each count represents. Only columns that actually have
# missing values are displayed.
missingValues = (
    expediaDF.isnull()
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
missingValues.columns = ["feature", "count"]
missingValues["percentage"] = missingValues["count"] / len(expediaDF)
print("Missing Values: ")
missingValues.loc[missingValues["count"] > 0]

Missing Values: 


Unnamed: 0,feature,count,percentage
0,orig_destination_distance,6122098,0.369437
1,srch_co,5292,0.000319
2,srch_ci,5289,0.000319


- Removing rows with a null check-in date, check-out date, or origin-destination distance

In [8]:
%%time
countBefore = expediaDF.shape[0]
newExpediaDF = expediaDF.dropna(subset=['srch_co', 'srch_ci','orig_destination_distance'])
countAfter = newExpediaDF.shape[0]
deletedRows = countBefore - countAfter
print("Rows deleted: ", deletedRows, "- % of the dataset: ", "{:.2%}".format(deletedRows / countBefore))

Rows deleted:  6125275 - % of the dataset:  36.96%
CPU times: user 6.2 s, sys: 8.84 s, total: 15 s
Wall time: 18.2 s


In [9]:
# Dimensions of the training frame after the null rows were removed.
newExpediaDF.shape

(10446154, 24)

In [10]:
# Drop the reference to the unfiltered frame; it is not used again below.
del expediaDF

- Removing the column orig_destination_distance (disabled: the rows with a missing distance were already dropped above, so the column is kept)

In [None]:
# newExpediaDF = newExpediaDF.drop(['orig_destination_distance'], axis=1)

In [11]:
# Sanity check: count the columns that still contain at least one null.
print("No. of columns with missing values: ", newExpediaDF.isnull().any().sum())

No. of columns with missing values:  0


#### - Testing Dataset

In [12]:
# Per-column null counts of the test frame, largest first, together with the
# fraction of rows each count represents. Only columns that actually have
# missing values are displayed.
missingValues = (
    testingDF.isnull()
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
missingValues.columns = ["feature", "count"]
missingValues["percentage"] = missingValues["count"] / len(testingDF)
print("Missing Values: ")
missingValues.loc[missingValues["count"] > 0]

Missing Values: 


Unnamed: 0,feature,count,percentage
0,orig_destination_distance,847461,0.335198
1,srch_ci,21,8e-06
2,srch_co,17,7e-06


- Filling missing values with 0

In [13]:
# Replace every null in the test frame with 0. This applies to all columns,
# so the missing srch_ci / srch_co dates become 0 as well as the distances.
testingDF = testingDF.fillna(0)

- Include the is_booking column (every row in the test set is a booking)

In [14]:
# The test file has no is_booking column; add it with the constant value 1.
testingDF['is_booking'] = 1

### Handle the outliers

In [15]:
%%time
#removing outliers
countBefore = newExpediaDF.shape[0]
num_train = newExpediaDF.select_dtypes(include=["number"])
cat_train = newExpediaDF.select_dtypes(exclude=["number"])
idx = np.all(stats.zscore(num_train) < 3, axis=1)
countAfter = np.sum(idx)
deletedRows = countBefore - countAfter
train_cleaned = pd.concat([num_train.loc[idx], cat_train.loc[idx]], axis=1)
print("Rows deleted: ", deletedRows, "- % of the dataset: ", "{:.2%}".format(deletedRows / countBefore))

Rows deleted:  2485560 - % of the dataset:  23.79%
CPU times: user 11.5 s, sys: 28.6 s, total: 40.1 s
Wall time: 50.9 s


In [16]:
# Dimensions of the training frame after the outlier filter.
train_cleaned.shape

(7960594, 24)

In [17]:
%%time
del num_train
del cat_train
del idx
del newExpediaDF
del countBefore
del countAfter

CPU times: user 34 µs, sys: 1.24 ms, total: 1.28 ms
Wall time: 1.29 ms


### Handle categorical values

In [18]:
%%time
from sklearn.preprocessing import LabelEncoder
def encode_label(df):
    """Label-encode every object-dtype column of ``df`` and cast all to float.

    A ``LabelEncoder`` is re-fitted on each object column independently. The
    encoded columns are placed at the end of the result, after the columns
    that were left untouched.

    NOTE(review): the encoder is fitted inside each call, so the integer codes
    produced for two different frames (e.g. training and test) do not share a
    mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-object columns followed by the encoded object columns, with
        every column cast to float.
    """
    # select_dtypes and apply both return new frames, so the input is never
    # mutated and no defensive copy of the (large) frame is required.
    X_cat = df.select_dtypes(include=['object'])
    X_enc = X_cat.apply(LabelEncoder().fit_transform)
    mergedata = df.drop(X_cat.columns, axis=1)

    return pd.concat([mergedata,X_enc], axis=1).astype(float)

CPU times: user 664 ms, sys: 2.43 s, total: 3.09 s
Wall time: 6.31 s


In [19]:
# Cast the timestamp of the training frame to plain strings so that
# encode_label treats it as an object column.
train_cleaned["date_time"] = train_cleaned["date_time"].astype(str)

# Same for the three date-like columns of the test frame.
for column in ("date_time", "srch_ci", "srch_co"):
    testingDF[column] = testingDF[column].astype(str)

In [20]:
%%time
# Label-encode the object columns of both frames and cast everything to float.
# NOTE(review): each call fits its own encoder, so the codes assigned in the
# training frame and in the test frame are independent of each other.
train_cleaned = encode_label(train_cleaned)
testingDF = encode_label(testingDF)

CPU times: user 2min 8s, sys: 37.2 s, total: 2min 45s
Wall time: 3min 29s


In [21]:
# Dimensions of the training frame after label encoding.
train_cleaned.shape

(7960594, 24)

In [22]:
# Bind the cleaned, encoded frame to its final name and remove the old name.
trainingDF = train_cleaned
del train_cleaned

### Modeling

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

In [24]:
%%time
X = trainingDF.drop('hotel_cluster', axis=1)
y = trainingDF["hotel_cluster"]
X_test = trainingDF

CPU times: user 962 ms, sys: 2.81 s, total: 3.77 s
Wall time: 4.49 s


In [25]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

CPU times: user 6.43 s, sys: 4.33 s, total: 10.8 s
Wall time: 14.3 s


#### - Splitting the data and scaling the features

In [26]:
%%time
print(datetime.datetime.now())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(datetime.datetime.now())


2019-04-10 20:33:16.783151
2019-04-10 20:33:38.062463
CPU times: user 8.73 s, sys: 9.15 s, total: 17.9 s
Wall time: 21.3 s


#### - Train the model

In [27]:
%%time
from sklearn.tree import DecisionTreeClassifier
print(datetime.datetime.now())

clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
print(datetime.datetime.now())


2019-04-10 20:33:38.924379
Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.26
2019-04-10 20:42:41.286748
CPU times: user 6min 27s, sys: 56.2 s, total: 7min 24s
Wall time: 9min 3s


In [None]:
from sklearn.ensemble import RandomForestClassifier
print(datetime.datetime.now())

# Fit a random forest with default hyper-parameters, then report its accuracy
# on the training split and on the held-out split. The printed label now says
# "Random Forest", matching the estimator that is actually trained.
rfc = RandomForestClassifier().fit(X_train, y_train)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(rfc.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(rfc.score(X_test, y_test)))
print(datetime.datetime.now())

In [None]:
def predictAndEvaluate(model):
    """Fit ``model()`` on the training split and print accuracy and MAP@5.

    Reads the notebook-level names ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``, and the ``ml_metrics`` module bound to ``metrics``.

    Parameters
    ----------
    model : callable
        Zero-argument factory (e.g. an estimator class). The object it returns
        must provide ``fit`` (returning the fitted estimator), ``score``,
        ``predict_proba`` and ``classes_``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print(datetime.datetime.now())

    m = model().fit(X_train, y_train)
    print('Accuracy on training set: {:.2f}'.format(m.score(X_train, y_train)))
    print('Accuracy on test set: {:.2f}'.format(m.score(X_test, y_test)))
    print(datetime.datetime.now())
    # Evaluate the estimator fitted above. Previously this read the unrelated
    # notebook global `best_model`, so every call scored the same model.
    y_test_predicted_probability = m.predict_proba(X_test)

    # argsort sorts column positions by ascending probability; the slice takes
    # the last five in reverse order, i.e. the five most probable, best first.
    top_positions = np.argsort(y_test_predicted_probability, axis=1)[:, :-6:-1]
    # predict_proba columns follow m.classes_, so translate positions into the
    # actual class labels before comparing them with y_test.
    predictions = m.classes_[top_positions].tolist()

    targ = [[l] for l in y_test]
    score = metrics.mapk(targ, predictions, k=5)
    print('Accuracy of the predictions (MAP@5): {:.2f}'.format(score))

### Validation

In [28]:
import ml_metrics as metrics

In [29]:
%%time
best_model = clf
y_test_predicted_probability = best_model.predict_proba(X_test)

prob = y_test_predicted_probability.argsort()
predictions = []
for p in prob:
    predictions.append(list(reversed(p[-5:])))

CPU times: user 19.4 s, sys: 20 s, total: 39.5 s
Wall time: 1min 11s


#### - Evaluating the predictions

In [30]:
%%time
targ = [[l] for l in y_test]
score = metrics.mapk(targ, predictions, k=5)
print('Accuracy of the predictions (MAP@5): {:.2f}'
     .format(score))

Accuracy of the predictions (MAP@5): 0.27
CPU times: user 37.5 s, sys: 980 ms, total: 38.4 s
Wall time: 46.5 s


### Generating result file

In [31]:
%%time
print(datetime.datetime.now())
predicted_probability = best_model.predict_proba(testingDF)

prob = predicted_probability.argsort()

result = []
for idx, p in enumerate(prob):
    p = prob[idx]
    predictions = list(reversed(p[-5:]))
    strPredictions = " ".join(str(v) for v in predictions)
    result.append(strPredictions)
    
resultDF = pd.DataFrame(result, columns=['hotel_cluster'])
resultDF = pd.concat([testingDF.id, resultDF.hotel_cluster], axis=1)

print(datetime.datetime.now())
resultDF.hotel_cluster.to_csv('predicted_with_pandas.csv',header=True, index_label='id')

2019-04-10 20:44:40.322007
2019-04-10 20:45:36.172102
CPU times: user 38.4 s, sys: 12.7 s, total: 51.2 s
Wall time: 1min 2s
