<h3>Kaggle Challenge</h3>
<h1>Expedia Hotel Recommendations</h1>
<hr style="height:2px;border:none;color:#333;background-color:#333;"/>
<b>Part II - Data Processing</b>

### Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
sns.set(style="whitegrid")
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 7.0)


import time
import datetime

In [21]:
# Number of training rows to read: None loads the whole file, while an
# integer (e.g. 3000000) loads only that many rows for quicker experiments.
ROWS = None

# The timestamps printed before and after the read show how long it took.
print(datetime.datetime.now())
expediaDF = pd.read_csv(
    'data/train.csv.gz',
    compression='gzip',
    nrows=ROWS,
    error_bad_lines=False,
)
print(datetime.datetime.now())

2019-04-07 12:16:26.511319
2019-04-07 12:23:09.316620


In [22]:
# Read the complete test file (add nrows=100000 to the call for a quick
# sample); the timestamps on either side time the read.
print(datetime.datetime.now())
testingDF = pd.read_csv(
    'data/test.csv.gz',
    compression='gzip',
    error_bad_lines=False,
)
print(datetime.datetime.now())

2019-04-07 12:23:09.383402
2019-04-07 12:23:29.399872


### Handle missing Values

In [23]:
# One row per column of expediaDF: the number of nulls and the share of all
# rows they represent, ordered from most to fewest nulls.
missingValues = (
    expediaDF.isnull()
    .sum()
    .sort_values(ascending=False)
    .rename_axis("feature")
    .reset_index(name="count")
)
missingValues["percentage"] = missingValues["count"] / len(expediaDF)
print("Missing Values: ")
# Display only the columns that actually contain nulls.
missingValues.loc[missingValues["count"] > 0]

Missing Values: 


Unnamed: 0,feature,count,percentage
0,orig_destination_distance,13525001,0.359036
1,srch_co,47084,0.00125
2,srch_ci,47083,0.00125


- Removing rows with a missing check-in (`srch_ci`) or check-out (`srch_co`) date

In [24]:
# Drop every row in which srch_co or srch_ci is null, then report how many
# rows that removed and what share of the dataset they were.
countBefore = len(expediaDF)
newExpediaDF = expediaDF.dropna(subset=['srch_co', 'srch_ci'], how='any')
countAfter = len(newExpediaDF)
deletedRows = countBefore - countAfter
print("Rows deleted: ", deletedRows, "- % of the dataset: ", format(deletedRows / countBefore, ".2%"))

Rows deleted:  47088 - % of the dataset:  0.13%


In [None]:
# Release the raw frame; the following cells work on newExpediaDF instead.
del expediaDF

- Removing the column orig_destination_distance

In [None]:
# orig_destination_distance is null in about 36% of the rows (see the
# missing-values table above); remove the whole column.
newExpediaDF = newExpediaDF.drop(columns=['orig_destination_distance'])

In [None]:
# Sanity check: count the columns that still contain at least one null.
print("No. of columns with missing values: ", newExpediaDF.isnull().any(axis=0).sum())

### Handle the outliers

In [None]:
# Remove outlier rows: a row is kept only when every numeric column lies
# within 4 standard deviations of that column's mean.
countBefore = newExpediaDF.shape[0]
num_train = newExpediaDF.select_dtypes(include=["number"])
cat_train = newExpediaDF.select_dtypes(exclude=["number"])
# Compare the absolute z-score: a signed "< 4" test is true for every
# negative z-score, so extreme low values would never be removed.
idx = np.all(np.abs(stats.zscore(num_train)) < 4, axis=1)
countAfter = np.sum(idx)
deletedRows = countBefore - countAfter
# Re-assemble the kept rows: numeric columns first, then the non-numeric ones.
train_cleaned = pd.concat([num_train.loc[idx], cat_train.loc[idx]], axis=1)
print("Rows deleted: ", deletedRows, "- % of the dataset: ", "{:.2%}".format(deletedRows / countBefore))

In [None]:
# Release the intermediates of the outlier step; the following cells only
# use train_cleaned.
del num_train, cat_train, idx
del newExpediaDF, countBefore, countAfter

### Handle categorical values

In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_label(df):
    """Return ``df`` with its object columns label-encoded and everything cast to float.

    Every object-dtype column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``; all other columns are passed through.
    In the result the passed-through columns come first, followed by the
    encoded columns, and the whole frame has dtype ``float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new all-float frame with the same rows as ``df``.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    if len(object_columns) == 0:
        # Nothing to encode: only the float cast applies.
        return df.astype(float)

    # Encode straight from a column selection instead of taking extra full
    # copies of the frame; each column gets its own freshly fitted encoder.
    encoded = df[object_columns].apply(lambda column: LabelEncoder().fit_transform(column))
    passthrough = df.drop(object_columns, axis=1)

    return pd.concat([passthrough, encoded], axis=1).astype(float)

In [None]:
# Label-encode the object columns and cast the whole frame to float.
train_cleaned = encode_label(train_cleaned)

In [None]:
# Rebind the cleaned, encoded frame as trainingDF and drop the old name.
trainingDF = train_cleaned
del train_cleaned

### Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

In [None]:
# Features are every column except the target; the target is hotel_cluster.
# (The former `X_test = trainingDF` line was dropped: it bound the full frame,
# target column included, to X_test and is overwritten by train_test_split.)
X = trainingDF.drop('hotel_cluster', axis=1)
y = trainingDF["hotel_cluster"]

In [None]:
# NOTE(review): this unseeded 60/40 split is overwritten by the seeded split
# (random_state=0, default test size) in the scaling cell further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

#### - Splitting the data and scaling the features (scaler fitted on the training split only)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

print(datetime.datetime.now())

# Seeded split, so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# The scaler is fitted on the training split only and then applied to both
# splits with those training-derived parameters.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(datetime.datetime.now())


#### - Train the model

In [None]:
from sklearn.tree import DecisionTreeClassifier
print(datetime.datetime.now())

# Seed the estimator so the accuracies printed below are reproducible between
# runs (the train/test split above is seeded as well).
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
print(datetime.datetime.now())


In [20]:
from sklearn.ensemble import RandomForestClassifier
print(datetime.datetime.now())

# Seed the forest so the accuracies printed below are reproducible between
# runs. `clf` is rebound here, so later cells evaluate this model.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
print(datetime.datetime.now())

2019-04-07 11:25:15.359399
Accuracy of Decision Tree classifier on training set: 0.96
Accuracy of Decision Tree classifier on test set: 0.25
2019-04-07 11:32:30.821137


### Validation

In [None]:
import ml_metrics as metrics
# NOTE(review): cross_validation is not referenced in the cells visible below,
# and sklearn.cross_validation was, as far as I know, removed in
# scikit-learn 0.20 in favour of sklearn.model_selection — confirm it is needed.
from sklearn import cross_validation

In [None]:
best_model = clf
y_test_predicted_probability = best_model.predict_proba(X_test)

# Per row, column positions ordered by ascending predicted probability.
prob = y_test_predicted_probability.argsort()
# Take the five most probable columns (most probable first) and translate the
# column positions into class labels: predict_proba columns follow
# best_model.classes_, which matches the raw position only when the labels
# happen to be exactly 0..n-1.
predictions = best_model.classes_[prob[:, -5:][:, ::-1]].tolist()

#### - Evaluating the predictions

In [None]:
# Wrap each true label in a one-element list: one list of relevant items per row.
targ = [[l] for l in y_test]
# Score the top-5 lists built in the previous cell; that cell names them
# `predictions` (`preds` was never defined and raised a NameError).
score = metrics.mapk(targ, predictions, k=5)
print('Accuracy of the predictions (MAP@5): {:.2f}'
     .format(score))

knn, dt,rf
500 => 0.27
1M => 0.27
2M => 0.24, 0.26
3M => ,0.26/0.278/0.25