In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The training data has about 37 million rows, so I will use the first 10,000 rows as a sample to keep model training time manageable.

In [None]:
# Read only the first 10,000 rows of the training CSV into the working sample.
train_csv_path = '/kaggle/input/expedia-hotel-recommendations/train.csv'
sample_df = pd.read_csv(train_csv_path, nrows=10000)

In [None]:
# Preview the first five rows of the sample.
sample_df.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how many sampled rows fall under each hotel_continent value.
sns.countplot(data=sample_df, x='hotel_continent')

In [None]:
# Annotated correlation heatmap of the sample.
# Only numeric columns are correlated: the date columns have not been parsed
# yet at this point in the notebook, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.
fig, ax = plt.subplots(figsize=(20, 15))
numeric_corr = sample_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm', ax=ax, annot=True, linewidths=2)

In [None]:
# Number of sampled rows per hotel_cluster value, drawn on a 13x8 inch figure.
fig, ax = plt.subplots(figsize=(13, 8))
sns.countplot(data=sample_df, x='hotel_cluster', ax=ax)

In [None]:
# Number of sampled rows per is_package value; bars are forced into 0, 1 order.
fig, ax = plt.subplots(figsize=(13, 8))
sns.countplot(data=sample_df, x='is_package', order=[0, 1], ax=ax)

Converting the date columns into month attributes, because the month best captures seasonal patterns.

In [None]:
# Parse the three date columns into datetime values.
for date_col in ('srch_ci', 'srch_co', 'date_time'):
    sample_df[date_col] = pd.to_datetime(sample_df[date_col])

# Derive month features with the vectorized .dt accessor rather than a
# per-row Python lambda; missing dates (NaT) still yield NaN months.
sample_df['check_in_month'] = sample_df['srch_ci'].dt.month
sample_df['date_time_month'] = sample_df['date_time'].dt.month

In [None]:
# Plotting month distribution of the rows that are actual bookings.
fig, ax = plt.subplots(figsize=(13, 8))
booked_df = sample_df[sample_df["is_booking"] == 1]
# The column must be passed as x=...: recent seaborn releases take `data` as
# the first positional parameter and make the rest keyword-only, so a bare
# positional column name is misread as the dataset.
sns.countplot(x='date_time_month', data=booked_df, order=list(range(1, 13)), ax=ax)


Adding days spent variable.

In [None]:
# Length of stay in whole days: check-out minus check-in.
# The intermediate timedelta is kept in a local variable, so no temporary
# column has to be added to the frame and dropped again.
stay_length = sample_df['srch_co'] - sample_df['srch_ci']
sample_df = sample_df.assign(days_spent=stay_length.dt.days)

Dealing with NA values

In [None]:
# Count missing values per column.
sample_df.isna().sum()

orig_destination_distance is the big offender. There are also NA values in srch_ci and srch_co, but since there are only 7 of them I will remove those rows.

In [None]:
# Re-wrap the sample in a DataFrame. NOTE(review): sample_df already comes from
# pd.read_csv, so this looks redundant - confirm before removing.
sample_df = pd.DataFrame(data=sample_df)

In [None]:
# Preview the first five rows, now including the derived columns.
sample_df.head(n=5)

This dropna call removes the 7 offending NA values.

In [None]:
# Drop rows with a missing check-in OR check-out date. Checking both columns
# matters because a row missing only srch_co would otherwise survive with a
# NaN days_spent, and NaN features break model fitting later on.
sample_df = sample_df.dropna(subset=['srch_ci', 'srch_co'])
sample_df.isnull().sum()

Then we must deal with orig_destination_distance. I will replace its missing values with the column's mean.

In [None]:
# Mean-impute the missing origin-to-destination distances.
dist_mean = sample_df['orig_destination_distance'].mean()
sample_df = sample_df.assign(
    orig_destination_distance=sample_df['orig_destination_distance'].fillna(dist_mean)
)

Checking that there are no more NaN values; indeed, none remain.

In [None]:
# Re-count missing values per column after the imputation.
sample_df.isna().sum()

We must now standardize the orig_destination_distance variable

In [None]:
# Z-score orig_destination_distance: subtract the mean, divide by the std.
distance = sample_df['orig_destination_distance']
odd_mean = distance.mean()
odd_std = distance.std()

sample_df['orig_destination_distance'] = distance.sub(odd_mean).div(odd_std)

Checking that mean=0 and std=1

In [None]:
# Print the mean, then the std, of the standardized distance column.
standardized_distance = sample_df['orig_destination_distance']
for statistic in (standardized_distance.mean(), standardized_distance.std()):
    print(statistic)

orig_destination_distance is now standardized

We must also standardize the days_spent variable

In [None]:
# Z-score days_spent: subtract the mean, divide by the std.
days = sample_df['days_spent']
ds_mean = days.mean()
ds_std = days.std()
sample_df['days_spent'] = days.sub(ds_mean).div(ds_std)

Checking that mean=0 and std=1

In [None]:
# Print the mean, then the std, of the standardized days_spent column.
standardized_days = sample_df['days_spent']
for statistic in (standardized_days.mean(), standardized_days.std()):
    print(statistic)

days_spent is now standardized

Standardizing srch_adults_cnt, srch_children_cnt, srch_rm_cnt and cnt

In [None]:
# Z-score the four count columns (adults, children, cnt, rooms).
# The per-column statistics keep their own names so they stay available to
# any later cell that needs them.
ad_mean = sample_df['srch_adults_cnt'].mean()
ad_std = sample_df['srch_adults_cnt'].std()
ch_mean = sample_df['srch_children_cnt'].mean()
ch_std = sample_df['srch_children_cnt'].std()

cnt_mean = sample_df['cnt'].mean()
cnt_std = sample_df['cnt'].std()

room_mean = sample_df['srch_rm_cnt'].mean()
room_std = sample_df['srch_rm_cnt'].std()

sample_df['srch_adults_cnt'] = (sample_df['srch_adults_cnt'] - ad_mean) / ad_std
sample_df['srch_children_cnt'] = (sample_df['srch_children_cnt'] - ch_mean) / ch_std
sample_df['cnt'] = (sample_df['cnt'] - cnt_mean) / cnt_std
sample_df['srch_rm_cnt'] = (sample_df['srch_rm_cnt'] - room_mean) / room_std

# Sanity check: print the mean and then the std of EVERY standardized column.
# Looping over the column names guarantees each column gets both checks;
# hand-written print pairs are easy to duplicate or mismatch.
for count_col in ('srch_adults_cnt', 'srch_children_cnt', 'cnt', 'srch_rm_cnt'):
    print(sample_df[count_col].mean())
    print(sample_df[count_col].std())

print(sample_df.columns)

There are many variables that are categorical but not ordinal. These are hotel_continent, hotel_country, hotel_market, user_location_country, user_location_region, user_location_city, site_name, posa_continent, check_in_month, date_time_month and channel. I will use one-hot encoding on all of these variables.

In [None]:
# Nominal (non-ordinal) columns to be one-hot encoded.
categorical_columns = [
    'hotel_continent',
    'hotel_country',
    'hotel_market',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'site_name',
    'posa_continent',
    'check_in_month',
    'date_time_month',
    'channel',
]
# Variant of the frame with the categorical columns removed outright; it must
# be built before sample_df is replaced by its encoded version below.
alternative_df = sample_df.drop(categorical_columns, axis=1)
# Replace each categorical column with one indicator column per distinct value.
sample_df = pd.get_dummies(sample_df, columns=categorical_columns)

Certain variables have already been used or will not be useful for model processing. These are date_time, srch_ci, srch_co and user_id.

In [None]:
# Remove the raw date columns and the user identifier from the feature frame.
column_drops = ['date_time', 'srch_ci', 'srch_co', 'user_id']
sample_df = sample_df.drop(column_drops, axis=1)

Taking target variable and storing it as y

In [None]:
# Target vector: the hotel_cluster column.
y = sample_df.loc[:, 'hotel_cluster']

Taking dataset without hotel cluster and storing as x

In [None]:
# Feature matrix: every column except the hotel_cluster target.
x = sample_df.drop('hotel_cluster', axis=1)

Train_test_split set up

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

Model test 1: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) so the forest,
# its predictions and its feature importances are identical on every run.
rf_model = RandomForestClassifier(random_state=10)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)



In [None]:
# Positions of the ten largest feature importances, in ascending order of
# importance. A stable argsort keeps tied features in their original order.
most_important = rf_model.feature_importances_
index_list = np.argsort(most_important, kind='stable')[-10:].tolist()
# Column names matching those positions.
impFeatures = [x.columns[position] for position in index_list]



In [None]:
# Print each selected importance rounded to 3 decimals, then the feature names.
for importance in rf_model.feature_importances_[index_list]:
    print (round(importance,3))
print(impFeatures)

Checking accuracy

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
# Score the random-forest predictions against the held-out labels.
rf_accuracy = metrics.accuracy_score(y_test, y_pred_rf)
# NOTE(review): hotel_cluster is used as a class label here, so a squared
# error over label values is probably not meaningful - confirm it is wanted.
rf_mse = metrics.mean_squared_error(y_test, y_pred_rf)
print(rf_accuracy)
print(rf_mse)

Model test 2: K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default settings (fit returns the
# estimator itself, so it can be chained), then predict the held-out rows.
kn_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred_kn = kn_model.predict(X_test)

Checking accuracy

In [None]:
# Score the k-nearest-neighbours predictions against the held-out labels.
kn_accuracy = metrics.accuracy_score(y_test, y_pred_kn)
# NOTE(review): hotel_cluster is used as a class label here, so a squared
# error over label values is probably not meaningful - confirm it is wanted.
kn_mse = metrics.mean_squared_error(y_test, y_pred_kn)
print(kn_accuracy)
print(kn_mse)

Model test 3: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) so tie-breaking
# between equally good splits, and therefore the score, is repeatable.
model_dt = DecisionTreeClassifier(random_state=10)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# Accuracy on the held-out rows.
print(metrics.accuracy_score(y_test, y_pred_dt))

Model test 4: MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
# Fit a multi-layer perceptron (Adam solver) and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) so weight
# initialisation and batch shuffling, and therefore the score, are repeatable.
model_nn = MLPClassifier(solver='adam', random_state=10)
model_nn.fit(X_train, y_train)
y_pred_nn = model_nn.predict(X_test)

# Accuracy on the held-out rows.
print(metrics.accuracy_score(y_test, y_pred_nn))


In [None]:
# Show the target labels as a plain NumPy array.
y.to_numpy()

Checking accuracy