In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input tree.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Switch the working directory to the input folder so that later relative
# paths resolve against it, then show the folder's contents.
input_dir = '../input/'
os.chdir(input_dir)
os.listdir()

In [None]:
# Load the raw bookings table; the path is relative to the current working directory.
bookings_csv = "hotel-booking-demand/hotel_bookings.csv"
df = pd.read_csv(bookings_csv)

We should first make train-validation-test split, but given this is an EDA practice we are not going to do that.

**Since there are only 32 variables and the feature documentation is very good, before we dive into any EDA we should first look at each feature together with its description and some sample data. This saves unnecessary effort later and lets us list potential problems and things we need to pay attention to.**

**In the next section, we will first list all features along with their dtype in the original dataset, then show a few rows from the df as a sample, and finally examine the features one by one.**

In [None]:
# (rows, columns) of the raw frame as loaded, before any cleaning step.
df.shape

In [None]:
# Number of rows that exactly repeat an earlier row.
# Use the vectorised Series.sum() instead of the builtin sum(), which would
# iterate over every element of the boolean Series in Python.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row. Rebinding the name
# (instead of inplace=True) keeps the cell a plain assignment that is safe to re-run.
df = df.drop_duplicates()

In [None]:
# Two-column overview: every feature name next to its pandas dtype.
feature_overview = {
    'feature': df.columns.tolist(),
    'Datatype': df.dtypes.tolist(),
}
pd.DataFrame(feature_overview)

In [None]:
# Show the first ten bookings as sample data for the feature walkthrough below.
preview_rows = 10
df.head(preview_rows)

* **'hotel':not much to pay attention to. This is a categorical variable indicating type of hotel(or just two particular hotels)**

* **'is_canceled': a binary variable indicating whether this booking is canceled or not. Probably the target we want to predict.**

* **'lead_time': the original description "Number of days that elapsed between the entering date and the arrival date" is not very accurate, since some canceled bookings have values other than 0. This suggests that lead_time is actually the interval between the booking entry date and the planned arrival date.**

* **'arrival_date_year','arrival_date_month', 'arrival_date_week_number','arrival_date_day_of_month': Arrival time. Again, for canceled bookings this means the planned arrival date. 'arrival_date_day_of_month' seems uniformly distributed across the month, so intuitively it may not be very useful. But what about when we condition on other factors? Probably worth finding out.**

* **'stays_in_weekend_nights','stays_in_week_nights': nights stayed. From a rough scan of the dataset we can find some entries that have 0 in both columns while is_canceled says the booking was not canceled. What does that mean? Is it just an error? Or does it mean the guest checked in and checked out on the same day, during the daytime?**

* **'adults', 'children', 'babies': number of guests. 'children' and 'babies' are highly concentrated at 0. Probably not very important.**

* **'meal': categorical. Undefined/SC,BB,HB,FB**

* **'country':categorical. Country of origin**

* **'market_segment', 'distribution_channel': the second one could be just a simplified feature of the first one. Need further check.** 
       
* **'is_repeated_guest', 'previous_cancellations','previous_bookings_not_canceled': customers can be divided into 1) those who have never booked before and 2) those who have booked before. Those who have booked before can be further divided into 3) those who booked before but canceled every time and 4) those who booked before and arrived at least once. So the latter two features already contain the information of the first.**

* **'reserved_room_type','assigned_room_type': two categorical. could combine these two features, but very likely to lose some important info** 

* **'booking_changes': if a booking is canceled, does that count as a 'change'? It seems not. Needs a further check.**

* **'deposit_type': categorical,No Deposit,Non Refund,Refundable. Could be an important feature.**

* **'agent': ID of the travel agency that made the booking. There are null values in this column. Is it due to missing value or just because guests book the room directly without a travel agency? Need further check. It should be transformed into categorical.**

* **'company': company id. probably not very useful since it's a very imbalanced feature even if we transform it into a binary feature.**

* **'days_in_waiting_list':**

* **'customer_type':**

* **'adr':**

* **'required_car_parking_spaces', 'total_of_special_requests':**

* **'reservation_status': basically is covered by 'is_canceled'**

* **'reservation_status_date': potentially useful to construct other features such as the time gap between date canceled and anticipated arrival date.**

**Next we do a quick data cleaning. Handle all missing values and do some datatype transformation.**

In [None]:
import missingno as msno
# Visual overview of where values are missing across all columns of df.
msno.matrix(df)

In [None]:
# Missing-value count per column.
null_mask = df.isna()
null_mask.sum(axis=0)

First we take care of 'company'

In [None]:
# Fraction of bookings with no company id (mean of the boolean missing-mask).
df['company'].isna().mean()

In [None]:
# Frequency of each market segment.
df['market_segment'].value_counts()

In [None]:
# Frequency of each distribution channel.
df['distribution_channel'].value_counts()

In [None]:
# Number of bookings that do carry a company id.
df['company'].notna().sum()

In [None]:
# Count bookings labelled 'Corporate' in both market_segment and
# distribution_channel that also have a company id.
# Missing company ids are stored as NaN, and NaN never equals the string
# 'NaN', so a `!= 'NaN'` comparison is always True and filters nothing;
# notna() is the correct missing-value test.
corporate_with_company = (
    (df.market_segment == 'Corporate')
    & (df.distribution_channel == 'Corporate')
    & df.company.notna()
)
int(corporate_with_company.sum())

We can see that there are 5259 non-NA values in the 'company' column, while around 4k-5k rows are labeled 'Corporate' in both the market_segment and distribution_channel columns. This very likely suggests that the 'company' information is already captured by the latter two columns. On the other hand, only 6% of rows have a valid value in the company column, which is not enough to identify individual companies, let alone to find a general pattern. So I believe it's safe to drop the company column.

Furthermore, to avoid having too many features once we encode all categorical variables, I believe we should keep only one of 'market_segment' and 'distribution_channel' in future study, since they give pretty much the same information.

In [None]:
# Start the cleaned frame from df without the sparsely populated 'company' column.
cleaned_df = df.drop(columns=['company'])

Then we move on to 'agent'. The missing values in the agent column could be because guests booked the hotel directly without any agency. Let's look at market_segment and distribution_channel for all rows where the agent value is missing.

In [None]:
# Channel information for the bookings that have no agent id.
channel_columns = ['market_segment', 'distribution_channel']
temp = df.loc[df.agent.isna(), channel_columns]

In [None]:
# Market segments among the agent-less bookings.
temp['market_segment'].value_counts()

In [None]:
# Distribution channels among the agent-less bookings.
temp['distribution_channel'].value_counts()

In [None]:
# Bookings per agent id (missing ids are not counted by value_counts).
df['agent'].value_counts()

What we can do here is fill in a value in the agent column wherever the corresponding distribution_channel is 'Direct' or 'Corporate'. But before doing that, we need to keep in mind that the agent column has to be transformed into a categorical feature in the end. From the cell above, we know that the agent column has 333 distinct values, which makes it too costly to transform directly into a categorical feature. Thus, we basically have two choices: 1) convert it into a binary feature indicating whether there is an agency or not; 2) discard it, since we don't really need to identify individual agencies and since 'market_segment' and 'distribution_channel' already tell us whether an agency is involved or not.

My choice here is to discard agent feature.

In the future study, I believe we should only keep one out of these three closely related features.

In [None]:
# Discard the high-cardinality 'agent' column.
# Rebinding instead of inplace=True, with errors='ignore', makes the cell
# idempotent: re-running it after 'agent' is already gone no longer raises KeyError.
cleaned_df = cleaned_df.drop(columns='agent', errors='ignore')

Next, 'country' and 'children' both have only a few missing values compared to the size of the dataset. We just delete those rows.

In [None]:
# Remove every row that still has at least one missing value.
# Rebinding (rather than inplace=True) keeps the step a plain assignment.
cleaned_df = cleaned_df.dropna(axis=0, how='any')

So far, we have taken care of missing values.

In [None]:
# (rows, columns) remaining after the missing-value handling above.
cleaned_df.shape

Next, we want to further lightweight our dataset, put some features aside and check some potential problems mentioned in the beginning.
Here is the list of things we want to do next:
* arrival_date_year: check if we should keep it or not(done)
* arrival_date_month,arrival_date_week_number,arrival_date_day_of_month: check if we should just keep week_number(done)
* stays_in_weekend_nights, stays_in_week_nights: examine the cases have 0 in both columns(done)
* country: check if we should keep it or not, at least we won't use this feature in the prediction of cancellation(leave it for EDA)
* is_repeated_guest: we want to discard this feature since previous_bookings_not_canceled already covers the same information, but we should double-check that first.(done)
* reserved_room_type, assigned_room_type: we could try to combine these two together and construct features like 'is_different', but for now just keep them for EDA(leave it for EDA)
* reservation_status: we want to discard this feature. Even if future data contains values other than 'Check-Out' and 'Canceled', it won't help much in predicting cancellation.(done)
* reservation_status_date: we can construct new features such as 'time interval between booking entry date and cancellation date' or 'time interval between cancellation date and planned arrival date', which could be very helpful, but for now we don't need it.(done)

We start with 'arrival_date_year'. The reason I want to discard this feature is that we only have data from 2015 to 2017. We can't, and don't want to, capture how the year affects cancellation with only 3 years of data. Even if we did, we would need data from more years and would need to encode the year in another form. We could still make some plots against year in EDA, but that won't help much.

In [None]:
# Bookings per arrival year.
cleaned_df['arrival_date_year'].value_counts()

From the plots below, it seems arrival_date_month and arrival_date_week_number show the same trend, but the latter contains more detailed information. We can consider keeping just arrival_date_week_number.

In [None]:
# Compare the arrival-date distributions of the two hotel types side by side.
city = cleaned_df[cleaned_df.hotel == 'City Hotel']
resort = cleaned_df[cleaned_df.hotel == 'Resort Hotel']

# Month names are strings; without an explicit order matplotlib lays the
# categories out in order of first appearance instead of calendar order.
MONTH_ORDER = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
# Codes 0-11 follow MONTH_ORDER; unrecognised names become -1 and fall outside the bins.
month_positions = [
    pd.Categorical(part['arrival_date_month'], categories=MONTH_ORDER).codes
    for part in (city, resort)
]

f, axes = plt.subplots(1, 3, figsize=(30, 7))
hist_style = dict(color=['r', 'b'], alpha=0.5, label=['City Hotel', 'Resort Hotel'])

# Bin edges sit on half-integers so every bin holds exactly one integer value;
# the default 10 bins would lump unequal numbers of weeks/days together.
axes[0].hist(month_positions, bins=np.arange(13) - 0.5, **hist_style)
axes[0].set_xticks(range(12))
axes[0].set_xticklabels(MONTH_ORDER, rotation=45, horizontalalignment='right')
axes[0].set_title('arrival_date_month')

axes[1].hist([city['arrival_date_week_number'], resort['arrival_date_week_number']],
             bins=np.arange(1, 55) - 0.5, **hist_style)
axes[1].set_title('arrival_date_week_number')

axes[2].hist([city['arrival_date_day_of_month'], resort['arrival_date_day_of_month']],
             bins=np.arange(1, 33) - 0.5, **hist_style)
axes[2].set_title('arrival_date_day_of_month')

for ax in axes:
    ax.set_ylabel('Number of bookings')
    ax.legend()

In [None]:
# Bookings per day of month, split by cancellation outcome.
day_column = "arrival_date_day_of_month"
sns.countplot(data=cleaned_df, x=day_column, hue="is_canceled")

There seems to be no clear pattern between cancellation and arrival_date_day_of_month, except for a drop in counts on the 31st, which is due to the fact that not every month has a 31st day. Together with the histogram, this suggests we can probably exclude arrival_date_day_of_month.

For stays_in_weekend_nights and stays_in_week_nights, we mentioned in the beginning that we can observe some cases where both columns equal to 0 while the booking is not canceled. 

In [None]:
# Cancellation outcome for bookings that record zero nights in both stay columns.
zero_night_mask = (cleaned_df.stays_in_week_nights == 0) & (cleaned_df.stays_in_weekend_nights == 0)
cleaned_df.loc[zero_night_mask, 'is_canceled'].hist(bins=[0, 0.5, 1])

In [None]:
# Share of all bookings that record zero nights in both stay columns.
no_nights = cleaned_df.stays_in_week_nights.eq(0) & cleaned_df.stays_in_weekend_nights.eq(0)
int(no_nights.sum()) / len(cleaned_df)

We can see there are around 600 such cases in total, which account for only 0.7% of the whole dataset. We still can't say for sure where those cases come from, but they won't do much harm to our study; we can just leave them there and consider them errors.

I believe is_repeated_guest is just where previous_bookings_not_canceled >= 1. Let's check that.

In [None]:
# How often does the repeated-guest flag agree with "has at least one
# previous booking that was not canceled"?
flagged_repeat = cleaned_df['is_repeated_guest'].eq(1)
has_kept_booking = cleaned_df['previous_bookings_not_canceled'].ge(1)
(flagged_repeat == has_kept_booking).value_counts()

Most cases are like what we believed. The rest is very likely to be data errors.

In [None]:
# Frequency of each reservation status.
cleaned_df['reservation_status'].value_counts()

Given the information in the above section. We will drop features below:
* arrival_date_year
* arrival_date_month
* arrival_date_day_of_month
* country
* is_repeated_guest
* reservation_status
* reservation_status_date

In [None]:
# Features set aside for now; everything else goes into the EDA frame.
columns_to_drop = [
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'country',
    'is_repeated_guest',
    'reservation_status',
    'reservation_status_date',
]
eda_df = cleaned_df.drop(columns=columns_to_drop)

In [None]:
# Column names, non-null counts and dtypes of the reduced EDA frame.
eda_df.info()

We have 23 columns left. They are all in the correct dtype except 'is_canceled', which has already been encoded as a binary feature.

In [None]:
# Summary statistics of the EDA frame, used below to spot suspicious value ranges.
eda_df.describe()

From the above information, a few features need further investigation:
* children
* babies
* previous_cancellations
* previous_bookings_not_canceled
* required_car_parking_spaces

In [None]:
# Histograms of the count-like features flagged for a closer look.
features_to_inspect = [
    'children',
    'babies',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
]
eda_df[features_to_inspect].hist()

They are highly skewed and can probably be converted into binary features.

Correlation matrix of numerical features from the whole data.(Check if any linear relationship)

In [None]:
# Pairwise correlation between the numeric features of the whole dataset.
# eda_df still holds object-dtype columns (e.g. 'hotel'); DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict to numeric columns explicitly
# (select_dtypes works on older pandas versions as well).
corr = eda_df.select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_title('Feature correlation - whole data')
# Tilt the x tick labels so the long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

Correlation between is_canceled and the rest from whole data

In [None]:
# Correlation of every numeric feature with is_canceled, strongest positive first.
cancel_corr = corr.loc[:, 'is_canceled']
cancel_corr.sort_values(ascending=False)

Do the above again just for City Hotels

In [None]:
# Pairwise correlation between the numeric features, City Hotel rows only.
# Object-dtype columns are excluded explicitly because DataFrame.corr()
# raises on them in pandas >= 2.0.
corr1 = eda_df[eda_df.hotel == 'City Hotel'].select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr1,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_title('Feature correlation - City Hotel')
# Tilt the x tick labels so the long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Correlation with is_canceled for City Hotel rows, strongest positive first.
city_cancel_corr = corr1.loc[:, 'is_canceled']
city_cancel_corr.sort_values(ascending=False)

Again just for Resort Hotels

In [None]:
# Pairwise correlation between the numeric features, Resort Hotel rows only.
# Object-dtype columns are excluded explicitly because DataFrame.corr()
# raises on them in pandas >= 2.0.
corr2 = eda_df[eda_df.hotel == 'Resort Hotel'].select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr2,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_title('Feature correlation - Resort Hotel')
# Tilt the x tick labels so the long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Correlation with is_canceled for Resort Hotel rows, strongest positive first.
resort_cancel_corr = corr2.loc[:, 'is_canceled']
resort_cancel_corr.sort_values(ascending=False)

In [None]:
# Put the three is_canceled correlation columns next to each other for comparison.
cancel_corr_by_scope = {
    'Whole data': corr['is_canceled'],
    'CityHotel': corr1['is_canceled'],
    'ResortHotel': corr2['is_canceled'],
}
corr3 = pd.DataFrame(cancel_corr_by_scope)

In [None]:
# Heatmap of the is_canceled correlations: whole data vs. each hotel type.
diverging_cmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr3, cmap=diverging_cmap, vmin=-1, vmax=1, center=0, square=True)
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right');

There is no clear linear relation among features or between 'is_canceled' and the rest.

Next, the categorical features. We first select all columns with dtype 'object', then convert 'is_canceled' into a categorical label. With these columns we form contingency tables between 'is_canceled' and each of the remaining categorical features, then conduct the chi-square test of independence.

In [None]:
# Boolean flag per column: True where the column has object dtype.
cate = [eda_df[column].dtype == 'object' for column in eda_df.columns]
# Keep only the flagged (object-dtype) columns.
cat_features = eda_df[eda_df.columns[cate]]

In [None]:
# Display the object-dtype feature columns selected above.
cat_features

In [None]:
# Turn the 0/1 target into readable labels and put it in front of the categorical features.
label_names = {1: 'canceled', 0: 'not canceled'}
cat_label = eda_df['is_canceled'].map(label_names)
cat_df = pd.concat([cat_label, cat_features], axis=1)

In [None]:
# Display the labelled target together with the categorical features.
cat_df

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` has been loaded on every SciPy version.
import scipy.stats

# Chi-square test of independence between the cancellation label and each
# categorical feature; pvalue maps feature name -> p-value of the test.
pvalue = {}
for j in cat_features.columns:
    # Contingency table: cancellation label (rows) x feature categories (columns).
    tab = pd.crosstab(cat_label, cat_features[j], margins=False)
    # chi2_contingency returns (statistic, p-value, dof, expected); keep the p-value.
    pvalue[j] = scipy.stats.chi2_contingency(tab)[1]

In [None]:
# Display the p-value of the chi-square test for each categorical feature.
pvalue

All p-values are extremely small, which indicates that these categorical features are not independent of is_canceled.