In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import pandas_profiling 
import seaborn as sns
# Use seaborn's "darkgrid" style for the plots drawn in this notebook.
sns.set(style="darkgrid")

In [None]:
# Show every column when a DataFrame is displayed, then load the bookings CSV.
pd.options.display.max_columns = None
bookings_csv = '../input/hotel-booking-demand/hotel_bookings.csv'
data = pd.read_csv(bookings_csv)

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Keep a separate copy of the data as loaded, before the modifications made below.
dt = data.copy() # save original data as 'dt'

In [None]:
# Build a pandas_profiling report over the whole DataFrame.
pandas_profiling.ProfileReport(data)

# feature engineering

We have about 30k duplicated rows, which we want to drop first.

### 1. Adr

ADR is the Average Daily Rate, defined as the sum of all lodging transactions divided by the total number of staying nights. It has 1.6% zero values, which we assume correspond to bookings with 0 in total lodging transactions; I replaced them with random values drawn from the range mean ± one standard deviation. I also noticed two odd values (-6.38 and 5400), which I wanted to eliminate and replace in the same way (note that the code below only resets -6.38). Note that the mean is calculated after the extreme values have been reset.

In [None]:
# Reset the negative ADR outlier (-6.38) to zero so that it is picked up by
# the zero-replacement step further down.
# The assignment goes through data.loc in a single step: the chained form
# data['adr'].loc[...] = 0 writes into an intermediate Series, which triggers
# SettingWithCopyWarning and does not update `data` under copy-on-write.
# NOTE(review): the 5400 outlier mentioned in the text is not touched here.
data.loc[data['adr'] == -6.38, 'adr'] = 0

In [None]:
# Report the mean and the standard deviation of the ADR column.
adr_column = data['adr']
print(adr_column.mean())
print(adr_column.std())

In [None]:
# Count the rows whose ADR is exactly zero.
int((data['adr'] == 0).sum())

In [None]:
# Draw one random integer ADR per zero-valued row from the range
# mean +/- one standard deviation (upper bound exclusive).
# The bounds and the sample size are computed from `data` rather than pasted
# in from an earlier run, so they stay consistent if the data changes.
adr_mean = data['adr'].mean()
adr_std = data['adr'].std()
n_zero_adr = int((data['adr'] == 0).sum())
random_adr_list = np.random.randint(int(adr_mean - adr_std),  # integer bounds, made explicit
                                    int(adr_mean + adr_std),
                                    size=n_zero_adr)

In [None]:
# Replace every zero ADR with the pre-generated random values.
# Assigning through data.loc in one step avoids the chained form
# data['adr'].loc[...] = ..., which writes into an intermediate Series and
# does not update `data` under copy-on-write.
data.loc[data['adr'] == 0, 'adr'] = random_adr_list

### 2 Agent
'agent' is the ID of the travel agency that made the booking. This variable has 13.7% missing values.

In [None]:
# Correlation between the agent ID and the cancellation flag.
agent_ids = data['agent']
agent_ids.corr(data['is_canceled'])

In [None]:
# Fill missing agent IDs with the mean of the column.
agent_mean = data['agent'].mean()
data['agent'] = data['agent'].fillna(agent_mean)

In [None]:
# Number of agent values that are still missing.
data['agent'].isna().sum()

### babies and children
Both of these variables are highly skewed (more than 90% of the values are equal to 0); let's take a look. To balance the two skewed features, I combine them into a single boolean variable: 1 if the booking includes children (or babies), otherwise 0.

In [None]:
# Histogram of the 'babies' column.
babies_counts = data['babies']
hist_b = babies_counts.hist()

In [None]:
# Histogram of the 'children' column (reuses the name hist_b).
children_counts = data['children']
hist_b = children_counts.hist()

In [None]:
# Flag bookings with at least one child or baby (1) versus none (0).
# Missing counts are treated as 0: previously a NaN in either column made the
# sum NaN, and because NaN == 0 is False the booking was flagged as having kids.
data_kids = data['babies'].fillna(0) + data['children'].fillna(0)
data['having_kids'] = (data_kids != 0).astype('int64')

In [None]:
# Histogram of the new 'having_kids' flag.
kids_flag = data['having_kids']
kids_flag.hist(bins=4)

In [None]:
# The two source columns are replaced by 'having_kids'; remove them.
data = data.drop(columns=['babies', 'children'])

### booking_changed
Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation. It has 84.9% zeros. Just like the previous variables, I convert 'booking_changes' into a boolean, where 1 means the booking was changed and 0 means it was not.

In [None]:
# 1 when the booking was changed at least once, 0 when it was never changed.
data['booking_changes_boo'] = (data['booking_changes'] != 0).astype('int64')

In [None]:
# Histogram of the 'booking_changes_boo' flag.
changed_flag = data['booking_changes_boo']
changed_flag.hist(bins=3, figsize=(10, 5))

In [None]:
# The raw count is replaced by 'booking_changes_boo'; remove it.
data = data.drop(columns=['booking_changes'])

### companies
The ID of the company/entity that made the booking or is responsible for paying for it. An ID is presented instead of a designation for anonymity reasons. 94.3% of the values are missing; in addition, the company ID is an arbitrary number, so including it cannot explain the results, and I decided to remove it.

In [None]:
# Remove the 'company' column.
data = data.drop(columns=['company'])

### days_in_waiting_list

Number of days the booking was in the waiting list before it was confirmed to the customer.

In [None]:
# Correlation between days on the waiting list and the cancellation flag.
waiting_days = data['days_in_waiting_list']
waiting_days.corr(data['is_canceled'])

In [None]:
# Same correlation, restricted to bookings that spent at least one day waiting.
waiting_days = data['days_in_waiting_list']
waiting_days[waiting_days > 0].corr(data['is_canceled'])

Excluding the zeros, the correlation is negative!

In [None]:
# Scatter plot of waiting days against the cancellation flag.
data.plot(kind='scatter', x='days_in_waiting_list', y='is_canceled')

Because almost 97% of the records show no waiting time (0 in 'days_in_waiting_list'), this feature cannot tell us much about whether waiting too long leads to cancellation. Among the non-zero values the correlation with 'is_canceled' is negative (about -0.15); note that a negative sign means longer waits go together with slightly fewer cancellations, not more. Overall, cancellation is not strongly related to whether the waiting time is long or short: as the chart above shows, people cancel or not regardless of how long they were on the waiting list.

### previous_cancellations
We have two features that indicate whether the customer's previous bookings were canceled or not. Both contain many zeros; let's take a look.

In [None]:
# Print, for both booking-history columns, the percentage of rows equal to 0.
zero_share_reports = (
    ('the number of 0 in previous_bookings_not_canceled is', 'previous_bookings_not_canceled'),
    ('the number of 0 in previous_bookings_canceled is', 'previous_cancellations'),
)
for message, column in zero_share_reports:
    zero_rows = (data[column] == 0).sum()
    print(message, (zero_rows / len(data)) * 100, '%')

In [None]:
# Relate the two booking-history columns to the cancellation flag;
# the point colour encodes the number of previous cancellations.
sns.relplot(
    data=data,
    x="previous_bookings_not_canceled",
    y="is_canceled",
    hue="previous_cancellations",
    palette="ch:r=-.5,l=.75",
);

# check the correlation of each variables

Assuming that all the variables can be linearly related to the dependent variable which is "is_canceled", we run correlation and find the top related features.

In [None]:
# Rank the numeric features by their linear correlation with 'is_canceled'.
dt_num = data.select_dtypes(include = ['float64', 'int64'])
# Drop the target's own entry by label. The earlier positional slice [:-1]
# removed whichever numeric column happened to be last and left
# 'is_canceled' itself (self-correlation 1.0) in the ranking.
dt_num_corr = dt_num.corr()['is_canceled'].drop('is_canceled')
# Keep features whose absolute correlation exceeds 0.1 (none is larger than 0.5).
top_features_list = dt_num_corr[dt_num_corr.abs() > 0.1].sort_values(ascending=False)
print("There is {} strongly correlated values with is_cancled:\n{}".format(len(top_features_list), top_features_list))

In [None]:
# One histogram per numeric column.
hist_size = (16, 20)
dt_num.hist(figsize=hist_size)

Clearly, none of the features is strongly linearly related to the target, so I think we can find insights in other ways.

In [None]:
# Turn on matplotlib's automatic figure layout for the plots that follow.
from matplotlib import rcParams
rcParams['figure.autolayout'] = True

#### lead_time
Lead time is the most strongly linearly related feature, and it is continuous, so I cut it into 5 bins according to percentiles of its distribution.

In [None]:
# Bin lead_time into five labelled ranges.
# include_lowest=True keeps lead_time == 0 in the '0-18' bin; with the default
# left-open first interval (0, 18] those rows became NaN.
# The last edge is open-ended (np.inf) so '321+' also covers values above 737.
data['cat_lead_time'] = pd.cut(data.lead_time,
                               bins=[0, 18, 69, 160, 320, np.inf],
                               labels=['0-18', '19-69', '70-160', '161-320', '321+'],
                               include_lowest=True)

In [None]:
# Bar chart of the mean cancellation flag per lead-time bin.
cancel_rate_by_lead_time = data.groupby(['cat_lead_time'])['is_canceled'].mean()
cancel_rate_by_lead_time.plot(kind='bar')

It is clear that longer lead times lead to more cancellations.

#### previous cancellations

In [None]:
# Bar chart of the mean cancellation flag per number of previous cancellations.
cancel_rate_by_history = data.groupby(['previous_cancellations'])['is_canceled'].mean()
cancel_rate_by_history.plot(kind='bar')

Generally, the more previous cancellations a customer has, the more likely they are to cancel the current trip as well. This makes sense. However, customers with exactly one previous cancellation still show a rather high cancellation rate; I assume this may be because it is the only record we have for such customers, so we have no other history for them.

####  required_car_parking_spaces 

In [None]:
# Bar chart of the mean cancellation flag per number of requested parking spaces.
cancel_rate_by_parking = data.groupby(['required_car_parking_spaces'])['is_canceled'].mean()
cancel_rate_by_parking.plot(kind='bar')

No clear sign of correlation given that most records shows 0 values in this feature.

#### total_of_special_requests

In [None]:
# Bar chart of the mean cancellation flag per number of special requests.
cancel_rate_by_requests = data.groupby(['total_of_special_requests'])['is_canceled'].mean()
cancel_rate_by_requests.plot(kind='bar')

The more requests are sent, the more likely the person is to drop the reservation. Maybe this is because more requests also make it more difficult for the hotel to meet those requirements.

# conclusion

According to the correlation-based analysis, people tend to cancel their hotel reservation when:
1. They book the hotel too early before the actual check-in.
2. They have many cancellation records in the past.
3. They have too many requests for the hotel, which may be hard for the hotel to accommodate.

However, these conclusions are based only on linear correlation; a possible improvement is to examine non-linear relationships between the features and the target. Adding more features and changing the current features might also help.