In [None]:
# importing data libraries
import numpy as np
import pandas as pd

# statistics libraries
from scipy import stats

# importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
data = pd.read_csv('../input/hotel-booking-demand/hotel_bookings.csv')

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Missing-value summary: absolute count ('Total') and fraction ('nan') of
# nulls per column, keeping only the columns that contain at least one null.

null_mask = data.isnull()

total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

nan_summary = pd.concat([total, percent], axis=1, keys=['Total', 'nan'])
nan_cols = nan_summary[nan_summary['nan'] > 0]

nan_cols

In [None]:
plt.figure(figsize=(10, 10))

plt.pie(nan_cols.reset_index()['Total'])

plt.title("NaN Values")
plt.legend(labels=nan_cols.reset_index()['index'])
plt.show()

<h1 align='center'>Handling Null Data</h1>

In [None]:
# We will drop columns 'company' and 'agent' because, there are too many null values for that columns

data.drop(['company', 'agent'], axis=1, inplace=True)

In [None]:
# Reviewing rows with null country values 
data[data['country'].isnull()]

In [None]:
# Reviewing rows with null children values 
data[data['children'].isnull()]

In [None]:
# There are 488 rows which have null country values and 4 rows which have null children values.
# These rows are not essential for our purpose, therefore we drop them.
# dropna(subset=...) removes exactly the rows where either column is null.

data.dropna(subset=['country', 'children'], inplace=True)

In [None]:
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum() / data.isnull().count()).sort_values(ascending=False)

nan_cols = pd.concat([total, percent], axis=1, keys=['Total', 'nan'])

nan_cols = nan_cols[nan_cols['nan'] > 0]

nan_cols

In [None]:
def plot_canceling_prob(col_name: str, data: pd.DataFrame):
    """
    Plot how the canceled bookings are distributed over a column's categories.

    Only rows with ``is_canceled == 1`` are used. For those rows the share of
    each ``col_name`` category is computed (the shares sum to 1) and drawn as
    a bar chart, largest share first. Note that this is the distribution of
    cancellations across categories, not the cancellation probability within
    each category.

    Parameters
    ----------
    col_name : str
        Column whose categories the canceled bookings are broken down by.
    data : pd.DataFrame
        Bookings; must contain the columns ``is_canceled`` and ``col_name``.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """

    # Share of each category among canceled bookings, sorted in descending order.
    canceled_share = data.loc[data['is_canceled'] == 1, col_name].value_counts(sort=True, normalize=True)

    if canceled_share.empty:
        # Nothing to draw (no canceled rows, or the column is entirely null).
        print(f'No canceled bookings to plot for {col_name}')
        return

    # Categories are converted to strings so that numeric categories are not
    # re-sorted by seaborn and the bars stay in descending-share order.
    categories = list(canceled_share.index.astype(str))
    shares = canceled_share.values

    plt.figure(figsize=(16, 8))

    g = sns.barplot(x=categories, y=shares, order=categories)
    g.set(title=f'Canceled Booking Distribution on {col_name}')

    # Write each share on top of its own bar; a legend built from the raw
    # values could be paired with the wrong bars.
    for position, share in enumerate(shares):
        g.text(position, share, f'{share:.3f}', ha='center', va='bottom')

    plt.show()

In [None]:
def count_cat_prob_plot(col_name: str, data: pd.DataFrame):
    """
    Draw the three standard charts used for a categorical column.

    1. A count plot of ``col_name``.
    2. A bar plot of ``is_canceled`` per category (seaborn's default
       estimator, the mean, i.e. the share of canceled bookings).
    3. The distribution of canceled bookings over the categories, drawn by
       ``plot_canceling_prob``.

    Parameters
    ----------
    col_name : str
        Column to analyse.
    data : pd.DataFrame
        Bookings; must contain the columns ``is_canceled`` and ``col_name``.
    """

    sns.countplot(x=col_name, data=data)
    plt.title(f"Count Plot for {col_name}")
    plt.show()

    # catplot opens its own figure, which becomes the current figure, so
    # plt.title applies to it.
    sns.catplot(x=col_name, y='is_canceled', data=data, kind='bar', aspect=3)
    plt.title(f"Canceling Probabilities for each {col_name}")
    plt.show()

    plot_canceling_prob(col_name, data)

In [None]:
# Looking at the overall data, after handling with null data 

data.shape 

<h1 align='center'>Data Analysis and Visualizations</h1>

<a id='main'></a>
### Data Exploration for Canceling a Booking:

- [1 - Hotel](#hotel)

- [2 - Lead time](#lead_time)

- 

- 

- 

- 
 
- 

- 

- 

- 

### Analysis out of Canceling:

- Family size vs Country

- Hotel occupancy rate on weekdays vs weekends

- 

- 
 
- 

- 

- 

- 

### All columns list:

- [Hotel](#hotel) [lead_time](#lead_time), [arrival_date_year](#arrival_date_year), [arrival_date_month](#arrival_date_month), [stays_in_weekend_nights](#stays_in_weekend_nights), [stays_in_week_nights](#stays_in_week_nights), [adults](#adults), [children](#children), [babies](#babies), [meal](#meal), [country](#country), [market_segment](#market_segment), [distribution_channel](#distribution_channel), [is_repeated_guest](#is_repeated_guest), [previous_cancellations](#previous_cancellations), [previous_bookings_not_canceled](#previous_bookings_not_canceled), [reserved_room_type](#reserved_room_type), [assigned_room_type](#assigned_room_type), [booking_changes](#booking_changes), [deposit_type](#deposit_type), [days_in_waiting_list](#days_in_waiting_list), [customer_type](#customer_type)
, [adr](#adr), [required_car_parking_spaces](#required_car_parking_spaces), [total_of_special_requests](#total_of_special_requests)


In [None]:
data.columns

In [None]:
# seaborn initial settings

sns.set(context='notebook', palette='Set1', style='whitegrid', rc={'figure.figsize':(16, 8)})

In [None]:
columns_to_remove = list()
columns_to_dummy = list()

In [None]:
# keep analysis for each feature

analysis = {}

for col in data.columns:
    analysis[col] = []

### 0 - Correlation HeatMap: - [main](#main)

In [None]:
# Correlate only the numeric columns: calling corr() on a frame that still
# holds text columns raises in recent pandas versions.
data_corr = data.select_dtypes(include='number').corr()

column = 'is_canceled'
corr_cols = data.shape[1]

# Order the columns by their correlation with the target, highest first, and
# rearrange the matrix accordingly.
ranked = data_corr.nlargest(corr_cols, column)
cols = ranked[column].index
coef = ranked[cols].values

plt.figure(figsize=(16, 16))

g = sns.heatmap(coef, cbar=True, annot=True, square=True, fmt='.2f', 
                yticklabels=cols.values, xticklabels=cols.values)

Most correlated columns with cancellation:
    
- lead_time
    
- previous_cancellations

- adults

- days_in_waiting_list

- adr

- stays_in_week_nights

- arrival_date_year
    

<a id='hotel'></a>

### 1 - hotel: - [main](#main)
    
    - hotel name info.

In [None]:
data['hotel'].unique()

In [None]:
plt.pie(data['hotel'].value_counts().values, labels=data['hotel'].value_counts().keys())

plt.title("Hotels")
plt.show()

In [None]:
g = sns.countplot(x='hotel', hue='is_canceled', data=data)

g.set_title("Hotels")

plt.show(g)

* The City Hotel has more bookings than the Resort Hotel. The cancellation rate of the City Hotel is also higher than that of the Resort Hotel.

In [None]:
analysis['hotel'].append('City hotel has more bookings and higher cancellation rates.')

In [None]:
g = sns.catplot(x='hotel', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True) # removes axis line. Here removes y axis line.

g.set(xlabel='Hotel Name', ylabel='Canceling Probability', title="Hotel's Canceling Probabilities")

plt.show(g)

- Customers who booked the City Hotel are more likely to cancel their bookings.

In [None]:
analysis['hotel'].append('Customers who booked to City hotel more likely to cancel their bookings.')

In [None]:
plot_canceling_prob('hotel', data)

- In total, City hotel has more canceled bookings. This may be due to City hotel's higher number of bookings compared to Resort Hotel.

In [None]:
analysis['hotel'].append("In total, City hotel has more canceled bookings. This because City hotel's higher number of bookings compared to Resort Hotel.")

In [None]:
#Displaying final indidual analysis on hotel

analysis['hotel']

In [None]:
# Total revenue and revenue lost to canceled bookings for each hotel.
# The value of a booking is estimated as adr * number of nights booked.


resort = data[data['hotel'] == 'Resort Hotel'].copy()
city = data[data['hotel'] == 'City Hotel'].copy()

for hotel_bookings in (resort, city):
    hotel_bookings['total_stays'] = (hotel_bookings['stays_in_week_nights']
                                     + hotel_bookings['stays_in_weekend_nights'])
    hotel_bookings['customer_total_payment'] = hotel_bookings['adr'] * hotel_bookings['total_stays']

resort_lost_revenue = resort.loc[resort['is_canceled'] == 1, 'customer_total_payment'].sum()
city_lost_revenue = city.loc[city['is_canceled'] == 1, 'customer_total_payment'].sum()

resort_total_revenue = resort['customer_total_payment'].sum()
city_total_revenue = city['customer_total_payment'].sum()

hotel_labels = ['Resort', 'City']
total_revenues = [resort_total_revenue, city_total_revenue]
lost_revenues = [resort_lost_revenue, city_lost_revenue]

# Overlay the lost revenue (darker bars) on the total revenue (lighter bars).
sns.set_color_codes("pastel")
g = sns.barplot(x=hotel_labels, y=total_revenues, color='b')
sns.set_color_codes("muted")
g = sns.barplot(x=hotel_labels, y=lost_revenues, color='b')

# Write the amounts on top of each hotel's bar.
for position, (total, lost) in enumerate(zip(total_revenues, lost_revenues)):
    g.text(position, total, f'Total: {round(total)}\nLost: {round(lost)}', ha='center', va='bottom')

# The two legend entries are the two bar series (total and lost), not the
# two hotels, so they are labelled by series.
plt.legend(['Total revenue', 'Lost revenue (canceled bookings)'])
plt.title('Lost money due to Canceling Bookings')
plt.show()


There is a huge loss of revenue for both hotels due to canceled bookings.

In [None]:
# Number of bookings per arrival month for each hotel, in calendar order.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
               'September', 'October', 'November', 'December']

g = sns.countplot(x='arrival_date_month', data=data, hue='hotel', order=month_order)

# The legend is generated from the `hotel` values themselves; overriding it
# with hard-coded labels only stays correct while the hue order happens to match.
plt.title("Occupancy Rate")
plt.show()

In [None]:
data['total_stays'] = data['stays_in_week_nights'] + data['stays_in_weekend_nights']
data['customer_total_payment'] = data['adr'] * data['total_stays']


months = ['', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September',
              'October', 'November', 'December']

# Plot against the month number (1-12) so the line runs in calendar order;
# month names on the x axis would not be ordered chronologically.
month_number = data['arrival_date_month'].map(
    {name: number for number, name in enumerate(months) if name})

g = sns.lineplot(x=month_number, y=data['adr'], hue=data['hotel'])

g.set_xticks(range(1, 13))
g.set_xticklabels(months[1:], rotation=45)
g.set_xlabel('arrival_date_month')

# The legend is generated from the `hotel` values, so it always matches the lines.
plt.title("Monthly ADR")
plt.show()

In [None]:
columns_to_dummy.append('hotel')

<a id='lead_time'></a>

### 2 - lead_time: - [main](#main)

    - Number of days that elapsed between the entering date of the booking into the PMS and the arrival date

In [None]:
data['lead_time'].describe()

In [None]:
g = sns.distplot(a=data['lead_time'], label='lead_time_distribution')

plt.xlim([0, 750])
plt.show(g)

- The lead time distribution is positively (right) skewed.

- Most bookings are made for an arrival date in the near future.

In [None]:
analysis['lead_time'].append('We see that there is a positive skewness in the lead time.')

In [None]:
# lead_time vs canceled

g = sns.FacetGrid(data, col='is_canceled', height=6)
g = g.map(sns.distplot, 'lead_time')

plt.xlim([0, 750])
plt.show(g)

- Bookings that were not canceled mostly have shorter lead times than canceled bookings.

In [None]:
# lead_time dist vs is_canceled

g = sns.kdeplot(data['lead_time'][data['is_canceled'] == 0],
               color='Red', shade=True)

g = sns.kdeplot(data['lead_time'][data['is_canceled'] == 1],
               color='Blue', shade=True)

g.set_xlabel('lead_time')
g.set_ylabel('Freq')

g = g.legend(['Not Canceled', 'Canceled'])

plt.plot([50, 50], [0.00, 0.0045], ':')
plt.xlim([0, 750])
plt.show(g)

- Long-term bookings are more likely to be canceled.

- A higher lead time is associated with a higher canceling probability.

In [None]:
analysis['lead_time'].append('High lead time causes high canceling probability.')

In [None]:
data['lead_time'].min(), data['lead_time'].mean(), data['lead_time'].max()

In [None]:
# we can use binning method to convert lead time in day to months 

data['lead_time_30'] = data['lead_time'] // 30
data['lead_time_60'] = data['lead_time'] // 60
data['lead_time_120'] = data['lead_time'] // 120
data['lead_time_360'] = data['lead_time'] // 360

In [None]:

# respect to 30 days binning.
g = sns.countplot(x='lead_time_30', hue='is_canceled', data=data)

plt.title('lead time 30 day binned')
plt.show(g)

- Bookings made for a date 7 months ahead are more likely to be canceled.

In [None]:
# prices according to lead_time

# we will use monthly lead_time

g = sns.lineplot(x='lead_time_30', y='adr', data=data, hue='hotel', markers=True, dashes=False)

plt.title("Lead time vs ADR")
plt.legend(['resort', 'city'])
plt.show(g) 


- The average daily rate (ADR) tends to decrease as lead time increases.

In [None]:
analysis['lead_time'].append('Bookings are maded for 7 months later more likely to be canceled.')

In [None]:
countries_lead_time = data.groupby('country')['lead_time'].sum().reset_index(name = 'Total Lead Time')

In [None]:
# Total lead time by country (the "Total Lead Time" column is a sum over bookings, not an average)

import plotly.express as px

px.choropleth(countries_lead_time,
                    locations = "country",
                    color= "Total Lead Time", 
                    hover_name= "Total Lead Time",
                    color_continuous_scale=px.colors.sequential.Oranges,
                    title="Lead Time by Countries")


In [None]:
columns_to_remove.extend(['lead_time_60', 'lead_time_30', 'lead_time_120', 'lead_time_360'])

In [None]:
#Showcasing final analysis on lead_time 

analysis['lead_time']

<a id='arrival_date_year'></a>

### 3 - arrival_date_year: - [main](#main)

In [None]:
data['arrival_date_year'].describe()

In [None]:
data['arrival_date_year'].unique()

In [None]:
data['arrival_date_year'].value_counts()

In [None]:
g1 = sns.countplot(x='arrival_date_year', hue='hotel', data=data)

plt.title('Yearly Occupation Rate')
plt.show(g1)

In [None]:
g2 = sns.countplot(x='arrival_date_year', hue='is_canceled', data=data)

plt.title('Yearly Canceling Counts')
plt.show(g2)

- 2016 has the highest number of bookings, followed by 2017 and 2015.

In [None]:
analysis['arrival_date_year'].append('The highest number of booking belongs to 2016 then 2017 and 2015.')

In [None]:
g = sns.catplot(x='arrival_date_year', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Canceling Probabilities - Arrival Years')

- 2015 - 2016 - 2017 have similar canceling probabilities.

In [None]:
analysis['arrival_date_year'].append('2015 - 2016 - 2017 have similar canceling probabilities.')

In [None]:
g = sns.catplot(x='arrival_date_year', y='is_canceled', hue='hotel', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Canceling Probabilities - Arrival Years')

plt.show(g)

- In each year canceling probability is higher for City Hotel.

In [None]:
g = sns.boxplot(x='arrival_date_year', y='adr', data=data, hue='hotel')

plt.title("Yearly ADR")
plt.show(g) 

- The Resort Hotel appears to have lowered its room rates in 2016, while the City Hotel increased its room rates every year.

In [None]:
## Revenue Total and lost by years
# Revenue of a booking is estimated as adr * number of nights booked.
# Summing adr alone would add up daily rates, not revenue.

bookings_with_revenue = data.assign(
    booking_revenue=data['adr'] * (data['stays_in_week_nights'] + data['stays_in_weekend_nights']))


def yearly_revenue(hotel: str, is_canceled: int) -> pd.DataFrame:
    """Sum the estimated booking revenue per arrival year for one hotel and cancellation status."""
    selected = bookings_with_revenue[(bookings_with_revenue['hotel'] == hotel)
                                     & (bookings_with_revenue['is_canceled'] == is_canceled)]
    return (selected.groupby('arrival_date_year')['booking_revenue']
            .sum()
            .reset_index(name='Total Revenue'))


yearly_revenue_city = yearly_revenue('City Hotel', 0)
yearly_revenue_resort = yearly_revenue('Resort Hotel', 0)

yearly_revenue_city_cancel = yearly_revenue('City Hotel', 1)
yearly_revenue_resort_cancel = yearly_revenue('Resort Hotel', 1)

revenue_series = [
    ('Resort Revenue', yearly_revenue_resort),
    ('Resort Lost Revenue', yearly_revenue_resort_cancel),
    ('City Revenue', yearly_revenue_city),
    ('City Lost Revenue', yearly_revenue_city_cancel),
]

# Each line carries its own label, so the legend cannot get out of sync
# with the plotting order.
for label, yearly in revenue_series:
    g = sns.lineplot(x='arrival_date_year', y='Total Revenue', data=yearly, label=label)

# Years are discrete: show one tick per year instead of fractional values.
g.set_xticks(sorted(data['arrival_date_year'].unique()))

plt.title("Yearly Revenue")
plt.legend()
plt.show()

- Yearly revenue of both hotels increased in 2016.

In [None]:
analysis['arrival_date_year'].append('In each year canceling probability is higher for Ciy Hotel.')

In [None]:
columns_to_dummy.append('arrival_date_year')

In [None]:
#Showcasing final analysis on arrival_date_year

analysis['arrival_date_year']

<a id='arrival_date_week_number'></a>

### 4 - arrival_date_week_number: - [main](#main)

In [None]:
data['arrival_date_week_number'].describe()

In [None]:
data['arrival_date_week_number'].value_counts()[:10]

In [None]:
# We want to know in which week of the month the guest arrived.
# arrival_date_week_number is the week of the year, so `week_number % 4`
# does not give the week within the month. It is derived from the day of
# the month instead: days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22-28 -> 4, 29-31 -> 5.

data['arrival_date_weekth_in_month'] = (data['arrival_date_day_of_month'] - 1) // 7 + 1

In [None]:
data['arrival_date_weekth_in_month'].describe()

In [None]:
data['arrival_date_weekth_in_month'].value_counts(sort=False)

In [None]:
g = sns.catplot(x='arrival_date_weekth_in_month', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Canceling Probabilities - Arrival Weekth of Month')

plt.show(g)

In [None]:
plot_canceling_prob('arrival_date_weekth_in_month', data)

-  Canceling probability is high for weeks 1st and 2nd.

In [None]:
analysis['arrival_date_week_number'].append('Canceling probability is high for weeks 1st and 2nd.')

In [None]:
g = sns.boxplot(x='arrival_date_weekth_in_month', y='adr', data=data, hue='hotel')

plt.title("Ith week ADR")
plt.show(g) 

In [None]:
columns_to_dummy.append('arrival_date_weekth_in_month')

In [None]:
#Showcasing final analysis on arrival_date_week_number 
analysis['arrival_date_week_number']

<a id='arrival_date_month'></a>

### 5 - arrival_date_month: - [main](#main)

In [None]:
data['arrival_date_month'].describe()

In [None]:
data['arrival_date_month'].unique()

In [None]:
canceled_months = data[data['is_canceled'] == 1].groupby('arrival_date_month').size().reset_index(name='Total')
total_months = data.groupby('arrival_date_month').size().reset_index(name='Total')

g = sns.barplot(x='arrival_date_month', y='Total', data=total_months)
g = sns.barplot(x='arrival_date_month', y='Total', data=canceled_months, color='r')

plt.title("Montly Bookings vs Canceled Bookings")
plt.show()

In [None]:
g = sns.catplot(x='arrival_date_month', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Month - Canceling Probability')

plt.show(g)

- We have higher canceling probabilities for summer.

In [None]:
analysis['arrival_date_month'].append('We have higher canceling probabilities for summer.')

In [None]:
# we may try to generate season data.

def month_to_season(month):
    """
    Map an English month name to its season.

    June-August -> "summer", March-May -> "spring",
    September-November -> "autumn"; every other value -> "winter".
    """
    season_months = (
        ("summer", ('June', 'July', 'August')),
        ("spring", ('March', 'April', 'May')),
        ("autumn", ('October', 'November', 'September')),
    )

    for season, months_in_season in season_months:
        if month in months_in_season:
            return season

    # Anything not listed above falls through to winter.
    return "winter"

In [None]:
data['seasons'] = data['arrival_date_month'].apply(month_to_season)

In [None]:
data['seasons'].value_counts()

In [None]:
canceled_seasons = data[data['is_canceled'] == 1].groupby('seasons').size().reset_index(name='Total')
total_seasons = data.groupby('seasons').size().reset_index(name='Total')

g = sns.barplot(x='seasons', y='Total', data=total_seasons)
g = sns.barplot(x='seasons', y='Total', data=canceled_seasons, color='r')

plt.title("Seasons Bookings vs Canceled Bookings")
plt.show()


In [None]:
g = sns.catplot(x='seasons', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Season - Canceling Probability')

plt.show(g)

- Lowest Cancel Probability is for winter season. 

In [None]:
analysis['arrival_date_month'].append('Lowest Cancel Probability is for winter season.')

In [None]:
plot_canceling_prob('seasons', data)

In [None]:
g = sns.boxplot(x='seasons', y='adr', data=data, hue='hotel')

plt.title("Seasons ADR")
plt.show(g) 

In [None]:
g = sns.catplot(x='seasons', y='is_canceled', hue='hotel', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Season - Canceling Probability')

plt.show(g)

In [None]:
columns_to_dummy.extend(['arrival_date_month'])

In [None]:
#Showcasing final analysis on arrival_date_month
analysis['arrival_date_month']

In [None]:
columns_to_remove.append('seasons')

<a id='arrival_date_day_of_month'></a>

### 6 - arrival_date_day_of_month: - [main](#main)

In [None]:
data['arrival_date_day_of_month'].describe()

In [None]:
g = sns.countplot(x='arrival_date_day_of_month', data=data)

g.set(title='Arrival Day of Month - Reservation Count')

plt.show(g)

In [None]:
g = sns.catplot(x='arrival_date_day_of_month', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Day of Month - Canceling Probability')

plt.show(g)

In [None]:
g = sns.catplot(x='arrival_date_day_of_month', y='is_canceled', hue='hotel', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Day of Month - Canceling Probability')

plt.show(g)

In [None]:
g = sns.lineplot(x='arrival_date_day_of_month', y='adr', data=data, hue='hotel')

plt.xticks(range(0, 31))
plt.title("Arrival Date of Month ADR")
plt.show(g) 

In [None]:
# we will convert day of month to day of week

def date_to_day_of_week(row):
    """
    Return the weekday name (e.g. "Monday") of a booking's arrival date.

    `row` must provide 'arrival_date_year', 'arrival_date_month' (English
    month name) and 'arrival_date_day_of_month'.
    """
    from datetime import date

    month_names = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                   'September', 'October', 'November', 'December')

    # Month names are listed in calendar order, so position + 1 is the month number.
    month_number = month_names.index(row['arrival_date_month']) + 1

    arrival = date(row['arrival_date_year'], month_number, row['arrival_date_day_of_month'])

    return arrival.strftime("%A")

In [None]:
data['arrival_date_day_of_week'] = np.nan

In [None]:
# Apply on `data` itself so the result keeps data's own index. Rows were
# dropped earlier, so the index has gaps; applying on data.reset_index()
# yields a 0..n-1 index, and the assignment would then align weekdays to the
# wrong rows and leave NaN for index labels beyond n-1.
data['arrival_date_day_of_week'] = data.apply(date_to_day_of_week, axis=1)

In [None]:
g = sns.countplot(x='arrival_date_day_of_week', data=data)

g.set(title='Arrival Day of Week')
plt.show(g)

In [None]:
g = sns.lineplot(x='arrival_date_day_of_week', y='adr', data=data, hue='hotel')

plt.title("Arrival Date of Week ADR")
plt.show(g) 

In [None]:
g = sns.catplot(x='arrival_date_day_of_week', y='is_canceled', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Day of Week - Canceling Probability')

plt.show(g)

In [None]:
g = sns.catplot(x='arrival_date_day_of_week', y='is_canceled', hue='hotel', data=data, kind='bar', height=7, aspect=2)

g.despine(left=True)

g.set(title='Arrival Day of Week - Canceling Probability')

plt.show(g)

In [None]:
analysis['arrival_date_day_of_month'].append('Risky days for hotels differ from each other.')

In [None]:
# we will drop the column of day_of_month

columns_to_remove.append('arrival_date_day_of_month')

In [None]:
columns_to_dummy.append('arrival_date_day_of_week')

In [None]:
#Showcasing final analysis on 'arrival_date_day_of_month'
analysis['arrival_date_day_of_month']

<a id='stays_in_weekend_nights'></a>

### 7 - stays_in_weekend_nights: - [main](#main)

- Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel

In [None]:
data['stays_in_weekend_nights'].isna().sum()

In [None]:
data['stays_in_weekend_nights'].describe()

In [None]:
sorted(data['stays_in_weekend_nights'].unique())

In [None]:
g = sns.catplot(x='stays_in_weekend_nights', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

g.set(title='Weekend Nights - Canceling Probabilities')

plt.show(g)

<a id='stays_in_week_nights'></a>

### 8 - stays_in_week_nights: - [main](#main)

 - Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel

In [None]:
data['stays_in_week_nights'].isna().sum()

In [None]:
data['stays_in_week_nights'].describe()

In [None]:
data['stays_in_week_nights'].value_counts(normalize=True)

In [None]:
# Share of canceled bookings for each number of week nights.
g = sns.catplot(x='stays_in_week_nights', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

g.set(title='Week Nights - Canceling Probabilities')

plt.show()

In [None]:
# we may check the total stay time.

data['stays_total'] = data['stays_in_week_nights'] + data['stays_in_weekend_nights']

In [None]:
data['stays_total'].describe()

In [None]:
# Share of canceled bookings for each total number of nights (week + weekend).
g = sns.catplot(x='stays_total', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

g.set(title='Total Nights - Canceling Probabilities')

plt.show()

In [None]:
g = sns.boxplot(x='total_of_special_requests', y='stays_total', data=data, hue='hotel')

plt.title("Total Special Requests vs Stays Total")
plt.show(g)

In [None]:
g = sns.boxplot(x='is_repeated_guest', y='stays_total', data=data, hue='hotel')

plt.title("Repeated Guest vs Stays Total")
plt.show(g)

<a id='adults'></a>

### 9 - adults: - [main](#main)

- Number of adults

In [None]:
data['adults'].describe()

In [None]:
data['adults'].unique()

In [None]:
data['adults'].value_counts()

In [None]:
g = sns.countplot(x='adults', data=data)

plt.show(g)

In [None]:
g = sns.catplot(x='adults', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

plt.show(g)

In [None]:
# we can create a column for all adults count > 4

def adults_large(adults):
    """Cap the adult count: any value above 4 is reported as 5, other values are returned unchanged."""
    return 5 if adults > 4 else adults
    

In [None]:
data['adults'] = data['adults'].apply(adults_large)

In [None]:
g = sns.catplot(x='adults', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

plt.show(g)

<a id='children'></a>

### 10 - children: - [main](#main)

- Number of children

In [None]:
data['children'].value_counts()

In [None]:
# 10 children seems to be wrong or outliar value.

data[data['children'] > 9]

In [None]:
data.drop(data[data['children'] > 9].index, axis=0, inplace=True)

In [None]:
g = sns.catplot(x='children', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

plt.show(g)

<a id='babies'></a>

### 11 - babies: - [main](#main)

- Number of babies

In [None]:
data['babies'].describe()

In [None]:
data['babies'].unique()

In [None]:
data['babies'].value_counts()

- 9 and 10 babies seem to be outliers.

In [None]:
data[data['babies'] == 9]

In [None]:
data[data['babies'] == 10]

In [None]:
data.groupby('babies')['is_canceled'].value_counts(normalize=True)

In [None]:
# we will remove the outliar baby counts

data.drop(data[data['babies'] > 8].index, axis=0, inplace=True)

In [None]:
g = sns.catplot(x='babies', y='is_canceled', data=data, kind='bar', height=8, aspect=2)

g.despine(left=True)

plt.show(g)

In [None]:
data.drop(data[data['babies'] > 8].index, axis=0, inplace=True)

In [None]:
data['babies'].value_counts() # we don't have outliars any more.

<a id='meal'></a>

### 12 - meal: - [main](#main)

Type of meal booked. Categories are presented in standard hospitality meal packages: <hr>
BO, BL and ML <hr>
Undefined/SC – no meal package <hr>
BB – Bed & Breakfast <hr>
HB – Half board (breakfast and one other meal – usually dinner) <hr>
FB – Full board (breakfast, lunch and dinner) <hr>

In [None]:
data['meal'].unique()

In [None]:
# 'SC' and 'undefined' means same thing no meal

In [None]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# a column selected from the frame is chained assignment and is not
# guaranteed to modify `data` in recent pandas versions.
data['meal'] = data['meal'].replace(['SC', 'Undefined'], 'NoMeal')

In [None]:
g = sns.countplot(x='meal', hue='is_canceled', data=data)

g.set(title='Meal Type')

plt.show(g)

In [None]:
g = sns.catplot(x='meal', y='is_canceled', data=data, kind='bar', aspect=3)

g.set(title='Each meal type canceling Probs')

plt.show(g)

In [None]:
g = sns.lineplot(x='meal', y='adr', data=data, hue='hotel')

plt.title("Meal vs ADR")
plt.show(g)

- Full meal is the cheapest for city hotel and nomeal is the cheapest for the resort hotel.

In [None]:
columns_to_dummy.append('meal')

<a id='country'></a>

### 13 - Country: - [main](#main)

In [None]:
countries_bookings = data.groupby(['country']).size().reset_index(name = 'Total')
canceled_countries = data[data['is_canceled'] == 1].groupby(['country']).size().reset_index(name = 'Canceled')
not_canceled_countries = data[data['is_canceled'] == 0].groupby(['country']).size().reset_index(name = 'Not_Canceled')

In [None]:
import pycountry

def country_code_to_name(country_code):
    """
    Look up a country's name from its code via pycountry.

    Two-character codes are looked up as alpha-2, all other lengths as
    alpha-3. Returns 'Not Found' when the lookup yields nothing.
    """
    lookup_field = 'alpha_2' if len(country_code) == 2 else 'alpha_3'
    country = pycountry.countries.get(**{lookup_field: country_code})

    return country.name if country else 'Not Found'
        

In [None]:
countries_bookings['country_name'] = countries_bookings['country'].apply(country_code_to_name)
not_canceled_countries['country_name'] = not_canceled_countries['country'].apply(country_code_to_name)
canceled_countries['country_name'] = canceled_countries['country'].apply(country_code_to_name)

In [None]:
import plotly.express as px

px.choropleth(countries_bookings,
                    locations = "country",
                    color= "Total", 
                    hover_name= "country_name",
                    color_continuous_scale=px.colors.sequential.Oranges,
                    title="Booking Counts by Countries")

In [None]:
px.choropleth(not_canceled_countries,
                    locations = "country",
                    color= "Not_Canceled", 
                    hover_name= "country_name",
                    color_continuous_scale=px.colors.sequential.Oranges,
                    title="Not_Canceled Booking Counts by Countries")

In [None]:
px.choropleth(canceled_countries,
                    locations = "country",
                    color= "Canceled", 
                    hover_name= "country_name",
                    color_continuous_scale=px.colors.sequential.Oranges,
                    title="Canceled Booking Counts by Countries")

In [None]:
columns_to_dummy.append('country')

<a id='market_segment'></a>

### 14 - market_segment: - [main](#main)

- Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”

In [None]:
data['market_segment'].unique()

In [None]:
g = sns.countplot(x='market_segment', data=data)

plt.show(g)

In [None]:
g = sns.catplot(x='market_segment', y='is_canceled', data=data, kind='bar', aspect=3)

plt.show(g)

In [None]:
plot_canceling_prob('market_segment', data)

In [None]:
columns_to_dummy.append('market_segment')

<a id='distribution_channel'></a>

### 15 - distribution_channel: - [main](#main)

- Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”

In [None]:
data['distribution_channel'].unique()

In [None]:
count_cat_prob_plot('distribution_channel', data)

In [None]:
columns_to_dummy.append('distribution_channel')

<a id='is_repeated_guest'></a>

### 16 - is_repeated_guest: - [main](#main)

- Value indicating if the booking name was from a repeated guest (1) or not (0)

In [None]:
data['is_repeated_guest'].unique()

In [None]:
count_cat_prob_plot('is_repeated_guest', data)

- Most bookings are from new customers.

- Bookings from repeated guests are less likely to be canceled than bookings from new customers.

In [None]:
g = sns.countplot(x='is_repeated_guest', data=data, hue='is_canceled')

In [None]:
analysis['is_repeated_guest'].extend(['Most of booking are from new customers.', 'Repeated bookings have less canceling probability than new comers.'])

In [None]:
#Showcasing final analysis on is_repeated_guest
analysis['is_repeated_guest']

<a id='previous_bookings_not_canceled'></a>

### 17 - previous_bookings_not_canceled and previous_cancellations: - [main](#main)

- Number of previous bookings <b>not cancelled</b> by the customer prior to the current booking

In [None]:
data['previous_bookings_not_canceled'].unique()

In [None]:
data['previous_bookings_not_canceled'].describe()

In [None]:
def cancel_ratio(row):
    """
    Share of a customer's previous bookings that were canceled.

    Computed as previous_cancellations / (previous_bookings_not_canceled +
    previous_cancellations); returns 0 when the customer has no previous bookings.
    """
    canceled = row['previous_cancellations']
    previous_total = row['previous_bookings_not_canceled'] + canceled

    # Guard against division by zero for customers without any history.
    return canceled / previous_total if previous_total != 0 else 0

In [None]:
data['customer_cancel_ratio'] = data.apply(cancel_ratio, axis=1)

In [None]:
data['customer_cancel_ratio'].describe()

In [None]:
data.groupby('is_canceled')['customer_cancel_ratio'].mean()

In [None]:
g = sns.barplot(x='is_canceled', y='customer_cancel_ratio', data=data)


plt.title("Customer Cancel Ratio")
plt.show(g)

In [None]:
columns_to_remove.extend(['previous_bookings_not_canceled', 'previous_cancellations'])

<a id='previous_cancellations'></a>

### 18 - previous_cancellations: [main](#main)

 - Number of previous bookings that were <b>cancelled</b> by the customer prior to the current booking

In [None]:
data['previous_cancellations'].unique()

In [None]:
data['previous_cancellations'].value_counts()

In [None]:
count_cat_prob_plot('previous_cancellations', data[data['previous_cancellations'] > 1])

<a id='reserved_room_type'></a>

### 19 - reserved_room_type: [main](#main)

- Code of room type reserved. Code is presented instead of designation for anonymity reasons

In [None]:
data['reserved_room_type'].unique()

In [None]:
count_cat_prob_plot('reserved_room_type', data)

In [None]:
def is_room_changed(row):
    """
    Flag whether the assigned room type differs from the reserved one.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'assigned_room_type' and 'reserved_room_type'.

    Returns
    -------
    int
        0 when both room types are equal, 1 otherwise.
    """
    same_room = row['assigned_room_type'] == row['reserved_room_type']
    return 0 if same_room else 1

In [None]:
# 1 when the assigned room type differs from the reserved one, 0 otherwise.
# Comparing the two columns directly replaces a row-by-row apply and gives the same 0/1 values.
data['room_changed'] = (data['assigned_room_type'] != data['reserved_room_type']).astype(int)

In [None]:
g = sns.catplot(x='room_changed', y='is_canceled', data=data, kind='bar', aspect=2, height=8)

In [None]:
columns_to_dummy.append('reserved_room_type')

<a id='assigned_room_type'></a>

### 20 - assigned_room_type: [main](#main)

- Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons

In [None]:
data['assigned_room_type'].unique()

In [None]:
count_cat_prob_plot('assigned_room_type', data)

In [None]:
# NOTE(review): the original note on this cell said assigned_room_type would be removed because
# it looks similar to reserved_room_type, but the statement below queues it for dummy encoding
# instead of adding it to columns_to_remove. Confirm which behaviour is intended.

columns_to_dummy.append('assigned_room_type')

<a id='booking_changes'></a>

### 21 - booking_changes: [main](#main)

- Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation

In [None]:
data['booking_changes'].describe()

In [None]:
data['booking_changes'].unique()

In [None]:
data['booking_changes'].value_counts()

In [None]:
count_cat_prob_plot('booking_changes', data)

<a id='deposit_type'></a>

### 22 - deposit_type: [main](#main)

- Indication on if the customer made a deposit to guarantee the booking. No Deposit – no deposit was made;

In [None]:
data['deposit_type'].describe()

In [None]:
data['deposit_type'].unique()

In [None]:
data.groupby('deposit_type')['is_canceled'].value_counts(normalize=True)

In [None]:
count_cat_prob_plot('deposit_type', data)

In [None]:
g = sns.lineplot(x='deposit_type', y='customer_total_payment', data=data, hue='hotel')

 - Total payment is lowest for the non-refund deposit type

In [None]:
columns_to_dummy.append('deposit_type')

<a id='days_in_waiting_list'></a>

### 23 - days_in_waiting_list: [main](#main)

- Number of days the booking was in the waiting list before it was confirmed to the customer.

In [None]:
data['days_in_waiting_list'].describe()

In [None]:
data['days_in_waiting_list'].unique()

In [None]:
data['days_in_waiting_list_30'] = data['days_in_waiting_list'] // 30

In [None]:
count_cat_prob_plot('days_in_waiting_list_30', data[data['days_in_waiting_list_30'] > 0])

<a id='customer_type'></a>

### 24 - customer_type: [main](#main)

- Type of booking, assuming one of four categories 


Contract - when the booking has an allotment or other type of contract associated to it <hr>
Group – when the booking is associated to a group; <hr>
Transient – when the booking is not part of a group or contract, and is not associated to other transient booking; <hr>
Transient-party – when the booking is transient, but is associated to at least other transient booking <hr>

In [None]:
data['customer_type'].unique()

In [None]:
count_cat_prob_plot('customer_type', data)

In [None]:
columns_to_dummy.append('customer_type')

<a id='adr'></a>

### 25 - adr: [main](#main)

- Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights <hr>
A hotel’s ADR, Average Daily Rate, is the measure of the average rate paid per room that’s occupied at the property. Ultimately, it’s a KPI that helps hoteliers identify their room rates from a day-to-day perspective. ADR is calculated to have an understanding of a hotel’s profits and performance.

In [None]:
data['adr'].describe()

In [None]:
data[data['adr'] < 0]['adr']

In [None]:
# Remove, in place, the rows with a negative adr (the rows listed in the previous cell).
data.drop(data[data['adr'] < 0].index, axis=0, inplace=True)

In [None]:
# adr of every booking in row (index) order, coloured by is_canceled, to spot outliers.
g = sns.scatterplot(x=data.index, y=data['adr'], hue=data['is_canceled'])

# pyplot.show() does not take an Axes; its only parameter is the keyword `block`.
plt.show()

In [None]:
# We have an outlier in the adr column with a value greater than 5000

In [None]:
data[data['adr'] > 1000]

In [None]:
# Remove, in place, every row whose adr exceeds 1000 (the rows displayed in the previous cell).
data.drop(data[data['adr'] > 1000].index, axis=0, inplace=True)

In [None]:
# Same adr-by-index scatter as before, redrawn after the adr > 1000 rows were dropped.
g = sns.scatterplot(x=data.index, y=data['adr'], hue=data['is_canceled'])

# pyplot.show() does not take an Axes; its only parameter is the keyword `block`.
plt.show()

- We have to be careful when we train our model because the bookings are separated (grouped) by index.

<a id='required_car_parking_spaces'></a>

### 26 - required_car_parking_spaces: [main](#main)

- Number of car parking spaces required by the customer 

In [None]:
data['required_car_parking_spaces'].describe()

In [None]:
data['required_car_parking_spaces'].unique()

In [None]:
data['required_car_parking_spaces'].value_counts()

In [None]:
count_cat_prob_plot('required_car_parking_spaces', data)

- Customers who required parking slots don't cancel their bookings.

In [None]:
def car_parking_space(required_park):
    """
    Collapse a parking-space count into a binary flag.

    Parameters
    ----------
    required_park : number
        Number of car parking spaces requested for the booking.

    Returns
    -------
    int
        1 if at least one space was requested (value > 0), otherwise 0.
    """
    return int(required_park > 0)

In [None]:
# Overwrite the count with a 0/1 flag: 1 when at least one parking space was requested.
# A vectorized comparison replaces the per-element apply and yields the same values.
data['required_car_parking_spaces'] = (data['required_car_parking_spaces'] > 0).astype(int)

In [None]:
count_cat_prob_plot('required_car_parking_spaces', data)

In [None]:
columns_to_remove.append('required_car_parking_spaces')

<a id='total_of_special_requests'></a>

### 27 - total_of_special_requests: - [main](#main)

- Number of special requests made by the customer (e.g. twin bed or high floor)

In [None]:
data['total_of_special_requests'].unique()

In [None]:
data['total_of_special_requests'].value_counts()

In [None]:
count_cat_prob_plot('total_of_special_requests', data)

- Increasing special requests decreases canceling probabilities.

In [None]:
# customer_total_payment against the number of special requests, one line per hotel.
g = sns.lineplot(x='total_of_special_requests', y='customer_total_payment', data=data, hue='hotel')

plt.title('Special Requests vs Total Payment')
# pyplot.show() does not take an Axes; its only parameter is the keyword `block`.
plt.show()

- Increase in special requests increases total payment

<a id='reservation_status'></a>

### 28 - reservation_status: - [main](#main)

- Reservation last status, assuming one of three categories: <hr>
Canceled – booking was canceled by the customer; <hr>
Check-Out – customer has checked in but already departed; <hr>
No-Show – customer did not check-in and did inform the hotel of the reason why <hr>

In [None]:
data['reservation_status'].unique()

In [None]:
count_cat_prob_plot('reservation_status', data)

In [None]:
columns_to_remove.append('reservation_status')

In [None]:
columns_to_remove.append('reservation_status_date')

<h1 align='center'>Finalize Dataset and Save </h1>

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows and rebuild a clean 0..n-1 index.
# A fixed random_state makes the row order (and therefore the saved dataset) reproducible
# across kernel restarts; without it every run produces a different ordering.
data = shuffle(data, random_state=42).reset_index(drop=True)

In [None]:
data.shape

In [None]:
columns_to_remove

In [None]:
columns_to_dummy

In [None]:
cleaned_data = data.drop(columns=columns_to_remove, axis=1)

In [None]:
cleaned_data = pd.get_dummies(cleaned_data, columns=columns_to_dummy)

In [None]:
cleaned_data.shape # because of dummy columns we have too many columns. 

In [None]:
cleaned_data.info() # we removed all object (str) data.