In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Exploration

In [79]:
# Load the hotel reservations dataset; the path is relative to the current working directory.
data = pd.read_csv('./Hotel Reservations.csv')

In [80]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [81]:
# List the column labels available in the loaded frame.
data.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'avg_price_per_room', 'no_of_special_requests', 'booking_status'],
      dtype='object')

In [82]:
# Summarise the dtype and non-null count of every column to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

### Categorical Variables
no_of_adults
<br>
no_of_children
<br>
no_of_weekend_nights
<br>
type_of_meal_plan
<br>
room_type_reserved
<br>
market_segment_type

In [83]:
# Columns to keep when subsetting `data` for the analysis below.
req_cols = [
    'no_of_children',
    'no_of_weekend_nights',
    'type_of_meal_plan',
    'lead_time',
    'room_type_reserved',
    'avg_price_per_room',
    'booking_status',
]

In [84]:
# Narrow `data` to the columns listed in `req_cols`. This rebinds the name, so the
# full frame loaded from the CSV is no longer reachable through `data` afterwards.
data = data[req_cols]

In [85]:
# Display the narrowed frame (the rendered output elides the middle rows).
data

Unnamed: 0,no_of_children,no_of_weekend_nights,type_of_meal_plan,lead_time,room_type_reserved,avg_price_per_room,booking_status
0,0,1,Meal Plan 1,224,Room_Type 1,65.00,Not_Canceled
1,0,2,Not Selected,5,Room_Type 1,106.68,Not_Canceled
2,0,2,Meal Plan 1,1,Room_Type 1,60.00,Canceled
3,0,0,Meal Plan 1,211,Room_Type 1,100.00,Canceled
4,0,1,Not Selected,48,Room_Type 1,94.50,Canceled
...,...,...,...,...,...,...,...
36270,0,2,Meal Plan 1,85,Room_Type 4,167.80,Not_Canceled
36271,0,1,Meal Plan 1,228,Room_Type 1,90.95,Canceled
36272,0,2,Meal Plan 1,148,Room_Type 1,98.39,Not_Canceled
36273,0,0,Not Selected,63,Room_Type 1,94.50,Canceled


In [86]:
# Re-check dtypes and non-null counts after subsetting to the required columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   no_of_children        36275 non-null  int64  
 1   no_of_weekend_nights  36275 non-null  int64  
 2   type_of_meal_plan     36275 non-null  object 
 3   lead_time             36275 non-null  int64  
 4   room_type_reserved    36275 non-null  object 
 5   avg_price_per_room    36275 non-null  float64
 6   booking_status        36275 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 1.9+ MB


In [87]:
# Columns treated as categorical; each one is tested against booking_status below.
categorical_columns = [
    'no_of_weekend_nights',
    'no_of_children',
    'type_of_meal_plan',
    'room_type_reserved',
]

## Chi-Square Test

In [88]:
from scipy.stats import chi2_contingency

In [91]:
# Chi-square test of independence between each categorical column and booking_status.
# For every column: drop rare categories, build the contingency table, run the test,
# and print the statistic, degrees of freedom, expected frequencies and the decision.
MIN_CATEGORY_FREQUENCY = 20  # categories must occur more than this many times to be tested
SIGNIFICANCE_LEVEL = 0.05    # H0 is reported as rejected when the p-value is below this

for column in categorical_columns:
    print(f"Variable: {column}")
    print(f"Null-Hypothesis: {column} and booking_status are independent / {column} has no role in determining whether guest is likely to cancel the booking.")

    # Build a (category, frequency) table. The axis and the counts are named explicitly
    # because the labels produced by value_counts().reset_index() differ between
    # pandas 1.x ('index', <column>) and pandas 2.x (<column>, 'count'); relying on
    # a 'count' column breaks on the older versions.
    temp_df = data[column].value_counts().rename_axis(column).reset_index(name='frequency')

    # Keep only rows whose category is frequent enough; sparse categories are excluded
    # from the contingency table.
    categories = temp_df.loc[temp_df['frequency'] > MIN_CATEGORY_FREQUENCY, column].tolist()
    data_new = data.loc[data[column].isin(categories)].reset_index(drop=True)

    contingency_table = pd.crosstab(data_new[column], data_new['booking_status'])
    print(f"{column} contingency table:\n{contingency_table}\n")

    stat, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-Square Statistic: {stat}, Degree of freedom: {dof}\n")
    print(f"Expected Frequency:\n{expected}")
    # NOTE: "H0 - accepted" means H0 was not rejected at SIGNIFICANCE_LEVEL; a large
    # p-value is not by itself evidence that the variables are independent.
    print(f"p-value: {p}|{p:.10f}, {'H0 - rejected' if p < SIGNIFICANCE_LEVEL else 'H0 - accepted'}\n\n")

Variable: no_of_weekend_nights
Null-Hypothesis: no_of_weekend_nights and booking_status are independent / no_of_weekend_nights has no role in determining whether guest is likely to cancel the booking.
no_of_weekend_nights contingency table:
booking_status        Canceled  Not_Canceled
no_of_weekend_nights                        
0                         5093         11779
1                         3432          6563
2                         3157          5914
3                           74            79
4                           83            46
5                           29             5

Chi-Square Statistic: 197.2088378397259, Degree of freedom: 5

Expected Frequency:
[[5.52316699e+03 1.13488330e+04]
 [3.27193303e+03 6.72306697e+03]
 [2.96945518e+03 6.10154482e+03]
 [5.00856181e+01 1.02914382e+02]
 [4.22290506e+01 8.67709494e+01]
 [1.11301374e+01 2.28698626e+01]]
p-value: 1.123163658201841e-40|0.0000000000, H0 - rejected


Variable: no_of_children
Null-Hypothesis: no_of_childre