In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Business Understanding 1

Describe the purpose of the data set you selected (i.e., why was this data collected in the first place?). How will you measure the effectiveness of a good algorithm? Why does your chosen validation method make sense for this specific
dataset and the stakeholders needs?

# Data Understanding 1

Describe the meaning and type of data (scale, values, etc.) for each attribute in the data file. Verify data quality: Are there missing values? Duplicate data? Outliers? Are those mistakes? How do you deal with these problems?

In [2]:
hotel = pd.read_csv("data/hotel_bookings.csv",low_memory=False)

In [3]:
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
hotel.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


One hot encode "hotel", 'arrival_date_month', 'customer_type', 'reservation_status', 'deposit_type' but first make sure you check for dups

drop company column and reservation_status_date

remove NA rows for agent

In [5]:
# dropping company column as there isn't enough data 
del hotel['company']

#### Checking for duplicates

In [6]:
hotel_dup = hotel["hotel"].value_counts().reset_index()
hotel_dup

Unnamed: 0,index,hotel
0,City Hotel,79330
1,Resort Hotel,40060


In [7]:
arrival_date_dup = hotel["arrival_date_month"].value_counts().reset_index()
arrival_date_dup

Unnamed: 0,index,arrival_date_month
0,August,13877
1,July,12661
2,May,11791
3,October,11160
4,April,11089
5,June,10939
6,September,10508
7,March,9794
8,February,8068
9,November,6794


In [8]:
cust_type_dup = hotel["customer_type"].value_counts().reset_index()
cust_type_dup

Unnamed: 0,index,customer_type
0,Transient,89613
1,Transient-Party,25124
2,Contract,4076
3,Group,577


In [9]:
reservation_dup = hotel["reservation_status"].value_counts().reset_index()
reservation_dup

Unnamed: 0,index,reservation_status
0,Check-Out,75166
1,Canceled,43017
2,No-Show,1207


In [10]:
deposit_dup = hotel["deposit_type"].value_counts().reset_index()
deposit_dup

Unnamed: 0,index,deposit_type
0,No Deposit,104641
1,Non Refund,14587
2,Refundable,162


#### Dropping NA values from agent 

the loss of data is not significant

In [11]:
hotel.dropna(subset=['agent'])
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 31 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

#### One hot encoding

"hotel", 'arrival_date_month', 'customer_type', 'reservation_status', 'deposit_type'

In [12]:
#one hot encoding using get dummies
categorical_features = ["hotel", 'arrival_date_month', 'customer_type', 'reservation_status', 'deposit_type']

tmp_df = pd.get_dummies(hotel.hotel,prefix="hotel")
hotel = pd.concat((hotel,tmp_df),axis=1)

tmp_df = pd.get_dummies(hotel.arrival_date_month,prefix="arrival_date_month")
hotel = pd.concat((hotel,tmp_df),axis=1)

tmp_df = pd.get_dummies(hotel.customer_type,prefix="customer_type")
hotel = pd.concat((hotel,tmp_df),axis=1)

tmp_df = pd.get_dummies(hotel.reservation_status,prefix="reservation_status")
hotel = pd.concat((hotel,tmp_df),axis=1)

tmp_df = pd.get_dummies(hotel.deposit_type,prefix="deposit_type")
hotel = pd.concat((hotel,tmp_df),axis=1)

#delete the extra columns
del hotel['hotel']
del hotel['arrival_date_month']
del hotel['customer_type']
del hotel['reservation_status']
del hotel['deposit_type']

hotel.head()


Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_Canceled,reservation_status_Check-Out,reservation_status_No-Show,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable
0,0,342,2015,27,1,0,0,2,0.0,0,...,0,0,1,0,0,1,0,1,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,...,0,0,1,0,0,1,0,1,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,...,0,0,1,0,0,1,0,1,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,...,0,0,1,0,0,1,0,1,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,...,0,0,1,0,0,1,0,1,0,0


In [13]:
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 50 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     119390 non-null  int64  
 1   lead_time                       119390 non-null  int64  
 2   arrival_date_year               119390 non-null  int64  
 3   arrival_date_week_number        119390 non-null  int64  
 4   arrival_date_day_of_month       119390 non-null  int64  
 5   stays_in_weekend_nights         119390 non-null  int64  
 6   stays_in_week_nights            119390 non-null  int64  
 7   adults                          119390 non-null  int64  
 8   children                        119386 non-null  float64
 9   babies                          119390 non-null  int64  
 10  meal                            119390 non-null  object 
 11  country                         118902 non-null  object 
 12  market_segment  

# Data Understanding 2

Visualize the any important attributes appropriately. Important: Provide an interpretation for any charts or graphs.

# Modeling and Evaluation 1

train and adjust parameters (GridSearch)

# Modeling and Evaluation 2

evaluate and compare

# Modeling and Evaluation 3

Visualize

# Modeling and Evaluation 4

Summarize and ramifications

# Deployment

Be critical of your performance and tell the reader how you current model might be usable by other parties. Did you achieve your goals? If not, can you reign in the utility of your modeling? How useful is your model for interested parties (i.e., the companies or organizations that might want to use it)? How would your deploy your model for interested parties? What other data should be collected? How often would the model need to be updated, etc.?


# Exceptional Work

You have free reign to provide additional analyses or combine analyses.