In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from colorama import Fore, Style
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
import os
import sys
# Put the parent directory on the import path (once), so packages that live
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


## Load Data & Select Valid Countries

In [2]:
# Read the raw bookings CSV (path is relative to the notebook's working directory)
raw_csv_parts = ('..', 'raw_data', 'hotel_bookings_raw.csv')
data_path = os.path.join(*raw_csv_parts)
df = pd.read_csv(data_path)

# Column overview: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 43 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [3]:
# Columns excluded from this analysis
unused_columns = [
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'children',
    'babies',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'days_in_waiting_list',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'reservation_status_date',
    'MO_YR',
]

# Drop those columns, then remove exact duplicate rows and any row that still
# contains a missing value. The original index labels are kept (no reset here).
df = df.drop(columns=unused_columns).drop_duplicates().dropna()

In [4]:
# Countries kept for the analysis
valid_countries = ['PRT', 'GBR', 'ESP', 'FRA', 'DEU']

# Keep only the bookings whose country is in that list
country_mask = df['country'].isin(valid_countries)
filtered_df = df.loc[country_mask]

# Sanity check: which countries remain after filtering
# print(f"After removing countries appearing less than {country_threshold} times there are {len(filtered_df)} samples left")
remaining_countries = filtered_df['country'].unique()
print(f"There are still {len(remaining_countries)} countries")
print(remaining_countries)

There are still 5 countries
['PRT' 'GBR' 'ESP' 'FRA' 'DEU']


In [5]:
# Share of bookings per country, expressed in percent (most frequent first)
country_share = filtered_df['country'].value_counts(normalize=True)
country_percentages = country_share.mul(100)

print(country_percentages)

country
PRT    45.420270
GBR    17.832893
FRA    15.115074
ESP    12.459796
DEU     9.171968
Name: proportion, dtype: float64


In [6]:
# Remove bookings where meal, market segment or distribution channel carries
# the placeholder value "Undefined".
placeholder_columns = ["meal", "market_segment", "distribution_channel"]
has_placeholder = filtered_df[placeholder_columns].eq("Undefined").any(axis=1)
filtered_df = filtered_df[~has_placeholder]

print(f"After removing undefined values there are {len(filtered_df)} samples left")

After removing undefined values there are 55593 samples left


In [7]:
# Work on an independent (deep) copy from here on, so the encoding steps below
# leave filtered_df untouched.
encoded_df = filtered_df.copy(deep=True)

In [8]:
# Merge weekend nights and week nights into a single length-of-stay feature
weekend_nights = encoded_df['stays_in_weekend_nights']
week_nights = encoded_df['stays_in_week_nights']
encoded_df['total_stays'] = weekend_nights.add(week_nights)

# The two source columns are no longer needed once the total exists
encoded_df = encoded_df.drop(columns=["stays_in_weekend_nights", "stays_in_week_nights"])

# Show the distinct stay lengths that occur in the data
print(sorted(encoded_df['total_stays'].unique()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 38, 42, 45, 46, 48, 49, 56, 57]


## Mapping & Encoding

In [9]:
# Encode the arrival month as its calendar number (January -> 1, ..., December -> 12)
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}

# Strip surrounding whitespace first so the names match the mapping keys;
# any value that is not a key of the mapping becomes NaN.
encoded_df['arrival_date_month'] = (
    encoded_df['arrival_date_month'].str.strip().map(month_mapping)
)

# Encode the hotel type as a binary flag (City Hotel -> 1, Resort Hotel -> 0)
hotel_mapping = {'City Hotel': 1, 'Resort Hotel': 0}

# Same approach: strip whitespace, then replace hotel names with their flag
encoded_df['hotel'] = encoded_df['hotel'].str.strip().map(hotel_mapping)

In [10]:
# Show the columns present after the month/hotel mapping and before one-hot encoding
encoded_df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_month', 'adults',
       'meal', 'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'adr', 'reservation_status', 'CPI_AVG',
       'INFLATION', 'INFLATION_CHG', 'CSMR_SENT', 'UNRATE', 'INTRSRT', 'GDP',
       'FUEL_PRCS', 'CPI_HOTELS', 'US_GINI', 'DIS_INC', 'total_stays'],
      dtype='object')

In [11]:
# One-hot encode the categorical columns and replace them in encoded_df by
# their indicator columns. OneHotEncoder, pandas and numpy are already imported
# in the notebook's first cell, so they are not re-imported here.

# Columns to one-hot encode. 'meal' is included: its 'meal_BB' indicator is the
# model target later on, and 'country' is included so it can act as a predictor.
columns_to_encode = ['meal', 'market_segment', 'distribution_channel', 'reservation_status', 'country']

# Dense output so the result can be wrapped in a DataFrame directly.
# Every category keeps its own indicator column: `drop='first'` is NOT set here.
ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder and build a DataFrame with readable "<column>_<category>" names
one_hot_encoded_data = ohe.fit_transform(encoded_df[columns_to_encode])
one_hot_df = pd.DataFrame(one_hot_encoded_data, columns=ohe.get_feature_names_out(columns_to_encode))

# one_hot_df has a fresh 0..n-1 index, while encoded_df still carries the index
# labels left over from the earlier row filtering; reset it so that the
# column-wise concat lines rows up by position.
encoded_df = pd.concat([encoded_df.reset_index(drop=True), one_hot_df], axis=1)

# The original categorical columns are now redundant
encoded_df = encoded_df.drop(columns=columns_to_encode)

# Summarise the indicator columns. DataFrame.info() prints its report itself
# and returns None, so it must not be wrapped in print().
one_hot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55593 entries, 0 to 55592
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   meal_BB                         55593 non-null  float64
 1   meal_FB                         55593 non-null  float64
 2   meal_HB                         55593 non-null  float64
 3   meal_SC                         55593 non-null  float64
 4   market_segment_Aviation         55593 non-null  float64
 5   market_segment_Complementary    55593 non-null  float64
 6   market_segment_Corporate        55593 non-null  float64
 7   market_segment_Direct           55593 non-null  float64
 8   market_segment_Groups           55593 non-null  float64
 9   market_segment_Offline TA/TO    55593 non-null  float64
 10  market_segment_Online TA        55593 non-null  float64
 11  distribution_channel_Corporate  55593 non-null  float64
 12  distribution_channel_Direct     

In [12]:
from sklearn.preprocessing import RobustScaler

# Numeric features rescaled with RobustScaler
features_to_robust = ['lead_time', 'arrival_date_month', 'total_stays', 'adults', 'adr', 'FUEL_PRCS']

# NOTE(review): the scaler is fitted on the complete dataset, i.e. before the
# train/validation/test split done further down, so statistics of the held-out
# rows flow into the scaling. Consider fitting on the training rows only.
robust_scaler = RobustScaler()
encoded_df[features_to_robust] = robust_scaler.fit_transform(encoded_df[features_to_robust])

encoded_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,adults,is_repeated_guest,adr,CPI_AVG,INFLATION,INFLATION_CHG,...,distribution_channel_GDS,distribution_channel_TA/TO,reservation_status_Canceled,reservation_status_Check-Out,reservation_status_No-Show,country_DEU,country_ESP,country_FRA,country_GBR,country_PRT
0,0,0,2.684685,0.0,0.0,0,-1.484375,238.034,1.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,6.243243,0.0,0.0,0,-1.484375,238.034,1.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0,-0.333333,0.0,-1.0,0,-0.3125,238.034,1.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0,-0.279279,0.0,-1.0,0,-0.3125,238.034,1.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0,-0.27027,0.0,0.0,0,0.046875,238.034,1.8,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Economic indicator columns rescaled with MinMaxScaler
features_to_minmax = ['CPI_AVG', 'INFLATION', 'INFLATION_CHG', 'CSMR_SENT', 'UNRATE', 'INTRSRT', 'GDP', 'DIS_INC', 'CPI_HOTELS']

# NOTE(review): as with the robust scaler, the fit uses the complete dataset
# before the train/validation/test split. Consider fitting on training rows only.
minmax_scaler = MinMaxScaler()
encoded_df[features_to_minmax] = minmax_scaler.fit_transform(encoded_df[features_to_minmax])

encoded_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,adults,is_repeated_guest,adr,CPI_AVG,INFLATION,INFLATION_CHG,...,distribution_channel_GDS,distribution_channel_TA/TO,reservation_status_Canceled,reservation_status_Check-Out,reservation_status_No-Show,country_DEU,country_ESP,country_FRA,country_GBR,country_PRT
0,0,0,2.684685,0.0,0.0,0,-1.484375,0.281229,0.285714,0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,6.243243,0.0,0.0,0,-1.484375,0.281229,0.285714,0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0,-0.333333,0.0,-1.0,0,-0.3125,0.281229,0.285714,0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,0,-0.279279,0.0,-1.0,0,-0.3125,0.281229,0.285714,0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0,-0.27027,0.0,0.0,0,0.046875,0.281229,0.285714,0.666667,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Model Selection & Training

In [14]:
# The target is the one-hot indicator of the 'BB' meal category in encoded_df.
target_variable = 'meal_BB'

# The other meal indicators come from the same one-hot encoded 'meal' column as
# the target, so they are left out of the predictors together with the target.
other_meal_columns = ["meal_FB", "meal_HB", "meal_SC"]
predictors = encoded_df.columns.drop([target_variable, *other_meal_columns])

# Feature matrix: every remaining column
X = encoded_df[predictors]

# Integer-encode the target; the fitted encoder is kept so that predictions can
# be mapped back to the original values later.
label_encoder = LabelEncoder()
label_encoder.fit(encoded_df[target_variable])
y = label_encoder.transform(encoded_df[target_variable])



In [15]:
# Both splits use the same settings: 20% held out, fixed seed for reproducibility
split_options = dict(test_size=0.2, random_state=42)

# First split: hold out the test set
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, **split_options)

# Second split: carve a validation set out of the remaining training data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, **split_options)


In [16]:
# Train a stacking classifier whose base learners are a random forest and a
# gradient-boosting classifier (both seeded for reproducibility).
random_forest = RandomForestClassifier(random_state=42)
gradient_boost = GradientBoostingClassifier(random_state=42)
estimators = [("RandomForrest", random_forest), ("GradientBoost", gradient_boost)]

# No final_estimator is passed, so StackingClassifier uses its default
model = StackingClassifier(estimators=estimators)

model.fit(X_train, y_train)


In [17]:
# 5-fold cross-validated accuracy on the training split
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Predictions (labels and class probabilities) of the fitted model on the validation split
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)

# Class-weighted precision, recall and F1 on the validation split
precision, rec, f1 = (
    metric(y_val, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
print(precision, rec, f1, cv_scores.mean())


0.8352191215587338 0.8458684654300168 0.8346611758448343 0.839652643059478


In [18]:
# Persist the fitted model through the project's registry helper.
# NOTE(review): save_model is project-local (resolved via the sys.path entry
# added in the first cell); storage location and format are defined there.
from demand_predictor.ml_logic.registry import save_model
save_model(model)

✅ Model saved locally at /Users/mila/code/sarahfink123/demand_prediction/demand_predictor/../training_outputs/models/20240603-152546.pkl


In [19]:

# # Make predictions on the test set (disabled).
# # Uses a separate name so the validation predictions in y_pred stay intact.
# y_test_pred = model.predict(X_test)
# precision = precision_score(y_test, y_test_pred, average='weighted')
# rec = recall_score(y_test, y_test_pred, average='weighted')
# f1 = f1_score(y_test, y_test_pred, average='weighted')
# print(precision, rec, f1, cv_scores.mean())


In [20]:
# Map each encoded class label back to the original target value
encoded_labels = label_encoder.transform(label_encoder.classes_)
meal_mapping = {code: original for code, original in zip(encoded_labels, label_encoder.classes_)}

# Inspect one validation sample: the one at position 1
sample_position = 1
pred = y_pred[sample_position]
pred_name = meal_mapping[pred]

# Column 1 of predict_proba, regardless of which class was predicted
pred_proba = y_pred_proba[sample_position, 1]

print("prediciton of ordering the meal", pred_name)
print("prediciton of ordering the meal probability", pred_proba)

prediciton of ordering the meal 1.0
prediciton of ordering the meal probability 0.9602035993782491


## Calculate the mean/median/mode for every column 

In [21]:
# Variables of interest
# target_variable = 'meal_BB'
# predictors = 'lead_time', 'total_stays', 'adults', 'hotel', 'arrival_date_month' (take in all columns, but drop meal_HB, meal_SC, meal_FB)

In [22]:
# means
# Mean of every column except the target. The scaled columns are averaged on
# their scaled values, because encoded_df was rescaled earlier in the notebook.
# Intended use (author's note): take these values for the variables that are
# not part of the user's input, and fill the variables of interest from the
# user's input.
non_target_columns = encoded_df.drop(columns=["meal_BB"])
column_means = non_target_columns.mean()
column_means

hotel                              0.568039
is_canceled                        0.286691
lead_time                          0.297724
arrival_date_month                -0.097599
adults                            -0.129819
is_repeated_guest                  0.049970
adr                                0.145467
CPI_AVG                            0.530872
INFLATION                          0.638374
INFLATION_CHG                      0.625726
CSMR_SENT                          0.522373
UNRATE                             0.362113
INTRSRT                            0.390796
GDP                                0.530121
FUEL_PRCS                         -0.176606
CPI_HOTELS                         0.534486
US_GINI                           41.150366
DIS_INC                            0.494910
total_stays                        0.202376
meal_FB                            0.005576
meal_HB                            0.121886
meal_SC                            0.091846
market_segment_Aviation         

In [23]:
# medians
# Median of every column except the target (computed on the scaled values).
# Intended use (author's note): take these values for the variables that are
# not part of the user's input, and fill the variables of interest from the
# user's input.
non_target_columns = encoded_df.drop(columns=["meal_BB"])
column_medians = non_target_columns.median()
column_medians

hotel                              1.000000
is_canceled                        0.000000
lead_time                          0.000000
arrival_date_month                 0.000000
adults                             0.000000
is_repeated_guest                  0.000000
adr                                0.000000
CPI_AVG                            0.496064
INFLATION                          0.714286
INFLATION_CHG                      0.666667
CSMR_SENT                          0.522124
UNRATE                             0.428571
INTRSRT                            0.250000
GDP                                0.499259
FUEL_PRCS                          0.000000
CPI_HOTELS                         0.583069
US_GINI                           41.200000
DIS_INC                            0.404358
total_stays                        0.000000
meal_FB                            0.000000
meal_HB                            0.000000
meal_SC                            0.000000
market_segment_Aviation         

In [24]:
# modes
# Most frequent value of every column except the target. mode() returns a
# DataFrame, with additional rows if a column has several equally frequent values.
# Intended use (author's note): take these values for the variables that are
# not part of the user's input, and fill the variables of interest from the
# user's input.
non_target_columns = encoded_df.drop(columns=["meal_BB"])
column_modes = non_target_columns.mode()
column_modes

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,adults,is_repeated_guest,adr,CPI_AVG,INFLATION,INFLATION_CHG,...,distribution_channel_GDS,distribution_channel_TA/TO,reservation_status_Canceled,reservation_status_Check-Out,reservation_status_No-Show,country_DEU,country_ESP,country_FRA,country_GBR,country_PRT
0,1,0,-0.396396,0.2,0.0,0,-1.484375,0.285164,0.857143,0.666667,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
