In [1]:
import pandas as pd
# Load the raw bookings table, then inspect its schema: dtypes and non-null
# counts first, followed by the number of distinct values per column.
bookings_csv = '../hotel_booking_data/hotel_bookings.csv'
df = pd.read_csv(bookings_csv)
df.info()
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

hotel                                2
is_canceled                          2
lead_time                          479
arrival_date_year                    3
arrival_date_month                  12
arrival_date_week_number            53
arrival_date_day_of_month           31
stays_in_weekend_nights             17
stays_in_week_nights                35
adults                              14
children                             5
babies                               5
meal                                 5
country                            177
market_segment                       8
distribution_channel                 5
is_repeated_guest                    2
previous_cancellations              15
previous_bookings_not_canceled      73
reserved_room_type                  10
assigned_room_type                  12
booking_changes                     21
deposit_type                         3
agent                              333
company                            352
days_in_waiting_list     

Split dataset into train, cv, test

In [2]:
from sklearn.model_selection import train_test_split
# Separate the cancellation label from the predictor columns.
y = df['is_canceled']
X = df.drop(columns='is_canceled')

# First cut: 70% train / 30% held out, stratified on the label so both parts
# keep the same share of cancellations.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: halve the held-out part into validation (CV) and test,
# i.e. 15% of the full dataset each.
X_cv, X_test, y_cv, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

Split columns based on type

In [3]:
# Partition the training columns by dtype: int64/float64 columns are treated
# as numeric features, object columns as categorical features.
numeric_dtypes = ['int64', 'float64']
num_features = list(X_train.select_dtypes(include=numeric_dtypes).columns)
cat_features = list(X_train.select_dtypes(include=['object']).columns)

Common feature engineering

In [4]:
# Columns removed from every split, grouped by the reason for removal.
leakage_cols = ['reservation_status_date', 'reservation_status']  # contribute to data leakage
mostly_null_cols = ['company']                                    # large amount of NULL values
mostly_zero_cols = ['babies']                                     # many zero values

dropped_cols = leakage_cols + mostly_null_cols + mostly_zero_cols
X_train, X_cv, X_test = (
    split.drop(columns=dropped_cols) for split in (X_train, X_cv, X_test)
)

# Keep the feature-name lists in sync with the frames: the leakage columns
# were listed as categorical, the other two as numeric.
for col in mostly_null_cols:
    num_features.remove(col)
for col in leakage_cols:
    cat_features.remove(col)
for col in mostly_zero_cols:
    num_features.remove(col)

In [5]:
# Filling missing values
# The fill values (median for numeric columns, most frequent value for
# categorical columns) are learned from the TRAINING split only and then
# applied to all three splits. Computing them separately on the validation and
# test splits would let held-out data shape its own preprocessing and could
# fill the same column with different values in different splits.
num_fill_values = {col: X_train[col].median() for col in num_features}
cat_fill_values = {col: X_train[col].mode()[0] for col in cat_features}

for df_split in [X_train, X_cv, X_test]:
    for col, fill_value in num_fill_values.items():
        df_split[col] = df_split[col].fillna(fill_value)
    for col, fill_value in cat_fill_values.items():
        df_split[col] = df_split[col].fillna(fill_value)

In [6]:
# Adding/combining features
season_map = {
    'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
    'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
    'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
    'September': 'Fall', 'October': 'Fall', 'November': 'Fall',
}


def _add_booking_features(split):
    """Return a copy of ``split`` with the derived columns added and their raw sources removed.

    Added columns:
      * prev_cancellation_ratio - previous cancellations divided by all previous
        bookings; the 1e-5 term keeps the denominator non-zero.
      * has_waiting_list        - 1 when days_in_waiting_list > 0, else 0.
      * room_honored            - 1 when the assigned room type equals the reserved one.
      * arrival_season          - season looked up from arrival_date_month.

    Removed columns: previous_cancellations, previous_bookings_not_canceled,
    days_in_waiting_list.
    """
    prior_cancels = split['previous_cancellations']
    prior_total = prior_cancels + split['previous_bookings_not_canceled'] + 1e-5
    enriched = split.assign(
        prev_cancellation_ratio=prior_cancels / prior_total,
        has_waiting_list=(split['days_in_waiting_list'] > 0).astype(int),
        room_honored=(split['reserved_room_type'] == split['assigned_room_type']).astype(int),
        arrival_season=split['arrival_date_month'].map(season_map),
    )
    return enriched.drop(
        columns=['previous_cancellations', 'previous_bookings_not_canceled', 'days_in_waiting_list']
    )


X_train, X_cv, X_test = (
    _add_booking_features(split) for split in (X_train, X_cv, X_test)
)

# Bring the feature-name lists in line with the new columns.
for col in ['previous_cancellations', 'previous_bookings_not_canceled']:
    if col in num_features:
        num_features.remove(col)
num_features.remove('days_in_waiting_list')
num_features.extend(['prev_cancellation_ratio', 'has_waiting_list', 'room_honored'])
cat_features.append('arrival_season')

In [7]:
# Mapping country and agent columns to numeric
# Each value is replaced by its relative frequency in the TRAINING split.
# A value that occurs only in the validation/test split has no entry in the
# training map; it is encoded as 0.0 ("never seen in training") rather than
# left as NaN, which matches how FrequencyEncoder.transform handles unseen
# categories and keeps NaN out of anything derived from these columns.
country_freq_map = X_train['country'].value_counts(normalize=True).to_dict()
agent_freq_map = X_train['agent'].value_counts(normalize=True).to_dict()

for df_split in [X_train, X_cv, X_test]:
    df_split['country'] = df_split['country'].map(country_freq_map).fillna(0.0)
    df_split['agent'] = df_split['agent'].map(agent_freq_map).fillna(0.0)


In [8]:
# Combining room type and market-channels into single string columns
# new column -> (left source, right source); the sources are joined with "_".
pair_definitions = {
    'room_pair': ('reserved_room_type', 'assigned_room_type'),
    'market_pair': ('market_segment', 'distribution_channel'),
}
cols_to_drop = [source for pair in pair_definitions.values() for source in pair]


def _with_pair_columns(split):
    """Return a copy of ``split`` with the pair columns added and their source columns removed."""
    paired = split.assign(**{
        pair_name: split[left] + "_" + split[right]
        for pair_name, (left, right) in pair_definitions.items()
    })
    return paired.drop(columns=cols_to_drop)


X_train, X_cv, X_test = (
    _with_pair_columns(split) for split in (X_train, X_cv, X_test)
)

# The pair columns replace their four source columns in the categorical list.
cat_features.extend(pair_definitions.keys())
for col in cols_to_drop:
    cat_features.remove(col)

In [9]:
# Additional features
from sklearn.model_selection import KFold
import numpy as np

for df_split in [X_train, X_cv, X_test]:
    df_split['engagement_score'] = df_split['booking_changes'] * df_split['total_of_special_requests']
num_features.append('engagement_score')


# Target encoding for risk segments
def _risk_segment_key(split):
    """Join country, deposit type and customer type (as strings) into one segment label.

    Note: ``country`` was replaced by its training-split frequency in an earlier
    cell, so the label embeds that number as text rather than a country code.
    """
    country, deposit, customer = (
        split[col].astype(str) for col in ('country', 'deposit_type', 'customer_type')
    )
    return country + "_" + deposit + "_" + customer


for df_split in [X_train, X_cv, X_test]:
    df_split['risk_segment_key'] = _risk_segment_key(df_split)

# Training rows with the label attached, used for all group means below.
train_for_encoding = X_train.assign(is_canceled=y_train.values)
overall_mean = y_train.mean()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold encoding for the training split: every row receives the
# cancellation rate of its segment computed on the other four folds, so a
# row's own label never feeds its own feature. Segments missing from the
# fitting folds fall back to the overall cancellation rate.
oof_rates = np.full(len(X_train), np.nan)
for fit_rows, holdout_rows in kf.split(train_for_encoding):
    fold_rates = (
        train_for_encoding.iloc[fit_rows]
        .groupby('risk_segment_key')['is_canceled']
        .mean()
    )
    holdout_keys = train_for_encoding.iloc[holdout_rows]['risk_segment_key']
    oof_rates[holdout_rows] = holdout_keys.map(fold_rates).fillna(overall_mean).values
X_train['risk_segment_cancel_rate'] = oof_rates

# Validation and test rows are encoded with rates from the full training split.
risk_segment_means = train_for_encoding.groupby('risk_segment_key')['is_canceled'].mean()
for df_split in [X_cv, X_test]:
    df_split['risk_segment_cancel_rate'] = (
        df_split['risk_segment_key'].map(risk_segment_means).fillna(overall_mean)
    )

# The helper key is no longer needed once the rate column exists.
X_train, X_cv, X_test = (
    split.drop(columns='risk_segment_key') for split in (X_train, X_cv, X_test)
)
num_features.append('risk_segment_cancel_rate')

Building pipeline for tree models

In [None]:
# Column groups consumed by the tree-model preprocessor.
low_card_cols = [
    'hotel',
    'meal',
    'deposit_type',
    'customer_type',
    'arrival_season',
]
high_card_cols = [
    'country',
    'room_pair',
    'market_pair',
    'arrival_date_month',
]

# Train and validation data stacked together (features and labels).
X_traincv = pd.concat([X_train, X_cv])
y_traincv = pd.concat([y_train, y_cv])

# Explicit numeric feature list; this assignment replaces whatever
# `num_features` held from the earlier cells.
num_features = [
    'lead_time',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'is_repeated_guest',
    'booking_changes',
    'agent',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'prev_cancellation_ratio',
    'has_waiting_list',
    'room_honored',
    'engagement_score',
    'risk_segment_cancel_rate',
]

['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'is_repeated_guest', 'booking_changes', 'agent', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'prev_cancellation_ratio', 'has_waiting_list', 'room_honored', 'engagement_score', 'risk_segment_cancel_rate']


In [None]:
# Frequency encoder for high cardinality categorical columns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each value with its relative frequency observed during ``fit``.

    Parameters
    ----------
    columns : list of str, optional
        Column names to attach when the input is a plain array (for example the
        output of an earlier pipeline step). Required for array input. When a
        DataFrame is passed, the frame's own column names are used instead.

    Attributes
    ----------
    freq_maps : dict
        ``{column: {value: relative frequency}}`` learned by the most recent ``fit``.
    columns_ : list
        Column names seen by the most recent ``fit``.
    """

    def __init__(self, columns=None):
        self.freq_maps = {}
        self.columns = columns

    def fit(self, X, y=None):
        """Learn the per-column value frequencies of ``X``.

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute and ``columns`` was not given.
        """
        # Convert to DataFrame if needed
        if not hasattr(X, 'columns'):
            if self.columns is None:
                raise ValueError("Column names must be provided for array input")
            X = pd.DataFrame(X, columns=self.columns)

        # The constructor parameter `columns` is left untouched so get_params()
        # and clone() keep describing the estimator as configured; the names
        # actually used are recorded as fitted state instead.
        self.columns_ = list(X.columns)

        # Rebuilt from scratch on every fit so a refit on different columns
        # cannot leave stale maps behind.
        self.freq_maps = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in self.columns_
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column mapped to its frequency.

        Values not seen during ``fit`` are encoded as 0. Columns without a
        fitted map are passed through unchanged.
        """
        if not hasattr(X, 'columns'):
            # Prefer the names recorded by fit; fall back to the constructor
            # parameter for instances that predate `columns_`.
            X = pd.DataFrame(X, columns=getattr(self, 'columns_', self.columns))
        X = X.copy()
        for col, freq_map in self.freq_maps.items():
            if col in X:
                X[col] = X[col].map(freq_map).fillna(0)
        return X

In [None]:
# Designing random forest pipeline
from sklearn.impute import SimpleImputer

# Numeric columns: median imputation only.
numeric_step = SimpleImputer(strategy='median')

# Low-cardinality categoricals: most-frequent imputation, then ordinal codes;
# categories unseen at fit time are encoded as -1.
low_card_step = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# High-cardinality categoricals: most-frequent imputation, then frequency
# encoding. The column names are passed explicitly so the encoder can label
# array input coming from the imputer.
high_card_step = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', FrequencyEncoder(columns=high_card_cols)),
])

preprocessor_rf = ColumnTransformer(transformers=[
    ('num', numeric_step, num_features),
    ('low_cat', low_card_step, low_card_cols),
    ('high_cat', high_card_step, high_card_cols),
])

rf_params = dict(
    n_estimators=250,
    max_depth=20,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced',  # <-- focuses more on minority class
    random_state=42,
)

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor_rf),
    ('classifier', RandomForestClassifier(**rf_params)),
])




In [14]:
# Fit the preprocessing steps and the random forest on the training split only
# (the combined X_traincv / y_traincv built earlier is not used here), so the
# validation and test accuracies reported below are measured on rows the
# estimator was not fitted on. The call is the cell's last expression, so the
# fitted pipeline's repr is displayed as output.
pipeline_rf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('low_cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,columns,"['country', 'room_pair', ...]"

0,1,2
,n_estimators,250
,criterion,'gini'
,max_depth,20
,min_samples_split,4
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# Evaluating random forest
# Predict on both splits first, then score and report them in the same order
# (training, then validation); the gap between the two hints at overfitting.
y_train_pred_rf, y_cv_pred_rf = (
    pipeline_rf.predict(split) for split in (X_train, X_cv)
)

train_acc = accuracy_score(y_train, y_train_pred_rf)
cv_acc = accuracy_score(y_cv, y_cv_pred_rf)

print("Training Accuracy:", train_acc)
print("Validation Accuracy:", cv_acc)



Training Accuracy: 0.927344955906812




Validation Accuracy: 0.8861402725039089


In [16]:
# Testing the final model
# Score the held-out test split once: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
y_pred_rf = pipeline_rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_rf)
test_report = classification_report(y_test, y_pred_rf)
test_confusion = confusion_matrix(y_test, y_pred_rf)

print("Final Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)



Final Test Accuracy: 0.8876542520520408

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91     11275
           1       0.84      0.86      0.85      6634

    accuracy                           0.89     17909
   macro avg       0.88      0.88      0.88     17909
weighted avg       0.89      0.89      0.89     17909


Confusion Matrix:
 [[10173  1102]
 [  910  5724]]


In [17]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) next to the notebook.
# NOTE(review): the pipeline contains the notebook-defined FrequencyEncoder;
# presumably whatever loads "model.pkl" must be able to import or define that
# class before unpickling - confirm against the loading code.
joblib.dump(value=pipeline_rf, filename="model.pkl")

['model.pkl']

### Model Selection
After testing Logistic Regression, Random Forest, and XGBoost, **Random Forest** achieved the best performance (89.42% accuracy). 
Training on only the top 15 features yields lower accuracy (85.19%).

### Overfitting Observation

During hyperparameter tuning, the Random Forest model achieved a **very high training accuracy (~0.997)** but only a **moderate increase in test accuracy (~0.899)**.  
This large gap between training and test accuracy indicates **overfitting** — the model memorized the training data instead of learning patterns that generalize well.  

To mitigate overfitting, the Random Forest classifier without hyperparameter tuning was chosen, as it provides a better balance:
Training Accuracy: 0.927, Test Accuracy: 0.887  