In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df2=pd.read_csv('hotel_bookings.csv')

Step 1: Checking for missing values and establishing which columns may need dropping based on NaN count


In [3]:
nan_counts = df2.isna().sum()

# Sort columns by the number of NaN values in descending order
nan_counts_sorted = nan_counts.sort_values(ascending=False)
df2.drop(columns=['company','agent'],inplace=True)
df2.dropna(inplace=True)

In [4]:
df2.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
days_in_waiting_list              

I drop irrelevent categorical and numerical variables

In [5]:
cat_columns = df2.select_dtypes(include=['object']).columns
cat_columns=cat_columns.drop(['hotel','country','meal','reservation_status_date','arrival_date_month','market_segment','distribution_channel'])
df2.drop(columns=['lead_time','arrival_date_year','arrival_date_week_number','arrival_date_month',
                 'stays_in_weekend_nights','stays_in_week_nights','adr'],inplace=True)

One-hot encode categorical variables

In [6]:
X = pd.get_dummies(df2,columns=cat_columns,drop_first=True)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X.drop(columns=['is_canceled'],inplace=True)
y=df2.is_canceled

Splitting data into train and test sets

In [10]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Scaling numerical variables to use in logistic regression

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include='number'))
X_test_scaled = scaler.fit_transform(X_test.select_dtypes(include='number'))
X_train_scaled = pd.DataFrame(X_train_scaled,columns=X_train.select_dtypes(include='number').columns,index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled,columns=X_test.select_dtypes(include='number').columns,index=X_test.index)

Grouping together one-hot encoded variables 

In [25]:
bool_cols=X_train.select_dtypes(include='bool').columns

In [26]:
final_train_df = pd.concat([X_train_scaled,X_train[bool_cols]],axis=1)
final_test_df = pd.concat([X_test_scaled,X_test[bool_cols]],axis=1)

In [27]:
X_train[bool_cols]

Unnamed: 0,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,assigned_room_type_B,...,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_Check-Out,reservation_status_No-Show
2499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
36906,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
16363,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
25818,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
33552,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77310,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
110760,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
104186,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
861,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [28]:
final_train_df.dtypes

arrival_date_day_of_month         float64
adults                            float64
children                          float64
babies                            float64
is_repeated_guest                 float64
previous_cancellations            float64
previous_bookings_not_canceled    float64
booking_changes                   float64
days_in_waiting_list              float64
required_car_parking_spaces       float64
total_of_special_requests         float64
reserved_room_type_B                 bool
reserved_room_type_C                 bool
reserved_room_type_D                 bool
reserved_room_type_E                 bool
reserved_room_type_F                 bool
reserved_room_type_G                 bool
reserved_room_type_H                 bool
reserved_room_type_L                 bool
reserved_room_type_P                 bool
assigned_room_type_B                 bool
assigned_room_type_C                 bool
assigned_room_type_D                 bool
assigned_room_type_E              

In [29]:
from sklearn.linear_model import LogisticRegression
# Initialize the multinomial logistic regression model
model = LogisticRegression(max_iter=100)

# Train the model
model.fit(final_train_df, y_train)

# Make predictions
predictions = model.predict(final_test_df)

print("Predictions:", predictions)

Predictions: [1 1 0 ... 0 1 0]


In [30]:
from sklearn.metrics import accuracy_score, classification_report

Final accuracy results

In [31]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 1.0
