In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/FDM Data Set - Telecommunication/Hotel Reservations.csv')

In [None]:
# Show data
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

In [None]:
# Descriptive statistic on dataset on numerical columns
data[['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
    'no_of_week_nights', 'required_car_parking_space', 'lead_time',
    'arrival_year', 'arrival_month', 'arrival_date',
    'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
    'avg_price_per_room', 'no_of_special_requests']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_adults,36275.0,1.844962,0.518715,0.0,2.0,2.0,2.0,4.0
no_of_children,36275.0,0.105279,0.402648,0.0,0.0,0.0,0.0,10.0
no_of_weekend_nights,36275.0,0.810724,0.870644,0.0,0.0,1.0,2.0,7.0
no_of_week_nights,36275.0,2.2043,1.410905,0.0,1.0,2.0,3.0,17.0
required_car_parking_space,36275.0,0.030986,0.173281,0.0,0.0,0.0,0.0,1.0
lead_time,36275.0,85.232557,85.930817,0.0,17.0,57.0,126.0,443.0
arrival_year,36275.0,2017.820427,0.383836,2017.0,2018.0,2018.0,2018.0,2018.0
arrival_month,36275.0,7.423653,3.069894,1.0,5.0,8.0,10.0,12.0
arrival_date,36275.0,15.596995,8.740447,1.0,8.0,16.0,23.0,31.0
repeated_guest,36275.0,0.025637,0.158053,0.0,0.0,0.0,0.0,1.0


In [None]:
# Dictionaries to store encoding and decoding maps
encoding_maps = {}
decoding_maps = {}

In [None]:
# Encode categorical variables
label_encoders = {}
for column in ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

    # Create encoding map
    encoding_map = {original: encoded for original, encoded in zip(label_encoders[column].classes_, label_encoders[column].transform(label_encoders[column].classes_))}
    encoding_maps[column] = encoding_map

    # Create decoding map
    decoding_map = {encoded: original for original, encoded in encoding_map.items()}
    decoding_maps[column] = decoding_map

In [None]:
# Print the encoding and decoding maps
print("Encoding Maps:")
encoding_maps
print("\nDecoding Maps:")
decoding_maps

Encoding Maps:

Decoding Maps:


{'type_of_meal_plan': {0: 'Meal Plan 1',
  1: 'Meal Plan 2',
  2: 'Meal Plan 3',
  3: 'Not Selected'},
 'room_type_reserved': {0: 'Room_Type 1',
  1: 'Room_Type 2',
  2: 'Room_Type 3',
  3: 'Room_Type 4',
  4: 'Room_Type 5',
  5: 'Room_Type 6',
  6: 'Room_Type 7'},
 'market_segment_type': {0: 'Aviation',
  1: 'Complementary',
  2: 'Corporate',
  3: 'Offline',
  4: 'Online'},
 'booking_status': {0: 'Canceled', 1: 'Not_Canceled'}}

In [None]:
# Define features and target
X = data.drop(columns=['booking_status', 'Booking_ID'])
y = data['booking_status']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Show feature train
X_train.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
25629,2,1,2,1,0,0,0,26,2017,10,17,4,0,0,0,161.0,0
14473,2,1,1,1,0,0,0,98,2018,7,16,4,0,0,0,121.5,2
23720,2,0,0,3,0,0,0,433,2018,9,8,3,0,0,0,70.0,0
5843,2,0,2,5,0,0,0,195,2018,8,8,3,0,0,0,72.25,0
18709,1,0,0,2,0,0,0,188,2018,6,15,3,0,0,0,130.0,0


In [None]:
# Show target train
y_train.head()

Unnamed: 0,booking_status
25629,1
14473,1
23720,0
5843,1
18709,0


In [None]:
# Show feature test
X_test.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
4968,2,1,1,0,0,0,0,3,2017,8,23,4,0,0,0,90.0,3
34540,2,0,1,2,0,0,3,9,2018,2,12,3,0,0,0,48.67,0
36108,2,0,2,2,0,0,0,24,2018,12,25,4,0,0,0,95.2,1
1553,2,0,0,3,0,0,0,23,2018,6,21,4,0,0,0,127.67,0
24974,2,1,0,2,0,0,3,9,2018,9,8,4,0,0,0,201.5,2


In [None]:
# Show target test
y_test.head()

Unnamed: 0,booking_status
4968,1
34540,1
36108,1
1553,0
24974,1


# **LOGISTIC REGRESSION ALGORITHM**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Hyperparameters for Logistic Regression
param_grid_logreg = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

# Create GridSearchCV for Logistic Regression
grid_search_logreg = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000),
                                  param_grid=param_grid_logreg,
                                  cv=5,
                                  n_jobs=-1,
                                  verbose=2)

# Fit the model
grid_search_logreg.fit(X_train, y_train)
best_logreg = grid_search_logreg.best_estimator_

# Make predictions and evaluate
y_pred_logreg = best_logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_logreg))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Logistic Regression Accuracy: 0.8022053756030324
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.75      0.61      0.67      2416
           1       0.82      0.90      0.86      4839

    accuracy                           0.80      7255
   macro avg       0.79      0.75      0.77      7255
weighted avg       0.80      0.80      0.80      7255



# **DECISION TREE ALGORITHM**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters for Decision Trees
param_grid_tree = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'criterion': ['gini', 'entropy']
}

# Create GridSearchCV for Decision Tree
grid_search_tree = GridSearchCV(DecisionTreeClassifier(random_state=42),
                                param_grid=param_grid_tree,
                                cv=5,
                                n_jobs=-1,
                                verbose=2)

# Fit the model
grid_search_tree.fit(X_train, y_train)
best_tree = grid_search_tree.best_estimator_

# Make predictions and evaluate
y_pred_tree = best_tree.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f"Decision Tree Accuracy: {accuracy_tree}")
print("Classification Report (Decision Tree):")
print(classification_report(y_test, y_pred_tree))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Decision Tree Accuracy: 0.8682288077188146
Classification Report (Decision Tree):
              precision    recall  f1-score   support

           0       0.83      0.76      0.79      2416
           1       0.89      0.92      0.90      4839

    accuracy                           0.87      7255
   macro avg       0.86      0.84      0.85      7255
weighted avg       0.87      0.87      0.87      7255



# **k-NEAREST NEIGHBORS (k-NN) ALGORITHM**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameters for k-NN
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Create GridSearchCV for k-NN
grid_search_knn = GridSearchCV(KNeighborsClassifier(),
                               param_grid=param_grid_knn,
                               cv=5,
                               n_jobs=-1,
                               verbose=2)

# Fit the model
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_

# Make predictions and evaluate
y_pred_knn = best_knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"k-NN Accuracy: {accuracy_knn}")
print("Classification Report (k-NN):")
print(classification_report(y_test, y_pred_knn))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
k-NN Accuracy: 0.8483804272915231
Classification Report (k-NN):
              precision    recall  f1-score   support

           0       0.81      0.71      0.76      2416
           1       0.86      0.92      0.89      4839

    accuracy                           0.85      7255
   macro avg       0.84      0.81      0.82      7255
weighted avg       0.85      0.85      0.85      7255



# **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Hyperparameters for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Create GridSearchCV for SVM
grid_search_svm = GridSearchCV(SVC(random_state=42),
                               param_grid=param_grid_svm,
                               cv=5,
                               n_jobs=-1,
                               verbose=2)

# Fit the SVM model
grid_search_svm.fit(X_train, y_train)
best_svm = grid_search_svm.best_estimator_

# Make predictions and evaluate
y_pred_svm = best_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")
print("Classification Report (SVM):")
print(classification_report(y_test, y_pred_svm))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
SVM Accuracy: 0.8237077877325982
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.87      0.55      0.68      2416
           1       0.81      0.96      0.88      4839

    accuracy                           0.82      7255
   macro avg       0.84      0.76      0.78      7255
weighted avg       0.83      0.82      0.81      7255

