In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab\ Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [4]:
data = pd.read_csv('/content/drive/MyDrive/FDM Data Set - Telecommunication/expanded_hotel_bookings.csv')

In [5]:
# Descriptive statistic on dataset on numerical columns
data[['no_of_adults', 'no_of_children', 'no_of_weekend_nights',
    'no_of_week_nights', 'required_car_parking_space', 'lead_time',
    'arrival_year', 'arrival_month', 'arrival_date',
    'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
    'avg_price_per_room', 'no_of_special_requests']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_adults,50000.0,1.99984,0.816023,1.0,1.0,2.0,3.0,3.0
no_of_children,50000.0,1.00296,0.816287,0.0,0.0,1.0,2.0,2.0
no_of_weekend_nights,45434.0,1.002333,0.815783,0.0,0.0,1.0,2.0,2.0
no_of_week_nights,50000.0,2.49324,1.70679,0.0,1.0,2.0,4.0,5.0
required_car_parking_space,50000.0,0.49896,0.500004,0.0,0.0,0.0,1.0,1.0
lead_time,45466.0,182.703163,104.720953,0.0,93.0,183.0,273.0,364.0
arrival_year,50000.0,2018.50112,0.500004,2018.0,2018.0,2019.0,2019.0,2019.0
arrival_month,50000.0,6.5182,3.453553,1.0,4.0,7.0,10.0,12.0
arrival_date,50000.0,14.55616,8.07795,1.0,8.0,15.0,22.0,28.0
repeated_guest,50000.0,0.5011,0.500004,0.0,0.0,1.0,1.0,1.0


In [6]:
# prompt: check for duplicate values in this datset

# Check for duplicate rows
duplicate_rows = data[data.duplicated()]
print("Number of duplicate rows:", len(duplicate_rows))
if not duplicate_rows.empty:
  print("Duplicate rows:\n", duplicate_rows)


Number of duplicate rows: 0


In [7]:
# prompt: check for null values in this data set and print the number of null values in each column

# Check for null values
null_counts = data.isnull().sum()
print("Null values in each column:\n", null_counts)

Null values in each column:
 Booking_ID                                 0
no_of_adults                               0
no_of_children                             0
no_of_weekend_nights                    4566
no_of_week_nights                          0
lead_time                               4534
arrival_year                               0
arrival_month                              0
arrival_date                               0
no_of_previous_cancellations               0
no_of_previous_bookings_not_canceled       0
avg_price_per_room                      4625
no_of_special_requests                     0
required_car_parking_space                 0
repeated_guest                             0
room_type_reserved                         0
type_of_meal_plan                          0
market_segment_type                        0
booking_status                             0
dtype: int64


In [8]:
data = data.drop('Booking_ID', axis=1)

In [9]:
# Dictionaries to store encoding and decoding maps
encoding_maps = {}
decoding_maps = {}

In [10]:
# Encode categorical variables
label_encoders = {}
for column in ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

    # Create encoding map
    encoding_map = {original: encoded for original, encoded in zip(label_encoders[column].classes_, label_encoders[column].transform(label_encoders[column].classes_))}
    encoding_maps[column] = encoding_map

    # Create decoding map
    decoding_map = {encoded: original for original, encoded in encoding_map.items()}
    decoding_maps[column] = decoding_map

In [11]:
# Print the encoding and decoding maps
print("Encoding Maps:")
encoding_maps
print("\nDecoding Maps:")
decoding_maps

Encoding Maps:

Decoding Maps:


{'type_of_meal_plan': {0: 'Meal Plan 1', 1: 'Meal Plan 2', 2: 'Not Selected'},
 'room_type_reserved': {0: 'Room_Type_1', 1: 'Room_Type_2', 2: 'Room_Type_3'},
 'market_segment_type': {0: 'Corporate', 1: 'Offline', 2: 'Online'},
 'booking_status': {0: 'Canceled', 1: 'Not_Canceled'}}

In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Check for null values and print the count in each column
null_counts = data.isnull().sum()
print("Null values in each column:\n", null_counts)

# Use IterativeImputer for handling missing values
imputer = IterativeImputer()
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# After imputation, check again for null values (if any)
null_counts_after = data_imputed.isnull().sum()
print("Null values after imputation:\n", null_counts_after)


Null values in each column:
 no_of_adults                               0
no_of_children                             0
no_of_weekend_nights                    4566
no_of_week_nights                          0
lead_time                               4534
arrival_year                               0
arrival_month                              0
arrival_date                               0
no_of_previous_cancellations               0
no_of_previous_bookings_not_canceled       0
avg_price_per_room                      4625
no_of_special_requests                     0
required_car_parking_space                 0
repeated_guest                             0
room_type_reserved                         0
type_of_meal_plan                          0
market_segment_type                        0
booking_status                             0
dtype: int64
Null values after imputation:
 no_of_adults                            0
no_of_children                          0
no_of_weekend_nights             

In [13]:
# Splitting the data into features (X) and target (y)
X = data_imputed.drop('booking_status', axis=1)
y = data_imputed['booking_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (40000, 17)
Test set shape: (10000, 17)


In [14]:
# Train Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Evaluate Logistic Regression model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))

# Train Random Forest Classifier model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Evaluate Random Forest Classifier model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.34      0.40      4975
         1.0       0.50      0.66      0.57      5025

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.50      0.49     10000

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.51      0.51      4975
         1.0       0.50      0.49      0.50      5025

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000

