In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# Depends on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the "Colab Notebooks" folder on the mounted Drive the working directory.
%cd /content/drive/MyDrive/Colab\ Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Read the bookings CSV from the mounted Drive into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/FDM Data Set - Telecommunication/expanded_hotel_bookings.csv'
)

In [None]:
# Summary statistics for the selected columns, transposed so that each
# column of the dataset becomes one row of the displayed table.
data.loc[:, [
    'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
    'no_of_week_nights', 'required_car_parking_space', 'lead_time',
    'arrival_year', 'arrival_month', 'arrival_date',
    'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
    'avg_price_per_room', 'no_of_special_requests',
]].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_adults,50000.0,1.99984,0.816023,1.0,1.0,2.0,3.0,3.0
no_of_children,50000.0,1.00296,0.816287,0.0,0.0,1.0,2.0,2.0
no_of_weekend_nights,45434.0,1.002333,0.815783,0.0,0.0,1.0,2.0,2.0
no_of_week_nights,50000.0,2.49324,1.70679,0.0,1.0,2.0,4.0,5.0
required_car_parking_space,50000.0,0.49896,0.500004,0.0,0.0,0.0,1.0,1.0
lead_time,45466.0,182.703163,104.720953,0.0,93.0,183.0,273.0,364.0
arrival_year,50000.0,2018.50112,0.500004,2018.0,2018.0,2019.0,2019.0,2019.0
arrival_month,50000.0,6.5182,3.453553,1.0,4.0,7.0,10.0,12.0
arrival_date,50000.0,14.55616,8.07795,1.0,8.0,15.0,22.0,28.0
repeated_guest,50000.0,0.5011,0.500004,0.0,0.0,1.0,1.0,1.0


In [None]:
# Look for rows that are exact repeats of an earlier row and report the count;
# the offending rows are printed only when at least one exists.
duplicate_rows = data.loc[data.duplicated()]
print("Number of duplicate rows:", duplicate_rows.shape[0])
if len(duplicate_rows) > 0:
    print("Duplicate rows:\n", duplicate_rows)


Number of duplicate rows: 0


In [None]:
# Count the missing entries in every column of the freshly loaded data.
null_counts = data.isna().sum()
print("Null values in each column:\n", null_counts)

Null values in each column:
 Booking_ID                                 0
no_of_adults                               0
no_of_children                             0
no_of_weekend_nights                    4566
no_of_week_nights                          0
lead_time                               4534
arrival_year                               0
arrival_month                              0
arrival_date                               0
no_of_previous_cancellations               0
no_of_previous_bookings_not_canceled       0
avg_price_per_room                      4625
no_of_special_requests                     0
required_car_parking_space                 0
repeated_guest                             0
room_type_reserved                         0
type_of_meal_plan                          0
market_segment_type                        0
booking_status                             0
dtype: int64


In [None]:
# Remove the Booking_ID column (presumably a per-row identifier with no
# predictive value -- confirm against the dataset description).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column is already gone.
data = data.drop(columns=['Booking_ID'], errors='ignore')

In [None]:
# Empty per-column lookup tables, to be filled by the label-encoding step:
#   encoding_maps[column][original_label] -> integer code
#   decoding_maps[column][integer_code]   -> original_label
encoding_maps = dict()
decoding_maps = dict()

In [None]:
# Replace each categorical column with integer codes from its own LabelEncoder.
# The fitted encoder is kept in label_encoders, and the label <-> code lookups
# are recorded in encoding_maps / decoding_maps.
# NOTE: this overwrites the columns of `data`; running the cell again on the
# already-encoded frame would rebuild the maps from the integer codes instead
# of the original labels.
label_encoders = {}
for column in ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status']:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

    # classes_ holds the labels seen during fit; transforming them gives their codes.
    labels = encoder.classes_
    codes = encoder.transform(labels)

    # Forward lookup: original label -> code.
    encoding_map = dict(zip(labels, codes))
    encoding_maps[column] = encoding_map

    # Reverse lookup: code -> original label.
    decoding_map = dict(zip(codes, labels))
    decoding_maps[column] = decoding_map

In [None]:
# Print the encoding and decoding maps.
# Only the last bare expression of a cell is displayed automatically, so the
# original mid-cell `encoding_maps` line produced no output. Both maps are now
# printed explicitly, each directly under its own heading.
from pprint import pprint

print("Encoding Maps:")
pprint(encoding_maps, sort_dicts=False)  # keep the columns in insertion order
print("\nDecoding Maps:")
pprint(decoding_maps, sort_dicts=False)

Encoding Maps:

Decoding Maps:


{'type_of_meal_plan': {0: 'Meal Plan 1', 1: 'Meal Plan 2', 2: 'Not Selected'},
 'room_type_reserved': {0: 'Room_Type_1', 1: 'Room_Type_2', 2: 'Room_Type_3'},
 'market_segment_type': {0: 'Corporate', 1: 'Offline', 2: 'Online'},
 'booking_status': {0: 'Canceled', 1: 'Not_Canceled'}}

# **Missing-Value Imputation with RandomForestRegressor**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Imputation function using regression
def regression_impute(data, target_col):
    """
    Fill the missing values of one column with RandomForestRegressor predictions.

    The regressor is trained on the rows where ``target_col`` is present and
    then predicts the rows where it is missing. Only predictor columns that
    are fully observed across the whole frame are used. This lets the function
    work when several columns contain gaps (each can be imputed in turn),
    instead of giving up as soon as any other column has a missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place. Predictor columns are handed
        to the regressor as-is (assumed numeric -- encode categoricals first).
    target_col : str
        Name of the column whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. It is returned unchanged, after printing a
        message, when there is nothing to impute, when no row has an observed
        target value, or when no predictor column is fully observed.

    Notes
    -----
    Every other column of ``data`` is a candidate predictor, including any
    column that is later used as a modelling label.
    """
    # Boolean mask of the rows that need a value; robust to repeated index labels.
    missing_mask = data[target_col].isnull()

    # Nothing to do (this also covers an empty frame).
    if not missing_mask.any():
        print(f"No missing values found in {target_col}.")
        return data

    # No observed target values to learn from.
    if missing_mask.all():
        print(f"Not enough data to impute {target_col}.")
        return data

    # Keep only predictors without gaps, so that neither fitting nor
    # predicting is handed NaN values.
    features = data.drop(columns=[target_col])
    fully_observed = features.notnull().all(axis=0)
    predictor_cols = fully_observed[fully_observed].index

    if len(predictor_cols) == 0:
        print(f"Missing values in features used for predicting {target_col}. Please handle them.")
        return data

    X_train = features.loc[~missing_mask, predictor_cols]
    y_train = data.loc[~missing_mask, target_col]
    X_test = features.loc[missing_mask, predictor_cols]

    # Fixed seed so repeated runs produce the same imputed values.
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)

    # Write the predictions into the rows that were missing (in place).
    data.loc[missing_mask, target_col] = rf.predict(X_test)

    return data

# Impute, one column at a time, the columns that contain missing values.
columns_to_impute = ['no_of_weekend_nights', 'lead_time', 'avg_price_per_room']

for col in columns_to_impute:
    data = regression_impute(data=data, target_col=col)

# Report the per-column count of missing values that remain.
print(data.isna().sum())


No missing values found in no_of_weekend_nights.
No missing values found in lead_time.
No missing values found in avg_price_per_room.
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
required_car_parking_space              0
repeated_guest                          0
room_type_reserved                      0
type_of_meal_plan                       0
market_segment_type                     0
booking_status                          0
dtype: int64


In [None]:
# Re-count the missing entries per column after the imputation step.
null_counts = data.isna().sum(axis=0)
print("Null values in each column:\n", null_counts)

Null values in each column:
 no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
required_car_parking_space              0
repeated_guest                          0
room_type_reserved                      0
type_of_meal_plan                       0
market_segment_type                     0
booking_status                          0
dtype: int64


# **Decision Tree Algorithm**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline decision tree. The frame is expected to be preprocessed already
# (no missing values); every column except the label is used as a feature.
y = data['booking_status']
X = data.drop('booking_status', axis=1)

# Hold out 30% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a tree with default hyperparameters and predict the held-out rows.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 50.43%

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50      7489
           1       0.51      0.51      0.51      7511

    accuracy                           0.50     15000
   macro avg       0.50      0.50      0.50     15000
weighted avg       0.50      0.50      0.50     15000



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features are every column except the label; the label is booking_status.
X = data.drop('booking_status', axis=1)
y = data['booking_status']

# Same 70/30 split and seed as the baseline tree.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Estimator whose hyperparameters are searched.
clf = DecisionTreeClassifier(random_state=42)

# Candidate values for each hyperparameter; every combination is tried.
param_grid = dict(
    criterion=['gini', 'entropy'],        # split quality measure
    max_depth=[None, 10, 20, 30],         # None lets the tree grow without a depth cap
    min_samples_split=[2, 10, 20],        # samples needed to split an internal node
    min_samples_leaf=[1, 5, 10],          # samples required at a leaf
    max_features=[None, 'sqrt', 'log2'],  # features considered per split
)

# 5-fold cross-validated exhaustive search on the training split, scored by
# accuracy and run on all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search, evaluated on the held-out rows.
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Best model accuracy: {accuracy * 100:.2f}%")
print("Best hyperparameters:", grid_search.best_params_)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Best model accuracy: 48.89%
Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 20}

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.50      0.49      7489
           1       0.49      0.48      0.48      7511

    accuracy                           0.49     15000
   macro avg       0.49      0.49      0.49     15000
weighted avg       0.49      0.49      0.49     15000



In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the train/test split left in scope by the preceding
# cell. max_iter is raised to 1000 to give the solver more iterations.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Logistic Regression accuracy: 49.93%

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.32      0.39      7489
           1       0.50      0.68      0.58      7511

    accuracy                           0.50     15000
   macro avg       0.50      0.50      0.48     15000
weighted avg       0.50      0.50      0.48     15000



In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# k-NN is distance based, so features on very different numeric ranges must be
# put on a common scale first; otherwise the widest-ranging columns dominate
# the distance. The scaler lives inside the pipeline so that, during
# cross-validation, it is fit on each training fold only.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Hyperparameters for the k-NN step (the 'knn__' prefix addresses that
# pipeline step): odd neighbour counts, both weighting schemes, two metrics.
param_grid_knn = {
    'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# 5-fold cross-validated grid search, scored by accuracy like the decision
# tree search, run on all cores with progress logging.
grid_search_knn = GridSearchCV(knn_pipeline,
                               param_grid=param_grid_knn,
                               cv=5,
                               scoring='accuracy',
                               n_jobs=-1,
                               verbose=2)

# Fit on the training split from the earlier cell; best_knn is the refitted
# scaler + k-NN pipeline with the best parameter combination.
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_

# Make predictions on the held-out rows and evaluate.
y_pred_knn = best_knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"k-NN Accuracy: {accuracy_knn}")
print("Classification Report (k-NN):")
print(classification_report(y_test, y_pred_knn))


Fitting 5 folds for each of 28 candidates, totalling 140 fits


  _data = np.array(data, dtype=dtype, copy=copy,


k-NN Accuracy: 0.5004
Classification Report (k-NN):
              precision    recall  f1-score   support

           0       0.50      0.50      0.50      7489
           1       0.50      0.50      0.50      7511

    accuracy                           0.50     15000
   macro avg       0.50      0.50      0.50     15000
weighted avg       0.50      0.50      0.50     15000

