In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Suppress every warning category (not only FutureWarnings) to keep cell output readable.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load train.csv and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

In [None]:
df.columns

In [None]:
# Drop the 'Unnamed: 0' and 'Booking_ID' columns (modifies df in place).
df.drop(columns=['Unnamed: 0','Booking_ID'],inplace=True)

In [None]:
df.shape

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Remove the duplicated rows (modifies df in place).
df.drop_duplicates(inplace=True)

In [None]:
# Shape after de-duplication.
df.shape

In [None]:
df.columns

In [None]:
# Show the value distribution of several columns.
# A notebook cell renders only its last bare expression, so each Series is
# printed explicitly; otherwise every result but the final one is computed
# and silently discarded.
# NOTE(review): 'arrival_time' is not listed in cat_cols/num_cols (which use
# arrival_year / arrival_month / arrival_date) -- confirm the column exists.
for col in [
    "no_of_adults",
    "no_of_children",
    "no_of_weekend_nights",
    "type_of_meal_plan",
    "room_type_reserved",
    "arrival_time",
]:
    print(df[col].value_counts(), end="\n\n")

In [None]:
### Our data is imbalanced

In [None]:
# Columns treated as categorical (includes 'booking_status', which later cells
# use as the prediction target).
cat_cols = [
    'type_of_meal_plan',
    'room_type_reserved',
    'required_car_parking_space',
    'market_segment_type',
    'repeated_guest',
    'booking_status',
]

# Columns treated as numeric.
num_cols = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'lead_time',
    'arrival_year',
    'arrival_month',
    'arrival_date',
]

In [None]:
# Number of categorical and numeric columns selected above.
len(cat_cols), len(num_cols)

In [None]:
##### Data analysis

In [None]:
# Copy of df that the plotting cells below read from.
data = df.copy()

In [None]:
### Univariate analysis

In [None]:
def num_plot_dist(df, num_features):
    """Plot a histogram (with KDE) and a boxplot for each listed column.

    Parameters
    ----------
    df : table indexed by column name (e.g. a pandas DataFrame).
    num_features : sequence of column names; one figure row is drawn per name.

    The figure has ``len(num_features)`` rows and 2 columns (histogram on the
    left, boxplot on the right). It is shown with ``plt.show()``; nothing is
    returned.
    """
    n_rows = len(num_features)
    # squeeze=False always yields a 2-D (n_rows, 2) grid of axes, so a single
    # feature needs no special-casing.
    fig, grid = plt.subplots(n_rows, 2, figsize=(20, n_rows * 5), squeeze=False)

    for (hist_ax, box_ax), column in zip(grid, num_features):
        sns.histplot(df[column], kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {column}')
        sns.boxplot(x=df[column], ax=box_ax)
        box_ax.set_title(f'Boxplot of {column}')

    plt.tight_layout()
    plt.show()

In [None]:
num_plot_dist(data, num_cols)

In [None]:
for cat_feature in cat_cols:
    plt.figure(figsize=(10, 5))
    # data[cat_feature].value_counts().plot(kind='bar', color='skyblue')
    sns.countplot(x=cat_feature, data=data)
    plt.title(f'Count plot of {cat_feature}')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
###Bivariate analysis########

In [None]:
def plot_bivariate_num(df, num_features, target_feature):
    """Draw one boxplot per numeric column, grouped by the target column.

    Parameters
    ----------
    df : table passed to seaborn as ``data`` (e.g. a pandas DataFrame).
    num_features : sequence of column names plotted on the y axis.
    target_feature : column name used for the x-axis grouping.

    Panels are laid out two per row. The figure is shown with ``plt.show()``;
    nothing is returned. An empty ``num_features`` draws nothing.
    """
    num_plots = len(num_features)
    if num_plots == 0:
        # plt.subplots rejects a grid with zero rows; nothing to draw anyway.
        return
    num_rows = (num_plots + 1) // 2

    # squeeze=False always returns a 2-D array of Axes. Without it, a single
    # row comes back as a 1-D array, and wrapping that in a list made
    # axes[0] an array instead of an Axes.
    fig, axes = plt.subplots(num_rows, 2, figsize=(20, num_rows * 5), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, num_features):
        sns.boxplot(x=target_feature, y=column, data=df, ax=ax)
        ax.set_title(f'Boxplot of {column} by {target_feature}')
        ax.set_xlabel(target_feature)
        ax.set_ylabel(column)

    # With an odd number of features the last grid cell is unused; hide it
    # rather than showing an empty frame.
    for unused_ax in axes[num_plots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Boxplot of every numeric column split by booking_status.
plot_bivariate_num(data, num_cols, 'booking_status')

In [None]:
def plot_bivariate_cat(df, cat_features, target_feature):
    """Draw one count plot per categorical column, coloured by the target column.

    Parameters
    ----------
    df : table passed to seaborn as ``data`` (e.g. a pandas DataFrame).
    cat_features : sequence of column names plotted on the x axis.
    target_feature : column name used as ``hue``.

    Panels are laid out two per row. The figure is shown with ``plt.show()``;
    nothing is returned. An empty ``cat_features`` draws nothing.
    """
    num_plots = len(cat_features)
    if num_plots == 0:
        # plt.subplots rejects a grid with zero rows; nothing to draw anyway.
        return
    num_rows = (num_plots + 1) // 2

    # squeeze=False always returns a 2-D array of Axes. Without it, a single
    # row comes back as a 1-D array, and wrapping that in a list made
    # axes[0] an array instead of an Axes.
    fig, axes = plt.subplots(num_rows, 2, figsize=(20, num_rows * 5), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, cat_features):
        sns.countplot(x=column, hue=target_feature, data=df, ax=ax)
        ax.set_title(f'Count plot of {column} by {target_feature}')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')

    # With an odd number of features the last grid cell is unused; hide it
    # rather than showing an empty frame.
    for unused_ax in axes[num_plots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
######### Data processing #########

In [None]:
df.info()

In [None]:
######## Label encoding ###########

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
### The data type is object, so we apply label encoding to convert it to integer type

In [None]:
# Replace each column in cat_cols with integer codes and remember, per column,
# which original label maps to which code.
labe_encoder = LabelEncoder()
mapping = {}

for col in cat_cols:
    # The encoder is re-fitted for every column, so classes_ always describes
    # the column that was just encoded.
    df[col] = labe_encoder.fit_transform(df[col])
    labels = labe_encoder.classes_
    codes = labe_encoder.transform(labels)
    mapping[col] = dict(zip(labels, codes))

In [None]:
for label, code in mapping.items():
    print(f"{label}: {code}")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
## Multicollinearity check
## Multicollinearity negatively affects model performance

In [None]:
!pip install statsmodels

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
# Variance inflation factor (VIF) for each predictor.
# The target column is left out: VIF describes collinearity among the
# explanatory variables, and including booking_status would distort every score.
X = add_constant(df.drop(columns=['booking_status']))

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the table; the cell previously ended in an assignment and showed nothing.
vif_data

In [None]:
## A VIF above 5 indicates high multicollinearity

In [None]:
## Our data does not show multicollinearity

In [None]:
# Pairwise correlation matrix of the columns of df.
corr = df.corr()

In [None]:
# Annotated heatmap of the correlation matrix (values shown with two decimals).
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')

In [None]:
#### Skewness

In [None]:
# Skewness of each column.
skewness = df.skew()

In [None]:
skewness

In [None]:
# Apply log1p to every column whose skewness is above 5; all other columns
# are left untouched.
for col in df.columns:
    if not skewness[col] > 5:
        continue
    df[col] = np.log1p(df[col])

In [None]:
##### Imbalanced data

In [None]:
# Class counts of the target column.
df['booking_status'].value_counts()

In [None]:
## Methods are:
              # 1. undersampling (reduces the data)
              # 2. oversampling (we go for this one)

In [None]:
# Separate the predictors (X) from the target (y).
X = df.drop(columns=['booking_status'])
y = df['booking_status']

In [None]:
!pip install imbalanced-learn

In [None]:
# Class counts before resampling.
y.value_counts()

In [None]:
## Applying the SMOTE technique
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample X / y with SMOTE using a fixed seed.
# NOTE(review): resampling runs on the full data set, before the train/test
# split further down, so synthetic rows can end up in the test set and inflate
# the reported scores -- consider splitting first and resampling only the
# training portion.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [None]:
# Class counts after resampling.
y_res.value_counts()

In [None]:
# pd.Series(y_res)

In [None]:
### Create a new DataFrame from the resampled data, since resampling does not directly change our old data

In [None]:
# Rebuild a single frame from the resampled predictors and target.
balanced_df = pd.DataFrame(X_res, columns=X.columns)
balanced_df['booking_status'] = y_res

In [None]:
balanced_df.head()

In [None]:
balanced_df.shape

In [None]:
# From here on df refers to the resampled data.
df = balanced_df.copy()

In [None]:
### Feature Selection 

In [None]:
# Number of columns before feature selection (target included).
len(df.columns)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Separate the predictors (X) from the target (y).
X = df.drop(columns=['booking_status'])
y = df['booking_status']

In [None]:
# Fit a random forest on all rows; it is only used to read feature importances below.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

In [None]:
Feature_importance = model.feature_importances_

In [None]:
Feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': Feature_importance
})

In [None]:
top_feature_importance_df = Feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

In [None]:
len(df.columns)

In [None]:
top_10_features = top_feature_importance_df['feature'].head(10).values

top_10_df = df[top_10_features]

In [None]:
top_10_df.head()

In [None]:
len(top_10_df.columns)

In [None]:
df = top_10_df.copy()

In [None]:
# MODEL SELECTION

In [None]:
!pip install xgboost lightgbm

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score

In [None]:
# Separate the predictors (X) from the target (y).
X = df.drop(columns=['booking_status'])
y = df['booking_status']

In [None]:
# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train , X_test , y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate models, keyed by the display name used in the results table.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases --
# confirm against the installed version.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVC': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

In [None]:
# Column-oriented accumulator for the scores; one list entry is appended per model.
mertics = {
    'Model' : [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

In [None]:
# Fit every candidate classifier on the training split and append its test-set
# scores to `mertics`, one entry per model.
# (The loop keyword was mistyped as `from`, which is a SyntaxError.)
for model_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mertics['Model'].append(model_name)
    mertics['Accuracy'].append(accuracy)
    mertics['Precision'].append(precision)
    mertics['Recall'].append(recall)
    mertics['F1 Score'].append(f1)


In [None]:
# Turn the accumulated score lists into a table with one row per model.
martics_df = pd.DataFrame(mertics)

In [None]:
martics_df

In [None]:
### TRAIN ON RF MODEL 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [None]:
# Separate the predictors (X) from the target (y).
X = df.drop(columns=['booking_status'])
y = df['booking_status']

In [None]:
# Same arguments and seed as the split used in the model-comparison section.
X_train , X_test , y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Base estimator whose hyperparameters are searched below.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Distributions (or lists of choices) that the randomized search samples from.
# NOTE(review): scipy's uniform takes (loc, scale), so uniform(0.1, 0.9)
# presumably samples max_features fractions from [0.1, 1.0] -- confirm this is intended.
params_dist ={
    'n_estimators': randint(100, 2000),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': uniform(0.1, 0.9),
    'bootstrap': [True, False]
}

In [None]:
# Randomized hyperparameter search over params_dist: 5 sampled candidates,
# each scored by accuracy with 5-fold cross-validation.
# (The keyword was misspelled `scoreing`, which raises a TypeError.)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params_dist,
    n_iter=5,
    cv=5,
    verbose=2,
    random_state=42,
    scoring='accuracy'
)

In [None]:
# Run the search on the training split.
random_search.fit(X_train, y_train)

In [None]:
# Hyperparameters of the best-scoring candidate.
random_search.best_params_

In [None]:
best_rf_model = random_search.best_estimator_

In [None]:
# Predictions of the tuned model on the held-out split.
y_pred = best_rf_model.predict(X_test)

In [None]:
# Test-set scores of the tuned random forest.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# The cell previously ended in assignments and rendered nothing; show the
# scores so the result of the tuning is visible in the notebook.
pd.Series(
    {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1},
    name='Tuned Random Forest',
)

In [None]:
## Save the model

In [None]:
import joblib

In [None]:
# Persist the tuned model to best_rf_model.pkl in the working directory.
joblib.dump(best_rf_model, 'best_rf_model.pkl')

In [None]:
# Read the model back from the same file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load files from a trusted source.
loaded_model = joblib.load('best_rf_model.pkl')

In [None]:
# NOTE(review): this sample row holds 11 values -- verify that the number and
# order of values match the feature columns the saved model was trained on.
new_data = np.array([[1, 0, 0, 1, 0, 2, 1, 3, 2023, 5, 15]])  # Example new data