**Reading the train and test datasets with pandas**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load the three competition CSV files in one pass: training data, test data,
# and the sample submission (in that order).
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mlp-term-3-2025-kaggle-assignment-2/train.csv",
        "/kaggle/input/mlp-term-3-2025-kaggle-assignment-2/test.csv",
        "/kaggle/input/mlp-term-3-2025-kaggle-assignment-2/sample_submission.csv",
    )
)


In [None]:
# Parse the raw 'arrival' values; anything unparseable becomes NaT (errors='coerce').
for frame in (train, test):
    frame['arrival'] = pd.to_datetime(frame['arrival'], errors='coerce')

# The fill value is the median arrival date of the training set only; it is
# reused for the test set so both frames are filled with the same value.
median_arr = train['arrival'].median()

for frame in (train, test):
    # pop() removes 'arrival' from the frame in place and returns the column,
    # so the raw date column is gone once its parts have been extracted.
    arrival = frame.pop('arrival').fillna(median_arr)
    frame['arrival_year'] = arrival.dt.year
    frame['arrival_month'] = arrival.dt.month
    frame['arrival_day'] = arrival.dt.day


In [None]:
from sklearn.impute import SimpleImputer

# Columns imputed with the median (numeric) and with the mode (categorical).
num_cols = [
    'id', 'adults', 'children', 'weekends', 'weekdays', 'lead_time', 'repeat',
    'price', 'requests', 'arrival_year', 'arrival_month', 'arrival_day',
]
cat_cols = ['meal_type', 'room_type', 'segment']

num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Each imputer learns its fill values from train and applies the same values
# to test; numeric columns are handled first, then categorical ones.
for columns, imputer in ((num_cols, num_imputer), (cat_cols, cat_imputer)):
    train[columns] = imputer.fit_transform(train[columns])
    test[columns] = imputer.transform(test[columns])


In [None]:
def _add_booking_features(frame):
    """Add total_days, total_people and price_per_person to ``frame`` in place."""
    frame['total_days'] = frame['weekdays'] + frame['weekends']
    frame['total_people'] = frame['adults'] + frame['children']
    # The +1 keeps the denominator non-zero when total_people is 0.
    frame['price_per_person'] = frame['price'] / (frame['total_people'] + 1)


# Apply the same feature engineering to both datasets.
for _frame in (train, test):
    _add_booking_features(_frame)


In [None]:
# One-hot encode the categorical columns using ONE shared category list.
#
# Encoding train and test independently with drop_first=True is unsafe: the
# "first" category is decided per frame, so if test happens to lack train's
# first level a different level is dropped there and test rows are silently
# encoded as the wrong category. Casting both frames to the same categorical
# dtype (categories taken from train) makes get_dummies emit identical dummy
# columns for both and drop the same baseline level.
categorical_features = ['meal_type', 'room_type', 'segment']

for col in categorical_features:
    shared_dtype = pd.CategoricalDtype(train[col].astype('category').cat.categories)
    train[col] = train[col].astype(shared_dtype)
    # Levels that appear only in test become NaN here and therefore encode as
    # all-zero dummies, i.e. they fall back to the baseline category.
    test[col] = test[col].astype(shared_dtype)

train = pd.get_dummies(train, columns=categorical_features, drop_first=True)
test = pd.get_dummies(test, columns=categorical_features, drop_first=True)

# Give test exactly train's columns in train's order. The left join also adds
# the target column to test (filled with NaN), so it is removed again below.
train, test = train.align(test, join="left", axis=1)
test = test.drop(columns='booking_status')


In [None]:
# 1 — Booking Status Count
fig, ax = plt.subplots()
sns.countplot(x=train['booking_status'], ax=ax)
ax.set_title("Booking Status Distribution")
plt.show()

# 2 — Price Distribution (histogram with a KDE overlay)
fig, ax = plt.subplots()
sns.histplot(train['price'], kde=True, ax=ax)
ax.set_title("Price Distribution")
plt.show()

# 3 — Correlation Heatmap over every column of the encoded training frame
fig, ax = plt.subplots(figsize=(12, 6))
correlations = train.corr()
sns.heatmap(correlations, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features.
# NOTE(review): every remaining column, including 'id', is used as a feature —
# confirm that is intended.
y = train["booking_status"]
X = train.drop(columns=["booking_status"])

# Hold out 20% of the rows for validation; the seed fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Baseline comparison of several classifiers with default hyper-parameters.
# Every estimator that uses randomness is seeded with the same value used for
# the train/validation split so the printed accuracies are reproducible across
# "Restart & Run All".
# NOTE(review): Logistic and KNN are fit on unscaled features here — consider
# adding feature scaling for those two; TODO confirm.
models = {
    "Logistic": LogisticRegression(max_iter=500),
    "RF": RandomForestClassifier(random_state=42),
    "GB": GradientBoostingClassifier(random_state=42),
    "Ada": AdaBoostClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "XGB": XGBClassifier(eval_metric='logloss', random_state=42),
    "LGBM": LGBMClassifier(random_state=42)
}

# Fit each model on the training split and report accuracy on the validation split.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(name, accuracy_score(y_val, preds))


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space for the random forest.
rf_params = {
    'n_estimators':[300,400,500],
    'max_depth':[8,10,12,None],
    'min_samples_split':[2,4],
    'min_samples_leaf':[1,2]
}

# Randomized search: 10 sampled configurations, 3-fold CV, scored by accuracy.
# The base estimator is seeded as well as the search itself; seeding only the
# search fixes which configurations are sampled but leaves each forest's
# bootstrap/feature sampling random, so best_score_ would vary between runs.
rf_tune = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

rf_tune.fit(X_train, y_train)
best_rf = rf_tune.best_estimator_
print("RF Best Score:", rf_tune.best_score_)


In [None]:
# Hyper-parameter search space for XGBoost.
xgb_params = {
    'n_estimators':[300,400],
    'max_depth':[4,6],
    'learning_rate':[0.03,0.05,0.1]
}

# Randomized search: 6 sampled configurations, 3-fold CV, scored by accuracy.
# The base estimator is seeded in addition to the search so that the reported
# best score and the selected estimator are reproducible between runs.
xgb_tune = RandomizedSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    xgb_params,
    n_iter=6,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

xgb_tune.fit(X_train, y_train)
best_xgb = xgb_tune.best_estimator_
print("XGB Best Score:", xgb_tune.best_score_)


In [None]:
# Hyper-parameter search space for LightGBM.
lgb_params = {
    'n_estimators':[200,300,400],
    'max_depth':[4,6,8],
    'learning_rate':[0.03,0.05,0.1]
}

# Randomized search: 6 sampled configurations, 3-fold CV, scored by accuracy.
# The base estimator is seeded in addition to the search so that the reported
# best score and the selected estimator are reproducible between runs.
lgb_tune = RandomizedSearchCV(
    LGBMClassifier(random_state=42),
    lgb_params,
    n_iter=6,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

lgb_tune.fit(X_train, y_train)
best_lgb = lgb_tune.best_estimator_
print("LGBM Best Score:", lgb_tune.best_score_)


In [None]:
# Compare tuned scores: pair each tuned estimator with its cross-validated score.
tuned = {
    "RF": (rf_tune.best_score_, best_rf),
    "XGB": (xgb_tune.best_score_, best_xgb),
    "LGB": (lgb_tune.best_score_, best_lgb),
}
scores = {label: score for label, (score, estimator) in tuned.items()}

print(scores)
# The winner is the label with the highest cross-validated accuracy.
best_model_name = max(scores, key=lambda label: scores[label])
print("Best Model:", best_model_name)

best_model = tuned[best_model_name][1]

# Refit the winning estimator on the full training data (train + validation rows).
best_model.fit(X, y)

# Predict on the test features and write the predictions into the sample
# submission frame before saving it.
test_preds = best_model.predict(test)
sample_sub['booking_status'] = test_preds
sample_sub.to_csv("submission.csv", index=False)

print("submission.csv saved!")
