# Import

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from typing import Dict, Tuple

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
import joblib

# Utils

In [None]:
def ensure_dirs():
    """Create the ``visuals`` and ``models`` output directories if missing.

    Both paths are relative to the current working directory. Calling this
    more than once is harmless because existing directories are accepted.
    """
    for dirname in ("visuals", "models"):
        Path(dirname).mkdir(parents=True, exist_ok=True)

def save_cm(cm: np.ndarray, labels: Tuple[str, str], title: str, fname: str):
    """Draw a confusion matrix as an annotated heatmap and save it to disk.

    Args:
        cm: Confusion matrix to draw; cell values are annotated with no
            decimal places.
        labels: Tick labels used on both the x (predicted) and y (actual) axes.
        title: Figure title.
        fname: Path of the image file to write.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='.0f',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(fname, dpi=150)
    # Close explicitly so figures do not accumulate across the many calls.
    plt.close(fig)

def plot_pr_threshold(y_true: pd.Series, y_scores: np.ndarray, title: str, fname: str):
    """Plot precision and recall against the decision threshold and pick the best-F1 threshold.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Scores for the positive class (e.g. predicted probabilities).
        title: Figure title.
        fname: Path of the image file to write.

    Returns:
        A tuple ``(best_threshold, precision, recall, f1)`` describing the
        threshold that maximises F1 on the supplied data.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)

    # precision_recall_curve appends one final (precision=1, recall=0) point
    # that has no threshold, so both curves are trimmed to align with
    # ``thresholds`` before plotting.
    plt.figure(figsize=(6,4))
    plt.plot(thresholds, precisions[:-1], label='Precision', linestyle='--')
    plt.plot(thresholds, recalls[:-1], label='Recall', linestyle='--')
    plt.xlabel('Threshold'); plt.ylim(0, 1.0); plt.legend(); plt.title(title)
    plt.tight_layout()
    plt.savefig(fname, dpi=150); plt.close()

    # The epsilon keeps the division defined where precision + recall == 0.
    f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-12)

    n_thresholds = len(thresholds)
    if n_thresholds == 0:
        # Defensive fallback: nothing to choose from, use the conventional 0.5.
        best_idx = int(np.nanargmax(f1s))
        return 0.5, precisions[best_idx], recalls[best_idx], f1s[best_idx]

    # precisions[i] / recalls[i] are the values obtained at thresholds[i], so
    # the search is limited to points that have a threshold and the same index
    # is used to look the threshold up (no offset).
    best_idx = int(np.nanargmax(f1s[:n_thresholds]))
    best_thr = thresholds[best_idx]
    return best_thr, precisions[best_idx], recalls[best_idx], f1s[best_idx]

def metrics_score(y_true, y_pred, title_prefix: str, tag: str) -> Dict[str, float]:
    """Print a classification report, save the confusion matrix, and return key metrics.

    Args:
        y_true: Ground-truth labels, expected to be encoded as 0 / 1.
        y_pred: Predicted labels, same encoding as ``y_true``.
        title_prefix: Text used in the printed header and the plot title.
        tag: Suffix of the saved image, written to ``visuals/cm_<tag>.png``.

    Returns:
        Dict with overall ``accuracy`` plus precision, recall and F1 of the
        "Cancelled" class (label 1). The class-1 metrics are 0.0 when label 1
        occurs in neither ``y_true`` nor ``y_pred``.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    printable = classification_report(y_true, y_pred, zero_division=0)
    print(f"\n{title_prefix} Classification Report:\n{printable}")

    # Pin the label order so the matrix is always 2x2 and lines up with the two
    # tick labels below, even when one class is missing from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    save_cm(cm, labels=('Not Cancelled','Cancelled'),
            title=f"{title_prefix} — Confusion Matrix",
            fname=f"visuals/cm_{tag}.png")

    # The report is keyed by str(label); class 1 is absent from it when that
    # label never appears, so fall back to zeros instead of raising KeyError.
    cancelled = report.get("1", {})
    return {
        "accuracy": report["accuracy"],
        "precision_cancel": cancelled.get("precision", 0.0),
        "recall_cancel": cancelled.get("recall", 0.0),
        "f1_cancel": cancelled.get("f1-score", 0.0),
    }

# Data Loading & Cleaning

In [None]:
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the CSV file at ``csv_path`` and return its contents as a DataFrame.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A copy of the parsed table.
    """
    return pd.read_csv(csv_path).copy()

def clean_data(df: pd.DataFrame, price_outlier_threshold: float = 500.0) -> pd.DataFrame:
    """Return a cleaned copy of the bookings table; the input is never modified.

    Every step is skipped when the column it needs is absent:

    * drop the ``Booking_ID`` identifier column;
    * replace ``no_of_children`` values of 9 and 10 with 3;
    * encode a non-numeric ``booking_status`` as 1 for ``"Canceled"`` and 0
      for anything else (missing values included);
    * replace ``avg_price_per_room`` values at or above
      ``price_outlier_threshold`` with the upper whisker ``Q3 + 1.5 * IQR``.

    Args:
        df: Raw bookings table.
        price_outlier_threshold: Prices greater than or equal to this value
            are treated as extreme outliers.

    Returns:
        A new DataFrame with the cleaning steps applied.
    """
    # Work on a new object in both branches so the caller's frame stays intact.
    if "Booking_ID" in df.columns:
        cleaned = df.drop(columns=["Booking_ID"])
    else:
        cleaned = df.copy()

    # Fix unrealistic children values (replace 9, 10 with 3).
    if "no_of_children" in cleaned.columns:
        cleaned["no_of_children"] = cleaned["no_of_children"].replace([9, 10], 3)

    # Encode target: Canceled=1, everything else=0. Testing for "not numeric"
    # rather than "is object" also covers pandas string dtypes; an already
    # numeric target is left alone.
    if "booking_status" in cleaned.columns and not pd.api.types.is_numeric_dtype(
        cleaned["booking_status"]
    ):
        is_canceled = cleaned["booking_status"].eq("Canceled").fillna(False)
        cleaned["booking_status"] = is_canceled.astype(int)

    # Only prices at/above the threshold are replaced; values between the upper
    # whisker and the threshold are kept. The quartiles are computed over the
    # whole column, zeros included.
    if "avg_price_per_room" in cleaned.columns:
        price = cleaned["avg_price_per_room"]
        q1 = price.quantile(0.25)
        q3 = price.quantile(0.75)
        upper = q3 + 1.5 * (q3 - q1)
        cleaned.loc[price >= price_outlier_threshold, "avg_price_per_room"] = upper

    return cleaned

# EDA

In [None]:
def eda_plots(df: pd.DataFrame, enable: bool = True):
    """Write exploratory plots for the bookings table into ``visuals/``.

    Produces box plots and histograms for selected numeric columns, the target
    distribution, a correlation heatmap over numeric columns, and the room
    price per market segment. Plots whose columns are missing are skipped.
    The ``visuals`` directory must already exist.

    Args:
        df: Table to visualise; ``booking_status`` is expected to hold 0 / 1.
        enable: When False the function returns immediately without plotting.
    """
    if not enable:
        return

    def _finish(title: str, fname: str) -> None:
        # Shared tail of every figure: title, layout, save, release.
        plt.title(title)
        plt.tight_layout()
        plt.savefig(fname, dpi=150)
        plt.close()

    # Distributions
    candidates = ("lead_time", "avg_price_per_room", "no_of_week_nights", "no_of_weekend_nights")
    for name in (c for c in candidates if c in df.columns):
        plt.figure(figsize=(6, 4))
        sns.boxplot(x=df[name])
        _finish(f"Boxplot — {name}", f"visuals/box_{name}.png")

        plt.figure(figsize=(6, 4))
        sns.histplot(df[name], kde=True, bins=30)
        _finish(f"Histogram — {name}", f"visuals/hist_{name}.png")

    # Booking status distribution
    if "booking_status" in df.columns:
        readable_status = df["booking_status"].map({0: "Not_Canceled", 1: "Canceled"})
        plt.figure(figsize=(6, 4))
        sns.countplot(x=readable_status)
        _finish("Target distribution — booking_status", "visuals/target_distribution.png")

    # Correlation heatmap (numeric only)
    numeric_names = df.select_dtypes(include=np.number).columns.tolist()
    correlations = df[numeric_names].corr()
    plt.figure(figsize=(10, 7))
    sns.heatmap(correlations, annot=False, cmap="Spectral", vmin=-1, vmax=1)
    _finish("Correlation Heatmap (numeric)", "visuals/corr_heatmap.png")

    # Avg price per room by market segment (if exists)
    has_segment = "market_segment_type" in df.columns
    has_price = "avg_price_per_room" in df.columns
    if has_segment and has_price:
        plt.figure(figsize=(8, 5))
        sns.boxplot(data=df, x="market_segment_type", y="avg_price_per_room")
        plt.xticks(rotation=30, ha='right')
        _finish("Avg Price per Room by Market Segment", "visuals/price_by_segment.png")


# Modeling

In [None]:
# ==========================
# Modeling
# ==========================

def split_encode(
    df: pd.DataFrame, test_size: float = 0.30, random_state: int = 1
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """One-hot encode the features and make a stratified train/test split.

    Args:
        df: Cleaned table that contains the ``booking_status`` target column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed that makes the split reproducible.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``booking_status`` is not a column of ``df``.
    """
    features = df.drop(columns=["booking_status"])
    target = df["booking_status"]
    # drop_first removes one dummy level per categorical column.
    features = pd.get_dummies(features, drop_first=True)
    # Stratify on the target so both splits keep the same class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, stratify=target, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def train_logistic(X_train, y_train):
    """Fit a logistic regression, persist it, and return the fitted model.

    The model is written to ``models/logistic_regression.joblib``.
    """
    model = LogisticRegression(max_iter=200).fit(X_train, y_train)
    joblib.dump(model, "models/logistic_regression.joblib")
    return model

def train_svm_linear(X_train, y_train):
    """Fit a linear-kernel SVM on features scaled to [-1, 1].

    The scaler is fitted on the training features only. Both artefacts are
    written under ``models/`` and returned as ``(model, scaler)``; callers
    must apply the returned scaler to any data before predicting.
    """
    feature_scaler = MinMaxScaler(feature_range=(-1, 1))
    # probability=True is what makes predict_proba available on the model.
    classifier = SVC(kernel="linear", probability=True, random_state=1)
    classifier.fit(feature_scaler.fit_transform(X_train), y_train)

    artefacts = (
        (feature_scaler, "models/scaler_svm_linear.joblib"),
        (classifier, "models/svm_linear.joblib"),
    )
    for obj, path in artefacts:
        joblib.dump(obj, path)
    return classifier, feature_scaler

def train_svm_rbf(X_train, y_train):
    """Fit an RBF-kernel SVM on features scaled to [-1, 1].

    The scaler is fitted on the training features only. Both artefacts are
    written under ``models/`` and returned as ``(model, scaler)``; callers
    must apply the returned scaler to any data before predicting.
    """
    feature_scaler = MinMaxScaler(feature_range=(-1, 1))
    # probability=True is what makes predict_proba available on the model.
    classifier = SVC(kernel="rbf", probability=True, random_state=1)
    classifier.fit(feature_scaler.fit_transform(X_train), y_train)

    artefacts = (
        (feature_scaler, "models/scaler_svm_rbf.joblib"),
        (classifier, "models/svm_rbf.joblib"),
    )
    for obj, path in artefacts:
        joblib.dump(obj, path)
    return classifier, feature_scaler

def train_decision_tree_tuned(X_train, y_train):
    """Grid-search a decision tree for recall and return the best estimator.

    Runs a 5-fold cross-validated search over depth, leaf count and minimum
    split size, using all available cores. The winning tree is written to
    ``models/decision_tree_tuned.joblib``.
    """
    search_space = {
        "max_depth": np.arange(1, 100, 10),
        "max_leaf_nodes": [50, 75, 150, 250],
        "min_samples_split": [10, 30, 50, 70],
    }
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=1),
        param_grid=search_space,
        scoring="recall",
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    tuned_tree = search.best_estimator_
    joblib.dump(tuned_tree, "models/decision_tree_tuned.joblib")
    return tuned_tree

def train_random_forest(X_train, y_train):
    """Fit a 300-tree random forest, persist it, and return the fitted model.

    Training uses all available cores; the model is written to
    ``models/random_forest.joblib``.
    """
    forest = RandomForestClassifier(
        n_estimators=300,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train, y_train)
    joblib.dump(forest, "models/random_forest.joblib")
    return forest

# Run & Evaluate

In [None]:
def _score_train_test(name, tag, y_train, y_test, pred_train, pred_test):
    """Report train and test metrics for one model and return the test metrics."""
    metrics_score(y_train, pred_train, f"{name} — Train", f"{tag}_train")
    return metrics_score(y_test, pred_test, f"{name} — Test", f"{tag}_test")


def _threshold_analysis(name, tag, y_train, y_test, proba_train, proba_test, custom_thr=0.40):
    """Plot precision/recall vs threshold and evaluate a custom decision threshold.

    ``proba_train`` / ``proba_test`` are positive-class probabilities computed
    once by the caller and reused here for the plot and for both evaluations.
    """
    best_thr, best_p, best_r, best_f1 = plot_pr_threshold(
        y_train, proba_train, f"{name} — Precision/Recall vs Threshold", f"visuals/pr_{tag}.png"
    )
    # Shown for reference only; the evaluation below uses ``custom_thr``.
    print(
        f"{name} best-F1 threshold (train): {best_thr:.3f} "
        f"(precision={best_p:.3f}, recall={best_r:.3f}, F1={best_f1:.3f})"
    )
    metrics_score(y_train, (proba_train > custom_thr).astype(int),
                  f"{name}@{custom_thr:.2f} — Train", f"{tag}_thr_train")
    metrics_score(y_test, (proba_test > custom_thr).astype(int),
                  f"{name}@{custom_thr:.2f} — Test", f"{tag}_thr_test")


def _save_top_importances(model, feature_names, title, fname, top_n=20):
    """Save a bar chart of the ``top_n`` largest feature importances of ``model``."""
    ranked = pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
    top = ranked.head(top_n)
    plt.figure(figsize=(8,10))
    sns.barplot(x=top, y=top.index)
    plt.title(title)
    plt.tight_layout(); plt.savefig(fname, dpi=150); plt.close()


def run_pipeline(csv_path: str, run_eda: bool):
    """Run the full cancellation-prediction workflow end to end.

    Loads and cleans the data, optionally writes EDA plots, trains five
    classifiers (logistic regression, linear SVM, RBF SVM, tuned decision
    tree, random forest), evaluates each on the train and test splits, and
    writes plots plus a test-set summary table.

    Args:
        csv_path: Path of the bookings CSV file.
        run_eda: When True, exploratory plots are generated before modelling.

    Side effects:
        Creates ``visuals/`` and ``models/``, writes figures,
        ``visuals/model_summary.csv`` and the fitted models, and prints
        progress and reports to stdout.
    """
    ensure_dirs()

    print("Loading data...")
    df = load_data(csv_path)
    print(f"Initial shape: {df.shape}")

    print("Cleaning data...")
    df = clean_data(df)
    print(f"Post-clean shape: {df.shape}")

    if run_eda:
        print("Generating EDA visuals...")
        eda_plots(df, enable=True)

    print("Encoding & splitting...")
    X_train, X_test, y_train, y_test = split_encode(df)
    print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")
    print("Target ratio (train):", y_train.value_counts(normalize=True).to_dict())

    # (model name, test-set metrics) pairs feeding the summary table.
    results = []

    # 1) Logistic Regression
    print("\n=== Logistic Regression ===")
    lg = train_logistic(X_train, y_train)
    r_te = _score_train_test("LR", "lr", y_train, y_test,
                             lg.predict(X_train), lg.predict(X_test))
    # 0.40 is the course-reference threshold; the best-F1 one is printed too.
    _threshold_analysis("LR", "lr", y_train, y_test,
                        lg.predict_proba(X_train)[:,1], lg.predict_proba(X_test)[:,1])
    results.append(("Logistic Regression", r_te))

    # 2) SVM Linear
    print("\n=== SVM (Linear) ===")
    svm_lin, scaler_lin = train_svm_linear(X_train, y_train)
    Xtr_s = scaler_lin.transform(X_train)
    Xte_s = scaler_lin.transform(X_test)
    r_te = _score_train_test("SVM-Linear", "svml", y_train, y_test,
                             svm_lin.predict(Xtr_s), svm_lin.predict(Xte_s))
    # Probabilities are computed once per split and reused.
    _threshold_analysis("SVM-Linear", "svml", y_train, y_test,
                        svm_lin.predict_proba(Xtr_s)[:,1], svm_lin.predict_proba(Xte_s)[:,1])
    results.append(("SVM (Linear)", r_te))

    # 3) SVM RBF
    print("\n=== SVM (RBF) ===")
    svm_rbf, scaler_rbf = train_svm_rbf(X_train, y_train)
    Xtr_s = scaler_rbf.transform(X_train)
    Xte_s = scaler_rbf.transform(X_test)
    r_te = _score_train_test("SVM-RBF", "svmr", y_train, y_test,
                             svm_rbf.predict(Xtr_s), svm_rbf.predict(Xte_s))
    _threshold_analysis("SVM-RBF", "svmr", y_train, y_test,
                        svm_rbf.predict_proba(Xtr_s)[:,1], svm_rbf.predict_proba(Xte_s)[:,1])
    results.append(("SVM (RBF)", r_te))

    # 4) Decision Tree (tuned)
    print("\n=== Decision Tree (Tuned) ===")
    dt = train_decision_tree_tuned(X_train, y_train)
    r_te = _score_train_test("DT-Tuned", "dt", y_train, y_test,
                             dt.predict(X_train), dt.predict(X_test))

    # Export only the top levels of the tree so the figure stays readable.
    # A plain list of names is passed rather than the pandas Index.
    plt.figure(figsize=(16,8))
    plot_tree(dt, feature_names=list(X_train.columns), max_depth=3, filled=True, fontsize=8)
    plt.tight_layout(); plt.savefig("visuals/decision_tree_top.png", dpi=200); plt.close()

    _save_top_importances(dt, X_train.columns,
                          "Decision Tree — Top 20 Feature Importances",
                          "visuals/dt_feature_importance.png")
    results.append(("Decision Tree (Tuned)", r_te))

    # 5) Random Forest
    print("\n=== Random Forest ===")
    rf = train_random_forest(X_train, y_train)
    r_te = _score_train_test("RF", "rf", y_train, y_test,
                             rf.predict(X_train), rf.predict(X_test))
    _save_top_importances(rf, X_train.columns,
                          "Random Forest — Top 20 Feature Importances",
                          "visuals/rf_feature_importance.png")
    results.append(("Random Forest", r_te))

    # Save summary table, best model first (F1, then recall, then accuracy).
    summary = pd.DataFrame(
        [(name, r["accuracy"], r["precision_cancel"], r["recall_cancel"], r["f1_cancel"])
         for name, r in results],
        columns=["Model","Accuracy","Precision (Cancel)","Recall (Cancel)","F1 (Cancel)"]
    ).sort_values(by=["F1 (Cancel)","Recall (Cancel)","Accuracy"], ascending=False)
    print("\n=== Test Set Summary ===")
    print(summary.to_string(index=False))
    summary.to_csv("visuals/model_summary.csv", index=False)

    print("\nArtifacts saved to ./visuals and ./models")

# Entry

In [None]:
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the whole pipeline.
    parser = argparse.ArgumentParser(description="Hotel Booking Cancellation Prediction Pipeline")
    parser.add_argument("--csv", type=str, default="data/INNHotelsGroup.csv",
                        help="Path to INN Hotels CSV file")
    parser.add_argument("--eda", action="store_true", help="Generate EDA plots")
    # parse_known_args ignores arguments this script does not define. Notebook
    # kernels (Jupyter/Colab) put their own ``-f <connection file>`` into
    # sys.argv, which would make parse_args() exit with an error.
    args, _unknown = parser.parse_known_args()

    # (Optional) Google Colab Drive mount
    # from google.colab import drive
    # drive.mount('/content/drive')
    # csv_path = "/content/drive/MyDrive/MIT: DSML/Hotel Booking Cancellation Prediction/INNHotelsGroup.csv"

    run_pipeline(args.csv, run_eda=args.eda)