In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# =====================================================
# STEP 1 — LOAD DATA
# =====================================================

train_path = "/content/drive/MyDrive/ML_Project/MultinomialClassification/Dataset/train.csv"
test_path  = "/content/drive/MyDrive/ML_Project/MultinomialClassification/Dataset/test.csv"



df_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)

TARGET = "spend_category"
ID_COL = "trip_id"

print("Original Training Shape:", df_raw.shape)
print("Original Test Shape:", test_raw.shape)


# =====================================================
# HELPER — Convert Range (15-30, 90+) to numeric
# =====================================================
def range_to_mid(x):
    x = str(x).strip()
    if x.lower() in ["none", "", "nan", "null"]:
        return np.nan
    if "+" in x:
        return float(x.replace("+", ""))
    if "-" in x:
        a, b = x.split("-")
        return (float(a) + float(b)) / 2
    try:
        return float(x)
    except:
        return np.nan


# =====================================================
# STEP 2 — GLOBAL COLUMN DEFINITIONS
# =====================================================
binary_cols = [
    "is_first_visit","intl_transport_included","accomodation_included",
    "food_included","domestic_transport_included","sightseeing_included",
    "guide_included","insurance_included"
]

categorical_cols = [
    "country","age_group","travel_companions","main_activity",
    "visit_purpose","tour_type","info_source","arrival_weather"
]

numeric_count_cols = ["num_females","num_males","mainland_stay_nights","island_stay_nights"]


# =====================================================
# STEP 3 — REMOVE NULL TARGETS FIRST
# =====================================================
removed_target_nulls = df_raw[TARGET].isnull().sum()
print("Rows removed due to null spend_category:", removed_target_nulls)

df_raw = df_raw[df_raw[TARGET].notnull()].reset_index(drop=True)
print("Training shape after removing null targets:", df_raw.shape)


# =====================================================
# STEP 4 — MAIN PREPROCESSING FUNCTION
# =====================================================
def preprocess_raw_df(df):
    df = df.copy()

    # Clean strings
    for c in df.columns:
        if df[c].dtype == object:
            df[c] = df[c].astype(str).str.strip().str.rstrip(',')

    # Binary processing
    for c in binary_cols:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip().str.lower()
            df[c] = df[c].replace({
                "yes": 1,
                "no": 0,
                "nan": np.nan,
                "none": np.nan,
                "null": np.nan,
                "": np.nan
            })
            df[c] = df[c].fillna(0).astype(int)

    # Numeric count fields
    for c in numeric_count_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    # =====================================================
    # SAFE ORDINAL ENCODING FOR RANGE COLUMNS
    # =====================================================

    # Clean weird string values
    def clean_str(x):
        x = str(x).strip().lower()
        if x in ["nan", "none", "null", ""]:
            return np.nan
        return x

    # Clean raw string columns
    if "days_booked_before_trip" in df.columns:
        df["days_booked_before_trip_clean"] = df["days_booked_before_trip"].apply(clean_str)

    if "total_trip_days" in df.columns:
        df["total_trip_days_clean"] = df["total_trip_days"].apply(clean_str)

    # Define ordinal mappings
    ordinal_days_booked = {
        "1-7": 1,
        "8-14": 2,
        "15-30": 3,
        "31-60": 4,
        "61-90": 5,
        "90+": 6
    }

    ordinal_total_trip = {
        "1-6": 1,
        "7-14": 2,
        "15-30": 3,
        "30+": 4
    }

    # Map → Fill Missing → Convert to int
    if "days_booked_before_trip_clean" in df.columns:
        df["days_booked_before_trip_ord"] = (
            df["days_booked_before_trip_clean"]
                .map(ordinal_days_booked)
        )

        # Fill NaN with mode **of ordinal values**
        df["days_booked_before_trip_ord"].fillna(
            df["days_booked_before_trip_ord"].mode()[0],
            inplace=True
        )

        df["days_booked_before_trip_ord"] = df["days_booked_before_trip_ord"].astype(int)

    if "total_trip_days_clean" in df.columns:
        df["total_trip_days_ord"] = (
            df["total_trip_days_clean"]
                .map(ordinal_total_trip)
        )

        # Fill NaN with mode of ordinal values
        df["total_trip_days_ord"].fillna(
            df["total_trip_days_ord"].mode()[0],
            inplace=True
        )

        df["total_trip_days_ord"] = df["total_trip_days_ord"].astype(int)



    # Special requirements → binary
    if "has_special_requirements" in df.columns:
        df["has_special_req_bin"] = df["has_special_requirements"].astype(str).apply(
            lambda x: 0 if x.lower() in ["none", "", "nan"] else 1
        )

    return df


# =====================================================
# APPLY PREPROCESSING TO TRAIN & TEST
# =====================================================
df = preprocess_raw_df(df_raw).reset_index(drop=True)
test_df = preprocess_raw_df(test_raw).reset_index(drop=True)

print("\nAfter Base Preprocessing:")
print(df.shape)


# =====================================================
# STEP 5 — IMPUTATIONS (NO NaNs must remain)
# =====================================================

# Categorical mode fill
for c in categorical_cols:
    if c in df.columns:
        mode = df[c].mode()[0]
        df[c] = df[c].fillna(mode)
        test_df[c] = test_df[c].fillna(mode)

# =========================================================
# OUTLIER REMOVAL (TRAIN ONLY)
# And count number of rows removed per condition
# =========================================================

clean_df = df.copy()
initial_rows = len(clean_df)

outlier_info = {}

# ----- num_females ≤ 14 -----
before = len(clean_df)
clean_df = clean_df[clean_df["num_females"] <= 10]
after = len(clean_df)
outlier_info["num_females"] = before - after

# ----- Rule 1: num_males ≤ 13 -----
before = len(clean_df)
clean_df = clean_df[clean_df["num_males"] <= 10]
after_rule1 = len(clean_df)
removed_rule1 = before - after_rule1
before_rule2 = len(clean_df)

# Save results
outlier_info["num_males_threshold"] = removed_rule1
print("Removed (num_males > 20):", removed_rule1)


# ----- mainland_stay_nights ≤ 100 -----
before = len(clean_df)
clean_df = clean_df[clean_df["mainland_stay_nights"] <= 90]
after = len(clean_df)
outlier_info["mainland_stay_nights"] = before - after

# ----- island_stay_nights ≤ 21 -----
before = len(clean_df)
clean_df = clean_df[clean_df["island_stay_nights"] <= 60]
after = len(clean_df)
outlier_info["island_stay_nights"] = before - after

final_rows = len(clean_df)

# =========================================================
# PRINT OUTLIER REMOVAL SUMMARY
# =========================================================
print("\n========== OUTLIER REMOVAL SUMMARY ==========")
for col, removed in outlier_info.items():
    print(f"{col}: removed {removed} rows")

print("---------------------------------------------")
print(f"Total rows removed: {initial_rows - final_rows}")
print(f"Final Training Shape after Outlier Removal: {clean_df.shape}")
print("Test Shape (unchanged):", test_df.shape)


# =====================================================
# STEP 5 — FINAL FEATURE DEFINITIONS (No NaNs left)
# =====================================================

numeric_features = [
    "num_females",
    "num_males",
    "mainland_stay_nights",
    "island_stay_nights",
    "days_booked_before_trip_ord",
    "total_trip_days_ord"
]

binary_features = binary_cols + ["has_special_req_bin"]

categorical_features = categorical_cols

all_features = numeric_features + binary_features + categorical_features

clean_df = clean_df[all_features + [TARGET]]
test_df_final = test_df[all_features]




Original Training Shape: (12654, 25)
Original Test Shape: (5852, 24)
Rows removed due to null spend_category: 34
Training shape after removing null targets: (12620, 25)


  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
  df[c] = df[c].replace({
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["days_booked_before_trip_ord"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the ori


After Base Preprocessing:
(12620, 30)
Removed (num_males > 20): 8

num_females: removed 28 rows
num_males_threshold: removed 8 rows
mainland_stay_nights: removed 27 rows
island_stay_nights: removed 8 rows
---------------------------------------------
Total rows removed: 71
Final Training Shape after Outlier Removal: (12549, 30)
Test Shape (unchanged): (5852, 29)


In [11]:

BASE = "/content/drive/MyDrive/ML_Project/MultinomialClassification/MulticlassLR_WithClustering"

# =====================================================
# STEP 5 — TRAIN/VAL/TEST SPLITTING (READY)
# =====================================================

print("\nREADY FOR SPLITTING — No NaNs left.")
print("Train shape:", clean_df.shape)
print("Test final shape:", test_df_final.shape)

print("Next: perform splitting...")



# =====================================================
# STEP 5 — TRAIN/VAL/TEST SPLITTING
# =====================================================

# Better method:
#  → First create 80% main training, 20% held-out set
train_main, temp = train_test_split(
    clean_df, test_size=0.20, stratify=clean_df[TARGET], random_state=42
)

# Validation = 20% of main training
train_80, val_80 = train_test_split(
    train_main, test_size=0.20, stratify=train_main[TARGET], random_state=42
)

# 20% dataset
train_20, val_20 = train_test_split(
    temp, test_size=0.20, stratify=temp[TARGET], random_state=42
)

print("80% train:", train_80.shape)
print("80% val:", val_80.shape)
print("20% train:", train_20.shape)
print("20% val:", val_20.shape)



READY FOR SPLITTING — No NaNs left.
Train shape: (12549, 26)
Test final shape: (5852, 25)
Next: perform splitting...
80% train: (8031, 26)
80% val: (2008, 26)
20% train: (2008, 26)
20% val: (502, 26)




Completed → 80_skewed




Completed → 80_nonskewed
Completed → 20_skewed




Completed → 20_nonskewed


In [None]:

# =====================================================
# STEP 6 — FUNCTION TO MAKE SKEWED AND NON-SKEWED
# =====================================================
def make_skewed(df):
    # keep natural distribution
    return df.copy()

def make_nonskewed(df):
    # upsample minority classes
    major_size = df[TARGET].value_counts().max()
    frames = []
    for cls in df[TARGET].unique():
        cls_df = df[df[TARGET]==cls]
        cls_up = resample(cls_df, replace=True, n_samples=major_size, random_state=42)
        frames.append(cls_up)
    return pd.concat(frames)




In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RBF
from sklearn.model_selection import ParameterSampler
import warnings
warnings.filterwarnings("ignore")

# =====================================================
# STEP 6.5 — MANUAL BAYESIAN HYPERPARAMETER TUNING (No Optuna)
# =====================================================

def bayesian_tune_lr(X_train, y_train, X_val, y_val, n_iter=20):

    # Search space
    search_space = {
        "C": np.logspace(-4, 1, 50),     # 50 candidates
    }

    # Random initial samples
    samples = list(ParameterSampler(search_space, n_iter, random_state=42))

    results = []
    scores = []

    # Kernel for Bayesian optimizer
    kernel = Matern(nu=2.5) + WhiteKernel()

    for params in samples:

        model = LogisticRegression(
            C=params["C"],
            penalty="l2",
            multi_class="multinomial",
            solver="lbfgs",
            max_iter=500
        )

        pipe = Pipeline([
            ("prep", preprocess),
            ("lr", model)
        ])

        pipe.fit(X_train, y_train)
        pred = pipe.predict(X_val)
        score = accuracy_score(y_val, pred)

        results.append(params)
        scores.append(score)

    # Fit a Gaussian Process on (C → accuracy)
    C_vals = np.array([r["C"] for r in results]).reshape(-1, 1)
    scores = np.array(scores)

    gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
    gp.fit(C_vals, scores)

    # Predict best C in range
    C_fine = np.logspace(-4, 1, 200).reshape(-1, 1)
    preds = gp.predict(C_fine)

    best_C = C_fine[np.argmax(preds)][0]

    print("\n===== Bayesian Tuning Complete (No Optuna) =====")
    print("Best C:", best_C)

    return {"C": best_C}


In [None]:
# =====================================================
# STEP 7 — PIPELINE (OHE + SCALER + MULTINOMIAL LR)
# =====================================================
preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("bin", "passthrough", binary_features)
])

model = LogisticRegression(
    multi_class='multinomial',
    max_iter=500,
    solver='lbfgs'
)

pipe = Pipeline([
    ("prep", preprocess),
    ("lr", model)
])

In [None]:
# =====================================================
# STEP 8 — TRAINING FUNCTION
# =====================================================
def run_and_save(mode_name, train_df, val_df, folder):

    os.makedirs(folder, exist_ok=True)

    X_train = train_df[all_features]
    y_train = train_df[TARGET]

    X_val = val_df[all_features]
    y_val = val_df[TARGET]

    # Train
    pipe.fit(X_train, y_train)

    # Validation results
    val_pred = pipe.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)

    # Save val classification report
    with open(f"{folder}/val_classification_report.txt", "w") as f:
        f.write(classification_report(y_val, val_pred))

    # Train accuracy
    train_pred = pipe.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)

    # Save accuracy summary
    with open(f"{folder}/accuracy_summary.txt", "w") as f:
        f.write(f"Train accuracy: {train_acc}\n")
        f.write(f"Validation accuracy: {val_acc}\n")

    # Predict test dataset
    final_pred = pipe.predict(test_df_final)

    pd.DataFrame({
        ID_COL: test_df[ID_COL],
        TARGET: final_pred
    }).to_csv(f"{folder}/predictions_{mode_name}.csv", index=False)

    print(f"Completed → {mode_name}")

In [None]:
# =====================================================
# STEP 9 — RUN ALL 4 MODES
# =====================================================


os.makedirs(BASE, exist_ok=True)

tasks = [
    ("80_skewed",     make_skewed(train_80),     val_80),
    ("80_nonskewed",  make_nonskewed(train_80),  val_80),
    ("20_skewed",     make_skewed(train_20),     val_20),
    ("20_nonskewed",  make_nonskewed(train_20),  val_20),
]

for name, tr, va in tasks:
    folder = f"{BASE}/{name}"
    run_and_save(name, tr, va, folder)
