# Neural Network with SMOTE and Bayesian Optimization

This notebook implements a Neural Network model with SMOTE for class balancing and Bayesian Optimization (Optuna) for hyperparameter tuning. It includes:
1. Data Loading and Preprocessing
2. Feature Engineering (K-Means)
3. SMOTE (Synthetic Minority Over-sampling Technique)
4. Bayesian Hyperparameter Optimization
5. Model Training
6. Model Saving
7. Model Loading and Prediction on Test Data

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import optuna
import tensorflow as pd_tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# SMOTE comes from the optional `imblearn` package. Record whether the import
# worked so the class-balancing step can pick manual upsampling otherwise.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    HAS_SMOTE = False
    print("imblearn not found. Using manual upsampling instead of SMOTE.")
else:
    HAS_SMOTE = True

# =====================================================
# CONFIGURATION
# =====================================================
# Input CSV files read by load_data(); relative to the working directory.
TRAIN_PATH = "../../Dataset/train.csv"
TEST_PATH = "../../Dataset/test.csv"
# Output artifacts: the trained Keras model, the fitted ColumnTransformer
# (dumped with joblib) and the CSV of test-set predictions.
MODEL_SAVE_PATH = "nn_smote_model.keras"
PIPELINE_SAVE_PATH = "nn_pipeline.pkl"
SUBMISSION_PATH = "submission_nn.csv"
# Column names: the label to predict and the row identifier that is copied
# into the submission file.
TARGET = "spend_category"
ID_COL = "trip_id"
# Number of K-Means clusters behind the engineered `kmeans_cluster` feature.
N_CLUSTERS = 6
# Seed passed to K-Means, SMOTE / manual upsampling and train_test_split.
RANDOM_STATE = 42

In [None]:
# =====================================================
# DATA LOADING AND PREPROCESSING FUNCTIONS
# =====================================================

def load_data(train_path=None, test_path=None):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Location of the training CSV. Defaults to ``TRAIN_PATH``.
        test_path: Location of the test CSV. Defaults to ``TEST_PATH``.

    Returns:
        A ``(train_df, test_df)`` tuple of ``pandas.DataFrame`` objects.
    """
    # Resolve the defaults at call time (not at definition time) so the
    # module-level constants can still be edited after this cell has run.
    if train_path is None:
        train_path = TRAIN_PATH
    if test_path is None:
        test_path = TEST_PATH
    return pd.read_csv(train_path), pd.read_csv(test_path)

def _normalise_token(value):
    """Trim and lower-case *value*; map missing-value spellings to NaN."""
    token = str(value).strip().lower()
    return np.nan if token in ("nan", "none", "null", "") else token


def _add_ordinal_columns(df, column, mapping):
    """Add ``<column>_clean`` and ``<column>_ord`` to *df* in place.

    Buckets that are missing or absent from *mapping* get the most frequent
    rank of the frame, or 1 when no value could be mapped at all.
    """
    cleaned = df[column].apply(_normalise_token)
    ranks = cleaned.map(mapping)
    modes = ranks.mode()
    fallback = modes.iloc[0] if not modes.empty else 1
    df[f"{column}_clean"] = cleaned
    df[f"{column}_ord"] = ranks.fillna(fallback).astype(int)


def preprocess_data(df, is_train=True):
    """Clean a raw trips frame and derive the encoded feature columns.

    Steps, in order: trim string columns, encode yes/no flags as 0/1, coerce
    count columns to int, rank-encode the two bucketed duration columns, flag
    special requirements, drop unlabeled/outlier rows (training only) and
    impute missing categoricals with the most frequent value.

    Args:
        df: Raw frame as read from CSV. It is copied, never modified.
        is_train: When true and the ``TARGET`` column is present, rows with a
            missing target or out-of-range counts are removed.

    Returns:
        A new DataFrame with a contiguous index. All imputation statistics
        (modes) are computed on the frame passed in, so train and test frames
        are imputed independently of each other.
    """
    df = df.copy()

    # 1. Clean strings. Missing cells must stay missing: a blanket
    #    astype(str) would turn NaN/None into the literal text "nan"/"None",
    #    which then survives as a bogus category and defeats step 6.
    for c in df.columns:
        col = df[c]
        if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
            cleaned = col.astype(str).str.strip().str.rstrip(",")
            df[c] = cleaned.where(col.notna(), np.nan)

    # 2. Binary columns: "yes"/"no" (any case) -> 1/0; numeric-looking text is
    #    kept as its number; everything else, including missing, becomes 0.
    binary_cols = [
        "is_first_visit", "intl_transport_included", "accomodation_included",
        "food_included", "domestic_transport_included", "sightseeing_included",
        "guide_included", "insurance_included",
    ]
    for c in binary_cols:
        if c in df.columns:
            flags = df[c].astype(str).str.strip().str.lower()
            # String-to-string replacement avoids pandas' deprecated silent
            # downcasting of object columns inside replace().
            flags = flags.replace({"yes": "1", "no": "0"})
            df[c] = pd.to_numeric(flags, errors="coerce").fillna(0).astype(int)

    # 3. Numeric count columns: unparsable or missing values count as 0.
    numeric_count_cols = [
        "num_females", "num_males", "mainland_stay_nights", "island_stay_nights",
    ]
    for c in numeric_count_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    # 4. Ordinal encoding of the bucketed duration columns.
    if "days_booked_before_trip" in df.columns:
        _add_ordinal_columns(
            df,
            "days_booked_before_trip",
            {"1-7": 1, "8-14": 2, "15-30": 3, "31-60": 4, "61-90": 5, "90+": 6},
        )
    if "total_trip_days" in df.columns:
        _add_ordinal_columns(
            df,
            "total_trip_days",
            {"1-6": 1, "7-14": 2, "15-30": 3, "30+": 4},
        )

    if "has_special_requirements" in df.columns:
        # str() first so real NaN cells are handled as the text "nan".
        df["has_special_req_bin"] = df["has_special_requirements"].apply(
            lambda x: 0 if str(x).strip().lower() in ("none", "", "nan") else 1
        )

    # 5. Outlier removal (train only). One combined mask and a single
    #    reset_index afterwards keep the index contiguous and return a fresh
    #    frame rather than a slice of the copy.
    if is_train and TARGET in df.columns:
        keep = df[TARGET].notnull()
        upper_bounds = {
            "num_females": 10,
            "num_males": 10,
            "mainland_stay_nights": 90,
            "island_stay_nights": 60,
        }
        for c, upper in upper_bounds.items():
            if c in df.columns:  # same tolerance for absent columns as above
                keep &= df[c] <= upper
        df = df[keep].reset_index(drop=True)

    # 6. Fill categorical missing values with the column mode.
    categorical_cols = [
        "country", "age_group", "travel_companions", "main_activity",
        "visit_purpose", "tour_type", "info_source", "arrival_weather",
    ]
    for c in categorical_cols:
        if c in df.columns:
            modes = df[c].mode()
            df[c] = df[c].fillna(modes.iloc[0] if not modes.empty else "Unknown")

    return df

In [None]:
# =====================================================
# FEATURE ENGINEERING (K-MEANS)
# =====================================================

def add_kmeans_features(train_df, test_df, n_clusters=6):
    """Append a ``kmeans_cluster`` column to the train and test frames.

    The numeric columns are standardised with statistics from the training
    frame, K-Means is fitted on the scaled training rows, and both frames are
    labelled with that single fitted model.

    Args:
        train_df: Preprocessed training frame containing the numeric columns.
        test_df: Preprocessed test frame containing the same columns.
        n_clusters: Number of K-Means clusters.

    Returns:
        ``(train_df, test_df)`` as new frames carrying the extra column; the
        frames passed in are left unmodified.
    """
    numeric_features = [
        "num_females", "num_males", "mainland_stay_nights", "island_stay_nights",
        "days_booked_before_trip_ord", "total_trip_days_ord"
    ]

    # Work on copies: writing a column into a frame that is itself a filtered
    # slice raises SettingWithCopyWarning and silently mutates caller state.
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Fit the scaler on train only so test rows never influence the centroids.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_df[numeric_features])
    X_test_scaled = scaler.transform(test_df[numeric_features])

    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, n_init=10)
    train_df["kmeans_cluster"] = kmeans.fit_predict(X_train_scaled)
    test_df["kmeans_cluster"] = kmeans.predict(X_test_scaled)

    return train_df, test_df

In [None]:
# =====================================================
# DATA PREPARATION & SMOTE
# =====================================================

# --- Load both splits, clean them and attach the cluster feature ---
print("Loading and preprocessing data...")
train_raw, test_raw = load_data()
train_clean, test_clean = (
    preprocess_data(train_raw, is_train=True),
    preprocess_data(test_raw, is_train=False),
)

print(f"Adding K-Means features (K={N_CLUSTERS})...")
train_clean, test_clean = add_kmeans_features(
    train_clean, test_clean, n_clusters=N_CLUSTERS
)

# --- Feature groups: one list per ColumnTransformer branch ---
numeric_features = [
    "num_females",
    "num_males",
    "mainland_stay_nights",
    "island_stay_nights",
    "days_booked_before_trip_ord",
    "total_trip_days_ord",
]
binary_features = [
    "is_first_visit",
    "intl_transport_included",
    "accomodation_included",
    "food_included",
    "domestic_transport_included",
    "sightseeing_included",
    "guide_included",
    "insurance_included",
    "has_special_req_bin",
]
categorical_features = [
    "country",
    "age_group",
    "travel_companions",
    "main_activity",
    "visit_purpose",
    "tour_type",
    "info_source",
    "arrival_weather",
    "kmeans_cluster",
]

all_features = [*numeric_features, *binary_features, *categorical_features]

# --- Design matrix and labels ---
X, y = train_clean[all_features], train_clean[TARGET]

# Scale numerics, one-hot encode categoricals (categories unseen at fit time
# encode as all zeros), and pass the 0/1 flags through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_features,
        ),
        ("bin", "passthrough", binary_features),
    ]
)

print("Fitting preprocessor...")
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test_clean[all_features])

# --- Balance the classes: SMOTE when available, random oversampling if not ---
print("Applying Class Balancing...")
if not HAS_SMOTE:
    print("Using Manual Upsampling...")
    # Glue features and label together so each class can be resampled row-wise.
    train_data = pd.DataFrame(X_processed)
    train_data[TARGET] = y.values

    # Every class is drawn (with replacement) up to the largest class size.
    major_class_size = train_data[TARGET].value_counts().max()
    upsampled_dfs = [
        resample(
            train_data[train_data[TARGET] == cls],
            replace=True,
            n_samples=major_class_size,
            random_state=RANDOM_STATE,
        )
        for cls in train_data[TARGET].unique()
    ]

    train_upsampled = pd.concat(upsampled_dfs)
    X_resampled = train_upsampled.drop(columns=[TARGET]).values
    y_resampled = train_upsampled[TARGET].values
else:
    print("Using SMOTE...")
    smote = SMOTE(random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote.fit_resample(X_processed, y)

print(f"Resampled shape: {X_resampled.shape}")

In [None]:
# =====================================================
# BAYESIAN OPTIMIZATION (OPTUNA)
# =====================================================

# Split for Validation during Optimization
# The hold-out fold is taken from the ORIGINAL (pre-balancing) rows and only
# the training fold is balanced afterwards. Splitting the already balanced
# data would place SMOTE neighbours -- or, with manual upsampling, exact
# duplicates -- of validation rows into the training fold and inflate every
# trial's validation accuracy.
_y_all = np.asarray(y)
_X_fit, X_val_opt, _y_fit, y_val_opt = train_test_split(
    X_processed, _y_all, test_size=0.2, random_state=RANDOM_STATE, stratify=_y_all
)

if HAS_SMOTE:
    X_train_opt, y_train_opt = SMOTE(random_state=RANDOM_STATE).fit_resample(
        _X_fit, _y_fit
    )
else:
    # Random oversampling of each class up to the size of the largest one.
    _classes, _counts = np.unique(_y_fit, return_counts=True)
    _X_parts, _y_parts = [], []
    for _cls in _classes:
        _X_cls = resample(
            _X_fit[_y_fit == _cls],
            replace=True,
            n_samples=int(_counts.max()),
            random_state=RANDOM_STATE,
        )
        _X_parts.append(_X_cls)
        _y_parts.append(np.full(len(_X_cls), _cls, dtype=_y_fit.dtype))
    X_train_opt = np.vstack(_X_parts)
    y_train_opt = np.concatenate(_y_parts)

def objective(trial):
    """Train one candidate network and return its validation accuracy.

    Args:
        trial: The Optuna trial used to sample the hyperparameters
            (``hidden_units``, ``dropout``, ``lr``, ``batch_size``,
            ``activation``, ``epochs``).

    Returns:
        Accuracy of the trained model on ``(X_val_opt, y_val_opt)``; the
        study maximises this value.
    """
    # Each trial builds a fresh model. Without this call the backend keeps
    # the state of every earlier model alive and memory grows per trial.
    keras.backend.clear_session()

    # Hyperparameters
    hidden_units = trial.suggest_categorical("hidden_units", [32, 64, 128])
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])
    activation = trial.suggest_categorical("activation", ["relu", "tanh"])
    epochs = trial.suggest_int("epochs", 10, 30)

    input_dim = X_train_opt.shape[1]
    num_classes = len(np.unique(y_resampled))

    # Single hidden layer with dropout and a softmax head.
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(hidden_units, activation=activation),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation="softmax")
    ])

    # NOTE(review): the sparse loss with a `num_classes`-wide softmax assumes
    # the labels are the integers 0..num_classes-1 -- confirm for the target.
    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    model.fit(
        X_train_opt, y_train_opt,
        validation_data=(X_val_opt, y_val_opt),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0
    )

    # Score with hard predictions so the metric is exactly accuracy_score.
    val_pred = np.argmax(model.predict(X_val_opt, verbose=0), axis=1)
    return accuracy_score(y_val_opt, val_pred)

print("Starting Bayesian Optimization...")
# Seed the TPE sampler so the sequence of suggested hyperparameters is
# repeatable, matching the other RANDOM_STATE-seeded steps. Keras weight
# initialisation and batch shuffling are not seeded here, so the scores of
# individual trials can still vary between runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=10)  # Reduced trials for speed in this template

print("Best Params:", study.best_params)

In [None]:
# =====================================================
# TRAIN FINAL MODEL
# =====================================================

# Winning configuration from the Optuna study.
best_params = study.best_params
input_dim = X_resampled.shape[1]
num_classes = len(np.unique(y_resampled))

# Same topology as the search space: input -> dense -> dropout -> softmax.
final_model = keras.Sequential()
final_model.add(layers.Input(shape=(input_dim,)))
final_model.add(
    layers.Dense(best_params["hidden_units"], activation=best_params["activation"])
)
final_model.add(layers.Dropout(best_params["dropout"]))
final_model.add(layers.Dense(num_classes, activation="softmax"))

final_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=best_params["lr"]),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# No validation split here: every balanced row is used for the final fit.
print("Training final model on full resampled dataset...")
final_model.fit(
    x=X_resampled,
    y=y_resampled,
    batch_size=best_params["batch_size"],
    epochs=best_params["epochs"],
    verbose=1,
)
print("Training complete.")

In [None]:
# =====================================================
# SAVE MODEL AND PIPELINE
# =====================================================
# Persist the trained network in the native Keras format.
print(f"Saving model to {MODEL_SAVE_PATH}...")
final_model.save(MODEL_SAVE_PATH)
# Persist the fitted ColumnTransformer (scaling + one-hot encoding).
# NOTE(review): the StandardScaler and KMeans objects fitted inside
# add_kmeans_features() are local to that function and are not saved, so
# these two artifacts alone cannot rebuild `kmeans_cluster` for new data.
print(f"Saving pipeline to {PIPELINE_SAVE_PATH}...")
joblib.dump(preprocessor, PIPELINE_SAVE_PATH)
print("Saved successfully.")

In [None]:
# =====================================================
# LOAD MODEL AND PREDICT
# =====================================================
print("Loading model and pipeline...")
loaded_model = keras.models.load_model(MODEL_SAVE_PATH)
# joblib.load unpickles the file: only load artifacts written by this
# notebook, never files from an untrusted source.
loaded_preprocessor = joblib.load(PIPELINE_SAVE_PATH)

print("Predicting on test set...")
# Transform with the RELOADED preprocessor rather than reusing the matrix
# computed before saving, so this cell really exercises the persisted
# artifacts end to end.
X_test_processed = loaded_preprocessor.transform(test_clean[all_features])

test_probs = loaded_model.predict(X_test_processed)
# NOTE(review): argmax yields class indices 0..K-1; writing them out as the
# target assumes the original labels use that same encoding -- confirm.
test_preds = np.argmax(test_probs, axis=1)

# Save Predictions
submission = pd.DataFrame({
    ID_COL: test_clean[ID_COL],
    TARGET: test_preds
})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Predictions saved to {SUBMISSION_PATH}")

print(submission.head())