# Model Training (Machine Learning)

We train **six traditional ML classifiers** on each of the nine feature-selection datasets:

1. **Logistic Regression**
2. **Gradient Boosting Classifier**
3. **K-Nearest Neighbours**
4. **Random Forest Classifier**
5. **Decision Tree Classifier**
6. **Support Vector Machine**

Metrics → `Accuracy`, `Precision`, `Recall`, `F1` (precision, recall and F1 are class-weighted averages)  
Visuals → Confusion Matrices

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Root folder holding one sub-directory of train/test CSVs per feature-selection method.
FEATURES_BASE = Path("../data/processed/features1")
# Output roots: metric tables, pickled estimators and confusion-matrix figures.
PROC_BASE = Path("../data/processed/ml1")
MODEL_BASE = Path("../models/ml1")
FIG_BASE = Path("../figures/ml1")

# Create the output roots up front; existing directories are left untouched.
for p in (PROC_BASE, MODEL_BASE, FIG_BASE):
    p.mkdir(parents=True, exist_ok=True)

# Sub-directory names looked up under FEATURES_BASE, one per feature set.
METHODS = ["rfe", "skb", "fscs", "etc", "pc", "mi", "mir", "mu", "vt"]
# Seed passed to every estimator that accepts a random_state.
RANDOM_STATE = 42

## Helper Functions

In [2]:
def plot_and_save_confusion(y_true, y_pred, path, title):
    """Render a confusion-matrix heatmap and write it to ``path``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_true``.
        path: Destination image file, passed to ``Figure.savefig``.
        title: Title drawn above the heatmap.
    """
    # Sorted union of the observed labels: the same set confusion_matrix infers
    # by default, made explicit so the axes show the real class values rather
    # than positional indices.
    labels = np.union1d(y_true, y_pred).tolist()
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(5, 4))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            ax=ax,
            xticklabels=labels,
            yticklabels=labels,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(path, dpi=300, bbox_inches="tight")
    finally:
        # Close even when plotting or saving fails, so a failed call does not
        # leave an open figure registered with pyplot.
        plt.close(fig)

def compute_metrics(y_true, y_pred):
    """Return accuracy plus weighted precision, recall and F1 as a dict.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_true``.

    Returns:
        Dict with keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"F1"``, in that order.
    """
    # Averaging options shared by the three per-class scorers; zero_division=0
    # scores an undefined ratio as 0.
    averaging = {"average": "weighted", "zero_division": 0}
    weighted_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )

    scores = {"Accuracy": accuracy_score(y_true, y_pred)}
    for label, scorer in weighted_scorers:
        scores[label] = scorer(y_true, y_pred, **averaging)
    return scores

## Model Definitions

In [3]:
# Estimators keyed by display name. Insertion order is the order in which the
# training loop fits them and in which rows appear in each results CSV.
MODELS = {
    # Iteration cap raised to 1000.
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # Constructed with library defaults; no random_state is passed.
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    # probability=True keeps predict_proba available on the saved model.
    "SVM": SVC(random_state=RANDOM_STATE, probability=True),
}

## Train All Six Models Across Nine Feature Sets

In [4]:
# For every feature-selection dataset: fit each estimator in MODELS, score it on
# the held-out split, and persist its confusion matrix, pickle and metrics row.
for method in METHODS:
    banner = "=" * 60
    print("\n" + banner)
    print(f"▶ Training ML models for: {method.upper()}")
    print(banner)

    in_dir = FEATURES_BASE / method
    train_path, test_path = in_dir / "train.csv", in_dir / "test.csv"
    if not (train_path.exists() and test_path.exists()):
        print(f"⚠️ Missing train/test for {method}, skipping.")
        continue

    # Rows without a target value are dropped before fitting or scoring.
    train_df = pd.read_csv(train_path).dropna(subset=["DepressionEncoded"])
    test_df = pd.read_csv(test_path).dropna(subset=["DepressionEncoded"])

    # Split each frame into the feature matrix and the integer-cast target.
    X_train, y_train = (
        train_df.drop(columns=["DepressionEncoded"]),
        train_df["DepressionEncoded"].astype(int),
    )
    X_test, y_test = (
        test_df.drop(columns=["DepressionEncoded"]),
        test_df["DepressionEncoded"].astype(int),
    )

    # Per-method output folders for metrics, pickled models and figures.
    proc_out, model_out, fig_out = (
        base / method for base in (PROC_BASE, MODEL_BASE, FIG_BASE)
    )
    for p in (proc_out, model_out, fig_out):
        p.mkdir(parents=True, exist_ok=True)

    results = []
    for name, model in MODELS.items():
        print(f" - Training {name} ...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        metrics = compute_metrics(y_test, y_pred)
        results.append({"Model": name, **metrics})

        # File-name stem shared by the figure and the pickle.
        stem = name.lower().replace(' ', '_')
        plot_and_save_confusion(
            y_test,
            y_pred,
            fig_out / f"{stem}_confusion.png",
            f"{name} Confusion ({method.upper()})",
        )
        joblib.dump(model, model_out / f"{stem}.pkl")

    results_csv = proc_out / "results_traditional_ml.csv"
    res_df = pd.DataFrame(results)
    res_df.to_csv(results_csv, index=False)
    print(f"✅ Saved results for {method.upper()} to {results_csv}")


▶ Training ML models for: RFE
 - Training Logistic Regression ...
 - Training Gradient Boosting ...
 - Training KNN ...
 - Training Random Forest ...
 - Training Decision Tree ...
 - Training SVM ...
✅ Saved results for RFE to ..\data\processed\ml1\rfe\results_traditional_ml.csv

▶ Training ML models for: SKB
 - Training Logistic Regression ...
 - Training Gradient Boosting ...
 - Training KNN ...
 - Training Random Forest ...
 - Training Decision Tree ...
 - Training SVM ...
✅ Saved results for SKB to ..\data\processed\ml1\skb\results_traditional_ml.csv

▶ Training ML models for: FSCS
 - Training Logistic Regression ...
 - Training Gradient Boosting ...
 - Training KNN ...
 - Training Random Forest ...
 - Training Decision Tree ...
 - Training SVM ...
✅ Saved results for FSCS to ..\data\processed\ml1\fscs\results_traditional_ml.csv

▶ Training ML models for: ETC
 - Training Logistic Regression ...
 - Training Gradient Boosting ...
 - Training KNN ...
 - Training Random Forest ...
 - 

## Summary of Model Performance Across All Feature Sets

In [5]:
# Stack the per-feature-set metric tables into one ranked summary and save it.
all_results = []

for method in METHODS:
    res_path = PROC_BASE / method / "results_traditional_ml.csv"
    if not res_path.exists():
        print(f"⚠️ Missing results for {method.upper()}")
        continue
    # Tag every row with the feature set it came from before stacking.
    tagged = pd.read_csv(res_path).assign(**{"Feature Set": method.upper()})
    all_results.append(tagged)

if not all_results:
    print("⚠️ No model results found. Please run training first.")
else:
    # Feature sets in ascending name order; within each, highest accuracy first.
    combined_results = pd.concat(all_results, ignore_index=True).sort_values(
        by=["Feature Set", "Accuracy"], ascending=[True, False]
    )

    # Lift the row/column display limits so the whole table is rendered.
    pd.options.display.max_rows = None
    pd.options.display.max_columns = None
    display(combined_results)

    summary_out = PROC_BASE / "all_model_results_summary_v2.csv"
    combined_results.to_csv(summary_out, index=False)
    print(f"✅ Combined model summary saved → {summary_out}")

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Feature Set
18,Logistic Regression,0.837037,0.836223,0.837037,0.835266,ETC
23,SVM,0.817284,0.817255,0.817284,0.816939,ETC
19,Gradient Boosting,0.782716,0.783921,0.782716,0.783217,ETC
21,Random Forest,0.777778,0.780265,0.777778,0.778559,ETC
20,KNN,0.718519,0.720712,0.718519,0.715652,ETC
22,Decision Tree,0.698765,0.69397,0.698765,0.694147,ETC
12,Logistic Regression,0.812346,0.815919,0.812346,0.812285,FSCS
17,SVM,0.807407,0.812084,0.807407,0.808148,FSCS
15,Random Forest,0.797531,0.804078,0.797531,0.797543,FSCS
13,Gradient Boosting,0.767901,0.771343,0.767901,0.767184,FSCS


✅ Combined model summary saved → ..\data\processed\ml1\all_model_results_summary_v2.csv
