# Model Training (Machine Learning)

We train six traditional ML classifiers on nine feature-selection datasets:

1. **Recursive Feature Elimination**
2. **Select K Best**
3. **Fisher Score Chi-Square**
4. **Extra Trees Classifier**
5. **Pearson Correlation**
6. **Mutual Information**
7. **Mutual Info Regression**
8. **Manual Uniqueness**
9. **Variance Threshold**

Each dataset trains:
- Logistic Regression  
- Gradient Boosting Classifier  
- K-Nearest Neighbours  
- Random Forest Classifier  
- Decision Tree Classifier  
- Support Vector Machine  

Metrics → `Accuracy`, `Precision`, `Recall`, `F1`  
Visuals → Confusion Matrices  
Results, models, and figures are saved in their respective folders.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Shared root of the processed-data tree (relative to the working directory).
_PROCESSED_ROOT = Path("..") / "data" / "processed"

# Input: one sub-folder per feature-selection method holding train.csv / test.csv.
DATA_DIR = _PROCESSED_ROOT / "features"
# Output: per-method metric tables and the combined summary CSV.
PROC_DIR = _PROCESSED_ROOT / "ml"
# Output: serialized models, one sub-folder per method.
MODEL_DIR_BASE = Path("..") / "models" / "ml"
# Output: confusion-matrix figures, one sub-folder per method.
FIG_DIR_BASE = Path("..") / "figures" / "ml"

# Folder keys of the feature-selection methods. They appear to abbreviate, in
# order, the nine methods listed in the introduction (RFE, Select K Best, ...).
METHODS = [
    "rfe",
    "skb",
    "fscs",
    "etc",
    "pc",
    "mi",
    "mir",
    "mu",
    "vt",
]

## Helper Functions

These utilities:
- Train each model  
- Compute metrics (Accuracy, Precision, Recall, F1)  
- Plot and save Confusion Matrices  
- Save trained models and results

In [2]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name, method):
    """Fit ``model``, score it on the test split and save its confusion matrix.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used for prediction.
        y_test: Held-out labels the predictions are scored against.
        model_name: Display name. Used in the plot title, in the result row
            and (lower-cased, spaces replaced by underscores) in the figure
            file name.
        method: Feature-selection key. Selects the sub-folder of
            ``FIG_DIR_BASE`` and is shown upper-cased in the plot title.

    Returns:
        dict with the keys ``Model``, ``Accuracy``, ``Precision``, ``Recall``
        and ``F1``. Precision, recall and F1 use ``average="weighted"``.

    Side effects:
        Writes ``<FIG_DIR_BASE>/<method>/<model_name>_confusion.png``,
        creating the folder if necessary.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 (instead of emitting a warning) when a class
    # has an undefined precision/recall, e.g. because it was never predicted.
    prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        # The separator is a real em dash; the previous literal was mis-encoded
        # and rendered as garbage characters in the saved figures.
        ax.set_title(f"{model_name} — {method.upper()}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        fig.tight_layout()

        fig_dir = FIG_DIR_BASE / method
        fig_dir.mkdir(parents=True, exist_ok=True)
        fig_path = fig_dir / f"{model_name.lower().replace(' ', '_')}_confusion.png"
        fig.savefig(fig_path, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving fails, so that
        # figures do not pile up across the many model/method combinations.
        plt.close(fig)

    return {"Model": model_name, "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1}


def save_model(model, model_name, method):
    """Serialize ``model`` with joblib into the folder for ``method``.

    The file is written to ``MODEL_DIR_BASE / method`` (created on demand) and
    named after ``model_name`` in lower case with spaces replaced by
    underscores, plus the ``.pkl`` suffix.

    Args:
        model: Object to serialize.
        model_name: Display name the file name is derived from.
        method: Feature-selection key selecting the target sub-folder.
    """
    target_dir = MODEL_DIR_BASE / method
    target_dir.mkdir(parents=True, exist_ok=True)

    file_stem = model_name.lower().replace(" ", "_")
    joblib.dump(model, target_dir / f"{file_stem}.pkl")

## Model Definitions

We define six traditional ML models to train on each feature set.

In [3]:
# Display name -> unfitted estimator. The names double as labels in log lines
# and result rows and as the basis for model/figure file names.
# NOTE: no random_state is passed to any estimator, so the randomized ones may
# give slightly different results from run to run.
MODELS = {
    label: estimator_cls(**params)
    for label, estimator_cls, params in (
        ("Logistic Regression", LogisticRegression, {"max_iter": 500}),
        ("Gradient Boosting", GradientBoostingClassifier, {}),
        ("KNN", KNeighborsClassifier, {}),
        ("Random Forest", RandomForestClassifier, {}),
        ("Decision Tree", DecisionTreeClassifier, {}),
        ("SVM", SVC, {}),
    )
}

## Train All Six Models Across Nine Feature Sets

For each feature-selection method:
1. Load train/test CSV files  
2. Train six models  
3. Compute metrics and save results + models + confusion matrices

In [4]:
# Train, score and persist every model in MODELS once per feature-selection
# method, then write one metrics table per method.
for method in METHODS:
    print(f"\n=== Training models for {method.upper()} ===")

    train_path = DATA_DIR / method / "train.csv"
    test_path = DATA_DIR / method / "test.csv"

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Drop rows whose target label is missing; missing values in feature
    # columns are not touched here.
    train_df = train_df.dropna(subset=["DepressionEncoded"])
    test_df = test_df.dropna(subset=["DepressionEncoded"])

    # Split each frame into features (all other columns) and the integer target.
    X_train = train_df.drop(columns=["DepressionEncoded"])
    y_train = train_df["DepressionEncoded"].astype(int)
    X_test = test_df.drop(columns=["DepressionEncoded"])
    y_test = test_df["DepressionEncoded"].astype(int)

    results = []

    # The same estimator instances are refitted for every method; each one is
    # saved immediately after fitting, so the file on disk reflects this method.
    for name, model in MODELS.items():
        # Message emoji restored from its mis-encoded form.
        print(f"🧠 Training {name}")
        metrics = train_and_evaluate(model, X_train, y_train, X_test, y_test, name, method)
        results.append(metrics)
        save_model(model, name, method)

    # One row per model, written next to the other per-method outputs.
    res_df = pd.DataFrame(results)
    out_dir = PROC_DIR / method
    out_dir.mkdir(parents=True, exist_ok=True)
    res_df.to_csv(out_dir / "results_traditional_ml.csv", index=False)

    print(f"✅ Saved metrics → {out_dir / 'results_traditional_ml.csv'}")


=== Training models for RFE ===
üß† Training Logistic Regression
üß† Training Gradient Boosting
üß† Training KNN
üß† Training Random Forest
üß† Training Decision Tree
üß† Training SVM
‚úÖ Saved metrics ‚Üí ..\data\processed\ml\rfe\results_traditional_ml.csv

=== Training models for SKB ===
üß† Training Logistic Regression
üß† Training Gradient Boosting
üß† Training KNN
üß† Training Random Forest
üß† Training Decision Tree
üß† Training SVM
‚úÖ Saved metrics ‚Üí ..\data\processed\ml\skb\results_traditional_ml.csv

=== Training models for FSCS ===
üß† Training Logistic Regression
üß† Training Gradient Boosting
üß† Training KNN
üß† Training Random Forest
üß† Training Decision Tree
üß† Training SVM
‚úÖ Saved metrics ‚Üí ..\data\processed\ml\fscs\results_traditional_ml.csv

=== Training models for ETC ===
üß† Training Logistic Regression
üß† Training Gradient Boosting
üß† Training KNN
üß† Training Random Forest
üß† Training Decision Tree
üß† Training SVM
‚úÖ Saved me

## 🧮 Summary of Model Performance Across All Feature Sets

In [5]:
# Collect the per-method metric tables written by the training step and merge
# them into one summary table.
all_results = []

for method in METHODS:
    res_path = PROC_DIR / method / "results_traditional_ml.csv"
    if res_path.exists():
        df = pd.read_csv(res_path)
        # Tag every row with the feature set it came from before merging.
        df["Feature Set"] = method.upper()
        all_results.append(df)
    else:
        # A missing file is reported but does not stop the summary.
        print(f"⚠️ Missing results for {method.upper()}")

if all_results:
    combined_results = pd.concat(all_results, ignore_index=True)
    # Group by feature set; within each set, best accuracy first.
    combined_results = combined_results.sort_values(["Feature Set", "Accuracy"], ascending=[True, False])

    # Lift pandas' display limits so the whole table is shown. These options
    # stay in effect for the rest of the session.
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    # display() is expected to come from the IPython/Jupyter environment.
    display(combined_results)

    # Save the full summary for documentation
    summary_out = PROC_DIR / "all_model_results_summary.csv"
    combined_results.to_csv(summary_out, index=False)
    print(f"✅ Combined model summary saved → {summary_out}")
else:
    print("⚠️ No model results found. Please run training first.")

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Feature Set
18,Logistic Regression,0.8,0.798754,0.8,0.79847,ETC
23,SVM,0.790123,0.790873,0.790123,0.789303,ETC
19,Gradient Boosting,0.765432,0.767866,0.765432,0.766305,ETC
21,Random Forest,0.750617,0.755275,0.750617,0.751077,ETC
20,KNN,0.698765,0.700839,0.698765,0.695469,ETC
22,Decision Tree,0.649383,0.658683,0.649383,0.64953,ETC
12,Logistic Regression,0.767901,0.771709,0.767901,0.768826,FSCS
15,Random Forest,0.762963,0.770586,0.762963,0.764045,FSCS
13,Gradient Boosting,0.760494,0.766566,0.760494,0.761301,FSCS
17,SVM,0.760494,0.764876,0.760494,0.761422,FSCS


‚úÖ Combined model summary saved ‚Üí ..\data\processed\ml\all_model_results_summary.csv
