In [None]:
pip install scikit-learn pandas numpy joblib

In [None]:
# Trains calibrated Linear SVM models (class-balanced) for
# VitalityCategory and StressCategory.
# Uses 8 raw bands + 16 indices + 8 log1p bands.
# Prints full holdout results and generates thesis-ready plots.

import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, Tuple, List
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import (
    accuracy_score, f1_score, balanced_accuracy_score,
    classification_report, confusion_matrix
)
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from joblib import dump

In [None]:
# ======================
# CONFIG
# ======================
# Input table (.xlsx/.xls/.csv all OK).
DATA_PATH = "/content/Data_To_Train_3_vitality.xlsx"
# Fraction of rows held out for evaluation (passed to train_test_split).
TEST_SIZE = 0.25
# Seed shared by the holdout split and the LinearSVC base estimator.
RANDOM_STATE = 42

# Cross-validation knobs.
# NOTE(review): none of these three is referenced by the code visible in this notebook.
DO_CV = False
CV_SPLITS = 3
CV_REPEATS = 5

# === Band mapping (your sensor) ===
# Column names of the eight raw bands, B1..B8.
BANDS = ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8"]
# Roles of the individual bands used by the spectral indices.
BLUE = "B2"
GREEN1 = "B3"
GREEN = "B4"
RED = "B6"
RE = "B7"
NIR = "B8"
# Label columns; one model is trained per target.
TARGETS = ["VitalityCategory", "StressCategory"]


# ======================
# Helpers
# ======================
def safe_nd(a, b, eps=1e-9):
    """Normalized difference (a - b) / (a + b + eps); eps keeps a zero sum from dividing by zero."""
    difference = a - b
    total = a + b + eps
    return difference / total
def safe_ratio(a, b, eps=1e-9):
    """Ratio a / (b + eps); eps keeps a zero denominator from dividing by zero."""
    denominator = b + eps
    return a / denominator
def safe_sqrt(x):
    """Square root of x with negative inputs clamped to 0 first."""
    non_negative = np.maximum(x, 0.0)
    return np.sqrt(non_negative)

def load_table(path: str) -> pd.DataFrame:
    """Read a table from disk, choosing the pandas reader by file extension.

    Files ending in .xlsx or .xls (case-insensitive) go through
    pandas.read_excel; every other extension is read with pandas.read_csv.
    """
    suffix = Path(path).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(path)
    return pd.read_csv(path)

In [None]:
# ======================
# Feature computation
# ======================
def compute_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature table: 8 raw bands + 16 indices + 8 logs.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column listed in BANDS.

    Returns
    -------
    pd.DataFrame
        Columns are the raw bands, the 16 index/ratio columns and one
        ``log1p_<band>`` column per band. Rows with NaN or +/-inf in any
        feature are dropped, so the returned index can be a subset of
        ``df.index``; callers must realign labels on it.

    Notes
    -----
    Band values are coerced with ``pd.to_numeric(errors="coerce")`` exactly
    once and that coerced frame feeds the raw, index and log features alike.
    A non-numeric cell therefore becomes NaN and its row is dropped, instead
    of raising from a separate ``astype(float)`` on the uncoerced column.
    """
    X_raw = df[BANDS].apply(pd.to_numeric, errors="coerce").copy()
    # Float view of the coerced bands used for all derived features; X_raw itself
    # keeps whatever numeric dtype the coercion produced.
    bands_f = X_raw.astype(float)
    B2, B3, B4, B6, B7, B8 = [bands_f[b] for b in [BLUE, GREEN1, GREEN, RED, RE, NIR]]

    X_idx = pd.DataFrame(index=df.index)
    X_idx["NDVI"] = safe_nd(B8, B6)
    X_idx["EVI"] = 2.5*(B8 - B6)/(B8 + 6*B6 - 7.5*B2 + 1.0 + 1e-9)
    X_idx["NDRE"] = safe_nd(B8, B7)
    X_idx["CIre"] = safe_ratio(B8, B7) - 1
    X_idx["GNDVI"] = safe_nd(B8, B4)
    X_idx["SAVI"] = (B8 - B6)*(1+0.5)/(B8 + B6 + 0.5 + 1e-9)
    # safe_sqrt clamps a negative discriminant to 0 rather than producing NaN.
    X_idx["MSAVI2"] = (2*B8 + 1 - safe_sqrt((2*B8 + 1)**2 - 8*(B8 - B6))) / 2
    X_idx["RDVI"] = (B8 - B6) / (safe_sqrt(B8 + B6) + 1e-9)
    X_idx["PRI"] = safe_nd(B3, B4)
    X_idx["VARI"] = (B4 - B6)/(B4 + B6 - B2 + 1e-9)
    X_idx["CIgreen"] = safe_ratio(B8, B4) - 1
    X_idx["MTCI"] = (B8 - B7)/(B7 - B6 + 1e-9)
    X_idx["NDWI"] = safe_nd(B4, B8)
    X_idx["ratio_B7_B6"] = safe_ratio(B7, B6)
    X_idx["ratio_B6_B8"] = safe_ratio(B6, B8)
    X_idx["ratio_B2_B6"] = safe_ratio(B2, B6)

    X_logs = pd.DataFrame(index=df.index)
    for b in BANDS:
        # Negative readings are clipped to 0 so log1p stays defined.
        X_logs[f"log1p_{b}"] = np.log1p(bands_f[b].clip(lower=0))

    X = pd.concat([X_raw, X_idx, X_logs], axis=1)
    # Treat infinities like missing values, then drop any incomplete row.
    X = X.replace([np.inf, -np.inf], np.nan).dropna()
    return X


# ======================
# Model + Evaluation
# ======================
@dataclass
class ModelResult:
    """Holdout evaluation summary for the model trained on one target column."""
    target: str  # name of the target column the model was trained for
    accuracy: float  # accuracy_score on the holdout set
    macro_f1: float  # macro-averaged f1_score on the holdout set
    bal_acc: float  # balanced_accuracy_score on the holdout set
    report: str  # text output of classification_report
    conf_mat: np.ndarray  # confusion matrix (rows = true, cols = predicted)
    classes: List[str]  # class names in LabelEncoder order
    features: List[str]  # feature column names the model was fit on

def make_model() -> Pipeline:
    """Build the unfitted pipeline: StandardScaler followed by a calibrated LinearSVC.

    The SVM uses C=1.0 with balanced class weights and the shared
    RANDOM_STATE seed; calibration is sigmoid with cv=3. The step names
    "scaler" and "cal" are looked up by the plotting helpers.
    """
    svm = LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE)
    steps = [
        ("scaler", StandardScaler()),
        ("cal", CalibratedClassifierCV(svm, cv=3, method="sigmoid")),
    ]
    return Pipeline(steps)

In [None]:
# ======================
# Visualization
# ======================
def plot_confusion_matrix(cm, classes, target_name, outdir):
    """Save an annotated confusion-matrix heatmap as a PNG.

    cm is the matrix of integer counts, classes labels both axes, and the
    figure is written to ``outdir / "<target_name>_confusion_matrix.png"``
    at 300 dpi before being closed.
    """
    fig, ax = plt.subplots(figsize=(6,5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_title(f"{target_name} - Confusion Matrix")
    ax.set_ylabel("True")
    ax.set_xlabel("Predicted")
    fig.tight_layout()
    fig.savefig(outdir / f"{target_name}_confusion_matrix.png", dpi=300)
    plt.close(fig)


def _extract_linear_coefs_from_calibrated(cal_obj):
    """
    Return (n_classes, n_features) coefficients by averaging across
    calibrated folds. Works for CalibratedClassifierCV over LinearSVC.
    """
    coefs = []
    if hasattr(cal_obj, "calibrated_classifiers_") and cal_obj.calibrated_classifiers_:
        for cc in cal_obj.calibrated_classifiers_:
            est = getattr(cc, "estimator", None)
            if est is not None and hasattr(est, "coef_"):
                c = np.asarray(est.coef_)
                if c.ndim == 1:
                    c = c.reshape(1, -1)
                coefs.append(c)
    elif hasattr(cal_obj, "base_estimator_") and hasattr(cal_obj.base_estimator_, "coef_"):
        c = np.asarray(cal_obj.base_estimator_.coef_)
        if c.ndim == 1:
            c = c.reshape(1, -1)
        coefs.append(c)

    if not coefs:
        return None
    return np.mean(np.stack(coefs, axis=0), axis=0)  # average over folds


def plot_feature_importance(model, features, target_name, outdir):
    """Save a bar chart of the 20 largest feature importances as a PNG.

    Importance is the mean absolute coefficient across classes, taken from
    the fold-averaged coefficients of the pipeline's "cal" step. If that
    step is missing or exposes no coefficients, a warning is printed and
    nothing is written.
    """
    calibrated = model.named_steps.get("cal")
    if calibrated is None:
        print(f"⚠️ No 'cal' step in pipeline for {target_name}; skipping feature importance.")
        return

    fold_coefs = _extract_linear_coefs_from_calibrated(calibrated)
    if fold_coefs is None:
        print(f"⚠️ Could not extract coefficients for {target_name}; skipping feature importance.")
        return

    # Importance = mean absolute coefficient across classes
    scores = np.abs(fold_coefs).mean(axis=0)
    ranked = pd.DataFrame({"Feature": features, "Importance": scores})
    ranked = ranked.sort_values("Importance", ascending=False)
    top20 = ranked.head(20)

    fig, ax = plt.subplots(figsize=(8,6))
    sns.barplot(y="Feature", x="Importance", data=top20, palette="mako", ax=ax)
    ax.set_title(f"{target_name} - Top 20 Feature Importances (|coef|, averaged over calibrated folds)")
    fig.tight_layout()
    fig.savefig(outdir / f"{target_name}_feature_importance.png", dpi=300)
    plt.close(fig)


def plot_class_report(rep_dict, target_name, outdir):
    """Save a grouped bar chart of per-class precision, recall and F1 as a PNG.

    rep_dict is the dict form of a classification report. The "accuracy"
    row, rows with no precision value and any row whose label contains
    "avg" are removed, so only per-class rows are plotted.
    """
    table = pd.DataFrame(rep_dict).T
    table = table.drop(["accuracy"], errors="ignore")
    table = table.dropna(subset=["precision"], how="all")
    is_average_row = table.index.str.contains("avg", case=False)
    per_class = table.loc[~is_average_row]

    ax = per_class[["precision", "recall", "f1-score"]].plot(kind="bar", figsize=(7,5))
    ax.set_title(f"{target_name} - Per-class Performance")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(outdir / f"{target_name}_class_report.png", dpi=300)
    plt.close(fig)

In [None]:
# ======================
# Training + Evaluation
# ======================
def fit_eval_one_target(X, y_series, target_name):
    """Train the calibrated linear SVM for one target and score it on a stratified holdout.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table; its columns are recorded as the feature names and
        its index labels the saved holdout predictions.
    y_series : pd.Series
        Labels aligned row-for-row with X; cast to str before encoding.
    target_name : str
        Used in printed headers, the output directory and file names.

    Returns
    -------
    tuple
        ``(ModelResult, fitted pipeline, fitted LabelEncoder)``.

    Side effects
    ------------
    Prints the holdout metrics and writes three plots plus
    ``holdout_predictions_<target_name>.csv`` under
    ``lsvm_outputs/plots_<target_name>/``.
    """
    le = LabelEncoder()
    y = le.fit_transform(y_series.astype(str))
    class_names = list(le.classes_)
    # Full encoded label set. Passing it explicitly keeps the report and the
    # confusion matrix sized to every class even when a rare class appears in
    # neither y_test nor y_pred; without it the target_names length and the
    # DataFrame shape below would not match.
    label_ids = np.arange(len(class_names))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    model = make_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # The headline macro-F1 averages over the classes observed in the holdout.
    mf1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    bacc = balanced_accuracy_score(y_test, y_pred)
    rep_text = classification_report(
        y_test, y_pred, labels=label_ids, target_names=class_names, zero_division=0
    )
    rep_dict = classification_report(
        y_test, y_pred, labels=label_ids, target_names=class_names,
        output_dict=True, zero_division=0
    )
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)

    print(f"\n==== {target_name} RESULTS (holdout set) ====")
    print(f"Accuracy: {acc:.4f} | Balanced Acc: {bacc:.4f} | Macro-F1: {mf1:.4f}")
    print("\nClassification Report:")
    print(rep_text)
    print("\nConfusion matrix (rows=true, cols=pred):")
    print(pd.DataFrame(cm, index=class_names, columns=class_names))

    # === Visualization ===
    outdir = Path("lsvm_outputs") / f"plots_{target_name}"
    outdir.mkdir(parents=True, exist_ok=True)
    plot_confusion_matrix(cm, class_names, target_name, outdir)
    plot_feature_importance(model, X.columns.tolist(), target_name, outdir)
    plot_class_report(rep_dict, target_name, outdir)

    # Save per-target holdout predictions
    y_test_df = pd.DataFrame({
        f"{target_name}_true": le.inverse_transform(y_test),
        f"{target_name}_pred": le.inverse_transform(y_pred)
    }, index=X_test.index)
    y_test_df.to_csv(outdir / f"holdout_predictions_{target_name}.csv")

    res = ModelResult(target_name, acc, mf1, bacc, rep_text, cm, class_names, X.columns.tolist())
    return res, model, le

In [None]:
# ======================
# MAIN
# ======================
def main():
    """Run the whole pipeline: load, build features, train one model per target, save outputs.

    Raises ValueError if any band or target column is missing from the
    input table. Models, label encoders, the summary CSV and the comparison
    plot are written under ./lsvm_outputs/.
    """
    print("Loading data...")
    table = load_table(DATA_PATH)
    missing = set(BANDS + TARGETS) - set(table.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # Keep only fully labelled rows, then build features from them.
    labelled = table.dropna(subset=TARGETS).reset_index(drop=True)
    X_full = compute_features(labelled)
    # compute_features may drop rows: keep the matching label rows and
    # renumber both frames so they stay aligned positionally.
    labelled = labelled.loc[X_full.index].reset_index(drop=True)
    X_full = X_full.reset_index(drop=True)
    print(f"Total features constructed: {X_full.shape[1]}")

    Path("lsvm_outputs").mkdir(parents=True, exist_ok=True)
    results = {}

    for tgt in TARGETS:
        print(f"\nTraining {tgt} model...")
        outcome, fitted_model, encoder = fit_eval_one_target(X_full, labelled[tgt], tgt)
        results[tgt] = outcome
        dump(fitted_model, f"lsvm_outputs/linear_svm_cal_{tgt}.joblib")
        dump(encoder, f"lsvm_outputs/label_encoder_{tgt}.joblib")

    # Summary results
    rows = []
    for name, outcome in results.items():
        rows.append({
            "Target": name,
            "Accuracy": outcome.accuracy,
            "MacroF1": outcome.macro_f1,
            "BalancedAcc": outcome.bal_acc,
        })
    summary_df = pd.DataFrame(rows)
    summary_df.to_csv("lsvm_outputs/holdout_summary.csv", index=False)

    print("\n===== SUMMARY =====")
    print(summary_df.to_string(index=False))

    # Summary bar plot
    by_target = summary_df.set_index("Target")
    ax = by_target[["Accuracy", "MacroF1", "BalancedAcc"]].plot(kind="bar", figsize=(6,4))
    ax.set_title("Linear SVM Model Performance Comparison")
    ax.set_ylim(0, 1)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig("lsvm_outputs/model_comparison_summary.png", dpi=300)
    plt.close(fig)

    print("\n✅ All outputs (models, CSVs, and plots) saved in ./lsvm_outputs/")

In [None]:
# Entry point: run the full train/evaluate/save pipeline when executed directly.
if __name__ == "__main__":
    main()