# In [4]:
# -*- coding: utf-8 -*-
"""
Q1 Journal-Ready Pipeline: Support Vector Machine for Multiclass Meal-Order Cancellation Timing Prediction

Revision history
----------------
30 Jul 2025 – initial draft
30 Jul 2025-b – tolerate IPython "-f" arg
30 Jul 2025-c – OneHotEncoder version-aware
30 Jul 2025-d – SMOTE version-aware (this commit)
"""

import warnings, os, random, joblib, argparse, inspect, sys
from datetime import datetime
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
)
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# One seed shared by every stochastic component in this module so that
# repeated runs draw the same random numbers.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Blanket suppression: every warning category is silenced for the whole
# process from this point on.
warnings.filterwarnings(action="ignore")

def identify_feature_types(df: pd.DataFrame):
    """Partition the column labels of *df* by dtype.

    Returns a ``(numeric, categorical)`` tuple of lists. ``numeric`` holds the
    labels of columns whose dtype is a NumPy number; ``categorical`` holds
    every remaining label, in the frame's original column order.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()

    categorical_names = []
    for label in df.columns:
        if label in numeric_names:
            continue
        categorical_names.append(label)

    return numeric_names, categorical_names

def _add_lead_time_feature(X: pd.DataFrame) -> pd.DataFrame:
    """Replace the order/service date columns with a ``lead_time_days`` feature.

    If either ``DateOfOrder`` or ``DateOfService`` is absent, *X* is returned
    unchanged. Otherwise a copy is returned in which both date columns are
    dropped and ``lead_time_days`` (whole days between order and service,
    truncated toward zero) is appended.
    """
    date_cols = ["DateOfOrder", "DateOfService"]
    if not set(date_cols).issubset(X.columns):
        return X

    X = X.copy()
    ordered_at = pd.to_datetime(X["DateOfOrder"], utc=True)
    served_at = pd.to_datetime(X["DateOfService"], utc=True)
    lead_days = served_at.sub(ordered_at).dt.total_seconds() / 86400
    # np.trunc truncates toward zero exactly like the previous astype(int),
    # but keeps NaN for rows with a missing date instead of raising; the
    # numeric pipeline's median imputer fills those rows later.
    X["lead_time_days"] = np.trunc(lead_days)
    return X.drop(columns=date_cols)


def _make_one_hot_encoder():
    """Build the OneHotEncoder with only the keywords this scikit-learn accepts."""
    accepted = inspect.signature(OneHotEncoder).parameters
    kwargs = {"handle_unknown": "ignore"}
    # `sparse` was renamed to `sparse_output`; prefer the new name when present.
    if "sparse_output" in accepted:
        kwargs["sparse_output"] = True
    else:
        kwargs["sparse"] = True
    # `min_frequency` does not exist on older releases; passing it there would
    # raise TypeError, so it is only set when the signature advertises it.
    if "min_frequency" in accepted:
        kwargs["min_frequency"] = 10
    return OneHotEncoder(**kwargs)


def main(data_path: str = "/Users/shayan/Desktop/IDS2/Stattkueche/poorali/balancing/4class.csv", output_dir: str = "artifacts"):
    """Train, tune, evaluate and persist an RBF-SVM for ``cancel_timing``.

    Parameters
    ----------
    data_path : str
        CSV file containing the features and the ``cancel_timing`` target.
    output_dir : str
        Directory (created if missing) that receives ``confusion_matrix.png``,
        ``metrics.txt`` and a timestamped ``svm_model_<timestamp>.pkl``.

    Returns
    -------
    The refitted best pipeline found by the grid search.

    Raises
    ------
    ValueError
        If the CSV has no ``cancel_timing`` column.
    """
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(data_path)
    target = "cancel_timing"
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in data")
    y = df.pop(target)
    X = df.copy()

    # Columns flagged by the author as leaking the target; dropped if present.
    leakage_cols = [
        "CanceledQty",
        "DateOfCancel",
        "days_to_cancel",
        "net_qty",
    ]
    X.drop(columns=[c for c in leakage_cols if c in X.columns], inplace=True)

    # Discard constant columns (NaN counts as a value of its own).
    X = X.loc[:, X.nunique(dropna=False) > 1]

    X = _add_lead_time_feature(X)

    numeric_cols, categorical_cols = identify_feature_types(X)

    num_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])

    cat_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", _make_one_hot_encoder()),
    ])

    preproc = ColumnTransformer([
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Only pass n_jobs to SMOTE when the installed imbalanced-learn accepts it.
    smote_kwargs = {"random_state": RANDOM_STATE, "k_neighbors": 5}
    if "n_jobs" in inspect.signature(SMOTE).parameters:
        smote_kwargs["n_jobs"] = -1
    smote = SMOTE(**smote_kwargs)

    svm = SVC(
        kernel="rbf",
        class_weight="balanced",
        probability=True,
        random_state=RANDOM_STATE,
    )

    # The imblearn pipeline applies the resampling step during fit only, so
    # each CV fold is oversampled on its own training part.
    pipe = ImbPipeline([
        ("prep", preproc),
        ("balance", smote),
        ("clf", svm),
    ])

    grid = GridSearchCV(
        pipe,
        param_grid={
            "clf__C": [0.1, 1, 10, 50],
            "clf__gamma": ["scale", "auto", 0.01, 0.1],
        },
        cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE),
        scoring="f1_macro",
        n_jobs=-1,
        verbose=1,
    )

    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    report = classification_report(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    f1_m = f1_score(y_test, y_pred, average="macro")

    print("\nBest params:", grid.best_params_)
    print("Macro-F1 (test):", round(f1_m, 4))
    print("Balanced accuracy (test):", round(bal_acc, 4))
    print("\nClassification report:\n", report)

    # The plot is best-effort: any failure (matplotlib missing, backend
    # problems, ...) is reported and the remaining artifacts are still written.
    try:
        import matplotlib.pyplot as plt
        cm = confusion_matrix(y_test, y_pred, labels=grid.classes_)
        ConfusionMatrixDisplay(cm, display_labels=grid.classes_).plot(values_format="d")
        plt.title("SVM Confusion Matrix — Test Set")
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "confusion_matrix.png"), dpi=300)
        plt.close()
    except Exception as e:
        print("Skipping confusion-matrix plot:", e)

    timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    best_model_path = os.path.join(output_dir, f"svm_model_{timestamp}.pkl")
    joblib.dump(grid.best_estimator_, best_model_path)
    with open(os.path.join(output_dir, "metrics.txt"), "w", encoding="utf-8") as f:
        f.write(f"Macro-F1: {f1_m}\nBalanced accuracy: {bal_acc}\n\n{report}")

    print(f"\nArtifacts saved to '{output_dir}'. Best model → {best_model_path}")

    return grid.best_estimator_

if __name__ == "__main__":
    # Command-line entry point. Unrecognised arguments are ignored rather than
    # rejected, so an extra "-f <file>" injected by IPython does not abort.
    cli = argparse.ArgumentParser(
        description="Train SVM for cancellation-timing prediction"
    )
    cli.add_argument(
        "--data",
        default="/Users/shayan/Desktop/IDS2/Stattkueche/poorali/balancing/4class.csv",
        help="Path to CSV file",
    )
    cli.add_argument(
        "--out",
        default="artifacts",
        help="Directory for outputs",
    )
    known_args, _unknown = cli.parse_known_args()
    main(known_args.data, known_args.out)


# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: '4class.csv'