In [4]:
# -------------------------------------------------------------
# Model-1B: Supervised Supplier Categorization 
# Saves: model1_xgb.json, model1_scaler.pkl, model1_psl_encoder.pkl,
#          supplier_categorization_predictions.csv
#          (plus model1_supplier_encoder.pkl when 'supplier_code' has to be
#          derived from 'supplier')
# -------------------------------------------------------------

import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from xgboost import XGBClassifier

# -------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------
# Directory prefixes shared by the paths below. All paths are relative, so
# they resolve against the working directory the script is launched from.
_DATA_DIR = "../data_processed"
_MODEL_DIR = "../models"

# Seed handed to the train/test split, the CV splitter and XGBoost.
RANDOM_STATE = 42

# Input feature table (already carries the PSL target from Model-1A).
INPUT_FILE = f"{_DATA_DIR}/historical_features_with_allocation.csv"

# Output: the input table plus the predicted PSL label.
OUT_PRED_FILE = f"{_DATA_DIR}/supplier_categorization_predictions.csv"

# Persisted model artifacts.
MODEL_JSON_FILE = f"{_MODEL_DIR}/model1_xgb.json"
SCALER_FILE = f"{_MODEL_DIR}/model1_scaler.pkl"
PSL_ENCODER_FILE = f"{_MODEL_DIR}/model1_psl_encoder.pkl"

# -------------------------------------------------------------
# 1) Load input feature table (with PSL target from Model-1A)
# -------------------------------------------------------------
# Read the whole feature table into memory, then echo its (rows, columns)
# shape and the first rows as a quick visual sanity check.
df = pd.read_csv(INPUT_FILE)
print(f"Loaded: {df.shape}")
print(df.head())

# -------------------------------------------------------------
# 2) Ensure supplier_code exists (if EDA didn't create it)
# -------------------------------------------------------------
# Best practice: supplier_code should be created in EDA and reused everywhere.
# But this fallback keeps the script runnable.
# Fallback: when the input has no 'supplier_code' column, derive an integer
# code from the raw 'supplier' name and persist the fitted encoder so the same
# name -> code mapping can be reapplied later.
# Raises ValueError when neither column is available.
if "supplier_code" not in df.columns:
    if "supplier" not in df.columns:
        raise ValueError("Expected either 'supplier_code' or 'supplier' column in input.")
    supplier_encoder = LabelEncoder()
    # astype(str) hands the encoder one uniform dtype to sort and map.
    df["supplier_code"] = supplier_encoder.fit_transform(df["supplier"].astype(str))
    supplier_encoder_file = "../models/model1_supplier_encoder.pkl"
    # joblib.dump does not create missing parent directories; make sure the
    # target folder exists so this fallback cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(supplier_encoder_file), exist_ok=True)
    joblib.dump(supplier_encoder, supplier_encoder_file)
    print("NOTE: 'supplier_code' not found. Created from 'supplier' and saved model1_supplier_encoder.pkl")

# -------------------------------------------------------------
# 3) Prepare target (PSL_status -> PSL_code)
# -------------------------------------------------------------
# Build the integer training target PSL_code from the PSL_status labels.
# Raises ValueError when the column is absent or contains missing labels.
if "PSL_status" not in df.columns:
    raise ValueError("Expected column 'PSL_status' (from Model-1A) in input file.")

# The astype(str) below would turn a missing label into the literal string
# "nan", which LabelEncoder would then treat as a genuine class to learn.
# Reject such input up front instead of silently training on it.
n_missing_psl = int(df["PSL_status"].isna().sum())
if n_missing_psl:
    raise ValueError(
        f"Column 'PSL_status' has {n_missing_psl} missing value(s); "
        "every row needs a PSL label before training."
    )

psl_encoder = LabelEncoder()
df["PSL_code"] = psl_encoder.fit_transform(df["PSL_status"].astype(str))

# Save PSL encoder for FY25 predictions. This is the first write into the
# models directory, so create it if needed ("or '.'" covers a bare filename).
os.makedirs(os.path.dirname(PSL_ENCODER_FILE) or ".", exist_ok=True)
joblib.dump(psl_encoder, PSL_ENCODER_FILE)

# -------------------------------------------------------------
# 4) Prepare features (same feature list as the previously tested logic)
# -------------------------------------------------------------
# Model inputs, in the column order the scaler and classifier will see them.
feature_cols = [
    "fiscal_year", "supplier_code",
    "gross_margin_pct", "cash_flow", "debt_equity_ratio",
    "node_parity", "DDR_gen_support",
    "geo_risk", "tariff_risk", "chip_shortage_impact",
]

# Fail fast, naming every absent column at once, before any training starts.
available_cols = set(df.columns)
missing = [col for col in feature_cols if col not in available_cols]
if missing:
    raise ValueError(f"Missing required feature columns: {missing}")

# Target is the encoded PSL label; features are the columns listed above.
y = df["PSL_code"]
X = df[feature_cols]

# -------------------------------------------------------------
# 5) Train/Test split
# -------------------------------------------------------------
# Hold out 20% of the rows for the final evaluation. stratify=y asks the
# splitter to keep the PSL_code class proportions similar in both partitions;
# the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# -------------------------------------------------------------
# 6) Pipeline + GridSearchCV
# -------------------------------------------------------------
# Scaler and classifier share one Pipeline so that, within every CV fold, the
# scaler is fitted on that fold's training portion only.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("xgb", XGBClassifier(
            objective="multi:softprob",
            eval_metric="mlogloss",
            random_state=RANDOM_STATE,
            # Single-threaded on purpose: GridSearchCV below already fans the
            # candidate fits out over all cores (n_jobs=-1). Letting every
            # worker also spawn one XGBoost thread per core oversubscribes the
            # CPU and slows the whole search down.
            n_jobs=1
        ))
    ]
)

# 3 values for each of 6 hyper-parameters = 729 candidates; with 5-fold CV
# that is 3,645 fits plus the final refit. Trim the lists if the search takes
# too long, expand them if needed.
param_grid = {
    "xgb__n_estimators": [200, 300, 500],
    "xgb__learning_rate": [0.03, 0.05, 0.1],
    "xgb__max_depth": [3, 5, 7],
    "xgb__subsample": [0.7, 0.8, 1.0],
    "xgb__colsample_bytree": [0.7, 0.8, 1.0],
    "xgb__reg_lambda": [0.5, 1.0, 2.0],
}

# Shuffled, seeded stratified folds keep the search reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,  # parallelism lives here, across candidates and folds
    verbose=1
)

# Only the training partition is searched; X_test / y_test stay untouched.
grid.fit(X_train, y_train)

print("\nBest params:", grid.best_params_)
print("Best CV accuracy:", round(grid.best_score_, 4))

# Winning pipeline, refitted by GridSearchCV on all of X_train.
best_model = grid.best_estimator_

# -------------------------------------------------------------
# 7) Evaluate on held-out test set
# -------------------------------------------------------------
# Score the tuned pipeline on the held-out partition.
y_pred = best_model.predict(X_test)

print("\nModel-1B: Supplier Categorization — Evaluation (Leakage-safe)")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("\nClassification Report:")
# Decode back to the original PSL labels so the report is readable.
print(
    classification_report(
        psl_encoder.inverse_transform(y_test),
        psl_encoder.inverse_transform(y_pred)
    )
)
print("\nConfusion Matrix:")
# Pin the label set to every class the encoder knows. Without `labels`, a
# class absent from this small test split would be dropped, changing the
# matrix shape from run to run and leaving rows/columns unidentifiable.
all_codes = np.arange(len(psl_encoder.classes_))
cm_table = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=all_codes),
    index=psl_encoder.classes_,
    columns=psl_encoder.classes_,
).rename_axis(index="actual", columns="predicted")
print(cm_table)

# -------------------------------------------------------------
# 8) Save artifacts
# -------------------------------------------------------------
# Save scaler (fitted on TRAIN ONLY via pipeline)
# Pull the two fitted steps out of the winning pipeline. Both were fitted on
# the training partition only, since the grid search never saw the test rows.
fitted_scaler = best_model.named_steps["scaler"]
xgb_fitted = best_model.named_steps["xgb"]

# Scaler goes to a joblib pickle; the booster uses XGBoost's own JSON format.
joblib.dump(fitted_scaler, SCALER_FILE)
xgb_fitted.save_model(MODEL_JSON_FILE)

# The PSL encoder was written earlier in the script; it is listed here so the
# summary names every artifact.
print("\nSaved:")
for saved_path in (MODEL_JSON_FILE, SCALER_FILE, PSL_ENCODER_FILE):
    print(" -", os.path.basename(saved_path))

# -------------------------------------------------------------
# 9) Predict PSL for full dataset (for step-5 / future chaining)
# -------------------------------------------------------------
# Score every row of the input table. Rows that were part of the training
# split are included, so these predictions are not an out-of-sample estimate.
full_codes = best_model.predict(X)
df["PSL_predicted"] = psl_encoder.inverse_transform(full_codes)

# Write the table, now carrying PSL_code and PSL_predicted, without the index.
df.to_csv(OUT_PRED_FILE, index=False)
print(" -", os.path.basename(OUT_PRED_FILE))


Loaded: (30, 39)
   supplier  fiscal_year      revenue         COGS  gross_margin_pct  \
0    Micron         2015   16300000.0   11660000.0              28.5   
1   Samsung         2015  176500000.0  108892000.0              37.5   
2  SK Hynix         2015   16900000.0   12168000.0              28.0   
3    Micron         2016   12400000.0   10210000.0              17.6   
4   Samsung         2016  177000000.0  109740000.0              38.0   

      cash_flow  debt_equity_ratio  cost_savings     PPV    QP  ...  \
0  7.800000e+06               0.62          10.0 -1000.0  85.0  ...   
1  3.900000e+10               0.17          10.0  -500.0  95.0  ...   
2  8.000000e+06               0.70           8.0 -1500.0  85.0  ...   
3  3.100000e+06               0.70           5.0 -1000.0  85.0  ...   
4  4.550000e+10               0.15           7.0  -500.0  95.0  ...   

   score_lead_time_attainment  score_carbon  score_renewable_energy_usage  \
0                    0.909091      0.518519   