In [4]:
# -------------------------------------------------------------
# Model-1B: Supervised Supplier Categorization (XGBoost Classifier)
# -------------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# -------------------------------------------------------------
# 1. Read the feature table (carries the PSL target from Model-1A)
# -------------------------------------------------------------
df = pd.read_csv("../data_processed/historical_features_with_allocation.csv")

print("Loaded:", df.shape)
print(df.head())

# -------------------------------------------------------------
# 2. Encode the target column (PSL_status)
# -------------------------------------------------------------
# Fit the encoder on the PSL_status labels coming from Model-1A, then
# map every row's label to its integer code.
psl_encoder = LabelEncoder().fit(df["PSL_status"])
df["PSL_code"] = psl_encoder.transform(df["PSL_status"])

# Persist the fitted encoder so the integer codes can be decoded later
joblib.dump(psl_encoder, "../models/model1_psl_encoder.pkl")

# -------------------------------------------------------------
# 3. Select the Model-1B input columns and the encoded target
# -------------------------------------------------------------
feature_cols = [
    "fiscal_year",
    "supplier_code",
    "gross_margin_pct",
    "cash_flow",
    "debt_equity_ratio",
    "node_parity",
    "DDR_gen_support",
    "geo_risk",
    "tariff_risk",
    "chip_shortage_impact",
]

# Feature matrix: all rows, only the columns listed above
X = df.loc[:, feature_cols]
# Target vector: integer-encoded PSL status
y = df["PSL_code"]

# -------------------------------------------------------------
# 4. Train-Test Split
# -------------------------------------------------------------
# Split the raw features BEFORE fitting the scaler: fitting on the full table
# would let the held-out rows influence the mean/variance used for training
# (data leakage) and would bake them into the saved scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------------------------------------
# 5. Scaling (statistics learned from the training partition only)
# -------------------------------------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature table, kept under its original name because
# later steps predict on every row with it.
X_scaled = scaler.transform(X)

joblib.dump(scaler, "../models/model1_scaler.pkl")

# -------------------------------------------------------------
# 6. Fit the XGBoost classifier on the training partition
# -------------------------------------------------------------
# Hyperparameters gathered in one place and unpacked into the constructor.
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

model.fit(X_train, y_train)

# -------------------------------------------------------------
# 7. Score the model on the held-out partition
# -------------------------------------------------------------
y_pred = model.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", holdout_accuracy)

# Decode the integer codes back to PSL labels so the report is readable
actual_labels = psl_encoder.inverse_transform(y_test)
predicted_labels = psl_encoder.inverse_transform(y_pred)
print("\nClassification Report:")
print(classification_report(actual_labels, predicted_labels))

# The confusion matrix is computed on the integer codes
holdout_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(holdout_confusion)

# -------------------------------------------------------------
# 8. Persist the model and write per-row predictions
# -------------------------------------------------------------
model.save_model("../models/model1_xgb.json")

# Predict on every row of the scaled feature table (training and held-out
# rows alike) and store the decoded label next to the original columns.
all_row_codes = model.predict(X_scaled)
df["PSL_predicted"] = psl_encoder.inverse_transform(all_row_codes)

df.to_csv("../data_processed/supplier_categorization_predictions.csv", index=False)

# Summary of the artifacts written by this cell
print("\nSaved:")
for saved_line in (
    " - model1_xgb.json",
    " - model1_scaler.pkl",
    " - model1_psl_encoder.pkl",
    " - supplier_categorization_predictions.csv",
):
    print(saved_line)


Loaded: (30, 39)
   supplier  fiscal_year      revenue         COGS  gross_margin_pct  \
0    Micron         2015   16300000.0   11660000.0              28.5   
1   Samsung         2015  176500000.0  108892000.0              37.5   
2  SK Hynix         2015   16900000.0   12168000.0              28.0   
3    Micron         2016   12400000.0   10210000.0              17.6   
4   Samsung         2016  177000000.0  109740000.0              38.0   

      cash_flow  debt_equity_ratio  cost_savings     PPV    QP  ...  \
0  7.800000e+06               0.62          10.0 -1000.0  85.0  ...   
1  3.900000e+10               0.17          10.0  -500.0  95.0  ...   
2  8.000000e+06               0.70           8.0 -1500.0  85.0  ...   
3  3.100000e+06               0.70           5.0 -1000.0  85.0  ...   
4  4.550000e+10               0.15           7.0  -500.0  95.0  ...   

   score_lead_time_attainment  score_carbon  score_renewable_energy_usage  \
0                    0.909091      0.518519   