In [4]:
# -------------------------------------------------------------
# Model-2B: Supervised Spend Allocation (XGBRegressor)
# -------------------------------------------------------------

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor

# -------------------------------------------------------------
# CONFIG (UNCHANGED)
# -------------------------------------------------------------
# Directory prefixes shared by the file paths below.
_DATA_DIR = "../data_processed"
_MODEL_DIR = "../models"

# Feature table read by this script and the prediction table it writes.
INPUT_FILE = f"{_DATA_DIR}/supplier_categorization_predictions.csv"
OUTPUT_FILE = f"{_DATA_DIR}/spend_allocation_predictions.csv"

# Artefacts persisted with joblib (regressor, scaler, PSL label encoder).
MODEL_FILE = f"{_MODEL_DIR}/model2_xgb.pkl"
SCALER_FILE = f"{_MODEL_DIR}/model2_scaler.pkl"
PSL_ENCODER_FILE = f"{_MODEL_DIR}/model2_psl_encoder.pkl"

# Behaviour switches.
ENABLE_GRIDSEARCH = True      # True: tune hyperparameters with GridSearchCV
ENABLE_NORMALIZATION = True   # True: rescale predictions per fiscal year toward 100%

# -------------------------------------------------------------
# 1. Load input feature table (PSL_predicted from Model-1B)
# -------------------------------------------------------------
# Read the feature table, then echo its shape and first rows as a quick
# sanity check that the expected file was picked up.
df = pd.read_csv(INPUT_FILE)
loaded_shape = df.shape
print("Loaded:", loaded_shape)
preview = df.head()
print(preview)

# -------------------------------------------------------------
# 2. Encode PSL_predicted (text → numeric)
# -------------------------------------------------------------
# Map each distinct PSL_predicted label to an integer code and store the
# codes in a new column that the regressor consumes as a feature.
psl_encoder = LabelEncoder()
df["PSL_status_encoded"] = psl_encoder.fit_transform(df["PSL_predicted"])

# Save encoder (used for FY25 predictions).
# Create the target directory first: joblib.dump raises FileNotFoundError
# when the parent folder is missing, e.g. on a fresh checkout.
Path(PSL_ENCODER_FILE).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(psl_encoder, PSL_ENCODER_FILE)

# -------------------------------------------------------------
# 3. Define feature columns (UNCHANGED)
# -------------------------------------------------------------
# Model inputs: the encoded PSL status plus the numeric supplier metrics.
feature_cols = [
    "PSL_status_encoded",
    "cost_savings", "PPV",
    "QP", "QR",
    "lead_time_attainment",
    "carbon_emission_intensity", "renewable_energy_usage", "plastic_recycle",
    "human_rights_compliance_score",
]

# Regression target, cast to float.
target_col = "allocation_percent"

X = df.loc[:, feature_cols]
y = df.loc[:, target_col].astype(float)

# -------------------------------------------------------------
# 4. Train/Test Split FIRST (FIXES LEAKAGE)
# -------------------------------------------------------------
# Hold out 25% of the rows before any fitting happens, so the scaler and
# the regressor only ever see the training portion. Fixed seed for repeatability.
split_parts = train_test_split(X, y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

# -------------------------------------------------------------
# 5. Pipeline (Scaler + XGBRegressor) — prevents leakage in CV
# -------------------------------------------------------------
# Baseline regressor settings; the grid search (when enabled) overrides
# these through the "xgb__" parameter prefix.
xgb_defaults = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
)

# Scaling lives inside the pipeline so it is re-fitted on each CV training
# fold instead of on the full data.
scaler_step = ("scaler", StandardScaler())
xgb_step = ("xgb", XGBRegressor(**xgb_defaults))
pipe = Pipeline([scaler_step, xgb_step])

# -------------------------------------------------------------
# 6. Optional GridSearchCV (hyperparameter tuning)
# -------------------------------------------------------------
# Produce `best_model`: either the pipeline fitted as configured, or the
# best pipeline found by an exhaustive 5-fold grid search scored on R².
if not ENABLE_GRIDSEARCH:
    best_model = pipe.fit(X_train, y_train)
else:
    # Candidate values per XGBRegressor argument; the "xgb__" prefix routes
    # each one to the pipeline step named "xgb".
    xgb_search_space = {
        "n_estimators": [200, 300, 500],
        "learning_rate": [0.03, 0.05, 0.1],
        "max_depth": [3, 5, 7],
        "subsample": [0.7, 0.8, 1.0],
        "colsample_bytree": [0.7, 0.8, 1.0],
        "reg_lambda": [0.5, 1.0, 2.0],
    }
    param_grid = {
        f"xgb__{arg_name}": candidates
        for arg_name, candidates in xgb_search_space.items()
    }

    grid = GridSearchCV(
        pipe,
        param_grid,
        cv=5,
        scoring="r2",
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    print("\nBest params:", grid.best_params_)

# -------------------------------------------------------------
# 7. Evaluation (on held-out test set)
# -------------------------------------------------------------
# Score the fitted pipeline on the rows that were held out from training.
y_pred = best_model.predict(X_test)

print("\nModel-2B: Supervised Spend Allocation — Evaluation (Leakage-safe)")

test_r2 = r2_score(y_test, y_pred)
print(f" R² Score: {test_r2:.3f}")

test_mae = mean_absolute_error(y_test, y_pred)
print(f" MAE: {test_mae:.3f}")

# RMSE is the square root of the mean squared error.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f" RMSE: {test_rmse:.3f}")

# -------------------------------------------------------------
# 8. Predict allocation for full historical dataset
# -------------------------------------------------------------
# Predict for every row (training and held-out alike) and keep the raw value.
df["predicted_allocation_raw"] = best_model.predict(X)

# Snap to the nearest multiple of 5, then bound the result to [0, 100].
snapped_to_five = (df["predicted_allocation_raw"] / 5).round() * 5
df["predicted_allocation_percent"] = snapped_to_five.clip(0, 100)

# -------------------------------------------------------------
# 9. Normalize to ensure 100% per fiscal year
# -------------------------------------------------------------
def _allocate_to_100(group, step=5):
    """Rescale one fiscal year's allocations so they total exactly 100.

    Uses largest-remainder apportionment: the year's 100 points are split
    into ``100 / step`` units, every row first receives the whole number of
    units it is proportionally owed, and the leftover units go to the rows
    with the largest fractional remainders. Every result is therefore a
    multiple of ``step`` and the group sums to 100. Rounding each row
    independently does not guarantee that total (three equal rows become
    35 + 35 + 35 = 105).

    Args:
        group: pandas Series of non-negative allocation weights for one year.
        step: granularity of the result in percentage points; must divide
            100 evenly for the total to be exact.

    Returns:
        Series aligned to ``group.index`` holding multiples of ``step``.
        Negative or non-finite weights are treated as 0. If every weight is
        0 the year is split equally, because a proportional split is
        undefined (the plain ``x / x.sum()`` form would yield NaN).
    """
    weights = group.to_numpy(dtype=float)
    weights = np.where(np.isfinite(weights) & (weights > 0), weights, 0.0)
    if weights.size == 0:
        return pd.Series(weights, index=group.index)
    if weights.sum() == 0:
        weights = np.ones_like(weights)

    n_units = int(round(100 / step))
    # Multiply before dividing so exact shares (e.g. 35 of 100) stay exact.
    exact_units = weights * n_units / weights.sum()
    units = np.floor(exact_units).astype(int)

    # Flooring can only under-allocate; top up the largest remainders.
    shortfall = n_units - int(units.sum())
    if shortfall > 0:
        by_remainder = np.argsort(-(exact_units - units), kind="stable")
        units[by_remainder[:shortfall]] += 1

    return pd.Series(units * float(step), index=group.index)


if ENABLE_NORMALIZATION:
    # Normalise within each fiscal year, keeping the 5-point granularity.
    df["normalized_allocation"] = df.groupby("fiscal_year")[
        "predicted_allocation_percent"
    ].transform(_allocate_to_100)

    df["final_allocation_percent"] = df["normalized_allocation"]
else:
    df["final_allocation_percent"] = df["predicted_allocation_percent"]

# -------------------------------------------------------------
# 10. Build Output Table (UNCHANGED)
# -------------------------------------------------------------
# Columns written to the output CSV, in this order. Every name must already
# exist in `df`; a missing one raises KeyError at the selection below.
output_cols = [
    "fiscal_year", "supplier",
    "PSL_status", "PSL_predicted", "PSL_status_encoded",
    "allocation_percent", "final_allocation_percent",
    "gross_margin_pct", "cash_flow", "debt_equity_ratio",
    "cost_savings", "PPV", "QP", "QR", "lead_time_attainment",
    "carbon_emission_intensity", "renewable_energy_usage",
    "plastic_recycle", "human_rights_compliance_score",
    "node_parity", "DDR_gen_support",
    "geo_risk", "tariff_risk", "chip_shortage_impact",
]

df_out = df.loc[:, output_cols]

# -------------------------------------------------------------
# 11. Save output
# -------------------------------------------------------------
# Write without the positional index so the CSV holds only the listed columns.
df_out.to_csv(OUTPUT_FILE, index=False)
print("\nSupervised spend allocation saved →", OUTPUT_FILE)

# -------------------------------------------------------------
# 12. Save Model + Scaler (same filenames as your tested script)
# -------------------------------------------------------------
# Persist the two fitted pipeline steps as separate files (the original
# note says the FY25 pipeline loads the scaler and the model individually).
fitted_scaler = best_model.named_steps["scaler"]
fitted_xgb = best_model.named_steps["xgb"]

for artefact, target_path in (
    (fitted_scaler, SCALER_FILE),
    (fitted_xgb, MODEL_FILE),
):
    # Ensure the destination folder exists: a missing directory would
    # otherwise raise here, after training and tuning have already
    # finished, and the fitted model would be lost.
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(artefact, target_path)

print("\nSaved:")
print(" - model2_xgb.pkl")
print(" - model2_scaler.pkl")
print(" - model2_psl_encoder.pkl")
print(" - spend_allocation_predictions.csv")

Loaded: (30, 41)
   supplier  fiscal_year      revenue         COGS  gross_margin_pct  \
0    Micron         2015   16300000.0   11660000.0              28.5   
1   Samsung         2015  176500000.0  108892000.0              37.5   
2  SK Hynix         2015   16900000.0   12168000.0              28.0   
3    Micron         2016   12400000.0   10210000.0              17.6   
4   Samsung         2016  177000000.0  109740000.0              38.0   

      cash_flow  debt_equity_ratio  cost_savings     PPV    QP  ...  \
0  7.800000e+06               0.62          10.0 -1000.0  85.0  ...   
1  3.900000e+10               0.17          10.0  -500.0  95.0  ...   
2  8.000000e+06               0.70           8.0 -1500.0  85.0  ...   
3  3.100000e+06               0.70           5.0 -1000.0  85.0  ...   
4  4.550000e+10               0.15           7.0  -500.0  95.0  ...   

   score_renewable_energy_usage  score_plastic_recycle  \
0                      0.088235               0.153846   
1      