In [6]:
# ================================================================
# Model-1A (UNSUPERVISED)
# Derive PSL target labels using KMeans Clustering
# ================================================================

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.cluster import KMeans

# -------------------------------------------------------
# 1. Auto-detect project root
# -------------------------------------------------------
def detect_project_root(start_dir):
    """Return the project root for a session started in *start_dir*.

    If the final component of *start_dir* is exactly ``notebooks`` the
    project root is its parent directory (returned as an absolute path);
    otherwise *start_dir* is returned unchanged.
    """
    # Compare the last path component instead of the raw string suffix so a
    # directory such as "old_notebooks" is not mistaken for "notebooks".
    # normpath drops any trailing separator before the component is taken.
    if os.path.basename(os.path.normpath(start_dir)) == "notebooks":
        return os.path.abspath(os.path.join(start_dir, ".."))
    return start_dir


current_dir = os.getcwd()
project_root = detect_project_root(current_dir)

# Folder that holds the processed CSV files used by the rest of the cell.
DATA_PROCESSED = os.path.join(project_root, "data_processed")

print(" Project Root:", project_root)
print(" DATA_PROCESSED:", DATA_PROCESSED)

# -------------------------------------------------------
# 2. Load cleaned features
# -------------------------------------------------------
# Read the cleaned feature table from the processed-data folder.
input_file = os.path.join(DATA_PROCESSED, "historical_features_clean.csv")
df = pd.read_csv(input_file)

print(" Loaded:", input_file)
print(df.head())

# When a supplier column is present, add an integer code for each
# supplier name next to it (the original column is left in place).
if "supplier" in df:
    supplier_names = df["supplier"]
    label_encoder = LabelEncoder()
    df["supplier_code"] = label_encoder.fit_transform(supplier_names)

# -------------------------------------------------------
# 3. Select clustering features
#    (Strategic + Financial signals)
# -------------------------------------------------------
# Columns of df that are fed to the clustering step.
cluster_features = [
    "gross_margin_pct", "cash_flow", "debt_equity_ratio",
    "node_parity", "DDR_gen_support",
    "geo_risk", "tariff_risk", "chip_shortage_impact",
]

# Work on a copy so the feature matrix is independent of df.
X = df.loc[:, cluster_features].copy()

# -------------------------------------------------------
# 4. Scale features
# -------------------------------------------------------
# Standardise every feature so that columns on very different numeric
# scales contribute comparably to the distances KMeans computes.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# -------------------------------------------------------
# 5. KMeans Clustering (3 supplier categories)
# -------------------------------------------------------
# Fixed seed for reproducible assignments; n_init=20 runs the algorithm
# from 20 different initialisations.
kmeans = KMeans(n_clusters=3, n_init=20, random_state=42).fit(X_scaled)

# Store each row's cluster id alongside its features.
df["PSL_cluster"] = kmeans.labels_

# -------------------------------------------------------
# 6. Convert clusters -> ordered PSL categories
# -------------------------------------------------------
# KMeans cluster ids carry no order of their own, so the clusters are
# ranked by their mean gross margin (higher mean -> more preferred) and
# the PSL labels are attached in that order.
psl_labels_high_to_low = ("Preferred", "Developing", "Limited")

# Cluster ids sorted from lowest to highest mean gross_margin_pct.
cluster_scores = (
    df.groupby("PSL_cluster")["gross_margin_pct"]
    .mean()
    .sort_values()
    .index.tolist()
)

# Fail with a clear message instead of an opaque IndexError when the
# number of populated clusters does not match the number of PSL labels
# (for example, when the data has fewer distinct rows than clusters).
if len(cluster_scores) != len(psl_labels_high_to_low):
    raise ValueError(
        f"Expected {len(psl_labels_high_to_low)} populated clusters to map "
        f"onto {list(psl_labels_high_to_low)}, found {len(cluster_scores)}: "
        f"{cluster_scores}"
    )

# Pair the best-ranked cluster with "Preferred", the next with
# "Developing", and the lowest with "Limited".
cluster_to_psl = dict(zip(reversed(cluster_scores), psl_labels_high_to_low))

df["PSL_status"] = df["PSL_cluster"].map(cluster_to_psl)

# -------------------------------------------------------
# 7. Save output
# -------------------------------------------------------
# Write the feature table, now including the PSL columns, next to the input.
output_file = os.path.join(DATA_PROCESSED, "historical_features_with_psl.csv")
df.to_csv(output_file, index=False)

print("\n Model-1A Completed")
print(" Output Saved:", output_file)

# Preview only the columns that are actually present. "supplier" is treated
# as optional when the data is loaded, so indexing it unconditionally here
# would raise a KeyError after the file has already been written.
preview_columns = [
    column
    for column in ("supplier", "fiscal_year", "PSL_status")
    if column in df.columns
]
print(df[preview_columns].head())


📌 Project Root: /Users/rambavisetty/anaconda_projects/capstone
📁 DATA_PROCESSED: /Users/rambavisetty/anaconda_projects/capstone/data_processed
✅ Loaded: /Users/rambavisetty/anaconda_projects/capstone/data_processed/historical_features_clean.csv
   supplier  fiscal_year      revenue         COGS  gross_margin_pct  \
0    Micron         2015   16300000.0   11660000.0              28.5   
1   Samsung         2015  176500000.0  108892000.0              37.5   
2  SK Hynix         2015   16900000.0   12168000.0              28.0   
3    Micron         2016   12400000.0   10210000.0              17.6   
4   Samsung         2016  177000000.0  109740000.0              38.0   

      cash_flow  debt_equity_ratio  cost_savings     PPV    QP  ...  \
0  7.800000e+06               0.62          10.0 -1000.0  85.0  ...   
1  3.900000e+10               0.17          10.0  -500.0  95.0  ...   
2  8.000000e+06               0.70           8.0 -1500.0  85.0  ...   
3  3.100000e+06               