In [7]:
import os
import pandas as pd
import numpy as np

# --------------------------------------------------
# 1. Resolve project paths (works from root or notebooks/)
# --------------------------------------------------
cwd = os.getcwd()
if os.path.basename(cwd).lower() == "notebooks":
    PROJECT_ROOT = os.path.abspath(os.path.join(cwd, ".."))
else:
    PROJECT_ROOT = cwd

input_path  = os.path.join(PROJECT_ROOT, "data_processed", "historical_features_with_psl.csv")
output_path = os.path.join(PROJECT_ROOT, "data_processed", "historical_features_with_allocation.csv")

print("Project root:", PROJECT_ROOT)
print("Input file  :", input_path)
print("Output file :", output_path)

# --------------------------------------------------
# 2. Load data
# --------------------------------------------------
df = pd.read_csv(input_path)
print("\nLoaded rows:", len(df))
print("Columns:", df.columns.tolist())

# --------------------------------------------------
# 3. PSL score (from PSL_status)
# --------------------------------------------------
# This mirrors the original logic: Preferred > Developing > Limited
psl_score_map = {
    "Preferred": 1.0,
    "Developing": 0.6,
    "Limited": 0.3
}

if "PSL_status" not in df.columns:
    raise ValueError("Expected column 'PSL_status' in historical_features_with_psl.csv")

df["PSL_score"] = df["PSL_status"].map(psl_score_map)

# If any PSL_status values are missing in the map, warn:
if df["PSL_score"].isna().any():
    missing = df[df["PSL_score"].isna()]["PSL_status"].unique()
    print("\nWARNING: Unmapped PSL_status values:", missing)

# --------------------------------------------------
# 4. Define metric groups
# --------------------------------------------------
# Cost metrics – higher is better (after normalization)
score_cols_cost = ["cost_savings", "PPV"]        # We'll flip PPV so lower PPV => higher score

# Quality & delivery – higher better
score_cols_qd   = ["QP", "QR", "lead_time_attainment"]

# ESG metrics – some "lower better", some "higher better"
# Here we assume:
#   - carbon_emission_intensity: lower is better (we will invert)
#   - renewable_energy_usage, plastic_recycle, human_rights_compliance_score: higher better
score_cols_esg_hi = ["renewable_energy_usage", "plastic_recycle", "human_rights_compliance_score"]
col_carbon = "carbon_emission_intensity"

# --------------------------------------------------
# 5. Safe min–max normalization helper
# --------------------------------------------------
def safe_minmax(series):
    """Min-max scale to [0,1]; if constant or NaN, return 0.5."""
    s = series.astype(float)
    if s.isna().all():
        return pd.Series(0.5, index=s.index)
    vmin, vmax = s.min(), s.max()
    if vmax == vmin:
        return pd.Series(0.5, index=s.index)
    return (s - vmin) / (vmax - vmin)

# --------------------------------------------------
# 6. Build normalized feature scores (0–1)
# --------------------------------------------------

# Cost_savings: higher better → min-max directly
if "cost_savings" in df.columns:
    df["score_cost_savings"] = safe_minmax(df["cost_savings"])
else:
    df["score_cost_savings"] = 0.5

# PPV: typically negative or smaller is better → invert sign then normalize
if "PPV" in df.columns:
    df["score_PPV"] = safe_minmax(-df["PPV"])
else:
    df["score_PPV"] = 0.5

# Quality / Delivery metrics
for col in score_cols_qd:
    if col in df.columns:
        df[f"score_{col}"] = safe_minmax(df[col])
    else:
        df[f"score_{col}"] = 0.5

# ESG: carbon_emission_intensity (lower better) + others (higher better)
if col_carbon in df.columns:
    df["score_carbon"] = safe_minmax(-df[col_carbon])  # invert
else:
    df["score_carbon"] = 0.5

for col in score_cols_esg_hi:
    if col in df.columns:
        df[f"score_{col}"] = safe_minmax(df[col])
    else:
        df[f"score_{col}"] = 0.5

# --------------------------------------------------
# 7. Combine into composite score (all >= 0)
# --------------------------------------------------
# You can adjust these weights to match your original tuned logic
weight_psl   = 0.5
weight_cost  = 0.1
weight_qd    = 0.3
weight_esg   = 0.1

# Cost block: average of cost_savings + PPV scores
df["score_cost_block"] = (df["score_cost_savings"] + df["score_PPV"]) / 2.0

# QD block
df["score_qd_block"] = (
    df["score_QP"] + df["score_QR"] + df["score_lead_time_attainment"]
) / 3.0

# ESG block
df["score_esg_block"] = (
    df["score_carbon"]
    + df["score_renewable_energy_usage"]
    + df["score_plastic_recycle"]
    + df["score_human_rights_compliance_score"]
) / 4.0

# Final composite (always non-negative because all components are in [0,1] and weights ≥ 0)
df["composite_score"] = (
    weight_psl  * df["PSL_score"]
    + weight_cost * df["score_cost_block"]
    + weight_qd   * df["score_qd_block"]
    + weight_esg  * df["score_esg_block"]
)

# Sanity check
if (df["composite_score"] < 0).any():
    print("\nWARNING: Composite score has negatives. Check logic.")
else:
    print("\nComposite scores look OK (no negatives).")

# --------------------------------------------------
# 8. Convert composite scores → allocation % per fiscal_year
# --------------------------------------------------
if "fiscal_year" not in df.columns:
    raise ValueError("Expected column 'fiscal_year' in historical_features_with_psl.csv")

def to_allocation(group):
    total = group["composite_score"].sum()
    if total <= 0:
        # Fallback: equal split if something unexpected happens
        n = len(group)
        return pd.Series([100.0 / n] * n, index=group.index)
    raw = group["composite_score"] / total * 100.0
    # Round to nearest 5%
    rounded = np.round(raw / 5.0) * 5.0
    # Fix rounding drift to ensure exact 100%
    diff = 100.0 - rounded.sum()
    if abs(diff) >= 2.5:
        # Adjust the largest supplier by the residual
        idx_max = rounded.idxmax()
        rounded.loc[idx_max] += diff
    return rounded

df["allocation_percent"] = (
    df.groupby("fiscal_year", group_keys=False)
      .apply(to_allocation)
)

# Clamp to [0,100] for safety
df["allocation_percent"] = df["allocation_percent"].clip(lower=0, upper=100)

# Validate per-year sum
check = df.groupby("fiscal_year")["allocation_percent"].sum()
print("\nAllocation % sum by fiscal_year:")
print(check)

# --------------------------------------------------
# 9. Save output
# --------------------------------------------------
df.to_csv(output_path, index=False)
print("\nSaved historical features with allocation →", output_path)


Project root: /Users/rambavisetty/anaconda_projects/capstone
Input file  : /Users/rambavisetty/anaconda_projects/capstone/data_processed/historical_features_with_psl.csv
Output file : /Users/rambavisetty/anaconda_projects/capstone/data_processed/historical_features_with_allocation.csv

Loaded rows: 30
Columns: ['supplier', 'fiscal_year', 'revenue', 'COGS', 'gross_margin_pct', 'cash_flow', 'debt_equity_ratio', 'cost_savings', 'PPV', 'QP', 'QR', 'lead_time_attainment', 'carbon_emission_intensity', 'renewable_energy_usage', 'plastic_recycle', 'human_rights_compliance_score', 'node_parity', 'DDR_gen_support', 'geo_risk', 'tariff_risk', 'chip_shortage_impact', 'supplier_code', 'PSL_cluster', 'PSL_status']

Composite scores look OK (no negatives).

Allocation % sum by fiscal_year:
fiscal_year
2015    100.0
2016    100.0
2017    100.0
2018    100.0
2019    100.0
2020    100.0
2021    100.0
2022    100.0
2023    100.0
2024    100.0
Name: allocation_percent, dtype: float64

Saved historical fea

  .apply(to_allocation)
