In [1]:
# CELL 1: Setup
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import plotly.express as px

# CELL 2: Territory-level features
# Load the cleaned sales extract and normalise every column name to lower case.
df = pd.read_csv("Data1/cleaned_sales.csv", parse_dates=["orderdate"])
df.columns = df.columns.map(str.lower)

# One row per territory: total sales, total estimated profit, number of
# distinct orders, and mean discount percentage.
territory_aggs = {
    "sales": pd.NamedAgg(column="sales", aggfunc="sum"),
    "profit_est": pd.NamedAgg(column="profit_est", aggfunc="sum"),
    "orders": pd.NamedAgg(column="ordernumber", aggfunc="nunique"),
    "avg_discount": pd.NamedAgg(column="discount_pct", aggfunc="mean"),
}
by_terr = df.groupby("territory", as_index=False).agg(**territory_aggs)

# Margin = estimated profit / sales; territories without positive sales get 0
# rather than a division artefact.
has_sales = by_terr["sales"] > 0
raw_margin = by_terr["profit_est"] / by_terr["sales"]
by_terr["avg_margin"] = raw_margin.where(has_sales, 0)

# Feature matrix for clustering: missing values become 0, then each column is
# standardised so no single feature dominates the distance computation.
features = ["sales", "profit_est", "orders", "avg_discount", "avg_margin"]
scaler = StandardScaler()
X = by_terr[features].fillna(0).values
Xs = scaler.fit_transform(X)

# CELL 3: Choose k by silhouette
# A silhouette score needs between 2 and n_samples - 1 clusters, so at least
# three territories are required before any candidate k can be evaluated.
n_samples = len(by_terr)
MAX_K = 7  # largest cluster count to try
candidate_ks = range(2, min(MAX_K + 1, n_samples))  # limit k <= n_samples-1
if not candidate_ks:
    # Fail with an actionable message instead of the opaque
    # "max() arg is an empty sequence" raised when no k could be scored.
    raise ValueError(
        f"Need at least 3 territories to choose k by silhouette; got {n_samples}."
    )

sil = {}
models = {}
for k in candidate_ks:
    models[k] = KMeans(n_clusters=k, n_init=10, random_state=42).fit(Xs)
    sil[k] = silhouette_score(Xs, models[k].labels_)

# On ties, max() keeps the first (smallest) k encountered.
best_k = max(sil, key=sil.get)
print("Silhouette by k:", sil, "Best k:", best_k)

# Final model: reuse the estimator already fitted for best_k. It was built with
# the same n_init, random_state and data a refit would use.
km = models[best_k]
by_terr["cluster"] = km.labels_

# CELL 4: PCA for visualization
# Project the standardised features onto their first two principal components
# so the territories can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2, random_state=42).fit(Xs)
coords = pca.transform(Xs)
by_terr["pc1"], by_terr["pc2"] = coords[:, 0], coords[:, 1]

# Interpretation helper: personas
def label_persona(row, sales_median=None, margin_median=None):
    """Return a business persona for one territory row.

    The row is placed in one of four quadrants by comparing its ``sales`` and
    ``avg_margin`` against the corresponding medians.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide ``"sales"`` and ``"avg_margin"``.
    sales_median, margin_median : float, optional
        Precomputed thresholds. When omitted they are taken from the
        module-level ``by_terr`` frame, which keeps ``label_persona(row)``
        working exactly as before.

    Returns
    -------
    str
        One of the four persona labels.
    """
    if sales_median is None:
        sales_median = by_terr["sales"].median()
    if margin_median is None:
        margin_median = by_terr["avg_margin"].median()

    # Each comparison is spelled out (rather than negated) so that a NaN value
    # fails all of them and falls through to the final label.
    high_sales = row["sales"] > sales_median
    low_sales = row["sales"] <= sales_median
    healthy_margin = row["avg_margin"] >= margin_median
    thin_margin = row["avg_margin"] < margin_median

    if high_sales and thin_margin:
        return "High sales, low margin — Fix pricing"
    if high_sales and healthy_margin:
        return "High sales, healthy margin — Expand"
    if low_sales and healthy_margin:
        return "Low sales, healthy margin — Seed growth"
    return "Low sales, low margin — Avoid/repair"

# Compute the thresholds once instead of once per row inside the helper.
sales_median = by_terr["sales"].median()
margin_median = by_terr["avg_margin"].median()
by_terr["persona"] = by_terr.apply(
    label_persona,
    axis=1,
    sales_median=sales_median,
    margin_median=margin_median,
)

# Save results only after the persona column exists, so the exported file
# carries the full segmentation (cluster, PCA coordinates and persona).
by_terr.to_csv("segments_territory.csv", index=False)

# Preview results
print(by_terr[["territory","cluster","persona"]].sort_values(["cluster","territory"]).head(20))

# CELL 5: Interactive PCA scatter plot
# Cluster ids are labels, not quantities. Passing them as integers makes Plotly
# Express draw a continuous colour bar, so plot a copy with the ids as strings
# to get one discrete colour (and legend entry) per cluster.
plot_df = by_terr.assign(cluster=by_terr["cluster"].astype(str))
cluster_order = sorted(plot_df["cluster"].unique(), key=int)

# Share of total variance captured by each plotted component, shown on the axes
# so the reader can judge how faithful the 2-D projection is.
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

fig = px.scatter(
    plot_df,
    x="pc1", y="pc2",
    color="cluster",
    category_orders={"cluster": cluster_order},
    labels={
        "pc1": f"PC1 ({pc1_share:.1%} of variance)",
        "pc2": f"PC2 ({pc2_share:.1%} of variance)",
    },
    hover_data=["territory","sales","profit_est","avg_margin","persona"],
    title="Territory Segmentation (PCA Projection)"
)
fig.show()


Silhouette by k: {2: 0.14208372278352813} Best k: 2
  territory  cluster                                  persona
1      EMEA        0      High sales, healthy margin — Expand
0      APAC        1  Low sales, healthy margin — Seed growth
2     Japan        1     Low sales, low margin — Avoid/repair


In [2]:
import pandas as pd

# Recap table: one row per model with its purpose, how it is evaluated, and
# the business takeaway.
summary_columns = ["Model", "Purpose", "Evaluation Metric", "Business Insight"]
summary_rows = [
    (
        "Prophet (per Territory)",
        "12M sales forecast",
        "CV-MAPE / CV-RMSE",
        "Use yhat ± bands; pick predictable, growing territories for pilots",
    ),
    (
        "Random Forest Classifier",
        "Detect loss-leading SKUs",
        "ROC-AUC / F1 / Precision@K",
        "Top-K flagged SKUs explain most negative profit; reprice/delist",
    ),
    (
        "ElasticNet Regression",
        "Price elasticity on margin",
        "Holdout R²",
        "Every +10% discount changes margin by β; compute sweet-spot by line",
    ),
    (
        "K-Means Clustering",
        "Territory/customer segmentation",
        "Silhouette score",
        "Expand in high-sales/healthy-margin clusters; fix pricing elsewhere",
    ),
]
summary = pd.DataFrame(summary_rows, columns=summary_columns)
summary


Unnamed: 0,Model,Purpose,Evaluation Metric,Business Insight
0,Prophet (per Territory),12M sales forecast,CV-MAPE / CV-RMSE,"Use yhat ± bands; pick predictable, growing te..."
1,Random Forest Classifier,Detect loss-leading SKUs,ROC-AUC / F1 / Precision@K,Top-K flagged SKUs explain most negative profi...
2,ElasticNet Regression,Price elasticity on margin,Holdout R²,Every +10% discount changes margin by β; compu...
3,K-Means Clustering,Territory/customer segmentation,Silhouette score,Expand in high-sales/healthy-margin clusters; ...
