# Q5: K-Means Clustering — Customer Segmentation
Dataset: `kmeans_customers.csv`

In [None]:
# Common imports used across notebooks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide configuration.
RANDOM_STATE = 42  # shared seed, passed as `random_state` to the estimators in later cells
sns.set()  # switch matplotlib figures to seaborn's default theme


In [None]:
# Read the customer table from disk and preview its first rows.
# NOTE(review): this is an absolute, environment-specific path; confirm it
# exists wherever the notebook is run.
CUSTOMERS_CSV = '/mnt/data/aiml/kmeans_customers.csv'
df = pd.read_csv(CUSTOMERS_CSV)
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Clustering inputs: numeric columns only. Per the original note,
# `preferred_channel` is left out here (it would have to be encoded separately).
features = ['annual_spend','monthly_visits','avg_transaction_value','loyalty_years']

# Fill missing values with each column's median.
feature_frame = df[features]
X = feature_frame.fillna(feature_frame.median())

# Standardize every feature; the fitted scaler is kept so that later cells can
# map values back to the original units.
scaler = StandardScaler().fit(X)
Xs = scaler.transform(X)


In [None]:
# Model selection: scan candidate cluster counts and record, for each k,
# the K-Means inertia (within-cluster sum of squared distances) and the
# mean silhouette score of the resulting labelling.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

K = range(2, 11)  # the scan starts at 2 because the silhouette score needs at least two clusters
inertia = []
sil = []
for k in K:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10).fit(Xs)
    inertia.append(km.inertia_)
    sil.append(silhouette_score(Xs, km.labels_))

# Side-by-side diagnostics on explicit axes so each panel carries its own
# labels and can be read on its own.
fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(12, 4))

ax_elbow.plot(K, inertia, 'o-')
ax_elbow.set(title='Elbow', xlabel='k', ylabel='Inertia')

ax_sil.plot(K, sil, 'o-')
ax_sil.set(title='Silhouette', xlabel='k', ylabel='Mean silhouette score')

# k is an integer: put one tick on every evaluated value.
for ax in (ax_elbow, ax_sil):
    ax.set_xticks(list(K))

fig.tight_layout()
plt.show()

### Interpretation / Discussion

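- **Briefly interpret the above results here.** State which `k` the elbow plot (the point where inertia stops dropping sharply) and the silhouette plot (higher scores indicate better-separated clusters) suggest, note any issues (e.g., no clear elbow, the two criteria disagreeing, outliers distorting the clusters), and recommend next steps (feature engineering, encoding `preferred_channel`, trying other values of `k` or a different clustering algorithm). Keep all outputs and interpretations visible inline — do **not** save files externally.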

In [None]:
# Final model: refit K-Means with the cluster count read off the
# elbow/silhouette plots (k=3 is the default choice).
k = 3
km = KMeans(n_clusters=k, n_init=20, random_state=RANDOM_STATE)
km.fit(Xs)

# Store each customer's cluster assignment on the DataFrame.
df['cluster'] = km.labels_

# Centroids live in standardized space; undo the scaling so they are shown
# in the original feature units.
centers = scaler.inverse_transform(km.cluster_centers_)
centers_table = pd.DataFrame(centers, columns=features)
display(centers_table)


In [None]:
# Visual check of the clustering: project the standardized features onto
# their first two principal components and colour points by cluster label.
from sklearn.decomposition import PCA

pca = PCA(n_components=2, random_state=RANDOM_STATE)
pc = pca.fit_transform(Xs)

# Share of total variance captured by each plotted component, shown in the
# axis labels so the reader knows how much structure the 2-D view retains.
var_pct = pca.explained_variance_ratio_ * 100

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=pc[:, 0], y=pc[:, 1], hue=df['cluster'], palette='tab10', ax=ax)
ax.set(
    title='KMeans clusters (PCA)',
    xlabel=f'PC1 ({var_pct[0]:.1f}% variance)',
    ylabel=f'PC2 ({var_pct[1]:.1f}% variance)',
)
plt.show()