In [5]:
# ============================================
#  Spending Pattern Analysis with K-Means
# ============================================

import warnings

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

In [14]:
# Source file for the customer spending data. The path is relative, so it is
# resolved against the notebook's working directory.
CSV_PATH = "spending_l9_dataset.csv"

# Read the whole CSV into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

print("=== DATA SNAPSHOT ===")
df.head(n=5)  # first five rows, rendered as the cell's output

=== DATA SNAPSHOT ===


Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region
0,1,28,33,78,14,9,Female,East
1,2,21,25,87,8,23,Male,North
2,3,23,24,88,13,10,Male,South
3,4,24,25,73,16,11,Female,West
4,5,20,23,88,17,16,Male,West


In [22]:
# Columns used as clustering inputs.
FEATURES = ["Income_$", "SpendingScore"]

# Work on a copy so imputation never touches the loaded frame.
X = df[FEATURES].copy()

# Fill missing values with each column's own median in one vectorized call;
# columns without gaps are left unchanged.
X = X.fillna(X.median())

# A column with no observed values has a NaN median and therefore cannot be
# imputed. Stop here with a clear message instead of letting NaNs flow into
# the scaling and clustering steps.
still_missing = X.columns[X.isna().any()].tolist()
assert not still_missing, f"Cannot impute all-missing feature(s): {still_missing}"

print("=== Featuess Head ===")
X.head()

=== Featuess Head ===


Unnamed: 0,Income_$,SpendingScore
0,33,78
1,25,87
2,24,88
3,25,73
4,23,88


In [29]:
# Standardize every feature column so both inputs contribute on a comparable
# scale to the distance computations used by K-Means.
# The fitted scaler is kept: it is needed later to map cluster centers back
# to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Scaled Shape:", X_scaled.shape)

Scaled Shape: (200, 2)


In [35]:
# NOTE(review): this silences every warning for the rest of the kernel
# session, not only for this cell. Presumably it targets KMeans runtime
# notices; confirm, and narrow it to a specific category if possible.
warnings.filterwarnings("ignore")

print("=== ELBOW Method (SSE per k) ===")

# KMeans cannot form more clusters than there are samples, so cap the sweep
# at the number of rows (still 10 whenever the data has at least 10 rows).
K_MAX = min(10, X_scaled.shape[0])

# Within-cluster sum of squares (inertia) for k = 1..K_MAX, in sweep order.
sse_values = []

for k in range(1, K_MAX + 1):
    # Fixed seed so the sweep is reproducible from a fresh kernel.
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    km.fit(X_scaled)
    sse = km.inertia_
    sse_values.append(sse)
    print(f"k={k} → SSE={sse:.2f}")

=== ELBOW Method (SSE per k) ===
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [40]:
# Number of clusters for the final model. The SSE sweep printed by the elbow
# step drops sharply up to k=4 and changes little afterwards.
K = 4

# Same seed and n_init setting as the elbow sweep. Fitting stores the per-row
# cluster assignment on the estimator as `labels_`.
kmeans = KMeans(n_clusters=K, random_state=42, n_init="auto").fit(X_scaled)
labels = kmeans.labels_

# Attach the assignment to the full customer table as an integer column.
df["Cluster"] = labels.astype(int)

df.head()

Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region,Cluster
0,1,28,33,78,14,9,Female,East,2
1,2,21,25,87,8,23,Male,North,2
2,3,23,24,88,13,10,Male,South,2
3,4,24,25,73,16,11,Female,West,2
4,5,20,23,88,17,16,Male,West,2


In [44]:
# Internal validation of the clustering on the standardized features.
# Silhouette: higher means better-separated clusters.
# Davies-Bouldin: lower means better-separated clusters.
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)

# Emit the report line by line, each score formatted to three decimals.
report_lines = (
    "=== metrcs ===",
    f"Silhouette Score : {sil:.3f}",
    f"Davies-Bouldin   : {dbi:.3f}",
)
for report_line in report_lines:
    print(report_line)

=== metrcs ===
Silhouette Score : 0.729
Davies-Bouldin   : 0.387


In [47]:
# The fitted centroids live in standardized space; undo the scaling so they
# can be read in the same units as the input columns.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the row index is named "Cluster".
centers_df = pd.DataFrame(data=centers_original, columns=FEATURES).rename_axis(index="Cluster")

print("=== CLUSTER CENTERS (Original Units) ===")
centers_df.round(decimals=2)

=== CLUSTER CENTERS (Original Units) ===


Unnamed: 0_level_0,Income_$,SpendingScore
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,56.32,53.58
1,28.92,19.6
2,24.14,83.1
3,99.16,79.24


In [60]:
# Row labels of three customers to spot-check. These are DataFrame index
# labels, not CustomerID values.
sample_idx = [0, 100, 150]

# Show only the clustering inputs next to the assigned cluster.
sanity_cols = [*FEATURES, "Cluster"]
sanity = df.loc[sample_idx, sanity_cols]

print("=== SANITY CHECK (3 Customers) ===")
sanity

=== SANITY CHECK (3 Customers) ===


Unnamed: 0,Income_$,SpendingScore,Cluster
0,33,78,2
100,32,20,1
150,100,76,3


In [61]:
# Destination for the customer table, which now carries the Cluster column.
OUT_PATH = "spending_labeled_clusters.csv"

# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=OUT_PATH, index=False)

print("Saved clustered dataset →", OUT_PATH)

Saved clustered dataset → spending_labeled_clusters.csv
