
# Task 3: Clustering Analysis – Customer Segmentation

This notebook will:
1. Load `customer_data.csv` (if present) **or** generate a realistic sample dataset with the required columns.
2. Inspect and preprocess (scaling).
3. Find optimal **K** using **Elbow (WCSS)** and **Silhouette** methods.
4. Fit **K-Means**, assign cluster labels.
5. Visualize:
   - Elbow plot (WCSS vs K)  
   - Silhouette vs K  
   - 2D PCA scatter with centroids
6. Export the labeled dataset to `/mnt/data/customer_data_clustered.csv`

> **Expected columns** for your own data: `Customer ID`, `Age`, `Annual Income`, `Spending Score`.


In [None]:

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# I/O locations
# The input CSV is resolved against the current working directory; the labeled
# output is written to a fixed absolute location.
DATA_PATH = Path('customer_data.csv')
OUTPUT_PATH = Path('/mnt/data/customer_data_clustered.csv')

# Display options: never truncate columns when a DataFrame is rendered
pd.options.display.max_columns = None


In [None]:

# Load or generate dataset
# Reads DATA_PATH when it exists; otherwise synthesises a three-segment sample,
# persists it to DATA_PATH, and uses that. Either way the result is bound to `df`.

# Columns the later cells read from `df`.
REQUIRED_COLUMNS = ['Customer ID', 'Age', 'Annual Income', 'Spending Score']


def _draw_three_segments(rng, n, segments, lower, upper):
    """Draw n values from three normal distributions and clip them to [lower, upper].

    Parameters
    ----------
    rng : numpy random generator exposing ``normal(loc, scale, size)``.
    n : total number of values to produce.
    segments : three ``(mean, std)`` pairs, one per customer segment.
    lower, upper : clipping bounds applied to the concatenated draws.

    The first two segments receive ``n // 3`` values each and the last one takes
    the remainder, so the result always has exactly ``n`` entries. Segments are
    drawn in the order given, which fixes the sequence of values for a seeded rng.
    """
    sizes = (n // 3, n // 3, n - 2 * (n // 3))
    draws = [rng.normal(mean, std, size) for (mean, std), size in zip(segments, sizes)]
    return np.concatenate(draws).clip(lower, upper)


if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)

    # Fail fast with an actionable message instead of an opaque KeyError several
    # cells later when the file does not follow the expected schema.
    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_cols:
        raise ValueError(
            f'{DATA_PATH} is missing required column(s): {missing_cols}. '
            f'Found columns: {list(df.columns)}'
        )
    if df.empty:
        raise ValueError(f'{DATA_PATH} contains a header but no data rows.')

    # NOTE: a sample generated by a previous run is also saved at DATA_PATH, so
    # this branch is taken on re-runs even when no real data was ever supplied.
    print('[INFO] Loaded your customer_data.csv')
else:
    print('[WARN] customer_data.csv not found. Generating a realistic sample dataset...')
    # Seeded legacy RandomState, with draws kept in the order ages -> incomes ->
    # spend, so the generated sample is reproducible run to run.
    rng = np.random.RandomState(42)
    n = 240

    ages = _draw_three_segments(rng, n, [(24, 3), (36, 4), (52, 6)], 18, 70)
    incomes = _draw_three_segments(
        rng, n, [(30_000, 6_000), (60_000, 8_000), (100_000, 12_000)], 10_000, 200_000
    )
    spend = _draw_three_segments(rng, n, [(40, 10), (60, 12), (75, 10)], 1, 100)

    df = pd.DataFrame({
        'Customer ID': np.arange(1, n + 1),
        'Age': ages.round().astype(int),
        'Annual Income': incomes.round().astype(int),
        'Spending Score': spend.round().astype(int),
    })
    # Persist the sample so later runs (and other tools) see the same file.
    df.to_csv(DATA_PATH, index=False)
    print(f'[INFO] Sample dataset saved to: {DATA_PATH.resolve()}')

df.head()


In [None]:

# Structural health check of `df`: size, gaps, duplicate rows, dtypes, summary stats.
missing_by_column = df.isna().sum().to_dict()
duplicate_rows = int(df.duplicated().sum())

print('--- Dataset Inspection ---')
print('Shape:', df.shape)
print('Missing values per column:', missing_by_column)
print('Duplicates:', duplicate_rows)
# Heading and dtype listing go out on consecutive lines.
print('\nDtypes:', df.dtypes, sep='\n')
print('\nDescribe (numeric):')
numeric_summary = df.describe()
display(numeric_summary)


In [None]:

# Preprocessing
# Clustering uses these three columns only; 'Customer ID' is not part of the feature set.
feature_cols = ['Age', 'Annual Income', 'Spending Score']
X = df[feature_cols].copy()

# Standardise each feature to zero mean / unit variance so that no single
# column dominates K-Means' Euclidean distances purely because of its units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow (WCSS) + Silhouette
# Sweep K from 2 up to 10, but never beyond n_samples - 1: KMeans needs
# n_clusters <= n_samples and silhouette_score needs n_labels <= n_samples - 1,
# so an uncapped sweep crashes on small datasets.
n_samples = X_scaled.shape[0]
max_k = min(10, n_samples - 1)
if max_k < 2:
    raise ValueError(
        f'Need at least 3 rows to evaluate K >= 2 with the silhouette score; got {n_samples}.'
    )

k_values = list(range(2, max_k + 1))
wcss = []        # KMeans inertia_ (within-cluster sum of squares) for each K
sil_scores = []  # silhouette score of the scaled data for each K

for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = km.fit_predict(X_scaled)
    wcss.append(km.inertia_)
    sil_scores.append(silhouette_score(X_scaled, labels))


def _plot_metric_vs_k(ks, values, title, ylabel):
    """Draw and show a line plot of one model-selection metric against K."""
    fig, ax = plt.subplots()
    ax.plot(ks, values, marker='o')
    ax.set_title(title)
    ax.set_xlabel('Number of clusters (K)')
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# Plot: Elbow
_plot_metric_vs_k(
    k_values, wcss,
    'Elbow Method (WCSS vs K)',
    'Within-Cluster Sum of Squares (WCSS)',
)

# Plot: Silhouette
_plot_metric_vs_k(k_values, sil_scores, 'Silhouette Score vs K', 'Silhouette Score')

# The K with the highest silhouette score is used for the final model
# (np.argmax returns the first maximum, i.e. the smallest K on ties).
best_k = k_values[int(np.argmax(sil_scores))]
print(f'[INFO] Best K by silhouette: {best_k} (score={max(sil_scores):.3f})')


In [None]:

# Final model: refit K-Means at the selected K and attach the labels to `df`.
final_km = KMeans(n_clusters=best_k, n_init=20, random_state=42)
cluster_labels = final_km.fit_predict(X_scaled)
df['Cluster'] = cluster_labels

# Project the scaled features, and the fitted centroids, onto the same two
# principal components so they can be drawn on one 2D scatter.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
centroids_pca = pca.transform(final_km.cluster_centers_)

fig, ax = plt.subplots()
for cluster_id in range(best_k):
    # Boolean row selector for the members of this cluster.
    in_cluster = (df['Cluster'] == cluster_id).to_numpy()
    members = X_pca[in_cluster]
    ax.scatter(members[:, 0], members[:, 1], s=20, label=f'Cluster {cluster_id}')

# Centroids drawn last so the large 'X' markers sit on top of the points.
ax.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='X', s=200, label='Centroids')
ax.set_title(f'PCA 2D Scatter of Clusters (K={best_k})')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

df.head()


In [None]:

# Export labeled dataset
# Create the target directory if needed, then write `df` (including 'Cluster').
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)
print(f'[INFO] Clustered dataset saved to: {OUTPUT_PATH}')

# Cluster summary table
# Mean and median of every feature per cluster, flattened to single-level
# column names such as 'Age mean', plus the number of rows in each cluster.
by_cluster = df.groupby('Cluster')
cluster_summary = by_cluster[feature_cols].agg(['mean', 'median'])
cluster_summary.columns = [f'{feature} {stat}' for feature, stat in cluster_summary.columns]
# size() counts rows; counting 'Customer ID' would skip rows whose ID is missing
# and under-report the cluster size.
cluster_summary['Size'] = by_cluster.size()
display(cluster_summary)

# Quick recommendations
# Per-cluster feature means, rounded to 2 decimals, drive the heuristics below.
means = by_cluster[feature_cols].mean().round(2)
order_by_spend = means.sort_values('Spending Score', ascending=False).index.tolist()

print('\n--- Quick Recommendations Heuristics ---')
if order_by_spend:
    top = order_by_spend[0]
    print(f'- Cluster {top} shows the highest average Spending Score; consider loyalty/premium focus.')
# Only report a "lowest" cluster when it is distinct from the "highest" one.
if len(order_by_spend) > 1:
    low = order_by_spend[-1]
    print(f'- Cluster {low} shows the lowest Spending Score; try promos and onboarding journeys.')

rich = means['Annual Income'].idxmax()
print(f'- Cluster {rich} has the highest Annual Income; upsell premium/high-margin items.')

young = means['Age'].idxmin()
senior = means['Age'].idxmax()
print(f'- Cluster {young} skews younger; emphasize trendy/entry products and social campaigns.')
print(f'- Cluster {senior} skews older; highlight reliability, warranties, and value bundles.')
