In [None]:
# ========== [1] Kaggle Dataset Setup ==========
# Run this block only once (Google Colab specific)
# Expects kaggle.json (the Kaggle API credentials file) in the current working
# directory, e.g. uploaded to the session beforehand.
# 1) create the per-user config directory (-p: no error if it already exists)
!mkdir -p ~/.kaggle
# 2) copy the credentials file into that directory
!cp kaggle.json ~/.kaggle/
# 3) restrict the file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI; the unzip step that follows
# expects customer-segmentation-tutorial-in-python.zip in the working directory.
!kaggle datasets download vjchoudhary7/customer-segmentation-tutorial-in-python

In [None]:
!unzip customer-segmentation-tutorial-in-python.zip

In [None]:
# ========== [2] Import Required Libraries ==========
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans

In [None]:
# Global plot styling: seaborn "whitegrid" theme and a 10x6-inch default figure size
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# ========== [3] Load and Preprocess Data ==========
# Map the CSV's original headers to short, lowercase names used in the rest of the notebook
column_aliases = {
    'CustomerID': 'id',
    'Gender': 'gender',
    'Age': 'age',
    'Annual Income (k$)': 'income',
    'Spending Score (1-100)': 'score',
}

# Read the raw file and apply the renaming in one chained expression
df = pd.read_csv("Mall_Customers.csv").rename(columns=column_aliases)

In [None]:
# Basic inspection: dimensions, per-column null counts, and fully duplicated rows
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("Dataset Shape:", df.shape)
print("Missing Values:\n", missing_per_column)
print("Duplicate Rows:", duplicate_row_count)

In [None]:
# ========== [4] Data Visualization ==========
# Scatter-matrix of the three numeric features to eyeball their pairwise relationships
numeric_features = ['age', 'income', 'score']
sns.pairplot(df[numeric_features])
# y slightly above 1 places the title above the top row of panels
plt.suptitle("Pairwise Relationship Between Features", y=1.02)
plt.show()

In [None]:
# ========== [5] Determine Optimal Clusters (Elbow Method) ==========
# Fit K-Means for k = 1..10 on the two clustering features and record each
# fit's within-cluster sum of squares (inertia_) for the elbow plot.
wcss = []
X = df[['income', 'score']]

for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto', so leaving it unset makes the WCSS values depend on the installed
    # version (and emits a FutureWarning on some releases).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)


In [None]:
# Plot Elbow Curve
# wcss holds one value per k, starting at k = 1. Deriving the x-axis from its
# length keeps the plot in sync with the fitting loop instead of repeating the
# loop's range here (a mismatch would make plt.plot raise on unequal lengths).
k_values = range(1, len(wcss) + 1)

plt.plot(k_values, wcss, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
plt.xticks(k_values)
plt.grid(True)
plt.show()

In [None]:
# ========== [6] Apply K-Means Clustering ==========
# Number of clusters used for the final model (presumably read off the elbow curve).
optimal_k = 5
# n_init is pinned explicitly so the fit does not depend on scikit-learn's
# version-specific default (10 in older releases, 'auto' in newer ones);
# random_state fixes the centroid initialisation for reproducibility.
kmeans_model = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans_model.fit(X)

In [None]:
# Attach each customer's fitted cluster id to the customer table as column 'cluster'
df = df.assign(cluster=kmeans_model.labels_)

In [None]:
# Cluster centers
# One row per cluster; columns follow the feature order the model was fitted on
# (X = df[['income', 'score']]), i.e. column 0 is income and column 1 is score.
centroids = kmeans_model.cluster_centers_
print("Cluster Centers:\n", centroids)


In [None]:
# ========== [7] Cluster Visualization ==========
# Customers coloured by cluster, with the fitted centroids overlaid.
# Centroid columns follow the fit order (income, score), so they are split out
# by name before being placed on the score (x) / income (y) axes.
centroid_income = centroids[:, 0]
centroid_score = centroids[:, 1]

ax = sns.scatterplot(x='score', y='income', hue='cluster', data=df, s=100, palette='Set2')
ax.scatter(centroid_score, centroid_income, marker='X', s=200, c='red', label='Centroids')
ax.set_title("Customer Segmentation (K-Means Clustering)")
ax.set_xlabel("Spending Score (1-100)")
ax.set_ylabel("Annual Income (k$)")
ax.legend()
plt.show()

In [None]:
# ========== [8] Cluster Distribution ==========
# Tally customers per cluster and list the tallies in ascending cluster-id order
cluster_counts = df['cluster'].value_counts(sort=False).sort_index()
print("Number of customers in each cluster:\n", cluster_counts)

In [None]:
# Bar chart of customers per cluster, split into one bar per gender
ax = sns.countplot(x='cluster', hue='gender', data=df, palette='coolwarm')
ax.set_title("Gender Distribution Across Clusters")
ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Customers")
ax.legend(title="Gender")
plt.show()

In [None]:
# Export the customer table, including the 'cluster' label column, to CSV;
# index=False leaves the DataFrame index out of the file.
df.to_csv("clustered_customers.csv", index=False)

In [None]:
# Persist the fitted K-Means model to disk so it can be restored later with joblib.load.
# NOTE: joblib files are pickle-based -- only load model files from trusted sources,
# since unpickling can execute arbitrary code.
joblib.dump(kmeans_model, "kmeans_model.pkl")