# In [None]:
# ==========================================================
# SALES DATA CLUSTERING USING K-MEANS & HIERARCHICAL CLUSTERING
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# ----------------------------------------------------------
# 1. Load Dataset
# ----------------------------------------------------------
# Location of the input CSV; the file is decoded as Latin-1.
DATA_PATH = r"C:\Users\Radha\Downloads\sales_data_sample.csv"

# Read the file, keep only the numeric columns, discard every row that
# has a missing value, and finish with an explicit copy so `df` is an
# independent frame.
df = (
    pd.read_csv(DATA_PATH, encoding="latin1")
    .select_dtypes(include=['number'])
    .dropna()
    .copy()
)

# ----------------------------------------------------------
# 2. Outlier Removal (IQR method)
# ----------------------------------------------------------
# Per-column quartiles and inter-quartile range.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies strictly outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its own column.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is dropped as soon as any one of its columns holds an outlier.
# NOTE(review): every numeric column takes part in this filter, including
# any identifier-like columns the CSV may contain - confirm that is intended.
is_outlier_row = ((df < lower_bound) | (df > upper_bound)).any(axis=1)

# Boolean-mask selection returns a frame that pandas may flag as derived
# from the original; adding the cluster-label columns to it later would then
# emit SettingWithCopyWarning. The explicit copy makes `df` a standalone frame.
df = df[~is_outlier_row].copy()

# ----------------------------------------------------------
# 3. Data Scaling
# ----------------------------------------------------------
# Learn the scaling parameters from df, then apply them to the same frame.
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

# ----------------------------------------------------------
# 4. Determine Optimal K using Elbow Method
# ----------------------------------------------------------
# Candidate cluster counts: 1 through 10.
K = range(1, 11)

# Fit one K-Means model per candidate and keep its inertia_ value,
# collected here as the WCSS curve.
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    .fit(scaled_data)
    .inertia_
    for n_clusters in K
]

# Plot WCSS against the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(7,5))
elbow_ax.plot(K, wcss, marker='o')
elbow_ax.set_title("Elbow Method to Determine Optimal K")
elbow_ax.set_xlabel("Number of Clusters (K)")
elbow_ax.set_ylabel("WCSS")
elbow_ax.grid(True)
plt.show()

# ----------------------------------------------------------
# 5. Apply K-Means (Choose K from Elbow curve, e.g., 3)
# ----------------------------------------------------------
# Number of clusters used for the final model.
k = 3

# Fit the final model on the scaled features and read back the label
# assigned to each row.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
clusters = kmeans.labels_

# Store the labels next to the (unscaled) features.
df['KMeans_Cluster'] = clusters

# ----------------------------------------------------------
# 6. Visualize K-Means Clusters using PCA
# ----------------------------------------------------------
# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
first_component = pca_data[:, 0]
second_component = pca_data[:, 1]

# Scatter the projected rows, coloured by their K-Means label.
pca_fig, pca_ax = plt.subplots(figsize=(7,5))
pca_ax.scatter(first_component, second_component, c=clusters, cmap='viridis', s=50)
pca_ax.set_title("K-Means Clustering Visualization (2D PCA)")
pca_ax.set_xlabel("Principal Component 1")
pca_ax.set_ylabel("Principal Component 2")
pca_ax.grid(True)
plt.show()

# ----------------------------------------------------------
# 7. Hierarchical Clustering
# ----------------------------------------------------------
# Build the linkage matrix from the scaled features using Ward's method.
linked = linkage(scaled_data, 'ward')

# Draw the dendrogram; with no axes passed, it is drawn on the current
# axes, which is the one created just above it.
dendro_fig, dendro_ax = plt.subplots(figsize=(10,5))
dendrogram(linked)
dendro_ax.set_title("Hierarchical Clustering Dendrogram")
dendro_ax.set_xlabel("Samples")
dendro_ax.set_ylabel("Distance")
plt.show()

# Cut dendrogram to form clusters (e.g., 3): with criterion='maxclust',
# t is the maximum number of flat clusters to produce.
h_clusters = fcluster(linked, 3, criterion='maxclust')
df['Hierarchical_Cluster'] = h_clusters

# Show the first rows together with the label columns added so far.
print("\nClustered Data Sample:", df.head(), sep="\n")
