In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Load the mall-customer dataset from the notebook's working directory.
df = pd.read_csv('Mall_Customers.csv')

# Preview the first few rows.
print("First 5 rows of the dataset:")
print(df.head())

# Show the concise summary of the DataFrame.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare: wrapping it in print() appends a stray "None" line.
print("\nConcise summary of the dataset:")
df.info()

First 5 rows of the dataset:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

Concise summary of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              99 non-null     int64 
 1   Gender                  99 non-null     object
 2   Age                     99 non-null     int64 
 3   Annual Income (k$)      99 non-null     int64 
 4   Spending Score (1-100)  99 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 4.0+ KB

In [6]:
import pandas as pd

# Load the mall-customer dataset into `df_mall`, the frame the clustering
# cells below select their features from.
df_mall = pd.read_csv('Mall_Customers.csv')

# Preview the first few rows.
print("First 5 rows of the dataset:")
print(df_mall.head())

# Show the concise summary of the DataFrame.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare: wrapping it in print() appends a stray "None" line.
print("\nConcise summary of the dataset:")
df_mall.info()

First 5 rows of the dataset:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

Concise summary of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              99 non-null     int64 
 1   Gender                  99 non-null     object
 2   Age                     99 non-null     int64 
 3   Annual Income (k$)      99 non-null     int64 
 4   Spending Score (1-100)  99 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 4.0+ KB

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Select the two features used for clustering.
X = df_mall[['Annual Income (k$)', 'Spending Score (1-100)']]

# Standardise the features (zero mean, unit variance) before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate cluster counts, defined once so that the fitting loop and the
# plot's x-axis always describe the same values of K.
k_values = range(1, 11)

# Elbow method: record the within-cluster sum of squares (WCSS), exposed by
# scikit-learn as `inertia_`, for every candidate K.
wcss = []
for i in k_values:
    # n_init=10 runs K-Means from 10 different initialisations per K;
    # random_state pins the seed so the curve is reproducible.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against K on an explicit figure/axes pair rather than through the
# implicit pyplot "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('WCSS')
ax.grid(True)
fig.savefig('elbow_method.png')
# Close this specific figure: it is only written to disk, not shown inline.
plt.close(fig)

print("Elbow method plot saved as 'elbow_method.png'. Please examine the plot to determine the optimal K.")

Elbow method plot saved as 'elbow_method.png'. Please examine the plot to determine the optimal K.


In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd # Re-import to ensure df_mall is accessible if running from scratch

# Select the features for clustering. The two columns are named explicitly,
# so re-running this cell after 'Cluster' has been added to df_mall still
# yields the same feature matrix.
X = df_mall[['Annual Income (k$)', 'Spending Score (1-100)']]

# Scale the features (re-fitted here so `scaler` matches this cell's X).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optimal K determined from the Elbow Method plot is 5
optimal_k = 5

# Fit K-Means with the chosen K and get one cluster label per row.
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add the cluster labels to the original DataFrame for visualization and further analysis
df_mall['Cluster'] = cluster_labels

# The model was fitted on scaled data, so its centroids are in scaled space.
# Map them back to the original units once and reuse the result for both axes.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

# 4. Visualize clusters with color-coding.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df_mall['Annual Income (k$)'],
    df_mall['Spending Score (1-100)'],
    c=df_mall['Cluster'],
    cmap='viridis',
    s=100,
    alpha=0.8,
    edgecolors='w',
)
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    s=300,
    c='red',
    marker='X',
    label='Centroids',
    edgecolors='k',
)
ax.set_title(f'Customer Segments (K={optimal_k})')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.legend()
ax.grid(True)
# One colorbar tick per integer cluster label.
fig.colorbar(scatter, ax=ax, ticks=range(optimal_k), label='Cluster Label')
fig.savefig('kmeans_clusters.png')
# Close this specific figure: it is only written to disk, not shown inline.
plt.close(fig)

print(f"Clusters visualization saved as 'kmeans_clusters.png' for K={optimal_k}.")

# 5. Evaluate clustering using Silhouette Score.
# Computed on the scaled features, i.e. the same space the model was fitted in.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f"\nSilhouette Score for K={optimal_k}: {silhouette_avg:.4f}")

Clusters visualization saved as 'kmeans_clusters.png' for K=5.

Silhouette Score for K=5: 0.4642
