# Import Libraries & Load DataSet

In [1]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, adjusted_mutual_info_score
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', message="When grouping with a length-1 list-like")


# Load Dataset 

In [2]:
# Location of the Mall Customers CSV. Set the MALL_CUSTOMERS_CSV environment
# variable to run the notebook on another machine; the original author's path
# is kept as the default so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'MALL_CUSTOMERS_CSV',
    'C:/Users/admin/Desktop/Prodigy Infotech Internship 2024/Mall_Customers.csv',
)
df = pd.read_csv(DATA_PATH)

# DISPLAY DATASET 

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
# Preview the last rows of the frame as a quick check on the end of the file.
df.tail()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18
199,200,Male,30,137,83


# Data Set Shape

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(200, 5)

# CHECKING DTYPES 

In [6]:
# Per-column dtype and non-null count; Gender is the only non-numeric column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


# Age distribution

In [None]:
!pip install nbformat --upgrade

In [None]:
# Interactive histogram of customer ages (nbins=10).
fig = px.histogram(
    data_frame=df,
    x='Age',
    title='Distribution of Age',
    nbins=10,
)
fig.show()


# Annual Income distribution

In [None]:
# Interactive histogram of annual income in thousands of dollars (nbins=10).
fig = px.histogram(
    data_frame=df,
    x='Annual Income (k$)',
    title='Distribution of Annual Income',
    nbins=10,
)
fig.show()

# Spending Score distribution

In [None]:
# Interactive histogram of the 1-100 spending score (nbins=10).
fig = px.histogram(
    data_frame=df,
    x='Spending Score (1-100)',
    title='Distribution of Spending Score',
    nbins=10,
)
fig.show()

# Age vs Annual Income

In [None]:
# Age against annual income, coloured by gender; CustomerID is shown on hover.
fig = px.scatter(
    data_frame=df,
    x='Age',
    y='Annual Income (k$)',
    hover_data=['CustomerID'],
    color='Gender',
)
# The title is applied after construction, as in the rest of the scatter cells.
fig.update_layout(title='Age vs Annual Income')
fig.show()

# Annual Income vs Spending Score

In [None]:
# Annual income against spending score, coloured by gender; CustomerID is shown on hover.
# These are the two features later used for clustering.
fig = px.scatter(
    data_frame=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hover_data=['CustomerID'],
    color='Gender',
)
fig.update_layout(title='Annual Income vs Spending Score')
fig.show()

# Annual Income by Gender 

In [None]:
# Box plot of annual income per gender; points='all' overlays every observation.
fig = px.box(
    data_frame=df,
    x='Gender',
    y='Annual Income (k$)',
    title='Annual Income by Gender',
    points='all',
)
fig.show()

# Spending Score by Gender

In [None]:
# Box plot of spending score per gender; points='all' overlays every observation.
fig = px.box(
    data_frame=df,
    x='Gender',
    y='Spending Score (1-100)',
    title='Spending Score by Gender',
    points='all',
)
fig.show()

# Check for missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Convert Gender into categorical numerical values

In [None]:
# Replace the string Gender column with integer codes from LabelEncoder.
# This overwrites df['Gender'] in place, so plots above that colour by Gender
# will show the integer codes if re-run after this cell.
# NOTE(review): Gender is not among the features selected for clustering below,
# so this encoding does not influence the K-means result.
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Select features for clustering

In [None]:
# Cluster on annual income and spending score only (still in their original units here).
X = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the data

In [None]:
# Learn the per-feature scaling on X, then apply it to the same data
# so both features are on a comparable scale for K-means.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Determine the optimal number of clusters using the Elbow Method
### Optimal Number of Clusters

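The Elbow Method fits K-means for a range of cluster counts and plots the within-cluster sum of squares (WCSS) against the number of clusters. The "elbow" — the point after which adding more clusters no longer reduces WCSS sharply — suggests the optimal number of clusters.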

In [None]:
# Fit K-means for k = 1..10 on the scaled features and record each model's
# inertia (within-cluster sum of squares) for the elbow plot.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=42,  # fixed seed so the curve is reproducible
    ).fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method

In [None]:
# Plot WCSS against the number of clusters using the explicit Axes interface.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 5))
elbow_ax.plot(range(1, 11), wcss, marker='o')
elbow_ax.set_title('Elbow Method')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('WCSS')
plt.show()

# Applying K-means to the dataset
### Applying K-means Clustering
K-means clustering is applied with the optimal number of clusters (5):

In [None]:
# Final model: 5 clusters on the scaled features, with a fixed seed for reproducibility.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42,
)
# Cluster label assigned to each row of X_scaled.
y_kmeans = kmeans.fit(X_scaled).labels_

# Add the cluster labels to the original dataframe

In [None]:
# Store each customer's K-means label in a new 'Cluster' column (mutates df in place).
df['Cluster'] = y_kmeans

# Visualize the clusters

In [None]:
# Scatter the customers in the original (unscaled) units, coloured by the
# K-means label stored in df['Cluster'].
plt.figure(figsize=(10, 7))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', hue='Cluster', data=df, palette='viridis', s=100, alpha=0.7)
plt.title('Clusters of Customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# A bare plt.legend() rebuilds the legend and discards the title seaborn set,
# so pass the title explicitly to keep the legend self-explanatory.
plt.legend(title='Cluster')
plt.show()

# Clustering Evaluation Metrics

# Fit K-means

In [None]:
# Refit the 5-cluster model for the evaluation cells below. The parameters,
# seed and input are the same as in the earlier "Applying K-means" cell, so
# this rebinds kmeans and y_kmeans without changing the configuration.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Inertia (WCSS)
Measures how tightly the clusters are formed. Lower values indicate more tightly clustered data points:

In [None]:
# Inertia of the fitted 5-cluster model, computed on the scaled features it was fitted on.
inertia = kmeans.inertia_
print(f"Inertia (WCSS): {inertia}")

# Silhouette Score
Measures how similar an object is to its own cluster compared to other clusters. It ranges from -1 to 1, where a higher value indicates better-defined clusters:

In [None]:
# Mean silhouette coefficient over all samples, computed on the scaled features.
silhouette_avg = silhouette_score(X=X_scaled, labels=y_kmeans)
print(f"Silhouette Score: {silhouette_avg}")

# Davies-Bouldin Index
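For each cluster, takes the largest ratio of within-cluster scatter to between-cluster separation against any other cluster, then averages these values over all clusters. Lower values indicate better clustering: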

In [None]:
# Davies-Bouldin index for the K-means labelling of the scaled features.
davies_bouldin = davies_bouldin_score(X=X_scaled, labels=y_kmeans)
print(f"Davies-Bouldin Index: {davies_bouldin}")

# Calinski-Harabasz Index
The ratio of the sum of between-cluster dispersion to the within-cluster dispersion. Higher values indicate better clustering:

In [None]:
# Calinski-Harabasz index for the K-means labelling of the scaled features.
calinski_harabasz = calinski_harabasz_score(X=X_scaled, labels=y_kmeans)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")