In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
df1 = pd.read_csv('Mall_Customers.csv')

In [3]:
df1.isnull().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [4]:
print(df1.describe())  # Check for extreme outliers


       CustomerID         Age  Annual Income (k$)  Spending Score (1-100)
count  200.000000  200.000000          200.000000              200.000000
mean   100.500000   38.850000           60.560000               50.200000
std     57.879185   13.969007           26.264721               25.823522
min      1.000000   18.000000           15.000000                1.000000
25%     50.750000   28.750000           41.500000               34.750000
50%    100.500000   36.000000           61.500000               50.000000
75%    150.250000   49.000000           78.000000               73.000000
max    200.000000   70.000000          137.000000               99.000000


In [5]:
df1.replace({'Male':1 , 'Female':0 } , inplace=True)

  df1.replace({'Male':1 , 'Female':0 } , inplace=True)


In [6]:
df1 = df1.drop('CustomerID',axis=1)

In [7]:
df1.head(5)

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,19,15,39
1,1,21,15,81
2,0,20,16,6
3,0,23,16,77
4,0,31,17,40


In [8]:
x = df1.iloc[:,1:]
y = df1.iloc[: , 0]

In [9]:
y.head(5)

0    1
1    1
2    0
3    0
4    0
Name: Gender, dtype: int64

In [10]:
x.head(5)


Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40


In [11]:
X_train, X_test, _, _ = train_test_split(x, y, test_size=0.2, random_state=42)  


In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [13]:
kmeans = KMeans(n_clusters=3, random_state=42)  
clusters = kmeans.fit_predict(X_train_scaled)



In [14]:
# Create a DataFrame for clusters
clusters_df = pd.DataFrame({'Cluster': clusters})
print("Cluster Centers:\n", kmeans.cluster_centers_)

# Merge the cluster labels with the original DataFrame
df1 = df1.join(clusters_df)
# Assuming that cluster 0 corresponds to Male and cluster 1 corresponds to Female
# You can modify the labels accordingly.

# Map cluster numbers to labels (you may need to adjust this based on cluster centers)
df1['Cluster_Label'] = df1['Cluster'].map({0: 'Other', 1: 'Male', 2: 'Female'})

# Calculate the silhouette score
sil_score = silhouette_score(X_train_scaled, clusters)
print(f"Silhouette Score: {sil_score}")

# Display the clustered data
print(df1.head())


Cluster Centers:
 [[-0.96602238 -0.66264626  0.15892606]
 [ 0.9317245  -0.14319624 -0.63911504]
 [-0.37554623  1.09188749  0.89984272]]
Silhouette Score: 0.35871836438824406
   Gender  Age  Annual Income (k$)  Spending Score (1-100)  Cluster  \
0       1   19                  15                      39      1.0   
1       1   21                  15                      81      2.0   
2       0   20                  16                       6      0.0   
3       0   23                  16                      77      1.0   
4       0   31                  17                      40      2.0   

  Cluster_Label  
0          Male  
1        Female  
2         Other  
3          Male  
4        Female  
