In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Remote location of the sample Walmart store dataset (raw CSV on GitHub).
data_url = "https://raw.githubusercontent.com/redashu/Datasets/refs/heads/master/walmart_store_sample_data.csv"

In [3]:
# Download the CSV and parse it into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=data_url)

In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Store_Size        2000 non-null   int64
 1   Monthly_Footfall  2000 non-null   int64
 2   Income_Level      2000 non-null   int64
dtypes: int64(3)
memory usage: 47.0 KB


In [6]:
# Choose the clustering inputs explicitly. With only these three columns the
# step is optional, but it keeps identifier-like columns (e.g. store or
# department IDs) out of the model if the dataset ever contains them.
my_features = [
    'Store_Size',
    'Monthly_Footfall',
    'Income_Level',
]
data = df.loc[:, my_features]

In [7]:
# Preview the first rows of the selected feature columns.
data.head()

Unnamed: 0,Store_Size,Monthly_Footfall,Income_Level
0,34967,3649,42047
1,28617,4710,54531
2,36476,3415,55270
3,45230,4384,62089
4,27658,1212,34497


In [8]:
# Standardise the features before clustering: K-Means is distance based, so
# columns on larger numeric scales would otherwise dominate the result.
sc = StandardScaler().fit(data)      # learn per-column mean / std
scaled_data = sc.transform(data)     # apply the learned scaling
print(scaled_data)

[[ 0.45755178 -0.67482034 -0.816591  ]
 [-0.18827211 -0.14235693  0.01661357]
 [ 0.61102395 -0.79225337  0.06593575]
 ...
 [-0.94454713 -0.31850647 -1.17092319]
 [-0.21349483  1.65226057  0.51637613]
 [-0.80531361  0.36501396 -1.12373674]]


In [9]:
# Build a K-Means model with k=3 clusters (k can be tuned with the elbow method).
#
# random_state pins the centroid initialisation so the cluster numbering is the
# same on every "Restart & Run All". Without it the ids 0/1/2 may be permuted
# between runs, and anything keyed on those ids silently goes wrong.
# n_init is passed explicitly because its default differs across scikit-learn
# versions.
#
# NOTE(review): with a fixed seed the clusters may be numbered differently from
# the outputs currently saved in this notebook -- re-check the per-cluster
# means after re-running.
km = KMeans(n_clusters=3, n_init=10, random_state=42)

In [11]:
# fit_predict() trains the model and returns each row's cluster id in one call;
# the ids are stored next to the original (unscaled) columns.
df['cluster'] = km.fit_predict(X=scaled_data)

In [12]:
# Preview the first rows to confirm the new 'cluster' column is present.
df.head()

Unnamed: 0,Store_Size,Monthly_Footfall,Income_Level,cluster
0,34967,3649,42047,1
1,28617,4710,54531,1
2,36476,3415,55270,0
3,45230,4384,62089,0
4,27658,1212,34497,1


In [14]:
# Profile the clusters: mean of every feature per cluster, rounded to 2 decimals.
(
    df
    .groupby('cluster')[my_features]
    .mean()
    .round(2)
)

Unnamed: 0_level_0,Store_Size,Monthly_Footfall,Income_Level
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,40882.12,4883.79,57105.23
1,26162.4,3895.24,43717.29
2,24854.77,6376.5,63634.59


In [15]:
# Map each numeric cluster id to a human-readable label.
#
# K-Means numbers its clusters arbitrarily, so a hand-written {0: ..., 1: ...}
# dict is only valid for the one run it was written against. Instead, rank the
# clusters by their mean Income_Level and hand out the labels in that order:
# lowest -> "Budget Store", middle -> "Balanced Store", highest -> "Premium Store".
# For the run whose per-cluster means are saved in this notebook, this yields
# the same mapping as before (0: Balanced, 1: Budget, 2: Premium).
# NOTE(review): ranking by income is inferred from the saved means -- confirm
# it is the intended business rule.
tier_names = ["Budget Store", "Balanced Store", "Premium Store"]
clusters_by_income = (
    df.groupby('cluster')['Income_Level']
    .mean()
    .sort_values()
    .index
)
# Guard against k being changed without updating the label list; zip() would
# otherwise drop the extra clusters silently.
assert len(clusters_by_income) == len(tier_names), "one label is required per cluster"
cluster_map = {
    int(cluster_id): tier
    for cluster_id, tier in zip(clusters_by_income, tier_names)
}

In [16]:
# Translate each numeric cluster id into its readable label via cluster_map.
df['cluster_label'] = df['cluster'].map(cluster_map)

In [17]:
# Preview the first rows, now including the readable 'cluster_label' column.
df.head()

Unnamed: 0,Store_Size,Monthly_Footfall,Income_Level,cluster,cluster_label
0,34967,3649,42047,1,Budget Store
1,28617,4710,54531,1,Budget Store
2,36476,3415,55270,0,Balanced Store
3,45230,4384,62089,0,Balanced Store
4,27658,1212,34497,1,Budget Store


In [None]:
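# Homework: visualize the clusters by plotting Store_Size vs Monthly_Footfall
# (colored by cluster) and mark the centroids on the plot.
# The centroids are available from km as km.cluster_centers_; they are in
# scaled units, so use sc.inverse_transform(...) to convert them back.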