In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings;
# consider restricting it to specific categories once the noisy source is known.
warnings.filterwarnings("ignore")

In [None]:
# Load the wholesale-customers spending table.
# NOTE(review): the absolute "/content/..." path presumably targets a Colab session;
# it will not resolve elsewhere.
df = pd.read_csv("/content/Wholesale customers data.csv")
# Preview the first rows.
# - Channel and Region are integer-coded categorical columns (the author describes
#   Channel as the type of customer, e.g. hotel, and Region as a location coded 1, 2, 3).
#   They are not used as clustering features.
# - The remaining columns are spending amounts per product category.
# NOTE(review): the author describes these as monthly expenses on food products. If this
# file is the UCI "Wholesale customers" dataset, the amounts are annual and
# Detergents_Paper is not a food category - confirm against the data source.
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [None]:
# Clustering features: the six spending columns, selected by name.
# Selecting by name instead of by position (df.iloc[:, 2:]) keeps Channel/Region out
# even if the column order changes, and keeps the 'Cluster' label column out when this
# cell is re-run after the labels have been attached to df.
feature_cols = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']
x = df[feature_cols]
# Show only a preview rather than the whole frame.
x.head()

In [None]:
# Print column dtypes and non-null counts for the raw table.
df.info()

In [None]:
x.describe()
# Author's reading of the summary statistics:
# - Fresh: mean is about 12000 but the median is 8504; Milk: mean 5796 vs. median 3627.
#   Mean > median indicates right-skewed distributions.
# - The large gap between the 75th percentile and the max (notably for Fresh and Milk)
#   suggests many outliers.
# - 75% of customers spend less than 16933 on Fresh, less than 7190 on Milk, and so on.
# - The max values show that a few customers spend far more (e.g. 73498).

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Check for outliers: one box per spending column.
# Author's observation: every column shows a large number of outliers.
x.boxplot()

In [None]:
# sns.boxplot(x['Delicassen'])

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column; DBSCAN is distance-based, so columns on larger
# scales would otherwise dominate the neighbourhood computation.
sc = StandardScaler()
# Note: x is rebound here from the feature table to the scaled values returned by the scaler.
x = sc.fit(x).transform(x)
x

In [None]:
# Model Building
from sklearn.cluster import DBSCAN

In [None]:
# Baseline model: the library defaults, written out explicitly.
db1 = DBSCAN(eps=0.5, min_samples=5)
# Fit and display the per-row cluster labels; -1 marks an outlier (noise) row.
# Author's observation: only one cluster (label 0) is formed.
db1.fit(x).labels_

In [None]:
# Second configuration: larger neighbourhood radius and a lower core-point threshold.
db2 = DBSCAN(eps=1, min_samples=3)
# Fit and display the labels. Author's observation: fewer rows are flagged as
# outliers (-1) than with the default settings.
db2.fit(x).labels_

In [None]:
# Same radius as db2 but with a stricter core-point threshold (min_samples=7).
# Note: the name db3 is rebound to a different configuration further down.
db3 = DBSCAN(eps=1, min_samples=7)
# Fit and display the per-row labels.
db3.fit(x).labels_

In [None]:
# Pairwise scatter plots of the spending columns (Channel and Region excluded).
# DBSCAN groups points by density; the author reads these plots as one dense mass
# plus scattered outliers, which matches the single cluster (label 0) found above.
sns.pairplot(data=df.drop(['Channel', 'Region'], axis="columns"))

In [None]:
# Compare the DBSCAN settings tried above (eps = 0.5 or 1, and different min_samples) using the silhouette score
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score for db2 (eps=1, min_samples=3): takes the full feature matrix
# together with the labels. The author's run gave about 0.416, read as a moderate clustering.
# NOTE(review): rows labelled -1 (noise) are passed in like any other label, so the
# score partly measures "cluster vs. noise" separation - interpret with care.
silhouette_score(x, db2.fit(x).labels_)

0.41642684756989395

In [None]:
# Third attempt: widen the neighbourhood radius to eps=2 while keeping min_samples=3.
db3 = DBSCAN(eps=2, min_samples=3)
# Keep these labels in y; they are attached to df as the 'Cluster' column later.
y = db3.fit(x).labels_

In [None]:
# Silhouette score for db3 (eps=2, min_samples=3). The author's run gave about 0.749,
# the best of the settings tried; further tuning of eps/min_samples may improve it.
# NOTE(review): rows labelled -1 (noise) are passed in like any other label, so a high
# score here may mostly reflect one large cluster versus a handful of noise rows.
silhouette_score(x, db3.fit(x).labels_)

0.7486988127438182

In [None]:
# Silhouette score for the default-parameter model db1, for comparison
# (the author's run gave about 0.196).
silhouette_score(x, db1.fit(x).labels_)

0.1958113956262888

In [None]:
# Attach the labels held in y to the original table as a new 'Cluster' column
# (-1 marks outlier rows). This modifies df in place.
df['Cluster'] = y

In [None]:
# Preview the table with its new 'Cluster' column; a few rows are enough to check
# the labelling, so avoid dumping the whole frame.
df.head()

In [None]:
# Rows that DBSCAN flagged as outliers (label -1).
df.loc[df['Cluster'].eq(-1)]

In [None]:
# Rows assigned to cluster 0, i.e. the regular (non-outlier) customers.
df.loc[df['Cluster'].eq(0)]

In [None]:
# Average value of each column per cluster label.
# Channel and Region are category codes, so ignore their means.
df.groupby('Cluster').mean()

Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,1.538462,2.846154,31466.0,28802.076923,35259.461538,13785.538462,14670.846154,9624.076923
0,1.316159,2.533958,11407.665105,5095.854801,7119.880562,2745.75644,2522.566745,1278.290398
