In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


In [2]:
milk = pd.read_csv("milk.csv").set_index('Animal')
scaler = StandardScaler().set_output(transform='pandas')
milkscaled = scaler.fit_transform(milk)
clust_DB= DBSCAN(eps=1, min_samples=2)
clust_DB.fit(milkscaled)
clust_DB.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  1,
       -1, -1,  1,  2,  2,  2, -1, -1], dtype=int64)

In [3]:
milk.head(2)

Unnamed: 0_level_0,water,protein,fat,lactose,ash
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HORSE,90.1,2.6,1.0,6.9,0.35
ORANGUTAN,88.5,1.4,3.5,6.0,0.24


In [4]:
eps_range = [0.2,0.4,0.6,1]
mp_range = [2,3,4,5]
cnt = 0
a =[]
for i in eps_range:
    for j in mp_range:
        clust_DB = DBSCAN(eps=i, min_samples=j)
        clust_DB.fit(milkscaled.iloc[:,:5])
        if len(set(clust_DB.labels_)) > 2:
            cnt = cnt + 1
            milkscaled['Clust'] = clust_DB.labels_
            milk_scl_inliers = milkscaled[milkscaled['Clust']!=-1]
            sil_sc = silhouette_score(milk_scl_inliers.iloc[:,:-1],
                             milk_scl_inliers.iloc[:,-1])
            a.append([cnt,i,j,sil_sc])
            print(i,j,sil_sc)
    
a = np.array(a)
pa = pd.DataFrame(a,columns=['Sr','eps','min_pt','sil'])
print("Best Paramters:")
pa[pa['sil'] == pa['sil'].max()]


0.4 2 0.6518937593821538
0.4 3 0.5385180352469559
0.6 2 0.5934459505692155
0.6 3 0.5344431042454363
0.6 4 0.5519747727201489
1 2 0.4344818095328392
1 3 0.6473871775367226
Best Paramters:


Unnamed: 0,Sr,eps,min_pt,sil
0,1.0,0.4,2.0,0.651894


## DBSCAN with best parameters

In [5]:
# clust_DB= DBSCAN(eps=1, min_samples=2)
# clust_DB.fit(milkscaled)
# print(clust_DB.labels_)
# print(len(set(clust_DB.labels_)))


In [6]:
clust_DB= DBSCAN(eps=1, min_samples=2)
clust_DB.fit(milkscaled.iloc[:,:5])
print(clust_DB.labels_)
print(len(set(clust_DB.labels_)))


[ 0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  1 -1 -1  1  2  2  2 -1
 -1]
4


In [7]:
clust_0= milk[clust_DB.labels_==0]
clust_1= milk[clust_DB.labels_==1]
clust_2= milk[clust_DB.labels_==2]
clust_3= milk[clust_DB.labels_==3]

In [8]:
clust_0.shape, clust_1.shape, clust_2.shape, clust_3.shape

((15, 5), (2, 5), (3, 5), (0, 5))

In [9]:
clust_0

Unnamed: 0_level_0,water,protein,fat,lactose,ash
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HORSE,90.1,2.6,1.0,6.9,0.35
ORANGUTAN,88.5,1.4,3.5,6.0,0.24
MONKEY,88.4,2.2,2.7,6.4,0.18
DONKEY,90.3,1.7,1.4,6.2,0.4
HIPPO,90.4,0.6,4.5,4.4,0.1
CAMEL,87.7,3.5,3.4,4.8,0.71
BISON,86.9,4.8,1.7,5.7,0.9
BUFFALO,82.1,5.9,7.9,4.7,0.78
GUINEA PIG,81.9,7.4,7.2,2.7,0.85
FOX,81.6,6.6,5.9,4.9,0.93


## Cluster Analysis

In [10]:
milk_clust= milk.copy()
milk_clust['cluster'] = clust_DB.labels_
milk_clust= milk_clust[milk_clust['cluster'] != -1]
milk_clust.groupby('cluster').mean()

Unnamed: 0_level_0,water,protein,fat,lactose,ash
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,86.36,3.886667,4.033333,5.166667,0.628
1,74.4,9.25,11.05,3.15,1.3
2,65.166667,10.733333,20.4,2.233333,1.5


## USA Arrests

In [11]:
usa = pd.read_csv("USArrests.csv").set_index('Unnamed: 0')

In [12]:
scaler = StandardScaler().set_output(transform='pandas')
usa_scaled = scaler.fit_transform(usa)

In [13]:
usascaled = scaler.fit_transform(usa)
clust_DB= DBSCAN(eps=1, min_samples=2)
clust_DB.fit(usascaled)
clust_DB.labels_
silhouette_score(usa_scaled, clust_DB.labels_)

0.19290975823254836

In [14]:
eps_range = [0.2,0.4,0.6,1]
mp_range = [2,3,4,5]
cnt = 0
a =[]
for i in eps_range:
    for j in mp_range:
        clust_DB = DBSCAN(eps=i, min_samples=j)
        clust_DB.fit(usascaled.iloc[:,:5])
        if len(set(clust_DB.labels_)) > 2:
            cnt = cnt + 1
            usascaled['Clust'] = clust_DB.labels_
            usa_scl_inliers = usascaled[usascaled['Clust']!=-1]
            sil_sc = silhouette_score(usa_scl_inliers.iloc[:,:-1],
                             usa_scl_inliers.iloc[:,-1])
            a.append([cnt,i,j,sil_sc])
            print(i,j,sil_sc)
    
a = np.array(a)
pa = pd.DataFrame(a,columns=['Sr','eps','min_pt','sil'])
print("Best Paramters:")
pa[pa['sil'] == pa['sil'].max()]


0.4 2 0.9294801120696734
0.6 2 0.4598846166598499
0.6 3 0.7189015458613475
1 2 0.11526083405519487
1 3 0.11526083405519487
1 4 0.21075699219922683
1 5 0.22417529426027302
Best Paramters:


Unnamed: 0,Sr,eps,min_pt,sil
0,1.0,0.4,2.0,0.92948


## DBSCAN with best parameters

In [15]:
clust_DB= DBSCAN(eps=0.4, min_samples=2)
clust_DB.fit(usascaled)
print(clust_DB.labels_)
print(len(set(clust_DB.labels_)))


[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1  1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1]
3


In [16]:

df_clust = pd.DataFrame({'city':list(usa_scaled.index),
                         'cluster': list(clust_DB.labels_)}).sort_values(by='cluster')

In [17]:
df_clust['cluster'].value_counts()

cluster
-1    46
 0     2
 1     2
Name: count, dtype: int64

In [18]:
df_clust.sort_values('cluster')

Unnamed: 0,city,cluster
0,Alabama,-1
2,Arizona,-1
3,Arkansas,-1
4,California,-1
5,Colorado,-1
6,Connecticut,-1
7,Delaware,-1
8,Florida,-1
9,Georgia,-1
10,Hawaii,-1


In [19]:
clust_DB= DBSCAN(eps=0.4, min_samples=2)
clust_DB.fit(usa_scaled.iloc[:,:5])

## Cluster Analysis

In [20]:
usa_clust= usa.copy()
usa_clust['cluster'] = clust_DB.labels_
usa_clust= usa_clust[usa_clust['cluster'] != -1]
usa_clust.groupby('cluster').mean()

Unnamed: 0_level_0,Murder,Assault,UrbanPop,Rape
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10.75,251.5,84.5,25.05
1,2.15,56.5,56.5,10.4


In [21]:
#######

In [22]:
usa = pd.read_csv("USArrests.csv").set_index('Unnamed: 0')
scaler = StandardScaler().set_output(transform='pandas')
usa_scaled = scaler.fit_transform(usa)
clust_DB= DBSCAN(eps=0.4, min_samples=2)
clust_DB.fit(usa_scaled.iloc[:,:4])

In [23]:
usa_clust= usa.copy()
usa_clust['cluster'] = clust_DB.labels_
usa_clust= usa_clust[usa_clust['cluster'] != -1]
usa_clust.groupby('cluster').mean()

Unnamed: 0_level_0,Murder,Assault,UrbanPop,Rape
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10.75,251.5,84.5,25.05
1,2.15,56.5,56.5,10.4


In [24]:
usa_clust.sort_values('cluster')

Unnamed: 0_level_0,Murder,Assault,UrbanPop,Rape,cluster
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Illinois,10.4,249,83,24.0,0
New York,11.1,254,86,26.1,0
Iowa,2.2,56,57,11.3,1
New Hampshire,2.1,57,56,9.5,1


### NUTRIENTS dataset

In [30]:
df = pd.read_csv("nutrient.csv", index_col=0)

In [31]:
df.head()

Unnamed: 0_level_0,energy,protein,fat,calcium,iron
Food_Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BEEF BRAISED,340,20,28,9,2.6
HAMBURGER,245,21,17,9,2.7
BEEF ROAST,420,15,39,7,2.0
BEEF STEAK,375,19,32,9,2.6
BEEF CANNED,180,22,10,17,3.7


In [34]:
scaler = StandardScaler().set_output(transform='pandas')
df_scaled = scaler.fit_transform(df)

In [61]:

clust_DB= DBSCAN(eps=1, min_samples=2)
clust_DB.fit(df_scaled)
clust_DB.labels_
silhouette_score(df_scaled, clust_DB.labels_)

0.3419947780792516

In [53]:
eps_range = [0.2,0.4,0.6,1]
mp_range = [2,3,4,5]
cnt = 0
a =[]
for i in eps_range:
    for j in mp_range:
        clust_DB = DBSCAN(eps=i, min_samples=j)
        clust_DB.fit(df_scaled.iloc[:,:5])
        if len(set(clust_DB.labels_)) > 2:
            cnt = cnt + 1
            df_scaled['Clust'] = clust_DB.labels_
            df_scl_inliers = df_scaled[df_scaled['Clust']!=-1]
            sil_sc = silhouette_score(df_scl_inliers.iloc[:,:-1],
                             df_scl_inliers.iloc[:,-1])
            a.append([cnt,i,j,sil_sc])
            print(i,j,sil_sc)
    
a = np.array(a)
pa = pd.DataFrame(a,columns=['Sr','eps','min_pt','sil'])
print("Best Paramters:")
pa[pa['sil'] == pa['sil'].max()]


0.2 2 0.5710837307456915
0.4 2 0.9040570858308878
0.6 2 0.7752441663222646
1 2 0.43236957939711584
1 3 0.43147150560271585
Best Paramters:


Unnamed: 0,Sr,eps,min_pt,sil
1,2.0,0.4,2.0,0.904057


In [54]:
# DBSCAN with Best Parameters

In [55]:
clust_DB= DBSCAN(eps=0.4, min_samples=2)
clust_DB.fit(df_scaled)
print(clust_DB.labels_)
print(len(set(clust_DB.labels_)))


[ 0 -1 -1  0 -1 -1  1 -1 -1 -1  0  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1]
3


In [56]:
df_clust = pd.DataFrame({'Food_Item':list(df_scaled.index),
                         'cluster': list(clust_DB.labels_)}).sort_values(by='cluster')

In [57]:
df_clust['cluster'].value_counts()

cluster
-1    20
 0     5
 1     2
Name: count, dtype: int64

In [62]:
df_clust

Unnamed: 0_level_0,energy,protein,fat,calcium,iron,cluster
Food_Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BEEF BRAISED,340,20,28,9,2.6,0
BEEF STEAK,375,19,32,9,2.6,0
CHICKEN CANNED,170,25,7,12,1.5,1
SMOKED HAM,340,20,28,9,2.5,0
PORK ROAST,340,19,29,9,2.5,0
PORK SIMMERED,355,19,30,9,2.4,0
TUNA CANNED,170,25,7,7,1.2,1


In [58]:
clust_DB= DBSCAN(eps=0.4, min_samples=2)
clust_DB.fit(df_scaled.iloc[:,:5])


In [59]:
df_clust= df.copy()
df_clust['cluster'] = clust_DB.labels_
df_clust= df_clust[df_clust['cluster'] != -1]
df_clust.groupby('cluster').mean()

Unnamed: 0_level_0,energy,protein,fat,calcium,iron
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,350.0,19.4,29.4,9.0,2.52
1,170.0,25.0,7.0,9.5,1.35


In [60]:
df_clust.sort_values('cluster')

Unnamed: 0_level_0,energy,protein,fat,calcium,iron,cluster
Food_Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BEEF BRAISED,340,20,28,9,2.6,0
BEEF STEAK,375,19,32,9,2.6,0
SMOKED HAM,340,20,28,9,2.5,0
PORK ROAST,340,19,29,9,2.5,0
PORK SIMMERED,355,19,30,9,2.4,0
CHICKEN CANNED,170,25,7,12,1.5,1
TUNA CANNED,170,25,7,7,1.2,1
