In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [18]:
df = pd.read_csv('../../Datasets/milk.csv', index_col=0)
df.head()

Unnamed: 0_level_0,water,protein,fat,lactose,ash
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HORSE,90.1,2.6,1.0,6.9,0.35
ORANGUTAN,88.5,1.4,3.5,6.0,0.24
MONKEY,88.4,2.2,2.7,6.4,0.18
DONKEY,90.3,1.7,1.4,6.2,0.4
HIPPO,90.4,0.6,4.5,4.4,0.1


In [20]:
sc = StandardScaler()
df_scaled = sc.fit_transform(df)

In [21]:
dbscan = DBSCAN(eps=1, min_samples=2)
dbscan.fit(df_scaled)

In [22]:
dbscan.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  1,
       -1, -1,  1,  2,  2,  2, -1, -1], dtype=int64)

In [28]:
new_df = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
new_df['Cluster'] = dbscan.labels_

In [30]:
# Calculating the silhouette scores
new_df_inliers = new_df[new_df['Cluster'] != -1]
silhouette_score(new_df_inliers.iloc[:, :-1], new_df_inliers.iloc[:,-1])

0.4344818095328392

### Tuning eps and min_samples

In [35]:
df_scaled = sc.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
df_scaled

Unnamed: 0_level_0,water,protein,fat,lactose,ash
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HORSE,0.948806,-1.009291,-0.903208,1.542217,-1.037554
ORANGUTAN,0.821407,-1.344603,-0.660619,1.040773,-1.259945
MONKEY,0.813445,-1.121062,-0.738247,1.263637,-1.381249
DONKEY,0.964731,-1.260775,-0.864394,1.152205,-0.936467
HIPPO,0.972694,-1.568145,-0.563583,0.149319,-1.542988
CAMEL,0.757707,-0.757806,-0.670322,0.372182,-0.30973
BISON,0.694008,-0.394551,-0.835283,0.873626,0.0744
BUFFALO,0.31181,-0.087181,-0.233662,0.316466,-0.168208
GUINEA PIG,0.295885,0.331959,-0.301587,-0.797852,-0.026687
CAT,0.271998,1.086413,-0.388919,0.149319,-0.22886


In [45]:
eps = np.linspace(0.1, 1, 10)
min_sp = [2, 3, 4, 5]
a = []

for epsilon in eps:
    for mp in min_sp:
        clust_db = DBSCAN(eps=epsilon, min_samples=mp)
        clust_db.fit(df_scaled)
        distincts = len(np.unique(clust_db.labels_))
        if distincts > 2:
            df_scaled['Cluster'] = clust_db.labels_
            df_inliers = df_scaled[df_scaled['Cluster'] != -1]
            score = silhouette_score(df_inliers.iloc[:, :-1], df_inliers.iloc[:, -1])
            a.append([epsilon, mp, distincts, score])


df_results = pd.DataFrame(a, columns=['eps', 'mp', 'Distinct', 'Score'])
df_results.sort_values(by='Score', ascending=False, inplace=True)
# df_results

print("Best eps:", df_results.loc[0]['eps'])
print("Best mp:", df_results.loc[0]['mp'])

Best eps: 0.4
Best mp: 2.0
