In [5]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
confile = list(pd.read_csv('../../dbcon.csv'))
postgres_db = 'heartdisease'
db_connection = 'postgresql://{}:{}@{}:{}/{}'.format(confile[0], confile[1], confile[2], confile[3], postgres_db)

query = '''
SELECT *
FROM heartdisease
;'''

heartdisease_df = pd.read_sql(query, db_connection)

X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

X = X.replace(to_replace='?', value=0)
y = np.where(y > 0, 0, 1)

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [21]:
model_df = pd.DataFrame(columns=['Epsilon', 'Min_Samples', 'Num_Clusters'])
i=0
for epsilon in np.arange(0.1, 1.1, 0.1):
    for minsamp in np.arange(1, 11):
        model = DBSCAN(eps=epsilon, min_samples=minsamp)
        model.fit(scaled_X)
        model_df.loc[i, 'Epsilon'] = epsilon
        model_df.loc[i, 'Min_Samples'] = minsamp
        model_df.loc[i, 'Num_Clusters'] = pd.Series(model.labels_).nunique()
        i += 1

display(
    model_df,
    model_df['Num_Clusters'].value_counts()
)

Unnamed: 0,Epsilon,Min_Samples,Num_Clusters
0,0.1,1,303
1,0.1,2,1
2,0.1,3,1
3,0.1,4,1
4,0.1,5,1
...,...,...,...
95,1,6,1
96,1,7,1
97,1,8,1
98,1,9,1


1      81
301     3
3       3
2       3
303     2
302     2
300     1
9       1
297     1
7       1
294     1
4       1
Name: Num_Clusters, dtype: int64

In [22]:
param_subset = model_df[model_df['Num_Clusters'].between(2, 4, True)]
param_subset

Unnamed: 0,Epsilon,Min_Samples,Num_Clusters
21,0.3,2,2
31,0.4,2,2
41,0.5,2,3
51,0.6,2,3
61,0.7,2,3
71,0.8,2,4
92,1.0,3,2


In [24]:
model_df = pd.DataFrame(columns=['Min_Samples', 'Num_Clusters'])
i=0
for minsamp in np.arange(1, 11):
    model = DBSCAN(metric='euclidean', eps=1, min_samples=minsamp)
    model.fit(scaled_X)
    model_df.loc[i, 'Min_Samples'] = minsamp
    model_df.loc[i, 'Num_Clusters'] = pd.Series(model.labels_).nunique()
    i += 1
display(model_df)

Unnamed: 0,Min_Samples,Num_Clusters
0,1.0,294.0
1,2.0,9.0
2,3.0,2.0
3,4.0,1.0
4,5.0,1.0
5,6.0,1.0
6,7.0,1.0
7,8.0,1.0
8,9.0,1.0
9,10.0,1.0


In [25]:
model_df = pd.DataFrame(columns=['Epsilon', 'Num_Clusters'])
i=0
for eps in np.arange(1, 11):
    model = DBSCAN(metric='euclidean', eps=eps, min_samples=1)
    model.fit(scaled_X)
    model_df.loc[i, 'Epsilon'] = eps
    model_df.loc[i, 'Num_Clusters'] = pd.Series(model.labels_).nunique()
    i += 1
display(model_df)

Unnamed: 0,Epsilon,Num_Clusters
0,1.0,294.0
1,2.0,178.0
2,3.0,34.0
3,4.0,3.0
4,5.0,1.0
5,6.0,1.0
6,7.0,1.0
7,8.0,1.0
8,9.0,1.0
9,10.0,1.0


The number of clusters decrease with both increasing epsilon and minsamples