In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets, metrics
from sklearn.cluster import DBSCAN

In [2]:
# Read the Cleveland heart-disease workbook into a DataFrame, then show it
# as the cell's output.
heart_disease = pd.read_excel(io='Processed_Cleveland.xlsx')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0,7,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2,7,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1,7,3
301,57,0,2,130,236,0,2,174,0,0.0,2,1,3,1


In [3]:
# Define the features and the outcome.
# Selection is positional: the first 13 columns are the features and the
# column at position 13 is the outcome.
X = heart_disease.iloc[:, :13]
y = heart_disease.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# Any positive outcome value counts as a diagnosis. The two np.where branches
# used to be swapped, which produced the opposite encoding from the one
# described here. ARI scores are unaffected by the swap (ARI ignores label
# naming); only the class -> colour mapping in the plots changes.
y = np.where(y > 0, 1, 0)

In [4]:
# Standardize the feature columns; the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

Below, I try to visualize the clusters for different values of eps (epsilon) and min_samples (min points).

In [12]:
# The 2-D PCA projection is only used to position the labels in the plots and
# does not depend on the DBSCAN parameters, so it is computed once up front
# instead of once per parameter combination.
pca = PCA(n_components=2).fit_transform(X_std)
colours = 'rbg'

for eps in np.arange(0.25, 2.75, 0.25):
  for min_pts in range(3, 13):

    print(f"DBSCAN with eps: {eps}, and min Points: {min_pts}")

    dbscan_cluster = DBSCAN(eps=eps, min_samples=min_pts)

    clusters = dbscan_cluster.fit_predict(X_std)

    plt.figure(figsize=(10,5))
    # Use a dedicated index name for the points. Reusing the eps loop variable
    # here overwrote eps with the last row index, so every min_samples value
    # after the first one was printed and fitted with the wrong eps.
    for idx in range(pca.shape[0]):
        # Draw each sample as its cluster label, coloured by its true class.
        plt.text(pca[idx, 0], pca[idx, 1], str(clusters[idx]),
                color=colours[y[idx]],
                fontdict={'weight': 'bold', 'size': 50}
            )

    plt.xticks([])
    plt.yticks([])
    plt.axis('off')
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

Separating the two classes is really difficult with this approach.

In [5]:
# Hold eps at 1 and sweep min_samples from 1 to 9, printing the Adjusted Rand
# Index (ARI) of each clustering against y.
for min_pts in range(1, 10):
  print(f"DBSCAN with eps: 1, and min Points: {min_pts}")

  dbscan_cluster = DBSCAN(eps=1, min_samples=min_pts, metric='euclidean')
  clusters = dbscan_cluster.fit_predict(X_std)

  ari = metrics.adjusted_rand_score(y, clusters)
  print("DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): {}".format(ari))

DBSCAN with eps: 1, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0004340648309851829
DBSCAN with eps: 1, and min Points: 2
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): -0.008961679875102968
DBSCAN with eps: 1, and min Points: 3
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): -0.002755933782262106
DBSCAN with eps: 1, and min Points: 4
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 1, and min Points: 5
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 1, and min Points: 6
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 1, and min Points: 7
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 1, and min Points: 8
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 1, and min Points: 9
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0


Almost none of these DBSCAN settings gives a positive ARI score.
The best result we got was for
DBSCAN with eps: 3.0, and min Points: 3
with an ARI score of 0.02064668894650054

In [6]:
# Silhouette score of the DBSCAN clustering obtained with eps=1 and min_samples=1
dbscan_cluster = DBSCAN(eps=1, min_samples=1, metric='euclidean')
clusters = dbscan_cluster.fit_predict(X_std)

silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("DBSCAN çözümünün siluet skoru: {}".format(silhouette))

DBSCAN çözümünün siluet skoru: 0.02806263640143188


It's incredibly bad.

In [7]:
# Hold min_samples at 1 and sweep eps from 0.5 up to (but excluding) 5 in steps
# of 0.25, printing the Adjusted Rand Index (ARI) of each clustering against y.
for eps in np.arange(0.5, 5, 0.25):
  print(f"DBSCAN with eps: {eps}, and min Points: 1")

  dbscan_cluster = DBSCAN(eps=eps, min_samples=1, metric='euclidean')
  clusters = dbscan_cluster.fit_predict(X_std)

  ari = metrics.adjusted_rand_score(y, clusters)
  print("DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): {}".format(ari))

DBSCAN with eps: 0.5, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 8.68128597420814e-05
DBSCAN with eps: 0.75, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.00013021930957340477
DBSCAN with eps: 1.0, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0004340648309851829
DBSCAN with eps: 1.25, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0034691953611277575
DBSCAN with eps: 1.5, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.007935993562794265
DBSCAN with eps: 1.75, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.01368836573290993
DBSCAN with eps: 2.0, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.021441604491943305
DBSCAN with eps: 2.25, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.08072698143984362
DBSCAN with eps: 2.5, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.02761084266040348
DBSCA

In [8]:
# Grid search over eps (0.5 to 4.75, step 0.25) and min_samples (1 to 9).
# The ARI of every combination is printed as before; in addition the best
# combination is tracked and reported at the end, so it does not have to be
# picked out of the long printed log by eye.
best_ari, best_eps, best_min_pts = None, None, None

for i in np.arange(0.5, 5, 0.25):
  for j in range(1,10):
    print(f"DBSCAN with eps: {i}, and min Points: {j}")

    dbscan_cluster = DBSCAN(eps=i, min_samples=j, metric='euclidean')

    clusters = dbscan_cluster.fit_predict(X_std)

    ari = metrics.adjusted_rand_score(y, clusters)
    print("DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): {}"
        .format(ari))

    # Keep the first combination that reaches the highest ARI seen so far.
    if best_ari is None or ari > best_ari:
      best_ari, best_eps, best_min_pts = ari, i, j

print(f"Best ARI: {best_ari} (eps: {best_eps}, min Points: {best_min_pts})")

DBSCAN with eps: 0.5, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 8.68128597420814e-05
DBSCAN with eps: 0.5, and min Points: 2
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): -0.0036744082404421437
DBSCAN with eps: 0.5, and min Points: 3
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 4
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 5
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 6
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 7
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 8
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.5, and min Points: 9
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.0
DBSCAN with eps: 0.75, and min Points: 1
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.00013021930957340477
DBSCAN with ep

In [9]:
""" we got the best score for these values;
DBSCAN with eps: 2.25, and min Points: 6
DBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.11374093912422496
"""

' we got the best score for these values;\nDBSCAN with eps: 2.25, and min Points: 6\nDBSCAN çözümünün Düzeltilmiş Rand Endeksi (ARI): 0.11374093912422496\n'

In [10]:
# Silhouette score for the eps=2.25, min_samples=6 clustering.
# The labels are passed to silhouette_score exactly as DBSCAN returns them.
dbscan_cluster = DBSCAN(eps=2.25, min_samples=6, metric='euclidean')
clusters = dbscan_cluster.fit_predict(X_std)

silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("DBSCAN çözümünün siluet skoru: {}".format(silhouette))

DBSCAN çözümünün siluet skoru: -0.047508700429683956


In [11]:
# It didn't turn out to be the best silhouette score.