In [27]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, silhouette_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Connection settings: iterating the DataFrame returned by read_csv yields its
# column labels, so the CSV's header row supplies the four values slotted into
# the URL template below (user, password, host, port by position).
confile = list(pd.read_csv('../../dbcon.csv'))
postgres_db = 'heartdisease'
db_connection = 'postgresql://{}:{}@{}:{}/{}'.format(*confile[:4], postgres_db)

# Pull the whole table into a DataFrame.
query = '''
SELECT *
FROM heartdisease
;'''
heartdisease_df = pd.read_sql(query, db_connection)

# The first 13 columns are used as features; '?' placeholders are replaced by 0.
X = heartdisease_df.iloc[:, :13].replace(to_replace='?', value=0)

# Binarise column 14: values above 0 become 0, everything else becomes 1.
# NOTE(review): if values > 0 mean "disease present", this codes disease as 0 -- confirm intent.
y = np.where(heartdisease_df.iloc[:, 13] > 0, 0, 1)

# Standardised copy of the features.
# NOTE(review): no later cell visible here uses scaled_X -- confirm whether clustering should.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [4]:
# Split into two equal, label-stratified halves so the same clustering can be
# run on each and compared.
# NOTE(review): this splits the unscaled X, not scaled_X -- confirm that is intended.
X1, X2, y1, y2 = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=36,
    stratify=y,
)

In [32]:
def kmean_iter_func(X, y, scoring=None, kmin=2, kmax=4, random_state=None):
    """Fit one KMeans model per cluster count and display how clusters line up with ``y``.

    For every ``k`` from ``kmin`` to ``kmax`` (inclusive) a fresh ``KMeans``
    model is fitted on ``X``. A headline string and the crosstab of ``y``
    against the fitted cluster labels are then shown with ``display``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit`` (and to ``silhouette_score``
        when ``scoring='SIL'``).
    y : array-like
        Reference labels, one per row of ``X``; used for the crosstab and for
        ``adjusted_rand_score`` when ``scoring='ARI'``.
    scoring : {None, 'ARI', 'SIL'}, default None
        ``None`` shows only the crosstab, ``'ARI'`` adds the adjusted Rand
        index, ``'SIL'`` adds the silhouette score. Any other value is
        reported as "invalid scoring parameter" and the plain crosstab is
        still shown.
    kmin, kmax : int, default 2 and 4
        Inclusive range of cluster counts to try.
    random_state : int or None, default None
        Forwarded to ``KMeans``. ``None`` keeps the previous unseeded
        behaviour; pass an integer to get the same clusters (and the same
        cluster numbering) on every run.

    Returns
    -------
    None
        Results are displayed, not returned.
    """
    for k in np.arange(kmin, kmax + 1):
        model = KMeans(n_clusters=k, random_state=random_state)
        model.fit(X)

        # Every branch shows the same table, so build it once.
        crosstab = pd.crosstab(y, model.labels_)

        if scoring is None:
            messages = [f"with {k} clusters, the crosstab looks like:"]
        elif scoring == 'ARI':
            modelscore = adjusted_rand_score(y, model.labels_)
            messages = [
                f"with {k} clusters, the model's ARI score is {modelscore} and the crosstab looks like:"
            ]
        elif scoring == 'SIL':
            modelscore = silhouette_score(X, model.labels_)
            messages = [
                f"with {k} clusters, the model's silhouette score is {modelscore} and the crosstab looks like:"
            ]
        else:
            # Unknown scoring name: warn, but still show the crosstab.
            messages = [
                "invalid scoring parameter",
                f"with {k} clusters, the crosstab looks like:",
            ]

        display(*messages, crosstab)

In [8]:
# Crosstabs only (no score) for the first half of the split, default k range.
kmean_iter_func(X1, y1, scoring=None)

'with 2 clusters, the crosstab looks like:'

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,33
1,30,52


'with 3 clusters, the crosstab looks like:'

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,20,21,28
1,33,14,35


'with 4 clusters, the crosstab looks like:'

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,15,21,27,6
1,7,14,36,25


In [24]:
# Same run on the second half of the split, for comparison with the first.
kmean_iter_func(X2, y2, scoring=None)

'with 2 clusters, the crosstab looks like:'

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29,41
1,24,58


'with 3 clusters, the crosstab looks like:'

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,28,37,5
1,48,25,9


'with 4 clusters, the crosstab looks like:'

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,26,32,0,12
1,45,24,1,12


Neither model is good in absolute terms, nor does either compare well against the other.

In [23]:
# Full dataset, scored with the adjusted Rand index against y.
kmean_iter_func(X, y, scoring='ARI')

"with 2 clusters, the model's ARI score is 0.020593537591174656 and the crosstab looks like:"

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,78
1,50,114


"with 3 clusters, the model's ARI score is 0.008957754384237345 and the crosstab looks like:"

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,65,34,40
1,68,28,68


"with 4 clusters, the model's ARI score is 0.014504052592432917 and the crosstab looks like:"

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,47,38,52,2
1,31,55,75,3


The model with k=2 scored best, but its ARI is only about 0.02. That is close to 0, the value expected from random labelling, so it is still a bad model.

In [30]:
# Full dataset, scored with the silhouette coefficient.
kmean_iter_func(X, y, scoring='SIL')

"with 2 clusters, the model's silhouette score is 0.3888908389416227 and the crosstab looks like:"

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,78,61
1,114,50


"with 3 clusters, the model's silhouette score is 0.2821171697462302 and the crosstab looks like:"

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,40,66,33
1,69,67,28


"with 4 clusters, the model's silhouette score is 0.2798165983362194 and the crosstab looks like:"

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,52,38,2,47
1,75,55,3,31


The model with k=2 scored best, but its silhouette score is only about 0.39 (the scale runs from -1 to 1, with values near 1 indicating well-separated clusters), so it is still a bad model.