In [1]:
import os
import warnings
from getpass import getpass
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import datasets, metrics
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [13]:
# Connection settings. Each value can be overridden with an environment
# variable; the fallbacks are the non-secret values this notebook was
# written against.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'heartdisease')

# The password is deliberately not stored in the notebook: it is taken from
# POSTGRES_PW when set, otherwise the user is prompted without echo.
postgres_pw = os.environ.get('POSTGRES_PW') or getpass('PostgreSQL password: ')

# The table happens to share its name with the database. It is kept in its
# own variable so that overriding POSTGRES_DB does not silently change the
# table being queried.
postgres_table = 'heartdisease'

# Percent-encode the user and password so that characters such as `@`, `/`
# or `:` cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so the engine is disposed right away --
# in a `finally` so the connection pool is released even if the query fails.
try:
    df = pd.read_sql_query(f'select * from {postgres_table}', con=engine)
finally:
    engine.dispose()

In [3]:
# Preview the first rows of the loaded table as a quick check of the query result.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [17]:
# Features are the first 13 columns of the table. Missing values are marked
# with `?` in the data and are replaced with `0` here.
# NOTE(review): `0` is then treated as a real measurement by the scaler --
# confirm this imputation is intended.
X = df.iloc[:, :13].replace(to_replace='?', value=0)

# The outcome is column 13, collapsed to a binary flag: `1` for any heart
# disease diagnosis (value above zero) and `0` for no diagnosis.
y = np.where(df.iloc[:, 13] > 0, 1, 0)

# Standardize the features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [18]:
# Define a Gaussian mixture model (not an agglomerative clustering) with
# three components and a fixed random_state.
# NOTE(review): the KMeans, agglomerative and later GMM models in this
# notebook all use two clusters -- confirm that three components is intended.
gmm_cluster = GaussianMixture(n_components=3, random_state=123)

# Fit the mixture on the standardized features and keep the predicted
# component label of every sample.
clusters = gmm_cluster.fit_predict(X_std)

In [22]:
# --- DBSCAN on Euclidean distances ---
# NOTE(review): with min_samples=1 every sample should count as a core point,
# so no sample can be labelled as noise -- confirm this setting is intended.
dbscan_cluster = DBSCAN(eps=1, min_samples=1, metric="euclidean")
y_dbscan = dbscan_cluster.fit(X_std).labels_

# --- Agglomerative clustering: two clusters, average linkage, cosine metric ---
agglomerative_cluster = AgglomerativeClustering(n_clusters=2, linkage='average', metric='cosine')
y_ac_average = agglomerative_cluster.fit(X_std).labels_
# NOTE(review): the silhouette below uses silhouette_score's default metric,
# while the clustering itself was built on cosine distances.
silhouette_average = silhouette_score(X=X_std, labels=y_ac_average)

# --- KMeans: two clusters, seeded for reproducibility ---
kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit(X_std).labels_
silhouette_kmeans = silhouette_score(X=X_std, labels=y_kmeans)

In [24]:
# Adjusted Rand index of every clustering against the binarized diagnosis `y`.
ari_kmeans, ari_agglo, ari_dbscan, ari_gmm = (
    adjusted_rand_score(y, labels)
    for labels in (y_kmeans, y_ac_average, y_dbscan, clusters)
)

# Silhouette scores for the two models that were not scored when they were
# fitted (KMeans and agglomerative already have theirs).
silhouette_dbscan, silhouette_gmm = (
    silhouette_score(X_std, labels)
    for labels in (y_dbscan, clusters)
)

# One summary line per model.
print(f"KMeans ARI: {ari_kmeans:.3f}, Silhouette Score: {silhouette_kmeans:.3f}")
print(f"Agglomerative ARI: {ari_agglo:.3f}, Silhouette Score: {silhouette_average:.3f}")
print(f"DBSCAN ARI: {ari_dbscan:.3f}, Silhouette Score: {silhouette_dbscan:.3f}")
print(f"GMM ARI: {ari_gmm:.3f}, Silhouette Score: {silhouette_gmm:.3f}")


KMeans ARI: 0.429, Silhouette Score: 0.176
Agglomerative ARI: 0.294, Silhouette Score: 0.148
DBSCAN ARI: 0.000, Silhouette Score: 0.028
GMM ARI: 0.086, Silhouette Score: 0.082


In [28]:
def _fit_and_score_gmm(covariance_type):
    """Fit a two-component GMM on `X_std` and score its labels.

    Parameters
    ----------
    covariance_type : str
        Value passed to `GaussianMixture(covariance_type=...)`.

    Returns
    -------
    tuple
        `(model, labels, ari, silhouette)`: the fitted model, its predicted
        labels for `X_std`, the adjusted Rand index of those labels against
        `y`, and their silhouette score on `X_std`.
    """
    model = GaussianMixture(n_components=2, covariance_type=covariance_type, random_state=123)
    labels = model.fit_predict(X_std)
    return model, labels, adjusted_rand_score(y, labels), silhouette_score(X_std, labels)


# Fit and score one GMM per covariance type.
gmm_full, clusters_full, ari_full, silhouette_full = _fit_and_score_gmm('full')
gmm_tied, clusters_tied, ari_tied, silhouette_tied = _fit_and_score_gmm('tied')
gmm_diag, clusters_diag, ari_diag, silhouette_diag = _fit_and_score_gmm('diag')
gmm_spherical, clusters_spherical, ari_spherical, silhouette_spherical = _fit_and_score_gmm('spherical')

# Report the scores of every variant.
print(f"GMM Full ARI: {ari_full:.3f}, Silhouette Score: {silhouette_full:.3f}")
print(f"GMM Tied ARI: {ari_tied:.3f}, Silhouette Score: {silhouette_tied:.3f}")
print(f"GMM Diag ARI: {ari_diag:.3f}, Silhouette Score: {silhouette_diag:.3f}")
print(f"GMM Spherical ARI: {ari_spherical:.3f}, Silhouette Score: {silhouette_spherical:.3f}")

# Pick the winner: candidates are ranked by ARI first, and the silhouette
# score only decides between candidates with equal ARI.
candidates = [
    (ari_full, silhouette_full, "Full"),
    (ari_tied, silhouette_tied, "Tied"),
    (ari_diag, silhouette_diag, "Diag"),
    (ari_spherical, silhouette_spherical, "Spherical"),
]
best_model = max(candidates, key=lambda x: (x[0], x[1]))

print(f"\nThe best performing model is the GMM with {best_model[2]} covariance type.")


GMM Full ARI: 0.184, Silhouette Score: 0.136
GMM Tied ARI: 0.184, Silhouette Score: 0.136
GMM Diag ARI: 0.184, Silhouette Score: 0.136
GMM Spherical ARI: 0.208, Silhouette Score: 0.125

The best performing model is the GMM with Spherical covariance type.
