In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn import datasets, metrics


from sqlalchemy import create_engine
import warnings

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Connection settings for the PostgreSQL database holding the heart disease table.
# NOTE(review): credentials are hardcoded in the notebook; consider loading them
# from environment variables or a secrets store before sharing this file.
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Load the whole table into a DataFrame. The try/finally guarantees the
# engine's connection pool is released even when the query fails.
try:
    heartdisease_df = pd.read_sql_query('select * from heartdisease', con=engine)
finally:
    engine.dispose()

In [2]:
# Define the features (first 13 columns) and the outcome (14th column)
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
# NOTE(review): 0 may also be a legitimate value in some columns, so this
# imputation cannot be told apart from real data afterwards — confirm it is acceptable.
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# The previous expression, np.where(y > 0, 0, 1), assigned the labels the other
# way round. Presumably outcome values above 0 indicate a diagnosis — confirm
# against the dataset description. ARI is unaffected by swapping the two labels.
y = np.where(y > 0, 1, 0)

# Standardize the features to zero mean and unit variance
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

## Apply GMM to the heart disease data by setting n_components=2. Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [4]:
# Two-component Gaussian mixture model with a fixed seed
gmm_cluster = GaussianMixture(random_state=13, n_components=2)

# Fit the mixture and assign every observation to a component
clusters = gmm_cluster.fit_predict(X_std)

# Agreement between the component assignments and the binarized outcome
ari_score = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(ari_score))

# Cluster separation measured on the standardized features
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(silhouette))

Adjusted Rand Index of the GMM solution: 0.4207322145049338
The silhoutte score of the GMM solution: 0.16118591340148433


K-Means: ARI .43 and Silhouette .17
Hierarchical: ARI .14 and Silhouette .14
GMM: ARI .42 and Silhouette .16

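K-means slightly outperforms GMM on both ARI (.43 vs. .42) and silhouette (.17 vs. .16); hierarchical clustering scores lowest on both metrics.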

## The GMM implementation in scikit-learn has a parameter called covariance_type, which determines the type of covariance parameters to use. Specifically, there are four types you can specify:

#### full: This is the default. Each component has its own general covariance matrix.
#### tied: All components share the same general covariance matrix.
#### diag: Each component has its own diagonal covariance matrix.
#### spherical: Each component has its own single variance.
#### Try all of these. Which one performs best in terms of ARI and silhouette scores?

In [5]:
# GMM where each component has its own general covariance matrix ('full')
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=13,
)

# Fit the mixture and label each observation with its component
clusters = gmm_cluster.fit_predict(X_std)

# ARI compares the assignments with the outcome labels
ari_score = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(ari_score))

# Silhouette is computed with Euclidean distances on the standardized features
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(silhouette))

Adjusted Rand Index of the GMM solution: 0.4207322145049338
The silhoutte score of the GMM solution: 0.16118591340148433


In [6]:
# GMM where all components share one general covariance matrix ('tied')
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='tied',
    random_state=13,
)

# Fit the mixture and label each observation with its component
clusters = gmm_cluster.fit_predict(X_std)

# ARI compares the assignments with the outcome labels
ari_score = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(ari_score))

# Silhouette is computed with Euclidean distances on the standardized features
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(silhouette))

Adjusted Rand Index of the GMM solution: 0.46482432589803474
The silhoutte score of the GMM solution: 0.1660701212463109


In [7]:
# GMM where each component has its own diagonal covariance matrix ('diag')
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='diag',
    random_state=13,
)

# Fit the mixture and label each observation with its component
clusters = gmm_cluster.fit_predict(X_std)

# ARI compares the assignments with the outcome labels
ari_score = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(ari_score))

# Silhouette is computed with Euclidean distances on the standardized features
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(silhouette))

Adjusted Rand Index of the GMM solution: 0.3870024156200561
The silhoutte score of the GMM solution: 0.1604139815113049


In [8]:
# GMM where each component has its own single variance ('spherical')
gmm_cluster = GaussianMixture(
    n_components=2,
    covariance_type='spherical',
    random_state=13,
)

# Fit the mixture and label each observation with its component
clusters = gmm_cluster.fit_predict(X_std)

# ARI compares the assignments with the outcome labels
ari_score = metrics.adjusted_rand_score(y, clusters)
print("Adjusted Rand Index of the GMM solution: {}".format(ari_score))

# Silhouette is computed with Euclidean distances on the standardized features
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("The silhoutte score of the GMM solution: {}".format(silhouette))

Adjusted Rand Index of the GMM solution: 0.20765243525722465
The silhoutte score of the GMM solution: 0.12468753110276873


covariance_type='tied' performs best on both ARI (0.465) and silhouette (0.166), followed by 'full', 'diag' and 'spherical' in that order on both metrics.