In [None]:
import os
import sys
import pandas as pd
import numpy as np
import pylab
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import math
from scipy.stats import kurtosis, skew
import warnings
from sklearn.metrics import matthews_corrcoef
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

# Context
There are many industries where understanding how things group together is beneficial. For example, retailers want to understand the similarities among their customers to direct advertisement campaigns, and botanists classify plants based on their shared similar characteristics. One way to group objects is to use clustering algorithms. We are going to explore the usefulness of unsupervised clustering algorithms to help doctors understand which treatments might work with their patients.

# Content
We are going to cluster anonymized data of patients who have been diagnosed with heart disease. Patients with similar characteristics might respond to the same treatments, and doctors could benefit from learning about the treatment outcomes of patients like those they are treating. The data we are analyzing comes from the V.A. Medical Center in Long Beach, CA. To download the data, visit here.

Before running any analysis, it is essential to get an idea of what the data look like. The clustering algorithms we will use require numeric data, so we'll check that all the data are numeric.

# Preprocessing

* age - Age of patient
* sex - Gender of patient
* cp - chest pain type
* trestbps - Resting blood pressure (in mm Hg on admission to the hospital)
* chol - Serum cholesterol in mg/dl
* fbs - Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
* restecg - Resting electrocardiographic results
* thalach - Maximum heart rate achieved
* exang - Exercise induced angina (1 = yes; 0 = no)
* oldpeak - ST depression induced by exercise relative to rest
* slope - The slope of the peak exercise ST segment

There are 4 columns representing boolean features: sex, fbs, restecg, exang.
And 1 column representing a categorical feature: cp.

The standard k-means algorithm isn't directly applicable to categorical data, for various reasons. The sample space for categorical data is discrete, and doesn't have a natural origin. A Euclidean distance function on such a space isn't really meaningful. 
*As someone put it, "The fact a snake possesses neither wheels nor legs allows us to say nothing about the relative value of wheels and legs."* 

Since K-Means does not work well with categorical/boolean data, these columns will be dropped.
I might try an alternative algorithm that can handle them later.

**Read data, get rid of categorical features**

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read at most the first 1000 rows of the patient table.
df = pd.read_csv(
    '../input/heart-disease-patients/heart_disease_patients.csv',
    sep=',',
    nrows=1000,
)
df.dataframeName = 'heart_disease_patients.csv'

# Drop the identifier and the boolean/categorical columns, then discard rows
# with missing values; what remains is the input for the clustering.
dropped_columns = ['id', 'sex', 'fbs', 'restecg', 'exang', 'cp', 'slope']
new_df = df.drop(columns=dropped_columns).dropna()
new_df.head()

# First attempt at analysis

In [None]:
def plotPerColumnDistribution(df):
    """Plot the distribution of every column of ``df`` on one figure.

    One subplot is drawn per column (two subplots per row). The legend of each
    subplot lists summary statistics of the column: mean, standard deviation,
    minimum, quartiles, maximum, kurtosis and skewness, all rounded to two
    decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Column names must be strings
        (``str.upper`` is used for the titles); values are assumed numeric.

    Returns
    -------
    None
        The plots are drawn on a new matplotlib figure.
    """
    # Colours are cycled, so frames with more than six columns still work.
    palette = ['powderblue', 'lightsalmon', 'darkcyan', 'mediumorchid', 'plum', 'black']
    n_grid_cols = 2
    # Keep the original 3x2 layout for up to six columns, grow it beyond that.
    n_grid_rows = max(3, (len(df.columns) + n_grid_cols - 1) // n_grid_cols)

    plt.figure(figsize=(20, 20))
    for i, x in enumerate(df.columns):
        values = df[x]
        stats = {
            'Mean': values.mean(),
            'Std': values.std(),
            'Min': values.min(),
            'Q1': values.quantile(0.25),
            'Q2': values.quantile(0.5),
            'Q3': values.quantile(0.75),
            'Max': values.max(),
            'Kurtosis': kurtosis(values),
            'Skewness': skew(values),
        }
        # Build the legend line by line so that source indentation does not end
        # up inside the legend, and round every statistic (Q1 included) alike.
        label = '\n'.join(f'{name}: {round(value, 2)}' for name, value in stats.items())

        plt.subplot(n_grid_rows, n_grid_cols, i + 1)
        plt.title(x.upper(), fontsize=15)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # it is kept here so the figure looks the same as before.
        sns.distplot(values, color=palette[i % len(palette)], label=label)
        plt.legend(fontsize=15)

In [None]:
# Draw the distribution of every column kept in new_df.
plotPerColumnDistribution(new_df)

If you look at how the axis scales differ between the graphs, you can see that bringing the data to a common scale is absolutely necessary. It is also worth noting that the features differ significantly in the direction of their skewness, so we should try standardization as well.

## Correlation

In [None]:
def pair_corr(df):
    """Build a pairwise correlation matrix for the columns of ``df``.

    * binary vs. binary columns (``sex``, ``fbs``, ``exang``): Matthews
      correlation coefficient;
    * binary vs. non-binary columns: the entry is set to 0;
    * all other pairs: Pearson coefficient from ``np.corrcoef``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; every column takes part in the matrix.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``df.columns``.
    """
    bin_cols = ['sex', 'fbs', 'exang']
    corr = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for x in corr.columns:
        x_is_bin = x in bin_cols
        for i in corr.index:
            i_is_bin = i in bin_cols
            if x_is_bin and i_is_bin:
                value = matthews_corrcoef(df[x], df[i])
            elif x_is_bin != i_is_bin:
                # Mixed binary / non-binary pair.
                value = 0
            else:
                value = np.corrcoef(df[x], df[i])[0, 1]
            # Single-step .loc write: the chained form corr[x][i] = ... writes
            # to a temporary object once pandas copy-on-write is active.
            corr.loc[i, x] = value
    return corr
        
    
def r_ij(pair_corr):
    """Compute the cofactors (algebraic complements) of a square matrix.

    Parameters
    ----------
    pair_corr : array-like, shape (n, n)
        Square matrix, e.g. a correlation matrix. Any size is accepted.

    Returns
    -------
    list[list[float]]
        ``coeff[i][j]`` is ``(-1) ** (i + j)`` times the determinant of the
        matrix with column ``i`` and row ``j`` removed. For a symmetric matrix
        this is the cofactor of element ``(i, j)``.
    """
    matrix = np.asarray(pair_corr, dtype=float)
    size = matrix.shape[0]
    coeff = [[] for _ in range(size)]
    for i in range(size):
        for j in range(size):
            # Minor: drop column i, then row j.
            minor = np.delete(np.delete(matrix, i, axis=1), j, axis=0)
            coeff[i].append(((-1) ** (i + j)) * np.linalg.det(minor))
    return coeff

def get_chast_corr(df, coeff):
    """Turn a cofactor matrix into a labelled partial-correlation matrix.

    Off-diagonal entries are ``-C[i][j] / sqrt(C[i][i] * C[j][j])`` where ``C``
    is ``coeff``; diagonal entries are set to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.columns`` is used, as row and column labels of the result.
        It must have as many columns as ``coeff`` has rows.
    coeff : sequence of sequences, shape (n, n)
        Cofactors of the correlation matrix (see ``r_ij``). Any size ``n``.

    Returns
    -------
    pandas.DataFrame
        ``n x n`` frame of partial correlation coefficients.
    """
    size = len(coeff)
    chast_corr = [[] for _ in range(size)]
    for i in range(size):
        for j in range(size):
            if i == j:
                value = 1
            else:
                value = -coeff[i][j] / (coeff[i][i] * coeff[j][j]) ** (1 / 2)
            chast_corr[i].append(value)
    return pd.DataFrame(data=chast_corr, index=df.columns, columns=df.columns)


### For future analysis: a glance at the correlation between the boolean features alone. As we can see, it is barely significant.

In [None]:
# Correlation between the three binary features only, shown as a heatmap.
binary_features = ['sex', 'fbs', 'exang']
pcorrbin = pair_corr(df).loc[binary_features, binary_features]
sns.heatmap(pcorrbin.astype(float), cmap='BuPu', annot=True, square=True, vmax=1)

### Correlation between numeric features

In [None]:
# Correlation matrix of the numeric features, drawn as an annotated heatmap.
plt.figure(figsize=(8, 8))
plt.title('Correlation between numeric')
num_corr = new_df.corr()
sns.heatmap(num_corr.astype(float), cmap='BuPu', annot=True, square=True, vmax=1)

The Pearson method was used to calculate the correlation coefficients. No strong
dependencies between the features were found. A weak linear dependence occurs
between thalach (maximum heart rate) and age.

As age increases, the maximum heart rate decreases to some extent. Also, as the
maximum heart rate increases, oldpeak (ST depression) tends to decrease.

### Multicollinearity check
##### Let's check for a false correlation. The coefficient of partial correlation.

The partial correlation coefficient evaluates the tightness of dependence between two variables with fixed values of the others.
Partial coefficients are slightly weaker than ordinary pearson coefficients. This shows that the multicollinearity of features is minimal.


In [None]:
# Partial correlations: cofactors of the correlation matrix are computed
# first, then normalised by get_chast_corr and shown as a heatmap.
num_corr = new_df.corr()
coeff_num = r_ij(num_corr.to_numpy(dtype='float'))
partial_corr = get_chast_corr(num_corr, coeff_num)

plt.figure(figsize=(8, 8))
plt.title('Partial correlation between numeric')
sns.heatmap(partial_corr, cmap='BuPu', annot=True, square=True, vmax=1)

## Anomalies detection

The box plots detect two or more anomalies in each of the following features:
chol, trestbps, oldpeak.
###### Anomalies are not extreme


In [None]:
# One box plot per column of new_df, laid out on a 3x2 grid.
colors = {0:'powderblue',1:'lightsalmon',2:'darkcyan',3:'mediumorchid',4:'plum', 5:'black'}
plt.figure(figsize=(15,15))
for i, column in enumerate(new_df.columns):
    plt.subplot(3,2,i+1)
    # Pass the values as x= explicitly: depending on the seaborn version, a
    # positional Series is read as `x` or as `data`, which changes the
    # orientation of the box. With x= the box is always horizontal.
    sns.boxplot(x=new_df[column], color=colors[i]);

## Scaling 
* The goal of normalization is to convert the original set to the range [0..1] 
* The goal of standardization is to convert the original set to a new one with a mean
  value of 0 and a standard deviation of 1.

In [None]:
# Two scaled copies of the features: min-max scaled (Xnscaled) and
# standardized (Xsscaled).
standartizer, normalizer = StandardScaler(), MinMaxScaler()
Xnscaled = normalizer.fit_transform(new_df)
Xsscaled = standartizer.fit_transform(new_df)

# Choosing a number of clusters for K-Means

# Which distance metric for K-Means is better?

Often in medical research, in addition to the most popular distance -
the Euclidean distance - the cosine distance is used. It takes into account the angle between
feature vectors in a multidimensional space. 

I performed the analysis with the average and single linkage methods; neither is
indicative and neither gives a clue about the cluster structure of the data.
Take a look at it if you need to. However, I decided not to consider the
nearest-neighbor (single) method any further at the hierarchical clustering
stage, since it is not indicative for any distance metric.

## Hierarchy clustering

In [None]:
import time
import scipy.cluster.hierarchy as sch

start = time.time()

metrics = ['euclidean', 'cosine']
methods = ['single','average']

# One figure per (method, metric) pair: the left panel uses the min-max scaled
# data, the right panel the standardized data.
scaled_inputs = [(Xnscaled, 'normalized'), (Xsscaled, 'standardized')]

for method in methods:
    for metric in metrics:
        plt.figure(figsize=(20, 5))
        for panel, (points, scaling) in enumerate(scaled_inputs, start=1):
            mergings = sch.linkage(points, metric=metric, method=method)
            plt.subplot(1, 2, panel)
            sch.dendrogram(mergings, leaf_rotation=90, leaf_font_size=10)
            plt.title(f'{metric} distance with {method} method on {scaling} data')
        plt.show()

end = time.time()
print(f'Time elapsed: {end - start}')

As you can see, in most cases the average method is not applicable. Only the cosine distance on standardized data shows promise (5 clusters).

Below I test how the Euclidean and cosine distances behave in hierarchical clustering with the farthest-neighbor (complete) method. It would also be interesting to consider the Mahalanobis and Minkowski distances, but I haven't found any K-Means implementations in the built-in Python libraries that support them. If there are some, let me know!

In [None]:
start = time.time()

metrics = ['euclidean', 'cosine']
methods = ['complete']

# Same layout as for the other linkage methods: min-max scaled data on the
# left, standardized data on the right, one figure per (method, metric) pair.
scaled_inputs = [(Xnscaled, 'normalized'), (Xsscaled, 'standardized')]

for method in methods:
    for metric in metrics:
        plt.figure(figsize=(20, 5))
        for panel, (points, scaling) in enumerate(scaled_inputs, start=1):
            mergings = sch.linkage(points, metric=metric, method=method)
            plt.subplot(1, 2, panel)
            sch.dendrogram(mergings, leaf_rotation=90, leaf_font_size=10)
            plt.title(f'{metric} distance with {method} method on {scaling} data')
        plt.show()

end = time.time()
print(f'Time elapsed: {end - start}')

The Euclidean distance shows the same number of clusters for
both normalized and standardized data. Note that on the standardized data the
leftmost "orange" cluster is very small and was merged last, which is a bad
sign: most likely it is a group of outliers. We had better assume 4 clusters.

The cosine distance returns 2 and 12 clusters, respectively. However, in this field of study a split into two clusters is too trivial and hard to justify, while 12 clusters (and therefore 12 different
treatment methods) are probably too risky. The cosine distance on
standardized data may tend to place similar objects in
different groups.

All 4 dendrograms indicate the presence of a cluster structure in the data.

##### If for some reason you need to divide the data into 5 clusters, it is better to standardize it and use the cosine distance metric. The other options show no promise.

## Elbow method

The "elbow" method runs the algorithm repeatedly with an increasing number of
clusters; a clustering score is calculated as a function of the number of
clusters and plotted. The score used here is the total within-cluster sum of
squares (WSS), i.e. the total intra-cluster variation. The optimal number of
clusters is determined by the location of the "elbow bend", i.e. the point after
which adding a new cluster no longer improves the model significantly.

In [None]:
from sklearn.cluster import AgglomerativeClustering
def wss_calculation(K, data, dist, meth):
    """Total within-cluster sum of squares for 1..K agglomerative clusters.

    For every cluster count ``k`` in ``1..K`` the data are clustered with
    ``AgglomerativeClustering`` and the squared Euclidean distances of the
    points to their cluster mean are summed over all clusters. The sum of
    squares is Euclidean regardless of ``dist``.

    Parameters
    ----------
    K : int
        Largest number of clusters to evaluate.
    data : numpy.ndarray, shape (n_samples, n_features)
        Points to cluster (boolean-mask indexing is used on it).
    dist : str
        Distance metric handed to ``AgglomerativeClustering``.
    meth : str
        Linkage method handed to ``AgglomerativeClustering``.

    Returns
    -------
    list
        ``WSS[k - 1]`` is the total within-cluster sum of squares for ``k``
        clusters.
    """
    WSS = []
    for n_clusters in range(1, K + 1):
        # Newer scikit-learn names the distance argument `metric`; older
        # releases only know `affinity`. Try the current name first.
        try:
            model = AgglomerativeClustering(n_clusters=n_clusters, metric=dist, linkage=meth)
        except TypeError:
            model = AgglomerativeClustering(n_clusters=n_clusters, affinity=dist, linkage=meth)
        labels = model.fit_predict(data)

        total = 0.0
        for label in range(n_clusters):
            members = data[labels == label]
            centroid = members.mean(axis=0)
            total += np.sum((members - centroid) ** 2)
        WSS.append(total)
    return WSS


In [None]:
metrics = ['euclidean','cosine']
methods = ['complete','average']

# Left panel: min-max scaled data, right panel: standardized data.
# The second item is the word used in the panel title (title text unchanged).
elbow_inputs = [(Xnscaled, 'normalzied'), (Xsscaled, 'standartized')]

for method in methods:
    for metric in metrics:
        # Skip only the euclidean/average combination. `continue` (rather than
        # `break`) keeps the remaining metrics for the same method, so the
        # cosine/average curves are actually drawn.
        if metric == 'euclidean' and method == 'average':
            continue

        plt.figure(figsize=(20,5))
        for panel, (points, scaling_word) in enumerate(elbow_inputs, start=1):
            WSS = wss_calculation(12, points, metric, method)
            cluster_range = range(1, 13)

            plt.subplot(1, 2, panel)
            plt.grid(True)
            plt.title(f'{metric} distance with {method} method on {scaling_word} data')
            plt.xlabel('Number of cluster (k)')
            plt.ylabel('Total intra-cluster variation')
            plt.plot(cluster_range, WSS, marker = "x")

        plt.show()


For the Euclidean distance and normalized data the elbow bends at about 6 clusters: after 6 the slope becomes gentler and no longer fluctuates. For standardized data it is more difficult to choose, but intuitively the answer is between 4 and 6 clusters. For the cosine distance and normalized data the elbow bend is not obvious: 2 or 5 clusters. For standardized data it is also 2 or 5 clusters.


## Silhouette method

The silhouette coefficient is calculated for each sample from the mean
intra-cluster distance (a) and the mean distance to the nearest cluster (b).
The silhouette of a sample is (b - a) / max(a, b), where b is the mean distance
between the sample and the points of the nearest cluster that the sample does not belong to.

You can calculate the average silhouette value for all samples and use it as
a metric to estimate the number of clusters where the optimal number is at
the peak of the average silhouette.


In [None]:
from sklearn.metrics import silhouette_score
metrics = ['euclidean','cosine']
data =[Xnscaled, Xsscaled]
for metric in metrics:
    for i in (0,1):
        plt.figure(figsize=(10,5))
        hc_silhouette = []
        for j in range(2,12):
            # Newer scikit-learn names the distance argument `metric`; older
            # releases only know `affinity`. Try the current name first.
            try:
                cluster = AgglomerativeClustering(n_clusters= j, metric=metric, linkage='complete')
            except TypeError:
                cluster = AgglomerativeClustering(n_clusters= j, affinity=metric, linkage='complete')
            y_hc = cluster.fit_predict(data[i])
            # Score the partition with the same distance it was built with;
            # silhouette_score falls back to Euclidean when metric is omitted.
            silhouette = silhouette_score(data[i], y_hc, metric=metric)
            hc_silhouette.append(silhouette)

        # A new figure is opened per curve, so a single set of axes is enough.
        if i==0:
            plt.title(f'{metric.upper()}. The silhouette coefficient for normalized data ')
        else: 
            plt.title(f'{metric.upper()}. The silhouette coefficient for standartized data ')
        
        plt.grid(True)
        plt.xlabel("Number of clusters",fontsize=14)
        plt.ylabel("Silhouette score",fontsize=15)
        plt.plot(list(range(2,12)),hc_silhouette, marker = 'x')

        plt.show()

For Euclidean distance, the silhouette method shows 4 clusters on
normalized data and 2 on standardized data.
For the cosine distance, the silhouette method shows 2 clusters in both cases.

# Summary:
A single decision supported by all the graphs considered apparently does not exist in this case.
The graphs do not give an unambiguous answer and contradict each other.

Notes on the cosine distance:

The elbow and silhouette graphs for the cosine distance
on both data types can be interpreted in favor of 2 clusters.
But if you look at the dendrogram for standardized data,
you can see that the two clusters are poorly separated, since at the penultimate
merge the distance between the two clusters is barely visible.

Standardized data:
For standardized data, the Euclidean distance has greater potential to divide the sample into more than
2 clusters, if you disregard the results
of the silhouette. It is quite possible that in medicine cluster sizes differ significantly.
Given these specifics, it will be interesting to try
to divide the standardized sample into 4 clusters in the hope that the small
orange cluster (displayed on the dendrogram) will successfully join one of the
others. The dendrogram for the cosine distance looks less
attractive.

#### For standardized data, we choose the Euclidean distance and 4 clusters. For normalized - cosine distance and 4 clusters, so we could compare results.

# K-Means

### Standardized data - Euclidean distance

In [None]:
from sklearn.cluster import KMeans

def createseq(points, y_hc, cluster,i,j):
    """Collect two feature columns for the members of one cluster.

    Rows of ``points`` whose label in ``y_hc`` equals ``cluster`` are selected,
    and their values at positions ``i`` and ``j`` are returned stacked as a
    2-row array (row 0: feature ``i``, row 1: feature ``j``).
    """
    columns = [
        np.array([row[k] for row in points])[np.where(y_hc == cluster)]
        for k in (i, j)
    ]
    return np.vstack(columns)

def Kmeans1(data, NUM, random_state=None):
    """Cluster ``data`` with scikit-learn K-Means and plot every feature pair.

    One scatter subplot is drawn per pair of features, coloured by cluster;
    the legend shows the per-cluster mean of the two plotted features.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
        Scaled feature matrix. Any number of features is supported.
    NUM : int
        Number of clusters.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible labels.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row of ``data``.

    Notes
    -----
    Axis labels are read from the notebook-level ``new_df.columns``, so the
    column order of ``data`` is assumed to match ``new_df``.
    """
    km = KMeans(init='k-means++', n_clusters=NUM, n_init=12, random_state=random_state)
    km.fit(data)
    y_km = km.labels_

    # Colours are cycled, so more than six clusters no longer raise KeyError.
    palette = ['powderblue', 'lightsalmon', 'darkcyan', 'mediumorchid', 'red', 'black']

    n_features = np.asarray(data).shape[1]
    pairs = [(i, j) for i in range(n_features) for j in range(i + 1, n_features)]
    # Two plots per row; five features give the original 5x2 grid.
    n_rows = max(1, (len(pairs) + 1) // 2)

    plt.figure(figsize=(20,20))
    for count, (i, j) in enumerate(pairs, start=1):
        plt.subplot(n_rows, 2, count)
        for c in range(NUM):
            seq = createseq(data, y_km, c, i, j)
            means = (seq[0].mean(), seq[1].mean())
            plt.scatter(seq[0], seq[1],
                        c=palette[c % len(palette)],
                        label = f'Cреднее: ({means[0].astype(float).round(3)},{means[1].astype(float).round(3)})')
        plt.xlabel(f'{new_df.columns[i]}')
        plt.ylabel(f'{new_df.columns[j]}')
        plt.legend()
    plt.show()
    return(y_km)

In [None]:
# scikit-learn K-Means on the standardized data, 4 clusters.
clust_eu_stand = Kmeans1(Xsscaled, 4)

### Normalized data - cosine distance

In [None]:
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

def Kmeans2(data, NUM_CLUSTERS, random_state=None):
    """Cluster ``data`` with NLTK K-Means (cosine distance) and plot every feature pair.

    One scatter subplot is drawn per pair of features, coloured by cluster;
    the legend shows the per-cluster mean of the two plotted features.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
        Scaled feature matrix. Any number of features is supported.
    NUM_CLUSTERS : int
        Number of clusters.
    random_state : int or None, optional
        Seed for the clusterer's random generator. The default ``None`` keeps
        the previous unseeded behaviour; pass an integer for reproducible
        labels.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row of ``data``.

    Notes
    -----
    Axis labels are read from the notebook-level ``new_df.columns``, so the
    column order of ``data`` is assumed to match ``new_df``.
    """
    import random  # local import: only needed to build the seeded generator

    rng = random.Random(random_state) if random_state is not None else None
    km = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance, repeats=14, rng=rng)
    assigned_clusters = km.cluster(data, assign_clusters=True)
    y_km = np.array(assigned_clusters)

    # Colours are cycled, so more than six clusters no longer raise KeyError.
    palette = ['powderblue', 'lightsalmon', 'darkcyan', 'mediumorchid', 'red', 'black']

    n_features = np.asarray(data).shape[1]
    pairs = [(i, j) for i in range(n_features) for j in range(i + 1, n_features)]
    # Two plots per row; five features give the original 5x2 grid.
    n_rows = max(1, (len(pairs) + 1) // 2)

    plt.figure(figsize=(20,20))
    for count, (i, j) in enumerate(pairs, start=1):
        plt.subplot(n_rows, 2, count)
        for c in range(NUM_CLUSTERS):
            seq = createseq(data, y_km, c, i, j)
            means = (seq[0].mean(), seq[1].mean())
            plt.scatter(seq[0], seq[1],
                        c=palette[c % len(palette)],
                        label = f'Cреднее: ({means[0].astype(float).round(3)},{means[1].astype(float).round(3)})')
        plt.xlabel(f'{new_df.columns[i]}')
        plt.ylabel(f'{new_df.columns[j]}')
        plt.legend()
    plt.show()
    return(y_km)

In [None]:
# NLTK K-Means with cosine distance on the min-max scaled data, 4 clusters.
clust_cos_norm = Kmeans2(Xnscaled,4)

### 3D diagram (age-thalach-oldpeak)

In [None]:
# Attach both label vectors to the feature frame (the cells below read them).
new_df['cluster_norm'] = clust_cos_norm
new_df['cluster_stand'] = clust_eu_stand


# Project the patients' feature vectors into 3 dimensions.
fig = plt.figure(figsize=(8,8))
# Create the 3D axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure in current Matplotlib releases,
# which leaves the plot empty.
ax = fig.add_subplot(projection='3d')

# Same three columns as before (positions 0, 3 and 4); their names are used as
# axis labels so the figure states what is plotted.
x_col, y_col, z_col = new_df.columns[[0, 3, 4]]
ax.scatter(new_df[x_col], new_df[y_col], new_df[z_col], c=new_df['cluster_norm'].to_numpy(), cmap='viridis', s=20)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_zlabel(z_col)
_ = ax.set_title('Clusters')

#### Since the data are multi-dimensional (5 features), the 3D and 2D charts might seem chaotic, but that doesn't mean the clustering is inappropriate

## Mean values per cluster

### Standardized data - Euclidean distance

In [None]:
# Mean of every feature inside each of the 4 clusters found on the
# standardized data (rows: features, columns: clusters).
# groupby replaces the former cell-by-cell chained assignment
# (means.iloc[:, i][x] = ...), which is lost under pandas copy-on-write.
feature_columns = [c for c in new_df.columns if c not in ['cluster_norm', 'cluster_stand']]
means = (
    new_df.groupby('cluster_stand')[feature_columns]
    .mean()
    .reindex(range(4))   # keep a column even for a label that never occurs
    .round(3)
    .T
)
means.columns = ['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4']
means

### Normalized data - cosine distance

In [None]:
# Mean of every feature inside each of the 4 clusters found on the
# min-max scaled data (rows: features, columns: clusters).
# groupby replaces the former cell-by-cell chained assignment
# (means.iloc[:, i][x] = ...), which is lost under pandas copy-on-write.
feature_columns = [c for c in new_df.columns if c not in ['cluster_norm', 'cluster_stand']]
means = (
    new_df.groupby('cluster_norm')[feature_columns]
    .mean()
    .reindex(range(4))   # keep a column even for a label that never occurs
    .round(3)
    .T
)
means.columns = ['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4']
means

# Comparing results
### Rand index

In [None]:
from scipy.special import comb

def rand_index_score(clusters1, clusters2):
    """Rand index between two labelings of the same samples.

    Counts the sample pairs on which the two labelings agree (the pair is in
    one cluster in both, or in different clusters in both) and divides by the
    total number of counted pairs. Labels must be non-negative integers
    (``np.bincount`` is applied to them).
    """
    same_in_first = comb(np.bincount(clusters1), 2).sum()
    same_in_second = comb(np.bincount(clusters2), 2).sum()

    paired = np.c_[(clusters1, clusters2)]
    # Pairs that share a cluster in both labelings, accumulated cluster by cluster.
    same_in_both = 0
    for label in set(clusters1):
        second_labels = paired[paired[:, 0] == label, 1]
        same_in_both = same_in_both + comb(np.bincount(second_labels), 2).sum()

    only_first = same_in_first - same_in_both
    only_second = same_in_second - same_in_both
    apart_in_both = comb(len(paired), 2) - same_in_both - only_first - only_second
    return (same_in_both + apart_in_both) / (same_in_both + only_first + only_second + apart_in_both)


In [None]:
# Agreement between the two K-Means labelings (cosine/normalized vs. Euclidean/standardized).
rand_index_score(clust_cos_norm, clust_eu_stand)

The two results agree by 76%. I leave the interpretation to the specialists in the subject area.

Both algorithms showed reasonable results and did not differ too much from
each other. According to the Rand index, the clusterings obtained with the two
scaling methods agree on almost 80% of the patient pairs. The division into 4 clusters looks reasonable and
can be considered in further research.