In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://www.ucsf.edu/sites/default/files/legacy_files/futuristicheart-600-04092012.jpg)

Clustering is a great technique for dividing a dataset into segments of similar observations. The purpose of this notebook is to apply different clustering techniques, namely:
1. K-means clustering
2. Hierarchical clustering
3. Density based clustering (DBSCAN)

We will visualise and analyse the clusters produced by each of these methods, and report the pros and cons of each one as we proceed with the implementation.

I have chosen the heart-attack-prediction dataset to perform clustering.

In [None]:
# Read the heart-attack CSV from the Kaggle input directory and preview its first five rows
HAD = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")
HAD.head()


> Data description :-
> The dataset has 303 rows and 14 columns.
> The columns represent:
> 1. "age" = age of the patient.
> 2. "sex" = whether male(1) or female(0), 
> 3. "cp" = chest pain type (typical angina, atypical angina, non-anginal pain or asymptomatic)
> 4. "trtbps" = resting blood pressure (in mm Hg)
> 5. "chol" = cholesterol in mg/dl
> 6. "fbs" = fasting blood sugar>120, 1 for "True" and 0 for "False"
> 7. "restecg" = resting electrocardiographic results
> 8. "thalachh" = maximum heart rate achieved
> 9. "exng" = exercise induced angina
> 10. "oldpeak" = previous peak (ST depression induced by exercise relative to rest)

In [None]:
sns.set()
# Human-readable x-axis labels, listed in the same order as the dataframe's columns.
lst = ["Age","Sex","Chest pain","Resting blood pressure","Cholestrol","Fasting blood suger >120? ","resting electrocardiographic result","Max heart rate","Excercise induced angina","old Peak","slp","caa","thall","Whether high risk?"]
lst1 = list(HAD.columns)
plt.figure(figsize=(17,22))
# Draw one histogram per column on a 5x3 grid. zip() pairs every column with its
# label directly, so the loop no longer relies on two lists being indexed in step.
for i, (column, label) in enumerate(zip(lst1, lst)):
    plt.subplot(5,3,i+1)
    plt.hist(HAD[column],bins=15,color="green")
    plt.xlabel(label)
    plt.ylabel("Counts")



In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
plt.figure(figsize=(15,15))
correlations = HAD.corr()
sns.heatmap(correlations, annot=True, cmap="Blues_r")

In [None]:
# Look at the value distribution of the 0/1 columns; they are removed afterwards
# because they won't be of much help in forming the clusters
lst = ["sex","fbs","exng","output"]
for binary_column in lst:
    counts = HAD[binary_column].value_counts()
    print(counts)


In [None]:
# Build a new frame without the binary columns explored above (HAD itself is left untouched)
HADE = HAD.drop(columns=lst)
HADE.head()


In [None]:
#standardizing the dataset and applying K-means clustering on the dataset
# StandardScaler's with_mean/with_std flags are booleans. The previous with_mean=0 was
# falsy, which switches centering off, so the data was only scaled and never centered
# (newer scikit-learn releases may also reject non-boolean values outright).
# The defaults (with_mean=True, with_std=True) perform the full standardization.
SS = StandardScaler()
HADES = SS.fit_transform(HADE)
from sklearn.cluster import KMeans
# Fixed random_state and explicit n_init make the fit reproducible across
# "Restart & Run All", so the cluster numbering does not change between runs.
KMC = KMeans(n_clusters=2, n_init=10, random_state=42)
KMC.fit(HADES)
# Look the feature positions up by column name instead of hard-coding 0 and 5,
# so the centers stay correct if the column order of HADE ever changes.
age_idx = HADE.columns.get_loc("age")
thalachh_idx = HADE.columns.get_loc("thalachh")
# Centroid coordinates (in standardized units) of both clusters along each feature.
center_age = [KMC.cluster_centers_[0][age_idx],KMC.cluster_centers_[1][age_idx]]
center_thalachh = [KMC.cluster_centers_[0][thalachh_idx],KMC.cluster_centers_[1][thalachh_idx]]

In [None]:
# To visualise the n = 2 clustering, plot age against max heart rate, as these two
# variables have a wide range of values. Left panel: points coloured by KMeans
# cluster; right panel: the same points coloured by the actual "output" label.
fig, axes = plt.subplots(1, 2, figsize=(20,10))
# (hue values, panel title) for each side-by-side panel; everything else is shared,
# so the panels are drawn in one loop instead of two copy-pasted stanzas.
panels = [
    (KMC.labels_, "Predicted outcome based on clustering"),
    (HAD["output"], "Actual outcome"),
]
for ax, (hue_values, panel_title) in zip(axes, panels):
    sns.scatterplot(x="age",y="thalachh",data=HADE,hue=hue_values,s=40,ax=ax)
    ax.set(title=panel_title, xlabel="Age", ylabel="Max heart Rate")

In [None]:
def accr(actual_labels,predicted_labels):
    """Return how well two-cluster assignments agree with binary ground-truth labels.

    Cluster ids produced by a clustering algorithm are arbitrary: cluster 0 may
    correspond to label 1 and vice versa. The score is therefore the larger of
    the direct match rate and the match rate with the two cluster ids swapped
    (which, for 0/1 labels, equals ``1 - direct match rate``). This makes the
    result independent of how the clusters happen to be numbered.

    Parameters
    ----------
    actual_labels : sequence
        Ground-truth labels, one per sample.
    predicted_labels : sequence
        Cluster ids assigned to the same samples, in the same order.

    Returns
    -------
    float
        Agreement in the range [0.5, 1.0].

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    total = len(actual_labels)
    if total == 0:
        raise ValueError("accr() needs at least one label")
    if len(predicted_labels) != total:
        raise ValueError("actual_labels and predicted_labels must have the same length")
    # zip() iterates positionally, so this also works for inputs that do not
    # support 0..n-1 integer indexing.
    matches = sum(1 for actual, predicted in zip(actual_labels, predicted_labels) if actual == predicted)
    agreement = matches/total
    # Take the better of the two possible cluster-id -> label mappings.
    return max(agreement, 1-agreement)
# Score the KMeans cluster assignments against the actual "output" column
actual_outcomes = list(HAD["output"])
cluster_assignments = list(KMC.labels_)
print(accr(actual_outcomes, cluster_assignments))

> The clusters predicted by KMeans have an accuracy of around 80%, which is good enough

In [None]:
# Elbow method: fit KMeans for k = 2..9 and record the inertia of every fit.
metric = []
i=[]
for k in range (2,10):
    # Use a separate name for the per-k model: rebinding KMC here would leave it
    # as the 9-cluster model, so re-running the cells above that read KMC.labels_
    # would no longer show the 2-cluster result.
    # Fixed random_state and explicit n_init keep the curve reproducible.
    elbow_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    elbow_model.fit(HADES)
    metric.append(elbow_model.inertia_)
    i.append(k)
plt.figure(figsize=(10,10))    
plt.plot(i,metric,'bx-')
plt.title('Elbow curve for KMeans')
plt.xlabel('No of centroids')
plt.ylabel('sum of square of distance between nearest centroid')

Let's analyze the pros and cons of using KMeans clustering:

Pros would be :-
1. Can be used on large dataset
2. Easy to use

Cons would be :-
1. It assumes roughly spherical clusters, and hence fails when the true segments have other shapes