<h2>Data Reading</h2>

In [41]:
import pandas as pd
import numpy as np

In [42]:
# Load the raw table; the relative path is resolved against the current working directory.
csv_path = "heartdiseasedata.csv"
df = pd.read_csv(csv_path)

In [43]:
# Preview the first five rows of the raw frame ('?' entries are replaced with NaN in a later cell).
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [44]:
# Preview the last five rows of the raw frame.
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
285,52,1,4,160,331,0,0,94,1,2.5,?,?,?,1
286,54,0,3,130,294,0,1,100,1,0.0,2,?,?,1
287,56,1,4,155,342,1,0,150,1,3.0,2,?,?,1
288,58,0,2,180,393,0,0,110,1,1.0,2,?,7,1
289,65,1,4,130,275,0,1,115,1,1.0,2,?,?,1


<h2>1. Data Understanding and Preparation</h2>

In [45]:
# Turn the file's '?' placeholder into a real missing value so pandas can count
# and impute it. Use np.nan: the np.NaN alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

In [46]:
# (rows, columns) of the frame before any cleaning.
df.shape

(290, 14)

In [47]:
# Inspect column dtypes; several measurement columns are still `object` rather than numeric here.
df.dtypes

age             int64
sex             int64
cp              int64
trestbps       object
chol           object
fbs            object
restecg        object
thalach        object
exang          object
oldpeak       float64
slope          object
ca             object
thal           object
num             int64
dtype: object

In [48]:
# Summary statistics; by default only numeric-dtype columns are included.
df.describe()

Unnamed: 0,age,sex,cp,oldpeak,num
count,290.0,290.0,290.0,290.0,290.0
mean,47.865517,0.72069,2.989655,0.59069,0.358621
std,7.78779,0.449436,0.968302,0.912627,0.480425
min,28.0,0.0,1.0,0.0,0.0
25%,42.0,0.0,2.0,0.0,0.0
50%,49.0,1.0,3.0,0.0,0.0
75%,54.0,1.0,4.0,1.0,1.0
max,66.0,1.0,4.0,5.0,1.0


In [49]:
# Summary of every column; object-dtype columns get count/unique/top/freq instead of mean/std.
df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,290.0,290.0,290.0,289.0,267.0,282.0,289.0,289.0,289.0,290.0,103.0,3.0,28.0,290.0
unique,,,,31.0,152.0,2.0,3.0,71.0,2.0,,3.0,1.0,3.0,
top,,,,120.0,230.0,0.0,0.0,150.0,0.0,,2.0,0.0,7.0,
freq,,,,63.0,5.0,262.0,232.0,29.0,201.0,,90.0,3.0,11.0,
mean,47.865517,0.72069,2.989655,,,,,,,0.59069,,,,0.358621
std,7.78779,0.449436,0.968302,,,,,,,0.912627,,,,0.480425
min,28.0,0.0,1.0,,,,,,,0.0,,,,0.0
25%,42.0,0.0,2.0,,,,,,,0.0,,,,0.0
50%,49.0,1.0,3.0,,,,,,,0.0,,,,0.0
75%,54.0,1.0,4.0,,,,,,,1.0,,,,1.0


<h3>Deal with missing data</h3> 

In [50]:
# Non-null count per column, to see how much of each column is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         290 non-null    int64  
 1   sex         290 non-null    int64  
 2   cp          290 non-null    int64  
 3   trestbps    289 non-null    object 
 4   chol        267 non-null    object 
 5   fbs         282 non-null    object 
 6   restecg     289 non-null    object 
 7   thalach     289 non-null    object 
 8   exang       289 non-null    object 
 9   oldpeak     290 non-null    float64
 10  slope       103 non-null    object 
 11  ca          3 non-null      object 
 12  thal        28 non-null     object 
 13  num         290 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 31.8+ KB


In [51]:
# Remove the slope, ca and thal columns and rebind df to the reduced frame.
df = df.drop(columns=['slope', 'ca', 'thal'])

In [52]:
# Re-check the schema after dropping slope, ca and thal.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         290 non-null    int64  
 1   sex         290 non-null    int64  
 2   cp          290 non-null    int64  
 3   trestbps    289 non-null    object 
 4   chol        267 non-null    object 
 5   fbs         282 non-null    object 
 6   restecg     289 non-null    object 
 7   thalach     289 non-null    object 
 8   exang       289 non-null    object 
 9   oldpeak     290 non-null    float64
 10  num         290 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 25.0+ KB


In [53]:
# Fill the remaining gaps with each column's most frequent value (its mode).
# The result is assigned back to df[col] rather than calling
# df[col].fillna(..., inplace=True): that form works on an intermediate Series,
# emits a chained-assignment FutureWarning in pandas 2.2+, and leaves df
# unchanged once Copy-on-Write is active.
for col in ['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [54]:
# Verify that no column has missing values left after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         290 non-null    int64  
 1   sex         290 non-null    int64  
 2   cp          290 non-null    int64  
 3   trestbps    290 non-null    object 
 4   chol        290 non-null    object 
 5   fbs         290 non-null    object 
 6   restecg     290 non-null    object 
 7   thalach     290 non-null    object 
 8   exang       290 non-null    object 
 9   oldpeak     290 non-null    float64
 10  num         290 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 25.0+ KB


In [55]:
# Shape before removing duplicate rows.
df.shape

(290, 11)

In [56]:
# Keep the first occurrence of each fully identical row. Index labels are not
# reset, so the index keeps a gap where a row was removed.
df = df.drop_duplicates(keep='first')

In [57]:
# Shape after removing duplicate rows.
df.shape

(289, 11)

In [58]:
# Spot-check the first rows after imputation and de-duplication.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130,132,0,2,185,0,0.0,0
1,29,1,2,120,243,0,0,160,0,0.0,0
2,29,1,2,140,230,0,0,170,0,0.0,0
3,30,0,1,170,237,0,1,170,0,0.0,0
4,31,0,2,100,219,0,1,150,0,0.0,0


In [59]:
# Continuous measurements become integers; sex and cp become object dtype so
# that pd.get_dummies treats them as categorical later on.
df = df.astype({
    'trestbps': 'int64',
    'chol': 'int64',
    'thalach': 'int64',
    'sex': 'object',
    'cp': 'object',
})

In [60]:
# Confirm the dtype conversions took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 289 entries, 0 to 289
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         289 non-null    int64  
 1   sex         289 non-null    object 
 2   cp          289 non-null    object 
 3   trestbps    289 non-null    int64  
 4   chol        289 non-null    int64  
 5   fbs         289 non-null    object 
 6   restecg     289 non-null    object 
 7   thalach     289 non-null    int64  
 8   exang       289 non-null    object 
 9   oldpeak     289 non-null    float64
 10  num         289 non-null    int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 27.1+ KB


In [61]:
# Frequency of each distinct value in `sex`.
df['sex'].value_counts()

1    209
0     80
Name: sex, dtype: int64

In [62]:
# Frequency of each distinct value in `cp`.
df['cp'].value_counts()

4    123
2    103
3     52
1     11
Name: cp, dtype: int64

In [63]:
# Frequency of each distinct value in `fbs`.
df['fbs'].value_counts()

0    269
1     20
Name: fbs, dtype: int64

In [64]:
# Frequency of each distinct value in `restecg`.
df['restecg'].value_counts()

0    232
1     51
2      6
Name: restecg, dtype: int64

In [65]:
# Frequency of each distinct value in `exang`.
df['exang'].value_counts()

0    201
1     88
Name: exang, dtype: int64

In [66]:
# The target column's header carries trailing spaces ('num       '). Strip
# whitespace from every column label instead of matching one exact padded
# string: rename() silently does nothing when the key is not found, which
# would leave the padded name in place and break the later df['num'] lookup.
df = df.rename(columns=str.strip)

In [67]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

<h3>Create dummy variables for categorical columns</h3>

In [68]:
# Preview the frame before one-hot encoding.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130,132,0,2,185,0,0.0,0
1,29,1,2,120,243,0,0,160,0,0.0,0
2,29,1,2,140,230,0,0,170,0,0.0,0
3,30,0,1,170,237,0,1,170,0,0.0,0
4,31,0,2,100,219,0,1,150,0,0.0,0


In [69]:
# Check which columns are object dtype; pd.get_dummies expands object-dtype columns by default.
df.dtypes

age           int64
sex          object
cp           object
trestbps      int64
chol          int64
fbs          object
restecg      object
thalach       int64
exang        object
oldpeak     float64
num           int64
dtype: object

In [70]:
# One-hot encode the categorical (object-dtype) columns as <column>_<value>
# indicators; the numeric columns pass through unchanged.
df = pd.get_dummies(data=df, prefix_sep='_')

In [71]:
# Assign the cast back to df: the bare `df.astype('float64')` expression only
# displayed a converted copy and left df itself unchanged. The trailing `df`
# keeps the cell's rendered output the same as before.
df = df.astype('float64')
df

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,num,sex_0,sex_1,cp_1,cp_2,cp_3,cp_4,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,exang_0,exang_1
0,28.0,130.0,132.0,185.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,29.0,120.0,243.0,160.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,29.0,140.0,230.0,170.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,30.0,170.0,237.0,170.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,31.0,100.0,219.0,150.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,52.0,160.0,331.0,94.0,2.5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
286,54.0,130.0,294.0,100.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
287,56.0,155.0,342.0,150.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
288,58.0,180.0,393.0,110.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [72]:
# Shape after one-hot encoding.
df.shape

(289, 19)

<h3>Standardize data</h3>

In [73]:
from sklearn import preprocessing

In [74]:
# Reference labels: the `num` column as a NumPy array, used later to score the clustering.
y = df.loc[:, 'num'].to_numpy()

In [75]:
# Feature matrix: every column except `num`.
X = df.drop(columns=['num'])

In [76]:
# Scaler that centres each feature and scales it to unit variance (defaults spelled out).
transform = preprocessing.StandardScaler(with_mean=True, with_std=True)

In [77]:
# Learn per-column mean and standard deviation from X, then replace X with its
# standardized version (a NumPy array).
transform.fit(X)
X = transform.transform(X)
# Show the first five standardized rows.
X[:5]

array([[-2.55044418, -0.15556787, -1.78401207,  1.98244347, -0.64995321,
        -0.61868822,  0.61868822, -0.19891794,  1.34380997, -0.46841145,
        -0.86079257,  0.27267094, -0.27267094, -2.01746758, -0.46291005,
         6.86779926,  0.66167284, -0.66167284],
       [-2.42203331, -0.72323344, -0.08928249,  0.91235351, -0.64995321,
        -0.61868822,  0.61868822, -0.19891794,  1.34380997, -0.46841145,
        -0.86079257,  0.27267094, -0.27267094,  0.49567091, -0.46291005,
        -0.14560705,  0.66167284, -0.66167284],
       [-2.42203331,  0.41209771, -0.28776433,  1.3403895 , -0.64995321,
        -0.61868822,  0.61868822, -0.19891794,  1.34380997, -0.46841145,
        -0.86079257,  0.27267094, -0.27267094,  0.49567091, -0.46291005,
        -0.14560705,  0.66167284, -0.66167284],
       [-2.29362245,  2.11509444, -0.18088949,  1.3403895 , -0.64995321,
         1.61632299, -1.61632299,  5.02719875, -0.74415283, -0.46841145,
        -0.86079257,  0.27267094, -0.27267094, -2.017

<h2>2. Modeling and Evaluation</h2>

<b>K-means clustering</b>

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans 

In [79]:
# Partition the standardized features into two clusters.
clusterNum = 2
# A fixed random_state makes centroid initialisation, and so the cluster ids,
# reproducible across kernel restarts.
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12, random_state=0)
k_means.fit(X)

# Cluster ids are arbitrary: nothing ties id 1 to num == 1. Copy the labels so
# the fitted model's labels_ array is not mutated, then swap 0 and 1 only when
# that gives better agreement with the reference labels in y. This replaces an
# unconditional flip that was only right for one particular random init.
labels = k_means.labels_.copy()
if (labels == y).mean() < 0.5:
    labels = 1 - labels
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0
 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1
 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1
 1 0 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1]


In [80]:
# With 0/1 labels, |label - truth| is 1 for a mismatch and 0 for a match, so the
# sum counts the rows where the cluster label disagrees with `num`.
total_dif = sum(abs(predicted - actual) for predicted, actual in zip(labels, y))
# Convert the mismatch rate into a percentage of matching rows.
res = (1 - total_dif / len(labels)) * 100
print('Accuracy: ', round(res,2))

Accuracy:  81.31
