In [2]:
# Extract the contents of archive.zip into the current working directory
from zipfile import ZipFile

with ZipFile("archive.zip", mode='r') as archive:
    archive.extractall()

In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read heart.csv into a pandas DataFrame; the bare `data` expression on the
# last line renders the frame as the cell output.
data = pd.read_csv("heart.csv")
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [5]:
# (number of rows, number of columns) of the loaded frame
data.shape

(918, 12)

In [6]:
# Basic statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output below reports a minimum of 0 for RestingBP and
# Cholesterol — presumably placeholders for missing measurements; confirm
# against the dataset description before modelling.
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [7]:
# Check for missing data and inspect datatypes: info() prints each column's
# non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [8]:
# Pick out the object-dtype (categorical) feature columns that need encoding
data_cat = data.loc[:, ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]]

In [9]:
# One-hot encode the categorical columns. sparse_output=False makes
# fit_transform return a dense NumPy array instead of a sparse matrix.
# NOTE(review): the encoder is fitted on every row, i.e. before the
# train/test split performed further down.
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder(sparse_output=False)
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [10]:
# Drop the categorical columns from the main dataframe, leaving the numeric
# features plus the HeartDisease target.
# The columns are named explicitly through data_cat.columns rather than by
# passing the DataFrame itself as the labels argument, and errors="ignore"
# keeps the cell idempotent: re-running it once the columns are already gone
# is a no-op instead of a KeyError.
data = data.drop(columns=data_cat.columns, errors="ignore")
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,40,140,289,0,172,0.0,0
1,49,160,180,0,156,1.0,1
2,37,130,283,0,98,0.0,0
3,48,138,214,0,108,1.5,1
4,54,150,195,0,122,0.0,0
...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1
914,68,144,193,1,141,3.4,1
915,57,130,131,0,115,1.2,1
916,57,130,236,0,174,0.0,1


In [11]:
# Keep the target column (HeartDisease) as a separate Series of labels
labels = data.loc[:, "HeartDisease"]

In [12]:
# Inspect the distinct values taken by the FastingBS column
data["FastingBS"].unique()

array([0, 1])

In [13]:
# Scale the numerical features using z-score standardisation.
# The target is excluded by name instead of by position (iloc[:, :-1]), so the
# label is never scaled by accident if the column order changes.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features. FastingBS only takes
# the values 0 and 1 and is standardised along with the continuous columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data.drop(columns="HeartDisease"))
data_num_scaled.shape

(918, 6)

In [14]:
# Place the scaled numeric features and the one-hot columns side by side.
# Note that `data` is rebound here from a DataFrame to a 2-D NumPy array.
data = np.concatenate((data_num_scaled, data_cat_1hot), axis=1)
data

array([[-1.4331398 ,  0.41090889,  0.82507026, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47848359,  1.49175234, -0.17196105, ...,  0.        ,
         1.        ,  0.        ],
       [-1.75135854, -0.12951283,  0.7701878 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.37009972, -0.12951283, -0.62016778, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.37009972, -0.12951283,  0.34027522, ...,  0.        ,
         1.        ,  0.        ],
       [-1.64528563,  0.30282455, -0.21769643, ...,  0.        ,
         0.        ,  1.        ]])

In [15]:
# (rows, total feature columns) of the combined feature matrix
data.shape

(918, 20)

In [16]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Logistic Regression: fit on the training split, then predict the held-out rows
from sklearn.linear_model import LogisticRegression

LR_classifier = LogisticRegression().fit(X_train, y_train)
LR_preds = LR_classifier.predict(X_test)

In [18]:
# Support Vector Classifier with default hyper-parameters
from sklearn.svm import SVC

svc = SVC().fit(X_train, y_train)
svc_preds = svc.predict(X_test)

In [19]:
# Decision Tree Classifier
# random_state is fixed so the fitted tree, and therefore the reported
# accuracy, is the same on every run; the seed matches the train/test split.
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_preds = tree.predict(X_test)

In [20]:
# Random Forest Classifier
# random_state is fixed so the forest's random choices, and therefore the
# reported accuracy, are reproducible from run to run.
from sklearn.ensemble import RandomForestClassifier

ensemble = RandomForestClassifier(random_state=42)
ensemble.fit(X_train, y_train)
ensemble_preds = ensemble.predict(X_test)

In [21]:
# Performance: test-set accuracy of each model, rounded to two decimals
from sklearn.metrics import accuracy_score

print("Accuracies:\n-----------")
for model_name, model_preds in (
    ("Logistic Regression", LR_preds),
    ("Support Vector Classifier", svc_preds),
    ("Decision Tree Classifier", tree_preds),
    ("Random Forest Classifier", ensemble_preds),
):
    print(f"{model_name}: {round(accuracy_score(y_test, model_preds), 2)}")

Accuracies:
-----------
Logistic Regression: 0.85
Support Vector Classifier: 0.87
Decision Tree Classifier: 0.78
Random Forest Classifier: 0.89


In [49]:
# Dimensionality reduction using Principal Component Analysis
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep the smallest number
# of principal components whose cumulative explained variance reaches that
# fraction — here 80% of the variance (not 80% of the rows).
# NOTE(review): PCA is fitted on all rows, including those that later end up
# in the test split.
pca = PCA(n_components=0.8)

data_pca = pca.fit_transform(data)
data_pca.shape

(918, 7)

In [50]:
# Only 7 principal components are needed to retain 80% of the variance of the data!
data_pca

array([[ 2.42495006, -0.57975713, -0.64151806, ..., -0.76973001,
         0.24859804, -0.14117505],
       [ 0.52325202, -0.85531356, -0.84307389, ..., -1.14881265,
        -0.62007973, -0.47075247],
       [ 1.17338371,  0.13333897,  0.391062  , ..., -0.52980025,
         1.46523659, -0.68446165],
       ...,
       [-1.13792834,  0.16672774,  1.08147719, ..., -0.36057238,
        -0.04172823,  0.25063299],
       [ 1.34148106, -0.53024758, -0.70764215, ...,  0.8208487 ,
        -0.69147761,  0.80216521],
       [ 2.29071642,  0.20364654, -0.45991568, ..., -1.21337918,
        -0.35190522, -0.01713558]])

In [51]:
# Fraction of the total variance explained by each retained principal component.
pca.explained_variance_ratio_

array([0.25137218, 0.15334608, 0.11336034, 0.09797289, 0.08000528,
       0.07223638, 0.04188982])

In [52]:
# Split the PCA-projected data into train and test sets, using the same
# test_size and random_state as the split of the uncompressed data.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    data_pca,
    labels,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Logistic Regression trained and evaluated on the PCA-compressed features
from sklearn.linear_model import LogisticRegression

LR_classifier_pca = LogisticRegression().fit(X_train_pca, y_train_pca)
LR_preds_pca = LR_classifier_pca.predict(X_test_pca)

In [54]:
# Support Vector Classifier (SVC, a classifier rather than a regressor)
# trained and evaluated on the PCA-compressed features
from sklearn.svm import SVC

svc_pca = SVC().fit(X_train_pca, y_train_pca)
svc_preds_pca = svc_pca.predict(X_test_pca)

In [55]:
# Decision Tree Classifier on compressed data
# random_state is fixed so the fitted tree, and therefore the reported
# accuracy, is the same on every run.
from sklearn.tree import DecisionTreeClassifier

tree_pca = DecisionTreeClassifier(random_state=42)
tree_pca.fit(X_train_pca, y_train_pca)
tree_preds_pca = tree_pca.predict(X_test_pca)

In [56]:
# Random Forest Classifier on compressed data
# random_state is fixed so the forest's random choices, and therefore the
# reported accuracy, are reproducible from run to run.
from sklearn.ensemble import RandomForestClassifier

ensemble_pca = RandomForestClassifier(random_state=42)
ensemble_pca.fit(X_train_pca, y_train_pca)
ensemble_preds_pca = ensemble_pca.predict(X_test_pca)

In [57]:
# Performance after compression: test-set accuracy of each model trained on
# the PCA-projected features, rounded to two decimals
print("Accuracies after pca:\n-----------")
for model_name, model_preds in (
    ("Logistic Regression", LR_preds_pca),
    ("Support Vector Classifier", svc_preds_pca),
    ("Decision Tree Classifier", tree_preds_pca),
    ("Random Forest Classifier", ensemble_preds_pca),
):
    print(f"{model_name}: {round(accuracy_score(y_test_pca, model_preds), 2)}")

Accuracies after pca:
-----------
Logistic Regression: 0.85
Support Vector Classifier: 0.81
Decision Tree Classifier: 0.76
Random Forest Classifier: 0.83


References

1. Dataset: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

2. Principal Component Analysis with Python Code: https://youtu.be/8klqIM9UvAc

3. Principal Component Analysis (PCA), Step-by-Step: https://youtu.be/FgakZw6K1QQ

