# Heart Disease Prediction

## Imports and Installation

In [1]:
# Install the following libraries (if running on Google Colab, these are the only ones you should need to download)

! pip install torchmetrics -q
! pip install torchinfo -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/729.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/729.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/729.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m727.0/729.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m729.2/729.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Data manipulation and loading
import numpy as np
import pandas as pd

# PyTorch
import torch
from torch import nn
from torch.optim import Adam

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Progress Bar
from tqdm.auto import tqdm

## Reading the data

In [3]:
data = pd.read_csv('heart.csv.xls')
data.head() # get first 5 rows

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
data.columns # get column names

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [5]:
# Separating categorical columns from numerical columns
categorical = []
numerical = []

for col in data:
    if data[col].dtype == 'object':
        categorical.append(col)
    else:
        numerical.append(col)

categorical, numerical

(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
 ['Age',
  'RestingBP',
  'Cholesterol',
  'FastingBS',
  'MaxHR',
  'Oldpeak',
  'HeartDisease'])

In [6]:
# Counting the values of each categorical column and showing how many categories they each have
for col in categorical:
    print(f'\n{col}: \n{data[col].value_counts()}')


Sex: 
M    725
F    193
Name: Sex, dtype: int64

ChestPainType: 
ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

RestingECG: 
Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

ExerciseAngina: 
N    547
Y    371
Name: ExerciseAngina, dtype: int64

ST_Slope: 
Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64


## Labeling categorical columns

In [7]:
# Creating a copy of the dataset
df = data.copy()

In [8]:
# Replacing binary values
df['Sex'] = df['Sex'].replace(['M', 'F'], [0, 1])
df['ExerciseAngina'] = df['ExerciseAngina'].replace(['N', 'Y'], [0, 1])

In [9]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [10]:
# Creating the encoder used to represent the categorical data numerically
ohe = OneHotEncoder()

In [11]:
ohe.fit(df[['ChestPainType', 'RestingECG', 'ST_Slope']])
feature_names = ohe.get_feature_names_out(['ChestPainType', 'RestingECG', 'ST_Slope'])
feature_names

array(['ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_Normal',
       'RestingECG_ST', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype=object)

In [12]:
X = ohe.transform(df[['ChestPainType', 'RestingECG', 'ST_Slope']]).toarray()
ohe_df = pd.DataFrame(X, columns=feature_names)
ohe_df.head()

Unnamed: 0,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [13]:
# Deleting the now-encoded columns
df.drop(['ChestPainType', 'RestingECG', 'ST_Slope'], axis=1, inplace=True)

In [14]:
# Adding the new one hot encoded columns to the dataframe
df = pd.concat([df, ohe_df], axis=1)
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,1,160,180,0,156,0,1.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,0,130,283,0,98,0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,1,138,214,0,108,1,1.5,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,54,0,150,195,0,122,0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Spliting data from labels

In [15]:
X = df.drop('HeartDisease', axis=1)
y = df[['HeartDisease']]

X.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,1,160,180,0,156,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,0,130,283,0,98,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,1,138,214,0,108,1,1.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,54,0,150,195,0,122,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
y.head()

Unnamed: 0,HeartDisease
0,0
1,1
2,0
3,1
4,0


## Preparing data

In [17]:
X_tensor = torch.tensor(X.values.astype('float32'))
y_tensor = torch.tensor(y.values.astype('float32'))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2)
len(X_train), len(y_train), len(X_test), len(y_test)

(734, 734, 184, 184)

## Creating the model

We'll be using Linear and ReLU layers to predict heart disease on the dataset

In [19]:
class Model(nn.Module):
    def __init__(self, inp, h1, h2, h3, out):
        super(Model, self).__init__()

        # input parameters to determine the number of in and out features for the linear layers
        self.linear_1 = nn.Linear(inp, h1)
        self.relu_1 = nn.ReLU()
        self.linear_2 = nn.Linear(h1, h2)
        self.relu_2 = nn.ReLU()
        self.linear_3 = nn.Linear(h2, h3)
        self.relu_3 = nn.ReLU()
        self.linear_4 = nn.Linear(h3, out)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.relu_1(x)
        x = self.linear_2(x)
        x = self.relu_2(x)
        x = self.linear_3(x)
        x = self.relu_3(x)
        x = self.linear_4(x)
        return x

In [20]:
# Instantiate the model
model = Model(inp=X_tensor.shape[1],
              h1=40,
              h2=80,
              h3=60,
              out=1)
model

Model(
  (linear_1): Linear(in_features=18, out_features=40, bias=True)
  (relu_1): ReLU()
  (linear_2): Linear(in_features=40, out_features=80, bias=True)
  (relu_2): ReLU()
  (linear_3): Linear(in_features=80, out_features=60, bias=True)
  (relu_3): ReLU()
  (linear_4): Linear(in_features=60, out_features=1, bias=True)
)

## Loss function, optimizer, evaluation metrics

In [21]:
loss_fn = nn.BCEWithLogitsLoss() # good loss function for binary classification problems like this one
optimizer = Adam(params=model.parameters(), lr=0.001) # learning rate can be adjusted, 0.001 works well

In [22]:
# simple accuracy function
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc

## Training and testing model

In [23]:
EPOCHS = 1000 # we'll default to 1000 epochs

# Build the training and testing loop
for epoch in tqdm(range(EPOCHS)):
    ### TRAINING
    model.train()

    # Forward pass
    y_logits = model(X_train)
    y_pred = torch.round(torch.sigmoid(y_logits)) # turns raw logits into a label using sigmoid activation function (returns a number from 0-1)

    # Calculate the loss and accuracy
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_train, y_pred)

    # Optimizer zero grad
    optimizer.zero_grad()

    # Loss backward
    loss.backward()

    # Optimizer step
    optimizer.step()

    ### TESTING
    model.eval()
    with torch.inference_mode():
        # Forward pass
        test_logits = model(X_test)
        test_pred = torch.round(torch.sigmoid(test_logits))

        # Calculate the loss and accuracy
        test_loss = loss_fn(test_logits, y_test)
        test_acc = accuracy_fn(y_test, test_pred)

    if epoch % 100 == 0:
        print(f'Epoch: {epoch} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%')

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0 | Loss: 1.38622, Acc: 44.41% | Test loss: 0.81833, Test acc: 45.65%
Epoch: 100 | Loss: 0.50414, Acc: 77.25% | Test loss: 0.54156, Test acc: 73.37%
Epoch: 200 | Loss: 0.32459, Acc: 86.51% | Test loss: 0.41303, Test acc: 82.07%
Epoch: 300 | Loss: 0.26888, Acc: 88.96% | Test loss: 0.40621, Test acc: 83.15%
Epoch: 400 | Loss: 0.24843, Acc: 89.92% | Test loss: 0.42875, Test acc: 82.61%
Epoch: 500 | Loss: 0.23352, Acc: 91.42% | Test loss: 0.44400, Test acc: 82.07%
Epoch: 600 | Loss: 0.22430, Acc: 91.83% | Test loss: 0.48671, Test acc: 80.43%
Epoch: 700 | Loss: 0.20342, Acc: 92.10% | Test loss: 0.50383, Test acc: 82.07%
Epoch: 800 | Loss: 0.19235, Acc: 92.64% | Test loss: 0.54417, Test acc: 83.15%
Epoch: 900 | Loss: 0.18946, Acc: 92.51% | Test loss: 0.57348, Test acc: 82.61%


We see that we're able to achieve an accuracy of over 90% on our training dataset, with a slightly lower accuracy on our testing dataset, suggesting that some degree of overfitting is occurring. Different choices of hyperparameters like learning rate, epochs, or model layer dimensions/architecture could potentially allow us to achieve even higher accuracy.