# Introduction
Heart disease remains one of the leading causes of morbidity and mortality worldwide. Early detection and accurate diagnosis are crucial in managing and treating this condition. Leveraging structured data and advanced machine learning techniques, particularly neural networks, can significantly enhance prediction accuracy. In this project, we develop a neural network model using the Cleveland Clinic Foundation's heart disease dataset to predict whether a patient has heart disease based on various clinical features.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
import pandas as pd

In [None]:
# Mount Google Drive under /content/drive/ so the dataset file stored there can be read
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Location of the heart-disease CSV file on the mounted Drive
path = '/content/drive/MyDrive/Research_Internship/heart.csv'

In [None]:
# Load the dataset from the location stored in `path` (defined in the previous
# cell) rather than repeating the literal, so the file location is maintained
# in exactly one place.
df = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame for a quick visual check
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,52,1,1,118,186,0,2,190,0,0.0,2,0,fixed,0
299,43,0,4,132,341,1,2,136,1,3.0,2,0,reversible,1
300,65,1,4,135,254,0,2,127,0,2.8,2,1,reversible,1
301,48,1,4,130,256,1,2,150,1,0.0,1,2,reversible,1


In [None]:
# Identify categorical and numerical columns.
# The categorical columns are one-hot encoded by the preprocessing pipeline;
# every other column except the label is treated as numerical.
categorical_features = ['cp', 'restecg', 'slope', 'thal']

# pandas names header-less CSV columns "Unnamed: <n>" (typically a row index
# written out by `to_csv`). The frame displayed above shows such a header, and
# a row counter must never be fed to the model as a feature, so those columns
# are skipped here.
# NOTE(review): confirm whether "Unnamed: 0" is a real CSV column or only an
# artefact of how the table was rendered; if it is absent this filter is a no-op.
non_feature_columns = set(categorical_features) | {'target'}
numerical_features = [
    col for col in df.columns
    if col not in non_feature_columns and not str(col).startswith('Unnamed:')
]

# Preprocessing:
1.	Handling Missing Values: Impute or remove rows with missing values. (Note: the code cells in this notebook do not perform this step, so the pipeline implicitly assumes the CSV contains no missing values.)
2.	Encoding Categorical Variables: Use one-hot encoding for categorical features such as chest pain type (`cp`) and `thal`.
3.	Feature Scaling: Standardize numerical features to have zero mean and unit variance.
4.	Splitting Data: Divide the dataset into training (80%) and testing (20%) sets.

# Purpose of Preprocessing:

Preprocessing is essential for preparing the data before training machine learning models.
It ensures that the data is in a suitable format and scale for the algorithms to perform optimally.

# Benefits of ColumnTransformer:

Unified Preprocessing Step: Combines multiple preprocessing steps into one cohesive operation, simplifying the preprocessing pipeline.
Consistency: Ensures that the same preprocessing steps are applied to the same features consistently across the dataset.
Flexibility: Easily adaptable to include additional preprocessing steps or handle new types of features as needed.

In [None]:
# Preprocessing pipeline:
#   * numerical columns are standardised,
#   * categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from raising when the test split
# contains a category that never occurred in the training split (possible with
# a random split when some levels are rare); such rows get an all-zero one-hot
# group instead.
# sparse_threshold=0 forces a dense ndarray result, which the later
# torch.tensor(...) conversion requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

In [None]:
# Separate the label from the features, then hold out 20% of the rows for testing
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Apply preprocessing
# The transformer is fitted on the training split only; the test split is then
# transformed with those already-fitted parameters.
# Note: both names are rebound from the split DataFrames to the transformed
# output, so re-run the split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Convert features and labels to float32 torch tensors.
# Labels get a trailing dimension of size 1 (one column per sample).
X_train, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(labels.values, dtype=torch.float32).reshape(-1, 1)
    for labels in (y_train, y_test)
)

The task at hand is to predict heart disease, which is a binary classification problem. The architecture of this neural network is designed to handle such tasks effectively.

# 64 Neurons in the First Hidden Layer:
This choice gives the first layer enough capacity to capture complex patterns and interactions in the input data; more neurons let the model learn a wider variety of features.
# 32 Neurons in the Second Hidden Layer:
This layer refines the features learned by the first layer, focusing on more specific patterns. Reducing the number of neurons in subsequent layers is a common practice to progressively distill the learned representations.

In [None]:
# Neural network model
class HeartDiseaseNN(nn.Module):
    """Feed-forward binary classifier: input -> 64 -> 32 -> 1 with a sigmoid output.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_train`` is used, which keeps the original
            zero-argument ``HeartDiseaseNN()`` call working.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback: infer the width from the
            # preprocessed training matrix defined at notebook level.
            input_dim = X_train.shape[1]
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run two ReLU hidden layers, then squash the single output with a sigmoid.

        Returns one value in the range (0, 1) per input row.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Instantiate the network; with no arguments its input width is taken from X_train
model = HeartDiseaseNN()

In [None]:
# Loss and optimizer: Adam updates the model's parameters; binary cross-entropy
# is computed on the model's sigmoid outputs.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [None]:
# Training loop (full batch: each epoch is one optimizer step on the whole training set)
num_epochs = 100
model.train()  # the mode is never switched inside the loop, so set it once up front
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Report progress on every 10th epoch only
    if (epoch+1) % 10:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.6354
Epoch [20/100], Loss: 0.5766
Epoch [30/100], Loss: 0.4976
Epoch [40/100], Loss: 0.4092
Epoch [50/100], Loss: 0.3356
Epoch [60/100], Loss: 0.2905
Epoch [70/100], Loss: 0.2653
Epoch [80/100], Loss: 0.2470
Epoch [90/100], Loss: 0.2312
Epoch [100/100], Loss: 0.2171


In [None]:
# Evaluation: score both splits in eval mode with gradient tracking disabled
model.eval()
with torch.no_grad():
    train_outputs, test_outputs = model(X_train), model(X_test)
    train_loss = criterion(train_outputs, y_train)
    test_loss = criterion(test_outputs, y_test)

print(f'Training Loss: {train_loss.item():.4f}, Testing Loss: {test_loss.item():.4f}')

Training Loss: 0.2158, Testing Loss: 0.3532


In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions
train_pred = train_outputs.gt(0.5).float()
test_pred = test_outputs.gt(0.5).float()

# Accuracy = number of predictions matching the labels / number of samples
train_accuracy = torch.eq(train_pred, y_train).sum() / len(y_train)
test_accuracy = torch.eq(test_pred, y_test).sum() / len(y_test)
print(f'Training Accuracy: {train_accuracy:.4f}, Testing Accuracy: {test_accuracy:.4f}')

Training Accuracy: 0.9215, Testing Accuracy: 0.8525
