In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2, l1
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import random
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix, roc_curve

## About this dataset

### Stuff we know:

* Age : Age of the patient

* Sex : Sex of the patient

* exng: exercise induced angina (1 = yes; 0 = no)

* caa: number of major vessels (0-3)

* cp : chest pain type

    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic

* trtbps : resting blood pressure (in mm Hg)

* chol : cholesterol in mg/dl fetched via BMI sensor

* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

* restecg : resting electrocardiographic results

    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

* thalachh : maximum heart rate achieved

### Stuff we should predict
* output (the target column) : 
    * 0 = less chance of heart attack 
    * 1 = more chance of heart attack

In [None]:
# Importing Data
data = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

# Printing Data Info
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
data.info()

In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, ax=ax, annot=True, cmap='Blues')

## Feature Engineering

In [None]:
# 'caa', 'cp' and 'restecg' hold category codes rather than quantities,
# so they are cast to object dtype and expanded into one indicator
# column per distinct code.
categorical = data[['caa', 'cp', 'restecg']].astype('object')
dummies = pd.get_dummies(categorical)
dummies.head()

In [None]:
# Keep every column except the categorical 'caa', 'cp' and 'restecg'
features = [
    'age', 'sex', 'trtbps', 'thalachh', 'exng',
    'oldpeak', 'slp', 'thall', 'output',
]
data = data.loc[:, features]
data.head()

In [None]:
# Append the indicator columns to the right of the numeric columns
data = pd.concat(objs=[data, dummies], axis='columns')
data.head()

## Data Handling

In [None]:
# We can observe outliers on 'oldpeak' column. 
# Outliers must be handled to avoid disruption in prediction accuracy.
def outliers(dataCol, whisker=1.5):
    """Return the (lower, upper) interquartile-range fences of a column.

    Parameters
    ----------
    dataCol : array-like of numbers
        Values to analyse; passed straight to ``np.percentile``, which
        does not require sorted input.
    whisker : float, default 1.5
        Multiple of the IQR added beyond each quartile.

    Returns
    -------
    tuple
        ``(Q1 - whisker * IQR, Q3 + whisker * IQR)``.
    """
    # Interquartile Range
    Q1, Q3 = np.percentile(dataCol, [25, 75])
    IQR = Q3 - Q1

    # Values below LowerRange or above UpperRange are treated as outliers
    LowerRange = Q1 - (whisker * IQR)
    UpperRange = Q3 + (whisker * IQR)

    return LowerRange, UpperRange

In [None]:
# Box plot of 'oldpeak'; the isolated dots mark the outliers
sns.boxplot(x='oldpeak', data=data)

In [None]:
# We get the limits of the column
lwoldpeak, upoldpeak = outliers(data['oldpeak'])

# Limiting the column values between lwoldpeak and upoldpeak.
# The result is assigned back to the column: calling an in-place method on
# `data['oldpeak']` is chained assignment, which may act on a temporary and
# leave `data` unchanged (it is a no-op under pandas Copy-on-Write).
data['oldpeak'] = data['oldpeak'].clip(lower=lwoldpeak, upper=upoldpeak)

In [None]:
# Box plot of 'oldpeak' again, after the values were limited to the IQR fences
sns.boxplot(x='oldpeak', data=data)

In [None]:
# Splitting data into negative and positive heart attack cases.
# The dataset has 'output'= 1 for all the top rows and 'output'= 0 for bottom rows
# data_1 receives the output == 0 rows, data_2 the output == 1 rows.

data_1 = data.loc[data['output'].eq(0)]
data_2 = data.loc[data['output'].eq(1)]
data_1.shape, data_2.shape

## Training and Testing Dataset Split

In [None]:
# Training set: the first 125 rows of data_1 plus the first 152 rows of data_2.
# Test set: whatever remains of each, so both sets contain both classes.
train_data = pd.concat([data_1.head(125), data_2.head(152)], ignore_index=True)
test_data = pd.concat([data_1.iloc[125:], data_2.iloc[152:]], ignore_index=True)

train_data.shape, test_data.shape

In [None]:
# Shuffling the rows so the two classes, which were concatenated one after
# the other, are interleaved.
# A fixed random_state gives the same row order on every run, so results
# can be reproduced after a kernel restart.
train_data = shuffle(train_data, random_state=42)
test_data = shuffle(test_data, random_state=42)

In [None]:
# Class balance of the training set
sns.countplot(x='output', data=train_data)

In [None]:
# Class balance of the test set
sns.countplot(x='output', data=test_data)

In [None]:
# Label vectors ('output' column) for the training and test sets
train_labels = train_data['output'].to_numpy(copy=True)
test_labels = test_data['output'].to_numpy(copy=True)

In [None]:
# Building the training and test feature matrices.
# 'output' is NOT the last column: the indicator columns were concatenated
# after it, so `iloc[:, :-1]` kept the label among the inputs (label leakage)
# and dropped the last indicator column instead. Drop the label by name.
# Casting to float32 also avoids an object-dtype array when the indicator
# columns are boolean.
train_features = train_data.drop(columns='output').to_numpy(dtype='float32')
test_features = test_data.drop(columns='output').to_numpy(dtype='float32')

In [None]:
# Shapes of: training features, training labels, test features, test labels
tuple(arr.shape for arr in (train_features, train_labels, test_features, test_labels))

## Model Development

In [None]:
def initialize_weights(shape, dtype=None):
    """Kernel initializer: draw `shape` samples from N(mean=0.0, std=0.01).

    `dtype` is accepted so the function matches the initializer call
    signature, but it is not used; NumPy's default float type is returned.
    """
    mean, std = 0.0, 1e-2
    return np.random.normal(mean, std, shape)

def initialize_bias(shape, dtype=None):
    """Bias initializer: draw `shape` samples from N(mean=0.5, std=0.01).

    `dtype` is accepted so the function matches the initializer call
    signature, but it is not used; NumPy's default float type is returned.
    """
    mean, std = 0.5, 1e-2
    return np.random.normal(mean, std, shape)

def DeepLearningModel(input_shape):
    """Build the (uncompiled) fully connected binary classifier.

    Architecture: Input -> Dense(16, relu) -> Dense(32, relu)
    -> Dense(64, relu) -> Dense(1, sigmoid). Every Dense layer uses
    `initialize_weights` / `initialize_bias` and an L2 kernel penalty
    of 2e-4.

    Parameters
    ----------
    input_shape : int or tuple of int
        Shape of one sample, without the batch dimension. A bare integer
        is accepted and treated as a 1-D shape.

    Returns
    -------
    Sequential
        The assembled model; the caller is expected to compile it.
    """
    # `(20)` is just the int 20, not a tuple. Normalise a bare int to a
    # 1-tuple, since some Keras versions reject a non-iterable shape.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    def dense(units, activation):
        # One Dense layer with the configuration shared by the whole network;
        # a fresh regularizer object is created for each layer.
        return Dense(
            units,
            activation=activation,
            kernel_initializer=initialize_weights,
            bias_initializer=initialize_bias,
            kernel_regularizer=l2(2e-4),
        )

    model = Sequential()
    model.add(Input(input_shape))

    for units in (16, 32, 64):
        model.add(dense(units, 'relu'))

    # Single sigmoid unit: the output is a probability for class 1
    model.add(dense(1, 'sigmoid'))

    return model

In [None]:
# Hyperparameters

# These hyperparameters can be modified to get different output.
# I found these to work well for the data. Play around and let me know in comments

# The input width is read from the feature matrix instead of being hard-coded,
# and is passed as a real 1-tuple (`(20)` is just the int 20).
model = DeepLearningModel((train_features.shape[1],))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()
lr = 0.003
epochs = 40
batch_size = 100
optimizer = Adam(learning_rate=lr)

# You can try different Keras losses for different prediction targets.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_features, train_labels, epochs=epochs, batch_size=batch_size)

## Plotting Performance over Training Set

In [None]:
# Plotting Accuracy and Loss against Number of Epochs.
# Both curves share one axis, so each gets a label (seaborn then draws a
# legend) and the axes are titled; otherwise the lines cannot be told apart.
fig, ax = plt.subplots()
sns.lineplot(x=history.epoch, y=history.history['accuracy'], label='accuracy', ax=ax)
sns.lineplot(x=history.epoch, y=history.history['loss'], label='loss', ax=ax)
ax.set(xlabel='Epoch', ylabel='Value', title='Training accuracy and loss');

## Model Evaluation using Test Dataset

In [None]:
# Loss and accuracy of the trained model on the test set
model.evaluate(x=test_features, y=test_labels)

## Confusion Matrix for Test Dataset

In [None]:
# Plotting Confusion Matrix for better understanding of Model Performance on the test set
roc_act = test_labels
# The model ends in a single sigmoid unit, so predict() yields one probability
# per row in an (n, 1) array; round to hard 0/1 labels and flatten to a list.
roc_pred = np.round(model.predict(test_features)).astype(int).ravel().tolist()

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the predictions first
# transposes the matrix and swaps false positives with false negatives.
cm = confusion_matrix(roc_act, roc_pred, labels=[0, 1])

# Cell names in the same row-major layout as `cm`, combined with the counts
labels = ['True Neg','False Pos','False Neg','True Pos']
labels = np.asarray(labels).reshape(2,2)
annot = np.asarray(
    [f'{name}\n{count}' for name, count in zip(labels.ravel(), cm.ravel())]
).reshape(2, 2)

ax = sns.heatmap(cm, cmap='Blues', annot=annot, fmt='')
ax.set(xlabel='Predicted', ylabel='Actual');

## 96% Test Accuracy using DNN Model for Heart Attack Prediction.