In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing the initial libraries
import numpy as np
import pandas as pd

In [None]:
#Load the heart-disease CSV from the Kaggle input directory into a DataFrame
HEART_CSV_PATH = '/kaggle/input/heart-disease-uci/heart.csv'
dataset = pd.read_csv(HEART_CSV_PATH)

In [None]:
#Preview the first five rows to see which columns exist and what their values look like
dataset.head(n=5)

In [None]:
#We notice that columns 0, 3, 4, 7, and 9 are numerical
#and columns 1, 2, 5, 6, 8, 10, 11, and 12 are categorical.
#Column 13 is the target we are trying to predict.

In [None]:
#Split the frame by column position into numerical features, categorical features, and the target
numerical_positions = [0, 3, 4, 7, 9]
categorical_positions = [1, 2, 5, 6, 8, 10, 11, 12]

X_numerical = dataset.iloc[:, numerical_positions].values
#The categorical slice is deliberately left as a DataFrame; it is one-hot encoded in a later cell
X_categorical = dataset.iloc[:, categorical_positions]
#The last column is the label to predict
y = dataset.iloc[:, -1].values

In [None]:
#Encoding X_categorical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#One-hot encode every categorical column in a single fit instead of refitting a
#ColumnTransformer once per column and tracking the shifting column index by hand.
#drop='first' removes the first level of each column so the dummy columns of a
#feature are not perfectly collinear.
#The columns are passed in reverse order so that the encoded groups come out in the
#same order the previous column-by-column loop produced (last column's dummies first).
categorical_values = np.asarray(X_categorical)
encoder = OneHotEncoder(drop='first')
#OneHotEncoder returns a sparse matrix by default; .toarray() converts it to a dense numpy array
X_categorical = encoder.fit_transform(categorical_values[:, ::-1]).toarray()

In [None]:
#Place the numerical columns and the encoded categorical columns side by side in one feature matrix
feature_parts = (X_numerical, X_categorical)
X = np.concatenate(feature_parts, axis=1)

In [None]:
#We see that y is already binary encoded: it only contains 0s and 1s, so no further encoding is needed

In [None]:
from sklearn.model_selection import train_test_split

#Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Feature scaling (standardisation): the scaler is fitted on the training rows only,
#then the same fitted transformation is applied to both the training and the test split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
#Importing the ANN libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
#After roughly 50 rounds of hyperparameter tuning, I found the following configuration to be the most effective, stable, and reliable

In [None]:
# Network layout: two hidden layers of 12 ReLU units, each followed by 10% dropout,
# feeding a single sigmoid output unit for the binary label
classifier = Sequential([
    # First hidden layer
    Dense(units=12, kernel_initializer='uniform', activation='relu'),
    Dropout(0.1),
    # Second hidden layer
    Dense(units=12, kernel_initializer='uniform', activation='relu'),
    Dropout(0.1),
    # Output layer
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])
# Compiling the ANN: Adam optimizer, binary cross-entropy loss, accuracy tracked as a metric
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fitting the ANN to the Training set
# Cast both label vectors to float64 so they have the same type as the other sets
y_train, y_test = (labels.astype('float64') for labels in (y_train, y_test))
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# val_* metrics recorded in the history are computed on the test set
classifierHistory = classifier.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=70,
    validation_data=(X_test, y_test),
)


In [None]:
#Across several runs, this model overfits or underfits by at most 3%, and the test accuracy ranges from 83% to 90%,
#so take the midpoint, 86.5%, and round it down to 86% accuracy.
#Here we see a test accuracy of 90%, but running the model again will return a different result somewhere in the 83%-90% range.

In [None]:
#Second model run to show accuracy range: same layers and training settings, built and trained from scratch

classifier = Sequential()
# Two identical hidden stages: 12 ReLU units followed by 10% dropout
for _hidden_stage in range(2):
    classifier.add(Dense(units=12, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dropout(0.1))
# Single sigmoid unit for the binary output
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Overwrites the history of the first run; the plots further down use this second run
classifierHistory = classifier.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=70,
    validation_data=(X_test, y_test),
)

In [None]:
#We have a test accuracy of 86.89% with an overfit of just over 2%.
#This test accuracy illustrates that the model's average accuracy is about 86%.

In [None]:
#Evaluating our classification model on the test split
#It is important to keep in mind that accuracy is computed by thresholding the sigmoid output at 50%,
#and for a real-life scenario such as heart disease prediction we would want to decrease or increase that threshold
#depending on what the outcome means for the next steps to be taken
scores = classifier.evaluate(X_test, y_test)
#scores[0] is the loss; scores[1] is the accuracy metric requested at compile time
test_accuracy_percent = scores[1] * 100
print('Accuracy: %.2f%%' % test_accuracy_percent)

In [None]:
#Visualisation
import matplotlib.pyplot as plt

In [None]:
# Plot the per-epoch accuracy curves: training first, then validation (the test split)
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(classifierHistory.history[accuracy_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
# Legend entries follow the order in which the curves were drawn
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
#We see the train and test accuracy converge and stay within 3% of each other as the number of epochs increases.
#epochs = 70 was found to be the best choice for preventing large overfitting or underfitting in this model.
#With more epochs, the training accuracy would continue to increase
#while the test accuracy would remain fairly constant, and this would cause either a large overfit or underfit.
#For this run we see that it prevented a large overfit.

In [None]:
# Plot the per-epoch loss curves: training first, then validation (the test split)
training_history = classifierHistory.history
plt.plot(training_history['loss'])
plt.plot(training_history['val_loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
# Legend entries follow the order in which the curves were drawn
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
#The model loss visualisation further supports the findings above

In [None]:
#Conclusion
#While an accuracy of 86% was achieved, the dataset only has 303 rows and hence is quite small.
#Deep learning is not well suited to so little data.
#I suggest using a classical machine learning algorithm for this heart disease prediction instead.