In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Heart Attack Analysis:**
#     Task: Predict if a person is prone to a heart attack or not
     
     Strategy: Using deep learning
     
     Steps: 1. Data Visualisation and analysis
            2. Dealing with missing data (if any)
            3. Dealing with categorical data (if any)
            4. Data Preprocessing
            5. Create the model
            6. Train the model
            7. Evaluate Performance

In [None]:
# Load the heart-attack dataset and preview the first rows to confirm it is the expected data.
csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(csv_path)
df.head()

# **Step 1: Data Visualisation and analysis**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Class-balance check: count the rows for each value of the target column 'output'.
# The author's reading of this plot is that the classes are reasonably balanced, which is good.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='output', ax=ax)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
# Purpose: spot features so similar to each other that some could be dropped.
# The author's reading: some features are highly correlated with each other,
# but none are similar enough to justify dropping them.
fig, ax = plt.subplots(figsize=(12, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Correlation of every column with the target 'output', sorted in ascending order.
target_correlation = df.corr()['output']
target_correlation.sort_values()

In [None]:
# Row counts per 'sex', split by the target value.
# The author's reading: the risk of a heart attack differs hugely between the sexes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='sex', hue='output', ax=ax)

In [None]:
# Row counts per 'exng' (exercise-induced angina, per the original note), split by the target value.
# The author's reading: the output distribution differs drastically between the two exng groups.
# NOTE(review): this is an association seen in the plot, not evidence of a causal effect.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='exng', hue='output', ax=ax)

While data visualisation is important, in this case we already have a good idea of which features are important and how they might affect the output.
(Keep in mind that, because we are using a deep-learning approach, we do not need to **completely** understand the feature correlations.)

# **Step 2: Dealing with missing data**

In [None]:
# Count the missing values in each column to see whether any cleaning is needed.
df.isna().sum()

Here we got lucky: there is no missing data, so we don't need to do anything.

# **Step 3: Dealing with categorical data**

In [None]:
# Number of distinct values in each column.
# Columns with relatively few distinct values are candidates for categorical treatment;
# for each of them we must decide whether the values have a meaningful relation to one another.
df.nunique(axis=0)

In [None]:
# Columns to inspect: cp, restecg, slp, caa and thall.
# Columns with only 2 unique values are already effectively categorical, so they are left alone.
# Start with 'cp': row counts per value, split by the target.
sns.countplot(data=df, x='cp', hue='output')

In [None]:
# cp types 1, 2 and 3 behave similarly in the plot, while type 0 looks completely different,
# so the column is collapsed: values below 1 are kept and everything else becomes 1.
# According to the dataset documentation (as cited by the author), 0 means typical and the
# other types are atypical.
df['cp'] = df['cp'].apply(lambda cp_type: cp_type if cp_type < 1 else 1)

In [None]:
# How often each 'restecg' value occurs.
# Type 2 is rare, so it can be reassigned to whichever value behaves the same way.
restecg_counts = df['restecg'].value_counts()
restecg_counts

In [None]:
# Row counts per 'restecg' value, split by the target.
# The author's reading: type 2 behaves more like type 0, so 2 can be reassigned to 0.
sns.countplot(data=df, x='restecg', hue='output')

In [None]:
# Collapse 'restecg' to a binary flag: 1 where the value equals 1, 0 everywhere else
# (this merges type 2 into type 0). Then list the remaining distinct values as a check.
is_type_one = df['restecg'] == 1
df['restecg'] = is_type_one.astype('int64')
df['restecg'].unique()

In [None]:
# 'slp' (the slope) is treated as a continuous value, so it is not re-encoded;
# just show how often each value occurs.
slope_counts = df['slp'].value_counts()
slope_counts

In [None]:
# 'caa' (the number of major vessels) is treated as a continuous value, not a categorical one;
# just show how often each value occurs.
vessel_counts = df['caa'].value_counts()
vessel_counts

In [None]:
# Row counts per 'thall' value, split by the target.
# The author's reading: the values behave differently from one another, so 'thall' is treated
# as categorical and encoded with dummy variables. The documentation says nothing about the
# thall rate, so it is unknown whether any relation between the values should be preserved.
sns.countplot(data=df, x='thall', hue='output')

In [None]:
# One-hot encode 'thall' (dropping the first level) and replace the original column.
# prefix='thall' gives the dummy columns string names (e.g. 'thall_1'). Without it they are
# named after the raw values, so the frame mixes integer and string column names, which
# scikit-learn >= 1.2 rejects with a TypeError when the frame reaches the scaler.
# The membership guard lets the cell be re-run: once 'thall' has been encoded there is
# nothing left to do, instead of failing with a KeyError.
if 'thall' in df.columns:
    thall_dummies = pd.get_dummies(df['thall'], prefix='thall', drop_first=True)
    df = pd.concat([df.drop(columns='thall'), thall_dummies], axis=1)

In [None]:
# Preview the first rows of the frame after the categorical transformations.
df.head(n=5)

There were a few columns that we had to verify.
When a feature has only a few unique values and there is no value-based relation between them, we can use one-hot encoding.
Luckily, in this dataset there was only one feature that really needed it.
(When only a few rows carry a particular value, we can simply reassign them, as we did with 'restecg'.)

# **Step 4: Data Preprocessing**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Separate the target column from the features, then hold out 20% of the rows as a test set.
# random_state is fixed so the split is the same on every run.
y = df['output']
X = df.drop(columns='output')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [None]:
# Normalise the features with scikit-learn's MinMaxScaler (see its documentation for details).
# The scaler is fitted on the training split only, and that fitted scaler is then applied to
# both splits, so no information from the test rows enters the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# **Step 5/6: Create the Model + Train**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Number of columns in the frame; used as a guide for the width of the first layer.
df.shape[1]

In [None]:
# Fully connected binary classifier, declared as a single layer list.
model = Sequential([
    # Rule of thumb used by the author: size the first layer after the number of columns.
    Dense(16, activation='relu'),
    # Dropout layer to reduce overfitting.
    Dropout(0.125),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(8, activation='relu'),
    # Single sigmoid unit for the binary target.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
# Stop training once the validation loss has not reached a new minimum for 25 epochs.
# NOTE(review): consider restore_best_weights=True so the model keeps the weights of its best
# epoch rather than those of the last epoch run -- confirm before changing behaviour.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=25,
    verbose=1,
)

In [None]:
# Train for up to 1000 epochs; the early-stopping callback ends training earlier.
# Early stopping is a model-selection step, so it must not see the test split: validating on
# (X_test, y_test) and then reporting metrics on the same rows makes the final score
# optimistically biased. Instead, hold out the last 20% of the training rows for validation
# and keep the test split untouched until the evaluation step.
# Both inputs are converted to NumPy arrays so validation_split can slice them regardless of
# whether they are still pandas objects.
model.fit(
    x=np.asarray(X_train),
    y=np.asarray(y_train),
    validation_split=0.2,
    epochs=1000,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Plot the values recorded during training to see how the model behaved.
history_frame = pd.DataFrame(model.history.history)
history_frame.plot()

# **Step 7: Evaluation**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6, so calling it raises AttributeError.
# For a single sigmoid output the equivalent is to threshold the predicted probability at 0.5,
# which yields the same int32 array of 0/1 labels (one column per sample row).
predictions = (model.predict(X_test) > 0.5).astype('int32')

In [None]:
# The f1-score is the metric we care about most.
# Because the classes were fairly even, accuracy is also a reasonable way to judge the model.
report = classification_report(y_test, predictions)
print(report)

In [None]:
# In medicine especially, the goal is to have as few false negatives as possible.
matrix = confusion_matrix(y_test, predictions)
print(matrix)

**Now for the evaluation:**

- Accuracy: 90%
- F1-score: 0.90

**It's not really great, but it's definitely not bad either.**