In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from sqlalchemy import create_engine
from config import username, password
import sqlite3

## Data Preprocessing

In [2]:
# Open a connection to the project's PostgreSQL database hosted on AWS RDS.
# `username` and `password` come from the local `config` module imported above.
# NOTE(review): the credentials are interpolated into the URL verbatim; a password
# containing characters such as '@' or '/' would need URL-escaping first — confirm.
engine = create_engine(
    f'postgresql://{username}:{password}@heart-disease.cdexedevamie.us-east-1.rds.amazonaws.com:5432/heart_disease_db'
)
connection = engine.connect()

In [3]:
# Read every row of the `hearts` table over the open connection; the result is kept in `data`
data = pd.read_sql(sql='select * from hearts', con=connection)

In [None]:
# Also saving the query result to a local SQLite database file.
# The path uses a forward slash: the previous 'Resources\heart_disease.sqlite'
# relied on the invalid escape sequence '\h' and only names a sub-directory on
# Windows. A forward slash works on every platform and matches the model path
# used later in this notebook.
conn = sqlite3.connect('Resources/heart_disease.sqlite')

# Write the frame to a table named 'hearts', replacing the table if it already exists
data.to_sql('hearts', conn, if_exists='replace')

In [None]:
# Close the SQLite connection opened for the local export
conn.close()

In [4]:
# `pd.read_sql` already returned a DataFrame, so no conversion is required here.
# Work on an explicit copy instead of re-wrapping `data`: the DataFrame constructor
# does not copy DataFrame input by default, and the cells below modify `df` in place
# (new range columns, dropped raw columns). Copying keeps the raw query result in
# `data` untouched.
df = data.copy()
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# Quick exploration of data
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [6]:
# Age

# Bin edges for pd.cut: 0, 40, 50, 60, 70, 80 (six edges -> five intervals).
# Intervals are right-closed by default, so an age of exactly 70 is labelled "60-70".
bins = [0, 40, 50, 60, 70, 80]

# One label per interval, ordered from youngest to oldest
group_names = ["<40", "40-50", "50-60", "60-70", "70-80"]

# include_lowest=True additionally closes the first interval on its left edge (0)
df["Age Range"] = pd.cut(
    x=df["age"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Age Range
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,50-60
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,50-60
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,60-70
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,60-70
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,60-70


In [7]:
# Trestbps

# Bin edges for pd.cut: 0, 125, 150, 175, 200 (five edges -> four intervals).
# Intervals are right-closed by default, so a value of exactly 125 is labelled "<125".
bins = [0, 125, 150, 175, 200]

# One label per interval (four in total), ordered from lowest to highest
group_names = ["<125", "125-150", "150-175", "175-200"]

# include_lowest=True additionally closes the first interval on its left edge (0)
df["Trestbps Range"] = pd.cut(
    x=df["trestbps"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Age Range,Trestbps Range
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,50-60,<125
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,50-60,125-150
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,60-70,125-150
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,60-70,125-150
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,60-70,125-150


In [8]:
# Chol

# Bin edges for pd.cut: 0, 200, 300, 400, 500, 600 (six edges -> five intervals).
# Intervals are right-closed by default, i.e. each upper edge belongs to its own bin.
bins = [0, 200, 300, 400, 500, 600]

# One label per interval, ordered from lowest to highest
group_names = ["<200", "200-300", "300-400", "400-500", "500-600"]

# include_lowest=True additionally closes the first interval on its left edge (0)
df["Chol Range"] = pd.cut(
    x=df["chol"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Age Range,Trestbps Range,Chol Range
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,50-60,<125,200-300
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,50-60,125-150,200-300
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,60-70,125-150,<200
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,60-70,125-150,200-300
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,60-70,125-150,200-300


In [9]:
# Thalach

# Bin edges for pd.cut: 0, 100, 125, 150, 175, 300 (six edges -> five intervals).
# Intervals are right-closed by default, so a value of exactly 125 is labelled "100-125".
bins = [0, 100, 125, 150, 175, 300]

# One label per interval, ordered from lowest to highest
group_names = ["<100", "100-125", "125-150", "150-175", "175-300"]

# include_lowest=True additionally closes the first interval on its left edge (0)
df["Thalach Range"] = pd.cut(
    x=df["thalach"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Age Range,Trestbps Range,Chol Range,Thalach Range
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,50-60,<125,200-300,150-175
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,50-60,125-150,200-300,150-175
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,60-70,125-150,<200,100-125
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,60-70,125-150,200-300,150-175
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,60-70,125-150,200-300,100-125


In [10]:
# The raw age, trestbps, chol and thalach columns are now represented by their
# "... Range" counterparts, so they are removed from `df` (in place).
df.drop(columns=['age', 'trestbps', 'chol', 'thalach'], inplace=True)
df.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,oldpeak,slope,ca,thal,target,Age Range,Trestbps Range,Chol Range,Thalach Range
0,1,0,0,1,0,1.0,2,2,3,0,50-60,<125,200-300,150-175
1,1,0,1,0,1,3.1,0,0,3,0,50-60,125-150,200-300,150-175
2,1,0,0,1,1,2.6,0,0,3,0,60-70,125-150,<200,100-125
3,1,0,0,1,0,0.0,2,1,3,0,60-70,125-150,200-300,150-175
4,0,0,1,1,0,1.9,1,3,2,0,60-70,125-150,200-300,100-125


In [11]:
# Expanding the categorical "... Range" columns into one indicator column per label
# (e.g. "Chol Range_<200"); the remaining numeric columns are carried over as they are.
df_dummies = pd.get_dummies(data=df)
df_dummies.head()

Unnamed: 0,sex,cp,fbs,restecg,exang,oldpeak,slope,ca,thal,target,...,Chol Range_<200,Chol Range_200-300,Chol Range_300-400,Chol Range_400-500,Chol Range_500-600,Thalach Range_<100,Thalach Range_100-125,Thalach Range_125-150,Thalach Range_150-175,Thalach Range_175-300
0,1,0,0,1,0,1.0,2,2,3,0,...,0,1,0,0,0,0,0,0,1,0
1,1,0,1,0,1,3.1,0,0,3,0,...,0,1,0,0,0,0,0,0,1,0
2,1,0,0,1,1,2.6,0,0,3,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,1,0,0.0,2,1,3,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,1,1,0,1.9,1,3,2,0,...,0,1,0,0,0,0,1,0,0,0


In [12]:
# Separating the feature columns (X) from the label column (y)
X = df_dummies.drop(columns=['target'])
y = df_dummies['target'].values

# Hold-out split using train_test_split's default proportions; random_state=0 fixes the shuffle.
# NOTE(review): whether this reproduces the split used when the model loaded below was
# trained is not visible from this notebook — confirm, otherwise test rows may have been
# seen during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [13]:
# Standardising the features: the scaler is fitted on the training split only
scaler = StandardScaler().fit(X_train)

# The same fitted scaler is then applied to the training and the test features
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

## Machine Learning - Neural Network

In [14]:
# Restoring the optimized network that the 'optimizer' notebook saved to disk
new_model = tf.keras.models.load_model(filepath='Resources/Model/OptimizedModel.h5')

# Printing the layer-by-layer overview of the restored model
new_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 9)                 261       
                                                                 
 dense_8 (Dense)             (None, 17)                170       
                                                                 
 dense_9 (Dense)             (None, 9)                 162       
                                                                 
 dense_10 (Dense)            (None, 17)                170       
                                                                 
 dense_11 (Dense)            (None, 9)                 162       
                                                                 
 dense_12 (Dense)            (None, 13)                130       
                                                                 
 dense_13 (Dense)            (None, 1)                

In [15]:
# Evaluating best model with test data

# Keras evaluation on the scaled test features; the two returned values are
# unpacked as loss and accuracy
model_loss, model_accuracy = new_model.evaluate(X_test_scaled, y_test, verbose=0)

# R2 between the test labels (reshaped to a single column) and the model's predictions
y_true = y_test.reshape(-1, 1)
y_pred = new_model.predict(X_test_scaled)
metric = tfa.metrics.r_square.RSquare()
metric.update_state(y_true, y_pred)
result = metric.result()

# Reporting all three figures on one line
print(f'Loss: {model_loss}, Accuracy: {model_accuracy}, R2: {result.numpy()}')

Loss: 0.004432981368154287, Accuracy: 1.0, R2: 0.9958808422088623
