In [16]:
# Import the necessary libraries
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
# Load the dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
# The bare `df` on the last line makes Jupyter render the frame; pandas shows
# only the head and tail rows of a frame this long.
df = pd.read_csv('heart_modified.csv')
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,47,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,52,2,168,76.0,120,80,1,1,1,0,1,0
69996,61,1,158,126.0,140,90,2,2,0,0,1,1
69997,52,2,183,105.0,180,90,3,1,0,1,0,1
69998,61,1,163,72.0,135,80,1,2,0,0,0,1


# PREPROCESSING

In [18]:
# Skewness of every column in the raw data, used to spot heavily skewed features
df.skew()

age            -0.305744
gender          0.630960
height         -0.642187
weight          1.012070
ap_hi          85.296214
ap_lo          32.114083
cholesterol     1.587123
gluc            2.397462
smoke           2.905867
alco            3.956607
active         -1.529473
cardio          0.001200
dtype: float64




*Skewness and kurtosis values between -2 and +2 are considered acceptable evidence of a normal univariate distribution (George & Mallery, 2010). George, D., & Mallery, M. (2010). SPSS for Windows Step by Step: A Simple Guide and Reference, 17.0 update (10th ed.). Boston: Pearson.*

So, as a first step, we want to bring the data closer to a normal distribution.

We want to bring the skewness of **ap_hi** and **ap_lo** into the range -2 to +2 by removing their outliers. We will not remove outliers for the variables **smoke** and **alco** because these variables are binary (1 and 0). Similarly, we will not remove outliers for the variable **gluc** because it only takes the values 1, 2, and 3.




In [19]:
def count_iqr_outliers(frame, whisker=1.5):
    """Count Tukey-fence outliers in every numeric column of ``frame``.

    A value counts as an outlier when it lies below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR`` of its own column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to inspect; non-numeric columns are ignored.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that places the fences.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns ``Column``, ``Outlier``
        (the count) and ``persentase`` (the count divided by the number of
        rows, i.e. a fraction rather than a percent), sorted by
        ``persentase`` in descending order with a fresh 0..n-1 index.
    """
    # Restrict to numeric columns up front so the quantiles and the comparison
    # below always cover exactly the same set of columns.
    numeric = frame.select_dtypes(np.number)
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # The fence Series are indexed by column label, so each comparison aligns
    # column-by-column and broadcasts down the rows.
    outside = (numeric < q1 - whisker * iqr) | (numeric > q3 + whisker * iqr)
    counts = outside.sum()

    summary = pd.DataFrame({
        'Column': counts.index,
        'Outlier': counts.to_numpy(),
        'persentase': counts.to_numpy() / len(frame),
    })
    return summary.sort_values(by=['persentase'], ascending=False, ignore_index=True)


# Outlier census of the raw data. Everything is kept inside the helper so no
# quartile variables are left behind in the notebook namespace.
count_iqr_outliers(df)

Unnamed: 0,Column,Outlier,persentase
0,active,13739,0.196271
1,gluc,10521,0.1503
2,smoke,6169,0.088129
3,ap_lo,4632,0.066171
4,alco,3764,0.053771
5,weight,1819,0.025986
6,ap_hi,1435,0.0205
7,height,519,0.007414
8,age,4,5.7e-05
9,gender,0,0.0


In [20]:
# Drop rows whose blood-pressure readings fall outside the 1.5 * IQR fences.
columns_to_process = ['ap_hi', 'ap_lo']

df_processed = df.copy()

for column in columns_to_process:
    # Quartiles are taken from the unfiltered `df`, so the fences for one
    # column do not shift after rows were removed for the other column.
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # `between` is inclusive on both ends, i.e. lower <= value <= upper.
    within_fences = df_processed[column].between(lower_limit, upper_limit)
    df_processed = df_processed.loc[within_fences]

df_processed


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
5,60,1,151,67.0,120,80,2,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69994,57,1,165,80.0,150,80,1,1,0,0,1,1
69995,52,2,168,76.0,120,80,1,1,1,0,1,0
69996,61,1,158,126.0,140,90,2,2,0,0,1,1
69998,61,1,163,72.0,135,80,1,2,0,0,0,1


In [21]:
# Repeat the IQR outlier census on the frame left after the ap_hi / ap_lo filter.
Q1, Q3 = df_processed.quantile(0.25), df_processed.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# One [column, count, fraction-of-rows] record per numeric column.
Jumlah_Outlier = []
for col in df_processed.select_dtypes(np.number).columns:
    values = df_processed[col]
    outliers = ((values < lower_fence[col]) | (values > upper_fence[col])).sum()
    Jumlah_Outlier.append([col, outliers, outliers / len(df_processed)])

indexOutput = list(range(len(Jumlah_Outlier)))

# Largest share of outliers first; the trailing expression is the cell output.
(
    pd.DataFrame(
        Jumlah_Outlier,
        columns=['Column', 'Outlier', 'persentase'],
        index=indexOutput,
    )
    .sort_values(by=['persentase'], ascending=False, ignore_index=True)
)

Unnamed: 0,Column,Outlier,persentase
0,cholesterol,16039,0.248659
1,active,12676,0.196521
2,gluc,9614,0.14905
3,smoke,5652,0.087625
4,alco,3423,0.053068
5,weight,1580,0.024495
6,height,469,0.007271
7,age,3,4.7e-05
8,gender,0,0.0
9,ap_hi,0,0.0


In [22]:
# Re-check per-column skewness after the ap_hi / ap_lo outlier filter
df_processed.skew()

age           -0.314829
gender         0.620265
height        -0.607016
weight         1.003570
ap_hi          0.718302
ap_lo          0.445713
cholesterol    1.600426
gluc           2.405728
smoke          2.916964
alco           3.987541
active        -1.527485
cardio         0.006450
dtype: float64

In [23]:
# Separate the label from the predictors: `cardio` is the target,
# every remaining column is a feature.
target = df_processed['cardio']
features = df_processed.drop(columns=['cardio'])

In [28]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows of `features`, i.e. before any
# train/test split, so its statistics include rows that later land in the test set.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)


# BUILD MODEL

In [39]:
# Split the dataset into training and testing sets.
# The raw, unscaled `features` are split first so the scaler can be fitted on
# the training rows alone. Fitting it on the full dataset (as the earlier
# scaling cell does) leaks test-set statistics into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Re-fit `scaler` on the training split only, then apply that same
# transformation to the test split. `scaler` is rebound on purpose, so the
# object persisted later matches the preprocessing the model was trained with.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Define the model: two small ReLU hidden layers feeding a single sigmoid
# unit that outputs the probability of the positive `cardio` class.
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(12, activation='relu', input_shape=(x_train.shape[1],)),
  tf.keras.layers.Dense(8, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 12)                144       
                                                                 
 dense_16 (Dense)            (None, 8)                 104       
                                                                 
 dense_17 (Dense)            (None, 1)                 9         
                                                                 
Total params: 257
Trainable params: 257
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Compile the model: binary cross-entropy loss, the Adam optimizer,
# and accuracy reported as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [41]:
# Train the model for 10 epochs in mini-batches of 32.
# NOTE(review): (x_test, y_test) is passed as validation data, so in the cells
# visible here no untouched hold-out set remains for a final evaluation.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=32,
    verbose=1,
)

# Save the model to an HDF5 file in the working directory
model.save('heart_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
# Save the scaler
# Persist the fitted `scaler` object next to the model so the same scaling can
# be re-applied at inference time. `joblib` comes from the notebook's import
# cell rather than being imported mid-notebook.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']