In [1]:
from tensorflow.keras import optimizers, layers, losses
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
# --- Data location and split sizes ---
DB_PATH = r"dataset.csv"
TEST_SIZE = .2         # fraction of all rows held out as the test set
VALIDATION_SIZE = .15  # fraction of the remaining (non-test) rows used for validation

# --- Training hyper-parameters ---
BATCH_SIZE = 32
EPOCHS = 100
DTYPE = 'float32'      # dtype the feature matrix is cast to before training
LEARNING_RATE = 1e-3

# Fixed seed passed to both train_test_split calls so the train/validation/test
# partition is identical on every Restart & Run All (None gave a new split each run).
RANDOM_STATE = 42

# --- Column handling ---
DROP_COLS = ['id']  # 'dataset' is removed later, when the feature matrix is built
CAT_COLS = ['sex', 'cp', 'restecg', 'slope', 'thal']  # string columns turned into integer codes
SCALE_COLS = ['trestbps', 'chol', 'thalch']           # numeric columns min-max scaled
RENAME_COLS = {
    'num': 'target'
}

In [3]:
class MLP(Model):
    """Fully connected softmax classifier with four hidden layers.

    The hidden layers are 12, 24, 24 and 12 units wide. Each is a linear
    ``Dense`` layer followed by one shared ``LeakyReLU`` activation; the last
    ``Dense`` layer applies softmax over ``output_dim`` units.
    """

    def __init__(self, output_dim):
        """Create the layers.

        Args:
            output_dim: Number of units in the softmax output layer.
        """
        super().__init__()
        # A single LeakyReLU instance is reused after every hidden layer.
        self.activation = layers.LeakyReLU()
        self.dense1 = layers.Dense(units=12, activation='linear')
        self.dense2 = layers.Dense(units=24, activation='linear')
        self.dense3 = layers.Dense(units=24, activation='linear')
        self.dense4 = layers.Dense(units=12, activation='linear')
        self.classifier = layers.Dense(units=output_dim, activation='softmax')

    def call(self, x):
        """Run the forward pass and return the softmax output for ``x``."""
        hidden_stack = (self.dense1, self.dense2, self.dense3, self.dense4)
        for dense in hidden_stack:
            x = self.activation(dense(x))
        return self.classifier(x)

In [4]:
# Read the raw CSV and discard the columns listed in DROP_COLS.
df = pd.read_csv(DB_PATH).drop(columns=DROP_COLS)
df

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [5]:
# Apply the RENAME_COLS mapping to the column labels.
df = df.rename(columns=RENAME_COLS)
df

Unnamed: 0,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,target
0,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [6]:
# Replace every categorical column with its integer category codes and print
# the categories that were found. Missing values receive the code -1.
for col_name in CAT_COLS:
    encoded = pd.Categorical(df[col_name])
    print(col_name, ': ', encoded.view())
    df[col_name] = encoded.codes

sex :  ['Male', 'Male', 'Male', 'Male', 'Female', ..., 'Female', 'Male', 'Male', 'Male', 'Male']
Length: 920
Categories (2, object): ['Female', 'Male']
cp :  ['typical angina', 'asymptomatic', 'asymptomatic', 'non-anginal', 'atypical angina', ..., 'asymptomatic', 'typical angina', 'asymptomatic', 'asymptomatic', 'atypical angina']
Length: 920
Categories (4, object): ['asymptomatic', 'atypical angina', 'non-anginal', 'typical angina']
restecg :  ['lv hypertrophy', 'lv hypertrophy', 'lv hypertrophy', 'normal', 'lv hypertrophy', ..., 'st-t abnormality', 'st-t abnormality', 'st-t abnormality', 'lv hypertrophy', 'lv hypertrophy']
Length: 920
Categories (3, object): ['lv hypertrophy', 'normal', 'st-t abnormality']
slope :  ['downsloping', 'flat', 'flat', 'downsloping', 'upsloping', ..., NaN, NaN, NaN, NaN, NaN]
Length: 920
Categories (3, object): ['downsloping', 'flat', 'upsloping']
thal :  ['fixed defect', 'normal', 'reversable defect', 'normal', 'normal', ..., NaN, NaN, 'fixed defect', NaN

In [7]:
# Labels: the renamed 'target' column.
y = df['target']
# Features: every column up to and including 'thal', without the 'dataset' column.
X = df.loc[:, :'thal'].drop('dataset', axis=1)

In [8]:
# One-hot encode the labels. Encoding straight from df['target'] (rather than
# from y itself) keeps this cell idempotent: re-running it no longer one-hot
# encodes an array that is already one-hot.
y = to_categorical(df['target'])
y

array([[1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [9]:
# Summary statistics (count, mean, std, quartiles) of the numeric feature columns.
X.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalch,oldpeak,slope,ca,thal
count,920.0,920.0,920.0,861.0,890.0,920.0,865.0,858.0,920.0,309.0,920.0
mean,53.51087,0.78913,0.782609,132.132404,199.130337,0.98587,137.545665,0.878788,0.480435,0.676375,0.102174
std,9.424685,0.408148,0.95635,19.06607,110.78081,0.638633,25.926276,1.091226,1.167966,0.935653,1.251939
min,28.0,0.0,0.0,0.0,0.0,-1.0,60.0,-2.6,-1.0,0.0,-1.0
25%,47.0,1.0,0.0,120.0,175.0,1.0,120.0,0.0,-1.0,0.0,-1.0
50%,54.0,1.0,0.0,130.0,223.0,1.0,140.0,0.5,1.0,0.0,-1.0
75%,60.0,1.0,2.0,140.0,268.0,1.0,157.0,1.5,1.0,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,2.0,202.0,6.2,2.0,3.0,2.0


In [10]:
# Per-column dtype and non-null count, to spot columns that need imputing or casting.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    int8   
 2   cp        920 non-null    int8   
 3   trestbps  861 non-null    float64
 4   chol      890 non-null    float64
 5   fbs       830 non-null    object 
 6   restecg   920 non-null    int8   
 7   thalch    865 non-null    float64
 8   exang     865 non-null    object 
 9   oldpeak   858 non-null    float64
 10  slope     920 non-null    int8   
 11  ca        309 non-null    float64
 12  thal      920 non-null    int8   
dtypes: float64(5), int64(1), int8(5), object(2)
memory usage: 62.1+ KB


In [11]:
# Number of missing values in each feature column.
X.isna().sum()

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       0
thalch       55
exang        55
oldpeak      62
slope         0
ca          611
thal          0
dtype: int64

In [12]:
# Remove the 'ca' feature. Rebinding X with errors='ignore' (instead of an
# in-place drop) lets this cell be re-run without raising KeyError once the
# column is already gone.
X = X.drop(columns='ca', errors='ignore')

In [13]:
# Fill the gaps in every column that has missing values with that column's mean.
# The result is assigned back explicitly: calling fillna(inplace=True) on
# X[column] is a chained assignment that acts on a temporary object, which
# pandas warns about and which does not update X under Copy-on-Write.
# NOTE(review): the mean is computed over all rows, before the train/test split
# in a later cell, so test rows influence the imputed values.
for column in X:
    if X[column].hasnans:
        X[column] = X[column].fillna(X[column].mean(skipna=True))

In [14]:
# Min-max scale the columns listed in SCALE_COLS.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in a later cell, so test rows influence the scaling range.
scaler = MinMaxScaler()
X[SCALE_COLS] = scaler.fit_transform(X[SCALE_COLS])
# Express age in decades. Reading from df['age'] (which is never modified)
# instead of X['age'] keeps the cell idempotent: re-running it no longer
# divides the already-scaled ages by 10 a second time.
X['age'] = df['age'] / 10

In [15]:
# Cast every feature column to the dtype configured in DTYPE.
X = X.astype(dtype=DTYPE)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal
0,6.3,1.0,3.0,0.725000,0.386401,1.0,0.0,0.633803,0.000000,2.300000,0.0,0.0
1,6.7,1.0,0.0,0.800000,0.474295,0.0,0.0,0.338028,1.000000,1.500000,1.0,1.0
2,6.7,1.0,0.0,0.600000,0.379768,0.0,0.0,0.485915,1.000000,2.600000,1.0,2.0
3,3.7,1.0,2.0,0.650000,0.414594,0.0,1.0,0.894366,0.000000,3.500000,0.0,1.0
4,4.1,0.0,1.0,0.650000,0.338308,0.0,0.0,0.788732,0.000000,1.400000,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
915,5.4,0.0,0.0,0.635000,0.552239,1.0,2.0,0.661972,0.000000,0.000000,-1.0,-1.0
916,6.2,1.0,3.0,0.660662,0.230514,0.0,2.0,0.546096,0.389595,0.878788,-1.0,-1.0
917,5.5,1.0,0.0,0.610000,0.369818,1.0,2.0,0.281690,0.000000,0.000000,-1.0,0.0
918,5.8,1.0,0.0,0.660662,0.638474,1.0,0.0,0.546096,0.389595,0.878788,-1.0,-1.0


In [16]:
# Two-stage split: first hold out the test set, then take the validation set
# from what is left (so VALIDATION_SIZE is a fraction of the non-test rows).
X_, X_test, y_, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_,
    y_,
    test_size=VALIDATION_SIZE,
    random_state=RANDOM_STATE,
)

In [17]:
# Seed the Python, NumPy and TensorFlow random generators right before the
# model is built, so weight initialisation and batch shuffling repeat every
# time this cell runs. Skipped when no seed is configured.
if RANDOM_STATE is not None:
    tf.keras.utils.set_random_seed(RANDOM_STATE)

optimizer = optimizers.Adam(LEARNING_RATE)
loss = losses.CategoricalCrossentropy()
# One output unit per column of the one-hot encoded labels.
mlp = MLP(output_dim=len(y[0]))
mlp.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# Train on the training split and monitor the validation split after every epoch.
history = mlp.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_val, y_val))


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch

In [18]:
# Score the trained model on the held-out test split.
results = mlp.evaluate(
    x=X_test,
    y=y_test,
    batch_size=BATCH_SIZE,
)

