In [83]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

## Load data

In [28]:
df = pd.read_csv("./data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [29]:
# normalize column names

df.columns = df.columns.str.lower()
df.head()

# drop lines with missing values
print(df.shape)
df = df.dropna()
print(df.shape)
df.head()

(5110, 12)
(4909, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


## Data pre-processing

In [30]:
# save raw data
df_raw = df.copy()

In [31]:
# load raw data
df = df_raw.copy()
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [32]:
# map stroke column to have meaningful column names after one-hot encoding
df['stroke'] = df['stroke'].map({1: 'had_stroke', 0: 'no_stroke'})
df['stroke']

0       had_stroke
2       had_stroke
3       had_stroke
4       had_stroke
5       had_stroke
           ...    
5104     no_stroke
5106     no_stroke
5107     no_stroke
5108     no_stroke
5109     no_stroke
Name: stroke, Length: 4909, dtype: object

In [33]:
# one-hot encoding for categorical variables:
categorical_cols = ['gender', 'work_type', 'smoking_status', 'stroke']

for col in categorical_cols:
    if col not in df.columns:
        continue
    dummies = pd.get_dummies(df[col])
    df = pd.concat((df, dummies), axis=1)
    df.drop(col, axis=1, inplace=True)

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,Yes,Urban,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,Yes,Rural,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,Yes,Urban,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,Yes,Rural,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,Yes,Urban,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [34]:
# binarize binary categorical data
binary_col_mapping = {
    'ever_married': {'No': 0, 'Yes': 1},
    'residence_type': {'Rural': 0, 'Urban': 1}
}

for (col, mapping) in binary_col_mapping.items():
    if col not in df.columns:
        continue
    df[col] = df[col].map(mapping)

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,1,1,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,1,0,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,1,1,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,1,0,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,1,1,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [35]:
# normalize non-binary columns
non_binary_cols = ['age', 'avg_glucose_level', 'bmi']

for col in non_binary_cols:
    df[col] = (df[col] - df[col].mean()) / df[col].std()
    print(f'{col}: mean = {df[col].mean():.2f}; stdev = {df[col].std():.2f}')

df.head()

age: mean = 0.00; stdev = 1.00
avg_glucose_level: mean = 0.00; stdev = 1.00
bmi: mean = 0.00; stdev = 1.00


Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,1.070029,0,1,1,1,2.777415,0.981245,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,1.646395,0,1,1,0,0.01384,0.459222,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,0.271984,0,0,1,1,1.48398,0.701135,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,1.602059,1,0,1,0,1.549035,-0.62302,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,1.690731,0,0,1,1,1.821183,0.013593,0,1,...,0,1,0,0,0,1,0,0,1,0


In [41]:
# create training and testing set

x_cols = [col for col in df.columns if col not in ['had_stroke', 'no_stroke', 'id']]
x = df[x_cols]
y = df[['had_stroke', 'no_stroke']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(x_train.shape, y_train.shape)

(3289, 19) (3289, 2)


## Build & train model

In [122]:
# build model

model = Sequential()
model.add(Dense(8, activation='relu', input_shape=(19,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_33 (Dense)            (None, 1)                 20        
                                                                 
 dense_34 (Dense)            (None, 2)                 4         
                                                                 
Total params: 24
Trainable params: 24
Non-trainable params: 0
_________________________________________________________________


In [123]:
model.compile(
    optimizer='adam',  # Optimizer
    # Loss function to minimize
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.Recall()]
)

model.fit(
    x_train,
    y_train,
    epochs=10,
    verbose=2,
    batch_size=64
)

Epoch 1/10
52/52 - 1s - loss: 0.8469 - accuracy: 0.2870 - recall_2: 0.2870 - 1s/epoch - 20ms/step
Epoch 2/10
52/52 - 0s - loss: 0.7214 - accuracy: 0.4615 - recall_2: 0.4615 - 118ms/epoch - 2ms/step
Epoch 3/10
52/52 - 0s - loss: 0.6266 - accuracy: 0.6753 - recall_2: 0.6753 - 101ms/epoch - 2ms/step
Epoch 4/10
52/52 - 0s - loss: 0.5488 - accuracy: 0.8282 - recall_2: 0.8282 - 100ms/epoch - 2ms/step
Epoch 5/10
52/52 - 0s - loss: 0.4793 - accuracy: 0.9240 - recall_2: 0.9240 - 103ms/epoch - 2ms/step
Epoch 6/10
52/52 - 0s - loss: 0.4153 - accuracy: 0.9593 - recall_2: 0.9593 - 84ms/epoch - 2ms/step
Epoch 7/10
52/52 - 0s - loss: 0.3585 - accuracy: 0.9608 - recall_2: 0.9608 - 90ms/epoch - 2ms/step
Epoch 8/10
52/52 - 0s - loss: 0.3107 - accuracy: 0.9608 - recall_2: 0.9608 - 78ms/epoch - 2ms/step
Epoch 9/10
52/52 - 0s - loss: 0.2726 - accuracy: 0.9608 - recall_2: 0.9608 - 80ms/epoch - 2ms/step
Epoch 10/10
52/52 - 0s - loss: 0.2435 - accuracy: 0.9608 - recall_2: 0.9608 - 82ms/epoch - 2ms/step


<keras.callbacks.History at 0x7f045c7a0190>

## Evaluate

In [124]:
# evaluate on test set
loss, acc, recall = model.evaluate(x_test, y_test)
print(f'test loss = {loss}\ntest accuracy = {acc}\nrecall = {recall}')

test loss = 0.24633002281188965
test accuracy = 0.9506173133850098
recall = 0.9506173133850098


In [125]:
# evaluate on single instance
test_individual = np.array(x_test.iloc[0])
test_individual.shape = (1, 19, 1)
print(model.predict(test_individual))
print(y_test.iloc[0])

[[0.06121294 0.9387871 ]]
had_stroke    0
no_stroke     1
Name: 4336, dtype: uint8


In [126]:
model.weights

[<tf.Variable 'dense_33/kernel:0' shape=(19, 1) dtype=float32, numpy=
 array([[-0.16431153],
        [-0.05243177],
        [ 0.35711494],
        [ 0.48986274],
        [ 0.81471527],
        [-0.16893609],
        [ 0.02026648],
        [ 0.11735447],
        [ 0.04848059],
        [-0.08349611],
        [ 0.11388587],
        [-0.1496892 ],
        [-0.0609648 ],
        [ 0.45283943],
        [ 0.17521504],
        [ 0.1007489 ],
        [ 0.14436275],
        [ 0.20498295],
        [ 0.3006384 ]], dtype=float32)>,
 <tf.Variable 'dense_33/bias:0' shape=(1,) dtype=float32, numpy=array([0.38689867], dtype=float32)>,
 <tf.Variable 'dense_34/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[-0.99138224,  1.0343903 ]], dtype=float32)>,
 <tf.Variable 'dense_34/bias:0' shape=(2,) dtype=float32, numpy=array([-0.36959678,  0.363533  ], dtype=float32)>]

## Build linear model

In [128]:
# build model

model = Sequential()
model.add(Dense(1, activation='linear', input_shape=(19,)))
model.add(Dense(2, activation='softmax'))

model.summary()

model.compile(
    optimizer='adam',  # Optimizer
    # Loss function to minimize
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.Recall()]
)

model.fit(
    x_train,
    y_train,
    epochs=10,
    verbose=2,
    batch_size=64
)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_37 (Dense)            (None, 1)                 20        
                                                                 
 dense_38 (Dense)            (None, 2)                 4         
                                                                 
Total params: 24
Trainable params: 24
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
52/52 - 1s - loss: 0.9379 - accuracy: 0.2216 - recall_4: 0.2216 - 920ms/epoch - 18ms/step
Epoch 2/10
52/52 - 0s - loss: 0.8221 - accuracy: 0.4263 - recall_4: 0.4263 - 187ms/epoch - 4ms/step
Epoch 3/10
52/52 - 0s - loss: 0.7425 - accuracy: 0.7045 - recall_4: 0.7045 - 143ms/epoch - 3ms/step
Epoch 4/10
52/52 - 0s - loss: 0.6865 - accuracy: 0.9282 - recall_4: 0.9282 - 140ms/epoch - 3ms/step
Epoch 5/10
52/52 - 0s - loss: 0.6438 - accuracy: 0.9

<keras.callbacks.History at 0x7f045d7dc700>

In [129]:
# evaluate on test set
loss, acc, recall = model.evaluate(x_test, y_test)
print(f'test loss = {loss}\ntest accuracy = {acc}\nrecall = {recall}')

test loss = 0.4943358302116394
test accuracy = 0.9506173133850098
recall = 0.9506173133850098
