In [2]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

2021-12-12 21:12:19.571182: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-12 21:12:19.571212: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Load data

In [3]:
df = pd.read_csv("./data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# normalize column names

df.columns = df.columns.str.lower()
df.head()

# drop lines with missing values
print(df.shape)
df = df.dropna()
print(df.shape)
df.head()

(5110, 12)
(4909, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


## Data pre-processing

In [5]:
# save raw data
df_raw = df.copy()

In [6]:
# load raw data
df = df_raw.copy()
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
# map stroke column to have meaningful column names after one-hot encoding
df['stroke'] = df['stroke'].map({1: 'had_stroke', 0: 'no_stroke'})
df['stroke']

0       had_stroke
2       had_stroke
3       had_stroke
4       had_stroke
5       had_stroke
           ...    
5104     no_stroke
5106     no_stroke
5107     no_stroke
5108     no_stroke
5109     no_stroke
Name: stroke, Length: 4909, dtype: object

In [8]:
# one-hot encoding for categorical variables:
categorical_cols = ['gender', 'work_type', 'smoking_status', 'stroke']

for col in categorical_cols:
    if col not in df.columns:
        continue
    dummies = pd.get_dummies(df[col])
    df = pd.concat((df, dummies), axis=1)
    df.drop(col, axis=1, inplace=True)

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,Yes,Urban,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,Yes,Rural,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,Yes,Urban,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,Yes,Rural,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,Yes,Urban,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [9]:
# binarize binary categorical data
binary_col_mapping = {
    'ever_married': {'No': 0, 'Yes': 1},
    'residence_type': {'Rural': 0, 'Urban': 1}
}

for (col, mapping) in binary_col_mapping.items():
    if col not in df.columns:
        continue
    df[col] = df[col].map(mapping)

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,1,1,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,1,0,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,1,1,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,1,0,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,1,1,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [10]:
# normalize non-binary columns
non_binary_cols = ['age', 'avg_glucose_level', 'bmi']

for col in non_binary_cols:
    df[col] = (df[col] - df[col].mean()) / df[col].std()
    print(f'{col}: mean = {df[col].mean():.2f}; stdev = {df[col].std():.2f}')

df.head()

age: mean = 0.00; stdev = 1.00
avg_glucose_level: mean = 0.00; stdev = 1.00
bmi: mean = 0.00; stdev = 1.00


Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,1.070029,0,1,1,1,2.777415,0.981245,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,1.646395,0,1,1,0,0.01384,0.459222,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,0.271984,0,0,1,1,1.48398,0.701135,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,1.602059,1,0,1,0,1.549035,-0.62302,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,1.690731,0,0,1,1,1.821183,0.013593,0,1,...,0,1,0,0,0,1,0,0,1,0


In [11]:
# create training and testing set

x_cols = [col for col in df.columns if col not in ['had_stroke', 'no_stroke', 'id']]
x = df[x_cols]
y = df[['had_stroke', 'no_stroke']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(x_train.shape, y_train.shape)

(3927, 19) (3927, 2)


## Build & train model

In [33]:
# build model

model = Sequential()
model.add(Dense(8, activation='relu', input_shape=(19,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 8)                 160       
                                                                 
 dense_11 (Dense)            (None, 16)                144       
                                                                 
 dense_12 (Dense)            (None, 32)                544       
                                                                 
 dense_13 (Dense)            (None, 2)                 66        
                                                                 
Total params: 914
Trainable params: 914
Non-trainable params: 0
_________________________________________________________________


In [34]:
sample_weight = y_train['had_stroke'] == 1
sample_weight.map({False: 1, True: 50})

3565    1
898     1
2707    1
4198    1
2746    1
       ..
4613    1
511     1
3247    1
3946    1
916     1
Name: had_stroke, Length: 3927, dtype: int64

In [35]:
model.compile(
    optimizer='adam',  # Optimizer
    # Loss function to minimize
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.Recall()]
)

model.fit(
    x_train,
    y_train,
    epochs=10,
    verbose=2,
    batch_size=64,
    sample_weight=sample_weight,

)

Epoch 1/10
62/62 - 1s - loss: 0.0202 - accuracy: 0.0428 - recall_5: 0.0428 - 767ms/epoch - 12ms/step
Epoch 2/10
62/62 - 0s - loss: 0.0072 - accuracy: 0.0397 - recall_5: 0.0397 - 100ms/epoch - 2ms/step
Epoch 3/10
62/62 - 0s - loss: 9.9398e-04 - accuracy: 0.0397 - recall_5: 0.0397 - 97ms/epoch - 2ms/step
Epoch 4/10
62/62 - 0s - loss: 3.0261e-04 - accuracy: 0.0397 - recall_5: 0.0397 - 114ms/epoch - 2ms/step
Epoch 5/10
62/62 - 0s - loss: 1.4534e-04 - accuracy: 0.0397 - recall_5: 0.0397 - 102ms/epoch - 2ms/step
Epoch 6/10
62/62 - 0s - loss: 8.5347e-05 - accuracy: 0.0397 - recall_5: 0.0397 - 93ms/epoch - 1ms/step
Epoch 7/10
62/62 - 0s - loss: 5.5386e-05 - accuracy: 0.0397 - recall_5: 0.0397 - 87ms/epoch - 1ms/step
Epoch 8/10
62/62 - 0s - loss: 3.8519e-05 - accuracy: 0.0397 - recall_5: 0.0397 - 74ms/epoch - 1ms/step
Epoch 9/10
62/62 - 0s - loss: 2.8166e-05 - accuracy: 0.0397 - recall_5: 0.0397 - 83ms/epoch - 1ms/step
Epoch 10/10
62/62 - 0s - loss: 2.2228e-05 - accuracy: 0.0397 - recall_5: 0.0

<keras.callbacks.History at 0x7fb33e891220>

## Evaluate

In [14]:
# evaluate on test set
loss, acc, recall = model.evaluate(x_test, y_test)
print(f'test loss = {loss}\ntest accuracy = {acc}\nrecall = {recall}')

test loss = 0.17382635176181793
test accuracy = 0.9460285305976868
recall = 0.9460285305976868


In [15]:
# evaluate on single instance
test_individual = np.array(x_test.iloc[0])
test_individual.shape = (1, 19, 1)
print(model.predict(test_individual))
print(y_test.iloc[0])

[[0.02684021 0.9731598 ]]
had_stroke    0
no_stroke     1
Name: 4336, dtype: uint8


In [16]:
model.weights

[<tf.Variable 'dense/kernel:0' shape=(19, 8) dtype=float32, numpy=
 array([[ 0.3322972 , -0.1469666 ,  0.32511944,  0.37885192,  0.13180521,
          0.58542264, -0.7736696 ,  0.3460944 ],
        [-0.10244417,  0.16380171, -0.48052746, -0.37715727,  0.18645899,
          0.3020158 , -0.00884684,  0.11090732],
        [-0.2834323 ,  0.2820159 ,  0.11527443,  0.13907392, -0.20678733,
          0.19135423, -0.3064339 ,  0.3636547 ],
        [ 0.40273878,  0.37400237,  0.41999725, -0.25991192,  0.13538285,
         -0.18778075,  0.04511276, -0.1537914 ],
        [ 0.15130857, -0.29536396, -0.14237633,  0.4822192 , -0.0691272 ,
          0.27116352,  0.46923774, -0.08016112],
        [ 0.39966914,  0.29203045, -0.32293692,  0.4232454 , -0.0981659 ,
          0.34865558,  0.02511504,  0.3698498 ],
        [ 0.02579864,  0.18910855,  0.3251635 ,  0.23537078,  0.38890848,
          0.05236178, -0.0584064 ,  0.01152524],
        [-0.08439066, -0.44568756,  0.16868289, -0.17575818,  0.24132772

In [25]:
# testing set with only stroke victims
x_test_pos = df.loc[df['had_stroke'] == 1, x_cols]
y_test_pos = y.loc[y['had_stroke'] == 1]
y.head()

model.evaluate(x_test_pos, y_test_pos)



[2.2573814392089844, 0.0, 0.0]

## Build linear model

In [17]:
# build model

model = Sequential()
model.add(Dense(1, activation='linear', input_shape=(19,)))
model.add(Dense(2, activation='softmax'))

model.summary()

model.compile(
    optimizer='adam',  # Optimizer
    # Loss function to minimize
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.Recall()]
)

model.fit(
    x_train,
    y_train,
    epochs=10,
    verbose=2,
    batch_size=64
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 1)                 20        
                                                                 
 dense_5 (Dense)             (None, 2)                 4         
                                                                 
Total params: 24
Trainable params: 24
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
62/62 - 1s - loss: 0.7502 - accuracy: 0.4848 - recall_1: 0.4848 - 780ms/epoch - 13ms/step
Epoch 2/10
62/62 - 0s - loss: 0.6376 - accuracy: 0.7087 - recall_1: 0.7087 - 117ms/epoch - 2ms/step
Epoch 3/10
62/62 - 0s - loss: 0.5475 - accuracy: 0.8322 - recall_1: 0.8322 - 82ms/epoch - 1ms/step
Epoch 4/10
62/62 - 0s - loss: 0.4621 - accuracy: 0.9055 - recall_1: 0.9055 - 68ms/epoch - 1ms/step
Epoch 5/10
62/62 - 0s - loss: 0.3817 - accuracy: 0.9368

<keras.callbacks.History at 0x7fb35c072580>

In [18]:
# evaluate on test set
loss, acc, recall = model.evaluate(x_test, y_test)
print(f'test loss = {loss}\ntest accuracy = {acc}\nrecall = {recall}')

test loss = 0.21711164712905884
test accuracy = 0.9450101852416992
recall = 0.9450101852416992
