In [83]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

In [28]:
# Read the dataset from the CSV file next to the notebook and preview it.
df = pd.read_csv(filepath_or_buffer="./data.csv")
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [29]:
# Lower-case every column label so later cells can rely on one spelling.
df.columns = [label.lower() for label in df.columns]

# Remove rows that contain at least one missing value, printing the frame's
# shape before and after so the number of discarded rows is visible.
print(df.shape)
df = df.dropna(axis=0, how='any')
print(df.shape)
df.head()

(5110, 12)
(4909, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


# Data processing

In [30]:
# Keep an independent snapshot of the cleaned frame under `df_raw`.
df_raw = df.copy(deep=True)

In [31]:
# Start the processing steps from a fresh copy of the `df_raw` snapshot.
df = df_raw.copy(deep=True)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [32]:
# Replace the 0/1 stroke flag with readable labels; these labels become the
# column names when `stroke` is one-hot encoded.
stroke_labels = {1: 'had_stroke', 0: 'no_stroke'}
df['stroke'] = df['stroke'].map(stroke_labels)
df['stroke']

0       had_stroke
2       had_stroke
3       had_stroke
4       had_stroke
5       had_stroke
           ...    
5104     no_stroke
5106     no_stroke
5107     no_stroke
5108     no_stroke
5109     no_stroke
Name: stroke, Length: 4909, dtype: object

In [33]:
# One-hot encode the categorical variables (this includes the `stroke` target,
# which becomes the `had_stroke` / `no_stroke` label columns).
categorical_cols = ['gender', 'work_type', 'smoking_status', 'stroke']

for col in categorical_cols:
    # A column that is no longer present has already been encoded; skipping it
    # lets the cell be re-run without raising a KeyError.
    if col not in df.columns:
        continue
    # Ask for uint8 explicitly. pandas >= 2.0 returns bool dummies by default,
    # and bool columns mixed with numeric ones become an object array when the
    # frame is converted to NumPy, which Keras cannot turn into a tensor.
    dummies = pd.get_dummies(df[col], dtype=np.uint8)
    # Build a new frame (source column removed, dummies appended on the right)
    # instead of dropping in place.
    df = pd.concat((df.drop(columns=col), dummies), axis=1)

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,Yes,Urban,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,Yes,Rural,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,Yes,Urban,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,Yes,Rural,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,Yes,Urban,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [34]:
# Encode the two-valued text columns as 0/1 using an explicit lookup per column.

binary_col_mapping = {
    'ever_married': {'No': 0, 'Yes': 1},
    'residence_type': {'Rural': 0, 'Urban': 1}
}

for col in binary_col_mapping:
    # Only touch columns that exist in the frame.
    if col in df.columns:
        df[col] = df[col].map(binary_col_mapping[col])

df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,67.0,0,1,1,1,228.69,36.6,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,80.0,0,1,1,0,105.92,32.5,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,49.0,0,0,1,1,171.23,34.4,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,79.0,1,0,1,0,174.12,24.0,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,81.0,0,0,1,1,186.21,29.0,0,1,...,0,1,0,0,0,1,0,0,1,0


In [35]:
# Standardise the continuous features to zero mean and unit standard deviation.
# NOTE(review): the mean/std are computed over the whole frame, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
non_binary_cols = ['age', 'avg_glucose_level', 'bmi']

for col in non_binary_cols:
    values = df[col]
    df[col] = (values - values.mean()) / values.std()
    # Sanity check: after scaling the column should report mean 0 and stdev 1.
    print(f'{col}: mean = {df[col].mean():.2f}; stdev = {df[col].std():.2f}')

df.head()

age: mean = 0.00; stdev = 1.00
avg_glucose_level: mean = 0.00; stdev = 1.00
bmi: mean = 0.00; stdev = 1.00


Unnamed: 0,id,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,Female,Male,...,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes,had_stroke,no_stroke
0,9046,1.070029,0,1,1,1,2.777415,0.981245,0,1,...,0,1,0,0,0,1,0,0,1,0
2,31112,1.646395,0,1,1,0,0.01384,0.459222,0,1,...,0,1,0,0,0,0,1,0,1,0
3,60182,0.271984,0,0,1,1,1.48398,0.701135,1,0,...,0,1,0,0,0,0,0,1,1,0
4,1665,1.602059,1,0,1,0,1.549035,-0.62302,1,0,...,0,0,1,0,0,0,1,0,1,0
5,56669,1.690731,0,0,1,1,1.821183,0.013593,0,1,...,0,1,0,0,0,1,0,0,1,0


In [41]:
# Features are every column except the record id and the two one-hot label columns.
label_cols = ['had_stroke', 'no_stroke']
excluded = set(label_cols) | {'id'}
x_cols = [name for name in df.columns if name not in excluded]
x = df.loc[:, x_cols]
y = df.loc[:, label_cols]

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

print(x_train.shape, y_train.shape)

(3289, 19) (3289, 2)


In [59]:
# Small fully-connected classifier: three ReLU hidden layers followed by a
# two-way softmax whose outputs line up with the (had_stroke, no_stroke)
# label columns.
# The input width is read from the training features rather than hard-coded,
# so the model stays valid if the set of encoded columns changes.
n_features = x_train.shape[1]

model = Sequential([
    Dense(8, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 8)                 160       
                                                                 
 dense_17 (Dense)            (None, 32)                288       
                                                                 
 dense_18 (Dense)            (None, 32)                1056      
                                                                 
 dense_19 (Dense)            (None, 2)                 66        
                                                                 
Total params: 1,570
Trainable params: 1,570
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Configure training.
# The network ends in a 2-unit softmax and the labels are one-hot encoded
# (had_stroke / no_stroke), so categorical cross-entropy is the matching loss.
# For this two-class one-hot setup it yields the same value that
# binary cross-entropy averaged over the two units would.
model.compile(
    optimizer='adam',  # Optimizer
    # Loss function to minimize
    loss='categorical_crossentropy',
    metrics=["accuracy"]
)

In [67]:
# Train on the training split; verbose=2 prints one summary line per epoch.
fit_options = dict(epochs=10, verbose=2, batch_size=64)
model.fit(x_train, y_train, **fit_options)

Epoch 1/10
52/52 - 0s - loss: 0.1047 - accuracy: 0.9623 - 72ms/epoch - 1ms/step
Epoch 2/10
52/52 - 0s - loss: 0.1058 - accuracy: 0.9638 - 83ms/epoch - 2ms/step
Epoch 3/10
52/52 - 0s - loss: 0.1045 - accuracy: 0.9632 - 110ms/epoch - 2ms/step
Epoch 4/10
52/52 - 0s - loss: 0.1040 - accuracy: 0.9629 - 85ms/epoch - 2ms/step
Epoch 5/10
52/52 - 0s - loss: 0.1052 - accuracy: 0.9638 - 73ms/epoch - 1ms/step
Epoch 6/10
52/52 - 0s - loss: 0.1046 - accuracy: 0.9632 - 81ms/epoch - 2ms/step
Epoch 7/10
52/52 - 0s - loss: 0.1039 - accuracy: 0.9632 - 69ms/epoch - 1ms/step
Epoch 8/10
52/52 - 0s - loss: 0.1040 - accuracy: 0.9635 - 65ms/epoch - 1ms/step
Epoch 9/10
52/52 - 0s - loss: 0.1040 - accuracy: 0.9629 - 57ms/epoch - 1ms/step
Epoch 10/10
52/52 - 0s - loss: 0.1047 - accuracy: 0.9635 - 74ms/epoch - 1ms/step


<keras.callbacks.History at 0x7f045ce870d0>

In [63]:
# Score the model on the held-out split; evaluate() returns the loss followed
# by the compiled metric (accuracy).
test_metrics = model.evaluate(x=x_test, y=y_test)
loss, acc = test_metrics
print(f'test loss = {loss}\ntest accuracy = {acc}')

test loss = 0.17076796293258667
test accuracy = 0.9493827223777771


In [96]:
# Predict one held-out example and print it next to its true one-hot label.
# Selecting with a list (`iloc[[0]]`) keeps the row two-dimensional, so the
# array has the (1, n_features) batch shape that matches the model's declared
# input; a (1, 19, 1) array carries an extra trailing axis and hard-codes the
# feature count. The explicit float dtype guarantees a homogeneous numeric array.
test_individual = x_test.iloc[[0]].to_numpy(dtype='float32')
print(model.predict(test_individual))
print(y_test.iloc[0])

[[0.00131511 0.99868494]]
had_stroke    0
no_stroke     1
Name: 4336, dtype: uint8
