In [None]:
import sklearn
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf 
import tensorflow_datasets as tfds
import pathlib

# Preparing the dataset

In [None]:
# Relative location of the stroke-prediction CSV file.
data_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"

# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=data_path)
data.head(5)

In [None]:
# Count the missing entries in every column
# (the author's note: 201 missing values for the bmi feature).
data.isna().sum()

In [None]:
# Report whether any row is an exact copy of an earlier row.
print(
    "There are duplicated rows in the data"
    if data.duplicated().any()
    else "No duplicated rows"
)

In [None]:
# Report the dataset size before the missing values are handled.
# The notebook fills missing values with column medians rather than dropping
# rows, so the message says "filling" to match what actually happens.
print("There are {} examples in this dataset, before filling the missing values".format(len(data)))

In [None]:
# Impute missing values with the median of each numeric column.
# `numeric_only=True` is required because the frame also holds string columns
# (gender, work_type, ...): without it pandas >= 2.0 raises a TypeError and
# pandas 1.x emits a FutureWarning.
# Re-binding `data` (instead of inplace=True) keeps the cell safe to re-run.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/validation/test split, so held-out rows influence the imputed value.
data = data.fillna(data.median(numeric_only=True))

In [None]:
# The 7 categorical variables (each column listed exactly once).
cat_variables = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                 'work_type', 'Residence_type', 'smoking_status']

# Convert all of them to the pandas 'category' dtype in a single call.
data = data.astype({variable: 'category' for variable in cat_variables})

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Remove the rows whose gender is 'Other' by value rather than by a hard-coded
# index label: the cell stays correct if the row order or index changes, and
# it can be re-run without raising a KeyError for an already-dropped label.
# `.copy()` detaches the result from the original frame so later column
# assignments do not trigger SettingWithCopyWarning.
data = data[data['gender'] != 'Other'].copy()

In [None]:
# Feature columns used as model inputs; 'stroke' is the prediction target.
feature_columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
                   'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
                   'smoking_status']
x = data[feature_columns]
y = data['stroke']

# Options shared by both splits: shuffled, 20% held out, fixed seed.
split_options = dict(shuffle=True, test_size=0.2, random_state=100)

# First split: hold out 20% of all rows as the test set, stratified on the label.
x_train_full, x_test, y_train_full, y_test = train_test_split(
    x, y, stratify=y, **split_options)

# Second split: hold out 20% of the remaining rows as the validation set,
# again stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full, y_train_full, stratify=y_train_full, **split_options)

In [None]:
# Columns rescaled with MinMaxScaler vs. columns expanded with one-hot encoding.
numerical_features = ['age', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                        'work_type', 'Residence_type', 'smoking_status']


preprocessing_pipeline = ColumnTransformer(
    [
        ('num', MinMaxScaler(), numerical_features),
        # handle_unknown='ignore': a category that is absent from the split the
        # encoder was fitted on is encoded as all zeros at transform time
        # instead of raising an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # Always return a dense array: the transformed features are later passed to
    # tf.constant, which needs a dense array. Without this the output type
    # depends on the measured density of the stacked result.
    sparse_threshold=0,
)

In [None]:
# Fit the transformer on the training split only, then apply the fitted
# transformer unchanged to the validation and test splits.
x_train_prepered = preprocessing_pipeline.fit_transform(x_train)
x_val_prepered, x_test_prepered = (
    preprocessing_pipeline.transform(split) for split in (x_val, x_test)
)

# Convert the label containers to plain NumPy arrays.
y_train, y_val, y_test = (
    np.array(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
# Training batches: shuffled (buffer of 512 examples) on every pass over the data.
training_dataset = tf.data.Dataset.from_tensor_slices((tf.constant(x_train_prepered), tf.constant(y_train)))
training_dataset = training_dataset.shuffle(512).batch(64).prefetch(1)

# Validation batches. No shuffling: the order of examples does not matter for
# evaluation, and a fixed order keeps the per-batch losses reproducible.
val_dataset = tf.data.Dataset.from_tensor_slices((tf.constant(x_val_prepered), tf.constant(y_val)))
val_dataset = val_dataset.batch(64).prefetch(1)

# Test batches, built from the held-out TEST split. Building this dataset from
# the training arrays would make the reported "test" accuracy a training
# accuracy.
test_dataset = tf.data.Dataset.from_tensor_slices((tf.constant(x_test_prepered), tf.constant(y_test)))
test_dataset = test_dataset.batch(64).prefetch(1)

# Custom training loop

In [None]:
def base_model(input_dim=22):
    """Build the fully connected binary classifier.

    Architecture, in order:
    Dense(512, relu) -> BatchNorm -> Dropout(0.5) ->
    Dense(256, relu, L2 kernel regularizer) -> BatchNorm ->
    Dense(128, relu) -> BatchNorm ->
    Dense(64, relu) -> BatchNorm ->
    Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per example. Defaults to 22, the
            width this model was originally written for.

    Returns:
        An uncompiled ``tf.keras`` Model whose output is one sigmoid
        activation per example. Because the model contains Dropout and
        BatchNormalization layers, callers of a custom training loop must
        pass ``training=True`` during training and ``training=False`` during
        evaluation. The L2 penalty of the second Dense layer is exposed
        through ``model.losses``.
    """
    inputs = tf.keras.Input(shape=(input_dim,))

    # Block 1: widest layer, the only one followed by dropout.
    x = tf.keras.layers.Dense(512, activation='relu')(inputs)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.5)(x)

    # Block 2: the only layer with an L2 kernel penalty (Keras default factor).
    x = tf.keras.layers.Dense(256, activation='relu',
                              kernel_regularizer=tf.keras.regularizers.l2())(x)
    x = tf.keras.layers.BatchNormalization()(x)

    # Blocks 3 and 4: progressively narrower layers.
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)

    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)

    # Single sigmoid unit: the output is a probability, not a logit.
    model_output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=model_output)

    return model

In [None]:
# Training hyper-parameters.
n_epochs, batch_size = 5, 64
# Number of full batches in one pass over the training data.
n_steps = len(x_train_prepered) // batch_size

# Optimizer and loss used by the custom training loop.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-3)
loss_fn = tf.keras.losses.BinaryCrossentropy()

# Separate accuracy trackers for the training and the validation pass.
metric, val_metrics = tf.keras.metrics.Accuracy(), tf.keras.metrics.Accuracy()

In [None]:
model = base_model()

# Per-epoch history of the mean loss and the accuracy on each split.
epochs_train_losses = []
epochs_val_losses = []
epochs_train_acc = []
epochs_val_acc = []

for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))

    # ---------------- training pass ----------------
    training_losses = []
    for step, (x_batch_train, y_batch_train) in enumerate(training_dataset):
        # Labels as a float column vector so they line up element-wise with
        # the (batch, 1) sigmoid output of the model.
        y_batch_train = tf.reshape(tf.cast(y_batch_train, tf.float32), (-1, 1))

        with tf.GradientTape() as tape:
            # training=True activates Dropout and lets BatchNormalization use
            # and update batch statistics; without it the layers run in
            # inference mode.
            probs = model(x_batch_train, training=True)
            loss = loss_fn(y_batch_train, probs)
            # Add the layer regularization penalties (the L2 kernel term);
            # a custom loop has to include model.losses explicitly.
            total_loss = loss + sum(model.losses)

        # Backward pass: gradients of the regularized loss w.r.t. the weights.
        grads = tape.gradient(total_loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Track the un-regularized loss so it is comparable to the validation loss.
        training_losses.append(float(loss))

        # The model emits one probability per example, so the predicted class
        # is obtained by thresholding at 0.5. (argmax over an axis of size 1
        # would always return 0.)
        metric.update_state(y_batch_train, tf.cast(probs > 0.5, tf.float32))

    # Accuracy accumulated over the whole epoch.
    train_acc = float(metric.result())
    epochs_train_acc.append(train_acc)

    # Mean of the batch losses = mean training loss of the epoch.
    losses_train_mean = np.mean(training_losses)
    epochs_train_losses.append(losses_train_mean)

    # ---------------- validation pass (loss and accuracy) ----------------
    val_losses = []
    # Dedicated loop-variable names: re-using x_val / y_val here would
    # overwrite the validation split created earlier in the notebook.
    for x_batch_val, y_batch_val in val_dataset:
        y_batch_val = tf.reshape(tf.cast(y_batch_val, tf.float32), (-1, 1))
        val_probs = model(x_batch_val, training=False)
        val_losses.append(float(loss_fn(y_batch_val, val_probs)))
        val_metrics.update_state(y_batch_val, tf.cast(val_probs > 0.5, tf.float32))

    val_acc = float(val_metrics.result())
    epochs_val_acc.append(val_acc)

    losses_val_mean = np.mean(val_losses)
    epochs_val_losses.append(losses_val_mean)

    print("Trainig Loss: {}-------Training Accuracy: {}".format(epochs_train_losses[-1], train_acc))
    print("Validation Loss: {}-------Validation Accuracy: {}".format(epochs_val_losses[-1], val_acc))
    print("\n")

    # Reset the metric accumulators so every epoch is measured independently.
    metric.reset_states()
    val_metrics.reset_states()


# Evaluate the model on the test dataset

In [None]:
test_accuracy = tf.keras.metrics.Accuracy()

# Dedicated loop-variable names: re-using x / y here would overwrite the
# feature frame and label series created earlier in the notebook.
for x_batch_test, y_batch_test in test_dataset:
    # training=False keeps Dropout disabled and makes BatchNormalization use
    # its moving statistics, as required for inference.
    probs = model(x_batch_test, training=False)
    # The model emits one sigmoid probability per example: threshold at 0.5 to
    # get the predicted class. (argmax over an axis of size 1 is always 0.)
    prediction = tf.cast(probs > 0.5, tf.float32)
    # Labels as a float column vector matching the shape of `prediction`.
    y_batch_test = tf.reshape(tf.cast(y_batch_test, tf.float32), (-1, 1))
    # Keras metrics take (y_true, y_pred), in that order.
    test_accuracy.update_state(y_batch_test, prediction)

# NOTE(review): if the positive class is rare, accuracy alone can look high
# for a model that never predicts it; confirm the class balance and consider
# reporting precision / recall as well.
print("Test set accuracy: {:.3%}".format(float(test_accuracy.result())))