# Studying heart disease with Neural Networks

In [None]:
# Kaggle's preinstalled seaborn version is still 0.10;
# the plots in this notebook require 0.11 or later.
!pip install seaborn --upgrade
import seaborn
print(f"Seaborn version on this notebook = {seaborn.__version__}")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as m
import tensorflow as tf
from sklearn import preprocessing


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### First look at the data: 13 feature columns and 1 target column at the end with presence (1) or no presence (0) of heart disease

In [None]:
# Location of the dataset on Kaggle. When running elsewhere, point DATA_PATH
# at a local copy instead (e.g. 'heart.csv').
DATA_PATH = '/kaggle/input/heart-disease-uci/heart.csv'
df = pd.read_csv(DATA_PATH)

# Summary statistics of every column, shown as the cell output.
df.describe()

### Features description:
* age - age in years
* sex - (0 = female, 1 = male)
* cp - chest pain type
* trestbps - resting blood pressure (mmHg)
* chol - serum cholesterol (mg/dl)
* fbs - fasting blood sugar > 120 mg/dl (0 = false, 1 = true)
* restecg - resting electrocardiographic results (values 0, 1 , 2)
* thalach - maximum heart rate achieved (bpm)
* exang - exercise induced angina (0 = no, 1 = yes)
* oldpeak - ST depression induced by exercise relative to rest
* slope - the slope of the peak exercise ST segment 
* ca - number of major vessels colored by fluoroscopy (values 0, 1, 2, 3)
* thal - 3 = normal; 6 = fixed defect; 7 = reversible defect

### Target
* target - presence of heart disease (0 = no, 1 = yes)



### Functions to make bin lists for histograms

In [None]:
# Useful functions
def make_bins_simple(min_bin, max_bin, size):
    '''Return a list of histogram bin edges spaced ``size`` apart.

    The edges start at ``min_bin``; the stop value is pushed one step past
    ``max_bin`` so that the last edge is never below ``max_bin``.
    '''
    stop = max_bin + size
    edges = [*range(min_bin, stop, size)]
    return edges

def make_bins_max(df_col, size, round_val):
    '''Return histogram bin edges that cover the whole range of ``df_col``.

    The smallest value is rounded down and the largest value rounded up to a
    multiple of ``round_val``; edges are then laid out ``size`` apart between
    those two bounds (the upper bound is always covered).
    '''
    lowest, highest = min(df_col), max(df_col)
    first_edge = m.floor(lowest / round_val) * round_val
    last_edge = m.ceil(highest / round_val) * round_val
    return [*range(first_edge, last_edge + size, size)]

# 1. Exploratory data analysis: 
## a) Age distribution by sex
* There are more than twice as many males as females.
* The age distribution is similar for both sexes.

In [None]:
# Age histogram (with KDE) split by sex, titled with the female/male share.
bins = make_bins_max(df.age, 2, 10)
male_pct = round(100 * sum(df.sex) / len(df.sex), 1)
female_pct = round(100 - male_pct, 1)

# Translate the 0/1 codes into names and let seaborn build the legend itself.
# Relabelling afterwards with plt.legend([...]) attaches the labels to the
# artists in draw order, which is not tied to the hue values and can silently
# swap the two groups.
sex_label = df.sex.map({0: 'Female', 1: 'Male'}).rename('Sex')
sns.histplot(data = df, x = 'age', hue = sex_label, hue_order = ['Female', 'Male'],
             kde = True, bins = bins)
plt.title(f"""Age distribution by sex
{female_pct}% female  {male_pct}% male""")
plt.show()

## b) Distribution of resting blood pressure w.r.t. presence of heart disease
### Simple distribution
* We associate high blood pressure with heart disease, therefore it is sensible to look at the differences in its distribution against the target
* Looking solely at the mean would indicate that on average patients without heart disease have a higher rbps
* Further investigation is required due to the clear positive skewness of the distributions 
* Above 150 rbps patients with heart disease begin to contribute more to the distribution 

In [None]:
# Resting-blood-pressure histogram (with KDE) split by the target column,
# titled with the mean pressure of each group.
bins = make_bins_max(df.trestbps, 10, 10)
mean_bps = df.groupby('target')['trestbps'].mean().round(1)

# Name the two target classes before plotting so seaborn generates the legend.
# Passing a bare label list to plt.legend() labels artists in draw order, which
# is not guaranteed to follow target = 0, 1 and can invert the legend.
disease_label = df.target.map({0: 'No presence', 1: 'Presence'}).rename('Heart disease')
kde_plot = sns.histplot(data = df, x = 'trestbps', hue = disease_label,
                        hue_order = ['No presence', 'Presence'],
                        kde = True, bins = bins)
plt.xlabel('Resting blood pressure (mmHg)')
plt.title(f'''Resting blood pressure mean:
No heart disease {mean_bps[0]}
Heart disease {mean_bps[1]}''')
plt.show()

##### Kernel density data can be extracted from KDE plot and used for further analysis

In [None]:
# One (x, y) pair of arrays per line drawn on the histogram axes.
kde_data = [curve.get_data() for curve in kde_plot.get_lines()]

### Distribution split by sex
* The rate of heart disease among female patients is significantly higher than among male patients
* Female patients with heart disease also appear to contribute more to higher rbps

In [None]:
# Percentage of patients with heart disease (target == 1) within each sex.
is_female = df.sex == 0
is_male = df.sex == 1
has_disease = df.target == 1

presence_f = round(100 * (is_female & has_disease).sum() / is_female.sum(), 1)
presence_m = round(100 * (is_male & has_disease).sum() / is_male.sum(), 1)

# Note: the "Females" line deliberately keeps its trailing space before the newline.
print(f"Rates of heart disease by sex:\nFemales {presence_f}% \nMales {presence_m}%")

In [None]:
# One panel per sex; within each panel the two target classes are overlaid
# as histograms with KDE curves over shared blood-pressure bins.
bins = make_bins_max(df.trestbps, 10, 10)
g = (
    sns.FacetGrid(data=df, col="sex", hue="target")
    .map_dataframe(sns.histplot, "trestbps", kde=True, bins=bins)
    .set_axis_labels("Resting blood pressure (mmHg)", "Count")
    .add_legend()
)
plt.show()

## c) Kernel Density Estimation (KDE) plot of fasting blood sugar, and chest pain reports


In [None]:
# Bivariate KDE of chest-pain type against the fasting-blood-sugar flag,
# coloured by target; ticks and limits are pinned to the discrete codes.
ax = sns.kdeplot(data=df, x='cp', y='fbs', hue='target',
                 fill=True, alpha=0.7, thresh=.2)
ax.set_xlabel('Chest pain')
ax.set_ylabel('Fasting blood sugar > 120 mg/dl')
ax.set_xticks([0, 1, 2, 3])
ax.set_yticks([0, 1])
ax.set_xlim(-1, 3.5)
ax.set_ylim(-0.5, 1.5)
ax.set_title("KDE plot")
plt.show()

## 2.  Applying a neural network model 
## a) Preprocessing data
#### Check that the data is balanced (same number of target categories 0s and 1s)

In [None]:
# Share of positive targets and the total number of samples.
# The classes are not perfectly balanced: there are more 1s than 0s, but not by much.
positive_share = df.target.mean()
positive_share, len(df)

#### Define target and scaled feature arrays

In [None]:
# Split the frame into a label vector and a feature matrix (every column except
# 'target'), both as NumPy arrays, then scale the features with
# sklearn's preprocessing.scale.
feature_columns = [column for column in df.columns if column != 'target']
targets = np.array(df['target'])
unscaled_features = np.array(df[feature_columns])
scaled_features = preprocessing.scale(unscaled_features)

#### Shuffle the data (targets are currently ordered, 1s first, 0s second)

In [None]:
# Shuffle the row positions in place, then reorder features and targets with
# the same positions so that every row keeps its own label.
shuffled_indices = np.arange(len(scaled_features))
np.random.shuffle(shuffled_indices)
shuffled_features, shuffled_targets = (
    array[shuffled_indices] for array in (scaled_features, targets)
)

#### Split data into train, test and validation sets (80:10:10)

In [None]:
# Sizes of the three partitions: 80% train, 10% validation, remainder test.
sample_counts = len(shuffled_features)
train_counts = int(0.8 * sample_counts)
validation_counts = int(0.1 * sample_counts)
test_counts = sample_counts - train_counts - validation_counts

# Consecutive row ranges of the (already shuffled) data, one per partition.
train_rows = slice(None, train_counts)
validation_rows = slice(train_counts, train_counts + validation_counts)
test_rows = slice(train_counts + validation_counts, None)

train_inputs, train_targets = shuffled_features[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = (shuffled_features[validation_rows],
                                         shuffled_targets[validation_rows])
test_inputs, test_targets = shuffled_features[test_rows], shuffled_targets[test_rows]

#### Save the three datasets in .npz format for TensorFlow

In [None]:
# Write each partition to its own archive, storing the arrays under the keys
# 'inputs' and 'targets'.
for archive_name, split_inputs, split_targets in (
    ('heart_train', train_inputs, train_targets),
    ('heart_validation', validation_inputs, validation_targets),
    ('heart_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)

## b) Create ML algorithm
### Class to handle batching

In [None]:
# Class that will do the batching for the algorithm
class Heart_Data_Reader():
    '''Iterator that serves one saved heart-disease split in fixed-size batches.

    The split is read from 'heart_<dataset>.npz' in the current directory, which
    must hold an 'inputs' array and a 'targets' array. Iterating over an instance
    yields (inputs_batch, one_hot_targets) tuples; once all batches have been
    served the reader rewinds itself, so the same instance can be looped over
    again (one loop per epoch).

    Only full batches are served: rows left over after the last full batch are
    skipped, and a batch_size larger than the split yields no batches at all.
    '''

    # Dataset is a mandatory argument, while the batch_size is optional.
    # If batch_size is omitted (None), the whole split is served as one batch.
    def __init__(self, dataset, batch_size = None):
        # The dataset that loads is one of "train", "validation", "test",
        # e.g. Heart_Data_Reader('train', 5) loads 'heart_train.npz' with a batch size of 5.
        # The archive is opened in a `with` block so the file handle is released
        # as soon as both arrays have been read.
        with np.load('heart_{0}.npz'.format(dataset)) as npz:
            # Inputs are floats, targets are integers. The builtin types are used
            # because the np.float / np.int aliases were removed in NumPy 1.24.
            self.inputs = npz['inputs'].astype(float)
            self.targets = npz['targets'].astype(int)

        # If the batch size is None we are validating or testing, so the data
        # is taken in a single batch.
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        # Number of full batches; a trailing partial batch is not counted.
        self.batch_count = self.inputs.shape[0] // self.batch_size

    # A method to load the next batch
    def __next__(self):
        if self.curr_batch >= self.batch_count:
            # Rewind before signalling the end so the next loop starts from batch 0.
            self.curr_batch = 0
            raise StopIteration()

        # Slice out the rows that belong to the current batch.
        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1

        # One-hot encode the targets: row i gets a 1 in column targets_batch[i].
        classes_num = 2
        targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1

        # Return the inputs batch and the one-hot encoded targets
        return inputs_batch, targets_one_hot

    # A method needed for iterating over the batches
    def __iter__(self):
        return self

### Functions to run neural network and plot results

In [None]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def run_NN(hidden_layer_size = 200,
           batch_size = 140,
           max_epochs = 1000,
           verbose = False,
           learning_rate = 0.0001,
           early_stopping = False):
    '''Train a two-hidden-layer classifier on the saved heart-disease splits.

    The network is: inputs -> ReLU layer -> sigmoid layer -> 2 logits. It is
    trained with Adam on the mean softmax cross-entropy. Training batches come
    from Heart_Data_Reader('train', batch_size) and the validation data from
    Heart_Data_Reader('validation'). After training, the loss curves are drawn
    with plot_NN_results.

    Parameters:
        hidden_layer_size: width of both hidden layers.
        batch_size: rows per training batch (passed to Heart_Data_Reader).
        max_epochs: maximum number of passes over the training batches.
        verbose: when truthy, print loss/accuracy after every epoch.
        learning_rate: learning rate handed to the Adam optimizer.
        early_stopping: when truthy, stop as soon as the validation loss is
            higher than in the previous epoch. Off by default, so all
            max_epochs epochs are run.

    Returns:
        (validation_loss_list, training_loss_list, val_acc_list): three lists
        with one entry per completed epoch.

    Raises:
        ValueError: if the training reader has no full batch to serve.
    '''
    # plot_NN_results reads these three names from module scope, so the
    # histories of THIS run are published there before the plot is drawn
    # (otherwise it would fail on a fresh kernel or plot a previous run).
    global validation_loss_list, training_loss_list, val_acc_list

    # Load the training and validation data, using the class we created.
    train_data = Heart_Data_Reader('train', batch_size)
    validation_data = Heart_Data_Reader('validation')
    if train_data.batch_count == 0:
        raise ValueError(
            f'The training data yields no full batch with batch_size={batch_size}; '
            'choose a smaller batch_size.')

    # One input unit per feature column of the training data.
    input_size = train_data.inputs.shape[1]

    # Output size is 2, (one hot encoded) no-presence or presence of heart disease.
    output_size = 2

    # Reset the default graph, so you can fiddle with the hyperparameters and then rerun the code.
    tf.reset_default_graph()

    # Create the placeholders
    inputs = tf.placeholder(tf.float32, [None, input_size])
    targets = tf.placeholder(tf.int32, [None, output_size])

    # Neural network structure, number of hidden layers = 2
    weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
    biases_1 = tf.get_variable("biases_1", [hidden_layer_size])
    outputs_1 = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

    weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
    biases_2 = tf.get_variable("biases_2", [hidden_layer_size])
    outputs_2 = tf.nn.sigmoid(tf.matmul(outputs_1, weights_2) + biases_2)

    weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
    biases_3 = tf.get_variable("biases_3", [output_size])

    # The output layer produces raw logits; the softmax is applied inside the loss.
    outputs = tf.matmul(outputs_2, weights_3) + biases_3

    # Use the softmax cross entropy loss with logits
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
    mean_loss = tf.reduce_mean(loss)

    # Get a 0 or 1 for every input indicating whether it output the correct answer
    out_equals_target = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
    accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))

    # Optimize with Adam
    optimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(mean_loss)

    validation_loss_list = []
    training_loss_list = []
    val_acc_list = []

    # Create a session; it is closed in the `finally` below so repeated runs
    # do not accumulate open sessions.
    sess = tf.InteractiveSession()
    try:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Validation loss of the previous epoch (only consulted for early stopping).
        prev_validation_loss = float('inf')

        # Create the loop for epochs
        for epoch_counter in range(max_epochs):

            # Initialise epoch loss as 0 float
            curr_epoch_loss = 0.

            # Iterate over the training data
            for input_batch, target_batch in train_data:
                _, batch_loss = sess.run([optimize, mean_loss],
                    feed_dict={inputs: input_batch, targets: target_batch})

                # Record the batch loss into the current epoch loss
                curr_epoch_loss += batch_loss

            # Mean loss over the batches of this epoch
            curr_epoch_loss /= train_data.batch_count

            # Set validation loss and accuracy for the epoch to zero
            validation_loss = 0.
            validation_accuracy = 0.

            # Forward propagate the validation set; the reader was created
            # without a batch size, so it serves the data as a single batch.
            for input_batch, target_batch in validation_data:
                validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
                    feed_dict={inputs: input_batch, targets: target_batch})

            # Print statistics for the current epoch
            if verbose:
                print(f'Epoch {epoch_counter + 1}'
                      f'. Training loss: {curr_epoch_loss:.3f}'
                      f'. Validation loss: {validation_loss:.3f}'
                      f'. Validation accuracy: {validation_accuracy * 100.0:.2f}%')

            validation_loss_list.append(validation_loss)
            training_loss_list.append(curr_epoch_loss)
            val_acc_list.append(validation_accuracy)

            # Optional early stopping: quit once the validation loss starts increasing.
            if early_stopping and validation_loss > prev_validation_loss:
                break

            # Store this epoch's validation loss to be used as previous in the next iteration.
            prev_validation_loss = validation_loss
    finally:
        sess.close()

    print(f'End of training with batch size {batch_size}')

    plot_NN_results(hidden_layer_size, batch_size)

    return validation_loss_list, training_loss_list, val_acc_list

In [None]:
def plot_NN_results(hidden_layer_size, batch_size,
                    validation_losses = None,
                    training_losses = None,
                    validation_accuracies = None):
    '''Plot per-epoch validation and training loss for one training run.

    Parameters:
        hidden_layer_size: hidden layer width, shown in the title only.
        batch_size: training batch size, shown in the title only.
        validation_losses: validation loss per epoch.
        training_losses: training loss per epoch.
        validation_accuracies: validation accuracy per epoch (fractions, not %).

    Any history that is not passed falls back to the module-level list of the
    same role (validation_loss_list, training_loss_list, val_acc_list), which
    is how the two-argument call form has always worked. Prefer passing the
    histories explicitly so the plot cannot show data from an earlier run.
    '''
    if validation_losses is None:
        validation_losses = validation_loss_list
    if training_losses is None:
        training_losses = training_loss_list
    if validation_accuracies is None:
        validation_accuracies = val_acc_list

    max_val_acc = round(max(validation_accuracies)*100,2)
    # Position (0-based, matching the x axis) of the epoch with the lowest validation loss.
    min_val_loss_index = np.argmin(validation_losses)

    plt.plot(range(len(validation_losses)), validation_losses, label = 'Validation')
    plt.plot(range(len(training_losses)), training_losses, label = 'Training')

    plt.xlabel("epochs")
    # Both curves share this axis, so it is labelled as a generic loss.
    plt.ylabel("loss")
    plt.legend()
    # The histories hold one entry per epoch, so the minimum is an epoch, not a batch.
    plt.title(f"""Hidden layer size = {hidden_layer_size}
    Epoch at val loss min = {min_val_loss_index}
    Batch size = {batch_size}
    Max validation accuracy = {max_val_acc}%""")
    plt.show()

In [None]:
# Train with the default hyperparameters and keep the three per-epoch
# histories at module level for later inspection.
validation_loss_list, training_loss_list, val_acc_list = run_NN(hidden_layer_size = 200,
                                                                batch_size = 140,
                                                                max_epochs = 1000,
                                                                verbose = False)

# Close the session that is still installed as default, if there is one.
# Constructing a fresh InteractiveSession just to close it would leave the
# session used for training open.
_open_session = tf.compat.v1.get_default_session()
if _open_session is not None:
    _open_session.close()