## Binary Classification using Simple NN

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os

import tensorflow as tf
from tensorflow import keras
from keras.optimizers import SGD

import os
import tempfile

import matplotlib as mpl
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fix every random number generator used by the notebook to one shared seed
# so that repeated runs produce comparable results.
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): this variable is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)
# Alternative spelling through the TF1 compatibility API:
# tf.compat.v1.set_random_seed(seed_value)

# 5. Configure a new global `tensorflow` session
# Both thread pools are limited to a single thread (see the ConfigProto
# arguments) and the resulting session is registered with the Keras backend.
from tensorflow.compat.v1.keras import backend as K
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)
# Equivalent form without the `K` alias:
# session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# tf.compat.v1.keras.backend.set_session(sess)

%matplotlib inline

## About this dataset

**Age : Age of the patient**

**Sex : Sex of the patient**

**exng : exercise induced angina (1 = yes; 0 = no)**

**caa : number of major vessels (0-3)**

**cp : chest pain type**

    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic

**trtbps : resting blood pressure (in mm Hg)**

**chol : cholesterol in mg/dl fetched via BMI sensor**

**fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)**

**restecg : resting electrocardiographic results**

    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

**thalachh : maximum heart rate achieved**

**output : 0 = less chance of heart attack, 1 = more chance of heart attack**

## Dataset

In [None]:
# Load the heart-attack dataset from the notebook's input directory.
df= pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows to sanity-check column names and values.
df.head()

## Missing Values

In [None]:
# Visual overview of missing values per column; the trailing semicolon keeps
# the notebook from echoing the returned object.
missingno.matrix(df,sparkline=False, figsize=(10,5), fontsize=12);

In [None]:
# Column dtypes and non-null counts.
df.info()

## Little bit of EDA

In [None]:
# Class balance: bincount yields the number of 0-labels and 1-labels in
# that order.
class_counts = np.bincount(df['output'])
neg, pos = class_counts
total = neg + pos
positive_pct = 100 * pos / total
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, positive_pct))

In [None]:
# Count samples per class, ordered by class label (0, 1).
# value_counts() alone orders by frequency, which would put the wrong count
# under the hard-coded hover texts and colours below.
val = df['output'].value_counts().sort_index()

x = val.index    # class labels
y = val.values   # number of samples in each class

# Class 0 in light blue, class 1 highlighted in crimson.
colors = ['lightblue',] * 2
colors[1] = 'crimson'

# Use the hovertext kw argument for hover text
fig = go.Figure(data=[go.Bar(x=x, y=y,
            hovertext=['Count of 0s', 'Count of 1s'], marker_color=colors )])

fig.update_layout(title_text='Count of Classes', height = 400, width = 500)
fig.show()

In [None]:
# Feature columns shown in the KDE grid.
# NOTE(review): 'trtbps' from the dataset description is not listed here --
# confirm whether that is intentional.
data = df[['age', 'sex', 'cp', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall']]

def graph1(name, u, title):
    """Draw a per-class KDE of one feature on the given axes.

    Args:
        name: Column of ``data`` to plot.
        u: Matplotlib axes to draw on.
        title: Title displayed above the subplot.
    """
    # `fill=True` is the current spelling of the deprecated `shade=True`.
    ax = sns.kdeplot(x=data[name], hue=df['output'], ax=u, fill=True, palette='Blues')
    ax.set_title(title, fontsize=13)

    # Remove the frame around the plot.
    for side in ("right", "left", "top", "bottom"):
        ax.spines[side].set_visible(False)

    # No plt.legend() call here: it would act on the *current* axes rather
    # than `u` and replace the hue legend seaborn has already drawn.
    

# 4 x 3 grid with one per-class KDE panel for every feature column.
fig2, ax2 = plt.subplots(4, 3, figsize=(15, 18), gridspec_kw={"wspace" : 0.4, "hspace" : 0.3, "top": 0.95})

# (column, panel title) pairs in row-major order, matching `ax2.flat`.
kde_panels = [
    ("age", 'Age'),
    ("sex", 'Gender'),
    ("cp", 'Chest Pain type'),
    ('chol', 'Cholestoral in mg/dl'),
    ('fbs', 'Fasting blood sugar'),
    # restecg holds resting ECG results, not a blood-sugar measurement.
    ("restecg", 'Resting ECG results'),
    ('thalachh', 'Maximum heart rate'),
    ('exng', 'Exercise induced angina'),
    ("oldpeak", 'Previous Peak'),
    ('slp', 'Slope'),
    ('caa', 'Number of major vessels'),
    ('thall', 'Thal Rate'),
]

for panel_axes, (column, panel_title) in zip(ax2.flat, kde_panels):
    graph1(column, panel_axes, panel_title)

# NOTE(review): set after the grid has been created, so this presumably only
# influences axes created later -- confirm.
plt.rcParams['axes.axisbelow'] = True

## Split and normalize the data

Split the dataset into train, validation, and test sets. The validation set is used during the model fitting to evaluate the loss and any metrics, however the model is not fit with this data. The test set is completely unused during the training phase and is only used at the end to evaluate how well the model generalizes to new data.

In [None]:
# Use a utility from sklearn to split and shuffle your dataset.
# First hold out 20% of all rows as the test set ...
train_df, test_df = train_test_split(df, test_size=0.2)
# ... then carve the validation set out of the remaining training rows only.
# Splitting `df` again here would let test rows leak into train/validation.
train_df, val_df = train_test_split(train_df, test_size=0.2)

# Form np arrays of labels and features.
# pop() removes the 'output' column, so the frames hold features only afterwards.
train_labels = np.array(train_df.pop('output'))
bool_train_labels = train_labels != 0
val_labels = np.array(val_df.pop('output'))
test_labels = np.array(test_df.pop('output'))

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

Normalize the input features using the sklearn StandardScaler. This will set the mean to 0 and standard deviation to 1.

Note: The StandardScaler is only fit using the train_features to be sure the model is not peeking at the validation or test sets.

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then re-used for validation/test. Every scaled value is clipped to
# the range [-5, 5].
scaler = StandardScaler()

train_features = np.clip(scaler.fit_transform(train_features), -5, 5)
val_features = np.clip(scaler.transform(val_features), -5, 5)
test_features = np.clip(scaler.transform(test_features), -5, 5)

# Report the resulting array shapes as a quick sanity check.
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

## Define the model and metrics

Define a function that creates a simple neural network with a densely connected hidden layer, a dropout layer to reduce overfitting, and an output sigmoid layer that returns the probability of the output being 1:

In [None]:
# Metrics tracked by the model. The `name` values are the keys later looked
# up in `history.history` (validation entries carry a `val_` prefix).
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),  # precision-recall curve
]


def make_model(metrics=METRICS, output_bias=None):
    """Build and compile the single-hidden-layer binary classifier.

    Architecture: Dense(100, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 1e-3) and binary cross-entropy.

    Args:
        metrics: Keras metrics handed to ``model.compile``.
            NOTE(review): the default list is a single shared set of metric
            objects for every model built without an explicit ``metrics``
            argument -- confirm that sharing is acceptable.
        output_bias: Optional constant used to initialise the bias of the
            output layer. When None, ``bias_initializer=None`` is passed to
            the layer unchanged.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential([
        keras.layers.Dense(
            100, activation='relu',
            # The input width is taken from the module-level training features.
            input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias),
    ])

    model.compile(
        # `learning_rate` replaces the deprecated `lr` keyword.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

## Baseline model

In [None]:
# Training hyper-parameters for the baseline run.
EPOCHS = 10
BATCH_SIZE = 1

# Watch validation accuracy. The metric is registered under the name
# 'accuracy', so its validation counterpart is 'val_accuracy'
# ('val_acc' is never logged and the callback could not track it).
# NOTE(review): with patience equal to EPOCHS the callback cannot stop the
# run early -- lower `patience` or raise EPOCHS if early stopping is wanted.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

# Build once just to inspect the architecture.
model = make_model()
model.summary()

## Train the model

In [None]:
%%time

# Rebuild the model and fit it on the training split, scoring the validation
# split alongside.
model = make_model()
baseline_history = model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(val_features, val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

## Check training history

Let's produce plots of the model's accuracy and loss on the training and validation set.

In [None]:
def plot_metrics(history):
    """Plot training vs. validation curves in a 2 x 2 grid.

    One panel each for loss, accuracy, precision and recall; the training
    curve is solid and the validation curve dashed.

    Args:
        history: Object exposing ``epoch`` and a ``history`` mapping that
            holds each metric and its ``val_``-prefixed counterpart.
    """
    mpl.rcParams['figure.figsize'] = (18, 10)

    tracked = ('loss', 'accuracy', 'precision', 'recall')
    for position, key in enumerate(tracked, start=1):
        plt.subplot(2, 2, position)

        plt.plot(history.epoch, history.history[key],
                 label='Train', linewidth=2, color='skyblue')
        plt.plot(history.epoch, history.history['val_' + key],
                 linestyle="--", label='Validation', linewidth=2, color='gray')

        plt.xlabel('Epoch')
        plt.ylabel(key.replace("_", " ").capitalize())
        plt.legend(fontsize='large')

In [None]:
# Learning curves of the baseline training run.
plot_metrics(baseline_history)

In [None]:
# Model outputs for the training and test features, used by the plots below.
train_predictions_baseline, test_predictions_baseline = (
    model.predict(split_features, batch_size=BATCH_SIZE)
    for split_features in (train_features, test_features)
)

## Training Accuracy: 0.8967

In [None]:
# Evaluate on the training split and list every value next to its metric name.
train_results = model.evaluate(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    verbose=0,
)

named_train_results = zip(model.metrics_names, train_results)
for name, value in named_train_results:
    print(name, ': ', value)

## Validation Accuracy: 0.8361

In [None]:
# Evaluate on the validation split and list every value next to its metric name.
val_results = model.evaluate(
    val_features,
    val_labels,
    batch_size=BATCH_SIZE,
    verbose=0,
)

named_val_results = zip(model.metrics_names, val_results)
for name, value in named_val_results:
    print(name, ': ', value)

## Test Accuracy: 0.9016

In [None]:
# Evaluate on the held-out test split and list every value next to its metric name.
test_results = model.evaluate(
    test_features,
    test_labels,
    batch_size=BATCH_SIZE,
    verbose=0,
)

named_test_results = zip(model.metrics_names, test_results)
for name, value in named_test_results:
    print(name, ': ', value)

## Let's look at the number of False Positives and False Negatives

In [None]:
def plot_cm(labels, predictions, p=0.5):
    """Plot the confusion matrix and print its four cells.

    Args:
        labels: True binary labels (0 or 1).
        predictions: Predicted scores; values above ``p`` count as class 1.
        p: Decision threshold applied to ``predictions``.
    """
    # Flatten so that column-shaped model output is accepted, and pin the
    # label order so the matrix is always 2 x 2 -- even if one class is
    # missing from the inputs -- which keeps the indexing below valid.
    predicted_positive = np.asarray(predictions).ravel() > p
    cm = confusion_matrix(labels, predicted_positive, labels=[0, 1])

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # Rows are actual classes, columns are predicted classes.
    print('True Negatives (no heart-attack as no heart-attack): ', cm[0][0])
    print('False Positives (no heart-attack as heart-attack): ', cm[0][1])
    print('False Negatives (heart-attack as no heart-attack): ', cm[1][0])
    print('True Positives (heart-attack as heart-attack): ', cm[1][1])

plot_cm(test_labels, test_predictions_baseline)

### ROC

In [None]:
def plot_roc(name, labels, predictions, **kwargs):
    """Add one ROC curve, with both axes in percent, to the current figure.

    Args:
        name: Legend label for the curve.
        labels: True binary labels.
        predictions: Predicted scores for the positive class.
        **kwargs: Extra styling forwarded to ``plt.plot``.
    """
    mpl.rcParams['figure.figsize'] = (8, 8)

    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(labels, predictions)
    plt.plot(100 * false_pos_rate, 100 * true_pos_rate,
             label=name, linewidth=2, **kwargs)

    plt.xlabel('False positives [%]', fontsize=16)
    plt.ylabel('True positives [%]', fontsize=16)

# Overlay the training and test curves on a single figure.
plot_roc("Train Baseline", train_labels, train_predictions_baseline, color='gray')
plot_roc("Test Baseline", test_labels, test_predictions_baseline, color='black', linestyle='--')
plt.legend(loc='lower right', fontsize='large')
plt.show()

### AUPRC

Area under the interpolated precision-recall curve, obtained by plotting (recall, precision) points for different values of the classification threshold. Depending on how it's calculated, PR AUC may be equivalent to the average precision of the model.

In [None]:
def plot_prc(name, labels, predictions, **kwargs):
    """Add one precision-recall curve to the current figure.

    Args:
        name: Legend label for the curve.
        labels: True binary labels.
        predictions: Predicted scores for the positive class.
        **kwargs: Extra styling forwarded to ``plt.plot``.
    """
    mpl.rcParams['figure.figsize'] = (8, 8)
    precision, recall, _ = sklearn.metrics.precision_recall_curve(labels, predictions)
    # Recall goes on the x-axis and precision on the y-axis so that the data
    # agrees with the axis labels below.
    plt.plot(recall, precision, label=name, linewidth=2, **kwargs)
    plt.xlabel('Recall', fontsize = 16)
    plt.ylabel('Precision', fontsize = 16)

# Overlay the training and test curves on a single figure.
plot_prc("Train Baseline", train_labels, train_predictions_baseline,  color='gray')
plot_prc("Test Baseline", test_labels, test_predictions_baseline, color='black', linestyle='--')
plt.legend(loc='lower left', fontsize = 'large')
plt.show()

In [None]:
# Threshold the test predictions at 0.5 to get 0/1 classes, attach them to
# the test features and write everything to disk.
predicted_classes = np.where(np.array(test_predictions_baseline) > 0.5, 1, 0)
test_df['Predicted_output'] = list(predicted_classes[:, 0])
test_df.to_csv('submission.csv', index=False)

#### Will work on improving the Accuracy and Recall score in the next notebook soon! Let me know what you think about this one :)

## References

*  [True vs. False and Positive vs. Negative](https://developers.google.com/machine-learning/crash-course/classification/true-false-positive-negative)
*  [Accuracy](https://developers.google.com/machine-learning/crash-course/classification/accuracy)
*  [Precision and Recall](https://developers.google.com/machine-learning/crash-course/classification/precision-and-recall)
*  [ROC-AUC](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc)
*  [Relationship between Precision-Recall and ROC Curves](https://www.biostat.wisc.edu/~page/rocpr.pdf)

<img src= "https://i.pinimg.com/originals/67/fb/22/67fb22aa0142b62effc23870f80cf39d.jpg" alt ="Titanic" style='width: 250px;'>