## Binary Classification using TPOT Optimized Model

## Highlights!

**1. Classification metric visualization using plot_metric**

**2. TPOT's stacking estimator**

In [None]:
!pip install plot_metric

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# import the usual stuff
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

# import TPOT and sklearn stuff
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import sklearn.metrics

%matplotlib inline

## About this dataset

**age : Age of the patient**

**sex : Sex of the patient**

**exng : exercise induced angina (1 = yes; 0 = no)**

**caa : number of major vessels (0-3)**

**cp : chest pain type**

    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic

**trtbps : resting blood pressure (in mm Hg)**

**chol : cholesterol in mg/dl fetched via BMI sensor**

**fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)**

**restecg : resting electrocardiographic results**

    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

**thalachh : maximum heart rate achieved**

**output : 0 = less chance of heart attack, 1 = more chance of heart attack**

## Dataset

In [None]:
# Load the heart-attack dataset from its CSV file into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input mount —
# confirm/adjust when running the notebook elsewhere.
df= pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Preview the first rows to check the column names and value formats.
df.head()

## Missing Values

In [None]:
# Draw missingno's matrix view of df to spot missing values (sparkline disabled).
# The trailing ';' keeps the returned object's repr out of the cell output.
missingno.matrix(df,sparkline=False, figsize=(10,5), fontsize=12);

In [None]:
# Summarise each column's dtype and non-null count as a second check for missing data.
df.info()

## Little bit of EDA

In [None]:
# Shared styling for every box trace (identical across all columns).
BOX_COLOR = 'rgb(8,81,156)'
OUTLIER_COLOR = 'rgba(219, 64, 82, 0.6)'
GRID_COLS = 4

# (column in df, legend label) pairs. The order defines the grid position:
# traces are placed left-to-right, top-to-bottom, GRID_COLS per row.
box_specs = [
    ('age', "Age of the patient"),
    ('sex', "Sex of the patient"),
    ('cp', "Chest Pain type"),
    ('trtbps', "Resting blood pressure"),
    ('chol', "Cholestoral in mg/dl"),
    ('fbs', "Fasting blood sugar"),
    ('restecg', "Resting electrocardiographic"),
    ('thalachh', "Max heart rate"),
    ('exng', "Exercise induced angina"),
    ('oldpeak', "ST depression"),
    ('slp', "Slope of the peak"),
    ('caa', "No of major vessels "),
    ('thall', "Thal Rate"),
    ('output', "Chance of heart attack"),
]

# Ceiling division: enough rows to hold every spec (4 rows for the 14 columns above).
grid_rows = -(-len(box_specs) // GRID_COLS)
fig = make_subplots(rows=grid_rows, cols=GRID_COLS)

for position, (column, label) in enumerate(box_specs):
    # divmod gives zero-based (row, col); plotly subplot indices are one-based.
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        go.Box(
            y=df[column],
            name=label,
            boxpoints='suspectedoutliers',  # only suspected outliers
            marker=dict(
                color=BOX_COLOR,
                outliercolor=OUTLIER_COLOR,
                line=dict(
                    outliercolor=OUTLIER_COLOR,
                    outlierwidth=2)),
            line_color=BOX_COLOR,
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(height=1200, width=1200, title_text="Boxplots of all the columns")
fig.show()

### We will not take care of the outliers in this version. That will come soon--stay tuned!

# Model Creation with [TPOT](http://epistasislab.github.io/tpot/)

<img src= "https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-logo.jpg" alt ="TPOT logo" style='width: 200px;'>

**Note: The total number of pipelines is equal to POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE.**

In [None]:
%%time

target = df.output
data = df.drop('output', axis = 1)

# create train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.75, test_size=0.25, random_state=34)

scores = []
_pipes = []

tpot = TPOTClassifier(verbosity=3, 
                      scoring='accuracy', 
                      random_state=23, 
                      periodic_checkpoint_folder="tpot.txt", 
                      n_jobs=-1, 
                      generations=10, 
                      population_size=30)

# run three iterations and time them
for x in range(3):

    tpot.fit(X_train, y_train)
    _pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_pipeline.py')

print('Scores:', scores)   
print('Best pipelines:', _pipes)

## Score from the TPOT Stacked Models

In [None]:
# One held-out score per TPOT run, in run order.
print(scores)

In [None]:
# One bar per TPOT run. The values come from tpot.score(), which uses the
# scoring='accuracy' set on the TPOTClassifier, so the chart is titled as
# accuracy rather than recall.
# Sizing the colour list from `scores` keeps it valid if the number of runs changes.
colors = ['skyblue'] * len(scores)

fig = go.Figure(data=[go.Bar(
    x=list(range(len(scores))),
    y=scores,
    marker_color=colors # marker color can be a single color value or an iterable
)])
# update_layout returns the figure, so leaving it as the last expression renders the chart.
fig.update_layout(title_text='Accuracy scores of the Models', width = 500, height = 500)

In [None]:
# export() is called without a file name here, so its return value is what gets printed.
# NOTE(review): presumably this is the Python source of the fitted pipeline — confirm against the TPOT docs.
print(tpot.export())

## Creating the Stacking Architecture from the Pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# Each pipeline stage is built separately so the architecture reads top to bottom.

# Two stacking stages: an ExtraTrees estimator followed by a Bernoulli naive Bayes estimator.
stacked_trees = StackingEstimator(
    estimator=ExtraTreesClassifier(
        bootstrap=True,
        criterion="gini",
        max_features=0.2,
        min_samples_leaf=1,
        min_samples_split=10,
        n_estimators=100,
    )
)
stacked_bayes = StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=False))

# Recursive feature elimination driven by an ExtraTrees estimator.
feature_selector = RFE(
    estimator=ExtraTreesClassifier(
        criterion="gini",
        max_features=0.15000000000000002,
        n_estimators=100,
    ),
    step=0.1,
)

# Final classifier that produces the predictions.
final_classifier = ExtraTreesClassifier(
    bootstrap=True,
    criterion="gini",
    max_features=0.6000000000000001,
    min_samples_leaf=6,
    min_samples_split=18,
    n_estimators=100,
)

model = make_pipeline(
    stacked_trees,
    stacked_bayes,
    feature_selector,
    StandardScaler(),
    final_classifier,
)

# Pin random_state on every step of the pipeline so refits are repeatable.
set_param_recursive(model.steps, 'random_state', 23)

model.fit(X_train, y_train)

In [None]:
from sklearn.tree import plot_tree
from sklearn.metrics import (
    recall_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
)

# Predict on both splits so train and test accuracy can be compared side by side.
pred = model.predict(X_test)
train_pred = model.predict(X_train)

test_accuracy = accuracy_score(y_test, pred)
print("Accuracy score of the test set: {}".format(test_accuracy))

train_accuracy = accuracy_score(y_train, train_pred)
print("Accuracy score of the train set: {}".format(train_accuracy))

### Let's drop the two stacking estimators from the pipeline and see if that helps us get rid of over-fitting--

In [None]:
# Reduced pipeline: feature selection, scaling and the final classifier only
# (no StackingEstimator stages).
feature_selector = RFE(
    estimator=ExtraTreesClassifier(
        criterion="gini",
        max_features=0.15000000000000002,
        n_estimators=100,
    ),
    step=0.1,
)
final_classifier = ExtraTreesClassifier(
    bootstrap=True,
    criterion="gini",
    max_features=0.6000000000000001,
    min_samples_leaf=6,
    min_samples_split=18,
    n_estimators=100,
)

model = make_pipeline(feature_selector, StandardScaler(), final_classifier)

# Pin random_state on every step of the pipeline so refits are repeatable.
set_param_recursive(model.steps, 'random_state', 23)

model.fit(X_train, y_train)

In [None]:
from sklearn.tree import plot_tree
from sklearn.metrics import (
    recall_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
)

# Predict once per split; every metric below reuses these predictions.
pred = model.predict(X_test)
train_pred = model.predict(X_train)

# Macro-averaged F1, test then train.
test_f1 = f1_score(y_test, pred, average='macro')
train_f1 = f1_score(y_train, train_pred, average='macro')
print("Macro Avg F1 score of the test set: {}".format(test_f1))
print("Macro Avg F1 score of the train set: {}".format(train_f1))
print('.')

# Macro-averaged precision.
test_precision = precision_score(y_test, pred, average="macro")
train_precision = precision_score(y_train, train_pred, average="macro")
print("Macro Avg Precision score of the test set: {}".format(test_precision))
print("Macro Avg Precision score of the train set: {}".format(train_precision))
print('.')

# Macro-averaged recall.
test_recall = recall_score(y_test, pred, average="macro")
train_recall = recall_score(y_train, train_pred, average="macro")
print("Macro Avg Recall score of the test set: {}".format(test_recall))
print("Macro Avg Recall score of the train set: {}".format(train_recall))
print('.')

# Plain accuracy, for comparison with the stacked pipeline's numbers.
test_accuracy = accuracy_score(y_test, pred)
train_accuracy = accuracy_score(y_train, train_pred)
print("Accuracy score of the test set: {}".format(test_accuracy))
print("Accuracy score of the train set: {}".format(train_accuracy))

#### Looks much, much better!

In [None]:
from pandas import DataFrame
from plot_metric.functions import BinaryClassification
from matplotlib import rcParams

# Class probabilities for the test set; column index 1 is kept as the score.
# NOTE(review): presumably index 1 corresponds to output == 1 — confirm via model.classes_.
test_proba = model.predict_proba(X_test)
y_pred = test_proba[:, 1]

# Visualisation with plot_metric
bc = BinaryClassification(y_test, y_pred, labels=["Class 1", "Class 2"])

# Figures: one 8x6 figure per plot, shown in a fixed order.
rcParams['figure.figsize'] = (8, 6)
default_plots = (
    bc.plot_roc_curve,
    bc.plot_precision_recall_curve,
    bc.plot_class_distribution,
    bc.plot_confusion_matrix,
)
for draw_plot in default_plots:
    draw_plot()
    plt.show()

# Confusion matrix once more, this time normalised.
bc.plot_confusion_matrix(normalize=True)
plt.show()

In [None]:
# Text report from plot_metric for the same test-set predictions plotted above.
bc.print_report()

### Lemme know what you think about this one!

<img src= "https://i.pinimg.com/originals/67/fb/22/67fb22aa0142b62effc23870f80cf39d.jpg" alt ="Titanic" style='width: 250px;'>