In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Suppress all warnings; note that this also hides deprecation notices
# emitted by the libraries used in the rest of the notebook.
warnings.filterwarnings("ignore")

In [None]:
# Relative path to the heart dataset CSV that the whole analysis is based on.
heart_csv_path = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
df = pd.read_csv(heart_csv_path)

Age : Age of the patient

Sex : Sex of the patient

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalach : maximum heart rate achieved

output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
# Show the first rows transposed, so each column of the frame reads as a row.
df.head().transpose()

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

In [None]:
# Number of rows per class of the outcome column.
df["output"].value_counts()

Quite balanced Outputs.

In [None]:
# Same per-column missing-value check as earlier in the notebook.
pd.isnull(df).any()

No Null Values.

In [None]:
# Draw one joint plot per feature against the outcome column.
feature_columns = [name for name in df.columns if name != "output"]
for column in feature_columns:
    sns.jointplot(data=df, x=column, y="output", color="purple")

In [None]:
## Correlation

# Pairwise correlation between the columns, rendered as a colour-graded table.
corr = df.corr(method="pearson")
corr.style.background_gradient(cmap="inferno")

## Setting Up for ML

In [None]:
# Convert the frame to one array, then separate the features (every column
# except the last) from the label vector (the last column).
array = df.to_numpy()
end = array.shape[1] - 1

X, y = array[:, :end], array[:, end]

In [None]:
## Import ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score\

from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features using statistics learned from the training rows only.
# Fitting the scaler on the full matrix would let the validation rows influence
# the mean/variance applied to the training data (data leakage).
scaler = StandardScaler()

# Same test_size / random_state as the train/validation split performed on X
# below, so these are exactly the rows that end up in X_train. Keep the two
# calls in sync.
X_fit_rows, _ = train_test_split(X, test_size=0.3, random_state=1)
scaler.fit(X_fit_rows)

X = scaler.transform(X)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
## Measuring Model Accuracy
def evaluate_model(clf, X_test, y_test, model_name, oversample_type):
    """Print an evaluation report for a fitted classifier and return its scores.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels corresponding to ``X_test``.
    model_name : label printed in the report header and echoed in the result.
    oversample_type : label describing the data variant; printed and echoed.

    Returns
    -------
    list
        ``[model_name, oversample_type, f1, recall, precision]``, where the
        three scores are computed with ``average="weighted"``.
    """
    divider = '--------------------------------------------'
    print(divider)
    print('Model ', model_name)
    print('Data Type ', oversample_type)

    predictions = clf.predict(X_test)

    # All three metrics share the same call signature, so compute them in one pass.
    f1, recall, precision = (
        scorer(y_test, predictions, average="weighted")
        for scorer in (f1_score, recall_score, precision_score)
    )

    print('Confusion Matrix')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report')
    print(classification_report(y_test, predictions))
    print('Returns the f1 Score, Recall Score and Precision Score')
    return [model_name, oversample_type, f1, recall, precision]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

In [None]:
### Models
# Candidate classifiers as (display name, estimator) pairs, each with a fixed seed.
# LinearDiscriminantAnalysis is not included in this comparison.
# NOTE: "CART" is configured identically to "DecisionTrees".
models = [
    ('DecisionTrees', DecisionTreeClassifier(random_state=42)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
    ('LinearSVC', LinearSVC(random_state=0)),
    ('AdaBoostClassifier', AdaBoostClassifier(random_state=42)),
    ('SGD', SGDClassifier(random_state=42)),
    ("CART", DecisionTreeClassifier(random_state=42)),
]

In [None]:
### Evaluate Models
# Score every candidate on the training data with the same shuffled,
# stratified 10-fold splitter; the fixed seed yields identical folds per model.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Mean accuracy across folds, with the standard deviation in parentheses.
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
## Plot Performance
# Box plot of the per-fold accuracies collected for each model.
fig, ax = plt.subplots()
ax.boxplot(results)
# Label the ticks explicitly rather than through boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels=`.
ax.set_xticklabels(names, rotation=45)
ax.set_title("Algorithm Comparison")
ax.set_ylabel("Cross-validated accuracy")
plt.show()

The RandomForestClassifier model performs best here. In the next section, we explore how to get the most out of this model by tuning its hyper-parameters.

### Hyper-parameter Tuning

In [None]:
# Search space for the random-forest grid search.
# 'auto' is left out of max_features: for RandomForestClassifier it was only an
# alias of 'sqrt' (so it duplicated every candidate) and scikit-learn >= 1.3
# rejects it.
params_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 50, None],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 500],
}

In [None]:
# Exhaustive grid search over `params_grid`, scored by accuracy with a
# shuffled, stratified 10-fold splitter (fixed seed for repeatable folds).
rfc = RandomForestClassifier(random_state=42)

kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

rfc_cv = GridSearchCV(
    estimator=rfc,
    param_grid=params_grid,
    scoring="accuracy",
    cv=kfold,
    verbose=0,
)
# `params_grid` is deliberately not reassigned after fitting, so inspecting it
# later always shows the grid that was actually searched.
rfc_cv.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
rfc_cv.best_params_

In [None]:
# Inspect the candidate values listed for `bootstrap` in the grid.
params_grid['bootstrap']

In [None]:
### PIPELINE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Random forest preceded by min-max scaling, to be compared with the same
# forest trained without that step.
# - max_features="sqrt" replaces the "auto" alias, which meant the same thing
#   for classifiers and is rejected by scikit-learn >= 1.3.
# - random_state pins the forest so the comparison is repeatable across runs.
model = make_pipeline(
    MinMaxScaler(),
    RandomForestClassifier(
        bootstrap=True,
        max_depth=10,
        max_features="sqrt",
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=20,
        random_state=42,
    ),
)

model.fit(X_train, y_train)

In [None]:
# Predict the validation labels with the fitted pipeline.
y_pred = model.predict(X_val)

In [None]:
# Report validation metrics for the min-max-scaled pipeline.
# The model name is passed as text so the printed header and the returned
# record carry a readable label instead of a class object.
evaluate_model(model, X_val, y_val, "RandomForestClassifier", "min_max_SCALED")

In [None]:
# Same tree hyper-parameters as the forest inside the pipeline, but without
# the MinMaxScaler step. max_features="sqrt" is the explicit form of the old
# "auto" alias; random_state makes the result repeatable.
rfc_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    max_features="sqrt",
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=20,
    random_state=42,
)

# The forest must be fitted and then evaluated itself; evaluating `model` here
# would only re-score the min-max pipeline and repeat its numbers.
rfc_model.fit(X_train, y_train)

# NOTE: X_train / X_val were already standardised earlier in the notebook, so
# "UN_SCALED" here means "no MinMaxScaler applied".
evaluate_model(rfc_model, X_val, y_val, "RandomForestClassifier", "UN_SCALED Dataset")

**The model trained on the unscaled dataset achieves higher accuracy than the one trained with the min-max scaler applied.**