In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height:50px">
    <center><h1 style="display:block; padding:7px">TABLE OF CONTENTS 📚</h1></center>
</div>

#####  **1. IMPORTING LIBRARIES**
##### **2. LOADING DATA**
##### **3. DATA PREPROCESSING**
##### **4. DATA ANALYSIS**
##### **5. MODEL BUILDING**
##### **6. CONCLUSIONS**

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height:50px">
    <center><h1 style="display:block; padding:7px">Importing required libraries 📚</h1></center>
</div>


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Seed NumPy's global random number generator so repeated runs draw the same random numbers.
np.random.seed(0)

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height:50px">
    <center><h1 style="display:block; padding:7px">LOADING DATA 📚</h1></center>
</div>


In [None]:
# Read the heart-disease CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/heart-disease-classification/heart.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Since the dataset is small, drop the age column as it will increase variability and add noise.
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so this cell can be re-run without raising a KeyError.
data = data.drop(columns=['age'], errors='ignore')

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().T

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height:250px">
    <center><h1 style="display:block; padding:7px">DATA ANALYSIS 📚</h1></center>
    <left><h2 style="display:block; padding:1px">The analysis consists of:</h2></left>
    <left><h3 style="display:block">* Count Plot</h3></left>
    <left><h3 style="display:block">* Correlation Heat Map</h3></left>
    <left><h3 style="display:block; padding:1px">* Box Plot</h3></left>
</div>

In [None]:
# Start with the target: count the rows per class to see whether the data is imbalanced.
target_palette = ["#f7b2b0", "#8f7198", "#003f5c"]
sns.countplot(x="target", data=data, palette=target_palette)

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
# `cmap` is kept as a notebook-level name because the confusion-matrix cell reuses it.
cmap = sns.diverging_palette(h_neg=250, h_pos=10, s=80, l=55, n=9, as_cmap=True)
corrmat = data.corr()

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrmat, annot=True, cmap=cmap, center=0, ax=ax)

**slope Vs oldpeak by target**

In [None]:
# Distribution of oldpeak for each slope value, split by target class.
sns.boxplot(x="slope", y="oldpeak", hue="target", data=data, palette='winter')
plt.show()

**exang Vs max_hr by target**

In [None]:
# Distribution of max_hr for each exang value, split by target class.
sns.boxplot(x="exang", y="max_hr", hue="target", data=data, palette='winter')
plt.show()

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height:50px">
    <center><h1 style="display:block; padding:7px">MODEL SELECTION AND BUILDING</h1></center>
</div>


In this section we will:

1. Set up features(X) and target(Y)
2. Scale the features
3. Split training and test sets
4. Model selection
5. Hyperparameter tuning

In [None]:
# Separate the label column (y) from the predictor columns (X).
y = data["target"]
X = data.drop(columns=["target"])

# Standardise every feature and rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the training features — consider fitting on the
# training split only.
s_scaler = preprocessing.StandardScaler()
col_names = list(X.columns)
X_df = pd.DataFrame(s_scaler.fit_transform(X), columns=col_names)
X_df.describe().T

In [None]:
# Visual check of the scaled features: one boxen plot per column.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxenplot(data=X_df, palette='winter', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# A quick model selection process.
# Each candidate classifier is wrapped in a Pipeline so that all of them share the
# same fit/predict interface.
pipeline_lr = Pipeline([('lr_classifier', LogisticRegression(random_state=42))])

pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier(random_state=42))])

# random_state is pinned like the other seeded models so that re-running this cell
# rebuilds the same forest instead of depending on the current global RNG state.
pipeline_rf = Pipeline([('rf_classifier', RandomForestClassifier(random_state=42))])

pipeline_svc = Pipeline([('sv_classifier', SVC())])

# List of all the pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_svc]

# Display names keyed by each pipeline's position in `pipelines`
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest', 3: "SVC"}


# Fit every pipeline on the training split; a later cell predicts with the fitted
# random-forest pipeline.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# 10-fold cross-validation on the training split; print the mean score per model
cv_results_accuracy = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    cv_results_accuracy.append(cv_score)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

**Random Forest has the highest cross-validated accuracy of the four models, so let us tune a better random forest with grid-search CV. First, let's see how the untuned forest performs on the test set.**

In [None]:
# Accuracy of the untuned random-forest pipeline on the held-out test split.
pred_rfc = pipeline_rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred_rfc)
print(accuracy)

In [None]:
# Dictionary of candidate values that GridSearchCV will try in every combination
parameters = {
    'n_estimators': [100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'criterion': ['gini', 'entropy'],
}

# Search the grid with 3-fold cross-validation on the training split.
# The base estimator is seeded so that score differences between candidates come
# from the hyperparameters rather than from run-to-run randomness of the forest.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=parameters,
    cv=3,
)
CV_rfc.fit(X_train, y_train)

# Best parameter combination found by the search
CV_rfc.best_params_

In [None]:
# Train a random forest on the training split using the best grid-search parameters.
# random_state is pinned so the reported scores are reproducible; best_params_ is
# unpacked last so it takes precedence if the grid ever includes random_state.
RF_model = RandomForestClassifier(**{"random_state": 42, **CV_rfc.best_params_})
RF_model.fit(X_train, y_train)
# Testing the model on the test set
predictions = RF_model.predict(X_test)
acccuracy = accuracy_score(y_test, predictions)
acccuracy

In [None]:
# Summary metrics for the tuned random forest on the test split.
acccuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions, average="weighted")
precision = precision_score(y_test, predictions, average="weighted")
# The result is stored as `f1`, not `f1_score`: rebinding the imported function's
# name to a float made this cell raise a TypeError when run a second time.
# Weighted averaging matches recall/precision above; micro-averaged F1 on
# single-label data is just the accuracy again.
f1 = f1_score(y_test, predictions, average="weighted")

print("********* Random Forest Results *********")
print("Accuracy    : ", acccuracy)
print("Recall      : ", recall)
print("Precision   : ", precision)
print("F1 Score    : ", f1)

In [None]:
# Text report of the per-class metrics for the tuned model's test-set predictions.
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix, with every cell shown as its share of all test samples.
# Relies on `cmap` defined in the correlation-heatmap cell.
fig, ax = plt.subplots(figsize=(12, 8))
cf_matrix = confusion_matrix(y_test, predictions)
sns.heatmap(cf_matrix / cf_matrix.sum(), cmap=cmap, annot=True, annot_kws={'size': 15}, ax=ax)

<div style="background-color: pink; color: #1434A4; border-radius: 32px; height: 50px">
    <center><h1 style="display:block; padding:7px">I hope you like this!✌️</h1></center>
</div>
