In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**A heart attack occurs when an artery supplying your heart with blood and oxygen becomes blocked. Fatty deposits build up over time, forming plaques in your heart's arteries. If a plaque ruptures, a blood clot can form and block your arteries, causing a heart attack.**

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots
import matplotlib

import warnings

# Install a catch-all "ignore" filter so library warnings do not clutter cell output.
warnings.simplefilter("ignore")
# Smoke check: with the filter above in place this warning is swallowed silently.
warnings.warn("this will not show")

print('Libraries imported successfully..!!')

# About dataset

   - **Age** : Age of the patient

  - **Sex** : Sex of the patient

  - **exang**: exercise induced angina (1 = yes; 0 = no)

  - **ca**: number of major vessels (0-3)

  - **cp** : Chest pain type

    - **Value 1**: typical angina

    - **Value 2**: atypical angina

    - **Value 3**: non-anginal pain

    - **Value 4**: asymptomatic

  - **trtbps** : resting blood pressure (in mm Hg)

  - **chol** : cholesterol in mg/dl fetched via BMI sensor

  - **fbs** : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

  - **restecg** : resting electrocardiographic results

      - **Value 0**: normal

      - **Value 1**: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

      - **Value 2**: showing probable or definite left ventricular hypertrophy by Estes' criteria

  - **thalachh** : maximum heart rate achieved

  - **output** : 0 = less chance of heart attack 1 = more chance of heart attack

## Reading the Dataset

In [None]:
# Load heart.csv from the Kaggle input bundle into the working DataFrame `df`
df = pd.read_csv(
    filepath_or_buffer='../input/heart-attack-analysis-prediction-dataset/heart.csv'
)

In [None]:
# Preview the first five rows of the data
df.head(n=5)

## Checking the shape of DataFrame

In [None]:
# Row count and column count of the freshly loaded frame
print('Number of rows are', len(df), 'and number of columns are ', len(df.columns))

## Pandas Profiling Report

In [None]:
# !pip install pandas-profiling==2.7.1 

In [None]:
# Build the pandas-profiling report for `df`; the html style option requests
# the full-width layout.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    html={'style': {'full_width': True}},
)

In [None]:
# Display the profiling report inline in the notebook output
profile.to_notebook_iframe()

## Checking the number of unique values in each column

In [None]:
# Number of distinct non-null values in every column, shown as a one-column table.
# DataFrame.nunique() skips NaN by default, just like the value_counts()-based
# count it replaces, and it avoids rebinding the built-in name `dict`, which
# would break any later `dict(...)` call in this kernel.
df.nunique().to_frame(name="unique count")

## Info of Dataset

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

## Checking for duplicate rows

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged)
df.loc[df.duplicated()]

## Removing the duplicates

In [None]:
# Remove repeated rows, keeping the first occurrence of each.
# NOTE: this mutates `df` in place, so outputs displayed by earlier cells no
# longer reflect the current contents of `df`.
df.drop_duplicates(inplace=True)

## Checking new shape

In [None]:
# Row count and column count after the duplicate rows were removed
print('Number of rows are', len(df), 'and number of columns are ', len(df.columns))

## List of continuous and categorical features and output feature

In [None]:
# Split the predictor columns by cardinality: more than 5 distinct values is
# treated as continuous, 5 or fewer as categorical.
# The target is excluded explicitly; being binary it would otherwise be
# reported as a categorical *feature*.
target_feature = ["output"]
cont_features = [i for i in df.columns
                 if i not in target_feature and df[i].nunique() > 5]
cat_features = [i for i in df.columns
                if i not in target_feature and df[i].nunique() <= 5]
print("The categorial cols are : ", cat_features)
print("The continuous cols are : ", cont_features)
print("The target variable is :  ", target_feature)

## Describing the Dataset

In [None]:
# Summary statistics, transposed so that each column of `df` becomes a row
df.describe().transpose()

## Checking null values

In [None]:
# Missing-value count per column (isna is the same method as isnull)
df.isna().sum()

## Checking how many classes in target variable 

In [None]:
# Number of rows in each target class
df.loc[:, 'output'].value_counts()

## Computing the correlation matrix

In [None]:
# Pairwise correlation between all columns, displayed transposed
df.corr().transpose()

# Exploratory Data Analysis

In [None]:
# Plot-friendly copy of the data: the 0/1 target is replaced by a text label
# so plotly legends read naturally. `df` itself is left untouched.
ot = {0: "Less chance of HA", 1: 'More chance of HA'}
df1 = df.copy()
df1["output"] = df1["output"].map(lambda code: ot[code])

In [None]:
# Bar chart of how many rows fall in each target class
ax = sns.countplot(x='output', data=df, palette=['#85bfdc', '#f64c72'])
ax.set_xticklabels(['less chance of heart attack', 'more chance of heart attack'])
ax.set_title("Target Distribution")
# Hide the tick marks on the x axis; the text labels are enough
ax.tick_params(bottom=False)

In [None]:
# Age distribution split by target label, with a box plot in the top margin
fig = px.histogram(
    df1,
    x="age",
    color="output",
    marginal="box",
    hover_data=df.columns,
    color_discrete_sequence=['#f64c72', '#85bfdc'],
)
fig.update_layout(title="Heart attack chance corresponding to age")
fig.show()

> Surprisingly, in this data the mean age is lower for the group with a higher chance of heart attack.
> 

In [None]:
# Density curves of resting blood pressure (`trtbps`, in mm Hg) for each target class.
# The title and axis label previously said "resting heart rate", which is not
# what this column measures.
more = df.loc[df['output'] == 1, 'trtbps']
less = df.loc[df['output'] == 0, 'trtbps']
fig = ff.create_distplot(
    [less, more],
    ['less chance of heart attack', 'more chance of heart attack'],
    show_hist=False,  # curves only, no histogram bars
    colors=['#85bfdc', '#f64c72'],
)
fig.update_layout(
    title="Heart Attack chance corresponding to resting blood pressure",
    xaxis_title="Resting blood pressure (mm Hg)",
)
fig.show()

>Some features, such as resting blood pressure (trtbps), show little relationship with the chance of heart attack.


In [None]:
# Histogram plus density curve of maximum heart rate achieved, one per target class
less = df.loc[df['output'] == 0, 'thalachh']
more = df.loc[df['output'] == 1, 'thalachh']
fig = ff.create_distplot(
    [less, more],
    ['less chance of heart attack', 'more chance of heart attack'],
    bin_size=5,
    colors=['#85bfdc', '#f64c72'],
)
fig.update_layout(
    xaxis_title="Maximum heart rate achieved",
    title="Heart Attack chance corresponding to maximum heart rate achieved",
)
fig.show()

>Here we can clearly see that a higher maximum heart rate achieved is associated with a higher chance of heart attack.


In [None]:
# Cholesterol distribution per chest-pain type, with one box per target label
fig = px.box(
    df1,
    x="cp",
    y="chol",
    color='output',
    color_discrete_map={
        'Less chance of HA': '#85bfdc',
        'More chance of HA': '#f64c72',
    },
)
fig.update_layout(
    title="Effects of cholestrol corresponding to chest pain type on chances of heart attack"
)
fig.show()

In [None]:
# Correlation heatmap restricted to the continuous columns plus the target.
# The categorical columns are dropped first ('exng' was listed twice before).
temp = df.drop(['sex', 'cp', 'fbs', 'exng', 'restecg', 'thall', 'caa', 'slp'], axis=1)
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
df_cor = temp.corr()
# Boolean mask covering the upper triangle (diagonal included), so each pair is
# drawn once. The builtin `bool` is used because the `np.bool` alias was removed
# in NumPy 1.24 and raises AttributeError there.
half = np.triu(np.ones_like(df_cor, dtype=bool))

# Two-colour gradient matching the palette used by the other figures
my_colors = ['#85bfdc', '#f64c72']
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom', my_colors)

heatmap = sns.heatmap(df_cor,
            square=True,
            mask=half,
            linewidth=2.5,
            vmax=0.4, vmin=0,  # colour scale is clipped to the range [0, 0.4]
            cmap=cmap,
            cbar=False,
            ax=ax, annot=True)

heatmap.set(title="Heatmap of continous variables")
# The matrix is square with identical row and column labels, so the x labels are reused on y
heatmap.set_yticklabels(heatmap.get_xticklabels(), rotation=0)
heatmap.spines['top'].set_visible(True)
fig.text(1.2, 0.85, '''* thalachh(Maximum heart rate achieved) is positively correlated while,
* oldpeak is negatively correlated with the output ''',
         fontweight='light', fontfamily='serif', fontsize=11, va='top', ha='right')

plt.tight_layout()

## Create dimensions

In [None]:
# One parallel-categories dimension per categorical column.
# Each dimension now reads its own column; previously restecg, thall, caa and
# slp were all built from `df.sex`, so four axes silently repeated the sex split.
exng = go.parcats.Dimension(values=df.exng, label="exng")
cp = go.parcats.Dimension(values=df.cp, label="cp")
fbs = go.parcats.Dimension(values=df.fbs, label="fbs")
gender_dim = go.parcats.Dimension(values=df.sex, label="sex")
restecg = go.parcats.Dimension(values=df.restecg, label="restecg")
thall = go.parcats.Dimension(values=df.thall, label="thall")
caa = go.parcats.Dimension(values=df.caa, label="caa")
slp = go.parcats.Dimension(values=df.slp, label="slp")

# Target axis: categories 0/1 are shown with readable tick text
survival_dim = go.parcats.Dimension(
    values=df.output, label="Outcome", categoryarray=[0, 1],
    ticktext=['Less chance', 'More chance']
)

# Ribbons are coloured by the target value
color = df.output
colorscale = [[0, '#85bfdc'], [1, '#f64c72']]

fig = go.Figure(data=[go.Parcats(
    dimensions=[exng, slp, restecg, fbs, thall, caa, cp,
                gender_dim, survival_dim],
    line={'color': color, 'colorscale': colorscale},
    hoveron='color', hoverinfo='count+probability',
    labelfont={'size': 18, 'family': 'Times'},
    tickfont={'size': 16, 'family': 'Times'},
    arrangement='freeform')])
fig.update_layout(title="Plotly parallel categorical plot for all the categorical labels", )
fig.show()

In [None]:
# For every categorical feature, one bar panel showing the percentage of rows
# with output == 1 within each category value.
lbs = ['sex', 'cp', 'fbs', 'exng', 'restecg', 'thall', 'caa', 'slp']

rows = 3
cols = 3

subplot_titles = list(lbs)

# 3x3 grid of bar panels; the last cell stays empty because there are only eight features
specs = [[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
         [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
         [{"type": "bar"}, {"type": "bar"}, None]]

fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=subplot_titles,
    specs=specs,
    print_grid=False
)

for i, name in enumerate(lbs):
    # Row-major placement: the column index must wrap on `cols`, not `rows`
    # (the two only coincided because the grid is square).
    row = i // cols + 1
    col = i % cols + 1
    # The mean of a 0/1 target is the share of positives. Grouping on the actual
    # category values avoids assuming they are exactly 0..n-1 and never divides
    # by an empty group.
    pct_more = df.groupby(name)['output'].mean() * 100
    fig.add_trace(go.Bar(
        x=pct_more.index.tolist(),
        y=pct_more.tolist(),
        marker_color=['#85bfdc', '#9999c9', '#aa77aa', '#cc6397', '#f64c72'],
    ), row=row, col=col)

fig.update_layout(autosize=True,
                  title="Percertage of people having 'more chance of heart attack' for each type",
                  title_x=0.5,
                  showlegend=False)
fig.show()

>For certain categories the chance of heart attack was found to be high:
> - sex = 0
> - cp = 2,3
> - thall = 2
> - caa = 0,4
> - slp = 2

In [None]:
# 3-D scatter of oldpeak vs. max heart rate vs. age; marker size encodes
# resting blood pressure and colour encodes the target label
fig = px.scatter_3d(
    df1,
    x='oldpeak',
    y='thalachh',
    z='age',
    size='trtbps',
    color='output',
    color_discrete_sequence=['#f64c72', '#85bfdc'],
)
fig.show()

In [None]:
# Cholesterol against maximum heart rate, faceted by chest-pain type (columns)
# and sex (rows), coloured by target label
fig = px.scatter(
    df1, x='thalachh', y='chol', color='output',
    facet_row='sex', facet_col='cp',
    color_discrete_sequence=['#f64c72', '#85bfdc'],
)

fig.show()

# Packages 

In [None]:
# Scaling
from sklearn.preprocessing import RobustScaler

# Train Test Split
from sklearn.model_selection import train_test_split

# Models
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve, confusion_matrix

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print('Packages imported...')

# Making features model ready

## Scaling and Encoding features

### 

### Using MaxAbsScaler

In [None]:
# Build the model inputs: one-hot encode the categorical columns and scale the
# continuous ones with MaxAbsScaler.
# This cell rebinds X, y and scaler; whichever scaling cell was executed last
# is the one that feeds the train/test split further down.

# independent copy of df (a plain `df2 = df` would only alias the same object)
df2 = df.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# encoding the categorical columns (first level of each dropped)
# NOTE: this rebinds `df1`, which the EDA cells use for the text-labelled copy.
df1 = pd.get_dummies(df2, columns=cat_cols, drop_first=True)

# defining the features and target from the ENCODED frame; previously they were
# taken from df2, so the encoding above was computed and then thrown away
X = df1.drop(['output'], axis=1)
y = df1[['output']]

# instantiating the scaler
scaler = MaxAbsScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into training; consider fitting on X_train only.
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

### Using MinMaxScaler

In [None]:
# Build the model inputs: one-hot encode the categorical columns and scale the
# continuous ones with MinMaxScaler.
# This cell rebinds X, y and scaler; whichever scaling cell was executed last
# is the one that feeds the train/test split further down.

# independent copy of df (a plain `df2 = df` would only alias the same object)
df2 = df.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# encoding the categorical columns (first level of each dropped)
# NOTE: this rebinds `df1`, which the EDA cells use for the text-labelled copy.
df1 = pd.get_dummies(df2, columns=cat_cols, drop_first=True)

# defining the features and target from the ENCODED frame; previously they were
# taken from df2, so the encoding above was computed and then thrown away
X = df1.drop(['output'], axis=1)
y = df1[['output']]

# instantiating the scaler
scaler = MinMaxScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into training; consider fitting on X_train only.
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

### Using RobustScaler

In [None]:
# Build the model inputs: one-hot encode the categorical columns and scale the
# continuous ones with RobustScaler.
# This cell rebinds X, y and scaler; whichever scaling cell was executed last
# is the one that feeds the train/test split further down.

# independent copy of df (a plain `df2 = df` would only alias the same object)
df2 = df.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# encoding the categorical columns (first level of each dropped)
# NOTE: this rebinds `df1`, which the EDA cells use for the text-labelled copy.
df1 = pd.get_dummies(df2, columns=cat_cols, drop_first=True)

# defining the features and target from the ENCODED frame; previously they were
# taken from df2, so the encoding above was computed and then thrown away
X = df1.drop(['output'], axis=1)
y = df1[['output']]

# instantiating the scaler
scaler = RobustScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into training; consider fitting on X_train only.
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

### Using StandardScaler

In [None]:
# Build the model inputs: one-hot encode the categorical columns and scale the
# continuous ones with StandardScaler.
# This cell rebinds X, y and scaler; whichever scaling cell was executed last
# is the one that feeds the train/test split further down.

# independent copy of df (a plain `df2 = df` would only alias the same object)
df2 = df.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
con_cols = ["age", "trtbps", "chol", "thalachh", "oldpeak"]

# encoding the categorical columns (first level of each dropped)
# NOTE: this rebinds `df1`, which the EDA cells use for the text-labelled copy.
df1 = pd.get_dummies(df2, columns=cat_cols, drop_first=True)

# defining the features and target from the ENCODED frame; previously they were
# taken from df2, so the encoding above was computed and then thrown away
X = df1.drop(['output'], axis=1)
y = df1[['output']]

# instantiating the scaler
scaler = StandardScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into training; consider fitting on X_train only.
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

## Train and test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each of the four pieces
for caption, part in [
    ("The shape of X_train is      ", X_train),
    ("The shape of X_test is       ", X_test),
    ("The shape of y_train is      ", y_train),
    ("The shape of y_test is       ", y_test),
]:
    print(caption, part.shape)

# Modeling

## Linear Classifiers

## Support Vector Machines

In [None]:
# Linear-kernel support vector classifier, fitted on the training split
clf = SVC(kernel='linear', C=1, random_state=42)
clf.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = clf.predict(X_test)

# Evaluation. The lr_* names are reused by every model cell and are simply
# overwritten each time.
lr_acc_score = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)

print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Support Vector Machines:", lr_acc_score * 100, '\n')
print(classification_report(y_test, y_pred))

### Hyperparameter tuning of SVC

In [None]:
# Grid search over C and gamma for an SVC with default settings otherwise
svm = SVC()

# Search grid: C in 1..9, gamma from 1e-5 up to 5
parameters = {
    "C": np.arange(1, 10, 1),
    'gamma': [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005,
              0.01, 0.05, 0.1, 0.5, 1, 5],
}

# Cross-validated search with GridSearchCV defaults, fitted on the training split
searcher = GridSearchCV(svm, parameters)
searcher.fit(X_train, y_train)

# Winning combination and its mean cross-validation score
print("The best params are :", searcher.best_params_)
print("The best score is   :", searcher.best_score_)

# Evaluate the search's best estimator on the held-out rows
y_pred = searcher.predict(X_test)

lr_acc_score = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)

print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of SVM after hyper-parameter tuning:", lr_acc_score * 100, '\n')
print(classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the training split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Class probabilities for the held-out rows, one column per class
y_pred_proba = logreg.predict_proba(X_test)

# Predicted label = index of the most probable class
y_pred = y_pred_proba.argmax(axis=1)

lr_acc_score = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)

print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:", lr_acc_score * 100, '\n')
print(classification_report(y_test, y_pred))

## Tree Models

### Decision Tree

In [None]:
# Single decision tree with a fixed seed, fitted on the training split
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = dt.predict(X_test)

lr_acc_score = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)

print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Decision Tree:", lr_acc_score * 100, '\n')
print(classification_report(y_test, y_pred))

### Random Forest

In [None]:
# Random forest, fitted on the training split. The seed is fixed, as in the
# sibling model cells, so the reported numbers are reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# fitting the model
rf.fit(X_train, y_train)

# Predict with the forest itself. Previously this called dt.predict, so the
# metrics below were the decision tree's results under a Random Forest label.
y_pred = rf.predict(X_test)

lr_conf_matrix = confusion_matrix(y_test, y_pred)
lr_acc_score = accuracy_score(y_test, y_pred)
print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Random Forest:", lr_acc_score*100, '\n')
print(classification_report(y_test, y_pred))

### Gradient Boosting Classifier - without tuning

In [None]:
# Gradient boosting with 300 depth-1 trees; each tree sees 80% of the rows and
# 20% of the features. Hyper-parameters are fixed by hand, not tuned.
gbt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=42,
)
gbt.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = gbt.predict(X_test)

lr_acc_score = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)

print("\nconfussion matrix :")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Gradient Boosting Classifie:", lr_acc_score * 100, '\n')
print(classification_report(y_test, y_pred))

In [None]:
import time

import pandas as pd

# Cross-validated grid search over several classifier families.
# For each family the best CV score, best parameters and best fitted estimator
# are recorded; the families are then compared in a bar chart.
scores = []
best_estimators = {}

# Estimator and hyper-parameter grid for every model family
model_params = {
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [2, 3, 4, 5, 6, 7, 18, 19, 20],
            'algorithm': ['auto', 'ball_tree'],
            'weights': ['uniform', 'distance'],
            'leaf_size': [27, 28, 29, 30, 31],
        },
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 1, 2, 6, 5],
        },
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(),
        'params': {
            'n_estimators': [30, 35, 40, 45, 50, 55],
            'learning_rate': [1, 1.1, 1.2, 1.3, 1.4, 1.5],
            'algorithm': ['SAMME', 'SAMME.R'],
        },
    },
    # No hyper-parameters searched: the empty grid evaluates the default model once
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {},
    },
    'LOGISTIC_REGRESSION': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 2, 3, 4, 5, 6, 7],
            'solver': ['liblinear', 'lbfgs'],
            'multi_class': ['auto', 'ovr'],
        },
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [1, 2, 3, 5, 6, 7],
            'kernel': ['rbf', 'linear'],
            'gamma': ['auto', 'scale'],
        },
    },
    'RANDOM_FOREST': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 2, 3, 4, 5, 10, 15],
            'criterion': ['entropy'],
            'random_state': [12, 13],
            'max_depth': [5, 6],
        },
    },
}

for model_name, spec in model_params.items():
    # 5-fold cross-validated search over this family's grid
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    # Timestamp taken before fitting; nothing reads it in this cell
    start_time = time.time()

    clf.fit(X_train, y_train)

    best_estimators[model_name] = clf.best_estimator_
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': str(clf.best_params_),
    })

# One row per model family, best cross-validation score first
df3 = pd.DataFrame(
    scores, columns=['model', 'best_score', 'best_params']
).sort_values(by='best_score', ascending=False)

# Horizontal bar chart of the best CV score per model family
plt.figure(figsize=(10, 5))
sns.barplot(x=df3['best_score'], y=df3['model'], palette='pastel')