In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
data.shape

In [None]:
data.head()

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina Value 2: atypical angina Value 3: non-anginal pain Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
data.info()

there are no object datatypes

In [None]:
data.isnull().sum()

there are no null values

In [None]:
data.output.value_counts()

the data is not imbalanced 

In [None]:
data.skew()

and the data is not skewed. so we don't need to do any preprocessing.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_fig, corr_ax = plt.subplots(figsize=(16, 10))
sns.heatmap(data.corr(), annot=True, ax=corr_ax)

The data is not multicollinear, as the maximum correlation is 0.43.

Now, to understand the effect of the features on the output, we use two types of plots:
1. KDE plots, which give the probability distribution of a feature with respect to the output.
2. Histograms.

In [None]:
# Overlay the age density of each outcome class.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
for outcome, color, label in [(0, '#1e434c', 'No heart disease'), (1, '#8d230f', 'Heart disease')]:
    sns.kdeplot(data.loc[data['output'] == outcome, 'age'], color=color, fill=True, label=label, alpha=0.8)
plt.legend()

In [None]:
data.groupby(data.output)['age'].mean()

According to the data, age does not provide a significant margin for separating the output classes.

In [None]:
# Overlay the maximum-heart-rate density of each outcome class.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
for outcome, color, label in [(0, '#1e434c', 'No heart disease'), (1, '#8d230f', 'Heart disease')]:
    sns.kdeplot(data.loc[data['output'] == outcome, 'thalachh'], color=color, fill=True, label=label, alpha=0.8)
plt.legend()

A maximum heart rate of 155 and above is associated with a higher rate of heart disease.

In [None]:
# Overlay the cholesterol density of each outcome class.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
for outcome, color, label in [(0, '#1e434c', 'No heart disease'), (1, '#8d230f', 'Heart disease')]:
    sns.kdeplot(data.loc[data['output'] == outcome, 'chol'], color=color, fill=True, label=label, alpha=0.8)
plt.legend()

In [None]:
# Overlay the resting-blood-pressure density of each outcome class.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
for outcome, color, label in [(0, '#1e434c', 'No heart disease'), (1, '#8d230f', 'Heart disease')]:
    sns.kdeplot(data.loc[data['output'] == outcome, 'trtbps'], color=color, fill=True, label=label, alpha=0.8)
plt.legend()

According to the data, neither trtbps nor chol has a visible effect on the output.

In [None]:
data.groupby('output').hist(figsize=(10,10))

We can note the other features that affect the output:
1. cp has more level 2 and level 3 values in the case of a heart attack.
2. Having blood sugar below 120 gives less chance of having a heart attack.
3. exang=1 is more frequent in the case of no heart attack.


In [None]:
count=sns.countplot(x='sex',hue='output',data=data)

In [None]:
# Share of each sex within every outcome class.
# The original passed the string 'mean' positionally, which landed in `normalize`
# and only worked because a non-empty string is truthy; spell the intent explicitly.
data.groupby(data.output)['sex'].value_counts(normalize=True)

In [None]:
data['sex'].value_counts()

In [None]:
# Pass the series as `x` explicitly: a bare positional argument is read as `x` by
# older seaborn releases but as `data` by newer ones.
sns.boxplot(x=data['trtbps'])

In [None]:
# Pass the series as `x` explicitly: a bare positional argument is read as `x` by
# older seaborn releases but as `data` by newer ones.
sns.boxplot(x=data['chol'])

In [None]:
# Pass the series as `x` explicitly: a bare positional argument is read as `x` by
# older seaborn releases but as `data` by newer ones.
sns.boxplot(x=data['oldpeak'])

In [None]:
# Pass the series as `x` explicitly: a bare positional argument is read as `x` by
# older seaborn releases but as `data` by newer ones.
sns.boxplot(x=data['caa'])

These are the columns that have outliers.

In [None]:
def clean_outliers(df1, features, whisker=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    The columns are processed one after another: the quartiles of each column
    are computed on the rows that survived the previous columns, so the result
    can depend on the order of `features`. The input frame is not modified.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to filter.
    features : iterable of str
        Column names to check for outliers.
    whisker : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of `df1` that lie within the fences of every listed column.
    """
    for col in features:
        q1 = df1[col].quantile(0.25)
        q3 = df1[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr * whisker
        upper = q3 + iqr * whisker
        print("Feature {} has min value: {} max value: {}".format(col, lower, upper))
        # Inclusive bounds: a value sitting exactly on a fence is not an outlier, and
        # a column with IQR == 0 (lower == upper) must not lose every row.
        df1 = df1[(df1[col] >= lower) & (df1[col] <= upper)]
    return df1

In [None]:
num_cols=['caa','oldpeak','chol','trtbps']
df_clean=clean_outliers(data, num_cols)

In [None]:
df_clean.shape

In [None]:
# Separate the label column (`output`) from the predictor columns.
# NOTE(review): this rebinds `data` from the full frame (including `output`) to the
# predictor-only frame, so earlier cells that read `data.output` / `data['output']`
# fail if they are re-run after this cell. Consider a distinct name such as `features`.
target=df_clean['output']
data=df_clean.drop('output',axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# A fixed seed makes the split, and every metric reported from it, reproducible on a
# fresh run; stratifying keeps the class ratio the same in the train and test splits.
x_train,x_test,y_train,y_test=train_test_split(
    data, target, test_size=0.2, random_state=8, stratify=target
)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,roc_curve,accuracy_score,auc,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
from sklearn.model_selection import cross_val_score,cross_val_predict

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
def Mod(model):
    """Fit `model` on the scaled training split and report its test performance.

    Relies on the notebook-level `x_train_scaled`, `y_train`, `x_test_scaled`
    and `y_test`. Prints the `model.score` value on both splits and the
    precision, recall and F1 of the test predictions, and draws the confusion
    matrix, ROC curve and precision-recall curve for the test split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier; it is fitted in place.

    Returns
    -------
    None
    """
    model.fit(x_train_scaled, y_train)

    train_score = model.score(x_train_scaled, y_train)
    print(f'training score: {train_score}')
    test_score = model.score(x_test_scaled, y_test)
    print(f'testing score: {test_score}')

    pred = model.predict(x_test_scaled)

    plot_confusion_matrix(model, x_test_scaled, y_test)
    metrics.plot_roc_curve(model, x_test_scaled, y_test)
    metrics.plot_precision_recall_curve(model, x_test_scaled, y_test)

    print("Precision:", metrics.precision_score(y_test, pred))
    print("Recall:", metrics.recall_score(y_test, pred))
    print('f1:', f1_score(y_test, pred))

In [None]:
Mod(LogisticRegression())

In [None]:
Mod(KNeighborsClassifier())

In [None]:
Mod(SVC())

In [None]:
Mod(RandomForestClassifier())

In [None]:
pipe=Pipeline([('preprocessing',StandardScaler()),('model',LogisticRegression())])


In [None]:
grid=[{'model':[LogisticRegression(max_iter=5000)],'preprocessing':[StandardScaler(),MinMaxScaler(),None],'model__C':[0.001,0.01,0.1,1,10,100]},
     {'model':[RandomForestClassifier(random_state=8)],'preprocessing':[None],'model__n_estimators':[100,200,300,400,500],'model__max_depth':[1,2,3,4],'model__max_features':[1,2,3]}]

In [None]:
# Search the parameter grid over the pipeline with cv=5.
# NOTE(review): `grid` is rebound here from the parameter-grid list to the GridSearchCV
# object, so re-running this cell without first re-running the cell that builds the list
# passes a GridSearchCV instance as the parameter grid. A separate name would avoid this.
grid=GridSearchCV(pipe,grid,cv=5)

In [None]:
grid.fit(x_train,y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
main_pred=grid.predict(x_test)

In [None]:
    # Precision, recall and F1 of the grid-search predictions on the test split.
    for metric_label, metric_fn in (
        ("Precision:", metrics.precision_score),
        ("Recall:", metrics.recall_score),
        ('f1:', f1_score),
    ):
        print(metric_label, metric_fn(y_test, main_pred))
    metrics.plot_confusion_matrix(grid,x_test,y_test)

Now we want to increase the recall, which means reducing the number of false negatives; to do that, we lower the decision threshold.

In [None]:
main_pred1=grid.predict_proba(x_test)[:,1]>0.3

In [None]:
    print("Precision:", metrics.precision_score(y_test, main_pred1))
    print("Recall:",metrics.recall_score(y_test, main_pred1))
    print('f1:',f1_score(y_test,main_pred1))
    # Draw the matrix from the 0.3-threshold predictions themselves.
    # plot_confusion_matrix(grid, ...) calls grid.predict, so it showed the matrix of the
    # default predictions and did not match the metrics printed above.
    metrics.ConfusionMatrixDisplay(
        confusion_matrix(y_test, main_pred1.astype(int)),
        display_labels=grid.classes_,
    ).plot()

Here we increased the recall to 96.8, which is excellent, with an F-score of 80.5.

In [None]:
rf=RandomForestClassifier(max_depth=4, max_features=2, n_estimators=300,
                        random_state=8)

In [None]:
rf.fit(x_train,y_train)

In [None]:
mod_pred=rf.predict(x_test)

In [None]:
    # Precision, recall and F1 of the random-forest predictions on the test split.
    for metric_label, metric_fn in (
        ("Precision:", metrics.precision_score),
        ("Recall:", metrics.recall_score),
        ('f1:', f1_score),
    ):
        print(metric_label, metric_fn(y_test, mod_pred))
    metrics.plot_confusion_matrix(rf,x_test,y_test)

In [None]:
mod_pred1=rf.predict_proba(x_test)[:,1]>0.3

In [None]:
    print("Precision:", metrics.precision_score(y_test, mod_pred1))
    print("Recall:",metrics.recall_score(y_test, mod_pred1))
    print('f1:',f1_score(y_test,mod_pred1))
    # Draw the matrix from the 0.3-threshold predictions themselves.
    # plot_confusion_matrix(rf, ...) calls rf.predict, so it showed the matrix of the
    # default predictions and did not match the metrics printed above.
    metrics.ConfusionMatrixDisplay(
        confusion_matrix(y_test, mod_pred1.astype(int)),
        display_labels=rf.classes_,
    ).plot()

In [None]:
explainer = shap.TreeExplainer(rf)

# calculate shap values. This is what we will plot.
shap_values = explainer.shap_values(x_test)

In [None]:
# Select the SHAP values of the positive class (index 1).
# NOTE(review): older shap releases return a list with one array per class, while newer
# ones are expected to return a single array with the class on the last axis — confirm
# against the installed shap version. Both layouts are handled here.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
else:
    positive_class_shap = shap_values[..., 1]
shap.summary_plot(positive_class_shap, x_test, alpha=0.4)

We can see that the last 4 features have an almost neutral effect on the output.

we can also use force plot to visualize individual effects of the features.

and that's the END.
ThankYou!!