In [None]:
# Do EDA and then classify if a person is prone to heart attack or not.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Show every column and let pandas choose the console width when printing frames.
pd.options.display.max_columns = None
pd.options.display.width = None

# NOTE(review): this silences *all* warnings for the rest of the run,
# including convergence and deprecation warnings from the modelling libraries.
warnings.filterwarnings("ignore")

# Raw string: the plain literal 'G:\Kaggle\heart.csv' only worked because "\K"
# and "\h" are unrecognised escape sequences, which Python flags as deprecated.
# The resulting path is byte-for-byte the same.
dtset = pd.read_csv(r'G:\Kaggle\heart.csv')

dtset.head()

# Column dtypes and non-null counts.
dtset.info(verbose=True)

The output above shows that there are no null values in the dataset.


# Histograms of the target and of the sex indicator.
for column in ("output", "sex"):
    sns.histplot(data=dtset, x=column)

# Age of every patient, split by target value and coloured by sex.
sns.swarmplot(x="output", y="age", hue="sex", data=dtset)
plt.show()

From the above plot it's clear that heart-attack (HA) patients are distributed across all age segments, but their number is significantly lower below the late 30s and above the early 60s. The bulk of the patients fall roughly within the early-40s to late-50s age segment. Also, gender 1 is more prone to heart attack than gender 0, although most of the gender 0 datapoints present in this dataset are heart-attack prone. The dataset is also imbalanced gender-wise: there are more datapoints of gender 1 than of gender 0.

sns.set_theme(style="darkgrid")
# One chest-pain histogram per target value, drawn side by side.
sns.displot(data=dtset, x="cp", col="output")

# Overall chest-pain distribution with a KDE overlay.
sns.histplot(dtset, x="cp", kde=True)

Chest-pain (cp) type 0 is the most prevalent, and persons having it are mostly not prone to heart attack. Chest-pain type 2 is the next most prevalent, and persons having it are mostly prone to heart attack. Chest-pain type 3 is the least prevalent.

# Cholesterol of every patient, split by target value and coloured by sex.
sns.swarmplot(x="output", y="chol", hue="sex", data=dtset)

Again, gender type 1 is more prone to heart attacks, which can be attributed to their cholesterol levels. Gender type 0 is relatively less prone to heart attack.

sns.set_theme(style="ticks")

# Box plots of the continuous measurements, used to eyeball outliers.
for column in ("chol", "trtbps", "thalachh", "age", "oldpeak"):
    sns.boxplot(x=dtset[column])

#dtset.oldpeak.value_counts()

sns.boxplot(x=dtset["caa"])

# It's clear, its more of a sort of categorical variable.
dtset.caa.value_counts()

dtset["slp"].value_counts()

dtset["thall"].value_counts()

# Summary statistics for every column.
dtset.describe()

# Correlation heatmap; the np.triu mask hides the upper triangle (diagonal
# included) so every pair of columns is annotated only once.
corr_matrix = dtset.corr()
plt.figure(figsize=(20, 6))
sns.heatmap(corr_matrix, mask=np.triu(corr_matrix), annot=True)

| sex,cp,fbs,restecg,exng,slp,caa,thall | --------------- are the features that look categorical (they need dummy-variable creation).

| chol,trtbps,thalachh,oldpeak | ------ are the variables that need outlier-handling.

----------------------------------------------------------------------------------------------------

# Print the level frequencies of each categorical-looking feature (sex excluded).
for column in ("cp", "fbs", "restecg", "exng", "slp", "caa", "thall"):
    print(dtset[column].value_counts())

# Turn the integer-coded categorical features into labelled levels, then
# one-hot encode them.  Every dummy column is named "d__<label>", which is
# the naming the model formula further down relies on.
categorical_cols = ["sex", "cp", "fbs", "restecg", "exng", "slp", "caa", "thall"]

level_labels = {
    # The original map used key 2 for "F"; the notebook's own notes describe
    # sex as coded 0/1, so 0 stayed an unmapped integer in a mixed int/str
    # column.  Mapping 0 still yields the same surviving dummy, "d__M".
    "sex": {1: "M", 0: "F"},
    "cp": {0: "L'cp", 1: "Lcp", 2: "Mcp", 3: "Hcp"},
    "fbs": {0: "Lfbs", 1: "Hfbs"},
    "restecg": {0: "Lecg", 1: "Mecg", 2: "Hecg"},
    "exng": {0: "Lexng", 1: "Hexng"},
    "caa": {0: "L_caa", 1: "Lcaa", 2: "Mcaa", 3: "Hcaa", 4: "H'caa"},
    "slp": {0: "Lslp", 1: "Mslp", 2: "Hslp"},
    "thall": {0: "L_thall", 1: "Lthall", 2: "Mthall", 3: "Hthall"},
}

# replace() without inplace returns a fresh frame, so nothing is written
# through a slice of dtset (the original in-place call risked
# SettingWithCopy behaviour).
dmy = dtset[categorical_cols].replace(level_labels)

# drop_first removes the alphabetically first level of each feature, which
# becomes the reference category.
d1 = pd.get_dummies(dmy, prefix="d_", drop_first=True)

d1

dtset2 = dtset[dtset.columns.difference(["sex","cp","fbs","restecg","exng","slp","caa","thall","age"])]

# Age doesn't need outlier-handling, thalachh needs only lower-quantile outlier handling.
# NOTE(review): "age" is excluded from both frames below, so it never reaches
# dtset3 and is not available to any model -- confirm that this is intended.

# Everything except the categorical features, age and thalachh (this keeps
# the target column "output" alongside the continuous features).
outlr1 = dtset[dtset.columns.difference(["sex","cp","fbs","restecg","exng","slp","caa","thall","age","thalachh"])].copy()
# Cap each continuous feature at its 99th percentile.  The original called
# .apply(...) without storing the result, so no clipping was ever applied.
# The target is left out of the clip so its values and dtype stay untouched.
upper_clip_cols = outlr1.columns.difference(["output"])
outlr1[upper_clip_cols] = outlr1[upper_clip_cols].apply(lambda col: col.clip(upper=col.quantile(0.99)))

# Floor thalachh at its 1st percentile (the result was likewise discarded before).
outlr2 = dtset[["thalachh"]].apply(lambda col: col.clip(lower=col.quantile(0.01)))

dtset3 = pd.concat([outlr1, outlr2, d1], axis=1)       # final data-set.

dtset3

------------------------------------------------------------------------------------------------------

import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

# 70/30 split of the final data set; "output" stays in both halves because
# the formula interface reads the response from the same frame.
train, test = train_test_split(dtset3, test_size=0.3, random_state=123)

# Statistical model.

# Right-hand-side terms of the logit formula.  The original formula listed
# d__Lcp twice and left out the cp dummy whose name contains an apostrophe
# (d__L'cp), silently merging that level into the reference category.
# patsy's Q() quotes column names that are not valid Python identifiers.
logit_terms = [
    "chol", "oldpeak", "trtbps", "thalachh",
    "d__M",
    'Q("d__L\'cp")', "d__Lcp", "d__Mcp",
    "d__Lfbs",
    "d__Lecg", "d__Mecg",
    "d__Lexng",
    "d__Lslp", "d__Mslp",
    "d__Hcaa", "d__L_caa", "d__Lcaa", "d__Mcaa",
    "d__L_thall", "d__Lthall", "d__Mthall",
]
mdl1 = smf.logit(formula="output ~ " + " + ".join(logit_terms), data=train).fit()

print(mdl1.summary())

# Predicted probability of output == 1 for every held-out row.
predicted = mdl1.predict(test)

ytest = test["output"]
# Hard 0/1 labels obtained by rounding the probabilities.
ytest_prd = list(map(round, predicted))

cm = confusion_matrix(ytest, ytest_prd)
print("Confusion Matrix : \n", cm)

print('Test accuracy: ', accuracy_score(ytest, ytest_prd))
print("Precision:", precision_score(ytest, ytest_prd))
# This line prints recall_score; it was previously labelled "Precision:".
print("Recall:", recall_score(ytest, ytest_prd))
# ROC AUC is a ranking metric, so it is scored on the predicted probabilities;
# feeding it the rounded labels collapses the curve to a single point.
print("ROC score:", roc_auc_score(ytest, predicted))

# Actual label, predicted probability and rounded prediction side by side.
x = pd.concat([ytest, predicted], axis=1)
x.rename(columns={"output": "ytest", 0: "ytest_pred"}, inplace=True)
x["ytest_pred_abs"] = x["ytest_pred"].apply(round)

# Distribution of the predicted probability for each predicted class.
# The original passed the whole three-column frame to the deprecated
# sns.distplot, mixing labels and probabilities into one histogram; only the
# probability column is plotted now, with histplot.
# NOTE(review): grouping by the actual label (x["ytest"]) instead of the
# rounded prediction would show how well the classes are really separated.
sns.histplot(x.loc[x["ytest_pred_abs"] == 1, "ytest_pred"], color='r', kde=True)
sns.histplot(x.loc[x["ytest_pred_abs"] == 0, "ytest_pred"], color='g', kde=True)

Observation: From the above distribution plot, it's clear that the model is able to separate, to a large extent, patients who are prone to heart attack from those who are not.

# Feature matrix = every column of the final data set except the target.
x_train, x_test, y_train, y_test = train_test_split(
    dtset3[dtset3.columns.difference(["output"])],
    dtset["output"],
    test_size=0.3,
    random_state=12345,
)

# NOTE(review): the features are unscaled and the solver's default iteration
# limit is used; any convergence warning is hidden by the global warnings
# filter -- consider scaling the inputs or raising max_iter.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

y_logreg_pred = logreg.predict(x_test)

print('Test accuracy: ', accuracy_score(y_test, y_logreg_pred))
print("Precision:", precision_score(y_test, y_logreg_pred))
# This line prints recall_score; it was previously labelled "Precision:".
print("Recall:", recall_score(y_test, y_logreg_pred))
# ROC AUC is scored on the probability of the positive class (column 1 of
# predict_proba), not on the hard labels.
print("ROC score:", roc_auc_score(y_test, logreg.predict_proba(x_test)[:, 1]))

# Class probabilities for the test rows: column "zero" is P(output == 0),
# column "one" is P(output == 1).
class_probs = logreg.predict_proba(x_test)
pred_probability = pd.DataFrame(class_probs, columns=["zero", "one"])

# Actual label, predicted label and both probabilities on a fresh 0..n-1 index.
roc_table = pd.DataFrame({"actual": y_test.to_numpy(), "predicted": y_logreg_pred})
roc_table = roc_table.join(pred_probability)
roc_table

# ROC points for every distinct probability threshold.
fpr, tpr, threshold = roc_curve(roc_table["actual"], roc_table["one"], drop_intermediate=False)

# Area under the plotted curve.  It is computed from the same probabilities
# as the curve itself; the original scored the hard 0/1 predictions, so the
# legend did not describe the line that was drawn.
auc_score = roc_auc_score(roc_table["actual"], roc_table["one"])
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, 'b', label='ROC Curve (area = %0.2f)' % auc_score)
plt.legend(loc='lower right')
# Diagonal reference line (a classifier with no skill).
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

# Pick the ROC point whose true-positive rate is closest to the target and
# report its TPR and probability threshold.  The original looked up TPR with
# a target of 0.82 but the cut-off with 0.8, so the two printed numbers could
# belong to different ROC points; one index now drives both.  0.8 is kept
# because it is the value that determined the cut-off.
target_tpr = 0.8
cutoff_idx = np.abs(tpr - target_tpr).argmin()
TPR = tpr[cutoff_idx]
cutoff = threshold[cutoff_idx]
print(TPR)
print(cutoff)

# Re-label the test rows using the cut-off computed from the ROC curve rather
# than a number pasted from an earlier run.  roc_curve counts a sample as
# positive when its score is >= the threshold, so the comparison is inclusive.
roc_table["new_labels"] = (roc_table["one"] >= cutoff).astype(int)
roc_table

accuracy_score(roc_table["actual"], roc_table["new_labels"])

Observation: Even after changing the cut-off there isn't any significant improvement in the ROC score of the model. Other algorithms will now be tried.

XGBoost

# Search grid: number of boosting rounds and learning rates from 1e-5 up to 1.
pargrid_rf = {
    "n_estimators": [10, 20, 30, 50, 70, 80, 90, 110, 130, 150],
    "learning_rate": [10 ** exponent for exponent in range(-5, 1)],
}
# 10-fold cross-validated grid search over an XGBoost classifier, using all cores.
gscv_xgbm = GridSearchCV(XGBClassifier(), pargrid_rf, cv=10, n_jobs=-1, verbose=True)
gscv_result = gscv_xgbm.fit(x_train, y_train)
gscv_result.best_params_

# Best configuration found, fitted once more on the full training split.
xgbm = gscv_result.best_estimator_
t = xgbm.fit(x_train, y_train)

# Mean cross-validated score of the best configuration.
gscv_result.best_score_



