In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import tree
import seaborn as sns
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator so that steps drawing from it
# give the same results on every fresh run of the notebook.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
raw_data = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

# This notebook treats thall == 0 and caa == 4 as erroneous entries:
# keep only the rows that carry neither of those values.
valid_rows = (raw_data['thall'] != 0) & (raw_data['caa'] != 4)
data = raw_data[valid_rows]
data

<h2>Descriptive Analytics</h2><br>
Age group

In [None]:
# Report the smallest and largest age present in the cleaned dataset.
youngest, oldest = data.age.min(), data.age.max()
print("Age group range : [", youngest, ",", oldest, "]")

<h2>Gender Distribution</h2>

In [None]:
# Bar chart of how many records fall into each value of the `sex` column.
sex, count = np.unique(data.sex, return_counts=True)

# np.unique returns the codes sorted, so the first (lowest) code is charted
# as Female and the second as Male.
for idx, label in enumerate(['Female', 'Male']):
    plt.bar(str(sex[idx]), count[idx], label=label)

plt.legend()
plt.xlabel("Gender")
plt.ylabel("Gender count")
# Render the figure. plt.plot() with no arguments draws nothing and only
# leaves an empty-list repr as the cell output, so plt.show() is used here
# like in the other chart cells.
plt.show()

<h2>Chest pain Type</h2><br>
Categories : <br>
<ul>
<li>Value 0: typical angina</li>
<li>Value 1: atypical angina</li>
<li>Value 2: non-anginal pain</li>
<li>Value 3: asymptomatic</li>
</ul>

In [None]:
# Bar chart of record counts for each chest-pain category in the `cp` column.
cp_type, count = np.unique(data['cp'], return_counts=True)

# Legend names are listed in the order of the category codes 0..3.
cp_labels = ["Typical angina", "Atypical angina", "Non-anginal pain", "Asymptomatic"]
for idx, label in enumerate(cp_labels):
    plt.bar(str(cp_type[idx]), count[idx], label=label)

# A single legend call is enough; the second call was redundant.
plt.legend()
plt.xlabel("CP Type")
plt.ylabel("Count")
plt.show()

<h2>Resting Blood Pressure</h2>

In [None]:
# Report the lowest and highest resting blood pressure in the `trtbps` column.
resting_bp = data['trtbps']
print("Resting Blood Pressure range : [", resting_bp.min(), ",", resting_bp.max(), "]")

<h2>Cholesterol</h2>

In [None]:
# Report the lowest and highest value found in the `chol` column.
chol_values = data['chol']
print("Cholestrol range : [", chol_values.min(), ",", chol_values.max(), "]")

In [None]:
# Bar chart of record counts on each side of the fasting-blood-sugar threshold.
fbs, count = np.unique(data['fbs'], return_counts=True)

# One bar per code, labelled in code order.
for position, legend_text in enumerate(('< 120', '> 120')):
    plt.bar(str(fbs[position]), count[position], label=legend_text)

plt.legend()
plt.xlabel("Fasting Blood Sugar Threshold")
plt.ylabel("Count")
plt.show()

<h2>Resting Electrocardiograph</h2>
<ul>
    <li>Value 0: normal</li>
    <li>Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)</li>
    <li>Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria</li>
 </ul>

In [None]:
# Bar chart of record counts for each resting-ECG category (`restecg` column).
restecg, count = np.unique(data['restecg'], return_counts=True)

# The legend simply repeats the numeric category codes.
for position, legend_text in enumerate(('0', '1', '2')):
    plt.bar(str(restecg[position]), count[position], label=legend_text)

plt.legend()
plt.xlabel("Resting ECG Category")
plt.ylabel("Count")
plt.show()

<h2>Maximum Heart Rate Achieved</h2>

In [None]:
# Report the lowest and highest value found in the `thalachh` column.
max_heart_rate = data['thalachh']
print("Maximum Heart Rate Achieved range : [", max_heart_rate.min(), ",", max_heart_rate.max(), "]")

<h2>Exercise Induced Angina</h2>

In [None]:
# Bar chart of record counts with and without exercise-induced angina (`exng`).
exng, count = np.unique(data['exng'], return_counts=True)

# The lower code is charted as False, the higher one as True.
for position, legend_text in enumerate(('False', 'True')):
    plt.bar(str(exng[position]), count[position], label=legend_text)

plt.legend()
plt.xlabel("Exercise Induced Angina")
plt.ylabel("Count")
plt.show()

<h2>Number of Major Vessels</h2>


In [None]:
# Bar chart of record counts for each value of the `caa` column
# (number of major vessels).
caa, count = np.unique(data['caa'], return_counts=True)

for idx, label in enumerate(['0', '1', '2', '3']):
    plt.bar(str(caa[idx]), count[idx], label=label)

# The bars carry labels, so show the legend like every other chart cell does.
plt.legend()
plt.xlabel("Number of Major Vessels")
plt.ylabel("Count")
plt.show()

<h2>Sloping</h2>

In [None]:
# Bar chart of record counts for each slope category (`slp` column).
slp, count = np.unique(data['slp'], return_counts=True)

# Legend names are listed in the order of the category codes.
for position, legend_text in enumerate(('Downsloping', 'Flat', 'Upsloping')):
    plt.bar(str(slp[position]), count[position], label=legend_text)

plt.legend()
plt.xlabel("Sloping Category")
plt.ylabel("Count")
plt.show()

<h2>Old Peak</h2>


In [None]:
# Report the lowest and highest value found in the `oldpeak` column.
oldpeak_values = data['oldpeak']
print("Old peak value range : [", oldpeak_values.min(), ",", oldpeak_values.max(), "]")

<h2>Thal - 2</h2>

In [None]:
# Bar chart of record counts for each category of the `thall` column.
thal2, count = np.unique(data['thall'], return_counts=True)
print(thal2, count)

# Position and label each bar with the thall category it actually represents.
# Reusing the slope codes and slope names here would mislabel the chart, and
# would make this cell depend on the slope cell having been run first.
for category, n_records in zip(thal2, count):
    plt.bar(str(category), n_records, label=str(category))

plt.legend()
plt.xlabel("Thal Category")
plt.ylabel("Count")
plt.show()

<h2>Output Class Balancing</h2>

In [None]:
# Bar chart showing how the records are split between the two `output` classes.
op, count = np.unique(data['output'], return_counts=True)

# The lower class code is charted as "No Risk", the higher one as "Risk".
for position, legend_text in enumerate(('No Risk', 'Risk')):
    plt.bar(str(op[position]), count[position], label=legend_text)

plt.legend()
plt.xlabel("Risk Category")
plt.ylabel("Count")
plt.show()

In [None]:
# Standardise every column except the last one
# (presumably the `output` target -- confirm against the CSV column order).
# Note: the scaler is fitted on all rows of `data`.
feature_frame = data[data.columns[:-1]]
scaler = StandardScaler()
mdata = scaler.fit_transform(feature_frame)
mdata

In [None]:
# Hold out 20% of the rows for testing, then standardise the features using
# statistics learned from the training rows only. Fitting the scaler on all
# rows before splitting would let the test rows influence the preprocessing
# (data leakage) and make the test metrics optimistic.
# The row assignment is the same as splitting pre-scaled data, because the
# split depends only on the number of rows and on random_state.
feature_frame = data[data.columns[:-1]]  # every column except the last one
train_x_raw, test_x_raw, train_y, test_y = train_test_split(feature_frame,
                                                            data['output'],
                                                            test_size=0.2,
                                                            random_state=4)

scaler = StandardScaler().fit(train_x_raw)
train_x = scaler.transform(train_x_raw)
test_x = scaler.transform(test_x_raw)

<h2>Building model and Training</h2>
<ul>
    <li>Decision Tree</li>
    <li>Random Forest</li>
    <li>Gradient Boosting</li>
</ul>

In [None]:
# Train a decision tree classifier and evaluate it on both splits.
tr = tree.DecisionTreeClassifier(max_depth=13)
tr = tr.fit(train_x, train_y)

train_pred = tr.predict(train_x)
print("Train Accuracy : {:.2f}".format(metrics.accuracy_score(train_y, train_pred)))
y_pred = tr.predict(test_x)
print("Test Accuracy : {:.2f}".format(metrics.accuracy_score(test_y, y_pred)))

# Precision, recall and F1 on the test split
print("Precision : {:.2f}".format(metrics.precision_score(test_y, y_pred)))
print("Recall : {:.2f}".format(metrics.recall_score(test_y, y_pred)))
print("F1 Score : {:.2f}".format(metrics.f1_score(test_y, y_pred)))

# Confusion matrix on the test split. metrics.plot_confusion_matrix has been
# removed from scikit-learn, so the matrix is computed explicitly and drawn
# with ConfusionMatrixDisplay. labels=tr.classes_ keeps both classes in the
# matrix even if one of them is absent from the test predictions.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(test_y, y_pred, labels=tr.classes_),
    display_labels=["No risk of Heart Attack", "Risk of Heart Attack"],
).plot()
plt.show()

# Draw the fitted tree. The return value of plot_tree is stored under its own
# name so that `tr` keeps referring to the fitted classifier, and only the
# feature columns (everything but the last column) are passed as names so
# they line up with the columns the model was trained on.
fig = plt.figure(figsize=(40, 20))
tree_artists = tree.plot_tree(tr,
                              filled=True,
                              fontsize=10,
                              feature_names=list(data.columns[:-1]))
plt.show()

In [None]:
# Train a random forest classifier and evaluate it on both splits.
rf = RandomForestClassifier(n_estimators=100, max_depth=5)
rf.fit(train_x, train_y)

train_pred = rf.predict(train_x)
print("Train Accuracy : {:.2f}".format(metrics.accuracy_score(train_y, train_pred)))
y_pred = rf.predict(test_x)
print("Test Accuracy : {:.2f}".format(metrics.accuracy_score(test_y, y_pred)))

# Precision, recall and F1 on the test split
print("Precision : {:.2f}".format(metrics.precision_score(test_y, y_pred)))
print("Recall : {:.2f}".format(metrics.recall_score(test_y, y_pred)))
print("F1 Score : {:.2f}".format(metrics.f1_score(test_y, y_pred)))

# Confusion matrix on the test split. metrics.plot_confusion_matrix has been
# removed from scikit-learn, so the matrix is computed explicitly and drawn
# with ConfusionMatrixDisplay. labels=rf.classes_ keeps both classes in the
# matrix even if one of them is absent from the test predictions.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(test_y, y_pred, labels=rf.classes_),
    display_labels=["No risk of Heart Attack", "Risk of Heart Attack"],
).plot()
plt.show()

In [None]:
# Train a gradient boosting classifier and evaluate it on both splits.
gbc = GradientBoostingClassifier(n_estimators=50, max_depth=2)
gbc.fit(train_x, train_y)

train_pred = gbc.predict(train_x)
print("Train Accuracy : {:.2f}".format(metrics.accuracy_score(train_y, train_pred)))
y_pred = gbc.predict(test_x)
print("Test Accuracy : {:.2f}".format(metrics.accuracy_score(test_y, y_pred)))

# Precision, recall and F1 on the test split
print("Precision : {:.2f}".format(metrics.precision_score(test_y, y_pred)))
print("Recall : {:.2f}".format(metrics.recall_score(test_y, y_pred)))
print("F1 Score : {:.2f}".format(metrics.f1_score(test_y, y_pred)))

# Confusion matrix on the test split. metrics.plot_confusion_matrix has been
# removed from scikit-learn, so the matrix is computed explicitly and drawn
# with ConfusionMatrixDisplay. labels=gbc.classes_ keeps both classes in the
# matrix even if one of them is absent from the test predictions.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(test_y, y_pred, labels=gbc.classes_),
    display_labels=["No risk of Heart Attack", "Risk of Heart Attack"],
).plot()
plt.show()

In [None]:
# Train a logistic regression classifier and evaluate it on both splits.
lr = LogisticRegression(solver='liblinear')
lr.fit(train_x, train_y)

train_pred = lr.predict(train_x)
print("Train Accuracy : {:.2f}".format(metrics.accuracy_score(train_y, train_pred)))
y_pred = lr.predict(test_x)
print("Test Accuracy : {:.2f}".format(metrics.accuracy_score(test_y, y_pred)))

# Precision, recall and F1 on the test split
print("Precision : {:.2f}".format(metrics.precision_score(test_y, y_pred)))
print("Recall : {:.2f}".format(metrics.recall_score(test_y, y_pred)))
print("F1 Score : {:.2f}".format(metrics.f1_score(test_y, y_pred)))

# Confusion matrix on the test split. metrics.plot_confusion_matrix has been
# removed from scikit-learn, so the matrix is computed explicitly and drawn
# with ConfusionMatrixDisplay. labels=lr.classes_ keeps both classes in the
# matrix even if one of them is absent from the test predictions.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(test_y, y_pred, labels=lr.classes_),
    display_labels=["No risk of Heart Attack", "Risk of Heart Attack"],
).plot()
plt.show()

<h2>Conclusion</h2>
<p>Gradient Boosting achieves the best fit among all of the algorithms tried above.</p>
<table>
    <tr>
        <th>Model</th>
        <th>Train Acc</th>
        <th>Test Acc</th>
        <th>Precision</th>
        <th>Recall</th>
        <th>F1 Score</th>
    </tr>
    <tr>
        <td>Decision Tree</td>
        <td>1.0</td>
        <td>0.85</td>
        <td>0.88</td>
        <td>0.86</td>
        <td>0.87</td>
    </tr>
     <tr>
        <td>Random Forest</td>
        <td>0.94</td>
        <td>0.90</td>
        <td>0.89</td>
        <td>0.94</td>
        <td>0.92</td>
    </tr>
     <tr>
        <td>Gradient Boosting</td>
        <td>0.92</td>
        <td>0.92</td>
        <td>0.92</td>
        <td>0.94</td>
        <td>0.93</td>
    </tr>
    <tr>
        <td>Logistic Regression</td>
        <td>0.86</td>
        <td>0.90</td>
        <td>0.91</td>
        <td>0.91</td>
        <td>0.91</td>
    </tr>
 </table>