In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in map(lambda name: os.path.join(folder, name), names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

About this dataset

Age : Age of the patient

Sex : Sex of the patient

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : Chest pain type

Value 0: typical angina
Value 1: atypical angina
Value 2: non-anginal pain
Value 3: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results
    Value 0: normal
    Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalach : maximum heart rate achieved

target (column `output` in the data) : 0 = less chance of heart attack, 1 = more chance of heart attack


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the heart-attack dataset CSV inside the Kaggle input mount.
file_path = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
# Read the CSV into a DataFrame and preview its first 20 rows.
heart_data = pd.read_csv(filepath_or_buffer=file_path)
heart_data.head(n=20)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The original index labels are preserved (no reset).
heart_data = heart_data.drop_duplicates(keep="first")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
heart_data.describe()

In [None]:
# Count of patients per sex, split by the `output` label.
print("Distribution on the basis of sex")
plot_1 = sns.countplot(
    data=heart_data,
    x='sex',
    hue="output",
    # Fix the bar order to the sorted category values actually present.
    order=sorted(heart_data["sex"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()


The plot shows that the individuals with sex as 0 are more prone to heart attacks as compared to individuals with sex as 1.

In [None]:
# Count of patients per chest-pain type, split by the `output` label.
print("Distribution on the basis of chest pain type")
plot_2 = sns.countplot(
    data=heart_data,
    x='cp',
    hue="output",
    # Fix the bar order to the sorted category values actually present.
    order=sorted(heart_data["cp"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

cp : Chest pain type

**Value 0: typical angina , 
Value 1: atypical angina , 
Value 2: non-anginal pain , 
Value 3: asymptomatic**

As observed from the plot, a value of 0 (typical angina) is associated with a relatively lower chance of heart attack than atypical angina, non-anginal pain, or the asymptomatic condition.

In [None]:
# Count of patients per number of major vessels (`caa`), split by the `output` label.
print("Distribution on the basis of number of blood vessels")
plot_3 = sns.countplot(
    data=heart_data,
    x='caa',
    hue="output",
    # Fix the bar order to the sorted category values actually present.
    order=sorted(heart_data["caa"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

As the number of major blood vessels increases, the chance of a heart attack decreases. There is a clear inverse relationship between this feature and the target.

In [None]:
# Count of patients per resting ECG result, split by the `output` label.
print("Distribution on the basis of resting electrocardiographic results ")
plot_4 = sns.countplot(
    data=heart_data,
    x='restecg',
    hue="output",
    # Fix the bar order to the sorted category values actually present.
    order=sorted(heart_data["restecg"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# Count of patients per fasting-blood-sugar flag (`fbs`), split by the `output` label.
print("Distribution on the basis of fasting blood sugar > 120 mg/dl")
plot_5 = sns.countplot(
    data=heart_data,
    x='fbs',
    hue="output",
    # Fix the bar order to the sorted category values actually present.
    order=sorted(heart_data["fbs"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# thalachh - maximum heart rate achieved.
# Author's observation: higher maximum heart rate -> greater chance of being prone.
# np.linspace(0, 1, 10) produces 10 quantile edges, i.e. 9 equal-frequency buckets;
# labels=False stores the integer bucket index rather than an interval label.
heart_data['thalachh_bucket'] = pd.qcut(
    heart_data['thalachh'],
    q=np.linspace(0, 1, 10).tolist(),
    labels=False,
)
print("Distribution on the basis of maximum heart rate")
plot_6 = sns.countplot(
    data=heart_data,
    x='thalachh_bucket',
    hue="output",
    # Fix the bar order to the sorted bucket indices actually present.
    order=sorted(heart_data["thalachh_bucket"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# Author's observation: the maximum heart rate for a prone person is centred around 150,
# which is expected as 120-140 is considered the limit for normal condition.
# Split the maximum heart rate values by the `output` label.
prone_data = heart_data.loc[heart_data["output"] == 1, 'thalachh']
notprone_data = heart_data.loc[heart_data["output"] == 0, 'thalachh']

fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay (cut=3 matches distplot's curve extent) is the documented replacement.
sns.histplot(prone_data, kde=True, stat="density", kde_kws={"cut": 3}, alpha=0.4, label="prone", ax=ax)
sns.histplot(notprone_data, kde=True, stat="density", kde_kws={"cut": 3}, alpha=0.4, label="not prone", ax=ax)
ax.legend()
plt.show()

In [None]:
# Bin resting blood pressure into 5 equal-width intervals; labels=False stores
# the integer bin index rather than an interval label.
heart_data["bp_scale"] = pd.cut(heart_data['trtbps'], bins=5, labels=False)
print("Distribution on the basis of bp_scale")
plot_7 = sns.countplot(
    data=heart_data,
    x='bp_scale',
    hue="output",
    # Fix the bar order to the sorted bin indices actually present.
    order=sorted(heart_data["bp_scale"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# Bin cholesterol by quantile: np.linspace(0, 1, 10) produces 10 quantile edges,
# i.e. 9 equal-frequency buckets; labels=False stores the integer bucket index.
heart_data["chol_scale"] = pd.qcut(
    heart_data['chol'],
    q=np.linspace(0, 1, 10).tolist(),
    labels=False,
)
print("Distribution on the basis of chol_scale")
plot_8 = sns.countplot(
    data=heart_data,
    x='chol_scale',
    hue="output",
    # Fix the bar order to the sorted bucket indices actually present.
    order=sorted(heart_data["chol_scale"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# Split the cholesterol values by the `output` label.
prone_data = heart_data.loc[heart_data["output"] == 1, 'chol']
notprone_data = heart_data.loc[heart_data["output"] == 0, 'chol']

fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay (cut=3 matches distplot's curve extent) is the documented replacement.
sns.histplot(prone_data, kde=True, stat="density", kde_kws={"cut": 3}, alpha=0.4, label="prone", ax=ax)
sns.histplot(notprone_data, kde=True, stat="density", kde_kws={"cut": 3}, alpha=0.4, label="not prone", ax=ax)
ax.legend()
plt.show()

In [None]:
# Bin age into 10 equal-width intervals; labels=False stores the integer bin
# index rather than an interval label.
heart_data["age_scale"] = pd.cut(heart_data['age'], bins=10, labels=False)
# The printed title previously said "chol_scale" although this cell plots age_scale.
print("Distribution on the basis of age_scale")
# Use a fresh name so the cholesterol plot handle (plot_8) is not overwritten.
plot_9 = sns.countplot(
    data=heart_data,
    x='age_scale',
    hue="output",
    # Fix the bar order to the sorted bin indices actually present.
    order=sorted(heart_data["age_scale"].value_counts().index),
)
# Legend labels are applied in hue order; presumably output 0 -> 'Not Prone', 1 -> 'Prone'.
plt.legend(title='Condition', loc='upper right', labels=['Not Prone', 'Prone'])
# plt.show() only accepts the keyword `block`; passing the Axes positionally
# raises a TypeError on non-inline backends, so call it without arguments.
plt.show()

In [None]:
# Show the current column names of heart_data (raw columns plus the bucket/scale columns added above).
heart_data.columns

In [None]:
# Categorical columns to expand into one indicator column per category value.
onehotencode = ["cp", "restecg", "thall", "slp"]
# get_dummies accepts a {column: prefix} mapping; each column uses its own name
# as prefix, yielding names such as "cp_0", "restecg_1", ...
onehotencode_dict = {column: column for column in onehotencode}
# Encode ONLY the categorical columns. Encoding the whole frame and then
# concatenating it back onto heart_data duplicated every non-encoded column.
ohe_dummies = pd.get_dummies(
    heart_data[onehotencode],
    prefix=onehotencode_dict,
    columns=onehotencode,
)
# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not stack duplicate columns; the raw categorical columns
# are kept alongside their indicators.
heart_data = pd.concat(
    [heart_data.drop(columns=ohe_dummies.columns, errors="ignore"), ohe_dummies],
    axis=1,
)

In [None]:
# Show the column names after one-hot encoding to confirm the indicator columns exist.
heart_data.columns

In [None]:
# NOTE(review): this cell repeats the preceding column listing and could be removed.
heart_data.columns

In [None]:
# Columns handed to the model, assembled group by group. The label column
# "output" is deliberately kept last so it travels with the rows through the
# train/test split.
_raw_columns = ["age", "sex", 'fbs', 'exng', 'oldpeak', 'caa']
_binned_columns = ['thalachh_bucket', 'bp_scale', 'chol_scale']
_indicator_columns = (
    ['cp_%d' % level for level in range(4)]
    + ['restecg_%d' % level for level in range(3)]
    + ['thall_%d' % level for level in range(4)]
    + ['slp_%d' % level for level in range(3)]
)
features = _raw_columns + _binned_columns + _indicator_columns + ['output']

In [None]:
# Restrict the frame to the modelling columns (inputs plus the "output" label).
model_data = heart_data.loc[:, features]

In [None]:
# Keep only the first occurrence of each column name; later repeats are dropped.
repeated_column_mask = model_data.columns.duplicated()
model_data = model_data.loc[:, ~repeated_column_mask]

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 split on the "output" label. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across re-runs.
train_df, test_df = train_test_split(
    model_data,
    test_size=0.25,
    shuffle=True,
    stratify=model_data["output"],
    random_state=42,
)


In [None]:
# Display the (rows, columns) shape of the training and test frames.
train_df.shape, test_df.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 300 gini trees capped at depth 10, with class weights recomputed on each
# bootstrap sample ("balanced_subsample"). A fixed random_state makes the
# bootstrapping and feature sampling, and hence the fitted forest, reproducible.
rf = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=10,
    n_jobs=1,
    verbose=1,
    class_weight="balanced_subsample",
    random_state=42,
)

In [None]:
# Fit the forest on every training column except the label, using "output" as the target.
rf.fit(train_df.drop("output", axis=1), train_df.loc[:, "output"])

In [None]:
# Columns that are not model inputs. errors="ignore" lets this cell be re-run
# after the prediction columns already exist, without feeding them to the model.
non_feature_columns = ["output", "prediction", "prediction_label"]
test_features = test_df.drop(columns=non_feature_columns, errors="ignore")

# Compute the class probabilities once and reuse them for both derived columns.
class_probabilities = rf.predict_proba(test_features)

# assign() returns a new frame, avoiding SettingWithCopyWarning on the frame
# returned by train_test_split.
test_df = test_df.assign(
    # Second probability column, i.e. the probability of rf.classes_[1].
    prediction=class_probabilities[:, 1],
    # Map the most probable column index back to its class label.
    prediction_label=rf.classes_[np.argmax(class_probabilities, axis=1)],
)


In [None]:
# Display predicted probability, predicted label and true label side by side for the test rows.
test_df[["prediction","prediction_label","output"]]

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). The arguments were previously
# swapped, which exchanges precision with recall in the printed report.
print(classification_report(test_df["output"], test_df["prediction_label"]))