Import libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Read data from Kaggle


In [None]:
# Load the heart-attack CSV into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/heart-attack-analysis-prediction-dataset/heart.csv"
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Summary statistics of the columns, as computed by DataFrame.describe().
data.describe()

Checking for duplicate rows

In [None]:
# Collect every row that repeats an earlier row and report the shape of that subset.
duplicate_datarow = data.loc[data.duplicated(keep="first")]
print("Duplicate rows: ", duplicate_datarow.shape)

In [None]:
# Remove repeated rows, keeping the first occurrence of each.
data = data.drop_duplicates(keep="first")

In [None]:
# Run the duplicate check again on the de-duplicated frame.
duplicate_datarow = data.loc[data.duplicated(keep="first")]
print("Duplicate rows: ", duplicate_datarow.shape)

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

In [None]:
# Per column: is there at least one missing value?
data.isna().any()

In [None]:
# Per column: how many values are missing?
data.isna().sum()

**Basic descriptive analysis**

In [None]:
# Same summary statistics, transposed so each column of `data` becomes a row.
data.describe().T

**Exploratory data analysis**

In [None]:
# Bar chart of how many rows fall into each value of the "output" column.
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(x=data["output"], color="red", alpha=0.4, ax=ax)

In [None]:
# Mean, minimum and maximum of the "age" column, one per line.
age_column = data["age"]
print(age_column.mean())
print(age_column.min())
print(age_column.max())

In [None]:
# Distribution of the "age" column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat="density"` is its documented replacement and draws the same kind of figure.
plt.figure(figsize=(12, 7))
sns.histplot(x=data["age"], bins=15, color="red", kde=True, stat="density")

In [None]:
# Joint distribution of "age" against "chol", coloured by the "output" column.
sns.jointplot(data=data, x="age", y="chol", hue="output")

**Removing outliers**

In [None]:
# Keep only the rows whose "chol" value is below 380.
data = data.loc[data["chol"] < 380]

In [None]:
# Largest remaining "chol" value, to confirm the filter applied.
data["chol"].max()

In [None]:
# Count the "sex" values separately within each "output" group.
gender_gb = data.groupby("output")["sex"]
gender_gb.value_counts()

In [None]:
# Keep only the rows whose "trtbps" value is below 180.
data = data.loc[data["trtbps"] < 180]

In [None]:
# Largest remaining "trtbps" value, to confirm the filter applied.
data["trtbps"].max()

In [None]:
# Keep only the rows whose "thalachh" value is above 100.
data = data.loc[data["thalachh"] > 100]

In [None]:
# Smallest remaining "thalachh" value, to confirm the filter applied.
data["thalachh"].min()

In [None]:
# Percentage of rows whose "restecg" value is 0.
normal_row_count = int((data["restecg"] == 0).sum())
ekg_normal = normal_row_count / len(data["restecg"]) * 100
ekg_normal

In [None]:
# Percentage of rows for each "restecg" code (0, 1 and 2), then a labelled printout.
restecg_total = len(data["restecg"])
ekg_normal, ekg1, ekg2 = (
    len(data[data["restecg"] == code]) / restecg_total * 100
    for code in (0, 1, 2)
)

print("Abnormal EKG: {:.2f}".format(ekg1))
print("\n")
print("Hypertrophy by Estes: {:.2f}".format(ekg2))
print("\n")
print("Normal EKG: {:.2f}".format(ekg_normal))

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between all columns.
corr_matrix = data.corr(method="pearson")
fig, ax = plt.subplots(figsize=(18, 7))
sns.heatmap(corr_matrix, cmap="PuRd", annot=True, lw=0.1, ax=ax)

In [None]:
# Collapse the "caa" values 2, 3 and 4 into 1, then show the resulting value counts.
# `data` is a filtered slice of the original frame at this point, so writing to a
# column with `data["caa"] = ...` triggers pandas' SettingWithCopyWarning.
# `assign` returns a new frame with the column replaced and avoids the warning.
data = data.assign(caa=data["caa"].replace([2, 3, 4], 1))
data["caa"].value_counts()

In [None]:
# Smallest "oldpeak" value; checked before the log transform applied further down.
data["oldpeak"].min()

In [None]:
# Replace exact zeros in "oldpeak" with 0.01 so that a later log10 is finite,
# then show the resulting value counts.
# `assign` is used instead of `data["oldpeak"] = ...` because `data` is a filtered
# slice here and direct column assignment triggers SettingWithCopyWarning.
data = data.assign(oldpeak=data["oldpeak"].replace(0.0, 0.01))
data["oldpeak"].value_counts()

In [None]:
# Add "log_oldpeak" = log10("oldpeak") as a new trailing column.
# `assign` returns a new frame rather than writing into the filtered slice held in
# `data`, which would trigger SettingWithCopyWarning.
data = data.assign(log_oldpeak=np.log10(data["oldpeak"]))

In [None]:
# The raw "oldpeak" column is dropped now that "log_oldpeak" exists.
data = data.drop(columns="oldpeak")

In [None]:
# Correlation of every column with "output" (DataFrame.corr default method).
data.corr()["output"]

In [None]:
# Drop the columns that are excluded from the feature set used for modelling.
data = data.drop(columns=["trtbps", "chol", "restecg", "fbs", "age", "sex"])

In [None]:
# Correlation of each remaining column with "output".
data.corr().loc[:, "output"]

In [None]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

Creating the X and y variables for training

In [None]:
# Split the frame into the target column (y) and all remaining columns (X).
target_column = "output"
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

Decision Tree Prediction

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree capped at depth 15, with a fixed seed.
tree_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=15,
)

In [None]:
# Fit the decision tree on the training split.
tree_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
tree_predict = tree_model.predict(X=X_test)

In [None]:
# Display the decision tree's predictions for the test split.
tree_predict

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for the decision tree.
tree_report = classification_report(y_test, tree_predict)
tree_confusion = confusion_matrix(y_test, tree_predict)
print(tree_report)
print("Confusion Report:")
print(tree_confusion)

Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest: 80 bootstrapped trees, entropy split criterion, fixed seed.
random_model = RandomForestClassifier(
    n_estimators=80,
    criterion="entropy",
    bootstrap=True,
    random_state=42,
)

In [None]:
# Fit the random forest on the training split.
random_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
random_predict = random_model.predict(X_test)

# Report the random forest the same way as the other models. It was the only model
# without an evaluation, even though the summary table lists scores for it.
# `classification_report` and `confusion_matrix` are imported in the decision-tree section.
print(classification_report(y_test, random_predict))
print("Confusion Report:")
print(confusion_matrix(y_test, random_predict))

Predictive model: Linear model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression using the liblinear solver, with a fixed seed.
log_model = LogisticRegression(
    random_state=42,
    solver="liblinear",
)

In [None]:
# Fit the logistic regression on the training split.
log_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
log_predict = log_model.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for logistic regression.
log_report = classification_report(y_test, log_predict)
log_confusion = confusion_matrix(y_test, log_predict)
print(log_report)
print("Confusion Report:")
print(log_confusion)

Predictive model: Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes classifier with default settings.
gaussian_model = GaussianNB()

In [None]:
# Fit the Naive Bayes model on the training split.
gaussian_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
gaussian_predict = gaussian_model.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix for Gaussian Naive Bayes.
gaussian_report = classification_report(y_test, gaussian_predict)
gaussian_confusion = confusion_matrix(y_test, gaussian_predict)
print(gaussian_report)
print("Confusion Report:")
print(gaussian_confusion)

Report for all algorithms

In [None]:
# Build the model comparison table from the predictions made above, rather than
# from hand-typed numbers that go stale whenever an upstream cell changes.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

predictions_by_model = {
    "GaussianNB": gaussian_predict,
    "Random Forest": random_predict,
    "DecisionTreeClassifier": tree_predict,
    "LogisticRegression": log_predict,
}

report = []
for model_name, predicted in predictions_by_model.items():
    # Precision, recall and F1 are macro-averaged over the classes.
    # NOTE(review): the averaging behind the previously hard-coded values is not
    # recorded in the notebook — switch `average` if another one was intended.
    precision, recall, f1 = precision_recall_fscore_support(
        y_test, predicted, average="macro"
    )[:3]
    report.append(
        [model_name, accuracy_score(y_test, predicted), precision, recall, f1]
    )

# Same columns as before; values are rounded to two decimals for display.
overall_result = pd.DataFrame(
    report,
    columns=["Model", "Accuracy Score", "Precision", "Recall", "F1-score"],
).round(2)
overall_result.sort_values("Accuracy Score", ascending=False)