
# **Heart Attack Analysis & Prediction Dataset**


* age - age in years

* sex - sex (1 = male; 0 = female)

* cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)

* trtbps - resting blood pressure (in mm Hg on admission to the hospital)

* chol - serum cholesterol in mg/dl

* fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

* restecg - resting electrocardiographic results (0 = normal; 1 = having ST-T wave abnormality; 2 = showing probable or definite left ventricular hypertrophy)

* thalachh - maximum heart rate achieved

* exang - exercise induced angina (1 = yes; 0 = no)

* oldpeak - ST depression induced by exercise relative to rest

* slope - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)

* ca - number of major vessels (0-3) colored by fluoroscopy

* thal (thallium stress test result) - 0 to 3

* output - 0 (less chance of heart attack) or 1 (more chance of heart attack)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read heart.csv into a DataFrame and preview its first rows.
df=pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the data as loaded, before duplicates or outliers are removed.
df.shape

In [None]:
# Number of rows in each 'output' class, to check how balanced the target is.
print(df.groupby('output').size())
# Disabled alternative: the same counts as a bar chart.
#sns.countplot(df['output'],label="Count")

In [None]:
# Count missing (null/NaN) values in each column.
df.isnull().sum()

In [None]:
# Count cells exactly equal to 0 in each column.
df.eq(0).sum()

In [None]:
# Flag every row that repeats an earlier row, then report how many were flagged.
dups = df.duplicated()
duplicate_count = dups.sum()
print("Duplicated Lines:%d" % duplicate_count)

In [None]:
# Discard repeated rows (keeping the first occurrence), then show the new (rows, columns).
df = df.drop_duplicates()
df.shape

## Outliers

In [None]:
# Box plot of the 'trtbps' column to eyeball outliers.
sns.boxplot(data=df, x='trtbps')

# Quartiles and inter-quartile range of 'trtbps'.
Q1, Q3 = (df['trtbps'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR, sep="\n")

# Whisker limits sit 1.5 * IQR beyond the quartiles; the next cell filters on them.
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
print(Lower_Whisker, Upper_Whisker)

In [None]:
# Keep only rows whose 'trtbps' is strictly below the upper whisker, then re-plot.
df = df.loc[df['trtbps'].lt(Upper_Whisker)]
sns.boxplot(data=df, x='trtbps')

In [None]:
# Box plot of the 'chol' column to eyeball outliers.
sns.boxplot(data=df, x='chol')

# Quartiles and inter-quartile range of 'chol'.
Q1, Q3 = (df['chol'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR, sep="\n")

# Whisker limits sit 1.5 * IQR beyond the quartiles; the next cell filters on them.
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
print(Lower_Whisker, Upper_Whisker)

In [None]:
# Keep only rows whose 'chol' is strictly below the upper whisker, then re-plot.
df = df.loc[df['chol'].lt(Upper_Whisker)]
sns.boxplot(data=df, x='chol')

In [None]:
# Box plot of the 'thalachh' column to eyeball outliers.
sns.boxplot(data=df, x='thalachh')

# Quartiles and inter-quartile range of 'thalachh'.
Q1, Q3 = (df['thalachh'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR, sep="\n")

# Whisker limits sit 1.5 * IQR beyond the quartiles; the next cell filters on them.
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
print(Lower_Whisker, Upper_Whisker)

In [None]:
# Keep only rows whose 'thalachh' is strictly above the lower whisker, then re-plot.
df = df.loc[df['thalachh'].gt(Lower_Whisker)]
sns.boxplot(data=df, x='thalachh')

In [None]:
# Box plot of the 'oldpeak' column to eyeball outliers.
sns.boxplot(data=df, x='oldpeak')

# Quartiles and inter-quartile range of 'oldpeak'.
Q1, Q3 = (df['oldpeak'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR, sep="\n")

# Whisker limits sit 1.5 * IQR beyond the quartiles; the next cell filters on them.
Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR
print(Lower_Whisker, Upper_Whisker)

In [None]:
# Keep only rows whose 'oldpeak' is strictly below the upper whisker, then re-plot.
df = df.loc[df['oldpeak'].lt(Upper_Whisker)]
sns.boxplot(data=df, x='oldpeak')

In [None]:
# (rows, columns) remaining after duplicate and outlier removal.
df.shape

In [None]:
# Pairwise correlation between all columns (pandas defaults to Pearson).
df.corr()

In [None]:
# Heat map of the full correlation matrix.
# The diverging red-blue palette was previously defined but never handed to
# seaborn, so the plot silently fell back to the default colormap.
colormap = plt.cm.RdBu
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), cmap=colormap)
plt.show()

In [None]:
# Correlation of every column with 'output', largest first, drawn as a one-column heat map.
target_corr = df.corr()['output']
Output = target_corr.sort_values(ascending=False).to_frame()
sns.heatmap(Output)

## Train/Test Split and Standard Scaling

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the last column,
# kept two-dimensional (a one-column DataFrame).
X, y = df.iloc[:, :-1], df.iloc[:, -1:]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=20,
    test_size=0.3,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/variance from the training split only, then apply the same
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_raw = scaler.transform(X_train)
X_test_raw = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames, restoring column names and row index.
X_train = pd.DataFrame(data=X_train_raw, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_raw, index=X_test.index, columns=X_test.columns)


## Modeling and Prediction

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix ,classification_report,accuracy_score,recall_score,precision_score,f1_score

## Logistic Regression

In [None]:
# Fit logistic regression on the standardized training features.
# y_train is a one-column DataFrame; ravel() flattens it to the 1-D array
# that scikit-learn expects for the target.
lr = LogisticRegression(solver='lbfgs', random_state=66, C=1)
lr.fit(X_train, y_train.values.ravel())

# Mean accuracy on each split; a large gap between them would indicate over-fitting.
print("Training set accuracy: {:.5f}".format(lr.score(X_train, y_train)))
print("Test set accuracy    : {:.5f}".format(lr.score(X_test, y_test)))

In [None]:
# Predict classes for the held-out rows and tabulate true vs. predicted labels.
y_pred_lr = lr.predict(X_test)
confusion_matrix(y_test, y_pred_lr)

In [None]:
# Report each test-set metric for the logistic-regression predictions.
for template, scorer in (
    ("Model Accuracy      : {:.5f}", accuracy_score),
    ("Recall Score        : {:.5f}", recall_score),
    ("Precision Score     : {:.5f}", precision_score),
    ("F1 Score            : {:.5f}", f1_score),
):
    print(template.format(scorer(y_test, y_pred_lr)))

## Support Vector Machine

In [None]:
# Fit a support vector classifier on the standardized training features
# (the target is flattened to 1-D for scikit-learn).
svc = SVC(random_state=66, gamma='scale').fit(X_train, y_train.values.ravel())

# Mean accuracy on each split.
train_accuracy = svc.score(X_train, y_train)
test_accuracy = svc.score(X_test, y_test)
print("Training set accuracy: {:.5f}".format(train_accuracy))
print("Test set accuracy    : {:.5f}".format(test_accuracy))

In [None]:
# Predict classes for the held-out rows and tabulate true vs. predicted labels.
y_pred_svm = svc.predict(X_test)
confusion_matrix(y_test, y_pred_svm)

In [None]:
# Report each test-set metric for the SVM predictions.
for template, scorer in (
    ("Model Accuracy      : {:.5f}", accuracy_score),
    ("Recall Score        : {:.5f}", recall_score),
    ("Precision Score     : {:.5f}", precision_score),
    ("F1 Score            : {:.5f}", f1_score),
):
    print(template.format(scorer(y_test, y_pred_svm)))

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve is traced by sweeping a decision threshold over continuous
# scores. Feeding it hard 0/1 predictions collapses the curve to a single
# operating point and understates the AUC, so score the test rows with the
# positive-class probability (logistic regression) and the signed distance
# to the separating surface (SVM) instead.
y_true_roc = y_test.values.ravel()
lr_scores = lr.predict_proba(X_test)[:, 1]
svm_scores = svc.decision_function(X_test)

logistic_fpr, logistic_tpr, threshold = roc_curve(y_true_roc, lr_scores)
auc_logistic = auc(logistic_fpr, logistic_tpr)

# NOTE: `threshold` is reused, so afterwards it holds the SVM thresholds only.
svm_fpr, svm_tpr, threshold = roc_curve(y_true_roc, svm_scores)
auc_svm = auc(svm_fpr, svm_tpr)

plt.figure(figsize=(5, 5), dpi=100)

# Legend labels are Turkish: "Logistic Regression" / "Support Vector Machines".
# The second label previously contained a mis-decoded character sequence.
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Lojistik Regresyon (auc = %0.5f)' % auc_logistic)
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='Destek Vektör Makineleri (auc = %0.5f)' % auc_svm)


plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')

plt.legend()

plt.show()