In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Plot styling: white-grid background with seaborn's short color codes enabled.
# (A preceding sns.set(style="white") call was dropped: it was immediately
# overridden by this one and had no effect on the final settings.)
sns.set(style="whitegrid", color_codes=True)

# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Read the heart-attack dataset from the Kaggle input bundle and preview the first rows.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)
df.head(5)

## Correlation Heatmap

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, linewidths=1, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
heatmap_ax.set_xlabel('Column')
heatmap_ax.set_ylabel('Column')

## Data Visualization

In [None]:
# Chest-pain type (cp) vs. outcome: per-type share of each outcome (left panel)
# and raw patient counts (right panel).
fig, axs = plt.subplots(1, 2, figsize=(14, 4))

# Build the contingency table once and reuse it for both panels.
table = pd.crosstab(df.cp, df.output)
# Row-normalise so that the stacked bars of each cp type sum to 1.
proportions = table.div(table.sum(axis=1).astype(float), axis=0)

proportions.plot(ax=axs[0], kind='bar', stacked=True)
table.plot(ax=axs[1], kind='bar')

# Each panel gets labels that match what it actually shows: the left panel is
# a proportion, the right panel is a count.
axs[0].set_ylabel('Proportion of patients')
axs[0].set_title('Proportion of Patients for different Cp types')
axs[1].set_ylabel('Number of patients')
axs[1].set_title('Number of Patients for different Cp types')

for ax in axs.flat:
    ax.set_xlabel('Cp')

From the figures above, we can say that for cp types 1 and 2, the probability of y (output) being 1 is high.

In [None]:
# Share of each outcome per number of major vessels (caa).
table = pd.crosstab(df.caa, df.output)
# Row-normalise so that the stacked bars of each caa value sum to 1.
caa_ax = table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
caa_ax.set_title('Proportion of number of patients for different number of major vessels')
caa_ax.set_xlabel('number of major vessels')
# The bars are proportions, not counts, so the axis is labelled accordingly.
caa_ax.set_ylabel('Proportion of Patients')

Similarly, for 0 and 4 major vessels, the probability of y (output) being 1 is high. Hence the correlation of caa with output is high.

In [None]:
# Side-by-side histograms of maximum heart rate achieved (thalachh), one series
# per outcome class.
output = df.output.unique()
thalachh_by_output = [df.loc[df.output == x, 'thalachh'] for x in output]

hist_fig, hist_ax = plt.subplots()
# Spell out which outcome each series represents instead of a bare 0/1 legend.
hist_ax.hist(thalachh_by_output, label=[f'output = {x}' for x in output])
hist_ax.set_title('Maximum heart rate achieved by output class')
hist_ax.set_xlabel('thalachh')
hist_ax.set_ylabel('Number of Patients')
hist_ax.legend()

We can observe that when the value of thalachh (maximum heart rate achieved) is around 160, the value of y (output) is most often 1.

In [None]:
# Patient counts per outcome, grouped by the exercise-induced angina flag (exng).
exng_counts = pd.crosstab(df['exng'], df['output'])
exng_ax = exng_counts.plot.bar()
exng_ax.set_title('Patients with exercise induced angina & Possibility of heart attack')
exng_ax.set_xlabel('exng')
exng_ax.set_ylabel('Number of Patients')

As expected, patients with exercise-induced angina are less likely to suffer a heart attack.

In [None]:
# Patient counts per outcome, grouped by thall type.
thall_counts = pd.crosstab(df['thall'], df['output'])
thall_ax = thall_counts.plot.bar()
thall_ax.set_title('Patients with different type of thall')
thall_ax.set_xlabel('thall')
thall_ax.set_ylabel('Number of Patients')

When thall type is 2, the chances of getting heart attack are much higher.

In [None]:
# Patient counts per outcome, grouped by the fasting-blood-sugar flag (fbs).
fbs_counts = pd.crosstab(df['fbs'], df['output'])
fbs_ax = fbs_counts.plot.bar()
fbs_ax.set_title('Patients with fasting blood sugar > 120')
fbs_ax.set_xlabel('fbs')
fbs_ax.set_ylabel('Number of Patients')

As we can see, the number of patients who are likely to get a heart attack is almost the same as the number of patients who are less likely to get one, regardless of their blood sugar level. So this feature is unlikely to be helpful in the classification model.

## Feature Engineering

### Correlation coefficient magnitude with target variable (output)
1. Greater than 0.35 - cp, thalachh, exng, oldpeak, slp, caa
2. Greater than 0.25 - thall, sex
3. Less than 0.25 and greater than 0.1 - age, trtbps, restecg
4. Almost equal to 0 - chol, fbs  

**Note**: We can see that slp and oldpeak have a negative correlation of 0.58 with each other, which is significant. So only one of them is included in x_train. As the correlation of oldpeak with output is greater than that of slp, we are dropping slp from x.

## Importance of Feature Engineering  
Let's see the accuracy of the logistic regression model without doing any feature engineering

In [None]:
# Baseline: every column except the target is used as a feature.
y = df['output']
x = df.drop(columns=['output'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(solver='liblinear').fit(x_train, y_train)

y_test_pred = logreg.predict(x_test)
print(classification_report(y_test, y_test_pred))

Now let's try the same thing with dropping the features with correlation less than 0.25

In [None]:
# Reduced feature set: drop the target, slp, and the weakly correlated columns.
dropped_columns = ['output', 'slp', 'chol', 'fbs', 'trtbps', 'restecg', 'age']
y = df['output']
x = df.drop(columns=dropped_columns)

# Same split parameters as the baseline so the two runs are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(solver='liblinear').fit(x_train, y_train)

y_test_pred = logreg.predict(x_test)
print(classification_report(y_test, y_test_pred))

We can see that accuracy increased from 0.87 to 0.89 when the columns are dropped. This shows us the importance of feature engineering. It's always important to make sure that the features are independent of each other and have maximum correlation with the target variable only.

## Classification Models

In [None]:
# Candidate classifiers compared on the same train/test split.
logreg = LogisticRegression(solver="liblinear")
gnb = GaussianNB()
knn = KNeighborsClassifier()
dec_tree = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42, verbose=0)
# Seeded like the other tree-based models so that the comparison gives the
# same numbers on every Restart & Run All.
gb = GradientBoostingClassifier(random_state=42, verbose=0)

models = [logreg, gnb, knn, dec_tree, rf, gb]

In [None]:
# Train each candidate on the training split and report its test-set metrics.
for model in models:
    name = type(model).__name__
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    y_pred = model.fit(x_train, y_train).predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    loss = mean_squared_error(y_test, y_pred)
    print("Model    -", name)
    print("Accuracy -", accuracy)
    print("Loss     -", loss)
    print()

## Maximum Accuracy is for Logistic Regression - 0.885