In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walked recursively), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## **Exploratory Data Analysis - Story-Telling**

### **Basic Information and Summary Statistics**

In [None]:
# Load the heart-attack dataset into a DataFrame. The path is relative to the
# notebook's working directory (Kaggle-style ../input layout).
df = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
df.info() # Column dtypes and non-null counts; the author's run showed zero null values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

In [None]:
df['output'].unique() # Validate that there is no noisy data in the output column (only the labels 0 and 1 are expected)

In [None]:
# Class balance of the target column, reported as percentages.
# The per-class counts and the non-null total are computed once and reused.
output_counts = df['output'].value_counts()
output_total = df['output'].count()
print(f"Percentage of people with more chance of heart-attack (1): {output_counts[1]/output_total*100}%")
# This line reports class 0 (less chance), so its label must read (0), not (1).
print(f"Percentage of people with less chance of heart-attack (0): {output_counts[0]/output_total*100}%")

In [None]:
# Distinct-value count per column: columns with only a handful of distinct
# values are candidates for being encoded categoricals rather than true numerics.
unique_counts = df.nunique()
unique_counts.plot()
plt.show()
print(unique_counts)

### **As observed from the distributions above, the following variables have a proper numerical distribution and are not merely numerically encoded categorical variables**:

* **age**: Age of the patient
* **trtbps**: resting blood pressure (in mm Hg)
* **chol**: cholesterol in mg/dl fetched via BMI sensor
* **thalachh**: maximum heart rate achieved
* **oldpeak**: Previous peak (ST depression induced by exercise relative to rest)

In [None]:
# KDE of every continuous feature, split by the target class, on a 3x2 grid.
# The feature names are laid out in the same shape as the grid; the last row
# holds a single feature, so the bottom-right panel stays empty.
df_numeric_cols = [['age', 'trtbps'],
                   ['chol', 'thalachh'],
                   ['oldpeak']]
row_num = 3
col_num = 2
fig, axes = plt.subplots(row_num, col_num, figsize=(15,15))

# Per-class summary statistics for all columns; the slice for each feature is
# printed underneath its panel as part of the x-axis label.
stats_by_output = df.groupby(['output']).describe()

for grid_row, feature_row in enumerate(df_numeric_cols):
    for grid_col, feature in enumerate(feature_row):
        panel = axes[grid_row, grid_col]
        sns.kdeplot(data=df,
                    x=feature,
                    ax=panel,
                    hue='output',
                    bw_adjust=.30)
        panel.set_xlabel(f"{feature.title()}\n\n{stats_by_output[feature]}")

# Hide the unused bottom-right panel.
axes[row_num-1, col_num-1].set_axis_off()
plt.subplots_adjust(wspace=0.25, hspace=0.6)

### **Correlation between numerical features and 'output' variable with two discrete values - 0,1**:
**Note**: Since one variable is continuous and the other is categorical, we use Point-Biserial Correlation

In [None]:
import scipy
import scipy.stats  # explicit: a bare `import scipy` does not guarantee the `stats` submodule is loaded

# Point-biserial correlation between each continuous feature and the binary
# target. The rows are collected first and the frame is built once, so the
# 'Feature' column is created as text from the start instead of writing
# strings into a float column that was pre-filled with zeros.
corr_records = []
for col in ['age','trtbps','chol','thalachh','oldpeak']:
    # pointbiserialr returns (correlation, p-value); only the correlation is tabulated.
    corr, p_value = scipy.stats.pointbiserialr(df[col], df['output'])
    corr_records.append({'Feature': col, 'Point-Biserial Correlation': corr})
corr_values = pd.DataFrame(corr_records, columns = ['Feature', 'Point-Biserial Correlation'])

In [None]:
# Scatter of every continuous feature against the binary target on a 3x2 grid.
# The feature names mirror the grid layout; the last row has one feature only,
# which leaves the bottom-right panel unused.
df_numeric_cols = [['age', 'trtbps'],
                   ['chol', 'thalachh'],
                   ['oldpeak']]
row_num = 3
col_num = 2
fig, axes = plt.subplots(row_num, col_num, figsize=(15,12))

for grid_row, feature_row in enumerate(df_numeric_cols):
    for grid_col, feature in enumerate(feature_row):
        panel = axes[grid_row, grid_col]
        sns.scatterplot(data=df,
                        x=feature,
                        y='output',
                        ax=panel,
                        hue='output')
        panel.set_xlabel(f"{feature.title()}")

# Hide the unused bottom-right panel.
axes[row_num-1, col_num-1].set_axis_off()

In [None]:
# Render the correlation table inline with a caption. The caption call returns
# the Styler, which is the cell's displayed value.
corr_table = corr_values.style.set_table_attributes("style='display:inline'")
corr_table.set_caption('Point-Biserial Correlation b/w Feature and Output')

### **As observed**: 

* **Slight Negative Correlation**: We observe that **trtbps** is **slightly low** when **output** is 1 (more chance of heart attack)
* **Moderate Negative Correlation**: We observe that **age** is **moderately low** when **output** is 1 (more chance of heart attack)

* **Moderately High Negative Correlation**: We observe that **oldpeak** is **moderately low** when **output** is 1 (more chance of heart attack)

* **Moderately Positive Correlation**: We observe that **thalachh** is **moderately high** when **output** is 1 (more chance of heart attack)

In [None]:
# Count plots of every categorical feature, split by the target class, on a
# 3x3 grid. The grid has nine cells but only eight features are plotted: the
# final entry ('output') only fills the last slot of the layout and is never
# drawn, because the bottom-right panel is switched off instead.
df_cat_cols = [['sex', 'cp', 'fbs'],
                ['restecg', 'exng', 'slp'],
                ['caa', 'thall', 'output']]
row_num = 3
col_num = 3
fig, axes = plt.subplots(row_num, col_num, figsize=(20,15))

blank_cell = (row_num-1, col_num-1)
# np.ndindex walks the grid in row-major order, so the blank cell comes last.
for cell in np.ndindex(row_num, col_num):
    panel = axes[cell]
    if cell == blank_cell:
        panel.set_axis_off()
        continue
    feature = df_cat_cols[cell[0]][cell[1]]
    sns.countplot(data=df,
                  x=feature,
                  ax=panel,
                  hue='output')
    panel.set_xlabel(f"{feature.title()}")
plt.subplots_adjust(hspace=0.25)

### **As observed**:
* **Zero 'caa' (number of major vessels (0-3))**: A high chance of heart attack (1) is found mostly in the group with zero major vessels
* **'thall' (Thal rate)**: People having thal rate as 2 have high chance of heart attack
* **'cp' (Chest Pain type)**: People having chest pain type 0 (no chest pain) have low chance of heart attack; chest pain type 2 (atypical angina) has high chance of heart attack

## **Predictive Modeling**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate predictors from the target, then hold out a test set. No test_size
# is given, so scikit-learn's default split proportion applies; the seed makes
# the split repeatable.
features = df.drop(columns='output')
target = df['output']
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1)

In [None]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import auc

### **Pre-processing Numerical and Categorical Data**

In [None]:
# Continuous features (the ones identified in the EDA as properly numeric).
numeric_cols = ['age', 'trtbps','chol', 'thalachh','oldpeak']
# Integer-coded categorical features (every remaining predictor column).
cat_cols = ['sex', 'cp', 'fbs','restecg', 'exng', 'slp','caa', 'thall']

In [None]:
numeric_features = numeric_cols
cat_features = cat_cols

# One transformer per column group: numeric columns are scaled (with_mean=False,
# i.e. no centering), categorical columns are one-hot encoded with unseen
# categories ignored at transform time.
scaling_step = ('num', StandardScaler(with_mean=False), numeric_features)
encoding_step = ('cat', OneHotEncoder(handle_unknown = 'ignore'), cat_features)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_scale = preprocessor.fit_transform(X_train)
X_test_scale = preprocessor.transform(X_test)

### **Stacking Models**

In [None]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# The gradient-boosting and MLP estimators are stochastic, so both are seeded
# (same seed as the train/test split) to make the reported metrics repeatable
# from run to run.
models = [('clf_Grad',GradientBoostingClassifier(learning_rate = 0.005,
                                                 n_estimators = 30,
                                                 random_state = 1)),
          ('clf_DN1',MLPClassifier(early_stopping  = True,
                                   random_state = 1)),
          ('clf_SVC',SVC())]

In [None]:
# Stack the base learners listed in `models`; cv=10 sets the number of
# cross-validation folds and n_jobs=-1 lets scikit-learn use all available cores.
stacking=StackingClassifier(estimators=models,
                            cv=10,
                            n_jobs=-1)

In [None]:
# Train the stacked ensemble on the preprocessed training split.
stacking.fit(X_train_scale,y_train)

### **Model Evaluation: Accuracy and AUC**

In [None]:
# Mean accuracy of the stacked model on the held-out test split, shown as a percentage.
test_accuracy = stacking.score(X_test_scale,y_test)
print(f"Accuracy of the model on Test Data: {test_accuracy*100}%")

In [None]:
# Hard class labels on the test split (kept available for inspection).
preds = stacking.predict(X_test_scale)
# A ROC curve needs continuous scores so that the decision threshold can be
# swept; feeding it 0/1 labels collapses the curve to a single operating point
# and misreports the AUC. Use the predicted probability of class 1 instead.
pred_scores = stacking.predict_proba(X_test_scale)[:, 1]
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, pred_scores)

In [None]:
# Area under the ROC curve built from the test-split fpr/tpr, shown as a percentage.
test_auc = auc(fpr,tpr)
print(f"AUC metric of the model on Test Data: {test_auc*100}%")