In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Heart Attack Prediction with Extensive EDA



#### Explaining the dataset

age - Age of the patient

sex - Sex of the patient

cp - Chest pain type ~ 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic

trtbps - Resting blood pressure (in mm Hg)

chol - Cholesterol in mg/dl fetched via BMI sensor

fbs - (fasting blood sugar > 120 mg/dl) ~ 1 = True, 0 = False

restecg - Resting electrocardiographic results ~ 0 = Normal, 1 = ST-T wave abnormality, 2 = Left ventricular hypertrophy

thalachh - Maximum heart rate achieved

oldpeak - Previous peak, i.e. the ST depression induced by exercise relative to rest

slp - Slope

caa - Number of major vessels

thall - Thallium Stress Test result ~ (0,3)

exng - Exercise induced angina ~ 1 = Yes, 0 = No

output - Target variable

### Importing Important Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as ss

#### Setting up some basic parameters for plotting purposes

In [None]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# NOTE(review): this ignores every warning category for the rest of the
# session, so deprecation and convergence warnings will not be shown.
warnings.filterwarnings("ignore")
# Global seaborn defaults used by all figures below: the 'deep' colour palette,
# seaborn's interpretation of matplotlib's single-letter colour codes, and the
# 'dark' axes style.
sns.set_palette('deep')
sns.set_color_codes()
sns.set_style('dark')


### Loading the dataset

In [None]:
# Location of the heart-attack dataset relative to the notebook's working directory.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'

df = pd.read_csv(DATA_PATH)
print('Data is Loaded')

### Data Analysis Part

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Preview the first few rows to sanity-check the load.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

#### Checking Missing values

In [None]:
# Count missing (True) vs. present (False) entries for every column.
# pd.Series.value_counts is used instead of the top-level pd.value_counts,
# which has been deprecated since pandas 2.1; the resulting table is the same.
df.isna().apply(pd.Series.value_counts, axis=0)

#### Categorical and Continuous Variables

In [None]:
# Discrete / coded features (explored below with count plots and Cramer's V).
categorical = [
    'sex', 'cp', 'fbs', 'restecg',
    'exng', 'thall', 'caa', 'slp',
]

# Numeric features (explored below with violin plots and the Kruskal-Wallis test).
continous = [
    'age', 'trtbps', 'chol',
    'thalachh', 'oldpeak',
]

print('Categorical Variables are:', ', '.join(categorical))
print('Continous Variables are:', ', '.join(continous))

### Looking up basic stats of dataset

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

### Exploratory Data Analysis

#### Univariate Analysis :

In [None]:
# One text-only title panel followed by one violin plot per continuous feature,
# laid out on a fixed 3x3 grid.
chart_count = len(continous) + 1

fig = plt.figure(figsize=(20, 17))
axes = [fig.add_subplot(3, 3, position) for position in range(1, chart_count + 1)]
fig.tight_layout(pad=7)
fig.patch.set_facecolor('#eaeaf2')

# The first panel only carries the figure caption: hide its frame and ticks.
title_ax = axes[0]
for side in ("bottom", "left", "top", "right"):
    title_ax.spines[side].set_visible(False)
title_ax.tick_params(left=False, bottom=False)
title_ax.set_xticklabels([])
title_ax.set_yticklabels([])
title_ax.text(0.5, 0.5,
              'Violin plot for the\n continous features\n_________________',
              ha='center', va='center',
              fontsize=20, fontweight='bold', fontfamily='serif')

# Each feature gets its own panel and its own colour from the 'deep' palette.
palette = sns.color_palette('deep')
for var, ax, color in zip(continous, axes[1:], palette):
    ax.grid(axis='y', linestyle=':')
    # Panel heading drawn just above the axes, in axes coordinates.
    ax.text(0.5, 1.05, var.title(),
            ha='center', va='center',
            fontsize=14, fontweight='bold', transform=ax.transAxes)
    sns.violinplot(data=df, y=var, ax=ax, color=color)
    ax.set(xlabel='', ylabel='')

#### Conclusion

* chol, trtbps, and oldpeak have a fair number of outliers, which could affect models that are sensitive to them.

* oldpeak and (to a lesser extent) chol are skewed rather than normally distributed. This could affect models or analyses that assume a normal distribution.

In [None]:
# One text-only title panel followed by one count plot per categorical feature,
# laid out on a fixed 3x3 grid.
chart_count = len(categorical) + 1

fig = plt.figure(figsize=(20, 17))
axes = [fig.add_subplot(3, 3, position) for position in range(1, chart_count + 1)]
fig.tight_layout(pad=7)
fig.patch.set_facecolor('#eaeaf2')

# The first panel only carries the figure caption: hide its frame and ticks.
title_ax = axes[0]
for side in ("bottom", "left", "top", "right"):
    title_ax.spines[side].set_visible(False)
title_ax.tick_params(left=False, bottom=False)
title_ax.set_xticklabels([])
title_ax.set_yticklabels([])
title_ax.text(0.5, 0.5,
              'Count plot for the\n categorical features\n_________________',
              ha='center', va='center',
              fontsize=20, fontweight='bold', fontfamily='serif')

for var, ax in zip(categorical, axes[1:]):
    # Panel heading drawn just above the axes, in axes coordinates.
    ax.text(0.5, 1.05, var.title(),
            ha='center', va='center',
            fontsize=14, fontweight='bold', transform=ax.transAxes)
    sns.countplot(data=df, x=var, ax=ax)
    ax.set(xlabel='', ylabel='')

#### Conclusion

* Restecg, Thall, Caa, and Slp are likely to affect models that are sensitive to the data distribution, because some of their categories have extremely low counts.

* Fbs and Cp could also affect such models, because their category counts are unbalanced as well.



### Bivariate Analysis

#### Corrected Cramer's V for categorical variables

In [None]:
def cramers_corrected_stat(x, y):
    """Bias-corrected Cramer's V for categorical-categorical association.

    Uses the bias correction from Wicher Bergsma, "A bias-correction for
    Cramer's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    x, y : array-like
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab(x, y)``.

    Returns
    -------
    tuple
        ``(V, p)``: the corrected Cramer's V and the p-value of the
        chi-square test of independence, both rounded to 6 decimals.
        ``V`` is reported as 0.0 when the statistic is undefined, i.e. when
        either variable has fewer than two observed categories or when the
        bias-corrected table dimension collapses to zero (a variable with as
        many categories as observations).

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.chi2_contingency`` (for example when
        there are no observations).
    """
    conf_matrix = pd.crosstab(x, y)
    r, k = conf_matrix.shape

    # Continuity correction is requested unless the table has exactly two rows
    # (unchanged from the original logic). Per the SciPy docs, Yates'
    # correction is only ever applied when the test has one degree of freedom.
    chi2, p = ss.chi2_contingency(conf_matrix, correction=(r != 2))[0:2]
    p = round(p, 6)

    # A constant variable carries no information about the other one; the
    # formula below would otherwise evaluate 0/0 and return NaN.
    if min(r, k) < 2:
        return 0.0, p

    n = conf_matrix.to_numpy().sum()
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # The corrected dimension reaches zero when a variable has as many
    # categories as there are observations; avoid dividing by it.
    if denominator <= 0:
        return 0.0, p

    result = np.sqrt(phi2corr / denominator)
    return round(result, 6), p


# Report the association of every categorical feature with the target column.
target = df['output']
for feature in categorical:
    v_stat, p_value = cramers_corrected_stat(df[feature], target)
    print(f'For variable {feature}, Cramer\'s V: {v_stat} and p value: {p_value}')

#### Conclusion

* The stats seem to agree with the conclusions we drew previously from graph along with a measure of the correlation.

* Fbs is not related at all

* Sex, Restecg have very weak correlation

* Cp, Thall seem to have a moderately strong correlation

* Exng, Caa, Slp have decent correlation

#### Kruskal-Wallis H-test

Since the distribution of some variables is non-Gaussian, we use a non-parametric test, specifically the Kruskal-Wallis H-test.

In [None]:
# Compare the distribution of each continuous feature across the target classes.
for var in continous:
    # One sample array per value of 'output'.
    samples = [values.to_numpy() for _, values in df.groupby('output')[var]]
    h_stat, p_value = ss.kruskal(*samples)
    h_stat = round(h_stat, 6)
    p_value = round(p_value, 6)
    print(f'For variable {var}, Kruskal-Wallis H-test: {h_stat} and p value: {p_value}')

#### Conclusion

* Surprisingly, all variables are related to the output, although the p-values for chol and trtbps come very close to our alpha (which is 0.05).

#### Pair Plot

In [None]:
# Pairwise plots of the columns of df, coloured by the target class; the
# trailing semicolon suppresses the text repr of the returned grid object.
sns.pairplot(df, hue='output');

### Conclusion 

Here's the conclusion from the entire EDA:

#### Feature Insights

* chol, trtbps, and oldpeak have decent amount of outliers. This could affect certain models sensitive to them.

* oldpeak and (to a lesser extent) chol are skewed rather than normally distributed. This could affect models or analyses that assume a normal distribution.

* Restecg, Thall, Caa, Slp are likely to impact few models sensitive to data distribution because value counts for some values is extremely low.

* Fbs and Cp could possibly affect models sensitive to data distribution because their value counts is also not ideal.

* Relation to target variable

* All categorical variables except Fbs are related to output, albeit to varying degrees. Especially, Restecg and sex have very weak relation.

* All continuous variables are related to output

* Multi-collinearity

* Variables do not have strong correlation and are weakly correlated

### Model Creation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


### Performing Feature Engineering

We build our X from the variables we determined to have an impact, standardize the continuous ones, and one-hot encode the categorical ones.

#### Performing feature separation and Scaling and Encoding of features

In [None]:
# Features kept after the EDA ('fbs' showed no relation to the target).
feature_columns = ['sex', 'restecg', 'cp', 'exng', 'thall', 'caa', 'slp',
                   'age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# Work on an explicit copy so later column assignments never target a slice
# of df (which would raise SettingWithCopyWarning and may not stick).
X = df[feature_columns].copy()
y = df['output']

# One-hot encode every categorical feature that is part of X.
encode_columns = [column for column in categorical if column != 'fbs']
X = pd.get_dummies(X, columns=encode_columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=65)
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaler on the training split only, then apply the same transform to
# the test split, so that no test-set statistics leak into training.
scaler = StandardScaler()
X_train[continous] = scaler.fit_transform(X_train[continous])
X_test[continous] = scaler.transform(X_test[continous])

print('Done Pre-processing')
print('Final No. of features: ', X.shape[1])

### Training Various Models

In [None]:
# Seed for every estimator with a random component, so a fresh
# "Restart & Run All" reproduces the same scores (same value as the
# train/test split's random_state).
RANDOM_STATE = 65

models = {
          'SVM': SVC(),
          'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
          'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
          'Logistic Regression': LogisticRegression(),
          'K-Nearest Neighbors': KNeighborsClassifier(),
          'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
          'AdaBoost Classifier': AdaBoostClassifier(learning_rate=0.15, n_estimators=25,
                                                    random_state=RANDOM_STATE),
         }

# Test-set scores per model name; reused by the stacking and plotting cells.
accuracy_dict, precision_dict, recall_dict, f1_dict = dict(), dict(), dict(), dict()

for name, model in models.items():
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print('---------------------------------------------------\n',
          name,
          '\n---------------------------------------------------')

    acc = accuracy_score(y_test, y_hat)
    # The fourth element (support) is not needed for the report.
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_hat, average='binary')
    acc, precision, recall, f1 = round(acc, 5), round(precision, 5), round(recall, 5), round(f1, 5)

    accuracy_dict[name] = acc
    precision_dict[name] = precision
    recall_dict[name] = recall
    f1_dict[name] = f1

    print(f'Accuracy: {acc}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}')

    # Rows are the true classes and columns the predicted classes; fmt='d'
    # keeps the counts as plain integers in the annotations.
    cm = confusion_matrix(y_test, y_hat)
    df_cm = pd.DataFrame(cm)
    fig, ax = plt.subplots()
    sns.heatmap(df_cm, annot=True, fmt='d', cmap='Blues', linewidths=2, ax=ax)
    ax.set_title(f'Confusion Matrix for {name}', fontsize=15)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    plt.show()

### Stacking

In [None]:
# Level-0 learners are the models trained above; a logistic regression
# combines their predictions into the final decision.
level0 = list(models.items())
level1 = LogisticRegression()
stacked = StackingClassifier(estimators=level0, final_estimator=level1, n_jobs=-1)
y_hat = stacked.fit(X_train, y_train).predict(X_test)

name = 'Stacked Classifier'
print('---------------------------------------------------\n',
      name,
      '\n---------------------------------------------------')

# Same test-set metrics as for the individual models, rounded to 5 decimals.
acc = round(accuracy_score(y_test, y_hat), 5)
precision, recall, f1 = (
    round(score, 5)
    for score in precision_recall_fscore_support(y_test, y_hat, average='binary')[:3]
)

accuracy_dict[name] = acc
precision_dict[name] = precision
recall_dict[name] = recall
f1_dict[name] = f1

print(f'Accuracy: {acc}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}')

# Confusion matrix of the stacked model on the test split.
df_cm = pd.DataFrame(confusion_matrix(y_test, y_hat))
sns.heatmap(df_cm, annot=True, cmap='Blues', linewidths=2)
plt.title(f'Confusion Matrix for {name}', fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

### Plotting Scores

In [None]:
# Metric name -> {model name: score}. The keys carry no "Score" suffix so the
# title below does not render as "Plot of F1 Score Score".
scores_dicts = {
                'Accuracy': accuracy_dict,
                'Precision': precision_dict,
                'Recall': recall_dict,
                'F1': f1_dict,
              }

# One horizontal bar chart per metric, with one bar per model. The loop
# variables are named so they do not overwrite the global `name`.
for metric, scores_dict in scores_dicts.items():
    model_names = list(scores_dict.keys())
    values = list(scores_dict.values())
    fig, ax = plt.subplots(figsize=(9, 10))
    sns.barplot(x=values, y=model_names, ax=ax)
    ax.set_xlabel(f'{metric} Score')
    ax.set_title(f'Plot of {metric} Score')
plt.show()

### Thanks for reading up to this far.

### If you liked the notebook, please consider upvoting.