## Heart Attack Analysis and Prediction

## About Dataset

age - Age of the patient

sex - Sex of the patient (1 = male; 0 = female)

cp - Chest pain type ~ 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic

trtbps - Resting blood pressure (in mm Hg)

chol - Cholesterol in mg/dl fetched via BMI sensor

fbs - (fasting blood sugar > 120 mg/dl) ~ 1 = True, 0 = False

restecg - Resting electrocardiographic results ~ 0 = Normal, 1 = ST-T wave abnormality, 2 = Left ventricular hypertrophy

thalachh - Maximum heart rate achieved

oldpeak - Previous peak

slp - Slope

caa - Number of major vessels

thall - Thallium Stress Test result ~ (0,3)

exng - Exercise induced angina ~ 1 = Yes, 0 = No

output - 0= less chance of heart attack 1= more chance of heart attack

## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Importing Dataset

In [None]:
# Read the heart-attack CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# (rows, columns) before the duplicate rows are dropped.
df.shape

In [None]:
# Remove repeated rows in place; the surviving rows keep their original
# index labels because the index is not reset here.
df.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) after the duplicate rows were dropped.
df.shape

## Descriptive Statistics of Numerical Data

In [None]:
# Names of the continuous-valued columns; the numerical plots further down
# iterate over this list.
numerical_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
# Summary statistics (count, mean, std, min, quartiles, max) for those columns.
df[numerical_cols].describe()

## Exploratory Data Analysis

### Univariate Analysis

#### Categorical Data

sex, cp, fbs, restecg, exng, slp, caa, thall, output

In [None]:
# One count plot per categorical feature on a 3x3 grid. Only eight features
# exist, so the ninth panel is removed before the figure is shown.
fig, axes = plt.subplots(3, 3, figsize=(15, 14))

categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
count_palette = ['#a6e3e9', '#ffc7c7', '#bbded6', '#8785a2', '#f9ed69']

# axes.flat walks the grid row by row, so zip() hands each feature the next
# free panel and stops when the feature list is exhausted.
for ax, feature in zip(axes.flat, categorical_cols):
    sns.countplot(
        x=feature,
        data=df,
        palette=count_palette,
        ax=ax
    )
    ax.set_xlabel('')
    ax.set_title(feature)

fig.suptitle('Count Plot Categorical Data', size=15)
axes[2, 2].remove()
plt.show()

In [None]:
# Bar chart showing how many rows fall into each target class.
fig, axes = plt.subplots(figsize=(9, 7))

target_palette = ['#a6e3e9', '#ffc7c7']
target_tick_labels = [
    '0\nless chance of heart attack',
    '1\nmore chance of heart attack',
]

sns.countplot(x='output', data=df, palette=target_palette, ax=axes)

# Titles and axis decoration.
fig.suptitle('Count of Target Classes', size=15)
axes.set_title('output')
axes.set_xlabel('')
axes.set_ylabel('Counts')
axes.set_xticklabels(target_tick_labels)
plt.show()

#### Numerical Data

age, trtbps, chol, thalachh, oldpeak

In [None]:
# Histogram plus box plot for every numerical feature on a 4x3 grid.
# Feature k lives in column k % 3; its histogram goes in row 2 * (k // 3)
# and its box plot in the row directly underneath.
fig, axes = plt.subplots(4, 3, figsize=(20, 16))

# Histograms (rows 0 and 2).
for k, feature in enumerate(numerical_cols):
    pair, col = divmod(k, 3)
    ax = axes[2 * pair, col]
    sns.histplot(
        x=np.array(df[feature]),
        color='#ffc7c7',
        ax=ax
    )
    ax.set_title(feature)

# Box plots (rows 1 and 3).
for k, feature in enumerate(numerical_cols):
    pair, col = divmod(k, 3)
    ax = axes[2 * pair + 1, col]
    sns.boxplot(
        x=np.array(df[feature]),
        palette=['#a6e3e9'],
        width=0.5,
        ax=ax
    )
    ax.set_title(feature)

fig.suptitle('Histogram and Box Plot Numerical Data', size=15)
# Five features fill only two panels of the lower row pair; drop the rest.
axes[2, 2].remove()
axes[3, 2].remove()
plt.show()

### Bivariate Analysis

#### Categorical Data According to Target Classes

sex, cp, fbs, restecg, exng, slp, caa, thall, output

In [None]:
# Each categorical feature split by target class, on a 3x3 grid.
# Binary features are drawn as grouped count plots, the others as filled
# density curves; the ninth panel is unused and removed at the end.
fig, axes = plt.subplots(3, 3, figsize=(20, 17))

# (feature, plot kind, x tick labels or None), listed in panel order.
panel_specs = [
    ('sex', 'count', ['0\nFemale', '1\nMale']),
    ('cp', 'kde', None),
    ('fbs', 'count', ['0', '1\nfasting blood sugar > 120 mg/dl']),
    ('restecg', 'kde', None),
    ('exng', 'count', ['0', '1\nexercise induced angina']),
    ('slp', 'kde', None),
    ('caa', 'kde', None),
    ('thall', 'kde', None),
]

for ax, (feature, kind, tick_labels) in zip(axes.flat, panel_specs):
    if kind == 'count':
        sns.countplot(
            x=feature,
            hue='output',
            data=df,
            palette=['#a6e3e9', '#b83b5e'],
            ax=ax
        )
    else:
        sns.kdeplot(
            x=feature,
            hue='output',
            data=df,
            palette=['#08d9d6', '#b83b5e'],
            fill=True,
            ax=ax
        )
    ax.set_xlabel('')
    # The title is the column name itself, which also fixes the former
    # 'exngg' misspelling on the exng panel.
    ax.set_title(feature)
    if tick_labels is not None:
        ax.set_xticklabels(tick_labels)

fig.suptitle('Visualization of Categorical Data According Target Classes', size=15)
axes[2, 2].remove()
plt.show()

In [None]:
# Percentage of rows with output == 1 inside each sex group.
# groupby keeps every rate attached to its own sex code and returns the
# groups in sorted key order, so the bars no longer depend on which sex
# happens to appear first in the file, and no positional Series indexing
# (deprecated in recent pandas) is needed.
percentage = df['output'].eq(1).groupby(df['sex']).mean() * 100

fig, axes = plt.subplots(figsize=(9, 7))

sns.barplot(
    x=percentage.index.to_numpy(),
    y=percentage.to_numpy(),
    palette=['#f5c0c0', '#b0efeb'],
    ax=axes
)

fig.suptitle('Percentage More Chance Heart Attack (Target Class 1) of Sex Features', size=15)
axes.set_title('sex')
axes.set_ylabel('Percentage')
# Tick labels follow the sorted sex codes: 0 first, then 1.
axes.set_xticklabels(['0\nFemale', '1\nMale'])
plt.show()

#### Numerical Data According to Target Classes

age, trtbps, chol, thalachh, oldpeak

In [None]:
# Density curve plus horizontal box plot, split by target class, for every
# numerical feature on a 4x3 grid. Feature k lives in column k % 3; its
# density plot goes in row 2 * (k // 3) and its box plot directly underneath.
fig, axes = plt.subplots(4, 3, figsize=(20, 17))

# Density plots (rows 0 and 2).
for k, feature in enumerate(numerical_cols):
    pair, col = divmod(k, 3)
    ax = axes[2 * pair, col]
    sns.kdeplot(
        x=feature,
        hue='output',
        data=df,
        palette=['#08d9d6', '#b83b5e'],
        fill=True,
        ax=ax
    )
    ax.set_xlabel('')
    ax.set_title(feature)

# Box plots (rows 1 and 3).
for k, feature in enumerate(numerical_cols):
    pair, col = divmod(k, 3)
    ax = axes[2 * pair + 1, col]
    sns.boxplot(
        x=feature,
        y='output',
        data=df,
        orient='h',
        palette=['#a6e3e9', '#b83b5e'],
        width=0.5,
        ax=ax
    )
    ax.set_xlabel('')
    ax.set_title(feature)

fig.suptitle('Visualization of Numerical Data According Target Classes', size=15)
# Five features fill only two panels of the lower row pair; drop the rest.
axes[2, 2].remove()
axes[3, 2].remove()
plt.show()

### Multivariate Analysis

#### Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, axes = plt.subplots(figsize=(11, 9))

correlations = df.corr()
sns.heatmap(data=correlations, cmap='coolwarm', annot=True, ax=axes)

plt.show()

### Conclusion

1. Univariate Analysis
    - The sex feature has more male (1) observation data than female (0) observation data
    - The target classes are not imbalanced
    - Every numerical feature has outliers except the age feature
    - The distribution of the age feature almost has a normal distribution
2. Bivariate Analysis
    - sex - Females have a greater percentage of "more chance of heart attack" than males
    - cp - Chest pain type 2 (non-anginal pain) has greatest percentage of more chance heart attack
    - restecg - Resting electrocardiographic results 1 (ST-T wave abnormality) has greatest percentage of more chance heart attack
    - exng - Exercise induced angina 0 (no) has a greater percentage of more chance of heart attack than 1 (yes)
    - slp - Slope 2 has greatest percentage of more chance heart attack
    - caa - Number of major vessels 0 has greatest percentage of more chance heart attack
    - thall - Thallium Stress Test result 2 has greatest percentage of more chance heart attack
    - age - Patients around age 50 have a greater chance of having a heart attack
    - thalachh - Maximum heart rate achieved
        - the greater the maximum heart rate achieved, the greater the chance of heart attack
    - oldpeak - Previous peak
        - the lower the previous peak, the greater the chance of heart attack
3. Multivariate Analysis
    - According to the heatmap, there are a few features that have a fairly large positive correlation with output, such as cp, thalachh, and slp.
    - According to the heatmap, there are a few features that have a fairly large negative correlation with output, such as exng, oldpeak, and caa.

## Prediction with Model

### Split into Features and Target Variable

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=['output']).values
X

In [None]:
# Target vector: the 'output' column as a NumPy array.
y = df['output'].values
y

### Split into Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Feature Scaling

In [None]:
# Preview the training features before scaling (columns are positional
# because X_train is a plain array).
df_temp = pd.DataFrame(X_train)
df_temp.head()

In [None]:
# Display the numerical feature names (presumably to cross-check the column
# positions chosen for scaling in the next cell).
numerical_cols

In [None]:
# Positions of the numerical features inside the feature matrix X.
# They are looked up by name in the same column layout X was built from
# (all columns except 'output'), replacing the hand-maintained list
# [0, 3, 4, 7, 9] so the indices cannot drift if the column order changes.
feature_names = df.drop(columns=['output']).columns
columns_to_scale = [feature_names.get_loc(name) for name in numerical_cols]

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn mean and standard deviation from the training rows only, then apply
# the same transformation to both splits.
sc.fit(X_train[:, columns_to_scale])
X_train[:, columns_to_scale] = sc.transform(X_train[:, columns_to_scale])
X_test[:, columns_to_scale] = sc.transform(X_test[:, columns_to_scale])

In [None]:
# Preview the training features again, now with the selected columns scaled.
df_temp = pd.DataFrame(X_train)
df_temp.head()

### Training Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library-default hyperparameters and a fixed seed.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

### Evaluate Model

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Score the fitted classifier on the held-out test split.
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
test_accuracy = accuracy_score(y_test, y_pred)
# A format spec inside the f-string replaces the nested str.format call;
# the printed text is unchanged.
print(f'accuracy score: {test_accuracy * 100:.2f} %')

#### Evaluate with k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the classifier on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# Format specs inside the f-strings replace the nested str.format calls;
# the printed text is unchanged.
print(f'10-fold cross validation mean accuracy: {accuracies.mean() * 100:.2f} %')
print(f'standard deviation: {accuracies.std() * 100:.2f} %')

### Hyperparameter Tuning

In [None]:
# Show the classifier's current hyperparameter settings.
classifier.get_params()

In [None]:
# Candidate forest sizes: 10, 20, ..., 150 trees.
n_estimators = np.arange(10, 160, 10)
# Integer seeds 0 through 20.
random_state = np.arange(0, 21)

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 10-fold grid search over forest size, split criterion and
# class weighting.
#
# random_state is deliberately NOT part of the grid: it is a seed, not a
# model hyperparameter. Searching over seeds only selects the luckiest run,
# inflates the reported best score and multiplies the number of fits by the
# number of seeds. The estimator keeps the fixed seed it was created with,
# so the search stays reproducible.
parameters = [
    {
        'n_estimators': n_estimators,
        'criterion': ['gini', 'entropy'],
        'class_weight': ['balanced', 'balanced_subsample', None]
    }
]

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(f'best 10-fold cross validation mean accuracy: {best_accuracy * 100:.2f} %')
print(f'best parameters: {best_parameters}')

In [None]:
# Tabulate every grid-search candidate and show the best-ranked ones first.
df_accuracy = pd.DataFrame(grid_search.cv_results_)
df_accuracy.sort_values(by='rank_test_score').head()