In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras_tuner import RandomSearch, HyperParameters

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the heart-attack CSV into `df`, then show its dimensions and first rows.
df = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
)

print(df.shape)
df.head()

# EDA

## Information about data

- age : Age of the patient


- sex : Sex of the patient


- exng: exercise induced angina (1 = yes; 0 = no)


- caa: number of major vessels (0-3)


- cp : chest pain type

    -Value 1: typical angina
    
    -Value 2: atypical angina
    
    -Value 3: non-anginal pain
    
    -Value 4: asymptomatic


- trtbps : resting blood pressure (in mm Hg)


- chol : cholesterol in mg/dl fetched via BMI sensor


- fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)


- restecg : resting electrocardiographic results

    -Value 0: normal
    
    -Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    
    -Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria


- thalachh : maximum heart rate achieved


 - output : 0= less chance of heart attack 
            1= more chance of heart attack

## Target Variable

In [None]:
# Name of the label column; later cells refer to it through this variable.
target = "output"

In [None]:
# Bar chart of the number of (non-null 'age') rows in each target class.
rows_per_class = df.groupby(target)['age'].count()
rows_per_class.plot.bar()
plt.ylabel('count')
plt.show()

In [None]:
# Percentage of rows in each target class. Each line is labelled with the
# class value so the two shares can be told apart in the output.
class_counts = df[target].value_counts()
for label in (0, 1):
    share = round(class_counts[label] * 100 / len(df), 3)
    print('{} = {}: {}%'.format(target, label, share))

Dataset is balanced

## Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

## Discrete Features

In [None]:
# A non-target column counts as discrete when it has fewer than 20 distinct values.
dis_features = [
    column
    for column in df.columns
    if column != target and df[column].unique().size < 20
]

print(dis_features)

### Distribution

In [None]:
# One bar chart per discrete feature: how many rows fall in each level.
for column in dis_features:
    level_counts = df.groupby(column)[target].count()
    level_counts.plot.bar()
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()

### vs Target Variable

In [None]:
# One bar chart per discrete feature: mean of the target within each level.
for column in dis_features:
    target_rate = df.groupby(column)[target].mean()
    target_rate.plot.bar()
    plt.xlabel(column)
    plt.ylabel(target)
    plt.show()

- For 'slp': the value 0 can be replaced with 1, as both have an almost identical relationship with the target variable.
- For 'caa': the value 4 can be replaced with 0, as both have an almost identical relationship with the target variable.
- For 'thall': the value 0 can be replaced with 1, as both have an almost identical relationship with the target variable.

## Continuous Features

In [None]:
# Every column that is neither discrete nor the target is treated as continuous.
con_features = [
    column
    for column in df.columns
    if not (column in dis_features or column == target)
]

print(con_features)

### Distribution

In [None]:
# Histogram of each continuous feature. The bin width is 2 * IQR / n^(1/3)
# and the bin count is the value range divided by that width.
for column in con_features:
    values = df[column]
    spread = stats.iqr(values, interpolation = 'midpoint')
    bin_width = (2 * spread) / (len(values) ** (1/3))
    n_bins = round((max(values) - min(values)) / bin_width)

    values.hist(bins = n_bins)
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()

### Outliers

In [None]:
# Box plot of each continuous feature to expose outlying values.
for column in con_features:
    df.boxplot(column = column)
    plt.xlabel(column)
    plt.ylabel('value')
    plt.show()

### Transformation

In [None]:
# Preview the effect of capping each continuous feature at median + 3 * std.
# Only a copy is modified here; `df` itself is left untouched by this cell.
for feature in con_features:
    extreme = df[feature].median() + 3 * df[feature].std()

    # Report both the number of rows above the cap and the share of the
    # dataset they represent (the count alone is not a percentage).
    n_extreme = len(df.loc[df[feature] > extreme])
    print('{} values to replace: {} ({}%)'.format(feature, n_extreme, round(n_extreme * 100 / len(df), 3)))

    data = df.copy()
    data[feature] = np.where(data[feature] > extreme, extreme, data[feature])

    # Bin width is 2 * IQR / n^(1/3); bin count is the range divided by it.
    iqr = stats.iqr(data[feature], interpolation = 'midpoint')
    h = (2 * iqr) / (len(data[feature]) ** (1/3))
    bins = round((max(data[feature]) - min(data[feature])) / h)

    data[feature].hist(bins = bins)
    plt.show()

In [None]:
# Preview a square-root transform on the last continuous feature only,
# working on a copy so that `df` is not modified by this cell.
for feature in con_features[-1:]:
    data = df.copy()
    data[feature] = data[feature] ** 0.5

    transformed = data[feature]
    spread = stats.iqr(transformed, interpolation = 'midpoint')
    bin_width = (2 * spread) / (len(transformed) ** (1/3))
    n_bins = round((max(transformed) - min(transformed)) / bin_width)

    transformed.hist(bins = n_bins)
    plt.show()

# Feature Engineering

## Discrete Features

In [None]:
# Merge rare levels into the level they resemble (see the EDA notes above):
# column -> (value to replace, replacement value).
level_merges = {'slp': (0, 1), 'caa': (4, 0), 'thall': (0, 1)}

for column, (old_level, new_level) in level_merges.items():
    df[column] = np.where(df[column] == old_level, new_level, df[column])

## Continuous Features

In [None]:
# Cap every continuous feature at median + 3 * std, overwriting the column in `df`.
# The thresholds are computed from the whole DataFrame, i.e. before the
# train/test split performed later in the notebook.
for column in con_features:
    values = df[column]
    upper_cap = values.median() + 3 * values.std()
    df[column] = np.where(values > upper_cap, upper_cap, values)

In [None]:
# Square-root transform of 'oldpeak', written back into the same column.
# The column is read and overwritten in place, so re-running this cell
# applies the root a second time.
df['oldpeak'] = df['oldpeak'].pow(0.5)

# Feature Selection

In [None]:
# Feature matrix: every column except the target. The target is dropped by
# name (the same way `y` is selected) rather than by assuming it is the last
# column of the DataFrame.
X = df.drop(columns = [target])
y = df[target]

In [None]:
# Annotated heatmap of the pairwise correlations between the features in X.
cor = X.corr()

plt.figure(figsize = (10, 8))
sns.heatmap(data = cor, annot = True, cmap = plt.cm.CMRmap_r)
plt.show()

# Models

## Scaling

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [None]:
# Fit the scaler on the training features only.
scaler = StandardScaler()

scaler.fit(X = X_train)

In [None]:
# Apply the already-fitted scaler to both the training and the test features.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

## Logistic Regression

In [None]:
# Logistic regression with the library's default hyperparameters (no arguments passed).
model = LogisticRegression()

In [None]:
# Train on the scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test set, then print the confusion matrix followed by
# the per-class precision/recall/F1 report (4 decimal places).
y_pred_lr = model.predict(X_test_scaled)

lr_reports = (
    confusion_matrix(y_test, y_pred_lr),
    classification_report(y_test, y_pred_lr, digits = 4),
)
for report in lr_reports:
    print(report)

## KNN

In [None]:
# Choose the number of neighbours by 5-fold cross-validated accuracy on the
# training data. The test set is deliberately not used here: picking k by its
# test-set score would make the test metrics reported below optimistic.
neighbors = np.arange(5, 13)

scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors = n), X_train_scaled, y_train, cv = 5).mean()
    for n in neighbors
]

# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_neighbors = neighbors[int(np.argmax(scores))]
print(best_neighbors)

In [None]:
# Refit KNN with the selected number of neighbours on the scaled training features.
model = KNeighborsClassifier(n_neighbors = best_neighbors)

model.fit(X_train_scaled, y_train)

In [None]:
# Predict once on the scaled test set and report KNN performance.
# (The prediction was previously computed twice in two consecutive cells.)
y_pred_knn = model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn, digits = 4))

## SVM

In [None]:
# Support vector classifier with the library's default hyperparameters (no arguments passed).
model = SVC()

In [None]:
# Train on the scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test set, then print the confusion matrix followed by
# the per-class precision/recall/F1 report (4 decimal places).
y_pred_svm = model.predict(X_test_scaled)

svm_reports = (
    confusion_matrix(y_test, y_pred_svm),
    classification_report(y_test, y_pred_svm, digits = 4),
)
for report in svm_reports:
    print(report)

## Ensemble Models

### Random Forest

In [None]:
# Random forest with default hyperparameters. The seed is fixed (same value as
# the train/test split) so that repeated runs produce the same forest and metrics.
model = RandomForestClassifier(random_state = 0)

In [None]:
# Train on the unscaled training features (X_train, not X_train_scaled).
model.fit(X_train, y_train)

In [None]:
# Predict on the unscaled test set, then print the confusion matrix followed by
# the per-class precision/recall/F1 report (4 decimal places).
y_pred_rf = model.predict(X_test)

rf_reports = (
    confusion_matrix(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf, digits = 4),
)
for report in rf_reports:
    print(report)

### XGBoost

In [None]:
# XGBoost classifier with default hyperparameters apart from use_label_encoder.
# NOTE(review): use_label_encoder is believed to be deprecated in newer xgboost
# releases — confirm against the installed version.
model = XGBClassifier(use_label_encoder = False)

In [None]:
# Train on the unscaled training features (X_train, not X_train_scaled).
model.fit(X_train, y_train)

In [None]:
# Predict on the unscaled test set, then print the confusion matrix followed by
# the per-class precision/recall/F1 report (4 decimal places).
y_pred_xgb = model.predict(X_test)

xgb_reports = (
    confusion_matrix(y_test, y_pred_xgb),
    classification_report(y_test, y_pred_xgb, digits = 4),
)
for report in xgb_reports:
    print(report)

## ANN

In [None]:
# Recreate the train/test split with the same arguments (20% test, seed 0)
# as the split made at the start of the Models section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [None]:
# Fit a fresh scaler on the training features, then scale both partitions with it.
scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
def build_model(hp):
    """Build and compile the tunable feed-forward binary classifier.

    The network has three ReLU Dense layers (with a 0.3 dropout after the
    second one) followed by a single sigmoid output unit. It is compiled with
    Adam, binary cross-entropy loss and the accuracy metric.

    Each hidden layer registers its own unit-count and kernel-initializer
    hyperparameter. The initializer names are distinct per layer
    ('dense_1_kernel', 'dense_2_kernel', 'dense_3_kernel'); reusing a single
    name would tie all three layers to the same sampled value.

    Args:
        hp: hyperparameter container supplied by the tuner; the searchable
            values are declared on it via `hp.Int` / `hp.Choice`.

    Returns:
        The compiled Keras model.
    """
    initializers = ['he_normal', 'he_uniform']

    model = keras.Sequential([
        # NOTE(review): input_dim is hard-coded to 13 and must equal the number
        # of feature columns fed to the network — confirm against X.
        keras.layers.Dense(units = hp.Int('dense_1_units', min_value = 32, max_value = 128, step = 8),
                           kernel_initializer = hp.Choice('dense_1_kernel', values = initializers),
                           activation = 'relu',
                           input_dim = 13),
        keras.layers.Dense(units = hp.Int('dense_2_units', min_value = 32, max_value = 64, step = 8),
                           kernel_initializer = hp.Choice('dense_2_kernel', values = initializers),
                           activation = 'relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(units = hp.Int('dense_3_units', min_value = 32, max_value = 64, step = 8),
                           kernel_initializer = hp.Choice('dense_3_kernel', values = initializers),
                           activation = 'relu'),
        keras.layers.Dense(units = 1, activation = 'sigmoid')
    ])

    model.compile(optimizer = keras.optimizers.Adam(hp.Choice('learning_rate', values = [1e-2, 1e-3])),
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])

    return model

In [None]:
# Random search over the hyperparameters declared in build_model, keeping the
# configuration with the best validation accuracy out of 3 trials.
# - seed: makes the sampled trials repeatable across runs.
# - overwrite: start from scratch instead of silently resuming trials left in
#   'ann_hp_output/heart_disease' by an earlier run of this cell.
tuner_search = RandomSearch(
    build_model,
    objective = 'val_accuracy',
    max_trials = 3,
    seed = 0,
    directory = 'ann_hp_output',
    project_name = 'heart_disease',
    overwrite = True,
)

In [None]:
# Run the search: each trial trains for 5 epochs, with 10% of the training
# data held out as the validation split.
tuner_search.search(X_train_scaled, y_train, epochs = 5, validation_split = 0.1)

In [None]:
# Take the single best model found by the search.
model = tuner_search.get_best_models(num_models = 1)[0]

In [None]:
# Show the layer structure and parameter counts of the selected model.
model.summary()

In [None]:
# Continue training the selected model: epoch numbering starts at 5 (the search
# above used epochs = 5) and stops at 10, again with a 10% validation split.
model.fit(X_train_scaled, y_train, epochs = 10, validation_split = 0.1, initial_epoch = 5)

In [None]:
# Round the sigmoid outputs to 0/1 class labels, then print the confusion
# matrix followed by the per-class precision/recall/F1 report (4 decimal places).
ann_probabilities = model.predict(X_test_scaled)
y_pred_ann = np.round(ann_probabilities).astype(int)

ann_reports = (
    confusion_matrix(y_test, y_pred_ann),
    classification_report(y_test, y_pred_ann, digits = 4),
)
for report in ann_reports:
    print(report)