# Heart Attack Analysis & Prediction Dataset
## A dataset for heart attack classification

### About this dataset
* age : Age of the patient
* sex : Sex of the patient (1 = male; 0 = female)
* exng : exercise induced angina (1 = yes; 0 = no)
* caa : number of major vessels (0-3)
* cp : chest pain type
    * Value 0: typical angina
    * Value 1: atypical angina
    * Value 2: non-anginal pain
    * Value 3: asymptomatic

* trtbps : resting blood pressure (in mm Hg)
* chol : cholesterol in mg/dl fetched via BMI sensor
* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* restecg : resting electrocardiographic results
    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

* thalachh : maximum heart rate achieved
* output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
try:
    # Legacy helper: deprecated in scikit-learn 1.0 and removed in 1.2
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

# Load Data


In [None]:
# Load the raw heart-attack dataset from the Kaggle input directory
data = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

# Work on a copy so the loaded `data` frame stays untouched
df = data.copy()

# Data Exploration

In [None]:

# Preview the first rows of the working copy
df.head()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# List the column names
df.columns

In [None]:
# Class balance of the target column
df.output.value_counts()

## Heart Disease frequency according to Sex

In [None]:
# Contingency table: rows are sex, columns are output
pd.crosstab(df.sex, df.output)

In [None]:
# Bars are grouped by sex on the x-axis (crosstab rows) and coloured by
# `output` (crosstab columns), so the legend names the outcome classes and
# the x-label explains the sex coding.
pd.crosstab(df.sex, df.output).plot(kind='bar', figsize=(10, 6), color=['lightblue', 'salmon'])

plt.title('Heart Disease frequency for Sex')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.legend(['Not Disease', 'Disease'])
plt.xlabel('0 = Female, 1 = Male')

## Age vs Max Heart Rate for Heart Disease

In [None]:
# Scatter of age against maximum heart rate, coloured by the target class
plt.figure(figsize=(10, 7))
ax = sns.scatterplot(x=df.age, y=df.thalachh, hue=df.output, alpha=0.5)
ax.set_title('Heart Disease Age vs Max Heart Rate')
ax.set_ylabel('Max Heart Rate')
ax.set_xlabel('Age')
ax.legend(['Not disease', 'Disease'])

## Heart Disease Frequency per Chest Pain Type

* cp : chest pain type
    * Value 0: typical angina
    * Value 1: atypical angina
    * Value 2: non-anginal pain
    * Value 3: asymptomatic

In [None]:
# Bars are grouped by chest pain type and coloured by `output`
pd.crosstab(df.cp, df.output).plot(kind='bar', figsize=(10,8), color=['lightblue', 'salmon'])
plt.ylabel('Frequency')
plt.xlabel('Chest Pain Type')
plt.title('Heart Disease Chest Pain Frequency')
# Legend label fixed: it previously read 'Note Disease'
plt.legend(['Not Disease', 'Disease'])
plt.xticks(rotation=0)

In [None]:
# Summary statistics of the columns
df.describe()

In [None]:
# Pairwise correlation between columns
df.corr()

In [None]:
# Annotated heatmap of the pairwise column correlations (two decimals per cell)
plt.figure(figsize=(15, 10))
sns.heatmap(data=df.corr(), annot=True, fmt='.2f', cmap='YlOrRd', linewidths=0.5)

In [None]:
# Preview the first rows again before modelling
df.head()

# Modelling

In [None]:
# Separate the target column from the predictor columns
y = df.output

x = df.drop('output', axis=1)

In [None]:
# Display the feature frame (every column except `output`)
x

In [None]:
# Display the target series
y

In [None]:
# Hold out 20% of the rows as a test split; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Check the sizes of the four split pieces
x_train.shape, x_test.shape, y_train.shape, y_test.shape

## Model selection

In [None]:
# Candidate classifiers, each with default hyperparameters, keyed by display name
model_selection = {'Logistic Regression': LogisticRegression(),
                   'K Neighbors Classifier': KNeighborsClassifier(),
                   'Random Forest Classifier': RandomForestClassifier()}

def fit_and_score(x_train, x_test, y_train, y_test, models=None):
    """Fit every candidate model on the training split and score it on the test split.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to each model's ``fit``.
    x_test, y_test
        Evaluation features and labels, passed to each model's ``score``.
    models : dict, optional
        Mapping of display name to estimator. When omitted, the notebook-level
        ``model_selection`` dict is used, as before.

    Returns
    -------
    dict
        Maps each model name to the value of ``model.score(x_test, y_test)``.
        The estimators are fitted in place.
    """
    if models is None:
        # Backward-compatible default: the notebook-level candidates
        models = model_selection

    score = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        score[name] = model.score(x_test, y_test)
    return score

In [None]:
# Fit the baseline models and collect their test-split scores
model_scores = fit_and_score(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)

model_scores

In [None]:
# One-row frame of scores; transposed so each model becomes one bar
model_compare = pd.DataFrame(model_scores, index=['accuracy'])

model_compare.T.plot(kind='bar')

# To improve accuracy, the following steps are covered:
* Hyperparameter tuning
* feature importance
* Confusion Matrix
* Cross Validation
* Precision
* Recall
* F1 score
* Classification Report
* ROC Curve
* Area under the curve (AUC)

### HyperParameters Tuning (by hand)

In [None]:
# Manual KNN tuning: score one classifier for every n_neighbors from 1 to 20
neighbors = range(1, 21)
knn = KNeighborsClassifier()

train_score, test_score = [], []

for k in neighbors:
    # Reconfigure and refit the same estimator for this neighbour count
    knn.set_params(n_neighbors=k).fit(x_train, y_train)

    # Record the score on both splits
    train_score.append(knn.score(x_train, y_train))
    test_score.append(knn.score(x_test, y_test))
    

In [None]:
# Training-split scores, one per n_neighbors value
train_score

In [None]:
# Test-split scores, one per n_neighbors value
test_score

In [None]:
# Plot train and test scores against the number of neighbours
plt.figure(figsize=(15, 8))
ax = plt.gca()

for scores in (train_score, test_score):
    sns.lineplot(x=neighbors, y=scores, ax=ax)

ax.set_xticks(np.arange(1, 21))
ax.set_xlabel('Different values of Neighbors')
ax.set_ylabel('Model Score')
ax.legend(['train_score', 'test_score'])

print(f'Maximum KNN Score on the test data:{max(test_score)*100:.2f}%')

## Hyperparameter tuning with RandomizedSearchCV

Now tuning the following models:
* LogisticRegression()
* RandomForestClassifier()

... using RandomizedSearchCV

In [None]:
# Search space for LogisticRegression: 20 log-spaced C values from 1e-4 to 1e4
log_reg_grid = {'C': np.logspace(-4, 4, 20),
                'solver': ['liblinear']}

# Search space for RandomForestClassifier: tree count, depth and split/leaf size limits
rf_grid = {'n_estimators': np.arange(10, 1000, 50),
           'max_depth': [None, 3, 5, 10],
           'min_samples_split': np.arange(2, 20, 2),
           'min_samples_leaf': np.arange(1, 20, 2)}

Now that the hyperparameter grids are set up for each model, tune them using RandomizedSearchCV...

In [None]:
# Randomized search for LogisticRegression: 20 sampled settings, 5-fold cross-validation
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True, 
                                random_state=42)

# Run the search on the training split only
rs_log_reg.fit(x_train, y_train)

In [None]:
# Best hyperparameters found by the LogisticRegression randomized search
rs_log_reg.best_params_

In [None]:
# Evaluate the randomized search LogisticRegression model on the test split
rs_log_reg.score(x_test, y_test)

Now that LogisticRegression() is tuned, let's do the same for RandomForestClassifier()...

In [None]:
# Randomized search for RandomForestClassifier: 20 sampled settings, 5-fold cross-validation
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True,
                           random_state=42)

# Run the search on the training split only
rs_rf.fit(x_train, y_train)

In [None]:
# Best hyperparameters found by the RandomForestClassifier randomized search
rs_rf.best_params_

In [None]:
# Evaluate the randomized search RandomForestClassifier model on the test split
rs_rf.score(x_test, y_test)

## Hyperparameter Tuning with GridSearchCV

The LogisticRegression model provides the best scores so far, so try to improve it again using GridSearchCV...

In [None]:
# Different hyperparameters for our LogisticRegression model: a finer grid of 30 C values
log_reg_grid = {'C': np.logspace(-4,4,30),
                'solver': ['liblinear']}

# Exhaustive grid search for LogisticRegression with 5-fold cross-validation
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid= log_reg_grid,
                          cv=5,
                          verbose=True)

# Run the grid search on the training split only
gs_log_reg.fit(x_train, y_train)

In [None]:
# Best hyperparameters found by the grid search
gs_log_reg.best_params_

In [None]:
# Evaluate the grid search LogisticRegression model on the test split
gs_log_reg.score(x_test, y_test)

## Evaluating our tuned machine learning classifier, beyond accuracy

* ROC curve and AUC score
* Confusion matrix
* Classification report
* Precision
* Recall
* F1-Score

... and it would be great if cross-validation was used where possible.

To make comparisons and evaluate our trained model, first we need to make predictions.

In [None]:
# Predict the test split with the grid-searched LogisticRegression
y_preds = gs_log_reg.predict(x_test)

y_preds

In [None]:
# True test labels, for comparison with the predictions
y_test

In [None]:
# Plot the ROC curve of the tuned model on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was removed
# in scikit-learn 1.2.
RocCurveDisplay.from_estimator(gs_log_reg, x_test, y_test)

In [None]:
# Confusion matrix of true test labels against predicted labels
confusion_matrix(y_test, y_preds)

In [None]:
# confusion_matrix(y_true, y_pred) puts true labels on the rows and predicted
# labels on the columns, so the heatmap's x-axis is the prediction and its
# y-axis is the truth.
plt.figure(figsize=(3,3))
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, cbar=False)
plt.xlabel('Predicted Value')
plt.ylabel('True Value')

In [None]:
# Text report of classification metrics for the test-split predictions
print(classification_report(y_test, y_preds))

## Calculate evaluation metrics using cross-validation

Calculate the accuracy, precision, recall and F1-score of the model using cross-validation and `cross_val_score()`.

In [None]:
# Re-check the best hyperparameters found by the grid search
gs_log_reg.best_params_

In [None]:
# Create a new classifier from the best hyperparameters found by the grid
# search, rather than from a value copied by hand out of one earlier run
clf = LogisticRegression(**gs_log_reg.best_params_)

In [None]:
# Mean accuracy over 5 cross-validation folds on the full x / y data
cv_acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
cv_acc

In [None]:
# Mean precision over 5 cross-validation folds on the full x / y data
cv_precision = cross_val_score(clf, x, y, cv=5, scoring='precision').mean()
cv_precision

In [None]:
# Mean recall over 5 cross-validation folds on the full x / y data
cv_recall = cross_val_score(clf, x, y, cv=5, scoring='recall').mean()
cv_recall

In [None]:
# Mean F1 over 5 cross-validation folds on the full x / y data
cv_f1 = cross_val_score(clf, x, y, cv=5, scoring='f1').mean()
cv_f1

In [None]:
# Collect the four cross-validated means in a one-row frame,
# then transpose it so each metric becomes one bar
cv_metrics = pd.DataFrame({'Accuracy': cv_acc,
                           'Precision': cv_precision,
                           'Recall': cv_recall,
                           'F1': cv_f1},
                          index=[0])
cv_metrics.T.plot.bar(title='Cross-validated classification metrics', legend=False)

In [None]:
# Fit a LogisticRegression built from the grid search's best hyperparameters
# (not a hand-copied constant) on the training split
clf = LogisticRegression(**gs_log_reg.best_params_)

clf.fit(x_train, y_train)

In [None]:
# Inspect the fitted coefficients
clf.coef_

In [None]:
# Match each coefficient to the column the model was fitted on. Using
# x_train.columns avoids relying on the target being the last column of df.
feature_dict = dict(zip(x_train.columns, clf.coef_[0]))
feature_dict

In [None]:
# Plot the coefficient of each feature as one bar
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.T.plot.bar(title='Feature Importance', legend=False)

# Prediction

In [None]:
# Full (unsplit) target and feature data
y_raw = df.output
x_raw = df.drop('output', axis=1)

In [None]:
# Feature columns asked from the user, and the models whose predictions are stored
_FEATURE_COLUMNS = ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
                    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
_MODEL_NAMES = ['Logistic Regression', 'K Neighbors Classifier', 'Random Forest Classifier']

# Zero-initialised placeholders for one record's inputs and per-model outputs
input_ = dict.fromkeys(_FEATURE_COLUMNS, 0)
output = dict.fromkeys(_MODEL_NAMES, 0)

# Combined record (inputs followed by outputs) and an empty frame with the same columns
final = {**input_, **output}
data_final = pd.DataFrame(columns=_FEATURE_COLUMNS + _MODEL_NAMES)


def input_data():
    """Interactively read one patient record and predict its outcome with three models.

    For every key of the module-level ``input_`` dict the matching description
    is printed and a value is read with ``input()``. The answers are converted
    to ``float64``; a default LogisticRegression, KNeighborsClassifier and
    RandomForestClassifier are each fitted on ``x_raw`` / ``y_raw`` and asked to
    predict the entered record.

    Side effects: the module-level dicts ``input_`` (raw answers), ``output``
    (one prediction array per model) and ``final`` (both combined) are updated
    in place. Nothing is returned.

    Raises
    ------
    ValueError
        If an answer cannot be converted to a float.
    """
    # Prompt text shown before each value is requested
    instruction = {"age": "Age: age in years",
                   "sex": "Sex: sex (1 = male; 0 = female)",
                   "cp": """Chest Pain: chest pain type
-- Value 0: typical angina
-- Value 1: atypical angina
-- Value 2: non-anginal pain
-- Value 3: asymptomatic""",
                   "trtbps": "Trest_bps: resting blood pressure (in mm Hg on admission to the hospital)",
                   "chol": "Cholestoral: serum cholestoral in mg/dl",
                   "fbs": "Fasting Blood Sugar: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)",
                   "restecg": '''Resting Electrocardiographic Results:
-- Value 0: normal
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria''',
                   "thalachh": "Thalach: maximum heart rate achieved",
                   "exng": "Ex_Ang: exercise induced angina (1 = yes; 0 = no)",
                   "oldpeak": "Old_Peak: ST depression induced by exercise relative to rest",
                   "slp": "Slope: the slope of the peak exercise ST segment",
                   "caa": "CA: number of major vessels (0-3) colored by flourosopy",
                   "thall": "Thal: 3 = normal; 6 = fixed defect; 7 = reversable defect",
                   "output": """Output: diagnosis of heart disease (angiographic disease status)
-- Value 0: < 50% diameter narrowing
-- Value 1: > 50% diameter narrowing"""}

    # Ask for every feature; snapshot the keys because the dict is updated in the loop
    feature_names = list(input_)
    for name in feature_names:
        print(instruction[name])
        input_[name] = input(f'{name} : ')

    # Build the single-row frame directly (DataFrame.append was removed in
    # pandas 2.0) and convert the string answers to floats
    temp = pd.DataFrame([input_], columns=feature_names).astype(np.float64)

    # Fresh default models, fitted on the full dataset
    models = {'Logistic Regression': LogisticRegression(),
              'K Neighbors Classifier': KNeighborsClassifier(),
              'Random Forest Classifier': RandomForestClassifier()}

    # Seed NumPy's global RNG before fitting
    np.random.seed(42)
    for name, model in models.items():
        model.fit(x_raw, y_raw)
        output[name] = model.predict(temp)

    # Merge the raw answers and the predictions into the combined record
    final.update(input_)
    final.update(output)
    

In [None]:
# input_data()

In [None]:
# data_final = pd.concat([data_final, pd.DataFrame([final])], ignore_index=True)

# data_final