In [None]:
import numpy as np 
import pandas as pd

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# import modules
import seaborn as sns
sns.set_style('darkgrid') # set grid for all graphs
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from collections import Counter

> [](http://)

# Description of the variables in the data set

About this dataset
Age : Age of the patient

Sex : Sex of the patient

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type (Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic)

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalach : maximum heart rate achieved

target : 0= less chance of heart attack 1= more chance of heart attack

# Loading the data and a first look at the data frame

In [None]:
# Read the heart-attack CSV from the Kaggle input directory into the main DataFrame.
data = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

We have only numeric (int, float) types

In [None]:
data.isnull().sum()

Data doesn't have missing values

In [None]:
# Pie chart of the class balance of the target column, drawn on an explicit axes.
fig, ax = plt.subplots()
ax.set_title('Distribution of target variable')
data['output'].value_counts().plot.pie(ax=ax, autopct="%.2f")
plt.show()

In [None]:
# Report the number of distinct values per column (computed once for the whole frame).
for col, n_unique in data.nunique().items():
    print(col + " has %s unique values" % n_unique)

We can see that some of the variables have only 2-5 unique values, so I will treat them as categorical variables. Let's collect the numeric and categorical columns separately.

In [None]:
# Columns with more than 5 distinct values are treated as numeric,
# everything else as categorical; column order follows the data frame.
unique_counts = data.nunique()
is_numeric = unique_counts > 5
num_cols = unique_counts[is_numeric].index.tolist()
cat_cols = unique_counts[~is_numeric].index.tolist()
print('Numeric columns are:', num_cols, 'Categorical columns are:', cat_cols, sep='\n')

In [None]:
data[num_cols].describe()

Distribution of numeric data

# Visualization

In [None]:
sns.pairplot(data, vars=num_cols, hue='output', corner=True)
plt.show()

**Conclusions:**
* It doesn't look like any of the variables are strongly correlated. Perhaps only 'thalachh' and 'age' have a slight negative correlation.
* The histograms of 'oldpeak' are interesting: most observations have a low 'oldpeak' (< 2), and most of those belong to the group with a higher chance of heart attack.
* The histograms of 'thalachh' are interesting: a higher maximum heart rate seems to go along with a higher chance of heart attack.

In [None]:
sns.histplot(data=data, x="thalachh", hue="output", kde=True)
plt.show()

**Not all variables were described on the dataset page, so I took this description from the [discussions](http://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset/discussion/234843):**

1. age - age in years

2. sex - sex (1 = male; 0 = female)

3. cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 0 = asymptomatic)

4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholesterol in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

7. restecg - resting electrocardiographic results (1 = normal; 2 = having ST-T wave abnormality; 0 = hypertrophy)

8. thalach - maximum heart rate achieved

9. exang - exercise induced angina (1 = yes; 0 = no)

10. oldpeak - ST depression induced by exercise relative to rest

11. slope - the slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)

12. ca - number of major vessels (0-3) colored by fluoroscopy

13. thal - 2 = normal; 1 = fixed defect; 3 = reversible defect

14. num - the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

Thanks to [jaykumar1607](http://www.kaggle.com/jaykumar1607)

**Here I would like to examine the dependence between the "chance of heart attack" and the other features. But first I have to convert some numeric data to categorical (binning).**

In [None]:
cat_data = data.copy()

In [None]:
cat_data.head()

# Binning and rename values

In [None]:
def calc_max_pulse(col_max_heart_rate, col_age):
    """Label each achieved maximum heart rate as 'normal' or 'critical'.

    The reference maximum for a person is taken as ``220 - age``. A measured
    value that is less than or equal to that reference is 'normal', anything
    above it is 'critical'.

    Parameters
    ----------
    col_max_heart_rate : array-like
        Maximum heart rate achieved (the 'thalachh' column).
    col_age : array-like
        Age in years, aligned position-by-position with ``col_max_heart_rate``.

    Returns
    -------
    list of str
        One 'normal' / 'critical' label per input position.
    """
    heart_rate = np.asarray(col_max_heart_rate)
    age = np.asarray(col_age)
    # Vectorised comparison instead of a Python-level loop; .tolist() keeps
    # the original return type (a plain list of Python strings).
    return np.where(heart_rate <= 220 - age, 'normal', 'critical').tolist()

In [None]:
## binning of numerical variables

# Blood pressure classes: Optimal < 120, Normal 120-129, High normal 130-139,
# Grade 1 140-159, Grade 2 160-179, Grade 3 >= 180.
# Intervals are left-closed (right=False) so that a reading of exactly 120
# lands in 'Normal'; with right-closed bins ending at 120 it was put in 'Optimal'.
cat_data['trtbps'] = pd.cut(
    cat_data['trtbps'],
    bins=[0, 120, 130, 140, 160, 180, np.inf],
    right=False,
    labels=['Optimal', 'Normal', 'High normal',
            'Grade 1 hypertension', 'Grade 2 hypertension',
            'Grade 3 hypertension'],
)

# Must run before 'age' is binned below: calc_max_pulse needs the numeric age.
cat_data['thalachh'] = calc_max_pulse(cat_data['thalachh'], cat_data['age'])

cat_data['age'] = pd.cut(cat_data['age'], bins=[0, 45, 60, 200], labels=['Adults', 'Mid Adults', 'Elderly'])

# Cholesterol classes: Ideal < 200, Borderline high 200-239, High >= 240.
# Left-closed intervals put exactly 200 into 'Borderline high' (it was 'Ideal' before).
cat_data['chol'] = pd.cut(
    cat_data['chol'],
    bins=[0, 200, 240, np.inf],
    right=False,
    labels=['Ideal', 'Borderline high', 'High'],
)


[Binning blood pressure](http://en.wikipedia.org/wiki/Blood_pressure)

[Age binning](http://kidspicturedictionary.com/english-through-pictures/people-english-through-pictures/age-physical-description/) thanks to [bhuvanchennoju](http://www.kaggle.com/bhuvanchennoju) and his [great notebook](https://www.kaggle.com/bhuvanchennoju/data-stroytelling-auc-focus-on-strokes#Summary)

[Cholesterol level](http://www.cholesterolmenu.com/cholesterol-levels-chart/)
I suspect that our data['chol'] is 'Total cholesterol'

**Rename features for more informative graphs**

In [None]:
# Readable labels for the integer codes of each categorical feature.
value_labels = {
    'sex': {0: 'female', 1: 'male'},
    'cp': {1: 'typical angina', 2: 'atypical angina', 3: 'non-anginal pain', 0: 'asymptomatic'},
    'output': {0: 'less risk', 1: 'risk'},
    'fbs': {0: 'blood sugar less 120 mg/dl', 1: 'blood sugar more 120 mg/dl'},
    'restecg': {1: 'normal', 2: 'having ST-T wave abnormality', 0: 'hypertrophy'},
    'exng': {0: 'no exercise', 1: 'exercise induced angina'},
    'slp': {0: 'downsloping', 1: 'flat', 2: 'upsloping'},
    'caa': {0: '0 major vessel', 1: '1 major vessel', 2: '2 major vessels', 3: '3 major vessels', 4: '4 major vessels'},
    'thall': {1: 'fixed defect', 2: 'normal', 3: 'reversable defect'},
}

# Assign the result back to the column instead of calling
# replace(inplace=True) on cat_data[col]: the chained in-place form acts on an
# intermediate Series, emits a FutureWarning on pandas 2.x and leaves cat_data
# unchanged once Copy-on-Write is enabled.
for column, labels in value_labels.items():
    # replace() keeps codes that have no label as they are (e.g. thall == 0).
    cat_data[column] = cat_data[column].replace(labels)

In [None]:
# One count plot per feature (all columns except the continuous 'oldpeak'),
# laid out on a 4x4 grid and split by risk group on the x axis.
fig = plt.figure(figsize=(25,25))
plot_columns = cat_data.drop(columns='oldpeak').columns
for position, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(4, 4, position)
    sns.countplot(data=cat_data, x='output', hue=column, ax=ax)
plt.show()

**Conclusions:** 
* Women are more vulnerable than men
* Chest pain (cp) of the angina type (typical/atypical) can indicate people who are at risk
* A bit confusing: we have more risk observations in the "Optimal" blood pressure (trtbps) group and in the "Normal" group of resting electrocardiographic results (restecg). The same holds for thall = "normal"
* Cholesterol and blood sugar are not informative for us
* "Exng", "slp" and "caa" each show one strong tendency in the risk group
* There is a strange value for the feature 'thall' = 0. It isn't described anywhere.


I would say these plots are a bit confusing. I expected to see some correlation between a high heart rate and cholesterol, or that elderly people would be more likely to be at risk, but this is not observed.

In [None]:
cat_data[(cat_data['thall'] == 0)]

In [None]:
cat_data.groupby(['output', 'thall']).agg({'output': 'count'})

We have 2 such observations: one in the "risk" group and another in the "less risk" group. I will replace the "thall = 0" values with the most frequent value of the "risk" and "less risk" groups respectively.

In [None]:
cat_data.loc[((cat_data['thall'] == 0) & (cat_data['output'] == 'less risk')), ('thall')] = 'reversable defect'

In [None]:
cat_data.loc[((cat_data['thall'] == 0) & (cat_data['output'] == 'risk')), ('thall')] = 'normal'

In [None]:
cat_data.groupby(['output', 'thall']).agg({'output': 'count'})

Check correlations between features (I use main data without binning)

In [None]:
fig = plt.figure(figsize=(15,15))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

* The strongest correlation is between the features "slp" and "oldpeak", and it is negative.
* The target feature "output" has its strongest positive correlations with "cp" and "thalachh", and its strongest negative correlations with "exng" and "oldpeak".

# Preparation data for modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
Counter(data.output)

As we can see, the target variable is balanced, so we don't have to do any "undersampling" or "oversampling". We just split our data with "train_test_split".

In [None]:
X = data.drop('output', axis=1)
y = data.output

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

**I would like to check the difference between raw, normalized and binned data. I will create 2 new data frames: the first ("scaled_X_train") with normalized train data (using Standard Scaler), the second ("encodered_X_train") with numeric columns transformed to categorical (binned as I did above and then passed through Label Encoder). Then I will compare the results.**

In [None]:
scaled_X_train = X_train.copy()

In [None]:
scaled_X_train.head()

In [None]:
# Standardise the continuous columns using statistics learned from the
# training split only; the fitted scaler is reused for the test split later.
array = ['age', 'trtbps', 'chol', 'thalachh']
scaler = StandardScaler().fit(scaled_X_train[array])
X_train_scaled = scaler.transform(scaled_X_train[array])
normalize_results = pd.DataFrame(data=X_train_scaled, columns=array, index=scaled_X_train.index)
# Swap the raw columns for their scaled versions (scaled columns end up last).
scaled_X_train = pd.concat([scaled_X_train.drop(columns=array), normalize_results], axis=1)

In [None]:
scaled_X_train.head()

***We do the scaling after splitting to avoid data leakage from the test data!***

In [None]:
encodered_X_train = X_train.copy()

Do the same I did before in binning section

In [None]:
# Blood pressure classes: Optimal < 120, Normal 120-129, High normal 130-139,
# Grade 1 140-159, Grade 2 160-179, Grade 3 >= 180.
# Intervals are left-closed (right=False) so that a reading of exactly 120
# lands in 'Normal'; with right-closed bins ending at 120 it was put in 'Optimal'.
encodered_X_train['trtbps'] = pd.cut(
    encodered_X_train['trtbps'],
    bins=[0, 120, 130, 140, 160, 180, np.inf],
    right=False,
    labels=['Optimal', 'Normal', 'High normal',
            'Grade 1 hypertension', 'Grade 2 hypertension',
            'Grade 3 hypertension'],
)

# Must run before 'age' is binned below: calc_max_pulse needs the numeric age.
encodered_X_train['thalachh'] = calc_max_pulse(encodered_X_train['thalachh'], encodered_X_train['age'])

encodered_X_train['age'] = pd.cut(encodered_X_train['age'], bins=[0, 45, 60, 200], labels=['Adults', 'Mid Adults', 'Elderly'])

# Cholesterol classes: Ideal < 200, Borderline high 200-239, High >= 240.
# Left-closed intervals put exactly 200 into 'Borderline high' (it was 'Ideal' before).
encodered_X_train['chol'] = pd.cut(
    encodered_X_train['chol'],
    bins=[0, 200, 240, np.inf],
    right=False,
    labels=['Ideal', 'Borderline high', 'High'],
)

In [None]:
# Turn the binned columns into integer codes.
# Category codes are used instead of LabelEncoder: LabelEncoder is intended
# for target labels and numbers classes alphabetically (e.g. 'Grade 1
# hypertension' < 'Normal' < 'Optimal'), which destroys the ordering of the
# bins. The codes of a pd.cut result follow the bin order, so the encoded
# feature stays ordinal. Plain string columns are numbered alphabetically.
for title in array:
    encodered_X_train[title] = encodered_X_train[title].astype('category').cat.codes

In [None]:
# IT CAN BE USED INSTEAD OF LabelEncoder()

# col_for_dummy = ['age', 'cp', 'trtbps', 'chol', 'thalachh', 'slp', 'caa', 'thall']
# dummy_df = pd.get_dummies(encodered_X_train[col_for_dummy])
# encodered_X_train.drop(col_for_dummy, axis=1, inplace=True)
# dummy_X_train = pd.concat([encodered_X_train, dummy_df], axis=1)
# encodered_X_train = dummy_X_train

In [None]:
encodered_X_train.head()

In [None]:
X_train.head()

Now we have 3 datasets for modeling

# Modeling

Import modeling modules

In [None]:
# models
from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             precision_recall_curve, roc_curve, 
                             plot_precision_recall_curve, plot_confusion_matrix)

# for regular expressions
import re 

Firstly, let's check which dataset fits more

In [None]:
models = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    SVC(probability = True),
    XGBClassifier(eval_metric = 'logloss'),
    LGBMClassifier(),
]

# Display name of every estimator: its class name.
model_names = [type(model).__name__ for model in models]
train_data = [X_train, scaled_X_train, encodered_X_train]

# For each algorithm report the mean 5-fold CV accuracy on the raw, the scaled
# and the binned/encoded version of the training features.
raw_features, scaled_features, encoded_features = train_data
for name, clf in zip(model_names, models):
    raw_accuracy = cross_val_score(clf, raw_features, y_train, cv=5).mean().round(3)
    print('Algorithm is: %s' % name)
    print('Raw data accuracy:', raw_accuracy)
    scaled_accuracy = cross_val_score(clf, scaled_features, y_train, cv=5).mean().round(3)
    print('Scaled data accuracy:', scaled_accuracy)
    encoded_accuracy = cross_val_score(clf, encoded_features, y_train, cv=5).mean().round(3)
    print('Encodered data accuracy:', encoded_accuracy)
    print()

KNN and SVC are sensitive to data preparation, but in the other cases the difference is not significant. I prefer to use the scaled data.

In [None]:
def cross_valid_scores(models_array, X_tr, y_tr):
    """Cross-validate several classifiers and rank them by ROC AUC.

    Every estimator in ``models_array`` is evaluated with 5-fold
    cross-validation on ``(X_tr, y_tr)``. For each metric the mean test score
    over the folds is rounded to 3 decimals and expressed in percent.

    Parameters
    ----------
    models_array : iterable of estimators
        Unfitted scikit-learn compatible classifiers to compare.
    X_tr : array-like
        Training features.
    y_tr : array-like
        Training target.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns "Accuracy Score",
        "Precision Score", "Recall Score", "f1 Score", "AUC Score" and
        "Algorithm" (the estimator's class name), sorted by "AUC Score" in
        descending order with a fresh 0..n-1 index.
    """
    # cross_validate scorer name -> column name in the result table
    metric_columns = {
        'accuracy': 'Accuracy Score',
        'precision_macro': 'Precision Score',
        'recall_macro': 'Recall Score',
        'f1_macro': 'f1 Score',
        'roc_auc': 'AUC Score',
    }
    results = {column: [] for column in metric_columns.values()}
    results['Algorithm'] = []

    # Iterate over the estimators that were actually passed in (previously the
    # notebook-level `models` / `model_names` globals were used, so the
    # `models_array` argument was silently ignored).
    for clf in models_array:
        scores = cross_validate(clf, X_tr, y_tr, cv=5, scoring=list(metric_columns))
        for metric, column in metric_columns.items():
            results[column].append(scores['test_' + metric].mean().round(3) * 100)
        results['Algorithm'].append(type(clf).__name__)

    results_df = pd.DataFrame(results)
    return (results_df.sort_values(by='AUC Score', ascending=False)
            .reset_index(drop=True))

In [None]:
# Apply the scaler fitted on the training split to the test split
# (transform only, no re-fitting), keeping the same column layout.
scaled_X_test = X_test.copy()
X_test_scaled = scaler.transform(scaled_X_test[array])
normalize_results = pd.DataFrame(data=X_test_scaled, columns=array, index=scaled_X_test.index)
scaled_X_test = pd.concat([scaled_X_test.drop(columns=array), normalize_results], axis=1)

Normalized test set

In [None]:
scaled_X_test.head()

In [None]:
cross_valid_scores(models, scaled_X_train, y_train)

I will maximize the "Recall score" because, in my opinion, we must not miss the positive class (heart risk). Below I'll use Logistic Regression for the final model.

# Logistic Regression tune with Grid Search

In [None]:
# Metrics recorded for every candidate; the final model is refit on the
# candidate with the best (least negative) log loss.
scoring = {'Accuracy': 'accuracy', 'AUC': 'roc_auc', 'Log_loss': 'neg_log_loss', 'Recall': 'recall'}

# Candidate values of the regularisation parameter C: 1e-05 up to 3 in steps of 0.1.
param_grid = {'C': np.arange(1e-05, 3, 0.1)}

grid_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=scoring,
    refit='Log_loss',
    cv=5,
    return_train_score=True,
)

grid_log_reg.fit(scaled_X_train, y_train)

In [None]:
grid_log_reg.cv_results_

In [None]:
grid_log_reg.best_params_

In [None]:
best_clf = grid_log_reg.best_estimator_

In [None]:
best_clf.score(scaled_X_train, y_train).round(2)*100

In [None]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Mean 5-fold CV score of the tuned model for each metric, shown in percent.
for metric in scoring:
    mean_score = cross_val_score(best_clf, scaled_X_train, y_train, scoring=metric, cv=5).mean()
    print(metric, round(mean_score, 2)*100)

# Prediction

In [None]:
prediction = best_clf.predict(scaled_X_test)
probability = best_clf.predict_proba(scaled_X_test)

In [None]:
probability_train = best_clf.predict_proba(scaled_X_train)

In [None]:
# Test-set metrics of the default-threshold predictions, shown in percent.
test_metrics = [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1-score: ', f1_score),
]
for label, metric in test_metrics:
    print(label, round(metric(y_test, prediction),2)*100)
print()
print('Confusion matrix: ', confusion_matrix(y_test, prediction), sep='\n')

In [None]:
# ROC curves of the tuned model on the train and test splits.
train_auc = roc_auc_score(y_train, probability_train[:, 1])
test_auc = roc_auc_score(y_test, probability[:, 1])

# roc_curve returns (fpr, tpr, thresholds); only fpr and tpr are plotted.
train_fpr, train_tpr = roc_curve(y_train, probability_train[:, 1])[:2]
test_fpr, test_tpr = roc_curve(y_test, probability[:, 1])[:2]

plt.figure()
plt.plot(train_fpr, train_tpr, label='train AUC={:.4f}'.format(train_auc))
# The test curve was previously labelled 'train AUC' as well.
plt.plot(test_fpr, test_tpr, label='test AUC={:.4f}'.format(test_auc))

# The legend is created before the diagonal is drawn, so the chance line
# does not get a legend entry.
legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
legend_box.set_facecolor('white')
legend_box.set_edgecolor('black')
plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')

plt.show()

# Gain Recall to 95% by specifying threshold for positive class

In [None]:
pd.Series(probability[:, 1]).hist()
plt.show()

Histogram of predicted probabilities. We can see the highest area between 0.4-0.5, in the flat middle. I will set threshold = 0.4 to include this area in the positive class.

In [None]:
# Precision-recall curve of the tuned model on the test split.
# NOTE(review): plot_precision_recall_curve was deprecated in scikit-learn 1.0
# and removed in 1.2; on newer versions the equivalent call is
# PrecisionRecallDisplay.from_estimator(best_clf, scaled_X_test, y_test)
# (the import cell has to be adjusted accordingly).
plot_precision_recall_curve(best_clf, scaled_X_test, y_test)

This is precision-recall curve where we can see dependence between these variables

In [None]:
prediction_with_threshold = np.where(probability[:, 1] > 0.4, 1, 0)

In [None]:
# Test-set metrics of the predictions made with the adjusted threshold, in percent.
threshold_metrics = [
    ('Accuracy with adjusted threshold: ', accuracy_score),
    ('Precision with adjusted threshold: ', precision_score),
    ('Recall with adjusted threshold: ', recall_score),
    ('F1-score with adjusted threshold: ', f1_score),
]
for label, metric in threshold_metrics:
    print(label, round(metric(y_test, prediction_with_threshold),2)*100)
print()
print('Confusion matrix with adjusted threshold: ', confusion_matrix(y_test, prediction_with_threshold), sep='\n')

# Summary:
* loaded a few modules and looked through the dataset
* binned some features
* visualized a few relationships between variables, as well as individual variables
* visualized feature correlations
* prepared and compared datasets with different scaling
* chose an algorithm with cross-validation
* tuned the model with GridSearchCV
* raised recall to the specified value (95%).

**Thanks for reading. Upvote, if this notebook was useful or interesting for you**