# Heart Risk Prediction using supervised ML

In [None]:
import sklearn
import numpy as np
import io
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter(action='ignore')
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
kfold = StratifiedKFold(n_splits=5)

import missingno as msno

import util

## *Data Loading from Source*

### Using UCI heart risk data used in a Kaggle competition
- Download UCI Cleveland heart risk data available on Kaggle
https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

In [None]:
df_heart_disease = pd.read_csv('heart_disease_dataset_UCI.csv')

In [None]:
df_heart_disease.head()

## Description of feature vectors:

- **age**: The person's age in years
- **sex**: The person's sex (1 = male, 0 = female)
- **cp**:         
        0 = typical angina
        1 = atypical angina
        2 = non-anginal pain
        3 = asymptomatic
- **trestbps**: The person's resting blood pressure (mm Hg on admission to the hospital)
- **chol**: The person's cholesterol measurement in mg/dl
- **fbs**: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
- **restecg**: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- **thalach**: The person's maximum heart rate achieved
- **exang**: Exercise induced angina (1 = yes; 0 = no)
- **oldpeak**: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot. See more here)
- **slope**: 
        0 = upsloping
        1 = flat
        2 = downsloping
- **ca**: The number of major vessels (0-3)
- **thal**: A blood disorder called thalassemia 
        0: NULL (dropped from the dataset previously)
        1: fixed defect (no blood flow in some part of the heart)
        2: normal blood flow
        3: reversible defect (a blood flow is observed but it is not normal)
- **target**: Heart disease (0 = no, 1 = yes)



## Data Pre-Processing:
- Data cleaning 
- Remove NANs
- Look for missing and unimportant data

***Check for Null and NANs***
Need to convert object types to numerical types

In [None]:
df_heart_disease.dtypes

In [None]:
df_heart_disease.isnull().sum()

In [None]:
df_heart_disease.isna().sum()

***Visualize missing values***

In [None]:
msno.matrix(df_heart_disease);

In [None]:
df_heart_disease.columns

In [None]:
df_heart_disease.columns.duplicated()

In [None]:
df_heart_disease.describe()

***Consider rows with target = 1 or target = 0. Ignore other values***

In [None]:
# Keep only the rows whose label is 0 or 1; rows with any other target value are discarded.
has_binary_target = df_heart_disease['target'].isin([0, 1])
df_heart_disease = df_heart_disease[has_binary_target]

***Check outliers using the Interquartile Range (IQR)***

We are not going to remove outliers, since they carry valuable information about certain types of patients. However, this check can give us an idea of the necessity of the exploratory data analysis.

In [None]:
# Per-column quantiles used as the "quartiles" for the outlier check below.
# NOTE(review): Q1 is taken at the 0.95 quantile and Q3 at the 0.1 quantile, so
# Q1 >= Q3 and the "IQR" computed here is <= 0 for every column. A conventional
# IQR uses quantile(0.25) and quantile(0.75). Do not change this in isolation:
# the following cells derive the list of columns to drop from these values.
Q1 = df_heart_disease.quantile(0.95)
Q3 = df_heart_disease.quantile(0.1)
IQR = Q3 - Q1

In [None]:
Q1

In [None]:
IQR

In [None]:
# Boolean frame (same shape as the data): True where a value is strictly below
# the lower fence or strictly above the upper fence.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df_heart_disease_IQR = (df_heart_disease < lower_fence) | (df_heart_disease > upper_fence)

In [None]:
df_heart_disease_IQR

***Drop Column which holds outlier***

In [None]:
# Per column: True only when *every* row of that column was flagged in df_heart_disease_IQR.
Row_with_outliers = df_heart_disease_IQR.all(axis=0)
# Labels of the columns that were NOT flagged in all rows; the next cell passes
# this list to drop(axis=1).
# NOTE(review): despite its name, Row_with_outliers is indexed by column, and this
# selects every column that has at least one unflagged value - confirm this is intended.
indx = Row_with_outliers[Row_with_outliers== False].index.to_list()
indx

In [None]:
df_heart_disease = df_heart_disease.drop(indx, axis = 1)

In [None]:
df_heart_disease.reset_index(drop=True, inplace=True)

In [None]:
df_heart_disease.head()

In [None]:
df_heart_disease.plot(kind='box', subplots=True, layout=(2,7),
sharex=False,sharey=False, figsize=(20, 10), 
color='deeppink');

In [None]:
df_heart_disease.describe()
#df_heart_disease.style.set_table_attributes('style="font-size: 10px"')

In [None]:
def data_Sex(sex):
    """Return 'female' when the ``sex`` code equals 0, and 'male' for any other value."""
    return 'female' if sex == 0 else 'male'

def data_target(target):
    """Map the numeric ``target`` label to a readable class name.

    The feature description of this dataset encodes target as
    1 = heart disease and 0 = no heart disease.

    Args:
        target: Numeric target value of one row.

    Returns:
        'With Heart Disease' when ``target`` equals 1, otherwise
        'No Heart Disease'.
    """
    if target == 1:
        return 'With Heart Disease'
    return 'No Heart Disease'
    
def data_thal(thal):
    """Translate the ``thal`` code into a label.

    0 is 'Non conclusive', 1 is 'Fixed Defect', 2 is 'Normal'; every other
    value is reported as 'Reversible defect'.
    """
    known_codes = ((0, 'Non conclusive'), (1, 'Fixed Defect'), (2, 'Normal'))
    for code, label in known_codes:
        if thal == code:
            return label
    return 'Reversible defect'

def data_cp(cp):
    """Translate the chest-pain code ``cp`` into a label.

    Codes 0, 1 and 2 map to 'Typical angina', 'Atypical angina' and
    'Non-anginal pain'; any other value is labelled 'Asymptomatic'.
    """
    labels_by_code = ('Typical angina', 'Atypical angina', 'Non-anginal pain')
    for code, label in enumerate(labels_by_code):
        if cp == code:
            return label
    return 'Asymptomatic'
    
def data_restecg(restecg):
    """Translate the resting-ECG code ``restecg`` into a label.

    Follows the feature description of this dataset: 0 = normal,
    1 = ST-T wave abnormality, 2 = left ventricular hypertrophy.

    Args:
        restecg: Numeric resting-ECG code of one row.

    Returns:
        'Normal' for 0, 'abnormality in ST-T wave' for 1, and
        'left ventricular hypertrophy' for any other value.
    """
    if restecg == 0:
        return 'Normal'
    if restecg == 1:
        return 'abnormality in ST-T wave'
    return 'left ventricular hypertrophy'

def data_st_slope(slope):
    """Translate the ST-segment ``slope`` code into a label.

    Follows the feature description of this dataset: 0 = upsloping,
    1 = flat, 2 = downsloping.

    Args:
        slope: Numeric slope code of one row.

    Returns:
        'upsloping' for 0, 'flat' for 1, and 'downsloping' for any other value.
    """
    if slope == 0:
        return 'upsloping'
    if slope == 1:
        return 'flat'
    return 'downsloping'

def data_age(age):
    """Bin an age in years into a patient group.

    Below 30 is 'young patients', from 30 up to (not including) 60 is
    'middle aged patients', and everything else is 'elderly patients'.
    """
    if age < 30:
        return 'young patients'
    if 30 <= age < 60:
        return 'middle aged patients'
    return 'elderly patients'
    
def data_chol(chol):
    """Label a cholesterol value: below 200 is normal, 200 and above is high."""
    return 'Normal Cholesterol Level' if chol < 200 else 'High Cholesterol Level'

    
def data_trestbps(trestbps):
    """Label a resting blood pressure value.

    Below 90 is 'Low Blood Pressure', 90 through 130 inclusive is
    'Normal Blood Pressure', and anything else is 'High Blood Pressure'.
    """
    if trestbps < 90:
        return 'Low Blood Pressure'
    if 90 <= trestbps <= 130:
        return 'Normal Blood Pressure'
    return 'High Blood Pressure'
    
    
# Human-readable copy of the data: coded columns are replaced by their labels
# and three binned columns (age_class, chol_level, bp_level) are appended.
df_heart_disease_with_catagoricalData = df_heart_disease.copy()

# (destination column, source column, labelling function), applied in this order.
_label_specs = [
    ('sex', 'sex', data_Sex),
    ('target', 'target', data_target),
    ('thal', 'thal', data_thal),
    ('cp', 'cp', data_cp),
    ('restecg', 'restecg', data_restecg),
    ('slope', 'slope', data_st_slope),
    ('age_class', 'age', data_age),
    ('chol_level', 'chol', data_chol),
    ('bp_level', 'trestbps', data_trestbps),
]
for _dest, _src, _labeller in _label_specs:
    df_heart_disease_with_catagoricalData[_dest] = df_heart_disease[_src].apply(_labeller)

In [None]:
# Move 'target' to the last position: pop it, then re-insert it after all remaining columns.
col_to_move = df_heart_disease_with_catagoricalData.pop('target')
last_position = df_heart_disease_with_catagoricalData.shape[1]
df_heart_disease_with_catagoricalData.insert(last_position, 'target', col_to_move)

In [None]:
df_heart_disease_with_catagoricalData.head()

## **_Exploratory Data Analysis_**:

In [None]:
fig, out_fig = plt.subplots(figsize = (20,15))
plt.xticks(rotation=45)
out_fig = sns.boxplot(data = df_heart_disease, orient="h", palette="crest")

In [None]:
# One scatter plot per column: row position on the x axis, column value on the y axis.
# The x range is taken from the frame itself so the x and y lengths always agree.
n_samples = len(df_heart_disease)
for i, col in enumerate(df_heart_disease.columns.values):
    plt.subplot(5, 3, i + 1)
    plt.scatter(range(n_samples), df_heart_disease[col].values.tolist())
    plt.title(col)
# Figure size and layout only need to be set once, after all subplots exist.
fig, ax = plt.gcf(), plt.gca()
fig.set_size_inches(10, 10)
plt.tight_layout()
plt.show()

In [None]:
%matplotlib inline
fig, axis = plt.subplots(7,2,figsize=(10, 17));
df_heart_disease.hist(ax=axis);

In [None]:
# Split the columns into categorical (at most 15 distinct values) and continuous.
categorical_val = []
continous_val = []

for column in df_heart_disease.columns:
    if len(df_heart_disease[column].unique()) <= 15:
        categorical_val.append(column)
    else:
        continous_val.append(column)

# The label column is excluded by name rather than by assuming it is the last entry.
categorical_features = [column for column in categorical_val if column != 'target']

# Overlay, for every categorical feature, the histogram of each class.
plt.figure(figsize=(12, 12))
for i, column in enumerate(categorical_features, 1):
    plt.subplot(3, 3, i)
    df_heart_disease[df_heart_disease["target"] == 0][column].hist(bins=35, color='blue', label='Without Heart Disease', alpha=0.6)
    df_heart_disease[df_heart_disease["target"] == 1][column].hist(bins=35, color='red', label='With Heart Disease', alpha=0.6)
    plt.xlabel(column)
    plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(6,5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='age_class',hue='target')
plt.title('Relationship between age and risk of heart disease \n');
plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(15, 7)})
sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target']=='With Heart Disease'], x='age',hue='sex')
plt.title('Relationship between gender and risk of heart disease at all age \n');
plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(6,5), 'xtick.labelsize':10})
# Rows labelled 'With Heart Disease' only, counted per cholesterol level and split by sex.
plot_ = sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target']=='With Heart Disease'], x='chol_level', hue='sex')
plt.title('Relationship between cholesterol level and gender \n');
plt.tight_layout();
plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target'] == 'With Heart Disease'], x='sex',hue='thal')
plt.title('Relationship between gender and risk of heart disorder "thalassemia" \n');
plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='cp',hue='target')
plt.title('Chest Pain varying with existence of heart disease \n');
plt.legend(loc='upper right');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='chol_level',hue='target')
plt.title('Effect of cholesterol level and  heart disease \n');

In [None]:
sns.countplot(data= df_heart_disease_with_catagoricalData, x='restecg',hue='target')
plt.title('Resting electrocardiographic measurement varying with existence of heart disease \n');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='bp_level',hue='target')
plt.title('Effect of blood pressure level and  heart disease \n');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='slope',hue='target')
plt.title('Effect of slope and  heart disease \n');

In [None]:
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='ca',hue='target')
plt.title('Effect of # of blood vessel and  heart disease \n');

In [None]:
pal = sns.light_palette("blue", as_cmap=True)
sns.jointplot(data=df_heart_disease[df_heart_disease['target'] == 1],
              x='chol',
              y='trestbps',
              kind='hex',
              cmap='Reds');

In [None]:
pal = sns.light_palette("blue", as_cmap=True)
sns.jointplot(data=df_heart_disease[df_heart_disease['target'] == 1],
              x='thalach',
              y='chol',
              kind='hex',
              cmap='Reds');

In [None]:
plt.figure(figsize=(10,5))
sns.pointplot(x=df_heart_disease['age'],y=df_heart_disease['thalach'],color='red',alpha=0.8)
plt.xlabel('Age',fontsize = 15,color='blue')
plt.xticks(rotation=45)
plt.ylabel('Thalach',fontsize = 15,color='blue')
plt.title('Age vs Thalach',fontsize = 15,color='blue')
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.pointplot(x=df_heart_disease['age'],y=df_heart_disease['chol'],color='red', alpha=0.8)
plt.xlabel('Age',fontsize = 15,color='blue')
plt.xticks(rotation=45)
plt.ylabel('Chol',fontsize = 15,color='blue')
plt.title('Age vs Chol',fontsize = 15,color='blue')
plt.grid()
plt.show()

In [None]:
# Point plot of oldpeak against age; the y-axis label names the plotted column.
plt.figure(figsize=(10,5))
sns.pointplot(x=df_heart_disease['age'],y=df_heart_disease['oldpeak'],color='red', alpha=0.8)
plt.xlabel('Age',fontsize = 15,color='blue')
plt.xticks(rotation=45)
plt.ylabel('oldpeak',fontsize = 15,color='blue')
plt.title('Age vs oldpeak',fontsize = 15,color='blue')
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.pointplot(x=df_heart_disease['age'],y=df_heart_disease['trestbps'],color='red', alpha=0.8)
plt.xlabel('Age',fontsize = 15,color='blue')
plt.xticks(rotation=45)
plt.ylabel('trestbps',fontsize = 15,color='blue')
plt.title('Age vs trestbps',fontsize = 15,color='blue')
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(14,7))
sns.heatmap(df_heart_disease.drop('target', axis=1).corr(),annot=True,cmap="magma",fmt='.2f');

In [None]:
import hvplot.pandas

df_heart_disease.drop('target', axis=1).corrwith(df_heart_disease.target).hvplot.barh(
    width=600, height=400, 
    title="Correlation between Heart Disease and Feature Vector", 
    ylabel='Correlation', xlabel='Feature Vector',
)

### Features are not highly correlated. So it is reasonable to use these features in the machine learning model.

In [None]:
#Top  x% features to consider
fraction_of_top_features = 0.8

X = df_heart_disease.drop('target', axis=1)
y = df_heart_disease['target']
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=0.25, random_state=42)

transformer = Normalizer()
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.transform(X_test)

# *Important Feature Selection*:
### Univariate Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the label with the chi-squared test (k='all' keeps every score).
UV_model = SelectKBest(chi2, k='all').fit(X_train_scaled, y_train)

# Column positions ordered from the highest to the lowest score.
mask = np.argsort(UV_model.scores_)[::-1]
best_features_SKBest = X_train.columns[mask]
# Keep only the top fraction, as the Random Forest and XGBoost rankings do, so this
# ranking actually constrains the intersection of the three selections.
best_features_SKBest = best_features_SKBest[:int(fraction_of_top_features*len(best_features_SKBest))]

best_features_SKBest

### Feature importance ranking with Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Fit a random forest on the normalised training data and read its feature importances.
model_rf = RandomForestClassifier(max_depth=5, n_estimators=1500)
model_rf.fit(X_train_scaled, y_train)

feature_importance = model_rf.feature_importances_
sorted_idx = np.argsort(feature_importance)  # ascending importance

# Horizontal bar chart with the bars ordered by ascending importance.
bar_positions = range(len(sorted_idx))
fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance: Random Forest')

# Rank the columns from most to least important and keep the top fraction.
ranked_rf = X_train.columns[sorted_idx[::-1]]
n_keep_rf = int(fraction_of_top_features*len(ranked_rf))
best_features_rf = ranked_rf[:n_keep_rf]

In [None]:
from sklearn.inspection import permutation_importance
import xgboost as xgb

# Fit an XGBoost classifier on the normalised training data and read its feature importances.
model_xgb = xgb.XGBClassifier(max_depth=5, n_estimators=1500, eta=0.05)
model_xgb.fit(X_train_scaled, y_train)

feature_importance = model_xgb.feature_importances_
sorted_idx = np.argsort(feature_importance)  # ascending importance

# Horizontal bar chart with the bars ordered by ascending importance.
bar_positions = range(len(sorted_idx))
fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance: XGBoost')

# Rank the columns from most to least important and keep the top fraction.
ranked_xgb = X_train.columns[sorted_idx[::-1]]
n_keep_xgb = int(fraction_of_top_features*len(ranked_xgb))
best_features_xgb = ranked_xgb[:n_keep_xgb]

### Features selected by all three methods (univariate, Random Forest, XGBoost)

In [None]:
# Features present in all three selections. The result follows X_train's column
# order so the feature order is identical from run to run (iterating a set of
# strings is not order-stable across interpreter sessions).
common_features = set(best_features_SKBest) & set(best_features_rf) & set(best_features_xgb)
best_feature_list = [col for col in X_train.columns if col in common_features]

### Use PCA to see how many features are important:

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with one component per input column on the normalised training data.
n_features = len(X_train.columns)
pca = PCA(n_components=n_features).fit(X_train_scaled)

# Cumulative explained-variance ratio as components are added.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance, lw=3, color='#087E8B')
plt.title('Significance of principal components', size=20)
plt.show()

Based on the PCA, it seems the first 4 principal components are significant.

### Best Feature Set:

In [None]:
best_feature_list

### Visualizing simple decision tree based classification using ***Best Feature Set***

In [None]:
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
import pydotplus

# Entropy-based decision tree of depth <= 5 trained on the selected (unscaled) features.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)
clf = clf.fit(X_train[best_feature_list],y_train)

# Export the tree as Graphviz DOT text into an in-memory buffer, then render it as a PNG.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=False, rounded=True,
                special_characters=True,
                feature_names=list(X_train[best_feature_list].columns),
                class_names=['No Heart Risk','With Heart Risk'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# ML Model Development for heart risk detection

### Get a tuned model that has comparatively highest accuracy:
Use features with numerical values only ***df_heart_disease***

### Use extracted best features from these schemes. 
***best_feature_list***

In [None]:
X_train = X_train[best_feature_list]
X_test  = X_test[best_feature_list]

In [None]:
X = X[best_feature_list]

In [None]:
df_heart_disease = df_heart_disease[best_feature_list + ['target']]

In [None]:
ML_models = []
ML_Prediction_models = []

## *XGBoost based classification for heart risk*

In [None]:
## Hyperparameter optimization using RandomizedSearchCV
import xgboost

In [None]:
#Initialize Model
clf_mdl = xgboost.XGBClassifier(use_label_encoder = False, verbosity = 1, eval_metric='logloss');

params = {
              'objective':['binary:logistic'],
              'learning_rate': [0.001, 0.005, 0.01, 0.1,0.3,0.5,0.7,1], 
              'max_depth': [1, 2, 3, 4, 5, 6, 7],
              'min_child_weight': [1e-5, 1e-3, 1e-2],
              'subsample': [0.01, 0.1, 0.3,0.5,0.7,1],
              'colsample_bytree': [0.7,1],
              'n_estimators': [100, 200, 300, 400, 500, 1000]
}

#Initializing Grid Search with Stratified K Fold
xgb_ml = RandomizedSearchCV(clf_mdl, param_distributions=params, n_jobs=-1, cv=kfold)
xgb_ml.fit(X_train,y_train)

In [None]:
xgb_ml.best_params_

In [None]:
Tuned_model_xgb = xgb_ml.best_estimator_
Tuned_model_xgb

In [None]:
prediction_xgb = Tuned_model_xgb.predict(X_test)

In [None]:
ML_models.append(Tuned_model_xgb)
ML_Prediction_models.append(prediction_xgb)

In [None]:
# Predict on the full dataset X (the frame that X_train and X_test were split from).
# NOTE: the chained assignment also rebinds prediction_xgb to these full-data
# predictions; prediction_xgb is recomputed on X_test further below.
prediction_xgb_1 = prediction_xgb = Tuned_model_xgb.predict(X)

In [None]:
# Confusion matrix on the full dataset. It reads prediction_xgb_1, the name that holds
# the full-data predictions, as the other model sections do with their *_1 variables.
cm_xgb = confusion_matrix(y, prediction_xgb_1, labels=None)

In [None]:
util.plot_confusion_matrix(cm_xgb,classes=['No Risk', 'Risk'], title='XGB based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_xgb, X[best_feature_list], y)

In [None]:
prediction_xgb = Tuned_model_xgb.predict(X_test)

In [None]:
cm_xgb_test = confusion_matrix(y_test, prediction_xgb, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_xgb_test,classes=['No Risk', 'Risk'], title='XGB based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_xgb, X_test[best_feature_list], y_test)

In [None]:
util.plot_learning_curve(estimator = Tuned_model_xgb, title = "XGB learning curve", X = X_train, y = y_train, cv = kfold);

## *Random Forest based classification for heart risk*

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Initialize Model
clf_mdl_2 = RandomForestClassifier()

params = {'bootstrap': [True, False],
 'max_depth': range(1,10, 1),
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500]}

#Initializing Grid Search with Stratified K Fold
rf_ml = RandomizedSearchCV(clf_mdl_2, param_distributions=params, n_jobs=-1, cv=kfold)
rf_ml.fit(X_train,y_train)

In [None]:
rf_ml.best_params_

In [None]:
Tuned_model_rf = rf_ml.best_estimator_
Tuned_model_rf

In [None]:
prediction_rf = Tuned_model_rf.predict(X_test)

In [None]:
ML_models.append(Tuned_model_rf)
ML_Prediction_models.append(prediction_rf)

In [None]:
prediction_rf_1 = Tuned_model_rf.predict(X)

In [None]:
cm_rf = confusion_matrix(y, prediction_rf_1, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_rf,classes=['No Risk', 'Risk'], title='RF based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_rf, X, y) 

In [None]:
prediction_rf = Tuned_model_rf.predict(X_test)

In [None]:
cm_rf_test = confusion_matrix(y_test, prediction_rf, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_rf_test,classes=['No Risk', 'Risk'], title='RF based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_rf, X_test, y_test) 

In [None]:
util.plot_learning_curve(estimator = Tuned_model_rf, title = "RF learning curve", X = X_train, y = y_train, cv = kfold)

## *Logistic Regression based classification for heart risk*

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
#Initialize Model
clf_mdl_3 = LogisticRegression()

params = {
    "max_iter": range(100,500,2),
    "solver" : ['newton-cg', 'lbfgs', 'liblinear'],
    "C": [0.5, 0.1, 1.0]
}

lgr_ml = RandomizedSearchCV(clf_mdl_3, param_distributions=params, n_jobs=-1, cv=kfold)
lgr_ml.fit(X_train,y_train)

In [None]:
lgr_ml.best_params_

In [None]:
Tuned_model_lgr = lgr_ml.best_estimator_
Tuned_model_lgr

In [None]:
prediction_lgr = Tuned_model_lgr.predict(X_test)

In [None]:
ML_models.append(Tuned_model_lgr)
ML_Prediction_models.append(prediction_lgr)

In [None]:
prediction_lgr_1 = Tuned_model_lgr.predict(X)

In [None]:
cm_lgr = confusion_matrix(y, prediction_lgr_1, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lgr,classes=['No Risk', 'Risk'], title='LGR based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lgr, X, y) 

In [None]:
prediction_lgr = Tuned_model_lgr.predict(X_test)

In [None]:
cm_lgr_test = confusion_matrix(y_test, prediction_lgr, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lgr_test,classes=['No Risk', 'Risk'], title='LGR based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lgr, X_test, y_test) 

In [None]:
util.plot_learning_curve(estimator = Tuned_model_lgr, title = "LGR learning curve", X = X_train, y = y_train, cv = kfold)

## *LightGBM based classification for heart risk*

In [None]:
## Hyperparameter optimization using RandomizedSearchCV
import lightgbm as lgb

In [None]:
#Initialize Model
clf_mdl_4 = lgb.LGBMClassifier();

params = {'num_leaves':range(10,100, 10), 'min_child_samples':range(5,25,5),'max_depth': range(5, 15, 1),
             'learning_rate':[0.05,0.1,0.2],'reg_alpha': [0,0.01,0.03]}

#Initializing Grid Search with Stratified K Fold
lgb_ml = RandomizedSearchCV(clf_mdl_4, param_distributions=params, n_jobs=-1, cv=kfold)
lgb_ml.fit(X_train,y_train)

In [None]:
lgb_ml.best_params_

In [None]:
Tuned_model_lgb = lgb_ml.best_estimator_
Tuned_model_lgb

In [None]:
prediction_lgb = Tuned_model_lgb.predict(X_test)

In [None]:
ML_models.append(Tuned_model_lgb)
ML_Prediction_models.append(prediction_lgb)

In [None]:
prediction_lgb_1 = Tuned_model_lgb.predict(X)

In [None]:
cm_lgb = confusion_matrix(y, prediction_lgb_1, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lgb,classes=['No Risk', 'Risk'], title='LGBM based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lgb, X, y) 

In [None]:
prediction_lgb = Tuned_model_lgb.predict(X_test)

In [None]:
cm_lgb_test = confusion_matrix(y_test, prediction_lgb, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lgb_test,classes=['No Risk', 'Risk'], title='LGBM based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lgb, X_test, y_test) 

In [None]:
# The plot title names the model as LGBM, matching the LGBM confusion-matrix titles.
util.plot_learning_curve(estimator = Tuned_model_lgb, title = "LGBM learning curve", X = X_train, y = y_train, cv = kfold)

## *Linear Discriminant Analysis based classification for heart risk*

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
#Initialize Model
clf_mdl_5 = LinearDiscriminantAnalysis()

params = {
    "solver" : ["svd"],
     "tol" : [0.0001,0.0002,0.0003]
}

lda_ml = RandomizedSearchCV(clf_mdl_5, param_distributions=params, n_jobs=-1, cv=kfold)
lda_ml.fit(X_train,y_train)

In [None]:
lda_ml.best_params_

In [None]:
Tuned_model_lda = lda_ml.best_estimator_
Tuned_model_lda

In [None]:
prediction_lda = Tuned_model_lda.predict(X_test)

In [None]:
ML_models.append(Tuned_model_lda)
ML_Prediction_models.append(prediction_lda)

In [None]:
prediction_lda_1 = Tuned_model_lda.predict(X)

In [None]:
cm_lda = confusion_matrix(y, prediction_lda_1, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lda,classes=['No Risk', 'Risk'], title='LDA based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lda, X, y) 

In [None]:
prediction_lda = Tuned_model_lda.predict(X_test)

In [None]:
cm_lda_test = confusion_matrix(y_test, prediction_lda, labels=None) 

In [None]:
util.plot_confusion_matrix(cm_lda_test,classes=['No Risk', 'Risk'], title='LDA based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(Tuned_model_lda, X_test, y_test) 

In [None]:
util.plot_learning_curve(estimator = Tuned_model_lda, title = "LDA learning curve", X = X_train, y = y_train, cv = kfold)

## *Ensemble learning based classification for heart risk*

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble over the five tuned models from the sections above.
VotingPredictor = VotingClassifier(estimators =
                           [('rfc', Tuned_model_rf), 
                            ('lgr', Tuned_model_lgr), 
                            ('lda', Tuned_model_lda),
                            ('xgb', Tuned_model_xgb),
                            ('lgb', Tuned_model_lgb)],
                           voting='soft', n_jobs = -1)


# Fit the ensemble on the training split.
VotingPredictor = VotingPredictor.fit(X_train, y_train);

In [None]:
prediction_vp = VotingPredictor.predict(X_test)

In [None]:
ML_models.append(VotingPredictor)
ML_Prediction_models.append(prediction_vp)

In [None]:
prediction_vp_1 = VotingPredictor.predict(X)

In [None]:
cm_vp = confusion_matrix(y, prediction_vp_1, labels=None) 

In [None]:
# Confusion matrix of the voting ensemble; the title names the ensemble rather than LDA.
util.plot_confusion_matrix(cm_vp,classes=['No Risk', 'Risk'], title='Voting Classifier based Confusion Matrix')

In [None]:
metrics.plot_roc_curve(VotingPredictor, X, y) 

In [None]:
util.plot_learning_curve(estimator = VotingPredictor, title = "VP learning curve",
                    X = X_train, y = y_train, cv = kfold);

In [None]:
Metrics_df = pd.DataFrame()

model_name = []
model_metrics = []

# ML_models and ML_Prediction_models were appended to in lockstep, so each pair
# is a fitted model together with its predictions on X_test.
for model, y_pred in zip(ML_models, ML_Prediction_models):
    recall_scr = recall_score(y_test, y_pred)
    f1_scr = f1_score(y_test, y_pred)
    precision_scr = precision_score(y_test, y_pred)
    accuracy_scr = accuracy_score(y_test, y_pred)
    # ROC AUC is a ranking metric: score it on the positive-class probability
    # rather than on hard 0/1 predictions.
    roc_auc_scr = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    # Mean 10-fold cross-validated accuracy on the training split.
    cross_val_scr = cross_val_score(model, X_train, y_train, cv=10, n_jobs=-1, scoring='accuracy').mean()
    model_name.append(type(model).__name__)
    model_metrics.append([recall_scr, f1_scr, precision_scr, accuracy_scr, roc_auc_scr, cross_val_scr])

# One row of metrics per model, keyed by the estimator's class name.
data_dict = dict(zip(model_name, model_metrics))



In [None]:
metrics_name = ['recall', 'f1', 'precision','accuracy', 'roc_auc', 'cross_validation']
Metrics_df = pd.DataFrame.from_dict(data_dict, orient='index', columns=metrics_name)
Metrics_df.head()

In [None]:
Metrics_df['f1'].plot(kind="bar",title='f1');

In [None]:
Metrics_df['accuracy'].plot(kind="bar", title='accuracy');

In [None]:
Metrics_df['roc_auc'].plot(kind="bar", title='roc_auc');

In [None]:
Metrics_df['recall'].plot(kind="bar", title='recall');

In [None]:
Metrics_df['precision'].plot(kind="bar", title='precision');