# Heart Risk Prediction using supervised ML

In [None]:
import sklearn
import numpy as np
import io
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import matplotlib.ticker as ticker

In [None]:
import urllib.request
import re

# Location of the processed heart-disease files, one file per contributing site.
_UCI_BASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/'
_UCI_FILES = (
    'processed.cleveland.data',
    'processed.hungarian.data',
    'processed.va.data',
    'processed.switzerland.data',
)

# Any run of characters outside this class separates two fields. '?' is not in
# the class, so a field made only of '?' is swallowed into the separator and the
# row then yields fewer than 14 tokens and is skipped below.
# NOTE(review): the letters n, a, m, e are kept in the class exactly as before;
# they look unintended - confirm.
_FIELD_SEPARATOR = re.compile('[^0-9.name-]+')

# Number of fields a complete record must have.
_EXPECTED_FIELDS = 14

data_list = []

for file_name in _UCI_FILES:
    # Closing the text wrapper also closes the underlying HTTP response.
    with io.TextIOWrapper(urllib.request.urlopen(_UCI_BASE_URL + file_name),
                          encoding='ISO-8859-1') as file_:
        for line in file_:
            quantities = _FIELD_SEPARATOR.split(line.strip())
            if len(quantities) == _EXPECTED_FIELDS:
                # Append each complete record exactly once (fields stay as strings).
                data_list.append(quantities)

In [None]:
# Display the dimensions of the parsed records held in data_list.
np.shape(data_list)

## *Data Loading from Source: In this section we will be loading UCI heart disease data*

In [None]:
# Column labels for the 14 fields of each parsed record, in file order.
name_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',  'thalach', 'exang',  'oldpeak',  'slope',  'ca', 'thal',  'target']

# Build the frame from the parsed records and preview the first rows.
df_heart_disease = pd.DataFrame(data_list,columns=name_cols)
df_heart_disease.head()

In [None]:
# Persist the parsed frame to CSV in the working directory.
# NOTE(review): the following cell reads 'heart_disease_dataset_UCI.csv', not this
# '..._generated.csv' file - confirm which file is meant to feed the analysis.
df_heart_disease.to_csv('heart_disease_dataset_UCI_generated.csv')

In [None]:
# Replace the in-memory frame with the contents of a CSV file on disk.
# NOTE(review): the file name differs from the one written by the preceding cell
# ('heart_disease_dataset_UCI_generated.csv') - confirm this is intentional.
df_heart_disease = pd.read_csv('heart_disease_dataset_UCI.csv')

## Data Pre-Processing:
- Data cleaning 
- Remove NANs
- Look for missing and unimportant data

## Description of feature vectors:
 - description

It's a clean, easy-to-understand set of data. However, the meaning of some of the column headers is not obvious. Here is what they mean:
- **age**: The person's age in years
- **sex**: The person's sex (1 = male, 0 = female)
- **cp**: The type of chest pain experienced
        1 = typical angina
        2 = atypical angina
        3 = non-anginal pain
        4 = asymptomatic
- **trestbps**: The person's resting blood pressure (mm Hg on admission to the hospital)
- **chol**: The person's cholesterol measurement in mg/dl
- **fbs**: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
- **restecg**: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- **thalach**: The person's maximum heart rate achieved
- **exang**: Exercise induced angina (1 = yes; 0 = no)
- **oldpeak**: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot. See more here)
- **slope**: The slope of the peak exercise ST segment
        1 = upsloping
        2 = flat
        3 = downsloping
- **ca**: The number of major vessels (0-3)
- **thal**: A blood disorder called thalassemia 
        3 = normal; 6 = fixed defect; 7 = reversible defect
- **target**: Heart disease (0 = no, 1 = yes)



## Check for Null and NANs
### Need to convert object types to numerical types

In [None]:
# Show the dtype of every column before conversion.
df_heart_disease.dtypes

In [None]:
# Cast every column to float; errors='raise' makes a non-convertible value raise
# instead of being passed through silently.
df_heart_disease = df_heart_disease.astype(float, errors = 'raise')

In [None]:
# Show the column dtypes again to confirm the conversion to float.
df_heart_disease.dtypes

In [None]:
# Count the missing values in each column.
df_heart_disease.isnull().sum()

In [None]:
# Same per-column missing-value count via isna(), which pandas documents as the
# method isnull() is an alias of.
df_heart_disease.isna().sum()

In [None]:
# List the column labels of the frame.
df_heart_disease.columns

In [None]:
# Predictors are every column except the label; the label is the 'target' column.
# (Slicing with iloc[:, 1:] kept 'target' among the predictors and dropped the
# first feature instead.)
input_features = df_heart_disease.drop('target', axis=1)
output_features = df_heart_disease['target']

In [None]:
# Summary statistics for every column before filtering on the label.
df_heart_disease.describe()

### Consider rows with target = 1 or target = 0. Ignore other values

In [None]:
# Keep only the records whose 'target' label is 0 or 1; rows with any other
# label value are discarded.
df_heart_disease = df_heart_disease.loc[df_heart_disease['target'].isin([0, 1])]

In [None]:
# Summary statistics again, now restricted to rows labelled 0 or 1.
df_heart_disease.describe()

# **_Exploratory Data Analysis_**:
## Check outliers using the Interquartile Range (IQR):

We are not going to remove outliers, since they carry valuable information about certain types of patients. However, flagging them gives us an idea of how necessary the exploratory data analysis is.

In [None]:
Q1 = df_heart_disease.quantile(0.90)
Q3 = df_heart_disease.quantile(0.10)
IQR = Q3 - Q1

In [None]:
IQR

In [None]:
df_heart_disease_IQR = (df_heart_disease < (Q1 - 1.5 * IQR)) |(df_heart_disease > (Q3 + 1.5 * IQR))

In [None]:
df_heart_disease_IQR

### Drop Column which holds outlier

In [None]:
Row_with_outliers = df_heart_disease_IQR.all(axis=0)
indx = Row_with_outliers[Row_with_outliers== False].index.to_list()
indx

In [None]:
df_heart_disease = df_heart_disease.drop(indx, axis = 1)

In [None]:
# Renumber the rows in place; drop=True discards the old index instead of
# keeping it as a column.
df_heart_disease.reset_index(drop=True, inplace=True)

In [None]:
# One horizontal box plot per column on a single large figure.
fig, out_fig = plt.subplots(figsize = (20,15))
plt.xticks(rotation=45)
# out_fig is rebound to the Axes returned by seaborn.
out_fig = sns.boxplot(data = df_heart_disease, orient="h", palette="crest")

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# 7 x 2 grid, i.e. 14 axes, handed to DataFrame.hist for the per-column histograms.
# NOTE(review): presumably the frame must have exactly 14 plotted columns for
# the supplied axes to match - confirm.
fig, axis = plt.subplots(7,2,figsize=(10, 20));
df_heart_disease.hist(ax=axis);

In [None]:
# Split the columns by cardinality: a column with at most 15 distinct values is
# treated as categorical, everything else as continuous.
categorical_val = []
continous_val = []

for column in df_heart_disease.columns:
    if len(df_heart_disease[column].unique()) <= 15:
        categorical_val.append(column)
    else:
        continous_val.append(column)

# Rows split by label once, outside the plotting loop.
# target == 0 means no heart disease, target == 1 means heart disease.
rows_without_disease = df_heart_disease[df_heart_disease["target"] == 0]
rows_with_disease = df_heart_disease[df_heart_disease["target"] == 1]

plt.figure(figsize=(15, 20))
# The label itself is excluded by name rather than by assuming it is the last entry.
for i, column in enumerate((name for name in categorical_val if name != "target"), 1):
    # NOTE: the 3 x 3 grid has room for at most nine categorical features.
    plt.subplot(3, 3, i)
    rows_without_disease[column].hist(bins=35, color='blue', label='Without Heart Disease', alpha=0.6)
    rows_with_disease[column].hist(bins=35, color='red', label='With Heart Disease', alpha=0.6)
    plt.xlabel(column)
    plt.legend(loc='upper right');

In [None]:
def data_Sex(sex):
    """Return 'female' when the sex code equals 0, otherwise 'male'."""
    return 'female' if sex == 0 else 'male'

def data_target(target):
    """Return the text label for a target code: 0 means no disease, anything else disease."""
    label = 'With Heart Disease'
    if target == 0:
        label = 'No Heart Disease'
    return label
    
def data_thal(thal):
    """Return the text label for a thal code.

    0, 3 and 6 map to their own labels; every other value falls through to
    'Reversible defect'.
    """
    known_codes = ((0, 'Non conclusive'), (3, 'Normal'), (6, 'Fixed Defect'))
    for code, label in known_codes:
        if thal == code:
            return label
    return 'Reversible defect'

def data_cp(cp):
    """Return the chest-pain label for codes 1-3; any other value is 'Asymptomatic'."""
    labels_from_one = ('Typical angina', 'Atypical angina', 'Non-anginal pain')
    for code, label in enumerate(labels_from_one, start=1):
        if cp == code:
            return label
    return 'Asymptomatic'
    
def data_restecg(restecg):
    """Return the resting-ECG label: 0 is normal, 2 is hypertrophy, anything else ST-T abnormality."""
    if restecg == 0:
        return 'Normal'
    if restecg == 2:
        return 'left ventricular hypertrophy'
    return 'abnormality in ST-T wave'

def data_st_slope(slope):
    """Return the ST-slope label: 3 is downsloping, 2 is flat, anything else upsloping."""
    return 'downsloping' if slope == 3 else ('flat' if slope == 2 else 'upsloping')

def data_age(age):
    """Bucket an age: below 30 is young, 30 up to (not including) 60 is middle aged, otherwise elderly."""
    if age < 30:
        bucket = 'young patients'
    elif 30 <= age < 60:
        bucket = 'middle aged patients'
    else:
        bucket = 'elderly patients'
    return bucket
    
def data_chol(chol):
    """Label a cholesterol value: below 200 is normal, otherwise high."""
    return 'Normal Cholesterol Level' if chol < 200 else 'High Cholesterol Level'
    
# Work on a copy so the numeric frame stays available for modelling.
df_heart_disease_with_catagoricalData = df_heart_disease.copy()

# Coded columns that are overwritten with their text labels.
for column_name, to_label in (
    ('sex', data_Sex),
    ('target', data_target),
    ('thal', data_thal),
    ('cp', data_cp),
    ('restecg', data_restecg),
    ('slope', data_st_slope),
):
    df_heart_disease_with_catagoricalData[column_name] = df_heart_disease[column_name].apply(to_label)

# Two extra columns derived from the numeric age and cholesterol values.
df_heart_disease_with_catagoricalData['age_class'] = df_heart_disease['age'].apply(data_age)
df_heart_disease_with_catagoricalData['chol_level'] = df_heart_disease['chol'].apply(data_chol)

In [None]:
# Move 'target' to the last position: remove it, then assign it back, which
# appends it after all remaining columns.
col_to_move = df_heart_disease_with_catagoricalData.pop('target')
df_heart_disease_with_catagoricalData['target'] = col_to_move

In [None]:
# Preview the first rows of the labelled frame.
df_heart_disease_with_catagoricalData.head()

In [None]:
# Record counts per age bucket, split by heart-disease label.
sns.set(rc={'figure.figsize':(6,5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='age_class',hue='target')
plt.title('Relationship between age and risk of heart disease \n');
plt.legend(loc='upper right');

In [None]:
# Among records labelled 'With Heart Disease' only: counts per exact age, split by sex.
sns.set(rc={'figure.figsize':(15, 7)})
sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target']=='With Heart Disease'], x='age',hue='sex')
plt.title('Relationship between gender and risk of heart disease at all age \n');
plt.legend(loc='upper right');

In [None]:
# Among records labelled 'With Heart Disease' only: counts per cholesterol level, split by sex.
sns.set(rc={'figure.figsize':(6,5), 'xtick.labelsize':10})
plot_ = sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target']=='With Heart Disease'], x='chol_level', hue='sex')
plt.title('Relationship between gender and risk of heart disease at all age with varying cholesterollevel \n');
plt.tight_layout();
plt.legend(loc='upper right');

In [None]:
# Counts per cholesterol level, split by heart-disease label (no title is set here).
# NOTE(review): a later cell draws the same x/hue combination with a title.
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
plot_ = sns.countplot(data= df_heart_disease_with_catagoricalData, x='chol_level', hue='target')
plt.tight_layout();
plt.legend(loc='upper right');

In [None]:
# Among records labelled 'With Heart Disease' only: counts per sex, split by thal label.
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData[df_heart_disease_with_catagoricalData['target'] == 'With Heart Disease'], x='sex',hue='thal')
plt.title('Relationship between gender and risk of heart disorder "thalassemia" \n');
plt.legend(loc='upper right');

In [None]:
# Counts per chest-pain type, split by heart-disease label.
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='cp',hue='target')
plt.title('Chest Pain varying with existence of heart disease \n');
plt.legend(loc='upper right');

In [None]:
# Counts per cholesterol level, split by heart-disease label, this time with a title.
sns.set(rc={'figure.figsize':(6, 5), 'xtick.labelsize':10})
sns.countplot(data= df_heart_disease_with_catagoricalData, x='chol_level',hue='target')
plt.title('Effect of cholesterol level and  heart disease \n');

In [None]:
# Counts per resting-ECG label, split by heart-disease label. No figure size is
# set in this cell, so the most recent sns.set(...) settings apply.
sns.countplot(data= df_heart_disease_with_catagoricalData, x='restecg',hue='target')
plt.title('Resting electrocardiographic measurement varying with existence of heart disease \n');

In [None]:
# Heat map of the pairwise correlations between the feature columns (label
# excluded), annotated with values formatted to two decimals.
plt.figure(figsize=(14,7))
sns.heatmap(df_heart_disease.drop('target', axis=1).corr(),annot=True,cmap="magma",fmt='.2f');

In [None]:
# Importing hvplot.pandas provides the .hvplot accessor used below.
import hvplot.pandas

# Horizontal bar chart of each feature's correlation with the 'target' column.
df_heart_disease.drop('target', axis=1).corrwith(df_heart_disease.target).hvplot.barh(
    width=600, height=400, 
    title="Correlation between Heart Disease and Feature Vector", 
    ylabel='Correlation', xlabel='Feature Vector',
)

### The features are not highly correlated with one another, so it is reasonable to feed all of them to the machine learning models.

# *Important Feature Selection*:
### Univariate Feature Selection (for top 5 important variables)

In [None]:
import warnings 
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

#Top  x% features to consider
# Fraction of the ranked features kept by the ranking cells that follow.
fraction_of_top_features = 0.7

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Features are every column except the label.
X = df_heart_disease.drop('target', axis=1)
y = df_heart_disease['target']
# 75 % / 25 % train/test split with a fixed seed.
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# NOTE(review): Normalizer rescales each sample (row) to unit norm; it does not
# scale each feature column. Confirm that per-row normalisation is intended for
# the "*_scaled" arrays.
transformer = Normalizer()
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.transform(X_test)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# k='all' keeps every feature; the fit is used only to obtain one chi-squared
# score per feature.
UV_model = SelectKBest(chi2, k='all').fit(X_train_scaled, y_train)

# Feature positions ordered from the highest to the lowest score. argsort is
# applied to the scores themselves and the resulting order is then reversed;
# reversing the scores first would yield positions in the reversed array,
# which do not line up with X_train.columns.
mask = np.flip(np.argsort(UV_model.scores_))
best_features_SKBest = X_train.columns[mask]
# Keep the same top fraction that the random-forest and XGBoost rankings keep,
# so this ranking actually constrains the later intersection.
best_features_SKBest = best_features_SKBest[:int(fraction_of_top_features*len(best_features_SKBest))]

best_features_SKBest

### Feature importance ranking with Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# A fixed seed keeps the forest, and therefore the selected feature set,
# identical from run to run.
model_rf = RandomForestClassifier(n_estimators=1500, max_depth=5, random_state=42)
model_rf.fit(X_train_scaled, y_train)

# Importances reported by the fitted forest, sorted ascending for the bar chart.
feature_importance = model_rf.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')

# Most important first, then keep only the configured top fraction.
best_features_rf= X_train.columns[np.flip(sorted_idx)]
best_features_rf = best_features_rf[:int(fraction_of_top_features*len(best_features_rf))]

In [None]:
from sklearn.inspection import permutation_importance
import xgboost as xgb

# Gradient-boosted classifier fitted on the normalised training data; only its
# feature importances are used in this cell.
model_xgb = xgb.XGBClassifier(n_estimators=1500, max_depth=5, eta=0.05)
model_xgb.fit(X_train_scaled, y_train)

# Importances reported by the fitted model, sorted ascending for the bar chart.
feature_importance = model_xgb.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')


# Most important first, then keep only the configured top fraction.
best_features_xgb = X_train.columns[np.flip(sorted_idx)]
best_features_xgb = best_features_xgb[:int(fraction_of_top_features*len(best_features_xgb))]

### Feature importance ranking with XGBoost Classifier (cell above); the next cell intersects the three rankings

In [None]:
# Features present in all three rankings. Walking the random-forest ranking
# (rather than converting a set to a list) yields a stable order, most important
# first; the order of a set of strings can change between Python sessions,
# which would change the column order fed to the models.
_in_other_rankings = set(best_features_SKBest) & set(best_features_xgb)
best_feature_list = [feature for feature in best_features_rf if feature in _in_other_rankings]

### Best Feature Set:

In [None]:
# Display the features shared by all three selection schemes.
best_feature_list

### Visualizing simple decision tree based classification using ***Best Feature Set***

In [None]:
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
# The standard-library StringIO is sufficient; the 'six' compatibility package
# is not needed for this.
from io import StringIO
from IPython.display import Image  
import pydotplus

# Shallow entropy-based tree on the unscaled best features; the fixed seed keeps
# the drawn tree identical between runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
clf = clf.fit(X_train[best_feature_list],y_train)

# Export the tree as DOT text into an in-memory buffer, then render it to PNG.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=False, rounded=True,
                special_characters=True,feature_names = X_train[best_feature_list].columns  ,class_names=['No Heart Risk','With Heart Risk'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

# *MLJAR-supervised AutoML based classification for heart risk*

## Get a tuned model that has comparatively highest accuracy

### Use extracted best features from three schemes. 
***best_feature_list***

In [None]:
from sklearn.metrics import accuracy_score
import supervised
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split

In [None]:
# AutoML inputs: the unscaled train/test features restricted to the shared best
# features, with the labels passed through unchanged.
X_train_mljar = X_train[best_feature_list]
y_train_mljar = y_train
X_test_mljar = X_test[best_feature_list]
y_test_mljar = y_test

In [None]:
# Configure the AutoML search: 'Compete' mode, F1 as the evaluation metric, and
# 10-fold shuffled, stratified cross-validation with a fixed seed; ensemble
# training is switched on.
# NOTE(review): n_jobs=-1 presumably means "use all available cores" - confirm
# against the library documentation.
automl = AutoML(n_jobs= -1, mode = 'Compete', eval_metric =  'f1', ml_task = 'auto', 
               validation_strategy = {
                        "validation_type": "kfold",
                        "k_folds": 10,
                        "shuffle": True,
                        "stratify": True,
                        "random_seed": 123}, train_ensemble = True)
# Train on the training split only.
automl.fit(X_train_mljar, y_train_mljar);

In [None]:
# Show the AutoML report for the fitted search.
automl.report()

In [None]:
# Predict the held-out test split and score those predictions directly.
prediction = automl.predict(X_test_mljar)
accuracy_model = accuracy_score(y_test_mljar, prediction)
# accuracy_score returns a fraction between 0 and 1; the '%' format multiplies by
# 100 so the printed percentage is correct (previously the raw fraction was
# printed followed by a '%' sign).
print(f"Accuracy: {accuracy_model:.2%}")

In [None]:
# Scratch array; not used by the analysis above.
A = np.array([1,2,3,4,5,6,7])

In [None]:
# First four elements of the scratch array.
A[:4]