# Heart Health Ensemble

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# Read Data and Perform Data Cleaning

In [4]:
# Columns kept from the raw CSV. "HeartDisease" is the label column; the
# remaining entries are the candidate features.
columns = [
    "HeartDisease",
    "BMI",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    # "PhysicalHealth",
    "MentalHealth",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "SleepTime",
    "Asthma",
    "KidneyDisease",
    "SkinCancer"
]

# Name of the label column. This must be one of the entries in `columns`;
# the previous value ("health_status") is not a column of this dataset.
target = ["HeartDisease"]

In [5]:
# Read heart_2020_cleaned.csv and keep only the columns listed in `columns`.
file_path = Path('./DataTables/heart_2020_cleaned.csv')
df = pd.read_csv(file_path)[columns].copy()

# Remove columns that are entirely null, then every row that still holds a null.
df = df.dropna(axis='columns', how='all').dropna()

# Preview the first ten rows.
df.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,30,No,Female,55-59,White,Yes,Yes,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,No,Female,80 or older,White,No,Yes,7,No,No,No
2,No,26.58,Yes,No,No,30,No,Male,65-69,White,Yes,Yes,8,Yes,No,No
3,No,24.21,No,No,No,0,No,Female,75-79,White,No,No,6,No,No,Yes
4,No,23.71,No,No,No,0,Yes,Female,40-44,White,No,Yes,8,No,No,No
5,Yes,28.87,Yes,No,No,0,Yes,Female,75-79,Black,No,No,12,No,No,No
6,No,21.63,No,No,No,0,No,Female,70-74,White,No,Yes,4,Yes,No,Yes
7,No,31.64,Yes,No,No,0,Yes,Female,80 or older,White,Yes,No,9,Yes,No,No
8,No,26.45,No,No,No,0,No,Female,80 or older,White,"No, borderline diabetes",No,5,No,Yes,No
9,No,40.69,No,No,No,0,Yes,Male,65-69,White,No,Yes,10,No,No,No


In [6]:
# Build the AgeCategory -> AgeRisk lookup from the age recode table.
file_path = Path('./DataTables/AgeRecoded.csv')
ageDF = pd.read_csv(file_path)
ageDict = {
    category: risk
    for category, risk in zip(ageDF['AgeCategory'], ageDF['AgeRisk'])
}
print(ageDict)

{'18-24': 'Low Risk', '25-29': 'Low Risk', '30-34': 'Low Risk', '35-39': 'Low Risk', '40-44': 'Low Risk', '45-49': 'Medium Risk', '50-54': 'Medium Risk', '55-59': 'Medium Risk', '60-64': 'Medium Risk', '65-69': 'High Risk', '70-74': 'High Risk', '75-79': 'High Risk', '80 or older': 'High Risk'}


In [7]:
# Build the Diabetic -> 'Diabetes Bin' lookup from the diabetic recode table.
file_path = Path('./DataTables/DiabeticRecoded.csv')
diabeticDF = pd.read_csv(file_path)
diabeticDict = {
    raw_value: binned
    for raw_value, binned in zip(diabeticDF['Diabetic'], diabeticDF['Diabetes Bin'])
}
print(diabeticDict)

{'No': 'No', 'Yes': 'Yes', 'No, borderline diabetes': 'No', 'Yes (during pregnancy)': 'No'}


In [8]:
# Build the SleepTime -> 'Recommended Sleep' lookup from the sleep recode table.
file_path = Path('./DataTables/SleepRecoded.csv')
sleepDF = pd.read_csv(file_path)
sleepDict = {
    hours: label
    for hours, label in zip(sleepDF['SleepTime'], sleepDF['Recommended Sleep'])
}
print(sleepDict)

{1: 'Below', 2: 'Below', 3: 'Below', 4: 'Below', 5: 'Below', 6: 'Below', 7: 'Meets', 8: 'Meets', 9: 'Meets', 10: 'Above', 11: 'Above', 12: 'Above', 13: 'Above', 14: 'Above', 15: 'Above', 16: 'Above', 17: 'Above', 18: 'Above', 19: 'Above', 20: 'Above', 21: 'Above', 22: 'Above', 23: 'Above', 24: 'Above'}


In [9]:
# Swap the raw AgeCategory, Diabetic and SleepTime values for their bin labels.
bin_lookups = {
    "AgeCategory": ageDict,
    "Diabetic": diabeticDict,
    "SleepTime": sleepDict,
}
for column_name, lookup in bin_lookups.items():
    # Values without an entry in the lookup are left unchanged by replace().
    df[column_name] = df[column_name].replace(lookup)

df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,30,No,Female,Medium Risk,White,Yes,Yes,Below,Yes,No,Yes
1,No,20.34,No,No,Yes,0,No,Female,High Risk,White,No,Yes,Meets,No,No,No
2,No,26.58,Yes,No,No,30,No,Male,High Risk,White,Yes,Yes,Meets,Yes,No,No
3,No,24.21,No,No,No,0,No,Female,High Risk,White,No,No,Below,No,No,Yes
4,No,23.71,No,No,No,0,Yes,Female,Low Risk,White,No,Yes,Meets,No,No,No


In [10]:
# Integer-encode every string-valued column with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
string_columns = [
    "HeartDisease",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "SleepTime",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
]

# Encode a copy so that `df` keeps its original string labels.
df2 = df.copy()

# The same encoder instance is refit for each column, so once the loop ends
# `le` only holds the classes of the last column processed.
# NOTE(review): the later cells visible in this notebook build X and y from
# `df`, not `df2` — confirm whether this encoded frame is still needed.
for column_name in string_columns:
    df2[column_name] = le.fit_transform(df2[column_name])
df2.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,30,0,0,2,5,1,1,1,1,0,1
1,0,20.34,0,0,1,0,0,0,0,5,0,1,2,0,0,0
2,0,26.58,1,0,0,30,0,1,0,5,1,1,2,1,0,0
3,0,24.21,0,0,0,0,0,0,0,5,0,0,1,0,0,1
4,0,23.71,0,0,0,0,1,0,1,5,0,1,2,0,0,0


# Separate the Features (X) from the Target (y)

In [15]:
# Target: the HeartDisease column, taken as-is from df.
y = df["HeartDisease"]

# Features: every other column, with the string-valued columns expanded into
# one indicator column per category by get_dummies.
X = pd.get_dummies(df.drop(columns=["HeartDisease"]))


In [16]:
# Lift the column display cap so every feature column appears in the preview.
pd.options.display.max_columns = None
X.head()

Unnamed: 0,BMI,MentalHealth,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,DiffWalking_No,DiffWalking_Yes,Sex_Female,Sex_Male,AgeCategory_High Risk,AgeCategory_Low Risk,AgeCategory_Medium Risk,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,Diabetic_No,Diabetic_Yes,PhysicalActivity_No,PhysicalActivity_Yes,SleepTime_Above,SleepTime_Below,SleepTime_Meets,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
0,16.6,30,0,1,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1
1,20.34,0,1,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,0,1,0
2,26.58,30,0,1,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,1,0,1,0
3,24.21,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1
4,23.71,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,0,1,0


In [17]:
# Show the (rows, columns) dimensions of the feature matrix.
X.shape

(319795, 34)

# Split Data into Training and Testing

In [20]:
# Split the data into training and testing sets (80% train, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=78
)

# Report the shape of each resulting partition.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

(255836, 34)
(63959, 34)
(255836,)
(63959,)


In [23]:
# Fit a StandardScaler on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits.
# NOTE(review): in the cells visible after this one, resampling, fitting and
# prediction use X_train / X_test rather than these scaled arrays — confirm
# whether the scaled versions were meant to be used.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Undersampling 

In [22]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample the training split (fixed seed), then show the
# per-class row counts of the resampled target.
ros = RandomUnderSampler(random_state=1)
resampled = ros.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled
Counter(y_resampled)

Counter({'No': 21956, 'Yes': 21956})

In [24]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression (lbfgs solver, fixed seed) on the undersampled
# training data.
# NOTE(review): X_resampled is derived from the unscaled X_train — confirm
# that fitting on unscaled features is intended.
model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X=X_resampled, y=y_resampled)

In [25]:
from sklearn.metrics import confusion_matrix
# Predict labels for the held-out test split, then tabulate actual (rows)
# against predicted (columns) labels.
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[42449, 16093],
       [ 1256,  4161]], dtype=int64)

In [26]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy of the test-set predictions.
balanced_acc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
balanced_acc

0.7466203450007101

In [27]:
from imblearn.metrics import classification_report_imbalanced
# Print the imbalanced-learn per-class report for the test-set predictions.
report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

         No       0.97      0.73      0.77      0.83      0.75      0.55     58542
        Yes       0.21      0.77      0.73      0.32      0.75      0.56      5417

avg / total       0.91      0.73      0.76      0.79      0.75      0.55     63959

