# Create the binary classifier model

In [1]:
#Import the NumPy and Pandas dependencies   
import numpy as np
import pandas as pd

# Import the Path module from the pathlib library
from pathlib import Path

# Import SKLearn dependencies
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Import confusion_matrix
from sklearn.metrics import confusion_matrix

In [2]:
# Read the CSV file from the resources folder into a Pandas DataFrame

heart_data_path = Path('../resources/heart_data_2015.csv')
heart_data_df = pd.read_csv(heart_data_path)

# Review the DataFrame
heart_data_df.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,...,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
2,1.0,1.0,b'02012015',b'02',b'01',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0


In [3]:
# Get the shape of the DataFrame
heart_data_df.shape

(441456, 330)

In [4]:
# Choose the columns to keep
heart_data_cols_df = heart_data_df[[ '_MICHD', 
                                        'BPMEDS',
                                        'TOLDHI2',
                                        'CHCKIDNY',
                                         'CVDINFR4', 
                                         '_BMI5CAT', 
                                         'SMOKDAY2',
                                         'CVDSTRK3', 
                                         'DIABETE3',
                                         '_RFDRHV5', 
                                         'DIFFWALK', 
                                         'EXERANY2',
                                         'SEX', 
                                         '_AGEG5YR',]]

heart_data_cols_df.head()   

Unnamed: 0,_MICHD,BPMEDS,TOLDHI2,CHCKIDNY,CVDINFR4,_BMI5CAT,SMOKDAY2,CVDSTRK3,DIABETE3,_RFDRHV5,DIFFWALK,EXERANY2,SEX,_AGEG5YR
0,2.0,1.0,1.0,2.0,2.0,4.0,3.0,2.0,3.0,1.0,1.0,2.0,2.0,9.0
1,2.0,,2.0,2.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,1.0,2.0,7.0
2,,,1.0,2.0,7.0,2.0,,1.0,3.0,9.0,,,2.0,11.0
3,2.0,1.0,1.0,2.0,2.0,3.0,,2.0,3.0,1.0,1.0,2.0,2.0,9.0
4,2.0,,2.0,2.0,2.0,2.0,,2.0,3.0,1.0,2.0,2.0,2.0,9.0


In [5]:
# Get the shape of the new DataFrame
heart_data_cols_df.shape    

(441456, 14)

In [6]:
# Rename the columns    

heart_data_cols_df = heart_data_cols_df.rename(columns={'_MICHD': 'heart_disease', 
                                                        'BPMEDS': 'bp_meds',
                                                        'TOLDHI2': 'high_cholesterol',
                                                        'CHCKIDNY': 'kidney_disease',
                                                        'CVDINFR4': 'cardiovascular_conditions', 
                                                        '_BMI5CAT': 'bmi', 
                                                        'SMOKDAY2': 'smoke_now',
                                                        'CVDSTRK3': 'stroke',
                                                        'DIABETE3': 'diabetes',
                                                        '_RFDRHV5': 'heavy_alcohol',
                                                        'DIFFWALK': 'diff_walk',
                                                        'EXERANY2': 'exercise',
                                                        'SEX': 'sex',
                                                        '_AGEG5YR': 'age'})

In [7]:
# Check the shape of the DataFrame
heart_data_cols_df.shape

(441456, 14)

In [8]:
# Drop all null values
heart_data_cols_df = heart_data_cols_df.dropna()

In [9]:
# Check the unique value counts to see if binning is required
heart_data_cols_df.nunique(axis=0)

heart_disease        2
bp_meds              4
high_cholesterol     4
kidney_disease       4
heart_attack         4
bmi                  4
smoke_now            5
stroke               4
diabetes             6
heavy_alcohol        3
diff_walk            4
exercise             4
sex                  2
age                 14
dtype: int64

In [10]:
# Check out the data types 
heart_data_cols_df.dtypes

heart_disease       float64
bp_meds             float64
high_cholesterol    float64
kidney_disease      float64
heart_attack        float64
bmi                 float64
smoke_now           float64
stroke              float64
diabetes            float64
heavy_alcohol       float64
diff_walk           float64
exercise            float64
sex                 float64
age                 float64
dtype: object

In [11]:
# Check the unique values in each column to convert to binary zeros and ones 
column_mapping = {
    '_MICHD': 'heart_disease',
    'BPMEDS': 'bp_meds',
    'TOLDHI2': 'high_cholesterol',
    'CHCKIDNY': 'kidney_disease',
    'CVDINFR4': 'cardiovascular_conditions',
    '_BMI5': 'bmi',
    'SMOKDAY2': 'smoke_now',
    'CVDSTRK3': 'stroke',
    'DIABETE3': 'diabetes',
    '_RFDRHV5': 'heavy_alcohol',
    'DIFFWALK': 'diff_walk',
    'EXERANY2': 'exercise',
    'SEX': 'sex',
    '_AGEG5YR': 'age'
}

# Iterate through columns in the mapping
for col_name, new_col_name in column_mapping.items():
    unique_values = heart_data_cols_df[new_col_name].unique()
    print(f"Unique values in {new_col_name}: {unique_values}")


Unique values in heart_disease: [2. 1.]
Unique values in bp_meds: [1. 2. 7. 9.]
Unique values in high_cholesterol: [1. 2. 7. 9.]
Unique values in kidney_disease: [2. 7. 1. 9.]
Unique values in heart_attack: [2. 1. 7. 9.]
Unique values in bmi: [4. 3. 2. 1.]
Unique values in smoke_now: [3. 1. 2. 9. 7.]
Unique values in stroke: [2. 1. 7. 9.]
Unique values in diabetes: [3. 1. 4. 2. 7. 9.]
Unique values in heavy_alcohol: [1. 9. 2.]
Unique values in diff_walk: [1. 2. 7. 9.]
Unique values in exercise: [2. 1. 9. 7.]
Unique values in sex: [2. 1.]
Unique values in age: [ 9. 10. 11. 13.  7. 12.  8.  6.  4.  3.  5.  1.  2. 14.]


In [12]:
# Drop all 7 and 9 values from the DataFrame except for 'cols_to_exclude'
cols_to_exclude = ['age']

# Create a condition to filter rows
condition = ~heart_data_cols_df.drop(columns=cols_to_exclude).isin([7, 9]).any(axis=1)

# Apply the condition to the DataFrame
heart_data_cols_df = heart_data_cols_df[condition]

# Verify the changes
for col_name, new_col_name in column_mapping.items():
    unique_values = heart_data_cols_df[new_col_name].unique()
    print(f"Unique values in {new_col_name}: {unique_values}")

Unique values in heart_disease: [2. 1.]
Unique values in bp_meds: [1. 2.]
Unique values in high_cholesterol: [1. 2.]
Unique values in kidney_disease: [2. 1.]
Unique values in heart_attack: [2. 1.]
Unique values in bmi: [4. 3. 2. 1.]
Unique values in smoke_now: [3. 1. 2.]
Unique values in stroke: [2. 1.]
Unique values in diabetes: [3. 1. 4. 2.]
Unique values in heavy_alcohol: [1. 2.]
Unique values in diff_walk: [1. 2.]
Unique values in exercise: [2. 1.]
Unique values in sex: [2. 1.]
Unique values in age: [ 9. 10. 11. 13. 12.  8.  7.  4.  6.  3.  5.  1.  2. 14.]


In [13]:
heart_data_cols_df.head()

Unnamed: 0,heart_disease,bp_meds,high_cholesterol,kidney_disease,heart_attack,bmi,smoke_now,stroke,diabetes,heavy_alcohol,diff_walk,exercise,sex,age
0,2.0,1.0,1.0,2.0,2.0,4.0,3.0,2.0,3.0,1.0,1.0,2.0,2.0,9.0
9,2.0,1.0,1.0,2.0,2.0,3.0,3.0,2.0,3.0,1.0,2.0,1.0,1.0,10.0
10,2.0,1.0,2.0,2.0,2.0,4.0,3.0,2.0,3.0,1.0,2.0,2.0,2.0,9.0
15,2.0,1.0,1.0,2.0,2.0,3.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,11.0
16,1.0,1.0,1.0,2.0,1.0,4.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,9.0


In [14]:
# Set the binary values for the heart_disease column
heart_data_cols_df['heart_disease'] = heart_data_cols_df['heart_disease'].replace({2: 0})

# Verify the changes
print(heart_data_cols_df['heart_disease'].value_counts())

0.0    53636
1.0    14168
Name: heart_disease, dtype: int64


In [15]:
# Set the binary values for the bp_meds column
heart_data_cols_df['bp_meds'] = heart_data_cols_df['bp_meds'].replace({2: 0})
# Verify the changes
print(heart_data_cols_df['bp_meds'].value_counts())

1.0    58069
0.0     9735
Name: bp_meds, dtype: int64


In [16]:
# Set the binary values for the high_cholesterol column
heart_data_cols_df['high_cholesterol'] = heart_data_cols_df['high_cholesterol'].replace({2: 1, 1:0})
# Verify the changes
print(heart_data_cols_df['high_cholesterol'].value_counts())

0.0    42473
1.0    25331
Name: high_cholesterol, dtype: int64


In [17]:
# Set the binary values for the kidney_disease column
heart_data_cols_df['kidney_disease'] = heart_data_cols_df['kidney_disease'].replace({2: 0})
# Verify the changes
print(heart_data_cols_df['kidney_disease'].value_counts())

0.0    63049
1.0     4755
Name: kidney_disease, dtype: int64


In [18]:
# Set the binary values for the heart_attack column
heart_data_cols_df['cardiovascular_conditions' ] = heart_data_cols_df['cardiovascular_conditions'].replace({2:0})
# Verify the changes
print(heart_data_cols_df['cardiovascular_conditions'].value_counts())

0.0    57990
1.0     9814
Name: heart_attack, dtype: int64


In [19]:
# Set the binary values for the smoke_now column
heart_data_cols_df['smoke_now' ] = heart_data_cols_df['smoke_now'].replace({2:1, 3:0})
# Verify the changes
print(heart_data_cols_df['smoke_now'].value_counts())

0.0    49719
1.0    18085
Name: smoke_now, dtype: int64


In [20]:
# Set the binary values for the stroke column
heart_data_cols_df['stroke' ] = heart_data_cols_df['stroke'].replace({2:0})
# Verify the changes
print(heart_data_cols_df['stroke'].value_counts())

0.0    61756
1.0     6048
Name: stroke, dtype: int64


In [21]:
# Set the binary values for the diabetes column
heart_data_cols_df['diabetes' ] = heart_data_cols_df['diabetes'].replace({2:0, 3:0, 4:0})
# Verify the changes
print(heart_data_cols_df['diabetes'].value_counts())

0.0    50326
1.0    17478
Name: diabetes, dtype: int64


In [22]:
# Set the binary values for the heavy_alcohol column
heart_data_cols_df['heavy_alcohol' ] = heart_data_cols_df['heavy_alcohol'].replace({2:1, 1:0})
# Verify the changes
print(heart_data_cols_df['heavy_alcohol'].value_counts())

0.0    62864
1.0     4940
Name: heavy_alcohol, dtype: int64


In [23]:
# Set the binary values for the diff_walk column
heart_data_cols_df['diff_walk' ] = heart_data_cols_df['diff_walk'].replace({2:0})
# Verify the changes
print(heart_data_cols_df['diff_walk'].value_counts())

0.0    46229
1.0    21575
Name: diff_walk, dtype: int64


In [24]:
# Set the binary values for the exercise column
heart_data_cols_df['exercise' ] = heart_data_cols_df['exercise'].replace({2:0})
# Verify the changes
print(heart_data_cols_df['exercise'].value_counts())

1.0    44449
0.0    23355
Name: exercise, dtype: int64


In [25]:
# Set the binary values for the sex column
heart_data_cols_df['sex' ] = heart_data_cols_df['sex'].replace({2:0})
# Verify the changes
print(heart_data_cols_df['sex'].value_counts())

1.0    34959
0.0    32845
Name: sex, dtype: int64


In [26]:
# Verify the changes
for col_name, new_col_name in column_mapping.items():
    unique_values = heart_data_cols_df[new_col_name].unique()
    print(f"Unique values in {new_col_name}: {unique_values}")

Unique values in heart_disease: [0. 1.]
Unique values in bp_meds: [1. 0.]
Unique values in high_cholesterol: [0. 1.]
Unique values in kidney_disease: [0. 1.]
Unique values in heart_attack: [0. 1.]
Unique values in bmi: [4. 3. 2. 1.]
Unique values in smoke_now: [0. 1.]
Unique values in stroke: [0. 1.]
Unique values in diabetes: [0. 1.]
Unique values in heavy_alcohol: [0. 1.]
Unique values in diff_walk: [1. 0.]
Unique values in exercise: [0. 1.]
Unique values in sex: [0. 1.]
Unique values in age: [ 9. 10. 11. 13. 12.  8.  7.  4.  6.  3.  5.  1.  2. 14.]


In [27]:
# Get the shape of the DataFrame
heart_data_cols_df.shape

(67804, 14)

In [28]:
# Separate the data into labels and features
# Create the labels (y) from the 'heart_disease' column
y = heart_data_cols_df['heart_disease']

# Create the features (X) DataFrame from the remaining columns
X = heart_data_cols_df.drop('heart_disease', axis=1)

In [29]:
# Review the y variable Series
print(y)

0         0.0
9         0.0
10        0.0
15        0.0
16        1.0
         ... 
441350    0.0
441358    0.0
441361    0.0
441431    0.0
441446    1.0
Name: heart_disease, Length: 67804, dtype: float64


In [30]:
# Review the X variable DataFrame
print(X.head())

    bp_meds  high_cholesterol  kidney_disease  heart_attack  bmi  smoke_now  \
0       1.0               0.0             0.0           0.0  4.0        0.0   
9       1.0               0.0             0.0           0.0  3.0        0.0   
10      1.0               1.0             0.0           0.0  4.0        0.0   
15      1.0               0.0             0.0           0.0  3.0        1.0   
16      1.0               0.0             0.0           1.0  4.0        1.0   

    stroke  diabetes  heavy_alcohol  diff_walk  exercise  sex   age  
0      0.0       0.0            0.0        1.0       0.0  0.0   9.0  
9      0.0       0.0            0.0        0.0       1.0  1.0  10.0  
10     0.0       0.0            0.0        0.0       0.0  0.0   9.0  
15     0.0       0.0            0.0        1.0       1.0  0.0  11.0  
16     0.0       1.0            0.0        1.0       0.0  0.0   9.0  


In [31]:
# Check the balance of our target values
y_balance = y.value_counts()
print(y_balance)

0.0    53636
1.0    14168
Name: heart_disease, dtype: int64


In [34]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function  
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y, 
                                                    random_state=1)