In [1]:
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# read in health indicator CSV file:
df = pd.read_csv("../data/heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


### Continuous Variables:
- BMI, PhysicalHealth, MentalHealth, SleepTime

In [3]:
df.describe() 

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [6]:
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

### Creating Dummy Variables:
- For variables with multiple classes.

In [29]:
df_dummy

Unnamed: 0,AgeCategory_18-24,AgeCategory_25-29,AgeCategory_30-34,AgeCategory_35-39,AgeCategory_40-44,AgeCategory_45-49,AgeCategory_50-54,AgeCategory_55-59,AgeCategory_60-64,AgeCategory_65-69,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
319791,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
319792,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
319793,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [31]:
# Dummy variables for Age, Race, General health, Diabetic

df_1 = df[["AgeCategory","Race","GenHealth","Diabetic"]]
df_dummy = pd.get_dummies(df_1)

# Removing Age, Race, General health, Diabetic from df

df.drop(["AgeCategory","Race","GenHealth","Diabetic"], axis=1)

# Joining df_dummy with df
for colname in df_dummy:
    df[colname] = df_dummy[colname]

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,...,1,0,0,0,0,1,0,0,1,0
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,...,1,0,0,0,0,1,1,0,0,0
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,...,1,0,1,0,0,0,0,0,1,0
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,...,1,0,0,1,0,0,1,0,0,0
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,...,0,0,1,0,0,0,0,0,1,0
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,...,0,0,0,0,0,1,1,0,0,0
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,...,0,0,0,1,0,0,1,0,0,0
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,...,0,0,0,1,0,0,1,0,0,0


In [33]:
bin_cols = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
mult_cols = ["Age", "Race", "GenHealth", "Diabetic"]
for colname in bin_cols:
    if("Sex" != colname):
        print(heart[colname])
        df[colname] = df[colname].str.replace("No",  "0", regex=False)
        df[colname] = df[colname].str.replace("Yes", "1", regex=False)
    else:
        df[colname] = df[colname].str.replace("Female",  "0", regex=False)
        df[colname] = df[colname].str.replace("Male", "1", regex=False)
    df[colname] = df[colname].astype("int")
print(df.head(3))

NameError: name 'heart' is not defined

In [9]:
X = df.drop("HeartDisease", axis=1)
Y = df["HeartDisease"]
print(np.shape(X))
print(np.shape(Y))

train_X = X.iloc[0:255836,:]
train_y = Y[0:255836]
test_X = X.iloc[63959:,:] 
test_y = Y.iloc[63959:]

(319795, 17)
(319795,)


In [10]:
train_X

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255831,35.56,No,No,No,0.0,0.0,No,Male,55-59,White,No,No,Very good,6.0,No,No,Yes
255832,33.66,No,No,No,0.0,3.0,No,Female,70-74,White,No,No,Very good,8.0,No,No,Yes
255833,20.98,No,No,No,0.0,0.0,No,Male,25-29,White,No,Yes,Very good,8.0,No,No,No
255834,21.80,Yes,Yes,No,0.0,0.0,No,Female,65-69,White,No,Yes,Excellent,6.0,No,No,No
