In [2]:
import pandas as pd

# Load the cardiovascular disease risk dataset from the project's data/ folder.
df = pd.read_csv(
    filepath_or_buffer='data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv',
)

In [3]:
from sklearn.model_selection import train_test_split

# Split the frame into the Heart_Disease label (y) and every other column (X).
X = df.drop(columns='Heart_Disease')
y = df['Heart_Disease']

# Stratify on y so both partitions keep the label proportions of the full data;
# random_state pins the shuffle so the split is reproducible across runs.
# test_size is not passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=12,
)

In [None]:
# Label-encoding the target is optional: scikit-learn classifiers accept string class labels as-is, so this step is left disabled.

# Transforming the target variable into 1's and 0's
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

In [4]:
# Feature columns grouped by the preprocessing each group receives.

# Numeric measurements and consumption counts.
num_cols = [
    'Height_(cm)',
    'Weight_(kg)',
    'BMI',
    'Alcohol_Consumption',
    'Fruit_Consumption',
    'Green_Vegetables_Consumption',
    'FriedPotato_Consumption',
]

# Categorical columns whose values have a natural order.
ord_cols = [
    'General_Health',
    'Checkup',
    'Age_Category',
]

# Unordered categorical columns, to be expanded into indicator (dummy) columns.
dum_cols = [
    'Exercise',
    'Skin_Cancer',
    'Depression',
    'Arthritis',
    'Other_Cancer',
    'Smoking_History',
    'Sex',
    'Diabetes',
]

In [14]:
# Take the numeric feature columns from the training split; they need no encoding.
num_df = X_train.loc[:, num_cols]
num_df.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
154606,160.0,56.7,22.14,2.0,3.0,3.0,3.0
223948,163.0,89.81,33.99,0.0,10.0,12.0,1.0
298682,173.0,89.81,30.11,0.0,28.0,0.0,30.0
185124,147.0,53.07,24.45,0.0,16.0,0.0,0.0
23179,183.0,95.25,28.48,4.0,60.0,60.0,4.0


In [12]:
# Encoding Ordinals
from sklearn.preprocessing import OrdinalEncoder

# Explicit category orderings, lowest rank first: a value's position in its
# list is the number the encoder assigns to it.
health = ['Poor', 'Fair', 'Good','Very Good', 'Excellent']
check = [
    'Never',
    '5 or more years ago',
    'Within the past 5 years',
    'Within the past 2 years',
    'Within the past year',
]
age = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
]

# Work on a copy so the training frame itself is never touched.
ordinals = X_train[ord_cols].copy()

# The orderings are passed in the same sequence as the columns in ord_cols.
oe = OrdinalEncoder(categories=[health, check, age])
ord_encoded = oe.fit_transform(ordinals)

# Re-wrap the encoded array with the original labels so it can be joined back later.
ord_df = pd.DataFrame(
    data=ord_encoded,
    index=X_train.index,
    columns=ordinals.columns,
)
ord_df.head()

Unnamed: 0,General_Health,Checkup,Age_Category
154606,3.0,4.0,10.0
223948,1.0,4.0,4.0
298682,3.0,4.0,11.0
185124,2.0,4.0,12.0
23179,3.0,3.0,1.0


In [13]:
# Encoding other categories
from sklearn.preprocessing import OneHotEncoder

# Work on a copy so the training frame itself is never touched.
dummies = X_train[dum_cols].copy()

# sparse_output=False returns a plain array, which drops straight into a DataFrame.
ohe = OneHotEncoder(sparse_output=False)
dum_encoded = ohe.fit_transform(dummies)

# Column labels come from the fitted encoder (one per column/category pair);
# the row index is carried over from the training split.
dum_df = pd.DataFrame(
    data=dum_encoded,
    index=X_train.index,
    columns=ohe.get_feature_names_out(),
)
dum_df.head()

Unnamed: 0,Exercise_No,Exercise_Yes,Skin_Cancer_No,Skin_Cancer_Yes,Depression_No,Depression_Yes,Arthritis_No,Arthritis_Yes,Other_Cancer_No,Other_Cancer_Yes,Smoking_History_No,Smoking_History_Yes,Sex_Female,Sex_Male,Diabetes_No,"Diabetes_No, pre-diabetes or borderline diabetes",Diabetes_Yes,"Diabetes_Yes, but female told only during pregnancy"
154606,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
223948,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
298682,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
185124,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
23179,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [17]:
# Concatenate columns back.
# Join the numeric, ordinal-encoded and one-hot-encoded frames side by side;
# pandas aligns the rows on the index.
X_train_prep = pd.concat(
    [num_df, ord_df, dum_df],
    axis="columns",
)
X_train_prep.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Age_Category,...,Other_Cancer_No,Other_Cancer_Yes,Smoking_History_No,Smoking_History_Yes,Sex_Female,Sex_Male,Diabetes_No,"Diabetes_No, pre-diabetes or borderline diabetes",Diabetes_Yes,"Diabetes_Yes, but female told only during pregnancy"
154606,160.0,56.7,22.14,2.0,3.0,3.0,3.0,3.0,4.0,10.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
223948,163.0,89.81,33.99,0.0,10.0,12.0,1.0,1.0,4.0,4.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
298682,173.0,89.81,30.11,0.0,28.0,0.0,30.0,3.0,4.0,11.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
185124,147.0,53.07,24.45,0.0,16.0,0.0,0.0,2.0,4.0,12.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
23179,183.0,95.25,28.48,4.0,60.0,60.0,4.0,3.0,3.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [26]:
# Scale Data for Logistic Regression or K-Neighbors (Do not need for decision trees)
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the prepared training features and apply
# them in a single step; `ss` stays fitted for later reuse.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_train_prep)

# The scaler returns a bare array, so restore the row and column labels.
X_scaled_df = pd.DataFrame(
    data=X_scaled,
    index=X_train_prep.index,
    columns=X_train_prep.columns,
)
X_scaled_df.head()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,General_Health,Checkup,Age_Category,...,Other_Cancer_No,Other_Cancer_Yes,Smoking_History_No,Smoking_History_Yes,Sex_Female,Sex_Male,Diabetes_No,"Diabetes_No, pre-diabetes or borderline diabetes",Diabetes_Yes,"Diabetes_Yes, but female told only during pregnancy"
154606,-0.994409,-1.259698,-0.996522,-0.376863,-1.079341,-0.812577,-0.382752,0.454727,0.469379,0.982537,...,0.327385,-0.327385,0.824916,-0.824916,0.96144,-0.96144,0.437911,-0.150867,-0.386348,-0.094097
223948,-0.712965,0.291715,0.823609,-0.621071,-0.797973,-0.208986,-0.614847,-1.484464,0.469379,-0.72072,...,0.327385,-0.327385,-1.212244,1.212244,0.96144,-0.96144,0.437911,-0.150867,-0.386348,-0.094097
298682,0.225184,0.291715,0.227651,-0.621071,-0.074455,-1.013773,2.750534,0.454727,0.469379,1.266413,...,-3.054505,3.054505,-1.212244,1.212244,-1.040106,1.040106,0.437911,-0.150867,-0.386348,-0.094097
185124,-2.214002,-1.429786,-0.641711,-0.621071,-0.5568,-1.013773,-0.730895,-0.514869,0.469379,1.550289,...,0.327385,-0.327385,0.824916,-0.824916,0.96144,-0.96144,0.437911,-0.150867,-0.386348,-0.094097
23179,1.163332,0.546613,-0.022713,-0.132655,1.211799,3.010162,-0.266704,0.454727,-0.757884,-1.572348,...,0.327385,-0.327385,-1.212244,1.212244,-1.040106,1.040106,0.437911,-0.150867,-0.386348,-0.094097


In [56]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# These three were previously only in scope because earlier cells had imported
# them; importing them here lets this cell run on a fresh kernel by itself.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Self-contained version of the preprocessing: load the data, split it, then
# encode and scale the training features through a single fitted Pipeline.
df = pd.read_csv('data/Cardiovascular_Diseases_Risk_Prediction_Dataset.csv')

# Heart_Disease is the label; every other column is a predictor.
y = df['Heart_Disease']
X = df.drop('Heart_Disease',axis=1)
# Stratified, seeded split so label proportions and results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, stratify= y)

# Feature columns grouped by the preprocessing they receive.
num_cols = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption']
ord_cols = ['General_Health', 'Checkup', 'Age_Category']
dum_cols = ['Exercise', 'Skin_Cancer', 'Depression', 'Arthritis', 'Other_Cancer', 'Smoking_History','Sex', 'Diabetes']

# Category orderings for the ordinal columns, lowest rank first, listed in the
# same sequence as ord_cols.
health = ['Poor', 'Fair', 'Good','Very Good', 'Excellent']
check = ['Never', '5 or more years ago', 'Within the past 5 years', 'Within the past 2 years', 'Within the past year']
age = ['18-24','25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']

# Ordinal columns get rank codes and unordered categoricals get indicator
# columns. Columns not listed in either transformer are forwarded unchanged
# by remainder="passthrough".
# sparse_output=False keeps the one-hot block dense: StandardScaler centres the
# data, which it cannot do on a sparse matrix, so the dense output is requested
# explicitly instead of relying on ColumnTransformer's density heuristic.
col_transformer = ColumnTransformer(transformers=[
    ('oe', OrdinalEncoder(categories=[health, check, age]), ord_cols),
    ('ohe', OneHotEncoder(sparse_output=False), dum_cols)
], remainder="passthrough")

# Encode first, then standardise every resulting column.
pipe = Pipeline(steps=[
    ('col_transformer', col_transformer),
    ('ss', StandardScaler())
])
# Fit on the training split only; `pipe` stays fitted for transforming other data.
pipe.fit(X_train)
X_train_final = pipe.transform(X_train)

# The pipeline returns a bare array; restore the generated feature names and
# the training row index.
X_train_final_df = pd.DataFrame(X_train_final, columns= pipe.get_feature_names_out(), index = X_train.index)
X_train_final_df.head()

Unnamed: 0,oe__General_Health,oe__Checkup,oe__Age_Category,ohe__Exercise_No,ohe__Exercise_Yes,ohe__Skin_Cancer_No,ohe__Skin_Cancer_Yes,ohe__Depression_No,ohe__Depression_Yes,ohe__Arthritis_No,...,"ohe__Diabetes_No, pre-diabetes or borderline diabetes",ohe__Diabetes_Yes,"ohe__Diabetes_Yes, but female told only during pregnancy",remainder__Height_(cm),remainder__Weight_(kg),remainder__BMI,remainder__Alcohol_Consumption,remainder__Fruit_Consumption,remainder__Green_Vegetables_Consumption,remainder__FriedPotato_Consumption
154606,0.454727,0.469379,0.982537,1.852757,-1.852757,0.327926,-0.327926,0.500074,-0.500074,0.697141,...,-0.150867,-0.386348,-0.094097,-0.994409,-1.259698,-0.996522,-0.376863,-1.079341,-0.812577,-0.382752
223948,-1.484464,0.469379,-0.72072,-0.539736,0.539736,0.327926,-0.327926,-1.999703,1.999703,-1.43443,...,-0.150867,-0.386348,-0.094097,-0.712965,0.291715,0.823609,-0.621071,-0.797973,-0.208986,-0.614847
298682,0.454727,0.469379,1.266413,1.852757,-1.852757,-3.049464,3.049464,0.500074,-0.500074,-1.43443,...,-0.150867,-0.386348,-0.094097,0.225184,0.291715,0.227651,-0.621071,-0.074455,-1.013773,2.750534
185124,-0.514869,0.469379,1.550289,1.852757,-1.852757,0.327926,-0.327926,0.500074,-0.500074,-1.43443,...,-0.150867,-0.386348,-0.094097,-2.214002,-1.429786,-0.641711,-0.621071,-0.5568,-1.013773,-0.730895
23179,0.454727,-0.757884,-1.572348,-0.539736,0.539736,0.327926,-0.327926,0.500074,-0.500074,0.697141,...,-0.150867,-0.386348,-0.094097,1.163332,0.546613,-0.022713,-0.132655,1.211799,3.010162,-0.266704
