In [1]:
import pandas as pd
import numpy as np 
import pickle

from ucimlrepo import fetch_ucirepo 
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
data_cache = Path('assets/data_cache')

if data_cache.is_file():
    with open(data_cache, 'rb') as f:
        df, y = pickle.load(f)
else: 
    # fetch dataset 
    cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
      
    # data (as pandas dataframes) 
    df = cdc_diabetes_health_indicators.data.features 
    y = cdc_diabetes_health_indicators.data.targets 
      
    # metadata 
    print(cdc_diabetes_health_indicators.metadata) 
      
    # variable information 
    print(cdc_diabetes_health_indicators.variables)

    # cache it for further use
    with open(data_cache, 'wb') as f: 
        pickle.dump((df,y), f)

In [3]:
df['diabetes_binary'] = y

In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [5]:
df

Unnamed: 0,highbp,highchol,cholcheck,bmi,smoker,stroke,heartdiseaseorattack,physactivity,fruits,veggies,...,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income,diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,3,0,5,0,1,5,6,7,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1
253677,0,0,1,28,0,0,0,1,1,0,...,0,1,0,0,0,0,2,5,2,0
253678,1,0,1,23,0,0,0,0,1,1,...,0,3,0,0,0,1,7,5,1,0


In [6]:
len(list(df.columns.values))

22

In [7]:
binary_cols = ['highbp',
 'highchol',
 'cholcheck',
 'smoker',
 'stroke',
 'heartdiseaseorattack',
 'physactivity',
 'fruits',
 'veggies',
 'hvyalcoholconsump',
 'anyhealthcare',
 'nodocbccost',
 'diffwalk',
 'sex',
 'diabetes_binary'
]
purely_num = ['menhlth', 'physhlth']

In [8]:
len(binary_cols)

15

In [9]:
df

Unnamed: 0,highbp,highchol,cholcheck,bmi,smoker,stroke,heartdiseaseorattack,physactivity,fruits,veggies,...,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income,diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,3,0,5,0,1,5,6,7,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1
253677,0,0,1,28,0,0,0,1,1,0,...,0,1,0,0,0,0,2,5,2,0
253678,1,0,1,23,0,0,0,0,1,1,...,0,3,0,0,0,1,7,5,1,0


In [10]:
# map categorical values
binary_values = {
    0 : 'false',
    1 : 'true'
}

for col in binary_cols: 
    df[col] = df[col].map(binary_values)


In [11]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,253670,253671,253672,253673,253674,253675,253676,253677,253678,253679
highbp,true,false,true,true,true,true,true,true,true,false,...,true,true,true,true,false,true,true,false,true,true
highchol,true,false,true,false,true,true,false,true,true,false,...,true,true,false,false,false,true,true,false,false,true
cholcheck,true,false,true,true,true,true,true,true,true,true,...,true,true,true,true,true,true,true,true,true,true
bmi,40,25,28,27,24,25,30,25,30,24,...,25,23,30,42,27,45,18,28,23,25
smoker,true,true,false,false,false,true,true,true,true,false,...,false,false,true,false,false,false,false,false,false,false
stroke,false,false,false,false,false,false,false,false,false,false,...,false,true,false,false,false,false,false,false,false,false
heartdiseaseorattack,false,false,false,false,false,false,false,false,true,false,...,true,true,true,false,false,false,false,false,false,true
physactivity,false,true,false,true,true,true,false,true,false,false,...,false,false,true,true,false,false,false,true,false,true
fruits,false,false,true,true,true,true,false,false,true,false,...,true,false,true,true,false,true,false,true,true,true
veggies,true,false,false,true,true,true,false,true,true,true,...,false,false,true,true,true,true,false,false,true,false


In [12]:
# change categorical values to cateogary for so that we can better binary hot encode it
genhlth_values = {
    1: 'excellent',
    2: 'very_good',
    3: 'good',
    4: 'fair',
    5: 'poor'
}

df.genhlth = df.genhlth.map(genhlth_values)

In [13]:
 education_values = {
     1 : 'never_attended_school',
     2 : 'grade_1_to_8',
     3 : 'grade_9_to_11',
     4 : 'grade_12_to_high_school_graduate',
     5 : 'college_1_to_3_years',
     6 : 'college_4_to_more'
}
df.education = df.education.map(education_values)

In [14]:
 income_values = {
     1 : 'Less than $10,000',
     2 : '$10,000 to less than $15,000',
     3 : '$15,000 to less than $20,000',
     4 : '$20,000 to less than $25,000',
     5 : '$25,000 to less than $35,000',
     6 : '$35,000 to less than $50,000',
     7 : '$50,000 to less than $75,000',
     8 : '$75,000 or more'
}
df.income = df.income.map(income_values)
df.income = df.income.str.lower().str.replace(' ', '_').str.replace(',','')

In [16]:
age_values = {
    1 :	'Age 18 to 24',
    2 :	'Age 25 to 29',
    3 :	'Age 30 to 34',
    4 :	'Age 35 to 39',
    5 :	'Age 40 to 44',
    6 :	'Age 45 to 49',
    7 :	'Age 50 to 54',
    8 :	'Age 55 to 59',
    9 :	'Age 60 to 64',
    10 : 'Age 65 to 69',
    11 : 'Age 70 to 74',
    12 : 'Age 75 to 79',
    13 : 'Age 80 or older',
}
df.age = df.age.map(age_values)
df.age = df.age.str.lower().str.replace(' ', '_')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,253670,253671,253672,253673,253674,253675,253676,253677,253678,253679
highbp,true,false,true,true,true,true,true,true,true,false,...,true,true,true,true,false,true,true,false,true,true
highchol,true,false,true,false,true,true,false,true,true,false,...,true,true,false,false,false,true,true,false,false,true
cholcheck,true,false,true,true,true,true,true,true,true,true,...,true,true,true,true,true,true,true,true,true,true
bmi,40,25,28,27,24,25,30,25,30,24,...,25,23,30,42,27,45,18,28,23,25
smoker,true,true,false,false,false,true,true,true,true,false,...,false,false,true,false,false,false,false,false,false,false
stroke,false,false,false,false,false,false,false,false,false,false,...,false,true,false,false,false,false,false,false,false,false
heartdiseaseorattack,false,false,false,false,false,false,false,false,true,false,...,true,true,true,false,false,false,false,false,false,true
physactivity,false,true,false,true,true,true,false,true,false,false,...,false,false,true,true,false,false,false,true,false,true
fruits,false,false,true,true,true,true,false,false,true,false,...,true,false,true,true,false,true,false,true,true,true
veggies,true,false,false,true,true,true,false,true,true,true,...,false,false,true,true,true,true,false,false,true,false


In [15]:
income_values = {
    #TODO
}
df.income = df.income.map(income_values)

In [9]:
# Let's split the data from in train test split 
df_full_train, df_val = train_test_split(df, test_size=0.25

SyntaxError: unexpected EOF while parsing (135549221.py, line 2)