## Imports

In [150]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

## Load Data

In [151]:
df = pd.read_csv('data/U.S._Chronic_Disease_Indicators__CDI_.csv')
df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


In [172]:
df_filtered = df[df['DataValueType'] == 'Number']
df_filtered.to_csv('data/df_filtered.csv')
df_filtered.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


## Preprocessing / Data Exploration

In [153]:
# remove columns
df_filtered = df_filtered.drop(
    ['LocationAbbr', 
     'DataSource', 
     'DataValueUnit', 
     'LocationID', 
     'StratificationCategoryID1', 
     'StratificationID1', 
     'StratificationCategoryID2',
     'StratificationID2', 
     'StratificationCategoryID3', 
     'StratificationID3',
     'Stratification1',
     'StratificationCategory2',
     'Stratification2',
     'StratificationCategory3',
     'Stratification3',
     'GeoLocation',
     'DataValueTypeID',
     'StratificationCategory1',
     'ResponseID',
     'TopicID',
     'QuestionID',
     'DataValueFootnoteSymbol',
     'DataValueType',
     'DatavalueFootnote',
     'Response',
     'LowConfidenceLimit',
     'HighConfidenceLimit'], axis=1)


In [154]:
# Remove DataValue rows that are blank
df_filtered = df_filtered[df_filtered['DataValue'].notnull()]

In [155]:
df_filtered.shape

(71811, 7)

In [156]:
df_filtered.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValue,DataValueAlt
334941,2013,2013,Montana,Cardiovascular Disease,Mortality from coronary heart disease,1041.0,1041.0
288291,2018,2018,California,Chronic Obstructive Pulmonary Disease,Hospitalization for chronic obstructive pulmon...,19153.0,19153.0
370504,2012,2012,District of Columbia,Cardiovascular Disease,Mortality from cerebrovascular disease (stroke),44.0,44.0
286813,2020,2020,Alabama,Chronic Obstructive Pulmonary Disease,Hospitalization for chronic obstructive pulmon...,32.0,32.0
216710,2013,2013,Indiana,Chronic Kidney Disease,Incidence of treated end-stage renal disease a...,1135.0,1135.0


In [157]:
df_filtered.nunique()

YearStart          13
YearEnd            13
LocationDesc       53
Topic              10
Question           34
DataValue       20276
DataValueAlt    17253
dtype: int64

In [158]:
# find values of Topic
df_filtered['Topic'].unique()

array(['Asthma', 'Chronic Kidney Disease',
       'Chronic Obstructive Pulmonary Disease', 'Cardiovascular Disease',
       'Diabetes', 'Alcohol', 'Tobacco',
       'Nutrition, Physical Activity, and Weight Status', 'Older Adults',
       'Overarching Conditions'], dtype=object)

In [159]:
df_filtered["YearStart"].unique()

array([2014, 2018, 2017, 2010, 2013, 2016, 2015, 2020, 2012, 2019, 2011,
       2001, 2009], dtype=int64)

In [160]:
df_filtered["YearEnd"].unique()

array([2014, 2018, 2017, 2010, 2013, 2016, 2015, 2020, 2012, 2019, 2011,
       2001, 2009], dtype=int64)

In [161]:
df_filtered['YearStart'] = pd.to_datetime(df_filtered['YearStart'])
df_filtered['YearEnd'] = pd.to_datetime(df_filtered['YearEnd'])

In [162]:
# Convert DataValue to float
df_filtered['DataValue'] = df_filtered['DataValue'].astype(float)

In [163]:
df_filtered.dtypes

YearStart       datetime64[ns]
YearEnd         datetime64[ns]
LocationDesc            object
Topic                   object
Question                object
DataValue              float64
DataValueAlt           float64
dtype: object

## Modeling

In [164]:
X = df_filtered
y = df_filtered['DataValue']

In [165]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [166]:
nums = df_filtered.select_dtypes(include=['int64', 'float64']).columns
cats = df_filtered.select_dtypes(include=['object']).columns

In [167]:
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, nums),
        ('cat', cat_transformer, cats)
    ])

In [168]:
models = [
    ('lr', LinearRegression())
]

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [169]:
pipe.fit(X_train, y_train)

In [170]:
pred = pipe.predict(X_test) 
pred

array([  679.74528718,   424.76924892, 13264.9754377 , ...,
        2644.1387136 ,   509.04491338,   958.92611605])