## Imports

In [169]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from itertools import product
from itertools import product
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings('ignore')

## Load Data

In [170]:
df = pd.read_csv('data/U.S._Chronic_Disease_Indicators__CDI_.csv')
df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


In [171]:
df_filtered = df[df['DataValueType'] == 'Number']
df_filtered.to_csv('data/df_filtered.csv')
df_filtered.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


## Preprocessing / Data Exploration

In [172]:
# remove columns
df_filtered = df_filtered.drop(
    ['LocationAbbr', 
     'DataSource', 
     'DataValueUnit', 
     'LocationID', 
     'StratificationCategoryID1', 
     'StratificationID1', 
     'StratificationCategoryID2',
     'StratificationID2', 
     'StratificationCategoryID3', 
     'StratificationID3',
     'Stratification1',
     'StratificationCategory2',
     'Stratification2',
     'StratificationCategory3',
     'Stratification3',
     'GeoLocation',
     'DataValueTypeID',
     'StratificationCategory1',
     'ResponseID',
     'TopicID',
     'QuestionID',
     'DataValueFootnoteSymbol',
     'DataValueType',
     'DatavalueFootnote',
     'Response',
     'LowConfidenceLimit',
     'HighConfidenceLimit',
     'DataValueAlt'], axis=1)


In [173]:
# Remove DataValue rows that are blank
df_filtered = df_filtered[df_filtered['DataValue'].notnull()]

In [174]:
df_filtered.shape

(71811, 6)

In [175]:
df_filtered.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationDesc,Topic,Question,DataValue
288281,2011,2011,Michigan,Chronic Obstructive Pulmonary Disease,Hospitalization for chronic obstructive pulmon...,289.0
362407,2017,2017,Tennessee,Cardiovascular Disease,Mortality from heart failure,8076.0
322198,2015,2015,Vermont,Cardiovascular Disease,Mortality from total cardiovascular diseases,873.0
435164,2010,2010,North Carolina,Diabetes,Amputation of a lower extremity attributable t...,1151.0
296079,2013,2013,Michigan,Cardiovascular Disease,Mortality from heart failure,11432.0


In [176]:
df_filtered.nunique()

YearStart          13
YearEnd            13
LocationDesc       53
Topic              10
Question           34
DataValue       20276
dtype: int64

In [177]:
# find values of Topic
df_filtered['Topic'].unique()

array(['Asthma', 'Chronic Kidney Disease',
       'Chronic Obstructive Pulmonary Disease', 'Cardiovascular Disease',
       'Diabetes', 'Alcohol', 'Tobacco',
       'Nutrition, Physical Activity, and Weight Status', 'Older Adults',
       'Overarching Conditions'], dtype=object)

In [178]:
df_filtered["YearStart"].unique()

array([2014, 2018, 2017, 2010, 2013, 2016, 2015, 2020, 2012, 2019, 2011,
       2001, 2009], dtype=int64)

In [179]:
df_filtered["YearEnd"].unique()

array([2014, 2018, 2017, 2010, 2013, 2016, 2015, 2020, 2012, 2019, 2011,
       2001, 2009], dtype=int64)

In [180]:
# df_filtered['YearStart'] = pd.to_datetime(df_filtered['YearStart'])
# df_filtered['YearEnd'] = pd.to_datetime(df_filtered['YearEnd'])

In [181]:
# Convert DataValue to float
df_filtered['DataValue'] = df_filtered['DataValue'].astype(float)

In [182]:
# remove locationdesc that equals United States
df_filtered = df_filtered[df_filtered['LocationDesc'] != 'United States']

In [183]:
df_filtered.dtypes

YearStart         int64
YearEnd           int64
LocationDesc     object
Topic            object
Question         object
DataValue       float64
dtype: object

In [184]:
# Group by location, year and datavalue
df_final = df_filtered.groupby(['LocationDesc', 'Topic', 'DataValue']).sum().reset_index()
df_final.head()

Unnamed: 0,LocationDesc,Topic,DataValue,YearStart,YearEnd,Question
0,Alabama,Alcohol,84.0,2015,2015,Chronic liver disease mortality
1,Alabama,Alcohol,90.0,2019,2019,Chronic liver disease mortality
2,Alabama,Alcohol,91.0,2010,2010,Chronic liver disease mortality
3,Alabama,Alcohol,93.0,2013,2013,Chronic liver disease mortality
4,Alabama,Alcohol,96.0,2011,2011,Chronic liver disease mortality


In [185]:
# take the mean of the data value for each location and topic
df_final = df_final[['LocationDesc', 'Topic', 'Question', 'DataValue', 'YearEnd']].groupby(
    ['LocationDesc', 'Topic', 'Question', 'YearEnd']).mean().reset_index()

In [186]:
# rename yearend to year
df_final = df_final.rename(columns={'YearEnd': 'Year'})

In [187]:
df_final.head()

Unnamed: 0,LocationDesc,Topic,Question,Year,DataValue
0,Alabama,Alcohol,Chronic liver disease mortality,2010,301.2
1,Alabama,Alcohol,Chronic liver disease mortality,2011,329.0
2,Alabama,Alcohol,Chronic liver disease mortality,2012,371.4
3,Alabama,Alcohol,Chronic liver disease mortality,2013,344.4
4,Alabama,Alcohol,Chronic liver disease mortality,2014,407.4


## Modeling

In [188]:
# X = df_final
X = df_final
y = df_final['DataValue']

In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [190]:
nums = df_final.select_dtypes(include=['int64', 'float64']).columns
cats = df_final.select_dtypes(include=['object']).columns

In [191]:
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, nums),
        ('cat', cat_transformer, cats)
       
    ], remainder='passthrough')

In [192]:
models = [
    # ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('lasso', Lasso(alpha=0.1, random_state=42)),    
]

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', StackingRegressor(estimators=models))
])

In [193]:
pipe.fit(X_train, y_train)

In [194]:
pred = pipe.predict(X_test) 
pred

array([ 2141.00852484,    58.44707416,    25.45085217, ...,
        1948.23058621, 14684.19969598,  2899.12172342])

In [195]:
predicted_dfs = []


In [196]:
pipe.score(X_test, y_test)

0.9999999868561013

In [197]:
# mean squared error
mean_squared_error(y_test, pred)

1.5920698070739094

In [198]:
predictions = pd.DataFrame(
    {'State': X_test['LocationDesc'], 
     'Topic': X_test['Topic'],
     'Question': X_test['Question'],
     'Predicted': pred.round(2),
     'Future Year': X_test['Year'] + 5
     })

predictions = predictions.sort_values(by=['State', 'Topic'])

# remove years that are 5 digits and less than 2019
predictions = predictions[(predictions['Future Year'] > 2019) & (predictions['Future Year'] < 2024)]
predictions.to_csv('data/predictions.csv')

predictions.head()

Unnamed: 0,State,Topic,Question,Predicted,Future Year
8,Alabama,Alcohol,Chronic liver disease mortality,454.6,2023
5,Alabama,Alcohol,Chronic liver disease mortality,428.2,2020
50,Alabama,Cardiovascular Disease,Mortality from cerebrovascular disease (stroke),1751.05,2022
102,Alabama,Cardiovascular Disease,Mortality from total cardiovascular diseases,7277.76,2020
103,Alabama,Cardiovascular Disease,Mortality from total cardiovascular diseases,7234.2,2021
