In [None]:
import pandas as pd
from pathlib import Path

# Make Dataset

1. Start with the NETS/LTDB priority dataset for simplicity.
 - Build bare-bones NETS-only, LTDB-only, and combined models, and then assess them before adding other variables.
 - This changes our plans regarding which years to use in LTDB. Instead of drawing from the 2011-2015 ACS, we will use the interpolated LTDB data from the priority dataset.
2. Subset RECVD data to only the tracts available in the life expectancy estimation target.

## Variables of Interest:

In this iteration for both the NETS and LTDB data we will average values from 2010-2014, inclusive.

In [None]:
# Project data directory, resolved relative to the current working directory
# (the notebook is expected to run from a sibling folder of `data/`).
data_path = Path.cwd().parent / 'data'
priority_csv = data_path / 'raw' / 'T10_Priority_Wide_Interpolated.csv'

# Years to keep. They are strings because they are compared against the
# 4-character suffix of each column name.
years = ['2010', '2011', '2012', '2013', '2014']

# get colnames that we need
# Read only the header row, using pandas' own CSV parser so that quoting and
# encoding are handled exactly as they will be in the full read below.
cols = pd.read_csv(priority_csv, nrows=0).columns.tolist()

# Keep every column whose name ends in one of the target years. This also
# keeps the 't10_cen_uid_u_2010' identifier, because its name ends in '2010'.
proj_cols = [col for col in cols if col[-4:] in years]

# Load only the selected columns and index the table by the identifier column.
# The identifier is read as `object` rather than a number -- presumably to
# preserve leading zeros in tract IDs (TODO confirm).
data_X = (
    pd.read_csv(
        priority_csv,
        usecols=proj_cols,
        dtype={'t10_cen_uid_u_2010': "object"},
    )
    .set_index('t10_cen_uid_u_2010')
)

In [None]:
# Load the target table: only the 'Tract ID' and 'e(0)' columns are read.
# 'Tract ID' is read as `object`, renamed to the identifier name used by
# data_X ('t10_cen_uid_u_2010'), and set as the index so both tables can be
# joined on it.
target_csv = data_path / 'raw' / 'US_A.csv'
data_y = (
    pd.read_csv(
        target_csv,
        usecols=['Tract ID', 'e(0)'],
        dtype={'Tract ID': "object"},
    )
    .rename(columns={'Tract ID': 't10_cen_uid_u_2010'})
    .set_index('t10_cen_uid_u_2010')
)

In [None]:
# Attach the target column to the feature table on the shared index.
# how='right' keeps exactly the rows present in data_y; any of those rows
# with no match in data_X are retained with NaN in every data_X column.
# data_y's column is appended after the data_X columns.
data_allyrs = data_X.join(data_y, how='right')

### Drop columns of unneeded geographic vars

In [None]:
# Geographic columns that are not model features. Their names all end in
# '2010', so the year-suffix filter used at load time pulled them in; they
# must be removed before the yearly columns are averaged.
drop_cols = [
    't10_gis_area_k_2010',
    't10_gis_area_l_2010',
    'm10_cen_memi_x_2010',
    'm10_cen_uid_u_2010',
    'c10_cen_uid_u_2010',
    'z10_cen_uid_u_2010',
]

# Rebind the name instead of mutating the frame in place (`inplace=True`
# is discouraged in pandas and hides state changes between cells).
# A missing column still raises KeyError, so typos are not silently ignored.
data_allyrs = data_allyrs.drop(columns=drop_cols)

### Write Intermediate Priority Dataset to File

In [None]:
# Split the joined frame into features and target by column *name* rather
# than by position, so the split does not depend on column ordering.
target_col = 'e(0)'
X_priority_allyrs = data_allyrs.drop(columns=target_col)

# Make sure the output folders exist before writing; a fresh checkout may
# not contain them.
interim_dir = data_path / 'interim'
processed_dir = data_path / 'processed'
for out_dir in (interim_dir, processed_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Per-year feature table (still one column per variable/year).
X_priority_allyrs.to_csv(interim_dir / 'X_priority_allyrs.csv')

# no more processing necessary for our target so it goes directly to "processed"
data_allyrs[target_col].to_csv(processed_dir / 'y_priority.csv', header=True)

### Average Variables of Interest

In [None]:
# Re-label each "<variable>_<year>" column as a (variable, year) pair:
# the last 4 characters are the year, and the 5th-from-last character (the
# separator) is dropped. pandas turns the list of tuples into a two-level
# column index, which lets the next step group by variable (level 0).
#
# Guarded so that re-running this cell is a no-op: once relabelled, the
# labels are tuples and the slicing/int() conversion would raise TypeError.
if not isinstance(X_priority_allyrs.columns, pd.MultiIndex):
    X_priority_allyrs.columns = pd.Index(
        [(name[:-5], int(name[-4:])) for name in X_priority_allyrs.columns]
    )

In [None]:
# Average each variable across its yearly columns: group the columns by the
# first level of the column index (the variable name) and take the mean,
# giving one column per variable. NaNs are skipped by `mean()`.
#
# `DataFrame.groupby(axis=1)` is deprecated in pandas 2.1 and removed in
# later releases, so group the rows of the transposed frame instead and
# transpose back; the result has the same rows and (sorted) columns.
# NOTE(review): assumes every remaining column is numeric -- confirm.
X_priority = X_priority_allyrs.T.groupby(level=0).mean().T

### Write Final Priority Dataset to File

In [None]:
# Persist the year-averaged feature table (one column per variable) as the
# final, model-ready dataset.
processed_X_path = data_path / 'processed' / 'X_priority.csv'
X_priority.to_csv(processed_X_path)