In [3]:
import pandas as pd
import numpy as np
import time
import json
from datetime import date
from matplotlib import pyplot as plt

In [2]:
county_level_feat = pd.read_csv('Choose County Level Features.csv')
state_level_feat = pd.read_csv('Choose State Level Features.csv')
county_feat_info = pd.read_csv('clipped_series_table.csv')
state_feat_info = pd.read_csv('state_series_table_all.csv')

In [3]:
chosen_county_feat = county_level_feat.title.values
chosen_state_feat = state_level_feat.title.values

In [4]:
chosen_county_feat_info = county_feat_info[county_feat_info.title.isin(chosen_county_feat)]
print(chosen_county_feat_info.shape)

(205840, 8)


Some chosen features have multiple frequencies: ie Monthly and Annual. Let's get rid of the annual version

In [5]:
feat_freq_count = chosen_county_feat_info.groupby('title').frequency.nunique()
feat_with_multiple_frequencies = feat_freq_count[feat_freq_count > 1].index.values
print(feat_with_multiple_frequencies)

series_to_remove = []
for feat in feat_with_multiple_frequencies:
    df_cur_feat = chosen_county_feat_info[chosen_county_feat_info.title == feat]
    series_to_remove.extend(df_cur_feat[df_cur_feat.frequency == 'Annual'].id.values)

print('Number of series to remove:', len(series_to_remove))
df_chose_county_feat = chosen_county_feat_info[~chosen_county_feat_info.id.isin(series_to_remove)]

['Civilian Labor Force' 'Employed Persons' 'Unemployed Persons'
 'Unemployment Rate']
Number of series to remove: 9584


In [6]:
def num_months(row):
    end = date.fromisoformat(row['observation_end'])
    start = date.fromisoformat(row['observation_start'])
    num_months = (end.year - start.year)*12 + (end.month - start.month)
    return num_months

df_chose_county_feat['num_months'] = df_chose_county_feat.apply(num_months, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chose_county_feat['num_months'] = df_chose_county_feat.apply(num_months, axis=1)


Some features have the same id and title

In [7]:
print(df_chose_county_feat[df_chose_county_feat.duplicated(subset = ['county_id', 'title'], keep=False)][38:])

       frequency                id observation_end observation_start  \
80932    Monthly           KYHALFN      2020-02-01        1990-01-01   
80933    Monthly        KYHANC1LFN      2015-12-01        1990-01-01   
80934    Monthly        KYHANC1URN      2018-01-01        1990-01-01   
80936    Monthly           KYHAURN      2020-02-01        1990-01-01   
240455    Annual  NETMIGNACS022033      2017-01-01        2009-01-01   
240456    Annual  NETMIGNACS022121      2017-01-01        2009-01-01   

            seasonal_adjustment  \
80932   Not Seasonally Adjusted   
80933   Not Seasonally Adjusted   
80934   Not Seasonally Adjusted   
80936   Not Seasonally Adjusted   
240455  Not Seasonally Adjusted   
240456  Not Seasonally Adjusted   

                                                    title  \
80932                                Civilian Labor Force   
80933                                Civilian Labor Force   
80934                                   Unemployment Rate   
80936

In [8]:
series_id_to_remove = ['MHICILBFL12025A052NCEN', 'MHICIUBFL12025A052NCEN', 'MHIFL12025A052NCEN', 
                       'PE5T17FL12025A647NCEN', 'PECILB5T17FL12025A647NCEN', 'PECILBAAFL12025A647NCEN',
                       'PECILBU18FL12025A647NCEN', 'PECIUB5T17FL12025A647NCEN', 'PECIUBAAFL12025A647NCEN',
                       'PECIUBU18FL12025A647NCEN', 'PP5T17FL12025A156NCEN', 'PPAAFL12025A156NCEN', 
                       'PPCILB5T17FL12025A156NCEN', 'PPCILBAAFL12025A156NCEN', 'PPCILBU18FL12025A156NCEN',
                       'PPCIUB5T17FL12025A156NCEN','PPCIUBAAFL12025A156NCEN', 'PPCIUBU18FL12025A156NCEN', 
                       'PPU18FL12025A156NCEN', 'KYHANC1LFN', 'KYHANC1URN', 'NETMIGNACS022121']

print(df_chose_county_feat.shape)
df_chose_county_feat = df_chose_county_feat[~df_chose_county_feat.id.isin(series_id_to_remove)]
print(df_chose_county_feat.shape)

(196256, 9)
(196234, 9)


In [9]:
# df_chose_county_feat.to_csv('county_feat_months.csv')

In [10]:
house_price_sorted = df_chose_county_feat[df_chose_county_feat.title == 'All-Transactions House Price Index'].sort_values('observation_end')
print(house_price_sorted.head(10))

       frequency              id observation_end observation_start  \
138810    Annual  ATNHPIUS36061A      2013-01-01        1998-01-01   
123810    Annual  ATNHPIUS31003A      2016-01-01        1999-01-01   
108335    Annual  ATNHPIUS28025A      2016-01-01        1992-01-01   
98545     Annual  ATNHPIUS26131A      2016-01-01        1998-01-01   
178214    Annual  ATNHPIUS42115A      2016-01-01        1992-01-01   
117263    Annual  ATNHPIUS29125A      2016-01-01        1998-01-01   
223597    Annual  ATNHPIUS54045A      2016-01-01        1998-01-01   
195627    Annual  ATNHPIUS48035A      2017-01-01        1999-01-01   
103683    Annual  ATNHPIUS27081A      2017-01-01        1999-01-01   
68397     Annual  ATNHPIUS19093A      2017-01-01        1998-01-01   

            seasonal_adjustment                               title  \
138810  Not Seasonally Adjusted  All-Transactions House Price Index   
123810  Not Seasonally Adjusted  All-Transactions House Price Index   
108335  Not Seas

By removing 1 county, our ground truth range is 2000-2016 rather than 2000-2013<br>
By removing 6 more counties, our ground truth range is 2000-2017 rather than 2000-2016.<br>
I will remove 6 more counties bc we lose 16 years worth of data for 6 counties but gain 1 year worth of data for 2395 counties

In [11]:
counties_to_remove = house_price_sorted[house_price_sorted.observation_end < '2017-01-01'].county_id.values
print(df_chose_county_feat.shape)
new_df_chose_county_feat = df_chose_county_feat[~df_chose_county_feat.county_id.isin(counties_to_remove)]
print(new_df_chose_county_feat.shape)

(196234, 9)
(195663, 9)


In [12]:
# Now let's see what features we lose if the earliest end date is 2017
feats_end_too_early = new_df_chose_county_feat[new_df_chose_county_feat.observation_end < '2017-01-01']
print(feats_end_too_early.title.unique())

['Rate of Preventable Hospital Admissions'
 'Gross Domestic Product: Private Goods-Producing Industries'
 'Gross Domestic Product: Private Services-Providing Industries'
 'Real Gross Domestic Product: Private Goods-Producing Industries'
 'Real Gross Domestic Product: Private Services-Providing Industries'
 'Combined Violent and Property Crime Incidents Known to Law Enforcement'
 'White to Non-White Racial Dissimilarity Index' 'Premature Death Rate'
 'Age-Adjusted Premature Death Rate']


I think we are ok to lose preventable hospital admissions, premature death rate, age-adjusted premature death rate. Let's see if there are any measures of GDP that we would still keep and if we could keep Crime

In [13]:
print(new_df_chose_county_feat[new_df_chose_county_feat.title.str.match('.*domestic product', case=False)].title.unique())

['Gross Domestic Product: All Industries'
 'Gross Domestic Product: Private Goods-Producing Industries'
 'Gross Domestic Product: Government and Government Enterprises'
 'Gross Domestic Product: Private Services-Providing Industries'
 'Real Gross Domestic Product: All Industries'
 'Real Gross Domestic Product: Private Goods-Producing Industries'
 'Real Gross Domestic Product: Government and Government Enterprises'
 'Real Gross Domestic Product: Private Services-Providing Industries']


Looks like we would still keep GDP all industries which should be fine. Let's look at crime

In [14]:
crime_feats = new_df_chose_county_feat[new_df_chose_county_feat.title.str.match('.*crime', case=False)]
print(crime_feats.title.unique())

['Combined Violent and Property Crime Incidents Known to Law Enforcement']


So that's the only feature related to crime. Let's see how many counties end before 2017

In [15]:
print(crime_feats[crime_feats.observation_end < '2017-01-01'].shape)

(173, 9)


So we have to lose 173 counties to keep this feature for 2222 counties. Let's wait to make this decision for now. <br>
First lets drop the features that end too early that we found earlier

In [16]:
feat_names_to_drop = feats_end_too_early.title.unique()
shortlist = list(feat_names_to_drop)
shortlist.remove('Combined Violent and Property Crime Incidents Known to Law Enforcement')
print(shortlist)
print(new_df_chose_county_feat.shape)
df_county_feat_end_clipped = new_df_chose_county_feat[~new_df_chose_county_feat.title.isin(shortlist)]
print(df_county_feat_end_clipped.shape)

['Rate of Preventable Hospital Admissions', 'Gross Domestic Product: Private Goods-Producing Industries', 'Gross Domestic Product: Private Services-Providing Industries', 'Real Gross Domestic Product: Private Goods-Producing Industries', 'Real Gross Domestic Product: Private Services-Providing Industries', 'White to Non-White Racial Dissimilarity Index', 'Premature Death Rate', 'Age-Adjusted Premature Death Rate']
(195663, 9)
(176715, 9)


Now let's look at features that start too late

Note, we can choose to drop 9 more counties to start 1 year earlier in 1/1/1999. So we lose 1 year of data for 9 counties to add 1 year of data for 2213 counties

In [17]:
# Now let's see what features we lose if the latest start date is 2000
feats_start_too_late = df_county_feat_end_clipped[df_county_feat_end_clipped.observation_start > '1999-01-01']
print(len(feats_start_too_late.title.unique()))

69


Wow looks like there are a lot of features that have at least 1 county with a late start date. Let's look into these

In [18]:
diff_start_dates = feats_start_too_late.observation_start.unique()
diff_start_dates.sort()
print(diff_start_dates)

['2000-01-01' '2001-01-01' '2002-01-01' '2003-01-01' '2004-01-01'
 '2005-01-01' '2006-01-01' '2007-01-01' '2008-01-01' '2009-01-01'
 '2010-01-01' '2011-01-01' '2012-01-01' '2013-01-01' '2014-01-01'
 '2015-01-01']


In [19]:
cur_late = df_county_feat_end_clipped[df_county_feat_end_clipped.observation_start > '2010-01-01']
print(cur_late.title.unique())

['Percent of Population Below the Poverty Level'
 'Combined Violent and Property Crime Incidents Known to Law Enforcement'
 'New Private Housing Structures Authorized by Building Permits']


- For crime we'd lose ~225 counties to maintain a date range of 2010-2017
- For new private housing we can drop 3 counties and maintain a start date of 2004
- Percent of pop below poverty level starts in 2012
- Income Inequality starts in 2010
- Burdened Households starts in 2010
- High school grad starts in 2010
- Bach or higher starts in 2010
- Socioeconomic population estimate starts in 2008

### Basically i think we should try to keep 2010 to 2017 as our date range

In [20]:
# First let's drop the 3 counties for private housing structures.
# Then let's drop the 225 counties for crime
# Then let's drop percent of pop below poverty level as a feature and we should be good

late_counties = df_county_feat_end_clipped[df_county_feat_end_clipped.observation_start > '2010-01-01']
housing_counties = late_counties[late_counties.title == 'New Private Housing Structures Authorized by Building Permits'].county_id.values
late_crime_counties = late_counties[late_counties.title == 'Combined Violent and Property Crime Incidents Known to Law Enforcement'].county_id.values
poverty_feat = late_counties[late_counties.title == 'Percent of Population Below the Poverty Level'].id.values
early_counties = df_county_feat_end_clipped[df_county_feat_end_clipped.observation_end < '2017-01-01'].county_id.values

counties_to_drop = []
counties_to_drop.extend(housing_counties)
counties_to_drop.extend(late_crime_counties)
counties_to_drop.extend(early_counties)

print(df_county_feat_end_clipped.shape)
df_feat_dropped_county = df_county_feat_end_clipped[~df_county_feat_end_clipped.county_id.isin(counties_to_drop)]
print(df_feat_dropped_county.shape)

(176715, 9)
(160218, 9)


In [21]:
df_county_feat_trimmed = df_feat_dropped_county[~df_feat_dropped_county.id.isin(poverty_feat)]
print(df_county_feat_trimmed.shape)

(158046, 9)


Now every feature in our dataset should have data from 2010-2017 

In [22]:
print(df_county_feat_trimmed.observation_start.max())
print(df_county_feat_trimmed.observation_end.min())

2010-01-01
2017-01-01


Time Range Looks good!<br>
Let's see how many features we have

In [23]:
print(df_county_feat_trimmed.title.nunique())
print(df_county_feat_trimmed.county_id.nunique())

73
2172


#### Check that we have each feature for every county

In [24]:
agg_county = df_county_feat_trimmed.groupby('title').county_id.count()
print(agg_county.sort_values())

title
Combined Violent and Property Crime Incidents Known to Law Enforcement    2010
Real Gross Domestic Product: Government and Government Enterprises        2121
Real Gross Domestic Product: All Industries                               2121
Gross Domestic Product: Government and Government Enterprises             2121
Gross Domestic Product: All Industries                                    2121
                                                                          ... 
Estimated Percent of Related Children Age 5-17 in Families in Poverty     2172
High School Graduate or Higher (5-year estimate)                          2172
Homeownership Rate (5-year estimate)                                      2172
Bachelor's Degree or Higher (5-year estimate)                             2172
Unemployment Rate                                                         2172
Name: county_id, Length: 73, dtype: int64


We do not. We need to drop the counties we don't have all the features for

In [25]:
counties_to_keep = df_county_feat_trimmed[df_county_feat_trimmed.title.str.match('combined violent', case=False)].county_id.values
final_trimmed_county_data = df_county_feat_trimmed[df_county_feat_trimmed.county_id.isin(counties_to_keep)]
print(final_trimmed_county_data.shape)

(146567, 9)


Again let's check the number of counties, features, and check that we have every feature for each county

In [26]:
print(final_trimmed_county_data.observation_start.max())
print(final_trimmed_county_data.observation_end.min())
print(final_trimmed_county_data.title.nunique())
print(final_trimmed_county_data.county_id.nunique())
agg_county_fin = final_trimmed_county_data.groupby('title').county_id.count()
print(agg_county_fin[agg_county_fin < 2010])

2010-01-01
2017-01-01
73
2010
title
Gross Domestic Product: All Industries                                1985
Gross Domestic Product: Government and Government Enterprises         1985
Net County-to-County Migration Flow (5-year estimate)                 2009
New Private Housing Structures Authorized by Building Permits         1996
Per Capita Personal Income                                            1986
Personal Income                                                       1986
Real Gross Domestic Product: All Industries                           1985
Real Gross Domestic Product: Government and Government Enterprises    1985
Name: county_id, dtype: int64


In [27]:
counties_keep = final_trimmed_county_data[final_trimmed_county_data.title == 'Real Gross Domestic Product: All Industries'].county_id.unique()
trim_c_data = final_trimmed_county_data[final_trimmed_county_data.county_id.isin(counties_keep)]
print(trim_c_data.shape)

(144890, 9)


In [28]:
print(trim_c_data.observation_start.max())
print(trim_c_data.observation_end.min())
print(trim_c_data.title.nunique())
print(trim_c_data.county_id.nunique())
agg_county_fin = trim_c_data.groupby('title').county_id.count()
print(agg_county_fin)

2010-01-01
2017-01-01
73
1985
title
90% Confidence Interval Lower Bound of Estimate of Median Household Income                     1985
90% Confidence Interval Lower Bound of Estimate of People Age 0-17 in Poverty                  1985
90% Confidence Interval Lower Bound of Estimate of People of All Ages in Poverty               1985
90% Confidence Interval Lower Bound of Estimate of Percent of People Age 0-17 in Poverty       1985
90% Confidence Interval Lower Bound of Estimate of Percent of People of All Ages in Poverty    1985
                                                                                               ... 
Resident Population                                                                            1985
SNAP Benefits Recipients                                                                       1985
Single-parent Households with Children as a Percentage of Households with Children             1985
Unemployed Persons                                              

In [30]:
trim_c_data.to_csv('county_features_trimmed.csv', index=False)