In [1]:
import pandas as pd
import numpy as np
import time
import json
from datetime import date
from matplotlib import pyplot as plt

In [2]:
# Hand-picked state-level features; the `title` column is read by later cells.
state_level_feat = pd.read_csv('Choose State Level Features.csv')
# Metadata for the state-level series (id, title, observation window,
# frequency, units, seasonal adjustment, state_id).
state_feat_info = pd.read_csv('state_series_table_all.csv')

In [3]:
# Report (rows, columns) of both tables, one tuple per line.
print(state_level_feat.shape, state_feat_info.shape, sep='\n')

(229, 2)
(85514, 8)


In [4]:
# Titles of the chosen features, as an array used for membership tests below.
chosen_state_feat = state_level_feat['title'].values

In [5]:
# Trim surrounding whitespace from the series titles so they can be compared
# with the chosen feature titles. The vectorised `.str` accessor replaces a
# row-wise `apply(axis=1)`, which is slow on a table this size and raises an
# AttributeError as soon as one title is missing (NaN).
state_feat_info['title'] = state_feat_info['title'].str.strip()
print(state_feat_info.title.values)
# Keep only the series whose title is one of the chosen features.
chosen_state_feat_info = state_feat_info[state_feat_info.title.isin(chosen_state_feat)]
print(chosen_state_feat_info.shape)

['Housing Inventory: Active Listing Count in'
 'Housing Inventory: Active Listing Count Month-Over-Month in'
 'Housing Inventory: Active Listing Count Year-Over-Year in' ...
 'Deposits in Insured Commercial Nonmember Banks in'
 'Total Deposits in Commercial Nonmember Banks in'
 'Total Deposits in Commercial Banks in']
(14496, 8)


In [6]:
# Series whose observations begin after 2010-01-01 (compared against an ISO
# date string).
starts_late = chosen_state_feat_info['observation_start'] > '2010-01-01'
late_features = chosen_state_feat_info.loc[starts_late]
print(late_features.shape)
# Drop every feature (title) that has at least one late-starting series.
late_features_to_drop = late_features['title'].unique()
has_late_title = chosen_state_feat_info['title'].isin(late_features_to_drop)
state_feat_end_trim = chosen_state_feat_info.loc[~has_late_title]
print(state_feat_end_trim.shape)

(1502, 8)
(12945, 8)


In [7]:
# Series whose observations end before 2017-01-01 (compared against an ISO
# date string).
ends_early = state_feat_end_trim['observation_end'] < '2017-01-01'
early_features = state_feat_end_trim.loc[ends_early]
print(early_features.shape)
# Drop every feature (title) that has at least one early-ending series.
early_features_to_drop = early_features['title'].unique()
has_early_title = state_feat_end_trim['title'].isin(early_features_to_drop)
state_feat_trim = state_feat_end_trim.loc[~has_early_title]
print(state_feat_trim.shape)

(826, 8)
(11984, 8)


#### Check that we have each feature for every state that a county belongs to

In [50]:
# Load the trimmed county features and the county table (county_id, name,
# state_id).
county_feat_trimmed = pd.read_csv('county_features_trimmed.csv')
county_table = pd.read_csv('clipped_county_table.csv')
# One row per distinct county id that appears in the county features.
unique_county_ids = county_feat_trimmed['county_id'].unique()
df_county_ids = pd.DataFrame({'county_id': unique_county_ids})

In [51]:
# Attach the county-table row of every county id; the left join keeps a county
# id even when the county table has no match for it.
joined = pd.merge(df_county_ids, county_table, how='left', on='county_id')

In [52]:
# Distinct state ids that the counties in use map to, and how many there are.
states_needed = joined['state_id'].unique()
print(len(states_needed))

51


Make sure we have every feature for every state. First, check whether any feature is reported at more than one frequency.

In [53]:
# Count the distinct reporting frequencies of every feature and keep the titles
# that are published at more than one frequency.
freq_by_title = state_feat_trim.groupby('title').frequency
state_freq_count = freq_by_title.nunique()
feat_with_multiple_frequencies = state_freq_count[state_freq_count > 1].index.values

# Print the frequencies each of those features comes in, one feature per line.
# A single grouped `unique()` replaces re-filtering the whole table once per
# feature, and the unused `series_to_remove = []` is gone (the cell that fills
# the list initialises it itself).
for freqs in freq_by_title.unique().loc[feat_with_multiple_frequencies]:
    print(freqs)

['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Monthly' 'Annual']
['Annual' 'Quarterly']
['Annual' 'Monthly']
['Monthly' 'Annual']


In [54]:
# Resolve features that are published at more than one frequency so that each
# keeps a single one. The monthly/annual split is derived from the data (does
# the feature have a 'Monthly' series?) rather than from hard-coded positions
# in `feat_with_multiple_frequencies`, which would silently select the wrong
# features as soon as the input files or an earlier filter changed.
series_to_remove = []

multi_freq_rows = state_feat_trim[state_feat_trim.title.isin(feat_with_multiple_frequencies)]
titles_with_monthly = set(multi_freq_rows.loc[multi_freq_rows.frequency == 'Monthly', 'title'])

# Features with a monthly series keep it; all remaining ones keep the annual series.
monthly_feat = [feat for feat in feat_with_multiple_frequencies if feat in titles_with_monthly]
annual_feat = [feat for feat in feat_with_multiple_frequencies if feat not in titles_with_monthly]

# Annual-preferred features: discard their quarterly series.
for feat in annual_feat:
    df_cur_feat = state_feat_trim[state_feat_trim.title == feat]
    series_to_remove.extend(df_cur_feat[df_cur_feat.frequency == 'Quarterly'].id.values)

# Monthly-preferred features: discard their annual series.
for feat in monthly_feat:
    df_cur_feat = state_feat_trim[state_feat_trim.title == feat]
    series_to_remove.extend(df_cur_feat[df_cur_feat.frequency == 'Annual'].id.values)

print('Number of series to remove:', len(series_to_remove))
df_state_feat = state_feat_trim[~state_feat_trim.id.isin(series_to_remove)]
print(df_state_feat.shape)

Number of series to remove: 1151
(10833, 8)


In [56]:
# Keep only the series of states that appear in `states_needed`.
in_needed_state = df_state_feat['state_id'].isin(states_needed)
df_state_feat_final = df_state_feat.loc[in_needed_state]
print(df_state_feat_final.shape)

(10830, 8)


In [83]:
# Coverage check: latest start date, earliest end date, number of distinct
# features and number of distinct states, one value per line.
print(
    df_state_feat_final['observation_start'].max(),
    df_state_feat_final['observation_end'].min(),
    df_state_feat_final['title'].nunique(),
    df_state_feat_final['state_id'].nunique(),
    sep='\n',
)
# Series count per feature; list the 32 features with the fewest series.
agg_state = df_state_feat_final.groupby('title')['state_id'].count()
fewest_series_first = agg_state.sort_values()
print(fewest_series_first.head(32).index)

2008-01-01
2017-01-01
181
51
Index(['Not in Labor Force: Discouraged Workers for',
       'Job Losers and Persons Who Completed Temporary Jobs, as a Percent of the Civilian Labor Force for',
       'Insured Unemployment Rate in', 'Initial Claims in',
       'State Unemployment Benefits in', 'Civilian Labor Force for',
       'Coincident Economic Activity Index for', 'Resident Population in',
       'Rental Vacancy Rate for', 'Continued Claims (Insured Unemployment) in',
       'Covered Employment in',
       'Number of Civilians Unemployed for 15 Weeks or Longer for',
       'Real Trade-Weighted Value of the dollar for',
       'Real Total Gross Domestic Product for', 'Homeownership Rate for',
       'Home Vacancy Rate for', 'Employed Involuntary Part-Time for',
       'Employment Level for', 'Total Gross Domestic Product for',
       'Leading Index for', 'Real Median Household Income in',
       'All-Transactions House Price Index for',
       'Persons Unemployed 15 Weeks or Longer, a

In [72]:
# States that have a feature whose title starts with 'Not in Labor Force',
# compared with all states; the difference is the states lacking that feature.
title_matches = df_state_feat_final['title'].str.match('Not in Labor Force')
red_states = df_state_feat_final.loc[title_matches, 'state_id'].unique()
all_states = df_state_feat_final['state_id'].unique()
print(set(all_states).difference(red_states))

{27290}


State 27290 corresponds to DC. How many counties does DC have?

In [85]:
# Show the county-table rows that belong to state 27290.
print(county_table.loc[county_table['state_id'] == 27290])

     county_id                  name  state_id
232    33508.0  District of Columbia     27290


Ok, so we will drop county 33508 from the dataset, and drop state 27290.

In [107]:
# Exclude every series that belongs to state 27290.
df_state_feat_out = df_state_feat_final.loc[df_state_feat_final['state_id'] != 27290]
print(df_state_feat_out.shape)

(10694, 8)


In [108]:
# Coverage check after excluding state 27290: latest start date, earliest end
# date, number of distinct features and number of distinct states.
print(
    df_state_feat_out['observation_start'].max(),
    df_state_feat_out['observation_end'].min(),
    df_state_feat_out['title'].nunique(),
    df_state_feat_out['state_id'].nunique(),
    sep='\n',
)
# Series count per feature, smallest first.
agg_state_out = df_state_feat_out.groupby('title')['state_id'].count()
print(agg_state_out.sort_values())

2008-01-01
2017-01-01
181
50
title
Accommodation and Food Services Earnings in                                           50
Poverty Universe, Age 5-17 related for                                                50
Poverty Universe, All Ages for                                                        50
Poverty, Child Tax Exemptions for                                                     50
Professional and Technical Services Earnings in                                       50
                                                                                    ... 
Average Weekly Earnings of Production Employees: Manufacturing in                    100
Average Weekly Earnings of All Employees: Trade, Transportation, and Utilities in    100
Average Weekly Earnings of All Employees: Professional and Business Services in      100
New Private Housing Units Authorized by Building Permits for                         100
All Employees: Wholesale Trade in                                          

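Some features have 100 series for only 50 states, so it looks like those features have duplicate series per state. Check whether the duplicates differ in their seasonal adjustment.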

In [116]:
# Features whose series come in more than one seasonal-adjustment variant.
df_out_adj_count = df_state_feat_out.groupby('title')['seasonal_adjustment'].nunique()
df_out_mult_adj = df_out_adj_count.loc[df_out_adj_count > 1].index.values

# For each of those features, collect the ids of its 'Not Seasonally Adjusted'
# series (feature by feature, in table order within a feature).
is_not_adjusted = df_state_feat_out['seasonal_adjustment'] == 'Not Seasonally Adjusted'
series_to_remove = [
    series_id
    for feat in df_out_mult_adj
    for series_id in df_state_feat_out.loc[
        (df_state_feat_out['title'] == feat) & is_not_adjusted, 'id'
    ].values
]

print(len(series_to_remove))

1642


Remove series that are not seasonally adjusted and keep the seasonally adjusted ones

In [117]:
# Drop the series whose id was collected in `series_to_remove`.
is_flagged = df_state_feat_out['id'].isin(series_to_remove)
df_out_fin = df_state_feat_out.loc[~is_flagged]
print(df_out_fin.shape)

(9052, 8)


In [118]:
# Coverage check after removing the flagged series: latest start date, earliest
# end date, number of distinct features and number of distinct states.
print(
    df_out_fin['observation_start'].max(),
    df_out_fin['observation_end'].min(),
    df_out_fin['title'].nunique(),
    df_out_fin['state_id'].nunique(),
    sep='\n',
)
# Series count per feature, smallest first.
agg_fin = df_out_fin.groupby('title')['state_id'].count()
print(agg_fin.sort_values())

2008-01-01
2017-01-01
181
50
title
Accommodation and Food Services Earnings in                      50
Professional and Technical Services Earnings in                  50
Professional and Technical Services Wages and Salaries in        50
Projected Business Formations Within 4 Quarters for              50
Projected Business Formations within 8 Quarters for              50
                                                                 ..
Implicit Regional Price Deflator: Metropolitan Portion for       50
Implicit Regional Price Deflator: Nonmetropolitan Portion for    50
Imports of Goods for                                             50
Exports of Goods for                                             50
All Employees: Wholesale Trade in                                52
Name: state_id, Length: 181, dtype: int64


`All Employees: Wholesale Trade in` still has 52 series for 50 states, so some states still have a duplicate series for it.

In [128]:
# Rows whose title starts with 'All Employees: Wholesale', shown separately for
# states 27300 and 27323.
temp = df_out_fin.loc[df_out_fin['title'].str.match('All Employees: Wholesale')]
for inspected_state_id in (27300, 27323):
    print(temp.loc[temp['state_id'] == inspected_state_id])

                           id                              title  \
30612    SMS22000004100000001  All Employees: Wholesale Trade in   
30798  SMU22000004100000001SA  All Employees: Wholesale Trade in   

      observation_start observation_end frequency                 units  \
30612        1990-01-01      2020-02-01   Monthly  Thousands of Persons   
30798        1990-01-01      2017-01-01   Monthly  Thousands of Persons   

       seasonal_adjustment  state_id  
30612  Seasonally Adjusted     27300  
30798  Seasonally Adjusted     27300  
                           id                              title  \
67609    SMS45000004100000001  All Employees: Wholesale Trade in   
67769  SMU45000004100000001SA  All Employees: Wholesale Trade in   

      observation_start observation_end frequency                 units  \
67609        1990-01-01      2020-02-01   Monthly  Thousands of Persons   
67769        1990-01-01      2020-02-01   Monthly  Thousands of Persons   

       seasonal_adjus

We will drop SMS22000004100000001 and SMS45000004100000001

In [130]:
# Remove the two series ids listed below from the state features.
duplicate_series_ids = ['SMS22000004100000001', 'SMS45000004100000001']
df_state_features_out = df_out_fin.loc[~df_out_fin['id'].isin(duplicate_series_ids)]

In [131]:
# Coverage check on the final state features: latest start date, earliest end
# date, number of distinct features and number of distinct states.
print(
    df_state_features_out['observation_start'].max(),
    df_state_features_out['observation_end'].min(),
    df_state_features_out['title'].nunique(),
    df_state_features_out['state_id'].nunique(),
    sep='\n',
)
# Series count per feature, smallest first.
agg_fin = df_state_features_out.groupby('title')['state_id'].count()
print(agg_fin.sort_values())

2008-01-01
2017-01-01
181
50
title
Accommodation and Food Services Earnings in                      50
Professional and Technical Services Earnings in                  50
Professional and Technical Services Wages and Salaries in        50
Projected Business Formations Within 4 Quarters for              50
Projected Business Formations within 8 Quarters for              50
                                                                 ..
Implicit Regional Price Deflator for                             50
Implicit Regional Price Deflator: Metropolitan Portion for       50
Implicit Regional Price Deflator: Nonmetropolitan Portion for    50
Imports of Goods: Manufactured Commodities for                   50
Wholesale Trade Wages and Salaries in                            50
Name: state_id, Length: 181, dtype: int64


Next we must drop DC from the county features too

In [133]:
# Reload the trimmed county features and drop the rows of county 33508.
df_county_feat_trimmed = pd.read_csv('county_features_trimmed.csv')
print(df_county_feat_trimmed.shape)
is_dropped_county = df_county_feat_trimmed['county_id'] == 33508
df_county_feat_out = df_county_feat_trimmed.loc[~is_dropped_county]
print(df_county_feat_out.shape)

(158046, 9)
(157982, 9)


### Finally

In [134]:
# Write the final county-level and state-level feature tables to CSV.
# NOTE(review): `index` is left at its default, so the DataFrame index is
# written as an extra unnamed first column — confirm downstream readers expect it.
df_county_feat_out.to_csv('county_features_final.csv')
df_state_features_out.to_csv('state_features_final.csv')