In [32]:
# general imports
import requests as rq
import numpy as np
import pandas as pd
from urllib.request import urlopen
import json
import os
import pathlib
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# pandas formatting 
pd.set_option("display.max_rows", 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
# load county information for plotly from saved file 
with open('counties.json','r') as jin:
    counties = json.load(jin)

In [33]:
# grab a prebuilt fips mapping 
r = rq.get('https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv')
cleaned = r.text.replace('\r\n','\n')

In [55]:
# dir(r)
with open('temp.csv', 'w',encoding='utf-8') as tempout:
    tempout.write(cleaned)

In [85]:
f2c_map = pd.read_csv('temp.csv', usecols=['fips','county_name','state_name'])


In [86]:
newidx = f2c_map.set_index('fips')

In [91]:
# newidx.head()
# f2c_dict =  
len(list(f2c_map['fips'].unique()))
f2c_map['fips']

3145

In [83]:
with open("fips_map_to_county.json","w") as outfile:
    json.dump(f2c_map.to_json(orient="index"),outfile)

ValueError: DataFrame index must be unique for orient='index'.

In [81]:
f2c_map

Unnamed: 0,fips,county_name,state_name
0,1001,Autauga County,Alabama
1,1003,Baldwin County,Alabama
2,1005,Barbour County,Alabama
3,1007,Bibb County,Alabama
4,1009,Blount County,Alabama
...,...,...,...
3141,56037,Sweetwater County,Wyoming
3142,56039,Teton County,Wyoming
3143,56041,Uinta County,Wyoming
3144,56043,Washakie County,Wyoming


In [76]:
jdata = json.loads(f2c_map.to_json(orient="index"))

In [80]:
jdata['0']

{'fips': 1001, 'county_name': 'Autauga County', 'state_name': 'Alabama'}

In [4]:

def FIPS_function(row):
    state = str(row['State Code']).zfill(2)
    county = str(row['County Code']).zfill(3)
    return str(state + county)
# convert to NO2 ug/m^3 for reference
def no2_mass_by_vol(ppb):
    ugm3 = 1.88*ppb
    return ugm3



def set_daily_cases_deaths(df):
    df['daily_new_cases'] = df['JHU_ConfirmedCases.data'].diff()
    df['daily_new_deaths'] = df['JHU_ConfirmedDeaths.data'].diff()
    return df
# plotting one day's avg 
def show_day_mean(df, date):
    fig = px.choropleth(df[df['Date Local']==date], geojson=counties, locations='fips', color='Arithmetic Mean',
                               color_continuous_scale="Plasma",
                               range_color=(0, 70), #max value for daily avg is ~60ppb
                               scope="usa",
                               labels={'Arithmetic Mean':'Arithmetic Mean (ppb)'}
                              )
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":1,'autoexpand':True })
    fig.update_layout(
        autosize=False,
        width=1200,
        height=900,
    )
    return fig


# plotting one day's max value
def show_day_max(df, date):
    fig = px.choropleth(df[df['Date Local']==date], geojson=counties, locations='fips', color='1st Max Value',
                               color_continuous_scale="Plasma",
                               range_color=(0, 70), #max value for daily avg is ~60ppb
                               scope="usa",
                               labels={'1st Max Value':'1st Max Value (ppb)'}
                              )
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":1,'autoexpand':True })
    fig.update_layout(
        autosize=False,
        width=1200,
        height=900,
    )
    return fig   
def show_sites(df):
    fig = px.choropleth(df, geojson=counties, locations='fips', color='Parameter Code',
                               color_continuous_scale="Plasma",
                               range_color=(0, 1), #max value for daily avg is ~60ppb
                               scope="usa",
                               labels={'1st Max Value':'1st Max Value (ppb)'}
                              )
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":1,'autoexpand':True })
    fig.update_layout(
        autosize=False,
        width=1200,
        height=900,
    )
    return fig

# Loading EPA Data

In [5]:
cwd = pathlib.Path.cwd()
air_data_dir = cwd.joinpath("data","air_quality")


In [6]:
# no2_20 = pd.read_csv("./data/air_quality/no2/daily_no2_2020_with_FIPS.csv", dtype={'fips':'string'})
no2_20 = pd.read_csv(air_data_dir.joinpath("no2","daily_no2_2020_with_FIPS.csv"), dtype={'fips':'string'})
# covid1 = pd.read_csv(cwd.joinpath("data","covid",","daily_no2_2020_with_FIPS.csv"), dtype={'fips':'string'})

#### Argument for removing columns
 
We are looking for a link between 2 datasets, for now we will assume things like the 'POC' field (This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.) are unimportant to our investigation.

Similarly we will remove the method code, method name, ect.

more information on the field definitions can be found here -> https://aqs.epa.gov/aqsweb/airdata/FileFormats.html#_content_4

In [7]:
no2_columns = ['Date Local', 'fips', 'Arithmetic Mean','1st Max Value', '1st Max Hour',
               'AQI', 'Units of Measure', 'Event Type', 'Mean ugm3', 'Site Num', 'Observation Count',
               'Observation Percent', 'Longitude', 'Latitude', 'Site Num','Local Site Name', 'Address', 
               'State Name', 'County Name', 'CBSA Name', 'State Code', 'County Code']
slim_no2 = no2_20[no2_columns]
slim_no2.head()

Unnamed: 0,Date Local,fips,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Units of Measure,Event Type,Mean ugm3,Site Num,Observation Count,Observation Percent,Longitude,Latitude,Site Num.1,Local Site Name,Address,State Name,County Name,CBSA Name,State Code,County Code
0,2020-01-01,1073,15.752381,28.2,6,26,Parts per billion,,29.614476,23,21,88.0,-86.815,33.553056,23,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,"Birmingham-Hoover, AL",1,73
1,2020-01-02,1073,9.595833,16.3,10,15,Parts per billion,,18.040166,23,24,100.0,-86.815,33.553056,23,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,"Birmingham-Hoover, AL",1,73
2,2020-01-03,1073,17.3,31.6,10,29,Parts per billion,,32.524,23,24,100.0,-86.815,33.553056,23,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,"Birmingham-Hoover, AL",1,73
3,2020-01-04,1073,2.791667,7.5,23,7,Parts per billion,,5.248334,23,24,100.0,-86.815,33.553056,23,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,"Birmingham-Hoover, AL",1,73
4,2020-01-05,1073,14.408333,34.0,19,32,Parts per billion,,27.087666,23,24,100.0,-86.815,33.553056,23,North Birmingham,"NO. B'HAM,SOU R.R., 3009 28TH ST. NO.",Alabama,Jefferson,"Birmingham-Hoover, AL",1,73


In [8]:

# multiple sites in one county
slim_no2.loc[(slim_no2['Date Local']=='2020-01-01') & (slim_no2['fips']=='06001')]

Unnamed: 0,Date Local,fips,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Units of Measure,Event Type,Mean ugm3,Site Num,Observation Count,Observation Percent,Longitude,Latitude,Site Num.1,Local Site Name,Address,State Name,County Name,CBSA Name,State Code,County Code
4002,2020-01-01,6001,7.865217,18.5,1,17,Parts per billion,,14.786608,7,23,96.0,-121.784217,37.687526,7,Livermore,793 Rincon Ave.,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1
4368,2020-01-01,6001,12.617391,21.3,0,20,Parts per billion,,23.720695,9,23,96.0,-122.169935,37.743065,9,Oakland,9925 International Blvd,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1
4733,2020-01-01,6001,15.921739,28.0,21,26,Parts per billion,,29.932869,11,23,96.0,-122.282347,37.814781,11,Oakland West,1100 21st Street,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1
5096,2020-01-01,6001,15.934783,23.2,21,22,Parts per billion,,29.957392,12,23,96.0,-122.263376,37.793624,12,Laney College,Laney College Eighth St. parking lot Aisle J,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1
5462,2020-01-01,6001,12.826087,23.5,0,22,Parts per billion,,24.113044,13,23,96.0,-122.302741,37.864767,13,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1
5828,2020-01-01,6001,10.808696,18.7,0,17,Parts per billion,,20.320348,15,23,96.0,-121.903019,37.701222,15,Pleasanton - Owens Ct,Owens Ct.,California,Alameda,"San Francisco-Oakland-Hayward, CA",6,1


In [9]:
date_only=slim_no2.groupby(['Date Local'])
date_fips=slim_no2.groupby(['Date Local','fips'])

In [10]:
# date_only.
# date_fips.get_group(('2020-01-01', '01073')
county_mean=date_fips.agg({'Arithmetic Mean':'mean'})

In [11]:
# county_mean[(county_mean['Date Local']=='2020-01-01')&(county_mean['fips']=='01073')]
county_mean.loc[('2020-01-01','01073')]

Arithmetic Mean    16.373918
Name: (2020-01-01, 01073), dtype: float64

### Multiple sample sites

Some counties have multiple air monitoring stations. Below code builds a dictionary whose keys are the fips codes with multiple sites.

In [12]:
all_fips = list(no2_20['fips'].unique())
county_site_count = {}
for code in all_fips:
    site_count = no2_20[(no2_20['Date Local']=='2020-01-01')&(no2_20['fips']== code)]['Site Num'].count()
    if site_count > 1:
        county_site_count[code] = site_count

For a quick naive solution we can average the averages for each county, then take the max recorded across the counties.

In [21]:
no2_20[(no2_20['fips']=='34013') & (no2_20['Date Local'] == '2020-01-24')]

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,Pollutant Standard,Date Local,Units of Measure,Event Type,Observation Count,Observation Percent,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Method Code,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change,fips,Mean ugm3
85907,34,13,3,42602,1,40.720989,-74.192892,WGS84,Nitrogen dioxide (NO2),1 HOUR,NO2 1-hour,2020-01-24,Parts per billion,,24,100.0,55.7875,65.0,6,63,74,INSTRUMENTAL - CHEMILUMINESCENCE,Newark Firehouse,360 Clinton Avenue,New Jersey,Essex,Newark,"New York-Newark-Jersey City, NY-NJ-PA",2021-02-25,34013,104.8805


In [14]:
def avg_county_pollution(df, column_to_avg = "Arithmetic Mean", date_column=None):
    
    
    if date_column == None:
        date_column = 'Date Local'
    # All days 
    # days = list(df[date_column].unique())
    # all_fips = list(df['fips'].unique())
    avg_by_day = df.groupby(['Date Local','fips']).agg({'Arithmetic Mean': 'mean','1st Max Value':'max','AQI':'max'})
    return avg_by_day

def load_covid_county(fips, county=None, state=None):
    covid_data_dir = cwd.joinpath("data","covid","processed_data","county_merged_parts")
    

In [16]:
squashed_no2 = avg_county_pollution(no2_20)
squashed_no2.to_csv(air_data_dir.joinpath("no2","daily_2020_squashed.csv"))

In [19]:

squashed_no2['Arithmetic Mean'].idxmax()
# squashed_no2.to_pickle(air_data_dir.joinpath("no2","daily_2020_multi_index.pkl"))

('2020-01-24', '34013')

# exploring Covid data

In [133]:
minimal_cols = ['JHU_ConfirmedCases.data',
                'JHU_ConfirmedDeaths.data',
                'JHU_ConfirmedRecoveries.data',
                'TotalPopulation.data',
                'MaleAndFemale_AtLeast65_Population.data',
                'Male_Total_Population.data',
                'Female_Total_Population.data',
                'hospitalIcuBeds',
                'hospitalStaffedBeds',
                'fips']


In [166]:
abbeville = pd.read_pickle(cwd.joinpath("data","covid","processed_data","county_merged_parts","Abbeville_SouthCarolina_UnitedStates.pkl"))
other = pd.read_pickle(cwd.joinpath("data","covid","processed_data","county_merged_parts","Acadia_Louisiana_UnitedStates.pkl"))

In [185]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
            ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
arrays2 = [['bar', 'bar', 'test', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
            ['one', 'two','three', 'one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
t2 = list(zip(*arrays2))
index1 = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index2 = pd.MultiIndex.from_tuples(t2, names=['third', 'fourth'])

s1 = pd.DataFrame(np.random.randn(8), index=index1, columns=['s1'])
s2 = pd.DataFrame(np.random.randn(9), index=index2, columns=['s2'])

In [186]:
s1.merge(s2, left_index=True, right_on=['third', 'fourth'])

Unnamed: 0_level_0,Unnamed: 1_level_0,s1,s2
third,fourth,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.319901,-0.491907
bar,two,1.105679,-0.752759
baz,one,-0.871174,0.190294
baz,two,-1.546483,-0.49155
foo,one,0.620842,0.439202
foo,two,-0.432264,-2.638884
qux,one,-1.089623,-0.076361
qux,two,-1.637294,-3.275784


In [158]:
s1.merge(s2, left_index=True, right_on=['third', 'fourth'])

Unnamed: 0_level_0,Unnamed: 1_level_0,s1,s2
third,fourth,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.147225,-1.656935
bar,two,-0.017699,0.214461
baz,one,-0.480291,0.406391
baz,two,2.106873,0.363508
foo,one,1.534973,-0.066036
foo,two,0.077661,-1.800659
qux,one,1.170215,-0.712484
qux,two,1.334965,-0.75559


In [167]:
a1 = abbeville[minimal_cols]
# a2 = a1.set_index(['dates','fips'])
o1 = other[minimal_cols]
# o2 = o1.set_index(['dates','fips'])

In [183]:
df2 = pd.concat([a1,o1.reindex(a1.index)],axis=0, verify_integrity=True, join='inner')


ValueError: Indexes have overlapping values: Index(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10',
       ...
       '2021-10-06', '2021-10-07', '2021-10-08', '2021-10-09', '2021-10-10', '2021-10-11', '2021-10-12', '2021-10-13', '2021-10-14', '2021-10-15'], dtype='object', name='dates', length=654)

In [174]:
df2.loc['2020-01-01']

Unnamed: 0_level_0,JHU_ConfirmedCases.data,JHU_ConfirmedDeaths.data,JHU_ConfirmedRecoveries.data,TotalPopulation.data,MaleAndFemale_AtLeast65_Population.data,Male_Total_Population.data,Female_Total_Population.data,hospitalIcuBeds,hospitalStaffedBeds,fips
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0,45001.0
2020-01-01,0.0,0.0,0.0,62045.0,9491.0,30342.0,31848.0,7.0,171.0,22001.0


In [176]:
df2.reset_index(inplace=True)
df3 = df2.set_index(["dates","fips"])
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,JHU_ConfirmedCases.data,JHU_ConfirmedDeaths.data,JHU_ConfirmedRecoveries.data,TotalPopulation.data,MaleAndFemale_AtLeast65_Population.data,Male_Total_Population.data,Female_Total_Population.data,hospitalIcuBeds,hospitalStaffedBeds
dates,fips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0
2020-01-02,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0
2020-01-03,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0
2020-01-04,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0
2020-01-05,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0


In [179]:
# df3.loc['2020-01-01']
df3.iloc[[0]]

Unnamed: 0_level_0,Unnamed: 1_level_0,JHU_ConfirmedCases.data,JHU_ConfirmedDeaths.data,JHU_ConfirmedRecoveries.data,TotalPopulation.data,MaleAndFemale_AtLeast65_Population.data,Male_Total_Population.data,Female_Total_Population.data,hospitalIcuBeds,hospitalStaffedBeds
dates,fips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01,45001.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,6.0,25.0


In [131]:
# two = pd.concat([a2,other], keys=["dates","fips"])
two = pd.merge(a2,other,
               left_on=["dates","fips"],
               right_on=["dates","fips"],)


In [132]:
two.head()


Unnamed: 0,index_x,dates,JHU_ConfirmedCases.data_x,JHU_ConfirmedCases.missing_x,NYT_ConfirmedCases.data_x,NYT_ConfirmedCases.missing_x,JHU_ConfirmedDeaths.data_x,JHU_ConfirmedDeaths.missing_x,JHU_ConfirmedRecoveries.data_x,JHU_ConfirmedRecoveries.missing_x,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.data_x,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.missing_x,NYT_ConfirmedDeaths.data_x,NYT_ConfirmedDeaths.missing_x,NYT_AllCausesDeathsWeekly_Excess_Deaths.data_x,NYT_AllCausesDeathsWeekly_Excess_Deaths.missing_x,NYT_AllCausesDeathsWeekly_Deaths_AllCauses.data_x,NYT_AllCausesDeathsWeekly_Deaths_AllCauses.missing_x,NYT_AllCausesDeathsMonthly_Deaths_AllCauses.data_x,NYT_AllCausesDeathsMonthly_Deaths_AllCauses.missing_x,NYT_AllCausesDeathsMonthly_Excess_Deaths.data_x,NYT_AllCausesDeathsMonthly_Excess_Deaths.missing_x,NYT_AllCausesDeathsMonthly_Expected_Deaths_AllCauses.data_x,NYT_AllCausesDeathsMonthly_Expected_Deaths_AllCauses.missing_x,TotalPopulation.data_x,TotalPopulation.missing_x,MaleAndFemale_AtLeast65_Population.data_x,MaleAndFemale_AtLeast65_Population.missing_x,Male_Total_Population.data_x,Male_Total_Population.missing_x,Female_Total_Population.data_x,Female_Total_Population.missing_x,MaleAndFemale_Under18_Population.data_x,MaleAndFemale_Under18_Population.missing_x,BLS_EmployedPopulation.data_x,BLS_EmployedPopulation.missing_x,BLS_UnemployedPopulation.data_x,BLS_UnemployedPopulation.missing_x,BLS_UnemploymentRate.data_x,BLS_UnemploymentRate.missing_x,BLS_LaborForcePopulation.data_x,BLS_LaborForcePopulation.missing_x,AverageDailyTemperature.data_x,AverageDailyTemperature.missing_x,AverageDewPoint.data_x,AverageDewPoint.missing_x,AverageRelativeHumidity.data_x,AverageRelativeHumidity.missing_x,AverageSurfaceAirPressure.data_x,AverageSurfaceAirPressure.missing_x,AveragePrecipitationTotal.data_x,AveragePrecipitationTotal.missing_x,AveragePrecipitation.data_x,AveragePrecipitation.missing_x,AverageWindDirection.data_x,AverageWindDirection.missing_x,AverageWindSpeed.data_x,AverageWindSpeed.missing_x,hospitalIcuBeds_x,hospitalStaffedBeds_x,hospitalLicensedBeds_x,latestTotalPopulation_x,fips,LND110210_x,index_y,JHU_ConfirmedCases.data_y,JHU_ConfirmedCases.missing_y,NYT_ConfirmedCases.data_y,NYT_ConfirmedCases.missing_y,JHU_ConfirmedDeaths.data_y,JHU_ConfirmedDeaths.missing_y,JHU_ConfirmedRecoveries.data_y,JHU_ConfirmedRecoveries.missing_y,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.data_y,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.missing_y,NYT_ConfirmedDeaths.data_y,NYT_ConfirmedDeaths.missing_y,NYT_AllCausesDeathsWeekly_Excess_Deaths.data_y,NYT_AllCausesDeathsWeekly_Excess_Deaths.missing_y,NYT_AllCausesDeathsWeekly_Deaths_AllCauses.data_y,NYT_AllCausesDeathsWeekly_Deaths_AllCauses.missing_y,NYT_AllCausesDeathsMonthly_Deaths_AllCauses.data_y,NYT_AllCausesDeathsMonthly_Deaths_AllCauses.missing_y,NYT_AllCausesDeathsMonthly_Excess_Deaths.data_y,NYT_AllCausesDeathsMonthly_Excess_Deaths.missing_y,NYT_AllCausesDeathsMonthly_Expected_Deaths_AllCauses.data_y,NYT_AllCausesDeathsMonthly_Expected_Deaths_AllCauses.missing_y,TotalPopulation.data_y,TotalPopulation.missing_y,MaleAndFemale_AtLeast65_Population.data_y,MaleAndFemale_AtLeast65_Population.missing_y,Male_Total_Population.data_y,Male_Total_Population.missing_y,Female_Total_Population.data_y,Female_Total_Population.missing_y,MaleAndFemale_Under18_Population.data_y,MaleAndFemale_Under18_Population.missing_y,BLS_EmployedPopulation.data_y,BLS_EmployedPopulation.missing_y,BLS_UnemployedPopulation.data_y,BLS_UnemployedPopulation.missing_y,BLS_UnemploymentRate.data_y,BLS_UnemploymentRate.missing_y,BLS_LaborForcePopulation.data_y,BLS_LaborForcePopulation.missing_y,AverageDailyTemperature.data_y,AverageDailyTemperature.missing_y,AverageDewPoint.data_y,AverageDewPoint.missing_y,AverageRelativeHumidity.data_y,AverageRelativeHumidity.missing_y,AverageSurfaceAirPressure.data_y,AverageSurfaceAirPressure.missing_y,AveragePrecipitationTotal.data_y,AveragePrecipitationTotal.missing_y,AveragePrecipitation.data_y,AveragePrecipitation.missing_y,AverageWindDirection.data_y,AverageWindDirection.missing_y,AverageWindSpeed.data_y,AverageWindSpeed.missing_y,hospitalIcuBeds_y,hospitalStaffedBeds_y,hospitalLicensedBeds_y,latestTotalPopulation_y,LND110210_y


In [96]:
abbeville.shape

(654, 62)

In [30]:
no2_20[no2_20['fips']=='45001']

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,Pollutant Standard,Date Local,Units of Measure,Event Type,Observation Count,Observation Percent,Arithmetic Mean,1st Max Value,1st Max Hour,AQI,Method Code,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change,fips,Mean ugm3


In [11]:
non_zero_columns = [c for c in abbeville.columns if (abbeville[c].sum() > 0)]

non_zero_columns

['JHU_ConfirmedCases.data',
 'NYT_ConfirmedCases.data',
 'NYT_ConfirmedCases.missing',
 'JHU_ConfirmedDeaths.data',
 'JHU_ConfirmedRecoveries.missing',
 'NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.missing',
 'NYT_ConfirmedDeaths.data',
 'NYT_ConfirmedDeaths.missing',
 'NYT_AllCausesDeathsWeekly_Excess_Deaths.missing',
 'NYT_AllCausesDeathsWeekly_Deaths_AllCauses.missing',
 'NYT_AllCausesDeathsMonthly_Deaths_AllCauses.missing',
 'NYT_AllCausesDeathsMonthly_Excess_Deaths.missing',
 'NYT_AllCausesDeathsMonthly_Expected_Deaths_AllCauses.missing',
 'TotalPopulation.data',
 'MaleAndFemale_AtLeast65_Population.data',
 'Male_Total_Population.data',
 'Female_Total_Population.data',
 'MaleAndFemale_Under18_Population.data',
 'BLS_EmployedPopulation.data',
 'BLS_UnemployedPopulation.data',
 'BLS_UnemploymentRate.data',
 'BLS_LaborForcePopulation.data',
 'AverageDailyTemperature.data',
 'AverageDewPoint.data',
 'AverageRelativeHumidity.data',
 'AverageSurfaceAirPressure.data',
 'AveragePr

### Slimming down the census data

There are things that are not needed at this stage and if needed later could be looked up by county. I will remove the ones that don't seem to contribute much and see if a merge with no2 is reasonable

In [43]:
# Super slimmed down, no NYT, no *.missing
less_cols=['fips','JHU_ConfirmedCases.data','NYT_ConfirmedCases.data','JHU_ConfirmedDeaths.data','JHU_ConfirmedRecoveries.data',
           'TotalPopulation.data','MaleAndFemale_AtLeast65_Population.data','Male_Total_Population.data','Female_Total_Population.data',
           'MaleAndFemale_Under18_Population.data','BLS_EmployedPopulation.data','BLS_UnemployedPopulation.data','BLS_UnemploymentRate.data',
           'BLS_LaborForcePopulation.data','AverageDailyTemperature.data','hospitalIcuBeds','hospitalStaffedBeds','hospitalLicensedBeds']
df_small=abbeville[less_cols].reset_index()


In [45]:
ab_df = df_small.set_index(['dates','fips'])
ab_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,JHU_ConfirmedCases.data,NYT_ConfirmedCases.data,JHU_ConfirmedDeaths.data,JHU_ConfirmedRecoveries.data,TotalPopulation.data,MaleAndFemale_AtLeast65_Population.data,Male_Total_Population.data,Female_Total_Population.data,MaleAndFemale_Under18_Population.data,BLS_EmployedPopulation.data,BLS_UnemployedPopulation.data,BLS_UnemploymentRate.data,BLS_LaborForcePopulation.data,AverageDailyTemperature.data,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds
dates,fips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-01-01,45001.0,0.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,4924.0,9706.5,370.5,3.676689,10077.0,44.875,6.0,25.0,25.0
2020-01-02,45001.0,0.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,4924.0,9706.5,370.5,3.676689,10077.0,45.84375,6.0,25.0,25.0
2020-01-03,45001.0,0.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,4924.0,9706.5,370.5,3.676689,10077.0,51.93502,6.0,25.0,25.0
2020-01-04,45001.0,0.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,4924.0,9706.5,370.5,3.676689,10077.0,54.208333,6.0,25.0,25.0
2020-01-05,45001.0,0.0,0.0,0.0,0.0,24527.0,5343.0,11868.0,12673.0,4924.0,9706.5,370.5,3.676689,10077.0,41.916667,6.0,25.0,25.0


In [49]:
all_no2_fips = list(no2_20['fips'].unique())
len(all_no2_fips)

260