# DSCI 591 Data Analysis
## Covid-19

In [1]:
import os
import glob
import re
import scipy

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
import findspark

from ipynb.fs.full.covid_data_collection import get_counties_df

### Initialize Spark

In [2]:
findspark.init() 

In [3]:
spark = SparkSession.builder.getOrCreate()

21/10/31 14:09:45 WARN Utils: Your hostname, noah-WS-Z390-PRO resolves to a loopback address: 127.0.1.1; using 10.0.0.58 instead (on interface eno1)
21/10/31 14:09:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/31 14:09:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/10/31 14:09:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Merge and reformat COVID data by date and FIPS

In [25]:
# FIPS codes from the FCC.  Attempt to match all counties


def make_lookup_functions():
    fcc_county_file = './config/fcc_county_fips.txt'
    with open(fcc_county_file) as fh:
        state = None
        fips_to_county = {}
        county_to_fips = {}
        for row in fh:
            if match := re.match(r'\s+(\d\d\d\d\d)\s+ (.*)', row):
                fips, location = match.groups()
                if fips.endswith('000'):
                    state = location.replace(' ','')
                    continue

                location = location.split(' County')[0]
                location = location.split(' Borough')[0]
                location = location.split(' Census Area')[0]
                location = location.split(' Parish')[0]
                location = location.split(' National Park')[0]
                
                if state == 'Florida' and location == 'Dade':
                    location = 'Miami-Dade'
                if state == 'Massachusetts' and location == 'Dukes':
                    location = 'DukesandNantucket'
                if state == 'Alaska' and location == 'Wrangell-Petersburg':
                    location = 'Wrangell'
                if state == 'Alaska' and location == 'Skagway-Hoonah-Angoon':
                    location = 'Skagway'
                if state == 'Alaska' and location == 'Prince of Wales-Outer Ketchikan':
                    location = 'Prince of Wales'
                if state == 'New York' and location == 'New York':
                    location = 'New York City'
                    
                try:
                    location = ''.join([s[0].upper() + s[1:] for s in location.split(' ')])    
                except:
                    pass  # garbage

                county = location.replace(' ', '') + '_' + state + '_UnitedStates'
                fips_to_county[int(fips)] = county
                county_to_fips[county.lower()] = int(fips)
                
        def lookup_county_from_fips(fips_):
            return fips_to_county[int(fips_)]
        
        def lookup_fips_from_county(county_):
            try:
                return county_to_fips[county_.lower()]
            except KeyError:
                pass
            
            try:
                county_ = county_.lower()
                parts = county_.split('_')
                parts[0] = parts[0][:parts[0].index('city')]
                return county_to_fips['_'.join(parts)]
            except:
                pass
            
            try:
                parts = county_.split('_')
                parts[0] = parts[0] +'city'
                return county_to_fips['_'.join(parts)]
            except:
                pass
    
            parts = county_.split('_')
            for k, v in county_to_fips.items():
                part0, part1, _ = k.split('_')
                if part1 != parts[1]:
                    continue
                if parts[0].startswith(part0) or part0.startswith(parts[0]):
                    return v

            raise KeyError
                
        
        return lookup_county_from_fips, lookup_fips_from_county

get_county_from_fips,  get_fips_from_county = make_lookup_functions()

In [26]:
def merge_processed_data() -> ps.DataFrame:
    path = './processed_data/county_merged_parts/*.pkl'
    sdf = None
    error_count = 0
    total_length = 0
    for file in glob.glob(path):
        county = re.search(r'merged_parts.(.*)\.pkl', file).groups(1)[0]

        df = pd.read_pickle(file)
 
        try:
            df.reset_index(inplace=True)

            df['fips'] = get_fips_from_county(county)
            df['county'] = county
            df['Date Local'] = df.dates.to_list()
            
            if sdf is not None:
                sdf = sdf.append(df, ignore_index=True)
            else:
                sdf = df

        except KeyError:
            if not county.startswith('Unassigned') and not county.startswith('Outof'):
                print(f'Key Error {county}')
        
    return sdf
    
    
sdf = merge_processed_data()



Key Error KansasCity_Missouri_UnitedStates
Key Error Hoonah-Angoon_Alaska_UnitedStates


In [27]:
len(sdf)

2052906

In [29]:
sdf.head()

Unnamed: 0,dates,JHU_ConfirmedCases.data,JHU_ConfirmedCases.missing,NYT_ConfirmedCases.data,NYT_ConfirmedCases.missing,JHU_ConfirmedDeaths.data,JHU_ConfirmedDeaths.missing,JHU_ConfirmedRecoveries.data,JHU_ConfirmedRecoveries.missing,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.data,...,AverageWindSpeed.data,AverageWindSpeed.missing,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,fips,LND110210,county,Date Local
0,2020-01-01,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,6.510417,0.0,,,,4148.0,31129,,Nuckolls_Nebraska_UnitedStates,2020-01-01
1,2020-01-02,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,5.322917,0.0,,,,4148.0,31129,,Nuckolls_Nebraska_UnitedStates,2020-01-02
2,2020-01-03,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,8.59375,0.0,,,,4148.0,31129,,Nuckolls_Nebraska_UnitedStates,2020-01-03
3,2020-01-04,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,8.359375,0.0,,,,4148.0,31129,,Nuckolls_Nebraska_UnitedStates,2020-01-04
4,2020-01-05,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,8.020833,0.0,,,,4148.0,31129,,Nuckolls_Nebraska_UnitedStates,2020-01-05


#### Pull in the NO2 data and merge it

In [34]:
def get_no2_df():
    filename = '../air_quality/no2/daily_2020_multi_index.pkl'
    daily_no2 = pd.read_pickle(filename).reset_index()
    daily_no2 = daily_no2.astype({'fips': 'int32'})
    return daily_no2

daily_no2 = get_no2_df()
daily_no2.head()

Unnamed: 0,Date Local,fips,Arithmetic Mean,1st Max Value,AQI
0,2020-01-01,1073,16.373918,31.0,29
1,2020-01-01,4013,16.660833,34.0,32
2,2020-01-01,4019,11.20625,23.7,22
3,2020-01-01,5035,9.175,23.1,22
4,2020-01-01,5119,12.6375,28.9,26


In [45]:
def merge_sdf_no2(sdf, daily_no2):
    return sdf.merge(daily_no2, left_on=['fips', 'Date Local'], right_on=['fips', 'Date Local'], suffixes=(None, '_r'))

def pickle_sdf_no2(sdf_no2):
    sdf_no2.to_pickle('./processed_data/merged_eda/sdf_no2.pkl')
    
def load_sdf_no2():
    return pd.read_pickle('./processed_data/merged_eda/sdf_no2.pkl')

sdf_no2 = merge_sdf_no2(sdf, daily_no2)
pickle_sdf_no2(sdf_no2)

In [46]:
sdf_no2.head()

Unnamed: 0,dates,JHU_ConfirmedCases.data,JHU_ConfirmedCases.missing,NYT_ConfirmedCases.data,NYT_ConfirmedCases.missing,JHU_ConfirmedDeaths.data,JHU_ConfirmedDeaths.missing,JHU_ConfirmedRecoveries.data,JHU_ConfirmedRecoveries.missing,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.data,...,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,fips,LND110210,county,Date Local,Arithmetic Mean,1st Max Value,AQI
0,2020-01-01,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-01,4.130435,14.0,13
1,2020-01-02,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-02,10.73913,20.0,19
2,2020-01-03,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-03,17.130435,24.0,23
3,2020-01-04,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-04,6.826087,15.0,14
4,2020-01-05,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-05,2.26087,5.0,5


In [47]:
load_sdf_no2()

Unnamed: 0,dates,JHU_ConfirmedCases.data,JHU_ConfirmedCases.missing,NYT_ConfirmedCases.data,NYT_ConfirmedCases.missing,JHU_ConfirmedDeaths.data,JHU_ConfirmedDeaths.missing,JHU_ConfirmedRecoveries.data,JHU_ConfirmedRecoveries.missing,NYT_AllCausesDeathsWeekly_Expected_Deaths_AllCauses.data,...,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,fips,LND110210,county,Date Local,Arithmetic Mean,1st Max Value,AQI
0,2020-01-01,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-01,4.130435,14.0,13
1,2020-01-02,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-02,10.739130,20.0,19
2,2020-01-03,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-03,17.130435,24.0,23
3,2020-01-04,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-04,6.826087,15.0,14
4,2020-01-05,0.0,0,0.0,100,0.0,0,0.0,100,0.0,...,165.0,165.0,67006.0,39013,532.13,Belmont_Ohio_UnitedStates,2020-01-05,2.260870,5.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89867,2020-12-27,9945.0,0,9945.0,0,67.0,0,0.0,100,0.0,...,1685.0,780.0,283111.0,6079,3298.57,SanLuisObispo_California_UnitedStates,2020-12-27,1.614130,6.0,6
89868,2020-12-28,10154.0,0,10154.0,0,67.0,0,0.0,100,0.0,...,1685.0,780.0,283111.0,6079,3298.57,SanLuisObispo_California_UnitedStates,2020-12-28,1.679348,9.0,8
89869,2020-12-29,10260.0,0,10260.0,0,70.0,0,0.0,100,0.0,...,1685.0,780.0,283111.0,6079,3298.57,SanLuisObispo_California_UnitedStates,2020-12-29,3.627329,16.0,15
89870,2020-12-30,10387.0,0,10387.0,0,74.0,0,0.0,100,0.0,...,1685.0,780.0,283111.0,6079,3298.57,SanLuisObispo_California_UnitedStates,2020-12-30,5.342885,21.0,20


89872