This notebook takes the NYT county-level data, which is a single CSV file with columns (date, county, state, fips, cases, deaths), and turns it into 52 JSON files, one per state (50 states + PR and DC), where each JSON structure is an array of county records.  Each county record is of the form {name, state, population, records}, where records are the daily records for the county.  The first record is a header, which lists the columns in the remaining records.  These are typically "Month", "Day", "DayNum", "Cases", "Deaths", "New Cases", "New Deaths", "Growth in Cases", "Growth in Deaths", "7-Day Average New Cases", "7-Day Average New Deaths", "7-Day Average Growth in Cases", "7-Day Average Growth in Deaths".
The NYT data is fetched from github: https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv

In [None]:

import urllib.request

# Download the NYT county-level CSV as raw bytes.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
with urllib.request.urlopen(url) as response:
    data = response.read()

# Decode the payload, break it into rows, and discard the CSV header row.
asString = str(data, 'utf-8')
lines = asString.split('\n')[1:]

# Notebook display: show the final row as a quick check of the download.
lines[-1]


Set up an error catcher (just in case) and a function which takes a line and turns it into a list of values.  The output is [month, day of month, day number, fips, state, county, cases, deaths], where the day number is counted from January 21 (which is day 0).  On some records the last value gets chopped, so put in 0 as a default for that.

In [198]:
import datetime

# Raw lines that process_line could not parse, kept for later inspection.
errors = []

# Number of days preceding the first day of each month in a leap year,
# indexed by month number (index 0 is padding).  process_line now derives the
# day number with datetime instead, but the table is kept in case another
# cell refers to it.
firstDays = [0, 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]

# Day 0 of the DayNum scale.  For 2020 dates this reproduces the original
# ``firstDays[month] + day - 21`` numbering exactly.
_FIRST_DAY = datetime.date(2020, 1, 21)


def process_line(line):
    """Turn one CSV line into a list of typed values.

    Args:
        line: One comma-separated data row in the column order
            date,county,state,fips,cases,deaths (date as YYYY-MM-DD).

    Returns:
        ``[month, day, day_num, fips, state, county, cases, deaths]`` where
        ``day_num`` is the number of days since 2020-01-21 (so later years
        keep increasing instead of colliding with 2020), ``fips``, ``state``
        and ``county`` are strings, and ``cases``/``deaths`` are ints.
        A missing or non-numeric deaths field defaults to 0.

        If the line cannot be parsed at all (blank line, bad date, too few
        fields, non-numeric cases) the line is appended to ``errors`` and a
        placeholder record with an empty FIPS string is returned.
    """
    record = line.split(',')
    try:
        year, month, day = (int(part) for part in record[0].split('-')[:3])
        day_num = (datetime.date(year, month, day) - _FIRST_DAY).days
        fips, state, county = record[3], record[2], record[1]
        cases = int(record[-2])
    except (ValueError, IndexError):
        errors.append(line)
        # Same length as a real record; the empty FIPS at index 3 lets a
        # caller filter the placeholder out.
        return [0, 0, 0, '', '', '', 0, 0]
    try:
        deaths = int(record[-1])
    except ValueError:
        # Some rows arrive with the last value chopped off.
        deaths = 0
    return [month, day, day_num, fips, state, county, cases, deaths]

Process all the lines, then display the first 10 as a sanity check.  Note that records with an empty fips are dropped, and the rest are sorted in increasing order by fips.


In [None]:
# Start this run with a clean error log.
errors = []

# Parse every raw line, keep only the rows whose FIPS field (index 3) is
# non-empty, and order the survivors by FIPS.
parsed_rows = [process_line(raw) for raw in lines]
next_lines = sorted(
    (row for row in parsed_rows if len(row[3]) > 0),
    key=lambda row: row[3],
)

# Notebook display: preview the first ten rows.
next_lines[:10]

Utility functions to compute 7-day averages

In [200]:
def average(list):
    """Return the arithmetic mean of ``list``, rounded to two decimals.

    The parameter name shadows the builtin but is kept so existing callers
    are unaffected.

    Raises:
        ZeroDivisionError: if ``list`` is empty.
    """
    return round(sum(list) / len(list), 2)


def seven_day_average(list):
    """Return the trailing seven-day average of each element of ``list``.

    Entry ``k`` of the result is the mean of the (up to) seven values ending
    at and including ``list[k]``; the first six entries therefore average
    over however many values are available so far.  The result always has
    the same length as the input, and an empty input gives an empty result.

    Args:
        list: Sequence of numbers, one per day, oldest first.

    Returns:
        A list of floats rounded to two decimals, one per input value.
    """
    return [
        average(list[max(0, end - 7):end])
        for end in range(1, len(list) + 1)
    ]
    

The class that holds the additional fields for each County.  Specifically, it adds the daily change and the growth for both cases and deaths, and has slots for the seven-day averages of the daily changes and of the growth.

In [201]:
class County:
    """Daily records for one county plus the series derived from them.

    Attributes:
        state_name, county_name, population: identifying data supplied by
            the caller.
        records: list of daily rows.  ``output_record`` keeps items 0-2 and
            6 onward of each row, and ``set_growth`` reads one column by
            index, so rows are expected to look like
            [month, day, day_num, fips, state, county, cases, deaths].
        delta, growth, seven_day_average, seven_day_average_growth: dicts
            keyed by 'cases' and 'deaths'.  Each value is None until the
            series is computed, then a list with one entry per record.
    """

    def __init__(self, state_name, county_name, population):
        self.delta = {'cases': None, 'deaths': None}
        self.growth = {'cases': None, 'deaths': None}
        self.seven_day_average = {'cases': None, 'deaths': None}
        self.seven_day_average_growth = {'cases': None, 'deaths': None}
        self.records = []
        self.state_name = state_name
        self.county_name = county_name
        self.population = population

    def set_growth(self, index, name):
        """Compute day-over-day change and fractional growth for one column.

        Args:
            index: Position of the column within each row of ``records``.
            name: Key ('cases' or 'deaths') under which the results are
                stored in ``self.delta`` and ``self.growth``.

        The first day has no predecessor, so both series start with 0 (this
        leading 0 is present even when there are no records).  Growth is
        change / previous value; when the previous value is not positive the
        ratio is undefined, so growth is reported as 1.0 if the value changed
        and 0.0 if it did not.
        """
        values = [line[index] for line in self.records]
        delta = [0]
        pct = [0]
        for previous, current in zip(values, values[1:]):
            day_growth = current - previous
            if previous > 0:
                pct_growth = day_growth * 1.0 / previous
            elif day_growth == 0:
                # A count that stays at zero has not grown.
                pct_growth = 0.0
            else:
                pct_growth = 1.0
            delta.append(day_growth)
            pct.append(pct_growth)
        self.delta[name] = delta
        self.growth[name] = pct

    def output_record(self):
        """Build the JSON-serialisable record for this county.

        Returns:
            A dict with keys 'name', 'state', 'population' and 'records'.
            'records' is a list whose first element is the header row and
            whose remaining elements are the daily rows.  Every row starts
            with Month, Day, DayNum, Cases, Deaths.  Each derived series
            (new counts, growth, and their seven-day averages) adds a
            cases/deaths column pair only when both of its lists have been
            computed and have exactly one entry per record.
        """
        result = {
            'name': self.county_name,
            'state': self.state_name,
            'population': self.population,
        }
        header = ['Month', 'Day', 'DayNum', 'Cases', 'Deaths']
        # Drop the fips/state/county items (positions 3-5) from each row.
        rows = [line[:3] + line[6:] for line in self.records]
        expected = len(self.records)
        optional_columns = [
            (self.delta, ['New Cases', 'New Deaths']),
            (self.growth, ['Growth in Cases', 'Growth in Deaths']),
            (self.seven_day_average,
             ['7-Day Average New Cases', '7-Day Average New Deaths']),
            (self.seven_day_average_growth,
             ['7-Day Average Growth in Cases', '7-Day Average Growth in Deaths']),
        ]
        for series, titles in optional_columns:
            cases, deaths = series['cases'], series['deaths']
            if not cases or not deaths:
                continue
            if len(cases) != expected or len(deaths) != expected:
                continue
            header = header + titles
            rows = [row + [c, d] for row, c, d in zip(rows, cases, deaths)]
        result['records'] = [header] + rows
        return result

Create the County records, and stick each one in two dictionaries: one indexed by state name, and the other indexed by the County's FIPS.  The dictionary indexed by FIPS is used to iterate through all the counties; the one indexed by state is used to output the state file.  Start by reading the county populations from County_Population.csv, and using those to create County structures.  Note that in County_Population.csv, the FIPS entry sometimes is missing the leading 0, so add that.
Once we've created the county records, add each line from the infection records to the appropriate county.  bad_lines contains the lines we weren't able to place (should be empty)

In [202]:
# Read the population table, skipping its header row.  The context manager
# closes the file even if reading fails.
with open('County_Population.csv', 'r') as f:
    county_lines = f.readlines()[1:]

# Strip only the line terminator (a final line without one must keep its last
# character) and ignore blank lines.
county_records = [
    line.rstrip('\n').split(',') for line in county_lines if line.strip()
]

county_dict = {}   # FIPS code -> County
state_dict = {}    # state name -> list of County objects
bad_lines = []     # infection records whose FIPS matched no county

# Columns used from each row: 1 = FIPS, 2 = county name, 3 = state name,
# 4 = population.
for record in county_records:
    state_name = record[3]
    new_county = County(state_name, record[2], int(record[4]))
    # The FIPS column sometimes lacks its leading zero(s); pad to five digits.
    county_fips = record[1].zfill(5)
    county_dict[county_fips] = new_county
    state_dict.setdefault(state_name, []).append(new_county)

# Attach every infection record to its county (FIPS is item 3 of the record).
for line in next_lines:
    county_fips = line[3]
    try:
        county_dict[county_fips].records.append(line)
    except KeyError:
        # No county with this FIPS in the population table.
        bad_lines.append(line)

Add growth to each county.

In [203]:
# For every county: put its records in day-number order (item 2 of each
# record), then derive the change/growth series from the last two columns.
for county in county_dict.values():
    county.records.sort(key=lambda rec: rec[2])
    for column, label in ((-2, 'cases'), (-1, 'deaths')):
        county.set_growth(column, label)
        

Add the averages to each county

In [204]:
# Smooth both the daily changes and the growth series of every county with
# seven_day_average, storing the results per 'cases'/'deaths' key.
for county in county_dict.values():
    county.seven_day_average = {
        kind: seven_day_average(county.delta[kind])
        for kind in ('cases', 'deaths')
    }
    county.seven_day_average_growth = {
        kind: seven_day_average(county.growth[kind])
        for kind in ('cases', 'deaths')
    }

At this point the analytics are complete.  For each state in order, open the appropriate file in covid_state_data and dump the record into it.

In [205]:
import json

# Write one JSON file per state, named after the lower-cased state name, into
# the covid_state_data directory.
for state, counties in state_dict.items():
    # Build the payload before opening the file, so a failure while building
    # it does not leave a truncated file behind.
    county_records = [county.output_record() for county in counties]
    # The context manager closes the file even if the write fails.
    with open('covid_state_data/%s.json' % state.lower(), 'w') as f:
        f.write(json.dumps(county_records))