# Loading NJ COVID-19 Data Into JSON and CSV

This Python notebook contains the code used to scrape articles from [NJ.com](https://www.nj.com/) to get the number of COVID-19 cases per municipality in New Jersey.

I use `urllib` for loading the webpage, `bs4` & `re` for scraping, `datetime` for getting the current date, `pandas` for building the table of cases, and `json` & `csv` for writing the data to the correct output files.

In [1]:
from urllib import request, error, parse
from bs4 import BeautifulSoup
from datetime import date
import pandas as pd
import numpy as np
import json, csv, re

Getting the current date and formatting it the way the article URL for that day's data expects.

In [2]:
def current_date():
    """Return today's date formatted for an NJ.com article URL.

    Returns:
        tuple: ``(label, month_number)`` where ``label`` looks like
        ``"may-20-2020"`` (lower-case full month name, day without zero
        padding, four-digit year) and ``month_number`` is the zero-padded
        two-digit month as a string, e.g. ``"05"``.
    """
    # Imported locally under a private alias: a later cell rebinds the
    # module-level name `date` to a string, which would otherwise make
    # `date.today()` fail here with an AttributeError.
    from datetime import date as _date

    today = _date.today()
    month_name = today.strftime("%B").lower()
    month_number = "{:02d}".format(today.month)
    return "{}-{}-{}".format(month_name, today.day, today.year), month_number

def get_date(month, day, year=2020):
    """Build the ``month-day-year`` label used in the article URLs.

    Args:
        month: Month name as it appears in the URL, e.g. ``'may'``.
        day: Day of the month (inserted as-is, no zero padding added).
        year: Year of the article; defaults to 2020, the only year the
            original notebook handled.

    Returns:
        str: The label, e.g. ``'may-20-2020'``.
    """
    return "{}-{}-{}".format(month, day, year)

Loading the HTML file locally for scraping and parsing. Getting the main part of the article where the data lies within the HTML file.

In [3]:
def get_html(link):
    """Download a page and return every ``<article>`` element found in it.

    Args:
        link: URL of the page to fetch.

    Returns:
        The result of ``BeautifulSoup.find_all('article')`` on the parsed
        page; this is empty when the page contains no ``<article>`` tag.
    """
    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(link) as response:
        covid19_html = response.read()

    soup = BeautifulSoup(covid19_html, 'lxml')
    return soup.find_all('article')

Loading and cleaning the data (in string format) for matching. Prints the row if it cannot be cleaned properly.

In [4]:
def clean_data(article):
    """Extract county names and ``town: count`` rows from an article.

    Every ``<p>`` element of ``article`` is lower-cased and inspected:

    * a paragraph containing both ``':'`` and ``'•'`` is matched against the
      statistic pattern; exactly one match is stored (minus its first
      character, presumably the bullet), anything else is printed so it can
      be inspected by hand;
    * a paragraph containing the name of a county (from the module-level
      ``counties`` list) adds that county name to the result.

    Args:
        article: A parsed element exposing ``find_all('p')`` whose items
            expose ``getText()``.

    Returns:
        list[str]: County names and cleaned statistic rows, in document
        order.
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\:, \s, \d). Compiled once instead of once per paragraph.
    stat_pattern = re.compile(r'.+\:\s*\d*,?\d+')

    data = []
    for row in article.find_all('p'):
        statistic = row.getText().lower()

        # weird april error on a particular set of days
        if '• springfield ' in statistic:
            statistic = statistic.replace('• springfield ', '• springfield: ')

        if ':' in statistic and '•' in statistic:
            cleaned_row = stat_pattern.findall(statistic)
            if len(cleaned_row) != 1:
                # Could not be cleaned unambiguously: show it to the user.
                print(statistic)
            else:
                data.append(cleaned_row[0][1:].strip())

        # weird march error on particular day
        if '• west orange' in statistic:
            data.append('gloucester county')

        # Record at most one county per paragraph (first match wins).
        for county in counties:
            if county in statistic:
                data.append(county)
                break
    print()
    return data

Two dictionaries of historical typos encountered while parsing the data: one of substring replacements and one of full-string replacements.

In [5]:
# errors.json is indexed positionally: item 0 maps wrong substrings to their
# corrections, item 1 maps whole wrong names to their corrections.
# `with` closes the file handle once the JSON has been parsed.
with open('../data/errors.json') as errors_file:
    errors = json.load(errors_file)
substring_errors = errors[0]
fullstring_errors = errors[1]

Loading the data into a JSON file `nj_covid19_today.json` containing the number of cases per municipality in New Jersey for the current date.

In [6]:
def check_errors(town, fullstring_errs=None, substring_errs=None):
    """Apply the known typo corrections to a municipality name.

    Args:
        town: Name as scraped from the article.
        fullstring_errs: Mapping of whole wrong names to their corrections.
            Defaults to the module-level ``fullstring_errors``.
        substring_errs: Mapping of wrong substrings to their corrections.
            Defaults to the module-level ``substring_errors``.

    Returns:
        str: The full-string correction when the substring-corrected name
        (ignoring surrounding whitespace) is a key of ``fullstring_errs``;
        otherwise the substring-corrected name itself, not stripped.
    """
    # Defaults are resolved at call time rather than at definition time, so
    # re-loading the error dictionaries takes effect without redefining this
    # function, and no dict object is baked into the signature.
    if fullstring_errs is None:
        fullstring_errs = fullstring_errors
    if substring_errs is None:
        substring_errs = substring_errors

    # Every substring correction is applied, in the mapping's own order.
    for typo, correction in substring_errs.items():
        town = town.replace(typo, correction)

    stripped = town.strip()
    if stripped in fullstring_errs:
        return fullstring_errs[stripped]

    return town

In [7]:
def replace_town(row, county, visited_municipal):
    """Resolve one ``town: count`` row to a known municipality name.

    The town name is typo-corrected with ``check_errors`` and then looked up
    in ``visited_municipal`` as-is and with the suffixes ``' township'``,
    ``' city'`` and ``' borough'``, in that order. The first name that is
    present and not yet visited is marked visited and returned.

    Args:
        row: Cleaned statistic such as ``'allendale: 4'``.
        county: County the row belongs to (used only in error messages).
        visited_municipal: Mapping of municipality name to a visited flag
            for that county. It is mutated: the matched entry is set to
            ``True``.

    Returns:
        tuple: ``(name, count)``. ``('other', count)`` for the
        ``'address not reported'`` / ``'unknown'`` rows, and ``('', -1)``
        (after printing an error) when the count cannot be parsed or no
        unvisited municipality matches.
    """
    # partition() never raises on a row without ':' (the count is then '').
    name_part, _, count_part = row.partition(':')
    town = check_errors(name_part.strip()).strip()

    # Thousands separators are dropped before reading the leading digits.
    count_match = re.match(r'\d+', count_part.replace(',', '').strip())
    if count_match is None:
        # Same failure convention as an unknown town, instead of IndexError.
        print('ERROR COUNT NOT FOUND IN {}: {}'.format(county, row))
        return '', -1
    num_infected = int(count_match.group())

    if town == 'address not reported' or town == 'unknown':
        return 'other', num_infected

    candidates = (town, town + ' township', town + ' city', town + ' borough')
    for candidate in candidates:
        # `== False` is kept deliberately so the comparison behaves exactly
        # as before for whatever flag values the JSON file holds.
        if candidate in visited_municipal and visited_municipal[candidate] == False:  # noqa: E712
            visited_municipal[candidate] = True
            return candidate, num_infected

    print('ERROR TOWN NOT FOUND IN {}: {}'.format(county, town))
    return '', -1

In [8]:
def put_data_in_dataframe(data, covid_df, municipals, date):
    """Append one day's scraped rows to the running table of cases.

    ``data`` is walked in order; an entry found in the module-level
    ``counties`` list switches the current county, every other entry is
    resolved with ``replace_town`` against that county. Entries seen before
    the first county, and entries ``replace_town`` rejects, are skipped.

    Args:
        data: Sequence of county names and ``town: count`` strings, as
            produced by ``clean_data``.
        covid_df: Existing DataFrame to extend; it is not modified.
        municipals: Mapping of county name to that county's visited-flag
            mapping; passed on to ``replace_town``, which mutates it.
        date: Value stored in the ``Date`` column of every new row.

    Returns:
        pandas.DataFrame: A new frame with a fresh 0..n-1 index holding the
        rows of ``covid_df`` followed by the new rows (columns
        ``Municipal``, ``County``, ``Cases``, ``Date``). When no row could
        be parsed, a re-indexed copy of ``covid_df``.
    """
    columns = ['Municipal', 'County', 'Cases', 'Date']
    todays_rows = []
    current_county = ''

    for row in data:
        if row in counties:
            current_county = row
            continue

        # Statistics that appear before any county header cannot be placed.
        if current_county == '':
            continue

        municipal, infected = replace_town(row, current_county, municipals[current_county])

        if municipal == '' or infected < 0:
            continue

        todays_rows.append([municipal, current_county, infected, date])

    if not todays_rows:
        # Nothing parsed: pd.concat would raise on an empty list.
        return covid_df.reset_index(drop=True)

    # One frame for the whole day instead of one per row, and pd.concat
    # instead of DataFrame.append, which was removed in pandas 2.0.
    todays_df = pd.DataFrame(data=todays_rows, columns=columns)
    return pd.concat([covid_df, todays_df], ignore_index=True)

# EXECUTABLE

Run these cells to update the data based on a certain day.

Finding the correct date and link for that date's COVID-19 data in New Jersey

In [9]:
# Choose which article to scrape: today's (commented out) or a fixed day.
# NOTE: `date` here rebinds the module-level name imported from `datetime`
# to a plain string label.
# date, month = current_date()
date = get_date('may', 20)
month = '05'

In [10]:
# URL of the day's article: the numeric month goes in the path, the
# month-day-year label at the end of the slug.
nj_dot_com_link = (
    'https://www.nj.com/coronavirus/2020/{}/where-is-the-coronavirus'
    '-in-nj-latest-map-update-on-county-by-county-cases-{}.html'
).format(month, date)

Loading all of the NJ municipality names from `nj_municipals.json`. Loading dictionaries for handling errors such as misspellings and shorthands for municipalities.

In [11]:
# County name -> that county's municipalities. `with` closes the file handle
# once the JSON has been parsed.
with open('../data/nj_municipals.json') as municipals_file:
    nj_municipals = json.load(municipals_file)
# The county names are the top-level keys of the file.
counties = list(nj_municipals)

In [12]:
article = get_html(nj_dot_com_link) # article component
# Fail with a readable message instead of a bare IndexError when the page
# contains no <article> element.
if not article:
    raise RuntimeError('No <article> element found at {}'.format(nj_dot_com_link))
covid_data = clean_data(article[0])




Updating the total dataframe.

In [13]:
# Load the running total, then add the rows scraped for `date`.
nj_covid_df = pd.read_csv('../data/ex_nj_total.csv')
updated_covid_df = put_data_in_dataframe(
    data=covid_data,
    covid_df=nj_covid_df,
    municipals=nj_municipals,
    date=date,
)

In [14]:
# Show the first and last rows of the updated table, one block per line group.
print(updated_covid_df.head(), updated_covid_df.tail(), sep='\n')

     Municipal         County Cases           Date
0    allendale  bergen county     4  march-25-2020
1       alpine  bergen county     1  march-25-2020
2  bergenfield  bergen county    26  march-25-2020
3       bogota  bergen county     8  march-25-2020
4    carlstadt  bergen county     6  march-25-2020
                 Municipal         County Cases         Date
27658         phillipsburg  warren county   118  may-20-2020
27659   pohatcong township  warren county    20  may-20-2020
27660           washington  warren county    50  may-20-2020
27661  washington township  warren county    39  may-20-2020
27662       white township  warren county    51  may-20-2020


Making sure there were no leftover errors as a result of error filtering.

In [15]:
# No row may be left with an empty municipality name.
assert not (updated_covid_df.Municipal == '').any()

Loading the updated data into their respective locations. The new total will go to `ex_nj_total.csv` and today's data will go to `ex_nj_today.json`.

In [16]:
# Overwrite the running-total CSV with the updated table (no index column).
updated_covid_df.to_csv(
    '../data/ex_nj_total.csv',
    index=False,
    mode='w',
)

In [17]:
# Keep only the rows for `date`, then write them as a JSON list of records.
updated_covid_df = updated_covid_df.loc[updated_covid_df['Date'] == date]
updated_covid_df.to_json(
    '../data/ex_nj_today.json',
    orient='records',
)

# AUTOMATION

Some functions I created to automate the process of data collection over a month's time period.

In [18]:
def get_link(date, month='05'):
    """Build the NJ.com article URL for a given day.

    Args:
        date: ``month-day-year`` label placed at the end of the URL slug,
            e.g. ``'may-20-2020'``.
        month: Two-digit month used in the URL path. Defaults to ``'05'``,
            the value the original version hard-coded.

    Returns:
        str: The article URL.
    """
    link = (
        'https://www.nj.com/coronavirus/2020/{}/where-is-the-coronavirus'
        '-in-nj-latest-map-update-on-county-by-county-cases-{}.html'
    )
    return link.format(month, date)

In [19]:
def automate(month,i,j):
    """Scrape and store the data for days ``i`` .. ``j - 1`` of a month.

    For each day the article is downloaded, cleaned, appended to the table
    stored in ``data.csv`` and written back to ``data.csv``.

    Args:
        month: Month name used in the date label, e.g. ``'april'``.
        i: First day to process (inclusive).
        j: Day to stop at (exclusive, as in ``range``).

    Notes:
        ``get_link`` is called with the date label only, so the numeric
        month in the URL path is whatever ``get_link`` supplies.
        ``clean_data`` and ``put_data_in_dataframe`` read the module-level
        ``counties`` list, which must be defined before calling this.
    """
    for d in range(i, j):
        day_label = get_date(month, d)
        nj_link = get_link(day_label)

        print(day_label)

        # Reloaded for every day: replace_town() flips the per-town visited
        # flags in place, so each day needs a fresh set. `with` closes the
        # file handle.
        with open('nj_municipals.json') as municipals_file:
            municipals = json.load(municipals_file)

        article = get_html(nj_link) # article component
        covid_data = clean_data(article[0])

        # Read the same file that is written below so the days accumulate
        # (this used to read 'ddata.csv', a different file from the output).
        nj_covid_df = pd.read_csv('data.csv')
        updated_covid_df = put_data_in_dataframe(covid_data, nj_covid_df, municipals, day_label)

        # No row may be left with an empty municipality name.
        assert len(updated_covid_df[updated_covid_df.Municipal == '']) == 0

        updated_covid_df.to_csv('data.csv',mode='w',index=False)
        print('\n')

In [20]:
def weird_links(nj_link, date_label=None):
    """Scrape one article whose URL does not follow the usual pattern.

    The article is downloaded, cleaned, appended to the table stored in
    ``data.csv`` and written back to ``data.csv``.

    Args:
        nj_link: Full URL of the article.
        date_label: Value stored in the ``Date`` column of the new rows,
            e.g. ``'april-7-2020'``. Defaults to today's date in that
            ``month-day-year`` format.
    """
    if date_label is None:
        # Previously the `current_date` *function object* was passed as the
        # date. Today's label is built here with a local aliased import so
        # it works even when the module-level name `date` has been rebound.
        from datetime import date as _date
        today = _date.today()
        date_label = "{}-{}-{}".format(today.strftime("%B").lower(), today.day, today.year)

    # `with` closes the file handle once the JSON has been parsed.
    with open('nj_municipals.json') as municipals_file:
        municipals = json.load(municipals_file)

    article = get_html(nj_link) # article component
    covid_data = clean_data(article[0])

    nj_covid_df = pd.read_csv('data.csv')
    updated_covid_df = put_data_in_dataframe(covid_data, nj_covid_df, municipals, date_label)

    # No row may be left with an empty municipality name.
    assert len(updated_covid_df[updated_covid_df.Municipal == '']) == 0

    updated_covid_df.to_csv('data.csv',mode='w',index=False)
    print('\n')

In [21]:
# automate('april',1,31)