In [1]:
import re
from bs4 import BeautifulSoup
import sys
import os
import csv

In [5]:
# Input HTML snapshot: taken from the JUPYTER_DATE environment variable when
# it is set, otherwise the bundled default snapshot.
filename = os.environ.get('JUPYTER_DATE', '2020-05-24.html')

# Derive the output names by swapping only the final extension. A plain
# str.replace('.html', ...) would also rewrite '.html' inside directory names
# and, for an input without '.html', would leave the output paths equal to the
# input path, so the later writes would clobber the source file.
_stem = os.path.splitext(filename)[0]
outfile = _stem + '.csv'
txtfile = _stem + '.txt'

In [6]:
# Parse the saved page. The context manager closes the file handle as soon as
# BeautifulSoup has consumed it instead of leaving it open for the kernel's lifetime.
with open(filename) as html_file:
    soup = BeautifulSoup(html_file, features="lxml")

In [7]:
# Every <g> element carrying the 'amcharts-graph-column' class.
# find_all is the current spelling of the deprecated findAll alias.
gs = soup.find_all('g', class_='amcharts-graph-column')
# Keep only the elements that expose an aria-label attribute; later cells read
# their data from that attribute.
labeled = [x for x in gs if x.has_attr('aria-label')]

In [8]:
# Pull the aria-label text out once, then split it into the three series by
# the series title embedded in each label.
aria_labels = [tag['aria-label'] for tag in labeled]

negatives = [text for text in aria_labels if 'Total Negatives' in text]
positives = [text for text in aria_labels if 'Total All Positives' in text]
deaths = [text for text in aria_labels if 'Total Deaths' in text]

In [9]:
def clean_labels(labels, basetext, dates=False):
    """Parse chart labels of the form '<basetext> <Mon DD, 202Y> <1,234>'.

    Args:
        labels: iterable of label strings to parse.
        basetext: literal series title that each label must start with. It is
            matched verbatim; regex metacharacters in it have no special meaning.
        dates: when True, return the date strings; otherwise return the
            values as ints with thousands separators removed.

    Returns:
        A list with one entry per label that matched the expected format.
        Labels that do not match are reported on stdout as 'ERROR: <label>'
        and skipped, so the result may be shorter than ``labels``.

    Raises:
        AssertionError: if the first label matches but its date is not
            'Mar 01, 2020'.
    """
    # re.escape keeps titles containing characters such as '(' or '+' from
    # being interpreted as regex syntax (which would break matching or change
    # the number of captured groups).
    rgx = re.compile(r'^' + re.escape(basetext) + r' (... \d\d, 202\d) ([\d,]+)$')

    data = []
    for (i, label) in enumerate(labels):
        m = rgx.search(label)
        if m:
            (date, value) = m.groups()
            if i == 0:
                # Sanity check: the series is expected to begin on this date.
                assert date == 'Mar 01, 2020'
            if dates:
                data.append(date)
            else:
                data.append(int(value.replace(',','')))
        else:
            print(f'ERROR: {label}')
    return data

In [10]:
# Numeric series, parsed in the order negatives -> positives -> deaths.
neg, pos, rip = (
    clean_labels(series, title)
    for series, title in (
        (negatives, 'Total Negatives'),
        (positives, 'Total All Positives'),
        (deaths, 'Total Deaths'),
    )
)
# The date axis is read from the negatives series.
dates = clean_labels(negatives, 'Total Negatives', dates=True)

In [11]:
# Set to True to also write the full table (date, day number, counts) as CSV.
write_csv = False

header = ['Date', '#', 'Positives', 'Negatives', 'Deaths']

# One row per day, numbered from 1. zip() stops at the shortest of the four
# series, so only days present in all of them are emitted.
rows = [
    [dt, day, p, n, d]
    for day, (dt, p, n, d) in enumerate(zip(dates, pos, neg, rip), start=1)
]

# Context managers make sure each file is flushed and closed even if a write
# raises part-way through.
with open(txtfile, 'w') as ft:
    for _dt, _day, p, n, d in rows:
        # Tab-separated positives / negatives / deaths, one line per day.
        print(f'{p}\t{n}\t{d}', file=ft)

if write_csv:
    # newline='' lets the csv module control line endings; without it, extra
    # blank rows appear on platforms that translate '\n' to '\r\n'.
    with open(outfile, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(rows)
