In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime, timedelta
import json

In [2]:
# Positions, within the flattened td/th sequence of the cases table, of the
# cells whose integer content is extracted, mapped to the dataset name each
# value is stored under.
data_fields = dict(enumerate(
    ("staff.on", "staff.off", "student.on", "student.off"),
    start=9,
))

# Positions of label cells and the exact text each one must contain;
# parse_file checks these so an unexpected table layout is caught.
text_fields = dict([
    (1, "Staff"),
    (2, "Students"),
    (4, "On campus *"),
    (5, "Off campus **"),
    (6, "On campus *"),
    (7, "Off campus **"),
    (8, "New cases in last counted 24 hour period ***"),
    (13, "New cases in last counted 7 day period ***"),
    (18, "Total cases since 28 Sept 2020 (start of Term 1)"),
])
def parse_file(fh):
    """Extract the case counts from one saved copy of the cases page.

    Args:
        fh: Markup to parse; passed straight to ``BeautifulSoup`` (an open
            file handle or a string of HTML).

    Returns:
        dict mapping every dataset name in ``data_fields`` to the integer
        found in the corresponding table cell.

    Raises:
        ValueError: if the cases table cannot be located, a label cell does
            not contain the text listed in ``text_fields``, a data cell does
            not hold a plain integer, or an expected data cell is absent.
    """
    soup = BeautifulSoup(fh, 'html.parser')
    table = soup.select_one('#current-confirmed-cases-covid-19 > div.site-content.wrapper > div > div > div > article > div > table')
    if table is None:
        # select_one returns None on no match; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError("cases table not found in document")

    data = {}
    for i, tag in enumerate(table.find_all(["td", "th"])):
        if i in text_fields:
            # Explicit check rather than `assert`, so it still runs under
            # `python -O` and reports which cell was unexpected.
            if tag.string != text_fields[i]:
                raise ValueError(
                    "unexpected label in cell %d: expected %r, found %r"
                    % (i, text_fields[i], tag.string)
                )
        elif i in data_fields:
            try:
                data[data_fields[i]] = int(tag.string)
            except (TypeError, ValueError) as exc:
                # tag.string is None when the cell contains nested markup,
                # which makes int() raise TypeError.
                raise ValueError(
                    "cell %d (%s) does not contain an integer: %r"
                    % (i, data_fields[i], tag.string)
                ) from exc

    # A table shorter than expected would otherwise yield a partial dict.
    missing = [name for name in data_fields.values() if name not in data]
    if missing:
        raise ValueError("table is missing data cells for: " + ", ".join(missing))

    return data

In [3]:
# Scan the saved page snapshots, keep one data point per change in the
# published figures, move snapshots that repeat the previous figures into
# the duplicates folder, and export the collected series as JSON.
p = Path('../data')
duplicates = p / 'duplicates'
duplicates.mkdir(exist_ok=True)
last_data = None
dataset_names = ['staff.on', 'staff.off', 'student.on', 'student.off']
datasets = {}
# sorted(): glob yields files in no guaranteed order, but duplicate detection
# compares each snapshot with the one processed just before it, so snapshots
# must be visited chronologically. The timestamped names sort that way.
# Materialising the list also avoids renaming files out of the directory
# while it is still being iterated.
for file in sorted(p.glob("covid-*.html")):
    print("Loading from", file)
    # The data point is dated one day before the snapshot timestamp
    # (presumably the page reports the previous day's figures; confirm).
    file_date = datetime.strptime(file.name, "covid-%Y-%m-%dT%H-%M-%S.html").date() - timedelta(days=1)
    # Context manager so the handle is closed even if parsing fails.
    with file.open() as fh:
        data = parse_file(fh)
    if data != last_data:
        print("New data at", file_date)
        for n in dataset_names:
            ds = datasets.setdefault(n, [])
            ds.append((file_date.strftime("%Y-%m-%d"), data[n]))
        last_data = data
    else:
        print("File is a duplicate", file.name)
        file.rename(duplicates / file.name)

# Write next to the snapshots; same location as before, derived from `p`.
with open(p / "covid.json", "w") as ofh:
    json.dump(datasets, ofh, sort_keys=True, indent=4)

Loading from ..\data\covid-2020-10-12T20-01-30.html
New data at 2020-10-11
Loading from ..\data\covid-2020-10-13T09-51-01.html
File is a duplicate covid-2020-10-13T09-51-01.html
Loading from ..\data\covid-2020-10-13T10-51-01.html
File is a duplicate covid-2020-10-13T10-51-01.html
Loading from ..\data\covid-2020-10-13T11-51-01.html
File is a duplicate covid-2020-10-13T11-51-01.html
Loading from ..\data\covid-2020-10-13T12-51-01.html
File is a duplicate covid-2020-10-13T12-51-01.html
Loading from ..\data\covid-2020-10-13T13-51-01.html
File is a duplicate covid-2020-10-13T13-51-01.html
Loading from ..\data\covid-2020-10-13T14-51-01.html
File is a duplicate covid-2020-10-13T14-51-01.html
Loading from ..\data\covid-2020-10-13T15-51-01.html
File is a duplicate covid-2020-10-13T15-51-01.html
Loading from ..\data\covid-2020-10-13T16-51-01.html
File is a duplicate covid-2020-10-13T16-51-01.html
Loading from ..\data\covid-2020-10-13T17-51-01.html
File is a duplicate covid-2020-10-13T17-51-01.htm