In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime, timedelta
import csv
import json

In [2]:
# Both tables below are keyed on a cell's position in the flat sequence of
# <th>/<td> cells of the cases table (see parse_file). Positions 0 and 3 are
# in neither table, so those cells are never inspected.

# Position of a case-count cell -> name of the dataset the count belongs to.
data_fields = {
    # row labelled "New cases in last counted 24 hour period ***"
    9: "staff.on", 10: "staff.off", 11: "student.on", 12: "student.off",
    # row labelled "New cases in last counted 7 day period ***"
    14: "staff7.on", 15: "staff7.off", 16: "student7.on", 17: "student7.off",
    # row labelled "Total cases since 28 Sept 2020 (start of Term 1)"
    19: "stafftotal.on", 20: "stafftotal.off",
    21: "studenttotal.on", 22: "studenttotal.off",
}

# Position of a label cell -> the exact text that cell is expected to contain.
text_fields = {
    # column-group headings
    1: "Staff", 2: "Students",
    # sub-headings: one on/off campus pair under each column group
    4: "On campus *", 5: "Off campus **",
    6: "On campus *", 7: "Off campus **",
    # leading label of each data row
    8: "New cases in last counted 24 hour period ***",
    13: "New cases in last counted 7 day period ***",
    18: "Total cases since 28 Sept 2020 (start of Term 1)",
}
def parse_file(fh):
    """Extract the case counts from one saved copy of the cases page.

    The cases table is flattened into a single sequence of <td>/<th> cells.
    A cell whose position is a key of ``text_fields`` must contain the
    expected label (a guard against the page layout changing); a cell whose
    position is a key of ``data_fields`` is read as an integer count.

    Args:
        fh: The HTML to parse. Passed straight to ``BeautifulSoup``, so it
            may be an open file object or a string/bytes of markup.

    Returns:
        A ``(table, data)`` tuple: the matched ``<table>`` tag and a dict
        mapping every name in ``data_fields`` to its integer count.

    Raises:
        ValueError: If the table cannot be located, a count cell does not
            hold an integer, or the table has too few cells to supply every
            count.
        AssertionError: If a label cell does not hold the expected text.
    """
    soup = BeautifulSoup(fh, 'html.parser')
    table = soup.select_one('#current-confirmed-cases-covid-19 > div.site-content.wrapper > div > div > div > article > div > table')
    if table is None:
        # select_one returns None on no match; fail here with a clear message
        # rather than with an AttributeError on the next line.
        raise ValueError("cases table not found: page structure does not match the expected selector")

    data = {}
    for i, tag in enumerate(table.find_all(["td", "th"])):
        if i not in text_fields and i not in data_fields:
            continue
        # get_text() is used instead of .string because .string is None as
        # soon as a cell contains more than one child node.
        text = tag.get_text().strip()
        if i in text_fields:
            if text != text_fields[i]:
                # Raised explicitly (not via `assert`) so the check keeps its
                # exception type but also runs under `python -O`.
                raise AssertionError(
                    "cell {} should read {!r} but reads {!r}".format(i, text_fields[i], text)
                )
        else:
            try:
                data[data_fields[i]] = int(text)
            except ValueError as exc:
                raise ValueError(
                    "cell {} ({}) does not hold an integer: {!r}".format(i, data_fields[i], text)
                ) from exc

    # A truncated table would otherwise yield a partial dict that only fails
    # later, at the point where a missing dataset name is looked up.
    missing = [name for name in data_fields.values() if name not in data]
    if missing:
        raise ValueError("table is missing cells for: {}".format(", ".join(missing)))

    return table, data

In [3]:
# Build covid.json and covid.csv from the saved page snapshots in ../data.
p = Path('../data')
duplicates = p / 'duplicates'
duplicates.mkdir(exist_ok=True)
last_data = None
dataset_names = ['staff.on', 'staff.off', 'student.on', 'student.off',
                 'staff7.on', 'staff7.off', 'student7.on', 'student7.off',
                 'stafftotal.on', 'stafftotal.off', 'studenttotal.on', 'studenttotal.off']
datasets = {} # For JSON: dataset name -> list of (date, value) pairs
rows = [["date"] + dataset_names] # For CSV: header row, then one row per new snapshot

# glob() yields paths in arbitrary order, but a snapshot is only compared with
# the one processed just before it, so the files must be visited oldest first.
# Sorting by name does that as long as the timestamp in the name is
# zero-padded, as it is in the snapshot names seen so far.
for file in sorted(p.glob("covid-*.html")):
    print("Loading from", file)
    # Each snapshot is attributed to the day before its capture timestamp.
    file_date = datetime.strptime(file.name, "covid-%Y-%m-%dT%H-%M-%S.html").date() - timedelta(days = 1)
    date_label = file_date.strftime("%Y-%m-%d")
    # Open as bytes so BeautifulSoup detects the page encoding itself instead
    # of relying on the platform default text encoding; `with` closes the
    # handle even if parsing raises.
    with file.open("rb") as fh:
        table, data = parse_file(fh)
    if data != last_data:
        print("New data at", file_date)
        row = [date_label]
        for n in dataset_names:
            datasets.setdefault(n, []).append((date_label, data[n]))
            row.append(data[n])
        rows.append(row)
        last_data = data
    else:
        print("File is a duplicate", file.name)
        # Moving duplicates out of the way is deliberately left disabled:
        #file.rename(duplicates / file.name)

with (p / "covid.json").open("w", newline='') as ofh:
    json.dump(datasets, ofh, sort_keys=True, indent=4)

with (p / "covid.csv").open("w", newline='') as csvfile:
    datawriter = csv.writer(csvfile)
    datawriter.writerows(rows)

Loading from ..\data\covid-2020-10-12T20-01-30.html
New data at 2020-10-11
Loading from ..\data\covid-2020-10-13T21-51-01.html
New data at 2020-10-12
Loading from ..\data\covid-2020-10-14T08-51-01.html
New data at 2020-10-13


In [4]:
# Pair every header/data cell of `table` (left over from the last file the
# loading loop processed) with its flat position -- the positions that
# data_fields and text_fields are keyed on.
[(index, cell) for index, cell in enumerate(table.find_all(["td", "th"]))]

[(0, <th scope="col">Â </th>),
 (1, <th colspan="2" rowspan="1" scope="col">Staff</th>),
 (2, <th colspan="2" rowspan="1" scope="col">Students</th>),
 (3, <td>Â </td>),
 (4, <td>On campus *</td>),
 (5, <td>Off campus **</td>),
 (6, <td>On campus *</td>),
 (7, <td>Off campus **</td>),
 (8, <td>New cases in last counted 24 hour period ***</td>),
 (9, <td>1</td>),
 (10, <td>0</td>),
 (11, <td>12</td>),
 (12, <td>7</td>),
 (13, <td>New cases in last counted 7 day period ***</td>),
 (14, <td>5</td>),
 (15, <td>2</td>),
 (16, <td>117</td>),
 (17, <td>12</td>),
 (18, <td>Total cases since 28 Sept 2020 (start of Term 1)</td>),
 (19, <td>8</td>),
 (20, <td>3</td>),
 (21, <td>138</td>),
 (22, <td>24</td>)]