In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime, timedelta
import csv
import json

In [2]:
## Positions below count the <td>/<th> cells of the case table in the order
## table.find_all(["td","th"]) returns them.

## Cells holding a number -> name of the dataset that number belongs to.
## Every data row carries four consecutive counts; the tuple on the left is
## (position of the row's first count, names of its four counts).
data_fields = {
    first_cell + offset: name
    for first_cell, names in (
        (9, ("staff.on", "staff.off", "student.on", "student.off")),
        (14, ("staff7.on", "staff7.off", "student7.on", "student7.off")),
        (19, ("stafftotal.on", "stafftotal.off", "studenttotal.on", "studenttotal.off")),
    )
    for offset, name in enumerate(names)
}

## Cells holding a fixed label -> the exact text expected there.
## parse_file() compares against these to notice a change of table layout.
text_fields = {
    ## column group headers
    1: "Staff",
    2: "Students",
    ## column sub-headers
    4: "On campus *",
    5: "Off campus **",
    6: "On campus *",
    7: "Off campus **",
    ## row labels
    8: "New cases in last counted 24 hour period ***",
    13: "New cases in last counted 7 day period ***",
    18: "Total cases since 28 Sept 2020 (start of Term 1)",
}

## date.weekday() numbers the days from Monday == 0
TUESDAY = 1

def parse_file(fh):
    """Pull the case counts out of one saved copy of the page.

    Parameters
    ----------
    fh
        Open file handle (or markup string); passed unchanged to
        BeautifulSoup.

    Returns
    -------
    (table, data)
        ``table`` is the matched ``<table>`` tag. ``data`` maps every name
        listed in ``data_fields`` to the integer found in that cell.

    Raises
    ------
    ValueError
        If the table cannot be located, or it has too few cells to supply
        every entry of ``data_fields``.
    AssertionError
        If a label cell does not contain the text recorded in
        ``text_fields``.
    """
    soup = BeautifulSoup(fh, 'html.parser')
    table = soup.select_one('#current-confirmed-cases-covid-19 > div.site-content.wrapper > div > div > div > article > div > table')
    if table is None:
        ## select_one() returns None on no match; fail here with a clear
        ## message instead of an AttributeError on the next line
        raise ValueError("case table not found - has the page layout changed?")

    data = {}
    for i, tag in enumerate(table.find_all(["td","th"])):
        if i in text_fields:
            ## Raised explicitly (not via `assert`) so the layout check
            ## still runs under `python -O` and reports what differed
            if tag.string != text_fields[i]:
                raise AssertionError(
                    "cell %d: expected %r but found %r" % (i, text_fields[i], tag.string))
        elif i in data_fields:
            data[data_fields[i]] = int(tag.string)

    ## A truncated table would otherwise hand back a partial dict
    missing = [name for name in data_fields.values() if name not in data]
    if missing:
        raise ValueError("table has no cells for: " + ", ".join(missing))

    return table, data

In [3]:
## Read every saved snapshot in ../data, drop consecutive snapshots whose
## figures are unchanged, and write the resulting series to covid.json
## (one list of (date, value) pairs per dataset) and covid.csv (one row
## per date).
p = Path('../data')
duplicates = p / 'duplicates'
duplicates.mkdir(exist_ok=True)
last_data = None
dataset_names = ['staff.on', 'staff.off', 'student.on', 'student.off',
                 'staff7.on', 'staff7.off', 'student7.on', 'student7.off',
                 'stafftotal.on', 'stafftotal.off', 'studenttotal.on', 'studenttotal.off']

## These figures need smoothing over the weekend
smoothed_names = set(['staff.on', 'staff.off', 'student.on', 'student.off'])

datasets = {} # For JSON
rows = [] # For CSV

## CSV header
rows.append(["date"] + dataset_names)

## glob() promises no particular order, but the duplicate check compares each
## file with the one before it and the output is expected in date order.
## The timestamp in the file name sorts chronologically as plain text.
for file in sorted(p.glob("covid-*.html")):
    print("Loading from", file)
    file_date = datetime.strptime(file.name, "covid-%Y-%m-%dT%H-%M-%S.html").date()
    if file_date.weekday() == 0:
        ## Monday, data is correct as of Friday 5pm
        data_date = file_date - timedelta(days = 3)
    else:
        ## other days, data is correct as of previous day at 5pm
        data_date = file_date - timedelta(days = 1)

    ## Open as bytes so BeautifulSoup works out the page encoding itself
    ## instead of the platform default being applied (which mangles
    ## non-ASCII characters on Windows). `with` closes the file even if
    ## parsing fails.
    with file.open('rb') as fh:
        table, data = parse_file(fh)

    if data != last_data:
        print("New data at", file_date)

        ## JSON data
        for n in dataset_names:
            v = data[n]
            ds = datasets.setdefault(n, [])
            if file_date.weekday() == TUESDAY and n in smoothed_names:
                ## Tuesday contains three days of data so smooth
                ## daily figures over the weekend
                for i in range(2,-1,-1):
                    entry_date = data_date - timedelta(days = i)
                    ds.append((entry_date.strftime("%Y-%m-%d"), v / 3.0))
            else:
                ## Weekly figures don't need smoothing, nor do other days
                ds.append((data_date.strftime("%Y-%m-%d"), v))

        ## CSV data
        new_rows = []
        if file_date.weekday() == TUESDAY:
            ## Tuesday's data contains three days of data
            for i in range(2,-1,-1):
                row = []
                entry_date = data_date - timedelta(days = i)
                row.append(entry_date.strftime("%Y-%m-%d"))
                for n in dataset_names:
                    if n in smoothed_names:
                        ## Smooth daily data over the weekend
                        row.append(data[n] / 3.0)
                    elif i > 0:
                        ## Omit weekly data over the weekend
                        row.append('')
                    else:
                        ## Still output Monday's weekly data
                        row.append(data[n])

                new_rows.append(row)
        else:
            ## Days other than Tuesday are treated normally
            row = []
            row.append(data_date.strftime("%Y-%m-%d"))
            for n in dataset_names:
                row.append(data[n])
            new_rows.append(row)
        rows.extend(new_rows)

        last_data = data
    else:
        print("File is a duplicate", file.name)
        ## Moving duplicates out of the way is currently switched off
        #file.rename(duplicates / file.name)

with open("../data/covid.json", "w", newline='') as ofh:
    json.dump(datasets, ofh, sort_keys=True, indent=4)

with open("../data/covid.csv", "w", newline='') as csvfile:
    datawriter = csv.writer(csvfile)
    datawriter.writerows(rows)

Loading from ..\data\covid-2020-10-12T20-01-30.html
New data at 2020-10-12
Loading from ..\data\covid-2020-10-12T20-42-01.html
File is a duplicate covid-2020-10-12T20-42-01.html
Loading from ..\data\covid-2020-10-12T21-42-01.html
File is a duplicate covid-2020-10-12T21-42-01.html
Loading from ..\data\covid-2020-10-12T22-42-01.html
File is a duplicate covid-2020-10-12T22-42-01.html
Loading from ..\data\covid-2020-10-12T23-42-01.html
File is a duplicate covid-2020-10-12T23-42-01.html
Loading from ..\data\covid-2020-10-13T00-42-02.html
File is a duplicate covid-2020-10-13T00-42-02.html
Loading from ..\data\covid-2020-10-13T01-42-02.html
File is a duplicate covid-2020-10-13T01-42-02.html
Loading from ..\data\covid-2020-10-13T02-42-01.html
File is a duplicate covid-2020-10-13T02-42-01.html
Loading from ..\data\covid-2020-10-13T03-42-01.html
File is a duplicate covid-2020-10-13T03-42-01.html
Loading from ..\data\covid-2020-10-13T04-42-01.html
File is a duplicate covid-2020-10-13T04-42-01.html

In [4]:
## Show each header/data cell of the most recently parsed table next to its
## position; these positions are the keys of data_fields and text_fields
[(position, cell) for position, cell in enumerate(table.find_all(["td","th"]))]

[(0, <th scope="col">Â </th>),
 (1, <th colspan="2" rowspan="1" scope="col">Staff</th>),
 (2, <th colspan="2" rowspan="1" scope="col">Students</th>),
 (3, <td>Â </td>),
 (4, <td>On campus *</td>),
 (5, <td>Off campus **</td>),
 (6, <td>On campus *</td>),
 (7, <td>Off campus **</td>),
 (8, <td>New cases in last counted 24 hour period ***</td>),
 (9, <td>1</td>),
 (10, <td>1</td>),
 (11, <td>21</td>),
 (12, <td>7</td>),
 (13, <td>New cases in last counted 7 day period ***</td>),
 (14, <td>6</td>),
 (15, <td>3</td>),
 (16, <td>129</td>),
 (17, <td>17</td>),
 (18, <td>Total cases since 28 Sept 2020 (start of Term 1)</td>),
 (19, <td>9</td>),
 (20, <td>4</td>),
 (21, <td>159</td>),
 (22, <td>31</td>)]

In [5]:
## Day offsets used when spreading Tuesday's figures back over the weekend
list(reversed(range(3)))

[2, 1, 0]