# Quick script to look at all the files tagged as bc in `file_index.csv`


We import the data, filter out every province's data except BC's, look at all the unique file tags, and download all the files for the 3 relevant ones. Adapting this code to download other file tags or another province's data would be trivial.

In [None]:
import pandas as pd
import wget
import re
import os

# Widen pandas' displayed column width to 500 characters so full URLs are shown
pd.set_option('display.max_colwidth', 500)

In [78]:
# Read the file index into a DataFrame, then preview three randomly chosen rows
index_df = pd.read_csv(filepath_or_buffer='file_index.csv')
index_df.sample(n=3)

Unnamed: 0,dir_parent,dir_file,file_name,file_timestamp,file_date,file_date_true,file_size,file_etag,file_etag_duplicate,file_url
3341,bc,public-exposures-flights,public-exposures-flights-tables-Current_2020-10-27_23-04.pdf,2020-10-27_23-04,2020-10-27,2020-10-27,265698,"""473ab1c1034b8b123d9519615b55ab0a""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/public-exposures-flights/public-exposures-flights-tables-Current_2020-10-27_23-04.pdf
31625,ns,serious-outcomes,serious-outcomes_2021-03-26_22-05.json,2021-03-26_22-05,2021-03-26,2021-03-26,1341,"""b0346872b1733df17b088f034ea43852""",1,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/ns/serious-outcomes/serious-outcomes_2021-03-26_22-05.json
4041,bc,regional-exposure-events-interior-webpage,regional-exposure-events-interior-webpage_2020-12-02_23-17.html,2020-12-02_23-17,2020-12-02,2020-12-02,52388,"""31d898046db830da30e2d521d7cc7e0e""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/regional-exposure-events-interior-webpage/regional-exposure-events-interior-webpage_2020-12-02_23-17.html


In [87]:
# From here on we only care about BC, so keep just the rows whose 'dir_parent' is 'bc'
# and preview three of them at random.
bc_data = index_df.loc[index_df['dir_parent'].eq('bc')]
bc_data.sample(n=3)

Unnamed: 0,dir_parent,dir_file,file_name,file_timestamp,file_date,file_date_true,file_size,file_etag,file_etag_duplicate,file_url
2010,bc,case-data,BCCDC_COVID19_Dashboard_Case_Details_2020-12-10_22-56.csv,2020-12-10_22-56,2020-12-10,2020-12-10,2164551,"""a89664569f71f6d34d9dadeb178d40b5""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/case-data/BCCDC_COVID19_Dashboard_Case_Details_2020-12-10_22-56.csv
2818,bc,regional-case-summary,BCCDC_COVID19_Regional_Summary_Data_2021-07-07_22-01.csv,2021-07-07_22-01,2021-07-07,2021-07-07,713046,"""a2ab63de5c15ddae7b4420d100df09ab""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/regional-case-summary/BCCDC_COVID19_Regional_Summary_Data_2021-07-07_22-01.csv
6698,bc,school-exposures-vancouver-coastal-webpage,school-exposures-vancouver-coastal-webpage_2020-12-06_22-43.html,2020-12-06_22-43,2020-12-06,2020-12-06,467930,"""0c6338f875f0a0ae3693731f653bdd2c""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/school-exposures-vancouver-coastal-webpage/school-exposures-vancouver-coastal-webpage_2020-12-06_22-43.html


In [88]:
# Each distinct value of the 'dir_file' column represents a different type of datafile;
# list every value that occurs in the BC subset.
#
# The ones of interest to us are
#
# 1 - case-testing-vaccine-summary-by-CHSA-and-LHA
# 2 - vaccine-doses-by-rha
# 3 - vaccine-doses-by-rha-2

pd.unique(bc_data['dir_file'])

array(['bc-covid-data-webpage', 'case-data', 'laboratory-data',
       'regional-case-summary',
       'case-testing-vaccine-summary-by-CHSA-and-LHA',
       'voc-time-series-by-rha-2', 'public-exposures-webpage',
       'public-exposures-flights',
       'regional-exposure-events-fraser-webpage',
       'regional-exposure-events-interior-webpage',
       'regional-exposure-events-island-webpage',
       'regional-exposure-events-northern-webpage',
       'regional-exposure-events-vancouver-coastal-webpage',
       'school-exposures-fraser-webpage',
       'school-exposures-interior-webpage',
       'school-exposures-island-webpage',
       'school-exposures-northern-webpage',
       'school-exposures-vancouver-coastal-webpage',
       'bc-canada-cumulative-testing-rate', 'case-demographics-by-rha',
       '7-day-and-cumulative-cases-by-hsda',
       'testing-timeseries-by-rha-2', 'testing-timeseries-by-rha',
       'vaccine-doses-by-rha', 'case-time-series-by-hsda',
       'cumulative

In [119]:
# First, let's look at case-testing-vaccine-summary-by-CHSA-and-LHA.
# We check how many data files of this type there are and which file type(s) they use.
# Only rows with file_etag_duplicate == 0 are kept
# (presumably the non-duplicated files -- confirm against the index's documentation).
vaccine_summary_chsa = bc_data[(bc_data['dir_file']=='case-testing-vaccine-summary-by-CHSA-and-LHA') &
                               (bc_data['file_etag_duplicate']==0)]

# Collect the extension of every file name instead of inspecting one random row, so the
# reported type does not vary between runs. splitext splits on the last dot, so a dot
# earlier in a name cannot be mistaken for the extension.
file_type = ', '.join(sorted({os.path.splitext(name)[1].lstrip('.')
                              for name in vaccine_summary_chsa['file_name']}))
# len() gives the row count directly; `.count()[0]` relied on positional lookup in a
# label-indexed Series, which newer pandas versions deprecate.
print(f'There are {len(vaccine_summary_chsa)} rows'
        f' and the filetype is {file_type}. Here is a sample row.')
vaccine_summary_chsa.sample()

There are 10 rows and the filetype is xlsx. Here is a sample row.


Unnamed: 0,dir_parent,dir_file,file_name,file_timestamp,file_date,file_date_true,file_size,file_etag,file_etag_duplicate,file_url
2878,bc,case-testing-vaccine-summary-by-CHSA-and-LHA,BCCDC_COVID19_LHA_CHSA_Data_2021-08-06_22-01.xlsx,2021-08-06_22-01,2021-08-06,2021-08-06,41219,"""ff8f25f3aedb5f15e19830cf80e42c04""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/case-testing-vaccine-summary-by-CHSA-and-LHA/BCCDC_COVID19_LHA_CHSA_Data_2021-08-06_22-01.xlsx


In [143]:
# We download all the files of this type.
# exist_ok=True creates the folder only when it is missing, with no separate existence check.
os.makedirs('vaccine_summary_chsa/', exist_ok=True)
for file_name, file_url in zip(vaccine_summary_chsa['file_name'], vaccine_summary_chsa['file_url']):
    target = f'vaccine_summary_chsa/{file_name}'
    # Skip files that are already on disk so re-running the cell does not download them again.
    if os.path.exists(target):
        continue
    wget.download(file_url,out=target)

In [139]:
# Now let's look at the files tagged "vaccine-doses-by-rha".
# Only rows with file_etag_duplicate == 0 are kept
# (presumably the non-duplicated files -- confirm against the index's documentation).
vaccine_doses_by_rha = bc_data[(bc_data['dir_file']=='vaccine-doses-by-rha') &
                               (bc_data['file_etag_duplicate']==0)]

# Collect the extension of every file name instead of inspecting one random row, so the
# reported type does not vary between runs. splitext splits on the last dot, so a dot
# earlier in a name cannot be mistaken for the extension.
file_type = ', '.join(sorted({os.path.splitext(name)[1].lstrip('.')
                              for name in vaccine_doses_by_rha['file_name']}))
# len() gives the row count directly; `.count()[0]` relied on positional lookup in a
# label-indexed Series, which newer pandas versions deprecate.
print(f'There are {len(vaccine_doses_by_rha)} rows'
        f' and the filetype is {file_type}. Here is a sample row.')
vaccine_doses_by_rha.sample()

There are 105 rows and the filetype is json. Here is a sample row.


Unnamed: 0,dir_parent,dir_file,file_name,file_timestamp,file_date,file_date_true,file_size,file_etag,file_etag_duplicate,file_url
7709,bc,vaccine-doses-by-rha,BC_COVID19Dashboard_Vaccine_Counts_2021-07-07_22-02.json,2021-07-07_22-02,2021-07-07,2021-07-07,9139,"""857d7abd7102b2296dce0f31c5891a1b""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/vaccine-doses-by-rha/BC_COVID19Dashboard_Vaccine_Counts_2021-07-07_22-02.json


In [141]:
# We download all the files of this type.
# exist_ok=True creates the folder only when it is missing, with no separate existence check.
os.makedirs('vaccine_doses_by_rha/', exist_ok=True)
for file_name, file_url in zip(vaccine_doses_by_rha['file_name'], vaccine_doses_by_rha['file_url']):
    target = f'vaccine_doses_by_rha/{file_name}'
    # Skip files that are already on disk so re-running the cell does not download them again.
    if os.path.exists(target):
        continue
    wget.download(file_url,out=target)

In [147]:
# And lastly, we look at the files tagged "vaccine-doses-by-rha-2".
# Only rows with file_etag_duplicate == 0 are kept
# (presumably the non-duplicated files -- confirm against the index's documentation).
vaccine_doses_by_rha_2 = bc_data[(bc_data['dir_file']=='vaccine-doses-by-rha-2') &
                               (bc_data['file_etag_duplicate']==0)]

# Collect the extension of every file name instead of inspecting one random row, so the
# reported type does not vary between runs. splitext splits on the last dot, so a dot
# earlier in a name cannot be mistaken for the extension.
file_type = ', '.join(sorted({os.path.splitext(name)[1].lstrip('.')
                              for name in vaccine_doses_by_rha_2['file_name']}))
# len() gives the row count directly; `.count()[0]` relied on positional lookup in a
# label-indexed Series, which newer pandas versions deprecate.
print(f'There are {len(vaccine_doses_by_rha_2)} rows'
        f' and the filetype is {file_type}. Here is a sample row.')
vaccine_doses_by_rha_2.sample()

There are 1 rows and the filetype is json. Here is a sample row.


Unnamed: 0,dir_parent,dir_file,file_name,file_timestamp,file_date,file_date_true,file_size,file_etag,file_etag_duplicate,file_url
66472,bc,vaccine-doses-by-rha-2,BCCOVID19DashboardVaccineCountsVIEWSTAGING_2021-06-30_22-03.json,2021-06-30_22-03,2021-06-30,2021-06-30,9151,"""1f8e3e771087b4ae5479dc77df3ed69b""",0,https://s3.us-east-2.amazonaws.com/data.opencovid.ca/archive/bc/vaccine-doses-by-rha-2/BCCOVID19DashboardVaccineCountsVIEWSTAGING_2021-06-30_22-03.json


In [148]:
# We download the one file of this type.
# exist_ok=True creates the folder only when it is missing, with no separate existence check.
os.makedirs('vaccine_doses_by_rha_2/', exist_ok=True)
for file_name, file_url in zip(vaccine_doses_by_rha_2['file_name'], vaccine_doses_by_rha_2['file_url']):
    target = f'vaccine_doses_by_rha_2/{file_name}'
    # Skip files that are already on disk so re-running the cell does not download them again.
    if os.path.exists(target):
        continue
    wget.download(file_url,out=target)