# Download

This notebook downloads the required data files for preprocessing.

## Libraries

In [8]:
import requests
import zipfile

import pandas as pd

from datetime import datetime, timedelta
from IPython.display import display, HTML
from pathlib import Path
from time import time

## Functions

In [9]:
# Downloads a file from a link
def download(url, file, chunk_size=1024 * 1024, timeout=60):
    """Stream the content at ``url`` into the local path ``file``.

    The body is first written to a sibling ``<file>.part`` path and only moved
    onto ``file`` once the whole response has been received. An interrupted
    transfer therefore never leaves a truncated ``file`` behind, which matters
    for callers that skip files that already exist.

    Args:
        url: Link to fetch with an HTTP GET request.
        file: Destination path (``str`` or ``Path``).
        chunk_size: Number of bytes read per iteration while streaming.
        timeout: Seconds ``requests`` waits for the server before giving up.
            This bounds each connect/read wait, not the total transfer time.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the destination cannot be written.
    """
    target = Path(file)
    partial = target.with_name(target.name + '.part')
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size):
                    f.write(chunk)
        # Only a fully received body is promoted to the final name
        partial.replace(target)
    finally:
        # After a successful rename nothing is left; on failure drop the fragment
        if partial.exists():
            partial.unlink()

# Unzips a single compressed .zip file
def unzip(file, folder):
    """Extract the data held in the zip archive ``file`` into ``folder``.

    Extracted members are renamed after the archive itself: the archive name
    minus its final suffix (``traffic.csv.zip`` -> ``traffic.csv``).

    * If the first suffix of the archive name is ``.shp`` (``roads.shp.zip``),
      every file member is extracted and named ``<archive stem>`` followed by
      the member's own suffixes with the text ``.shp`` removed.
    * Otherwise only the first file member is extracted, as ``<archive stem>``.

    Directory entries are ignored in both cases. They hold no data, and
    renaming one would write an empty file in place of the real content.

    Args:
        file: Path to the ``.zip`` archive. Its name must have at least one suffix.
        folder: Directory the extracted file(s) are written to.

    Raises:
        IndexError: If the archive name has no suffix, or a non-``.shp``
            archive contains no files.
        zipfile.BadZipFile: If ``file`` is not a valid zip archive.
    """
    ext = Path(file).suffixes[0].lower()
    stem = Path(file).stem
    with zipfile.ZipFile(file, 'r') as z:
        # Decide what is a directory before any renaming, because is_dir()
        # inspects the (mutable) member filename
        members = [m for m in z.infolist() if not m.is_dir()]
        if ext == '.shp':
            # NOTE(review): a sidecar such as `x.dbf` is written as
            # `<stem>.dbf` where stem still ends in `.shp` (e.g.
            # `roads.shp.dbf`), and the `.shp` removal is case-sensitive.
            # Confirm this naming is what downstream readers expect.
            for target in members:
                zext = ''.join(Path(target.filename).suffixes).replace(ext, '')
                target.filename = stem + zext
                z.extract(target, folder)
        else:
            if not members:
                raise IndexError(f'{file} does not contain any files')
            target = members[0]
            # Overwriting the filename makes extract() write under the new name
            target.filename = stem
            z.extract(target, folder)
        
# Formats human readable sizes in bytes
# https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
def format_size(num, suffix="B"):
    """Render a byte count as a short string using binary (1024-based) prefixes.

    The value is divided by 1024 until its magnitude drops below 1024, then
    printed with one decimal place, e.g. ``1536`` -> ``'1.5 KiB'``. Values too
    large for the ``Zi`` prefix are reported in ``Yi``.

    Args:
        num: Size to format (may be negative).
        suffix: Unit text appended after the prefix.

    Returns:
        The formatted size string.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    position = 0
    while position < len(prefixes):
        if abs(value) < 1024.0:
            return f"{value:3.1f} {prefixes[position]}{suffix}"
        value = value / 1024.0
        position += 1
    # Ran out of prefixes: whatever is left is expressed in the largest unit
    return f"{value:.1f} Yi{suffix}"

# Extracts file info for summary
def file_info(file):
    """Collect timestamps and a human readable size for ``file``.

    Args:
        file: Path of an existing file (``str`` or ``Path``).

    Returns:
        dict with keys ``'created'`` and ``'modified'`` (``datetime`` objects
        in local time) and ``'size'`` (string produced by ``format_size``).

    Raises:
        FileNotFoundError: If ``file`` does not exist.
    """
    stats = Path(file).stat()
    modified = datetime.fromtimestamp(stats.st_mtime)
    # st_birthtime is not provided on every platform (it is missing on Linux),
    # so fall back to st_ctime instead of failing with AttributeError.
    # NOTE: on Unix st_ctime is the last metadata change, not a true creation time.
    try:
        created_ts = stats.st_birthtime
    except AttributeError:
        created_ts = stats.st_ctime
    created = datetime.fromtimestamp(created_ts)
    size = format_size(stats.st_size)
    return {'created': created, 'modified': modified, 'size': size}

## Settings

In [10]:
# Directory (relative to this notebook) that downloaded and unzipped files are written to
folder = '../tmp/downloads'
# Table of data sources, one row per file. Later cells read its `url` and `file`
# columns, and the summary also selects `source` and `source_url`.
data_sources = pd.read_csv('data.csv')

## Run

Run the download process for every data file defined in the settings.

In [11]:
# Make sure the download folder exists, creating missing parent directories too;
# this is a no-op when the folder is already there
Path(folder).mkdir(parents=True, exist_ok=True)

### Download Data

Download each data file from its source URL into ``folder``, skipping files that already exist.

In [12]:
# Announce the download pass and remember when it began
print(f'Starting Downloads ({datetime.now()})...')
start = time()
for row in data_sources.itertuples():

    # Source link and local destination of this data file
    url = row.url
    file = f'{folder}/{row.file}'

    # Anything already on disk is left untouched
    if Path(file).is_file():
        print(f'Skipping {row.file} - file exists ({datetime.now()})...')
        continue

    # Fetch the missing file
    print(f'Downloading {row.file} ({datetime.now()})...')
    download(url, file)
    print(f'Downloaded {row.file} ({datetime.now()})')

# Report completion together with the total time taken
end = time()
elapsed = str(timedelta(seconds=end - start))
print(f'Downloads Complete ({datetime.now()})')
print(f'Elapsed Time ({elapsed})')

Starting Downloads (2022-02-11 23:20:29.572441)...
Downloading toronto.geojson (2022-02-11 23:20:29.574289)...
Downloaded toronto.geojson (2022-02-11 23:20:32.842151)
Downloading centrelines.geojson.zip (2022-02-11 23:20:32.843104)...
Downloaded centrelines.geojson.zip (2022-02-11 23:20:33.781965)
Downloading collisions.geojson (2022-02-11 23:20:33.782693)...
Downloaded collisions.geojson (2022-02-11 23:20:35.333885)
Downloading traffic.csv.zip (2022-02-11 23:20:35.334054)...
Downloaded traffic.csv.zip (2022-02-11 23:20:36.286858)
Downloading autospeed_enforcement.geojson (2022-02-11 23:20:36.288440)...
Downloaded autospeed_enforcement.geojson (2022-02-11 23:20:37.365376)
Downloading watch_your_speed.geojson (2022-02-11 23:20:37.366382)...
Downloaded watch_your_speed.geojson (2022-02-11 23:20:39.217886)
Downloading red_light_cams.geojson (2022-02-11 23:20:39.219002)...
Downloaded red_light_cams.geojson (2022-02-11 23:20:40.353661)
Downloading police.geojson (2022-02-11 23:20:40.354840)

### Unzip Data

Unzip compressed data files.

In [13]:
# Announce the unzip pass and remember when it began
print(f'Unzipping Data ({datetime.now()})...')
start = time()
for row in data_sources.itertuples():

    # Source link (not needed for unzipping, kept so the cell defines the same
    # variables as before) and local path of this data file
    url = row.url
    file = f'{folder}/{row.file}'
    path = Path(file)

    # Only archives need any work
    if path.suffix.lower() != '.zip':
        print(f'Skipping {row.file} - not a zip file ({datetime.now()})...')
        continue

    # An archive counts as unzipped when a file named like it, minus `.zip`, exists
    if path.with_suffix('').is_file():
        print(f'Skipping {row.file} - already unzipped ({datetime.now()})...')
        continue

    # Extract the archive next to the downloaded file
    print(f'Unzipping {row.file}')
    unzip(file, folder)
    print(f'Unzipped {row.file} ({datetime.now()})')

# Report completion together with the total time taken
end = time()
elapsed = str(timedelta(seconds=end - start))
print(f'Unzip Complete ({datetime.now()})')
print(f'Elapsed Time ({elapsed})')

Unzipping Data (2022-02-11 23:21:54.112567)...
Skipping toronto.geojson - not a zip file (2022-02-11 23:21:54.114543)...
Unzipping centrelines.geojson.zip
Unzipped centrelines.geojson.zip (2022-02-11 23:21:54.476952)
Skipping collisions.geojson - not a zip file (2022-02-11 23:21:54.477089)...
Unzipping traffic.csv.zip
Unzipped traffic.csv.zip (2022-02-11 23:21:54.682477)
Skipping autospeed_enforcement.geojson - not a zip file (2022-02-11 23:21:54.682564)...
Skipping watch_your_speed.geojson - not a zip file (2022-02-11 23:21:54.682583)...
Skipping red_light_cams.geojson - not a zip file (2022-02-11 23:21:54.682599)...
Skipping police.geojson - not a zip file (2022-02-11 23:21:54.682615)...
Skipping ambulance.geojson - not a zip file (2022-02-11 23:21:54.682630)...
Skipping fire_hydrants.geojson - not a zip file (2022-02-11 23:21:54.682644)...
Skipping fire_stations.geojson - not a zip file (2022-02-11 23:21:54.682657)...
Skipping renewables.geojson - not a zip file (2022-02-11 23:21:54

## Summary

In [14]:
# Gather on-disk details (size, created, modified) for every listed data file
info = pd.DataFrame(
    [file_info(f'{folder}/{name}') for name in data_sources['file']]
)

# Place those details alongside a copy of the data source listing
summary = pd.concat([data_sources.copy(), info], axis=1)

# Keep the columns of interest in display order and render URLs as clickable links
summary = summary[['file', 'size', 'created', 'modified', 'source', 'source_url', 'url']]
display(HTML(summary.to_html(render_links=True)))

Unnamed: 0,file,size,created,modified,source,source_url,url
0,toronto.geojson,1.9 MiB,2022-02-11 23:20:32.493825,2022-02-11 23:20:32.841482,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/neighbourhoods/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/a083c865-6d60-4d1d-b6c6-b0c8a85f9c15?format=geojson&projection=4326
1,centrelines.geojson.zip,13.6 MiB,2022-02-11 23:20:33.244570,2022-02-11 23:20:33.781208,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/toronto-centreline-tcl/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/ba71b78e-464b-46dd-8185-a0d249fe4857
2,collisions.geojson,248.9 MiB,2022-02-11 23:20:34.417441,2022-02-11 23:20:35.333786,Toronto Police Service Public Safety Data Portal,https://data.torontopolice.on.ca/datasets/TorontoPS::traffic-collisions-asr-t-tbl-001/about,https://opendata.arcgis.com/api/v3/datasets/2a36d61277a84c3eb9d0beebef4c023d_0/downloads/data?format=geojson&spatialRefId=4326
3,traffic.csv.zip,8.1 MiB,2022-02-11 23:20:35.671553,2022-02-11 23:20:36.286167,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/0ffadfc9-b017-44df-a1b1-905591e54caa
4,autospeed_enforcement.geojson,25.4 KiB,2022-02-11 23:20:37.361050,2022-02-11 23:20:37.364880,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/automated-speed-enforcement-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e25e9460-a0e8-469c-b9fb-9a4837ac6c1c?format=geojson&projection=4326
5,watch_your_speed.geojson,320.6 KiB,2022-02-11 23:20:39.112338,2022-02-11 23:20:39.217459,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/school-safety-zone-watch-your-speed-program-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/4e2221b9-da3a-4ef8-b8eb-17e95b7abaa0?format=geojson&projection=4326
6,red_light_cams.geojson,169.6 KiB,2022-02-11 23:20:40.305768,2022-02-11 23:20:40.353193,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/red-light-cameras/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/b57a31a1-5ee6-43e3-bfb9-206ebe93066d?format=geojson&projection=4326
7,police.geojson,9.0 KiB,2022-02-11 23:20:41.183082,2022-02-11 23:20:41.186033,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/police-facility-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/4afc3c66-5614-466a-b714-e8d6336fc6d3?format=geojson&projection=4326
8,ambulance.geojson,37.6 KiB,2022-02-11 23:20:42.176574,2022-02-11 23:20:42.181733,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/ambulance-station-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e49245ba-395c-46bf-bcf8-22fc7024d649?format=geojson&projection=4326
9,fire_hydrants.geojson,10.7 MiB,2022-02-11 23:21:00.196884,2022-02-11 23:21:00.718290,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/fire-hydrants/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/beaaa552-6338-4c81-95be-411e6cef6b89?format=geojson&projection=4326
