# Download

This notebook downloads the required data files for preprocessing.

## Settings

In [1]:
# Directory the downloaded (and unzipped) files are written to; the relative
# path is resolved against the notebook's working directory
folder = '../tmp/downloads'
# CSV file read with pandas to obtain the list of data sources; the cells below
# use its `file` and `url` columns (and `source` / `source_url` in the summary)
data_sources_file = 'data.csv'

## Libraries

In [2]:
import requests
import zipfile

import pandas as pd

from datetime import datetime, timedelta
from IPython.display import display, HTML
from pathlib import Path
from time import time

## Functions

In [3]:
# Downloads a file from a link
def download(url, file, timeout=(30, 300), chunk_size=1024 * 1024):
    """Stream the content at ``url`` into ``file``.

    The body is first written to a sibling ``<file>.part`` and only renamed to
    ``file`` once the transfer has completed. A failed or interrupted download
    therefore never leaves a truncated file at the final path, which a later
    "skip if the file exists" check would mistake for a finished download.

    Args:
        url: Address to fetch with an HTTP GET request.
        file: Destination path (str or Path); overwritten if it already exists.
        timeout: Passed to ``requests.get``; either a single number of seconds
            or a ``(connect, read)`` tuple. Prevents hanging forever on a
            stalled server.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial = Path(f'{file}.part')
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size):
                    f.write(chunk)
        # Publish the finished download under its final name
        partial.replace(file)
    except BaseException:
        # Also covers KeyboardInterrupt (interrupting the kernel): remove the
        # incomplete file, then let the original error propagate
        if partial.exists():
            partial.unlink()
        raise

# Unzips a single compressed .zip file
def unzip(file, folder):
    """Extract the archive ``file`` into ``folder``, renaming what is extracted.

    The first suffix of the archive name (lower-cased) selects the mode:

    * ``.shp``: every member is extracted and named ``<archive stem>`` plus the
      member's own suffixes with the ``.shp`` text removed.
    * anything else: only the first member is extracted, named after the
      archive stem (the archive name without its last suffix).

    Args:
        file: Path to the .zip archive.
        folder: Directory the extracted file(s) are written to.
    """
    archive = Path(file)
    first_suffix = archive.suffixes[0].lower()
    base_name = archive.stem

    with zipfile.ZipFile(file, 'r') as bundle:
        members = bundle.infolist()

        if first_suffix != '.shp':
            # Single-file archive: take the first entry and name it after the archive
            member = members[0]
            member.filename = base_name
            bundle.extract(member, folder)
            return

        # Multi-file archive: keep each member's extension, swap in the archive stem
        for member in members:
            member_suffix = ''.join(Path(member.filename).suffixes)
            member.filename = base_name + member_suffix.replace(first_suffix, '')
            bundle.extract(member, folder)
        
# Formats human readable sizes in bytes
# https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
def format_size(num, suffix="B"):
    """Render ``num`` as a string with a binary (1024-based) unit prefix.

    Args:
        num: Size to format (may be negative).
        suffix: Text appended after the unit prefix, "B" by default.

    Returns:
        The scaled value with one decimal, a space, the prefix and the suffix,
        e.g. ``"1.5 KiB"``. Values of 1024**8 and above are reported in "Yi".
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    for prefix in prefixes:
        # Stop at the first unit in which the magnitude drops below 1024
        if abs(value) < 1024.0:
            return f"{value:3.1f} {prefix}{suffix}"
        value = value / 1024.0
    # Larger than every listed prefix
    return f"{value:.1f} Yi{suffix}"

# Extracts file info for summary
def file_info(file):
    """Collect creation time, modification time and size of ``file``.

    Args:
        file: Path (str or Path) of an existing file.

    Returns:
        dict with keys ``'created'`` and ``'modified'`` (naive local
        ``datetime`` objects) and ``'size'`` (human readable string from
        ``format_size``).

    Raises:
        OSError: If ``file`` cannot be stat'ed (e.g. it does not exist).
    """
    stats = Path(file).stat()
    modified = datetime.fromtimestamp(stats.st_mtime)
    # st_birthtime is not provided on every platform (e.g. it is missing on
    # Linux). Fall back to st_ctime there; note that on Unix st_ctime is the
    # last metadata change, so it only approximates the creation time.
    try:
        created_ts = stats.st_birthtime
    except AttributeError:
        created_ts = stats.st_ctime
    created = datetime.fromtimestamp(created_ts)
    size = format_size(stats.st_size)
    return {'created': created, 'modified': modified, 'size': size}

## Run

Run the download process for every data file listed in the data sources file defined in the settings.

In [4]:
# Make sure the download folder exists (missing parent folders are created too)
Path(folder).mkdir(exist_ok=True, parents=True)

# Load the table of data sources
data_sources = pd.read_csv(filepath_or_buffer=data_sources_file)

### Download Data

Download data from the source URLs to ``folder``, skipping files that already exist.

In [5]:
# Start download
print(f'Starting Downloads ({datetime.now()})...')
start = time()
for row in data_sources.itertuples():

    # Source address and destination path of this data file
    url = row.url
    file = f'{folder}/{row.file}'

    # Nothing to do when the file is already present
    if Path(file).is_file():
        print(f'Skipping {row.file} - file exists ({datetime.now()})...')
        continue

    # Otherwise fetch it
    print(f'Downloading {row.file} ({datetime.now()})...')
    download(url, file)
    print(f'Downloaded {row.file} ({datetime.now()})')

# End downloads and report the elapsed time
end = time()
elapsed = str(timedelta(seconds=end - start))
print(f'Downloads Complete ({datetime.now()})')
print(f'Elapsed Time ({elapsed})')

Starting Downloads (2022-02-17 21:30:27.756153)...
Downloading toronto.geojson (2022-02-17 21:30:27.757038)...
Downloaded toronto.geojson (2022-02-17 21:30:29.682642)
Downloading centrelines.geojson.zip (2022-02-17 21:30:29.683418)...
Downloaded centrelines.geojson.zip (2022-02-17 21:30:32.935229)
Downloading collisions.geojson (2022-02-17 21:30:32.935923)...
Downloaded collisions.geojson (2022-02-17 21:30:34.483351)
Downloading traffic.csv.zip (2022-02-17 21:30:34.483510)...
Downloaded traffic.csv.zip (2022-02-17 21:30:35.178843)
Downloading autospeed_enforcement.geojson (2022-02-17 21:30:35.179488)...
Downloaded autospeed_enforcement.geojson (2022-02-17 21:30:35.871875)
Downloading watch_your_speed.geojson (2022-02-17 21:30:35.872004)...
Downloaded watch_your_speed.geojson (2022-02-17 21:30:37.200850)
Downloading red_light_cams.geojson (2022-02-17 21:30:37.202020)...
Downloaded red_light_cams.geojson (2022-02-17 21:30:38.170317)
Downloading police.geojson (2022-02-17 21:30:38.171532)

### Unzip Data

Unzip compressed data files.

In [6]:
# Unzip data files
print(f'Unzipping Data ({datetime.now()})...')
start = time()
for row in data_sources.itertuples():

    # Path of the downloaded data file
    file = f'{folder}/{row.file}'
    path = Path(file)

    # Only .zip archives need to be extracted
    if path.suffix.lower() != '.zip':
        print(f'Skipping {row.file} - not a zip file ({datetime.now()})...')
        continue

    # The archive counts as unzipped when a file with the archive's name minus
    # its .zip suffix already exists
    if path.with_suffix('').is_file():
        print(f'Skipping {row.file} - already unzipped ({datetime.now()})...')
    else:
        print(f'Unzipping {row.file}')
        unzip(file, folder)
        print(f'Unzipped {row.file} ({datetime.now()})')

# End unzip and report the elapsed time
end = time()
elapsed = str(timedelta(seconds=end - start))
print(f'Unzip Complete ({datetime.now()})')
print(f'Elapsed Time ({elapsed})')

Unzipping Data (2022-02-17 21:31:53.887005)...
Skipping toronto.geojson - not a zip file (2022-02-17 21:31:53.888571)...
Unzipping centrelines.geojson.zip
Unzipped centrelines.geojson.zip (2022-02-17 21:31:54.250943)
Skipping collisions.geojson - not a zip file (2022-02-17 21:31:54.251081)...
Unzipping traffic.csv.zip
Unzipped traffic.csv.zip (2022-02-17 21:31:54.458879)
Skipping autospeed_enforcement.geojson - not a zip file (2022-02-17 21:31:54.458961)...
Skipping watch_your_speed.geojson - not a zip file (2022-02-17 21:31:54.458982)...
Skipping red_light_cams.geojson - not a zip file (2022-02-17 21:31:54.458996)...
Skipping police.geojson - not a zip file (2022-02-17 21:31:54.459012)...
Skipping ambulance.geojson - not a zip file (2022-02-17 21:31:54.459034)...
Skipping fire_hydrants.geojson - not a zip file (2022-02-17 21:31:54.459054)...
Skipping fire_stations.geojson - not a zip file (2022-02-17 21:31:54.459068)...
Skipping renewables.geojson - not a zip file (2022-02-17 21:31:54

## Summary

In [7]:
# Work on a copy so data_sources itself stays unchanged
summary = data_sources.copy()

# Gather created/modified/size for each file and append them as new columns
info = pd.DataFrame([file_info(f'{folder}/{name}') for name in summary['file']])
summary = pd.concat([summary, info], axis=1)

# Put the columns in display order and render the table with clickable links
column_order = ['file', 'size', 'created', 'modified', 'source', 'source_url', 'url']
summary = summary[column_order]
display(HTML(summary.to_html(render_links=True)))

Unnamed: 0,file,size,created,modified,source,source_url,url
0,toronto.geojson,1.9 MiB,2022-02-17 21:30:29.540496,2022-02-17 21:30:29.682087,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/neighbourhoods/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/a083c865-6d60-4d1d-b6c6-b0c8a85f9c15?format=geojson&projection=4326
1,centrelines.geojson.zip,13.6 MiB,2022-02-17 21:30:30.168185,2022-02-17 21:30:32.934665,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/toronto-centreline-tcl/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/ba71b78e-464b-46dd-8185-a0d249fe4857
2,collisions.geojson,248.9 MiB,2022-02-17 21:30:33.537429,2022-02-17 21:30:34.483262,Toronto Police Service Public Safety Data Portal,https://data.torontopolice.on.ca/datasets/TorontoPS::traffic-collisions-asr-t-tbl-001/about,https://opendata.arcgis.com/api/v3/datasets/2a36d61277a84c3eb9d0beebef4c023d_0/downloads/data?format=geojson&spatialRefId=4326
3,traffic.csv.zip,8.1 MiB,2022-02-17 21:30:34.869590,2022-02-17 21:30:35.178555,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/0ffadfc9-b017-44df-a1b1-905591e54caa
4,autospeed_enforcement.geojson,25.4 KiB,2022-02-17 21:30:35.870656,2022-02-17 21:30:35.871788,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/automated-speed-enforcement-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e25e9460-a0e8-469c-b9fb-9a4837ac6c1c?format=geojson&projection=4326
5,watch_your_speed.geojson,320.6 KiB,2022-02-17 21:30:37.170878,2022-02-17 21:30:37.200416,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/school-safety-zone-watch-your-speed-program-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/4e2221b9-da3a-4ef8-b8eb-17e95b7abaa0?format=geojson&projection=4326
6,red_light_cams.geojson,169.6 KiB,2022-02-17 21:30:38.125359,2022-02-17 21:30:38.169757,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/red-light-cameras/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/b57a31a1-5ee6-43e3-bfb9-206ebe93066d?format=geojson&projection=4326
7,police.geojson,9.0 KiB,2022-02-17 21:30:38.811707,2022-02-17 21:30:38.816532,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/police-facility-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/4afc3c66-5614-466a-b714-e8d6336fc6d3?format=geojson&projection=4326
8,ambulance.geojson,37.6 KiB,2022-02-17 21:30:39.462448,2022-02-17 21:30:39.468650,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/ambulance-station-locations/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/e49245ba-395c-46bf-bcf8-22fc7024d649?format=geojson&projection=4326
9,fire_hydrants.geojson,10.7 MiB,2022-02-17 21:30:57.694595,2022-02-17 21:30:58.803919,City of Toronto Open Data Portal,https://open.toronto.ca/dataset/fire-hydrants/,https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/beaaa552-6338-4c81-95be-411e6cef6b89?format=geojson&projection=4326
