In [177]:
import os
import urllib
import urllib.request
import zipfile
from pathlib import PurePath

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [186]:
# project data locations, expressed relative to the current working directory
DATA_ROOT = '../data'
inventory_filepath = os.path.join(DATA_ROOT, 'data-inventory.csv')  # data inventory CSV
parent_dir = os.path.join(DATA_ROOT, 'raw')  # root directory for downloaded files

In [187]:
# load the data inventory CSV into a dataframe
inventory_df = pd.read_csv(filepath_or_buffer=inventory_filepath)

In [190]:
# view summary of data inventory
# DataFrame.info() writes its report to stdout and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line to the output
inventory_df.info()
inventory_df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 13 columns):
id                  47 non-null int64
category            47 non-null object
access              47 non-null object
source              44 non-null object
directory           47 non-null object
sub-directory       47 non-null object
filename            44 non-null object
zipfile             37 non-null float64
page-url            47 non-null object
data-url            44 non-null object
reference           32 non-null object
description         44 non-null object
access-confirmed    44 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 4.9+ KB
None


Unnamed: 0,id,category,access,source,directory,sub-directory,filename,zipfile,page-url,data-url,reference,description,access-confirmed
0,1,boston property assessments,download,data,raw,property,fy19-assessments,0.0,https://data.boston.gov/dataset/property-asses...,https://data.boston.gov/dataset/e02c44d2-3c64-...,", https://data.boston.gov/dataset/e02c44d2-3c6...","Gives property, or parcel, ownership together ...",2019-11-07


In [136]:
# subset data inventory to include just 'downloads'
cols = ['sub-directory', 'filename', 'zipfile', 'data-url', 'source']
# one .loc[rows, cols] lookup followed by .copy() yields an independent frame:
# assigning columns to it later cannot raise SettingWithCopyWarning and can
# never write through to inventory_df
download_df = inventory_df.loc[inventory_df['access'] == 'download', cols].copy()

In [191]:
# define functions for performing data downloads

def make_download_dict(inventory, parent):
    """Build a nested lookup of download targets from a data inventory.

    Parameters
    ----------
    inventory : pandas.DataFrame
        Must contain the columns 'sub-directory', 'filename', 'data-url' and
        'zipfile'. The frame is only read; it is never modified.
    parent : str
        Directory under which every sub-directory is located.

    Returns
    -------
    dict
        ``{subdir: {filename: {'url': ..., 'filepath': ..., 'is_zip': ...}}}``.
        'filepath' is ``parent/subdir/filename`` followed by the file
        extension(s) found at the end of the URL. 'is_zip' is the 'zipfile'
        flag converted to bool, with a missing flag counted as False.
        Every non-null sub-directory gets a key; rows that lack a filename or
        a URL are left out because there is nothing to download for them.
    """
    # only rows with a complete sub-directory / filename / URL are downloadable
    usable = inventory.dropna(subset=['sub-directory', 'filename', 'data-url'])

    # one (initially empty) entry per sub-directory named in the inventory
    download_dict = {
        subdir: dict() for subdir in inventory['sub-directory'].dropna().unique()
    }

    for subdir, filename, url, is_zip in zip(
        usable['sub-directory'],
        usable['filename'],
        usable['data-url'],
        usable['zipfile'],
    ):
        # concatenate the suffixes as-is ('.tar' + '.gz'); os.path.join would
        # put a path separator between them and raises when there are none
        suffix = ''.join(PurePath(url).suffixes)
        download_dict[subdir][filename] = {
            'url': url,
            'filepath': os.path.join(parent, subdir, ''.join([filename, suffix])),
            'is_zip': bool(is_zip) if pd.notnull(is_zip) else False,
        }

    return download_dict


def make_subdirs(download_dict, parent, verbose=True):
    """Create the parent directory and one sub-directory per download_dict key.

    Parameters
    ----------
    download_dict : dict
        Mapping whose keys are the sub-directory names to create in ``parent``.
    parent : str
        Directory that holds the sub-directories. It is created, together
        with any missing intermediate directories, when it does not exist.
    verbose : bool, default True
        Print a summary of what was created.

    Returns
    -------
    tuple of (list, list)
        The names found in ``parent`` after the call, and the names of the
        sub-directories created by this call.
    """
    if not os.path.exists(parent):
        # makedirs also builds missing intermediate directories
        os.makedirs(parent)
        # the placeholder file belongs inside the new directory, not in the
        # current working directory
        with open(os.path.join(parent, '.gitkeep'), 'a'):
            pass
        if verbose:
            print('The {0} parent directory has been created.'.format(parent))
            print()

    # create each missing sub-directory, remembering which ones were added
    new_added = []
    for subdir in download_dict:
        subdir_path = os.path.join(parent, subdir)
        if not os.path.exists(subdir_path):
            os.makedirs(subdir_path)
            new_added.append(subdir)

    # listing of the parent directory after any additions
    new_existing = os.listdir(parent)

    # print summary results
    if verbose:
        if len(new_added) > 0:
            print('The following sub-directories were added to {}:'.format(parent))
            for subdir in new_added:
                print(subdir)
            print()
        else:
            print(
                'No directories have been created. All target directories already '
                'exist locally\n'
            )

    return new_existing, new_added


def download_datafiles(download_dict, parent, exclude_subdir='shapefile', verbose=True):
    """Download every data file that is not yet present locally.

    Parameters
    ----------
    download_dict : dict
        ``{subdir: {filename: {'url': ..., 'filepath': ...}}}`` mapping.
    parent : str
        Not used by this function; kept so the signature stays unchanged.
    exclude_subdir : str, iterable of str, or None, default 'shapefile'
        Sub-directory name(s) to skip. Names are matched exactly. None
        excludes nothing.
    verbose : bool, default True
        Print progress and a final summary.

    Returns
    -------
    dict
        ``{filename: [urlretrieve_result]}`` for each file downloaded by this
        call, where the result is the ``(local_path, headers)`` tuple returned
        by ``urllib.request.urlretrieve``.
    """
    # normalise to a set of names so exclusion is an exact match; testing
    # `subdir in 'shapefile'` is a substring test that also hits 'shape', 'file'
    if exclude_subdir is None:
        excluded = set()
    elif isinstance(exclude_subdir, str):
        excluded = {exclude_subdir}
    else:
        excluded = set(exclude_subdir)

    downloaded = dict()

    for subdir, targets in download_dict.items():
        if subdir in excluded:
            continue
        for filename, download in targets.items():
            filepath = download['filepath']
            if os.path.exists(filepath):
                continue
            if verbose:
                print(
                    'Downloading {0} data to {1}'.format(filename, filepath)
                )
            try:
                result = urllib.request.urlretrieve(download['url'], filepath)
            except BaseException:
                # a partial file would be mistaken for a finished download on
                # the next run, so remove it before propagating the error
                if os.path.exists(filepath):
                    os.remove(filepath)
                raise
            downloaded[filename] = [result]

    if verbose:
        if len(downloaded) == 0:
            print(
                'No datafiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} data files have been downloaded and stored locally.\n'.format(
                    len(downloaded)
                )
            )

    return downloaded


def download_shapefiles(download_dict, parent, target_subdir='shapefile', verbose=True):
    """Download shapefile zip archives and extract each into its own directory.

    Parameters
    ----------
    download_dict : dict
        ``{subdir: {filename: {'url': ..., 'filepath': ...}}}`` mapping.
    parent : str
        Not used by this function; kept so the signature stays unchanged.
    target_subdir : str, default 'shapefile'
        Key of ``download_dict`` whose entries are zip archives. When the key
        is absent nothing is downloaded.
    verbose : bool, default True
        Print progress and a final summary.

    Returns
    -------
    dict
        ``{filename: [urlretrieve_result]}`` for each archive downloaded by
        this call. Each archive is extracted into a directory named
        ``filename`` located beside the downloaded zip file.

    Raises
    ------
    zipfile.BadZipFile
        If a downloaded file is not a valid zip archive. The downloaded file
        is removed first so that a later run retries the download.
    """
    downloaded = dict()

    # .get() lets an inventory without shapefile entries pass through quietly
    for filename, download in download_dict.get(target_subdir, {}).items():
        filepath = download['filepath']
        if os.path.exists(filepath):
            continue

        if verbose:
            print(
                'Downloading {0} shapefile to {1}'.format(filename, filepath)
            )

        # the archive is extracted into a directory named after the file
        shapedir = os.path.join(os.path.dirname(filepath), filename)

        try:
            # download shape zipfile to directory
            result = urllib.request.urlretrieve(download['url'], filepath)

            # extract zipfile to target sub-directory
            with zipfile.ZipFile(filepath, 'r') as zipobj:
                if verbose:
                    print(
                        '\t...extracting shapefile zip archive to {0}'.format(shapedir)
                    )
                os.makedirs(shapedir, exist_ok=True)
                zipobj.extractall(shapedir)
        except BaseException:
            # an existing zip makes later runs skip this entry, so a partial
            # or unextractable download is removed before re-raising
            if os.path.exists(filepath):
                os.remove(filepath)
            raise

        downloaded[filename] = [result]

    if verbose:
        if len(downloaded) == 0:
            print(
                'No shapefiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} shapefiles have been downloaded and extracted locally.\n'.format(
                    len(downloaded)
                )
            )

    return downloaded

In [None]:
# map each inventory download to its source URL and local target path
download_dict = make_download_dict(inventory=download_df, parent=parent_dir)

# ensure the parent directory and every target sub-directory exist
listdirs, added = make_subdirs(download_dict=download_dict, parent=parent_dir)


In [184]:
%%time
# report cell execution time for later reference

# create download dictionary
download_dict = make_download_dict(download_df, parent_dir)

# make required sub-directories in parent directory
listdirs, added = make_subdirs(download_dict, parent_dir)

# download data files to target sub-directories
downloaded_data_confirmation = download_datafiles(download_dict, parent_dir)

# download and extract shapefiles to target sub-directories
downloaded_shape_confirmation = download_shapefiles(download_dict, parent_dir) 

No directories have been created. All target directories already exist locally

No datafiles have been downloaded. All target files already exist locally.

