# Notebook 001: Scrape Datasources

This notebook builds the data sources listed in `../data/data-inventory.csv` that must be generated by web scraping rather than by direct download.

Similar to "Notebook 000: Download Datasources", consolidated data created by running this notebook will be saved programmatically to the appropriate sub-directory in the project's `../data/raw/` data store.

### Please DO NOT yet run this notebook.

**This notebook is not yet completed. The master branch will be updated with a finished version of this notebook once it is complete.**

In [177]:
import os
import urllib
import urllib.request
import zipfile
from pathlib import PurePath

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [186]:
# set path variables (all relative to the directory this notebook runs from)
# root of the project's data store
DATA_ROOT = '../data'
# raw data files are saved beneath <DATA_ROOT>/raw
parent_dir = os.path.join(DATA_ROOT, 'raw')
# the data inventory CSV is read from directly inside DATA_ROOT
inventory_filepath = os.path.join(DATA_ROOT, 'data-inventory.csv')

In [187]:
# read data inventory to dataframe
# (pandas defaults are used: first row is the header, dtypes are inferred)
inventory_df = pd.read_csv(inventory_filepath)

In [190]:
# view summary of data inventory
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (doing so emits a stray "None" after the report)
inventory_df.info()
inventory_df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 13 columns):
id                  47 non-null int64
category            47 non-null object
access              47 non-null object
source              44 non-null object
directory           47 non-null object
sub-directory       47 non-null object
filename            44 non-null object
zipfile             37 non-null float64
page-url            47 non-null object
data-url            44 non-null object
reference           32 non-null object
description         44 non-null object
access-confirmed    44 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 4.9+ KB
None


Unnamed: 0,id,category,access,source,directory,sub-directory,filename,zipfile,page-url,data-url,reference,description,access-confirmed
0,1,boston property assessments,download,data,raw,property,fy19-assessments,0.0,https://data.boston.gov/dataset/property-asses...,https://data.boston.gov/dataset/e02c44d2-3c64-...,", https://data.boston.gov/dataset/e02c44d2-3c6...","Gives property, or parcel, ownership together ...",2019-11-07


In [136]:
# subset data inventory to include just 'downloads'
# NOTE(review): this notebook is about scraped sources, yet the filter keeps
# the 'download' rows -- confirm that this is the intended access type.
cols = ['sub-directory', 'filename', 'zipfile', 'data-url', 'source']
# a single .loc call selects rows and columns together; .copy() makes the
# subset an independent frame so later edits never touch inventory_df
download_df = inventory_df.loc[inventory_df['access'] == 'download', cols].copy()

In [183]:
# define functions for performing data downloads

def make_download_dict(inventory, parent):
    """Build a nested lookup of download targets from a data inventory.

    Parameters
    ----------
    inventory : pandas.DataFrame
        Inventory rows to process. Must contain the columns
        'sub-directory', 'filename', 'data-url' and 'zipfile'.
        The frame is read only; it is not modified.
    parent : str
        Directory that contains the sub-directories; used as the prefix
        of every generated filepath.

    Returns
    -------
    dict
        ``{subdir: {filename: {'url': ..., 'filepath': ..., 'is_zip': ...}}}``
        where 'filepath' is ``parent/subdir/<filename><suffix>`` and the
        suffix is taken from the last path component of the row's URL.
    """
    # Derive each file's extension from its own URL. Compound extensions are
    # concatenated ('.tar' + '.gz' -> '.tar.gz') and a URL without any
    # extension yields an empty suffix.
    suffixes = inventory['data-url'].apply(
        lambda url: ''.join(PurePath(url).suffixes)
    )

    download_dict = {}
    for subdir in inventory['sub-directory'].unique():
        # every lookup is made against the frame that was passed in, never
        # against notebook-level globals
        in_subdir = inventory['sub-directory'] == subdir
        rows = inventory.loc[in_subdir]
        download_dict[subdir] = {
            filename: {
                'url': url,
                'filepath': os.path.join(parent, subdir, ''.join([filename, suffix])),
                'is_zip': is_zip,
            }
            for filename, url, suffix, is_zip in zip(
                rows['filename'],
                rows['data-url'],
                suffixes.loc[in_subdir],
                rows['zipfile'],
            )
        }

    return download_dict


def make_subdirs(download_dict, parent, verbose=True):
    """Create one directory under ``parent`` for each key of ``download_dict``.

    Parameters
    ----------
    download_dict : dict
        Mapping whose keys are the sub-directory names to create.
    parent : str
        Directory in which the sub-directories are created. It is created
        first (including intermediate directories) if it does not exist.
    verbose : bool, default True
        If True, print a summary of what was created.

    Returns
    -------
    tuple of (list, list)
        ``(new_existing, new_added)``: the listing of ``parent`` after the
        call, and the sorted names that were not present before the call.
    """
    # make sure the parent exists so that it can be listed and written into
    os.makedirs(parent, exist_ok=True)

    # snapshot of the parent's files and directories before creating anything
    existing = os.listdir(parent)

    # create every requested sub-directory that is not already listed
    for subdir in download_dict:
        if subdir not in existing:
            os.makedirs(os.path.join(parent, subdir), exist_ok=True)

    # save the new listing, as well as its difference from the snapshot
    new_existing = os.listdir(parent)
    new_added = sorted(set(new_existing) - set(existing))

    # print summary results
    if verbose:
        if len(new_added) > 0:
            print('The following sub-directories were added to {}:'.format(parent))
            for subdir in new_added:
                print(subdir)
            print()
        else:
            print(
                'No directories have been created. All target directories already '
                'exist locally\n'
            )

    return new_existing, new_added


def download_datafiles(download_dict, parent, exclude_subdir='shapefile', verbose=True):
    """Download every file in ``download_dict`` that is not yet on disk.

    Parameters
    ----------
    download_dict : dict
        ``{subdir: {filename: {'url': ..., 'filepath': ...}}}``.
    parent : str
        Not used by this function; kept so existing calls keep working.
    exclude_subdir : str, iterable of str, or None, default 'shapefile'
        Sub-directory name(s) to skip. Names are compared for equality.
        None means nothing is skipped.
    verbose : bool, default True
        If True, print progress and a final summary.

    Returns
    -------
    dict
        ``{filename: [(local_path, headers)]}`` for each file downloaded by
        this call, as returned by ``urllib.request.urlretrieve``.
    """
    # Normalise the exclusion argument to a set of exact names. Testing
    # `subdir in 'shapefile'` directly would be a substring match and would
    # wrongly skip sub-directories such as 'shape' or 'file'.
    if exclude_subdir is None:
        excluded = set()
    elif isinstance(exclude_subdir, str):
        excluded = {exclude_subdir}
    else:
        excluded = set(exclude_subdir)

    downloaded = dict()

    for subdir, files in download_dict.items():
        if subdir in excluded:
            continue
        for filename, download in files.items():
            # files already on disk are never fetched again
            if os.path.exists(download['filepath']):
                continue
            if verbose:
                print(
                    'Downloading {0} data to {1}'.format(filename, download['filepath'])
                )
            downloaded[filename] = [
                urllib.request.urlretrieve(
                    download['url'],
                    download['filepath'],
                )
            ]

    if verbose:
        if len(downloaded) == 0:
            print(
                'No datafiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} data files have been downloaded and stored locally.\n'.format(
                    len(downloaded)
                )
            )

    return downloaded


def download_shapefiles(download_dict, parent, target_subdir='shapefile', verbose=True):
    """Download and extract the zip archives listed under ``target_subdir``.

    Each archive that is not already on disk is downloaded to its
    'filepath' and extracted into a directory named after the entry's key,
    created next to the archive.

    Parameters
    ----------
    download_dict : dict
        ``{subdir: {filename: {'url': ..., 'filepath': ...}}}``.
    parent : str
        Not used by this function; kept so existing calls keep working.
    target_subdir : str, default 'shapefile'
        Key of ``download_dict`` whose entries are processed. If the key is
        missing, nothing is downloaded.
    verbose : bool, default True
        If True, print progress and a final summary.

    Returns
    -------
    dict
        ``{filename: [(local_path, headers)]}`` for each archive downloaded
        by this call, as returned by ``urllib.request.urlretrieve``.
    """
    downloaded = dict()

    # .get() makes an inventory without any shapefile entries a no-op
    # instead of a KeyError
    targets = download_dict.get(target_subdir, {})

    for filename, download in targets.items():
        archive_path = download['filepath']

        # archives already on disk are neither fetched nor extracted again
        if os.path.exists(archive_path):
            continue

        if verbose:
            print(
                'Downloading {0} shapefile to {1}'.format(filename, archive_path)
            )

        # download shape zipfile to directory
        downloaded[filename] = [
            urllib.request.urlretrieve(
                download['url'],
                archive_path,
            )
        ]

        # create target sub-directory for extracting zipfile
        shapedir = os.path.join(os.path.dirname(archive_path), filename)
        os.makedirs(shapedir, exist_ok=True)

        # extract zipfile to target sub-directory
        # NOTE(review): the archive comes from a remote source; the zipfile
        # docs advise inspecting untrusted archives before extracting them.
        with zipfile.ZipFile(archive_path, 'r') as zipobj:
            if verbose:
                print(
                    '\t...extracting shapefile zip archive to {0}'.format(shapedir)
                )
            zipobj.extractall(shapedir)

    if verbose:
        if len(downloaded) == 0:
            print(
                'No shapefiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} shapefiles have been downloaded and extracted locally.\n'.format(
                    len(downloaded)
                )
            )

    return downloaded

In [None]:
def make_source_inventory():
    """Placeholder with no implementation yet; calling it raises NotImplementedError."""
    raise NotImplementedError()

def make_local_inventory():
    """Placeholder with no implementation yet; calling it raises NotImplementedError."""
    raise NotImplementedError()

def make_local_directory():
    """Placeholder with no implementation yet; calling it raises NotImplementedError."""
    raise NotImplementedError()

def check_local_data():
    """Placeholder with no implementation yet; calling it raises NotImplementedError."""
    raise NotImplementedError()
    
def download_source_data():
    """Placeholder with no implementation yet; calling it raises NotImplementedError."""
    raise NotImplementedError()

def check_additional_sources():
    """Placeholder with no implementation yet; calling it raises NotImplementedError.

    Raises explicitly, like the other unimplemented stubs in this cell, so
    that an accidental call fails loudly instead of silently returning None.
    """
    raise NotImplementedError

In [17]:
# Fetch the dataset landing page that lists the downloadable resources.
url = 'https://data.boston.gov/dataset/property-assessment'
# requests never times out by default; give up after 30 seconds rather than
# letting the cell hang if the server stops responding
my_page = requests.get(url, timeout=30)

In [18]:
# show the HTTP status code returned for the page request
my_page.status_code

200

In [19]:
# parse the raw response body using the 'html.parser' backend
soup = BeautifulSoup(my_page.content, 'html.parser')

In [22]:
# preview only the first 500 characters of the pretty-printed document
soup.prettify()[:500]

'<!DOCTYPE html>\n<!--[if IE 7]> <html lang="en" class="ie ie7"> <![endif]-->\n<!--[if IE 8]> <html lang="en" class="ie ie8"> <![endif]-->\n<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->\n<!--[if gt IE 8]><!-->\n<html lang="en">\n <!--<![endif]-->\n <head>\n  <!--[if lte ie 8]><script type="text/javascript" src="/fanstatic/vendor/:version:2019-08-09T21:00:46/html5.min.js"></script><![endif]-->\n  <link href="/fanstatic/vendor/:version:2019-08-09T21:00:46/select2/select2.css" rel="stylesheet" typ'

In [31]:
# locate the <section id="dataset-resources"> element of the page
dataset_resources = soup.find('section', id='dataset-resources')

In [32]:
# collect every <li class="resource-item"> inside the resources section
resources = dataset_resources.find_all('li', class_='resource-item')

In [38]:
# title attribute of each resource's <a class="heading"> link
resource_names = [
    item.find('a', class_='heading').get('title')
    for item in resources
]

In [40]:
# href of each resource's <a class="btn btn-primary"> link.
# attrs must be a dict that names the attribute; the previous bare
# {'btn btn-primary'} was a set literal that only worked because bs4 treats
# a non-dict attrs value as a search on the class attribute.
resource_urls = [
    resource.find('a', attrs={'class': 'btn btn-primary'}).get('href')
    for resource in resources
]

In [53]:
# file extension (including the leading dot) of each resource URL
resource_exts = [ext for _stem, ext in map(os.path.splitext, resource_urls)]

In [59]:
# label '.csv' and '.txt' resources as 'data'; every other extension is a 'key'
resource_types = [
    'key' if ext not in ('.csv', '.txt') else 'data'
    for ext in resource_exts
]

In [56]:
# local filename per resource: lower-cased title with spaces turned into
# hyphens, followed by the extension taken from the resource URL
resource_filenames = [
    name.lower().replace(' ', '-') + ext
    for name, ext in zip(resource_names, resource_exts)
]

In [61]:
# gather the parallel per-resource lists into one column-oriented mapping
inventory_dict = dict(
    resource=resource_names,
    filename=resource_filenames,
    filetype=resource_types,
    url=resource_urls,
)

In [63]:
# tabulate the scraped resources, one column per key of inventory_dict
# NOTE(review): this rebinds `inventory_df`, which an earlier cell assigned to
# the data inventory read from CSV; any cell run after this one sees the
# scraped-resource frame instead. Consider a distinct name -- TODO confirm
# that nothing further down relies on the current one.
inventory_df = pd.DataFrame(inventory_dict)

In [27]:
# list the href of every <a class="btn btn-primary"> link on the page.
# attrs is written as a dict naming the attribute; a bare {'btn btn-primary'}
# is a set literal that only works through bs4's non-dict fallback.
[a.get('href') for a in soup.find_all('a', attrs={'class': 'btn btn-primary'})]

['https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/d6c1268c-cd83-4dc3-a914-bba1ed59da6d/download/propertyoccupancycodes.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/695a8596-5458-442b-a017-7cd72471aade/download/fy19fullpropassess.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/bac18ae6-b8fd-4cd3-a61c-c5e1a11f716c/download/property-assessment-fy2019-data-key.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/fd351943-c2c6-4630-992d-3f895360febd/download/ast2018full.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/b8e32ddf-671f-4a35-b99f-c060bae958e5/download/property-assessment-fy2018-data-key.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/062fc6fa-b5ff-4270-86cf-202225e40858/download/property-assessment-fy2017.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8f