In [177]:
import os
import urllib
import requests
import zipfile
from pathlib import PurePath

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [186]:
# set path variables
DATA_ROOT = '../data'
parent_dir = os.path.join(DATA_ROOT, 'raw')
inventory_filepath = os.path.join(DATA_ROOT, 'data-inventory.csv')

In [187]:
# read data inventory to dataframe
inventory_df = pd.read_csv(inventory_filepath)

In [190]:
# view summary of data inventory
print(inventory_df.info())
inventory_df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 13 columns):
id                  47 non-null int64
category            47 non-null object
access              47 non-null object
source              44 non-null object
directory           47 non-null object
sub-directory       47 non-null object
filename            44 non-null object
zipfile             37 non-null float64
page-url            47 non-null object
data-url            44 non-null object
reference           32 non-null object
description         44 non-null object
access-confirmed    44 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 4.9+ KB
None


Unnamed: 0,id,category,access,source,directory,sub-directory,filename,zipfile,page-url,data-url,reference,description,access-confirmed
0,1,boston property assessments,download,data,raw,property,fy19-assessments,0.0,https://data.boston.gov/dataset/property-asses...,https://data.boston.gov/dataset/e02c44d2-3c64-...,", https://data.boston.gov/dataset/e02c44d2-3c6...","Gives property, or parcel, ownership together ...",2019-11-07


In [136]:
# subset data inventory to include just 'downloads'
cols = ['sub-directory', 'filename', 'zipfile', 'data-url', 'source']
download_df = inventory_df.loc[inventory_df['access']=='download'][cols]

In [183]:
# define functions for performing data downloads

def make_download_dict(inventory, parent):
    """
    """
    subdirs = list(set(inventory['sub-directory']))
    inventory['file-type'] = download_df['data-url'].apply(
        lambda x: os.path.join(*PurePath(x).suffixes)
    )
    
    download_dict = {
        subdir: {
            filename: {
                'url': url,
                'filepath': os.path.join(parent, subdir, ''.join([filename, suffix])),
                'is_zip': is_zip
            }
            for filename, url, suffix, is_zip in zip(
                inventory.loc[inventory_df['sub-directory'] == subdir]['filename'],
                inventory.loc[inventory_df['sub-directory'] == subdir]['data-url'],
                inventory.loc[inventory_df['sub-directory'] == subdir]['file-type'],
                inventory.loc[inventory_df['sub-directory'] == subdir]['zipfile'],
            )
        } for subdir in subdirs
    }
    
    return download_dict


def make_subdirs(download_dict, parent, verbose=True):
    """
    """
    # create list of current top-level files and directories
    existing = os.listdir(parent)

    # check for ./data/ dir and create if it doesn't exist
    [
        os.mkdir(os.path.join(parent, subdir))
        for subdir in download_dict.keys() if not subdir in existing 
    ]
    
    # save new list of files and directories, as well is difference
    new_existing = os.listdir(parent)
    new_added = list(set(new_existing) - set(existing))
    
    # print summary results
    if verbose:
        if len(new_added) > 0:
            print('The following sub-directories were added to {}:'.format(parent))
            for subdir in new_added:
                print(subdir)
            print()
        else:
            print(
                'No directories have been created. All target directories already '\
                'exist locally\n'
            )
    
    return new_existing, new_added


def download_datafiles(download_dict, parent, exclude_subdir='shapefile', verbose=True):
    """
    """
    subdirs = [
        subdir for subdir in list(download_dict.keys())
        if subdir not in exclude_subdir
    ]
    downloaded = dict()
    
    for subdir in subdirs:
        for filename, download in download_dict[subdir].items():
            if not os.path.exists(download['filepath']):
                if verbose:
                    print(
                        'Downloading {0} data to {1}'.format(filename, download['filepath'])
                    )                
                downloaded[filename] = [
                    urllib.request.urlretrieve(
                        download['url'],
                        download['filepath'],
                    )
                ]
    
    if verbose:
        if len(downloaded)==0:
            print(
                'No datafiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} data files have been downloaded and stored locally.\n'.format(
                    len(downloaded)
                )
            )
    
    return downloaded


def download_shapefiles(download_dict, parent, target_subdir='shapefile', verbose=True):
    """
    """
    downloaded = dict()
    
    for filename, download in download_dict[target_subdir].items():
        if not os.path.exists(download['filepath']):
            if verbose:
                print(
                    'Downloading {0} shapefile to {1}'.format(filename, download['filepath'])
                )                
            
            # download shape zipfile to directory
            downloaded[filename] = [
                urllib.request.urlretrieve(
                    download['url'],
                    download['filepath'],
                )
            ]
            
            # create target sub-directory for extracting zipfile
            shapedir = os.path.join(os.path.dirname(download['filepath']), filename)
            if not os.path.exists(shapedir):
                os.mkdir(shapedir)
            
            # extract zipfile to target sub-directory
            with zipfile.ZipFile(download['filepath'], 'r') as zipobj:

                if verbose:
                    print(
                        '\t...extracting shapefile zip archive to {0}'.format(shapedir)
                    )                

                # extract all files
                zipobj.extractall(shapedir)

    if verbose:
        if len(downloaded)==0:
            print(
                'No shapefiles have been downloaded. All target files already exist locally.\n'
            )
        else:
            print(
                '{0} shapefiles have been downloaded and extracted locally.\n'.format(
                    len(downloaded)
                )
            )
            
    return downloaded

In [184]:
# create download dictionary
download_dict = make_download_dict(download_df, parent_dir)

# make required sub-directories in parent directory
listdirs, added = make_subdirs(download_dict, parent_dir)

# download data files to target sub-directories
downloaded_data_confirmation = download_datafiles(download_dict, parent_dir)

# download and extract shapefiles to target sub-directories
downloaded_shape_confirmation = download_shapefiles(download_dict, parent_dir) 

No directories have been created. All target directories already exist locally

No datafiles have been downloaded. All target files already exist locally.



In [154]:
exclude = ['shapefile', 'crime']
list_keys = list(download_dict.keys())
[
    key for key in list_keys
    if key not in exclude
]


['bpd-fio', 'boston', 'property']

In [None]:
def make_source_inventory():
    """"""
    raise NotImplementedError

def make_local_inventory():
    """"""
    raise NotImplementedError

def make_local_directory():
    """"""
    raise NotImplementedError

def check_local_data():
    """"""
    raise NotImplementedError
    
def download_source_data():
    """"""
    raise NotImplementedError

def check_additional_sources():
    """"""

In [17]:
url = 'https://data.boston.gov/dataset/property-assessment'
my_page = requests.get(url)

In [18]:
my_page.status_code

200

In [19]:
soup = BeautifulSoup(my_page.content, 'html.parser')

In [22]:
soup.prettify()[:500]

'<!DOCTYPE html>\n<!--[if IE 7]> <html lang="en" class="ie ie7"> <![endif]-->\n<!--[if IE 8]> <html lang="en" class="ie ie8"> <![endif]-->\n<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->\n<!--[if gt IE 8]><!-->\n<html lang="en">\n <!--<![endif]-->\n <head>\n  <!--[if lte ie 8]><script type="text/javascript" src="/fanstatic/vendor/:version:2019-08-09T21:00:46/html5.min.js"></script><![endif]-->\n  <link href="/fanstatic/vendor/:version:2019-08-09T21:00:46/select2/select2.css" rel="stylesheet" typ'

In [31]:
dataset_resources = soup.find('section', attrs={'id':'dataset-resources'})

In [32]:
resources = dataset_resources.find_all('li', attrs={'class': 'resource-item'})

In [38]:
resource_names = [resource.find('a', attrs={'class': 'heading'}).get('title') for resource in resources]

In [40]:
resource_urls = [resource.find('a', attrs={'btn btn-primary'}).get('href') for resource in resources]

In [53]:
resource_exts  = [os.path.splitext(resource_url)[1] for resource_url in resource_urls]

In [59]:
resource_types = ['data' if resource_ext in ['.csv', '.txt'] else 'key' for resource_ext in resource_exts]

In [56]:
resource_filenames = [
    ''.join([resource.lower().replace(' ', '-'), resource_ext])
    for resource, resource_ext
    in zip(resource_names, resource_exts)
]

In [61]:
inventory_dict = {
    'resource': resource_names,
    'filename': resource_filenames,
    'filetype': resource_types,
    'url': resource_urls,
}

In [63]:
inventory_df = pd.DataFrame(inventory_dict)

In [27]:
[a.get('href') for a in soup.find_all('a', attrs={'btn btn-primary'})]

['https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/d6c1268c-cd83-4dc3-a914-bba1ed59da6d/download/propertyoccupancycodes.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/695a8596-5458-442b-a017-7cd72471aade/download/fy19fullpropassess.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/bac18ae6-b8fd-4cd3-a61c-c5e1a11f716c/download/property-assessment-fy2019-data-key.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/fd351943-c2c6-4630-992d-3f895360febd/download/ast2018full.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/b8e32ddf-671f-4a35-b99f-c060bae958e5/download/property-assessment-fy2018-data-key.pdf',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/062fc6fa-b5ff-4270-86cf-202225e40858/download/property-assessment-fy2017.csv',
 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8f

In [36]:
DATA_ROOT = '../data/raw/'
raw_dir = 'bos_analyze'
filename = 'SAMaddresses'
data_url = 'http://bostonopendata-boston.opendata.arcgis.com/datasets/b6bffcace320448d96bb84eabb8a075f_0.csv'
directories = os.listdir(DATA_ROOT)

orig_filename = os.path.basename(data_url)
new_filename = ''.join([filename, os.path.splitext(orig_filename)[1]])
filepath_sam = os.path.join(DATA_ROOT, raw_dir, new_filename)

# check for bos_analyze
if not raw_dir in directories:
    os.mkdir(os.path.join(DATA_ROOT, raw_dir))

    local_filename, headers = urllib.request.urlretrieve(
        data_url,
        filepath
    )

['../data/raw/',
 '../data/raw/bos_analyze',
 '../data/raw/bos_analyze/property_assess',
 '../data/raw/census']

In [43]:
DATA_ROOT = '../data/raw/'
raw_dir = 'bos_analyze/property_assess'
filename = 'fy19fullpropassess'
data_url = 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/'\
'resource/695a8596-5458-442b-a017-7cd72471aade/download/fy19fullpropassess.csv'
directories = os.listdir(DATA_ROOT)

orig_filename = os.path.basename(data_url)
new_filename = ''.join([filename, os.path.splitext(orig_filename)[1]])
filepath_prop = os.path.join(DATA_ROOT, raw_dir, new_filename)

# check for bos_analyze
if not raw_dir in [i[0] for i in os.walk(DATA_ROOT)]:
    os.mkdir(os.path.join(DATA_ROOT, raw_dir))
    local_filename, headers = urllib.request.urlretrieve(
        data_url,
        filepath
    )

FileExistsError: [Errno 17] File exists: '../data/raw/bos_analyze/property_assess'

In [51]:
df_sam = pd.read_csv(filepath_sam, dtype=str)
df_prop = pd.read_csv(filepath_prop, dtype=str)

In [52]:
print(df_prop.info())
df_prop.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174668 entries, 0 to 174667
Data columns (total 75 columns):
PID                174668 non-null object
CM_ID              82038 non-null object
GIS_ID             174668 non-null object
ST_NUM             174668 non-null object
ST_NAME            174668 non-null object
ST_NAME_SUF        174129 non-null object
UNIT_NUM           65700 non-null object
ZIPCODE            174661 non-null object
PTYPE              174668 non-null object
LU                 174668 non-null object
OWN_OCC            174668 non-null object
OWNER              174668 non-null object
MAIL_ADDRESSEE     174668 non-null object
MAIL_ADDRESS       174668 non-null object
MAIL CS            174668 non-null object
MAIL_ZIPCODE       174668 non-null object
AV_LAND            174668 non-null object
AV_BLDG            174668 non-null object
AV_TOTAL           174668 non-null object
GROSS_TAX          174668 non-null object
LAND_SF            169227 non-null object
YR_BUILT 

Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,LU,...,U_BTH_STYLE2,U_BTH_STYLE3,U_KITCH_TYPE,U_KITCH_STYLE,U_HEAT_TYP,U_AC,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW
0,502550008,502550000,502550000,87,BEACON,ST,2-F,2108,102,CD,...,M,,O,M,W,N,1,N,G,A
1,502550010,502550000,502550000,87,BEACON,ST,2-R,2108,102,CD,...,,,O,S,W,N,1,N,A,A
2,502550012,502550000,502550000,87,BEACON,ST,3-F,2108,102,CD,...,S,,O,M,W,N,1,N,A,G
3,502550014,502550000,502550000,87,BEACON,ST,3-R,2108,102,CD,...,,,O,N,W,N,1,S,A,G
4,502550016,502550000,502550000,87,BEACON,ST,4,2108,102,CD,...,M,M,O,L,W,C,2,N,E,G


In [159]:
df_prop['ZIPCODE'] = '0' + df_prop['ZIPCODE']
df_prop['is_prop'] = 1
df_sam['is_sam'] = 1
df_sam['GIS_ID'] = df_sam['PARCEL'].copy().str[1:]

In [160]:
print(df_sam.info())
df_sam.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398702 entries, 0 to 398701
Data columns (total 31 columns):
X                       398702 non-null object
Y                       398702 non-null object
SAM_ADDRESS_ID          398702 non-null object
RELATIONSHIP_TYPE       398702 non-null object
BUILDING_ID             398702 non-null object
FULL_ADDRESS            398702 non-null object
STREET_NUMBER           398702 non-null object
IS_RANGE                398702 non-null object
RANGE_FROM              26175 non-null object
RANGE_TO                26175 non-null object
UNIT                    270100 non-null object
FULL_STREET_NAME        398702 non-null object
STREET_ID               398702 non-null object
STREET_PREFIX           398702 non-null object
STREET_BODY             398702 non-null object
STREET_SUFFIX_ABBR      394519 non-null object
STREET_FULL_SUFFIX      394519 non-null object
STREET_SUFFIX_DIR       398702 non-null object
STREET_NUMBER_SORT      398702 non-null objec

Unnamed: 0,X,Y,SAM_ADDRESS_ID,RELATIONSHIP_TYPE,BUILDING_ID,FULL_ADDRESS,STREET_NUMBER,IS_RANGE,RANGE_FROM,RANGE_TO,...,X_COORD,Y_COORD,SAM_STREET_ID,WARD,PRECINCT_WARD,PARCEL,is_sam,PID,CM_ID,GIS_ID
0,-71.1250365476,42.2506265523,1,1,100778,6-10 A St,6-10,1,6.0,10.0,...,757684.428549561,2916575.26204486,1,18,1818,1809309000,1,809309000,809309000,809309000
1,-71.1248914926,42.2503284233,2,1,126344,15 A St,15,0,,,...,757724.178237994,2916466.7927758,1,18,1818,1809331000,1,809331000,809331000,809331000
2,-71.125400606,42.2504686502,4,1,100797,7 A St,7,0,,,...,757586.116749272,2916517.2874641,1,18,1818,1809337000,1,809337000,809337000,809337000
3,-71.0568005885,42.3408886702,11,1,154909,10 A St,10,0,,,...,775987.55932641,2949556.68074288,2,6,601,600090000,1,600090000,600090000,600090000
4,-71.0531485883,42.3448456709,16,1,141250,172-174 A St,172-174,1,172.0,174.0,...,776967.328447327,2951003.83399223,2,6,601,602743000,1,602743000,602743000,602743000


In [161]:
df_sam_unique = df_sam.groupby('GIS_ID').agg(
    {
        'X': 'first',
        'Y': 'first',
        'X_COORD': 'first',
        'Y_COORD': 'first',
        'is_sam': 'first',
        'is_sam': np.sum
    }
)

In [162]:
df_sam_unique.shape

(66783, 5)

In [163]:
df_sam_unique.tail()

Unnamed: 0_level_0,X,Y,X_COORD,Y_COORD,is_sam
GIS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
905095000,-71.1183806062,42.2902886581,759421.780110985,2931036.67317362,4
905097000,-71.1186506061,42.2903286581,759348.661454916,2931050.92306164,4
905098000,-71.1189006061,42.2903386585,759281.003054842,2931054.26504564,1
905100000,-71.1184306058,42.2900386582,759408.659342974,2930945.50888555,4
905101000,-71.1184506058,42.289918658,759403.443566963,2930901.7547735,4


In [120]:
# df_merge = pd.merge(df_prop[['CM_ID', 'is_prop']], df_sam[['CM_ID', 'is_sam']], on='CM_ID', how='left')

In [164]:
df_merge = pd.merge(df_prop, df_sam_unique, on='GIS_ID', how='left')

In [165]:
df_merge.info()
df_merge.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 174668 entries, 0 to 174667
Data columns (total 81 columns):
PID                174668 non-null object
CM_ID              82038 non-null object
GIS_ID             174668 non-null object
ST_NUM             174668 non-null object
ST_NAME            174668 non-null object
ST_NAME_SUF        174129 non-null object
UNIT_NUM           65700 non-null object
ZIPCODE            174661 non-null object
PTYPE              174668 non-null object
LU                 174668 non-null object
OWN_OCC            174668 non-null object
OWNER              174668 non-null object
MAIL_ADDRESSEE     174668 non-null object
MAIL_ADDRESS       174668 non-null object
MAIL CS            174668 non-null object
MAIL_ZIPCODE       174668 non-null object
AV_LAND            174668 non-null object
AV_BLDG            174668 non-null object
AV_TOTAL           174668 non-null object
GROSS_TAX          174668 non-null object
LAND_SF            169227 non-null object
YR_BUILT 

Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,LU,...,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW,is_prop,X,Y,X_COORD,Y_COORD,is_sam
0,502550008,502550000,502550000,87,BEACON,ST,2-F,2108,102,CD,...,1.0,N,G,A,1,-71.0716905945,42.355918672,771934.51881063,2955013.33005996,16.0
1,502550010,502550000,502550000,87,BEACON,ST,2-R,2108,102,CD,...,1.0,N,A,A,1,-71.0716905945,42.355918672,771934.51881063,2955013.33005996,16.0
2,502550012,502550000,502550000,87,BEACON,ST,3-F,2108,102,CD,...,1.0,N,A,G,1,-71.0716905945,42.355918672,771934.51881063,2955013.33005996,16.0
3,502550014,502550000,502550000,87,BEACON,ST,3-R,2108,102,CD,...,1.0,S,A,G,1,-71.0716905945,42.355918672,771934.51881063,2955013.33005996,16.0
4,502550016,502550000,502550000,87,BEACON,ST,4,2108,102,CD,...,2.0,N,E,G,1,-71.0716905945,42.355918672,771934.51881063,2955013.33005996,16.0
5,502551000,502551000,502551000,88,BEACON,ST,,2108,995,CM,...,,,,,1,-71.0717905946,42.355898672,771907.526618615,2955005.90596394,7.0
6,502551002,502551000,502551000,88,BEACON,ST,1,2108,102,CD,...,0.0,N,A,F,1,-71.0717905946,42.355898672,771907.526618615,2955005.90596394,7.0
7,502551004,502551000,502551000,88,BEACON,ST,2,2108,102,CD,...,1.0,N,A,F,1,-71.0717905946,42.355898672,771907.526618615,2955005.90596394,7.0
8,502551006,502551000,502551000,88,BEACON,ST,3,2108,102,CD,...,2.0,E,E,E,1,-71.0717905946,42.355898672,771907.526618615,2955005.90596394,7.0
9,502551008,502551000,502551000,88,BEACON,ST,4,2108,102,CD,...,0.0,N,A,E,1,-71.0717905946,42.355898672,771907.526618615,2955005.90596394,7.0


In [166]:
df_merge['is_prop'].sum()

174668

In [167]:
df_merge[df_merge['is_sam'].isnull()]

Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,LU,...,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW,is_prop,X,Y,X_COORD,Y_COORD,is_sam
312,0100179002,,100179002,,PRINCETON,ST,,0000000002128,132,RL,...,,,,,1,,,,,
441,0100289000,,100289000,,PRINCETON,ST,,0000000002128,106,RL,...,,,,,1,,,,,
442,0100290000,,100290000,,PRINCETON,ST,,0000000002128,316,C,...,,,,,1,,,,,
483,0100335000,,100335000,,BREMEN,ST,,0000000002128,337,CL,...,,,,,1,,,,,
492,0100344000,,100344000,368,BREMEN,ST,,0000000002128,337,CL,...,,,,,1,,,,,
497,0100352000,,100352000,,NEPTUNE,RD,,0000000002128,985,E,...,,,,,1,,,,,
499,0100357000,,100357000,36,NEPTUNE,RD,,0000000002128,985,E,...,,,,,1,,,,,
502,0100359001,,100359001,405,FRANKFORT,ST,,0000000002128,902,E,...,,,,,1,,,,,
503,0100359002,,100359002,409,FRANKFORT,ST,,0000000002128,985,E,...,,,,,1,,,,,
504,0100359005,,100359005,,NEPTUNE,RD,,0000000002128,901,E,...,,,,,1,,,,,


In [147]:
len(df_merge.loc[df_merge[['is_prop', 'is_sam']].sum(axis=1)==2])

KeyError: "['is_sam'] not in index"

In [83]:
df_prop['ST_NUM'].value_counts()

              9445
1             2376
15            2337
10            2028
9             1958
2             1898
11            1804
8             1769
6             1721
25            1549
7             1532
12            1531
5             1515
19            1506
21            1478
20            1441
16            1405
14            1384
22            1375
18            1254
35            1253
3             1235
17            1224
4             1170
26            1129
24            1118
30            1113
27            1088
42            1047
40            1043
              ... 
55 W             1
425 A425         1
1110 A1108       1
51 - 53          1
199 -203         1
2 D              1
336 344          1
1 21             1
109 109A         1
41 -49           1
2521             1
1935             1
46 46R           1
1062 1064        1
695 A            1
401 405          1
24 2             1
662 A666A        1
53 HF            1
798 A            1
312 310          1
303 305B    

In [80]:
df_sam['STREET_NUMBER'].value_counts()

1            6649
10           5470
15           5224
11           5142
9            4221
6            4113
8            4085
20           3926
7            3898
5            3705
16           3642
25           3636
12           3633
14           3567
19           3559
30           3558
2            3494
4            3284
21           3171
40           3131
35           3048
24           3035
17           3002
3            2904
26           2849
18           2828
28           2806
31           2658
27           2613
22           2612
             ... 
554-560         1
874-880         1
3359-3365       1
57-6            1
3280-3288       1
580-590         1
511R            1
377-385         1
661B            1
232-236A        1
720A-720        1
1536-1538       1
25H             1
18B             1
119-119B        1
406-406         1
349-363         1
4446-4448       1
216-238         1
1866-1872       1
12 -14          1
8D              1
3134            1
1818            1
614-618   

In [86]:
df_sam['FULL_STREET_NAME']

0                 A St
1                 A St
2                 A St
3                 A St
4                 A St
5                 A St
6                 A St
7                 A St
8                 A St
9                 A St
10                A St
11                A St
12                A St
13                A St
14                A St
15                A St
16                A St
17                A St
18                A St
19                A St
20                A St
21                A St
22                A St
23                A St
24                A St
25                A St
26                A St
27                A St
28                A St
29                A St
              ...     
398672    Harrison Ave
398673    Harrison Ave
398674    Harrison Ave
398675    Harrison Ave
398676    Harrison Ave
398677    Harrison Ave
398678    Harrison Ave
398679    Harrison Ave
398680    Harrison Ave
398681    Harrison Ave
398682    Harrison Ave
398683    Harrison Ave
398684    H

In [95]:
df_sam.loc[df_sam['FULL_STREET_NAME'].str.contains('Beacon')]['STREET_NUMBER'].value_counts().sort_index().loc['87']

16

In [107]:
df_prop.loc[(df_prop['ST_NAME'].str.contains('BEACON')) & (df_prop['ST_NUM']=='87')]

Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,LU,...,U_BTH_STYLE3,U_KITCH_TYPE,U_KITCH_STYLE,U_HEAT_TYP,U_AC,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW,is_prop
0,502550008,502550000,502550000,87,BEACON,ST,2-F,2108,102,CD,...,,O,M,W,N,1.0,N,G,A,1
1,502550010,502550000,502550000,87,BEACON,ST,2-R,2108,102,CD,...,,O,S,W,N,1.0,N,A,A,1
2,502550012,502550000,502550000,87,BEACON,ST,3-F,2108,102,CD,...,,O,M,W,N,1.0,N,A,G,1
3,502550014,502550000,502550000,87,BEACON,ST,3-R,2108,102,CD,...,,O,N,W,N,1.0,S,A,G,1
4,502550016,502550000,502550000,87,BEACON,ST,4,2108,102,CD,...,M,O,L,W,C,2.0,N,E,G,1
45818,502550000,502550000,502550000,87,BEACON,ST,,2108,995,CM,...,,,,,,,,,,1
45819,502550002,502550000,502550000,87,BEACON,ST,G-1,2108,102,CD,...,,O,M,W,N,0.0,N,G,A,1
45820,502550004,502550000,502550000,87,BEACON,ST,1,2108,102,CD,...,,O,M,W,C,1.0,N,G,A,1


In [141]:
df_sam.loc[(df_sam['FULL_STREET_NAME'].str.contains('Beacon')) & (df_sam['STREET_NUMBER']=='87')]#['ZIP_CODE']#[['STREET_NUMBER', 'PARCEL', 'CM_ID']]

Unnamed: 0,X,Y,SAM_ADDRESS_ID,RELATIONSHIP_TYPE,BUILDING_ID,FULL_ADDRESS,STREET_NUMBER,IS_RANGE,RANGE_FROM,RANGE_TO,...,ZIP_CODE,X_COORD,Y_COORD,SAM_STREET_ID,WARD,PRECINCT_WARD,PARCEL,is_sam,PID,CM_ID
8856,-71.0716905945,42.355918672,12344,1,139636,87 Beacon St,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8857,-71.0716905945,42.355918672,12345,2,139636,87 Beacon St #G1,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8858,-71.0716905945,42.355918672,12346,2,139636,87 Beacon St #1,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8859,-71.0716905945,42.355918672,12347,2,139636,87 Beacon St #2F,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8860,-71.0716905945,42.355918672,12348,2,139636,87 Beacon St #2R,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8861,-71.0716905945,42.355918672,12349,2,139636,87 Beacon St #3F,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8862,-71.0716905945,42.355918672,12350,2,139636,87 Beacon St #3R,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
8863,-71.0716905945,42.355918672,12351,2,139636,87 Beacon St #4,87,0,,,...,2108,771934.51881063,2955013.33005996,332,5,505,502550000,1,502550000,502550000
321165,-71.0716905947,42.3559186722,356456,2,139636,87 Beacon St #2,87,0,,,...,2108,771934.518766137,2955013.3301314,332,5,505,502550000,1,502550000,502550000
321166,-71.0716905947,42.3559186722,356457,2,139636,87 Beacon St #3,87,0,,,...,2108,771934.518766137,2955013.3301314,332,5,505,502550000,1,502550000,502550000


In [26]:
filepath

'../data/raw/bos_analyze/SAMaddresses.csv'

In [16]:
os.path.splitext(data_url)

('http://bostonopendata-boston.opendata.arcgis.com/datasets/b6bffcace320448d96bb84eabb8a075f_0',
 '.csv')

In [13]:

pathlib.Path(data_url).suffixes

['.csv']

In [None]:
local_filename, headers = urllib.request.urlretrieve(
    'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/695a8596-5458-442b-a017-7cd72471aade/download/fy19fullpropassess.csv',
    '../data/raw/test_prop2.csv'
)