In [1]:
##########==========##########==========##########==========##########==========

## H - Header

#### H1 – libraries

In [2]:
## standard foundational libraries
import numpy  as np
import pandas as pd

## import specific function
from os             import mkdir, listdir
from os.path        import isfile, isdir
from datetime       import datetime, timedelta
from dbfread        import DBF
from geopy.distance import geodesic
from ipyparallel    import Cluster

#### H2 – basic automation

In [3]:
## set up standard directories if needed
def make_standard_file_system():
    for i in ['A_Input', 'B_Intermediate', 'C_Output']:
        if not isdir(i): mkdir(i)

## log time elapsed
time_log = dict()
def log_time(the_id = 'End Log'):
    
    ## construct new time stamp
    now_time = str(datetime.now().hour).zfill(2)
    now_time = now_time +':'+ str(datetime.now().minute).zfill(2)
    now_time = now_time +':'+ str(datetime.now().second).zfill(2)

    ## add to time log
    if the_id == 'End Log':
        time_log['End'] = now_time
        print('Time log:')
        for i in time_log.keys():
            print(i.rjust(5) + ':', time_log[i])
    else:
        time_log[the_id] = now_time
        
## toggle cache versus build
def build_or_cache(function, address, permit):
    if permit and isfile(address):
        print('Build/Cache Decision: Cache')
        the_file = pd.read_csv(address)
    else:
        print('Build/Cache Decision: Build')
        the_file = function()
    return the_file
    
## execute functions
make_standard_file_system()
log_time('H2')

#### H3 – settings

In [4]:
## server mode (switches off data sampling; full distance data too big for a PC)
server_mode = False

## GD settings
set_gd = dict()

## RD settings
set_rd = {'1_cache': True}

## MD settings
set_md = dict()

## EMR settings
set_emr = dict()

## PVD settings
set_pvd = dict()

## RV settings
set_rv = dict()

## GD - Gather Data

Primary data source:
+ 2020 TIGER shapefiles from the US Census (the .dbf files)
+ 2020 DP series population summary tables from US Census (as needed)

#### GD1 - read in primary data (census tract shapefile .dbfs)

In [5]:
## read in dbf files for census tracts
def read_tract_dbf(directory):
    
    ## file dbf files in target directory
    dbf_addr = listdir(directory)
    dbf_addr = [i for i in dbf_addr if i[-3::] == 'dbf']
    
    ## read relevant columns from each
    desired_columns = {'GEOID': str,
                       'STATEFP': str, 'COUNTYFP':str, 'TRACTCE':str,
                        'INTPTLAT': float, 'INTPTLON': float, 'ALAND': int}
    dbf_data = []
    for i in dbf_addr:
        i_dbf = pd.DataFrame(iter(DBF(directory + '/' + i)))
        i_dbf = i_dbf[desired_columns.keys()].astype(desired_columns)
        dbf_data.append(i_dbf)
        
    ## compile data into a single file and export
    dbf_data = pd.concat(dbf_data, axis = 0).sort_values('GEOID')
    dbf_data = dbf_data.reset_index(drop = True)
    dbf_data.to_csv('B_Intermediate/dbf_data.csv.gz')
    return dbf_data
    
## execute code
dbf_data = read_tract_dbf('A_Input/tracts_dbf')
if not server_mode:
    dbf_data = dbf_data.loc[dbf_data.STATEFP.isin(['49', '08', '04', '35'])]
log_time('GD1')

#### GD2 - Read in secondary data (census DP table columns)

             GEOID STATEFP COUNTYFP TRACTCE   INTPTLAT    INTPTLON       ALAND
1437   04001942600      04      001  942600  36.752621 -109.847229  1525247269
1438   04001942700      04      001  942700  36.755256 -109.368058  2990231682
1439   04001944000      04      001  944000  35.950683 -109.146142   791413501
1440   04001944100      04      001  944100  36.388911 -109.316317  1830863806
1441   04001944201      04      001  944201  36.153428 -109.685341   502007070
...            ...     ...      ...     ...        ...         ...         ...
77348  49057210900      49      057  210900  41.174956 -111.958681     4514100
77349  49057211000      49      057  211000  41.181342 -111.978033     1424120
77350  49057211100      49      057  211100  41.164136 -111.979655     4319446
77351  49057211201      49      057  211201  41.157778 -111.887136    23275491
77352  49057211202      49      057  211202  41.150167 -111.946949     8017160

[4540 rows x 7 columns]


## RD - Refine Data

#### RD1 – calculate and cache geographic distances between tract centroids

In [7]:
## reshape tract coordinate data to input format
tract_xy = list(zip(dbf_data.INTPTLAT.values, dbf_data.INTPTLON.values))

## define function to do distance compuations in parallel
def measure_distance_in_parallel(xy = tract_xy):
    the_iter = list(range(0, len(xy)))
    
    ## define engine function that will run on each parallel process
    def measure_distance_parallel_slice(n, xy_col = xy):
        from geopy.distance import geodesic
        xy_col = xy_col.copy()
        xy_row = xy_col[n]
        xy_dist = []
        for i in xy_col[0:n]: xy_dist.append(0)
        for i in xy_col[n::]:
            xy_dist.append(int(round(geodesic(xy_row, i).miles)))
        return xy_dist

    ## run engine in parallel for each slice of the data
    with Cluster(n = 4) as clust:
        view = clust.load_balanced_view()
        asyncresult = view.map_async(measure_distance_parallel_slice, the_iter)
        asyncresult.wait_interactive()
        result = asyncresult.get()
        
    ## package results and export
    result = np.array(result)
    result = result + result.T
    np.savetxt('B_Intermediate/tract_distance.csv.gz', result, delimiter = ',',
            fmt = '%u')
    return result

tract_distance = build_or_cache(
    function = measure_distance_in_parallel,
    address = 'B_Intermediate/tract_distance.csv.gz',
    permit = set_rd['1_cache']
    )

log_time('RD1')

Build/Cache Decision: Cache


#### RD2

## MD - Model Data

#### MD1

#### MD2

## EMR - Enrich Model Results

#### EMR1

#### EMR2

## PVD - Prepare Visualization Data

#### PVD1

#### PVD2

## RV - Render Visualization

#### RV1

#### RV2

        0   27   68   39   42   45   30   55   82   81  ...  328.41  327.36  \
0      27    0   57   25   45   41   37   55   78   84  ...     337     337   
1      68   57    0   32   33   24   43   26   26   42  ...     393     392   
2      39   25   32    0   26   18   25   32   53   61  ...     362     361   
3      42   45   33   26    0   12   13   13   41   40  ...     369     368   
4      45   41   24   18   12    0   19   15   38   43  ...     372     371   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...     ...     ...   
4534  326  335  391  360  368  370  355  380  408  405  ...       4       4   
4535  326  336  392  360  368  371  355  381  409  405  ...       3       3   
4536  325  335  391  359  367  370  354  380  408  404  ...       3       3   
4537  323  333  389  357  365  368  352  378  405  402  ...       8       8   
4538  324  333  389  358  366  368  353  378  406  403  ...       5       5   

      327.37  326.25  327.38  326.26  326.27  325.3

## F - Footer

In [9]:
log_time()

Time log:
   H2: 16:12:40
  GD1: 16:12:43
  RD1: 16:12:47
  End: 16:12:47


In [10]:
##########==========##########==========##########==========##########==========