# H - Header

#### H1 - Libraries

In [1]:
## standard foundational libraries
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt

## import specific functions
from os                 import mkdir, listdir
from os.path            import isfile, isdir
from datetime           import datetime, timedelta
from cartopy            import feature
from cartopy.crs        import LambertConformal, PlateCarree
from dbfread            import DBF
from docx               import Document
from textwrap           import fill as txt_wrap

#### H2 - Basic Automation

In [2]:
## set up standard directories if needed
def make_standard_file_system():
    """Create the input / intermediate / output folders that do not exist yet.

    The folder names are relative, so they are resolved against the current
    working directory. Folders that already exist are left untouched.
    """
    standard_folders = ('A_Input', 'B_Intermediate', 'C_Output')
    for folder in standard_folders:
        ## nothing to do when the folder is already present
        if isdir(folder):
            continue
        mkdir(folder)

## log time elapsed
time_log = dict()
def log_time(the_id = 'End Log'):
    """Record the current wall-clock time in the module-level ``time_log``.

    Parameters
    ----------
    the_id : str
        Key to store the time stamp under. The default, 'End Log', is a
        sentinel: it stores the stamp under 'End' and then prints every
        entry collected so far.

    Returns
    -------
    None. The result is the mutation of ``time_log`` (and, for the sentinel,
    the printed report).
    """

    ## read the clock exactly once so that hour, minute and second all come
    ## from the same instant; separate reads can straddle a minute or hour
    ## boundary and yield a time that never existed (12:59:59 -> 12:00:00)
    now_time = datetime.now().strftime('%H:%M:%S')

    ## add to time log
    if the_id == 'End Log':
        time_log['End'] = now_time
        print('Time log:')
        for key, stamp in time_log.items():
            ## str() keeps the report printable even if a non-string key was logged
            print(str(key).rjust(5) + ':', stamp)
    else:
        time_log[the_id] = now_time
        
## toggle cache versus build
def build_or_cache(function, address, permit, **read_kwargs):
    """Return a cached CSV when allowed and available, otherwise build afresh.

    Parameters
    ----------
    function : callable
        Zero-argument builder. It is called only on the build path; this
        helper does not write its result to ``address``.
    address : str
        Path of the cached CSV file.
    permit : bool
        When falsy the cache is ignored even if the file exists.
    **read_kwargs
        Optional keyword arguments forwarded to ``pd.read_csv`` on the cache
        path (``index_col`` defaults to 0). Without them the file is read with
        default dtype inference, so columns that were written as strings, such
        as zero-padded codes, come back as integers and differ from a fresh
        build. Pass e.g. ``dtype = {'GEOID': str}`` to keep them as text.

    Returns
    -------
    Whatever ``function()`` returns on the build path, or the DataFrame read
    from ``address`` on the cache path.
    """
    if permit and isfile(address):
        print('Build/Cache Decision: Cache')
        read_kwargs.setdefault('index_col', 0)
        return pd.read_csv(address, **read_kwargs)

    print('Build/Cache Decision: Build')
    return function()
    
## execute functions: create any missing standard folders in the working
## directory, then record the current time under the key 'H2'
make_standard_file_system()
log_time('H2')

#### H3 - Settings

In [3]:
## set color palette
## each entry is a 3-tuple, or a 4-tuple for the '...Overlay' entries;
## presumably (hue, saturation, value[, alpha]) on a 0-1 scale -- TODO confirm
## against the plotting code that consumes them
set_color = {
    'AzureDark'    :(7/12, 1.0, 0.4),
    'AzureMedium'  :(7/12, 0.7, 0.7),
    'AzureLight'   :(7/12, 0.4, 1.0),
    'AzureBG'      :(7/12, 0.1, 1.0),
    'AzureOverlay' :(7/12, 0.1, 1.0, 0.5),
    
    'OrangeDark'   :(1/12, 1.0, 0.4),
    'OrangeMedium' :(1/12, 0.7, 0.7),
    'OrangeLight'  :(1/12, 0.4, 1.0),
    'OrangeBG'     :(1/12, 0.1, 1.0),
    'OrangeOverlay':(1/12, 0.1, 1.0, 0.5) 
    }

## set font sizes
## three named sizes; presumably point sizes for figure text -- TODO confirm
set_font = {
    'small' : 16,
    'medium': 24,
    'large' : 32
    }

## time-saver settings
## 'dbf_cache' is passed as `permit` to build_or_cache when loading the tract
## table, and 'sample_size' is the default sampling fraction of refine_geo_data;
## the other two flags are not read in this part of the notebook
set_acceleration = {
    'dbf_cache':True,
    'sample_size':1/20,
    'distance_cache':False,
    'cluster_cache':False
    }

## map parameters
## 'bounds' holds four numbers built from the same longitude / latitude
## constants used for the projection below, each shifted by a fixed offset;
## presumably [west, east, south, north] in degrees -- TODO confirm
## 'map_proj' is a Lambert conformal projection centred on the midpoint of
## those longitudes and latitudes, with the two latitudes as standard parallels
set_map = {
    'bounds'  : [-124.73 + 5, -66.95 - 5, 25.12 - 3.3, 49.38 + 3.3],
    'map_proj': LambertConformal(
        central_longitude = (-124.73 - 66.95) / 2,
        central_latitude = (25.12 + 49.38) / 2,
        standard_parallels = (25.12, 49.38)
        )
    }

## record the current time under the key 'H3'
log_time('H3')

# GD - Gather Data / RD - Refine Data

#### GD1 - read census tract geographic data

In [4]:
def read_geo_data(directory = 'A_Input/tracts_dbf'):
    """Load every dbf table in ``directory`` into one tract-level DataFrame.

    Each table is cut down to a fixed set of columns and cast to fixed dtypes.
    The tables are stacked, sorted by GEOID, given a constant ``count`` column
    of 1 and a fresh 0..n-1 index. The result is also written to
    'B_Intermediate/dbf_data.csv.gz' before being returned.
    """

    ## columns kept from every table, with the dtype each one is cast to
    column_types = {'GEOID': str,
                    'STATEFP': str, 'COUNTYFP': str, 'TRACTCE': str,
                    'INTPTLAT': float, 'INTPTLON': float, 'ALAND': int}
    wanted = list(column_types)

    ## only directory entries whose name ends in 'dbf' are read
    dbf_names = [name for name in listdir(directory) if name.endswith('dbf')]

    ## read each table, then trim and type it
    tables = []
    for name in dbf_names:
        table = pd.DataFrame(iter(DBF(directory + '/' + name)))
        tables.append(table[wanted].astype(column_types))

    ## stack the tables into a single frame
    dbf_data = (
        pd.concat(tables, axis = 0)
        .sort_values('GEOID')
        .assign(count = 1)
        .reset_index(drop = True)
    )

    ## export data
    dbf_data.to_csv('B_Intermediate/dbf_data.csv.gz')
    return dbf_data

## execute code
## when 'dbf_cache' is True and the cached file exists, read_geo_data is not
## called and the table is read back from the csv instead
## NOTE(review): the cached read uses default dtype inference, so the columns
## that read_geo_data casts to str may come back numeric -- confirm that
## downstream code tolerates both forms
geo_data = build_or_cache(function = read_geo_data,
                          address = 'B_Intermediate/dbf_data.csv.gz',
                          permit = set_acceleration['dbf_cache'])
log_time('GD1')

Build/Cache Decision: Cache


#### RD1 - draw a sample from the census tract geographic data and exclude outlier tracts

In [5]:
def refine_geo_data(dat = geo_data, too_rural = 20.720e6 * 5,
                    too_much = set_acceleration['sample_size']):
    """Drop the largest tracts, then keep an evenly spaced sample of the rest.

    Parameters
    ----------
    dat : DataFrame with an ``ALAND`` column. The default is whatever
        ``geo_data`` was bound to when this ``def`` statement ran.
    too_rural : land-area cutoff; rows with ``ALAND`` at or above it are dropped.
    too_much : sampling fraction; every ``int(1 / too_much)``-th remaining row
        is kept, starting with the first.

    Returns
    -------
    The filtered and sampled rows of ``dat``, with their original index labels.
    """

    ## filter out extremely rural areas; the original note gives the intent as
    ## "< 100 people per square mile", but the test itself is on land area only
    is_compact = dat.ALAND < too_rural
    kept = dat.loc[is_compact, :]

    ## systematic sample: keep row positions 0, step, 2 * step, ...
    step = int(1 / too_much)
    position = np.arange(0, kept.shape[0])
    on_grid = position % step == 0

    ## return data
    return kept.loc[on_grid, :]

## execute code
## NOTE: refine_geo_data's default `dat` is the geo_data object that existed
## when the def above ran, and this line rebinds geo_data to the refined
## result; re-running this cell therefore filters and samples the already
## refined table a second time
geo_data = refine_geo_data()
log_time('RD1')

#### GD2 - read population data

In [6]:
def read_pop_data(roster = 'A_Input/sources.csv'):
    """Load every dataset named in the roster and merge the census tables.

    Parameters
    ----------
    roster : str
        Path of a csv with the columns OBJ_NAME, VAR_NAME (';'-separated
        column names to load), FILE_NAME (file inside 'A_Input') and SOURCE.

    Returns
    -------
    dict mapping OBJ_NAME to a DataFrame of strings, except that every dataset
    whose SOURCE is 'data.census.gov' is removed and merged into a single
    integer-valued entry, 'census', indexed by GEO_ID.

    Raises
    ------
    KeyError if the roster lists no 'data.census.gov' dataset.
    """

    ## read in data file roster
    roster = pd.read_csv(roster).set_index('OBJ_NAME')

    ## load all datasets in the roster file, keeping every value as text
    pop_data = dict()
    for name in roster.index:
        var_names = roster.loc[name, 'VAR_NAME'].split(';')
        pop_data[name] = pd.read_csv('A_Input' + '/' + roster.loc[name, 'FILE_NAME'],
            usecols = var_names, dtype = str)

    ## pull the census datasets out of the dict; the row labelled 0 is skipped
    ## (presumably a second, descriptive header row -- TODO confirm)
    census_names = roster.index[roster.SOURCE == 'data.census.gov'].values
    census_parts = []
    for name in census_names:
        census_parts.append(pop_data.pop(name).loc[1::, :].set_index('GEO_ID'))
    if not census_parts:
        raise KeyError("census: the roster lists no 'data.census.gov' dataset")

    ## merge census datasets: left-join each later table onto the first one
    census = census_parts[0]
    for part in census_parts[1:]:
        census = census.join(part)

    ## convert census data to numeric; anything that cannot be parsed as an
    ## integer (missing values, text placeholders) becomes 0
    def robust_int(x):
        try:
            return int(x)
        except (ValueError, TypeError, OverflowError):
            ## catch only conversion failures, so that KeyboardInterrupt and
            ## SystemExit are no longer swallowed as a bare `except:` would
            return 0
    pop_data['census'] = census.apply(lambda column: column.map(robust_int))

    return pop_data

## execute code: load the datasets listed in the default roster file and
## record the current time under the key 'GD2'
pop_data = read_pop_data()
log_time('GD2')

#### RD2 - refine and compile population data

In [7]:
def refine_pop_data(pop):
    """Combine census, life-expectancy and 2020 voting data into one tract table.

    Parameters
    ----------
    pop : dict of DataFrames with the keys 'state_fips', 'census', 'lifespan'
        and 'vote'. 'census' is indexed by GEO_ID and holds a DP05_0001E
        column; 'lifespan' has exactly two columns (tract id, life expectancy);
        'vote' holds the text columns year, party, state_po, county_fips, mode,
        candidatevotes and totalvotes. The dict and the frames inside it are
        modified in place.

    Returns
    -------
    DataFrame indexed by GEO_ID holding the census columns plus life_expect,
    rep_vote and total_vote, rounded to one decimal place.
    """

    ## -- standardize geographic codes as needed
    pop.pop('state_fips')  ## not currently used
    pop['census'].index = pop['census'].index.str.replace('1400000US', '')

    ## -- refine life expectancy data and merge into census data
    pop['lifespan'].columns = ['GEO_ID', 'life_expect']
    pop['lifespan'].life_expect = pop['lifespan'].life_expect.astype(float)
    pop['census'] = pop['census'].join(pop['lifespan'].set_index('GEO_ID'))
    pop.pop('lifespan')

    ## -- refine voting data

    ## filter to necessary data
    wanted_rows = (pop['vote'].party == 'REPUBLICAN') & (pop['vote'].year == '2020')
    pop['vote'] = pop['vote'].loc[wanted_rows].drop(['year', 'party', 'state_po'], axis = 1)

    ## impute Republican votes per county (data is irregular from state to
    ## state): some counties report a 'TOTAL' row, others only per-mode rows,
    ## so take the larger of the 'TOTAL' figure and the sum over other modes
    by_mode = pop['vote'].drop(['totalvotes'], axis = 1)
    by_mode = by_mode.set_index(['county_fips', 'mode']).astype(int).reset_index()
    by_mode = by_mode.groupby(['county_fips', 'mode']).sum().reset_index()
    is_total = by_mode['mode'] == 'TOTAL'
    rep_total = by_mode.loc[is_total].set_index('county_fips').drop('mode', axis = 1)
    ## drop the text 'mode' column BEFORE summing: pandas >= 2.0 no longer
    ## discards non-numeric columns in groupby().sum(), and a concatenated
    ## string column would then be mixed with numbers in the max() below
    rep_parts = by_mode.loc[~is_total].drop('mode', axis = 1)
    rep_parts = rep_parts.groupby('county_fips').sum()
    rep_votes = pd.concat({'Total': rep_total, 'Alt': rep_parts}, axis = 1)
    rep_votes = pd.DataFrame({'repvotes': rep_votes.max(axis = 1)})
    pop['vote'] = pop['vote'].join(rep_votes, on = 'county_fips')
    pop['vote'] = pop['vote'].drop_duplicates('county_fips')
    pop['vote'] = pop['vote'].drop(['mode', 'candidatevotes'], axis = 1)

    ## calculate percentage voting republican
    pop['vote'] = pop['vote'].set_index('county_fips').astype(float)
    pop['vote']['reppct'] = pop['vote']['repvotes'] / pop['vote']['totalvotes']

    ## calculate percentage of population that voted; the county code is the
    ## first five characters of the tract GEO_ID
    county_total = pop['census'][['DP05_0001E']].copy()
    county_total['county'] = [geo_id[0:5] for geo_id in county_total.index]
    county_total = county_total.groupby('county').sum().astype(int)
    pop['vote'] = pop['vote'].join(county_total)
    pop['vote']['totalpct'] = pop['vote']['totalvotes'] / pop['vote']['DP05_0001E']
    ## a share above 1 is impossible, so fall back to a fixed ratio (presumably
    ## national 2020 votes cast / national 2020 population -- TODO confirm)
    pop['vote'].loc[pop['vote'].totalpct > 1, 'totalpct'] = 159633396 / 331449281
    pop['vote'] = pop['vote'][['reppct', 'totalpct']]

    ## merge the county-level voting shares onto every tract of that county
    pop['census']['county'] = [geo_id[0:5] for geo_id in pop['census'].index]
    tracts = pop['census'].reset_index().set_index('county').join(pop['vote'])
    tracts = tracts.set_index('GEO_ID')
    tracts['state'] = [geo_id[0:2] for geo_id in tracts.index]

    ## -- impute missing data
    imputed_columns = ['life_expect', 'reppct', 'totalpct']

    ## impute at the state level: one row of state means per tract, aligned by
    ## position with `tracts`
    state_mean = tracts[imputed_columns + ['state']].copy()
    state_mean = state_mean.groupby('state').mean().round(2)
    state_mean = state_mean.loc[tracts['state']]
    for column in imputed_columns:
        missing = tracts[column].isna().values
        tracts.loc[missing, column] = state_mean.loc[missing, column].values

    ## impute at the national level whatever the state means could not fill
    for column in imputed_columns:
        missing = tracts[column].isna().values
        tracts.loc[missing, column] = tracts[column].mean()

    ## convert vote proportions to counts
    tracts['rep_vote'] = tracts['reppct'] * tracts['totalpct'] * tracts['DP05_0001E']
    tracts['rep_vote'] = tracts['rep_vote'].round().astype(int)
    tracts['total_vote'] = (tracts['totalpct'] * tracts['DP05_0001E']).round().astype(int)
    tracts = tracts.drop(['reppct', 'totalpct', 'state'], axis = 1).round(1)

    return tracts

## execute code
## NOTE: pop_data.copy() is a shallow dict copy, so refine_pop_data still
## edits the original DataFrames; afterwards pop_data is the single DataFrame
## that refine_pop_data returns, no longer a dict, so this line cannot be
## re-run without first re-running read_pop_data
pop_data = refine_pop_data(pop = pop_data.copy())
log_time('RD2')

#### GD3 / RD3 - read and refine text data

In [8]:
def read_explanatory_text(addr = 'A_Input/explanation.docx', n = 47):
    """Return the text of a Word document, wrapped to ``n`` characters per line.

    Each paragraph of the document at ``addr`` is wrapped on its own, and the
    wrapped paragraphs are then joined with a single newline between them.
    """
    wrapped_paragraphs = []
    for paragraph in Document(addr).paragraphs:
        wrapped_paragraphs.append(txt_wrap(paragraph.text, n))
    return '\n'.join(wrapped_paragraphs)

## execute code: load the wrapped explanatory text with the default document
## path and line width, then record the current time under the key 'RD3'
explanatory_text = read_explanatory_text()
log_time('RD3')

# MD - Model Data

#### MD1 - Reconcile geographic and population dataset tracts

#### MD2 - Precalculate tract-to-tract distance matrix

#### MD3 - Calculate helper functions for modeling

#### MD4 - Conduct two-stage agglomeration clustering

# Enrich Data

In [9]:
## called with the default id, log_time stores the current time under 'End'
## and prints every time stamp recorded so far
log_time()

Time log:
   H2: 13:20:51
   H3: 13:20:51
  GD1: 13:20:51
  RD1: 13:20:51
  GD2: 13:20:57
  RD2: 13:20:57
  RD3: 13:20:57
  End: 13:20:57
