In [1]:
print('Importing packages...')
import os
import pandas
import numpy as np
import time
#from IPython.display import display, HTML
# Do not truncate long cell contents when frames are displayed.
# NOTE(review): -1 is deprecated for this option from pandas 1.0 onwards (None is the
# replacement) - confirm the installed pandas version before changing it.
pandas.set_option('display.max_colwidth', -1)

# Verbosity switch: cells print extra progress messages when debug >= 1.
debug = 1

# Project root; every other directory used by the notebook hangs off it.
thisdir = '/home/idies/workspace/Storage/raddick/Baltimore/community_reinvestment_act/'
#data_dir = '/home/idies/workspace/Temporary/raddick/cra_scratch_final/'

baltimore_dir = thisdir + 'baltimore/'
code_lookup_dir = thisdir + 'code_guide_lookups/'
inflation_dir = thisdir + 'datasets/inflation/'

# Running grand total of processing time (seconds), added to by later cells.
g = 0

os.chdir(thisdir)
print('Now in directory: {0:}'.format(os.getcwd()))
print('ok')

Importing packages...
Now in directory: /home/idies/workspace/Storage/raddick/Baltimore/community_reinvestment_act
ok


# Load CRA loans data

In [2]:
# Start the timer that this cell's summary line reports against.
s = time.time()

print('reading loan data...')
loans_csv_path = baltimore_dir+'baltimore_agg_loans_df.csv'
baltimore_agg_loans_df = pandas.read_csv(loans_csv_path, encoding='utf-8', low_memory=False, index_col='rownumber')

print('Keeping loans 2011 and later (since we do not have 2010 small business jobs...')
is_2011_or_later = baltimore_agg_loans_df['activity_year'] >= 2011
baltimore_agg_loans_df = baltimore_agg_loans_df[is_2011_or_later]

print('Keeping only business loan originations...')
# Rows are kept only when loan_type == 4 and action_taken_type == 1.
is_loan_type_4 = baltimore_agg_loans_df['loan_type'] == 4
is_action_type_1 = baltimore_agg_loans_df['action_taken_type'] == 1
baltimore_agg_loans_df = baltimore_agg_loans_df[is_loan_type_4 & is_action_type_1]

e = time.time()
print('Processed {0:,.0f} rows in {1:,.0f} seconds!'.format(len(baltimore_agg_loans_df), e-s))

reading loan data...
Keeping loans 2011 and later (since we do not have 2010 small business jobs...
Keeping only business loan originations...
Processed 1,388 rows in 0 seconds!


# Calculate total loans and working loans

## Total loans per tract

In [3]:
# Restart the timer so the elapsed time reported (and added to the grand total g)
# covers this cell only, rather than everything since the loan data was read.
s = time.time()
print('getting total loans...')

# Total number and total amount of loans: the sum of the three size brackets
# (the *1, *100k and *250k columns).
baltimore_agg_loans_df = baltimore_agg_loans_df.assign(
    nLoans = baltimore_agg_loans_df['nLoans1'] + baltimore_agg_loans_df['nLoans100k'] + baltimore_agg_loans_df['nLoans250k']
)
baltimore_agg_loans_df = baltimore_agg_loans_df.assign(
    amtLoans = baltimore_agg_loans_df['amtLoans1'] + baltimore_agg_loans_df['amtLoans100k'] + baltimore_agg_loans_df['amtLoans250k']
)

e = time.time()
g = g + (e-s)
print('Processed {0:,.0f} rows in {1:,.0f} seconds!'.format(len(baltimore_agg_loans_df), e-s))


getting total loans...
Processed 1,388 rows in 0 seconds!


## Working loans per tract

In [4]:
print('calculating working loans...')

# Average size of a loan in the smallest bracket (amtLoans1 / nLoans1).
baltimore_agg_loans_df = baltimore_agg_loans_df.assign(
    avgSmallLoan = baltimore_agg_loans_df['amtLoans1'] / baltimore_agg_loans_df['nLoans1']
)
#print('Removing loans from American Express...')
#respondents_df = pandas.read_csv(code_lookup_dir+'respondentid.csv', low_memory=False, encoding='utf-8')#, index_col='respondentID')
#amex_respondentIDs = baltimore_loans_df.merge(respondents_df, how='left', on='respondentID')['respondentID'][baltimore_loans_df.merge(respondents_df, how='left', on='respondentID')['institution_name'].apply(lambda x: 'american express' in str(x).lower())].drop_duplicates().tolist()
#respondents_df[respondents_df['respondentID'].apply(lambda x: x in amex_respondentIDs)]
#baltimore_loans_df = baltimore_loans_df[baltimore_loans_df['respondentID'].apply(lambda x: x not in amex_respondentIDs)]
#print('Removed (credit card) loans from American Express, now {0:,.0f} loans...'.format(len(baltimore_loans_df)))

# Rows whose average smallest-bracket loan is under 10000 have that bracket removed
# from the "working" totals; rows at or above 10000 keep the full totals.
# Rows where avgSmallLoan is NaN match neither mask and therefore stay NaN.
avg_below_threshold = baltimore_agg_loans_df['avgSmallLoan'] < 10000
avg_at_or_above_threshold = baltimore_agg_loans_df['avgSmallLoan'] >= 10000

# (total column, smallest-bracket column, working column to create)
working_column_specs = (
    ('nLoans', 'nLoans1', 'nWorkingLoans'),
    ('amtLoans', 'amtLoans1', 'amtWorkingLoans'),
)
for total_column, small_column, working_column in working_column_specs:
    baltimore_agg_loans_df = baltimore_agg_loans_df.assign(**{working_column: np.nan})
    totals_minus_small = (
        baltimore_agg_loans_df[total_column][avg_below_threshold]
        - baltimore_agg_loans_df[small_column][avg_below_threshold]
    )
    baltimore_agg_loans_df.loc[avg_below_threshold, working_column] = totals_minus_small
    baltimore_agg_loans_df.loc[avg_at_or_above_threshold, working_column] = baltimore_agg_loans_df[total_column][avg_at_or_above_threshold]

print('Setting multi-index to census_tract and year...')
baltimore_agg_loans_df = baltimore_agg_loans_df.set_index(['census_tract', 'activity_year'])
print('Done!')

calculating working loans...
Setting multi-index to census_tract and year...
Done!


# Connect jobs data

In [5]:
# Restart the timer for this cell. (The original assigned `e` here, so the elapsed
# time reported below - and added to the grand total g - was measured from the
# start of an earlier cell.)
s = time.time()
print('reading job data...')

raw_jobs_df = pandas.read_csv(baltimore_dir+'wac_jobs_df.csv', index_col='rownumber')

print('grouping by census tract and year...')
# GeoID format is STATE+COUNTY+TRACT+BLOCK (2+3+6+4 = 15 characters), so characters
# 5-10 are the tract code; it is rewritten as "NNNN.NN" and parsed as a number.
raw_jobs_df = raw_jobs_df.assign(
    census_tract = pandas.to_numeric(
        raw_jobs_df['w_geocode'].apply(lambda x: str(x)[5:9] + '.' + str(x)[9:11]),
        errors='coerce'
    )
)

# Every column except the identifier/bookkeeping ones is summed per tract.
sum_columns = [x for x in raw_jobs_df.columns.tolist() if x not in ('w_geocode', 'createdate', 'year')]

# Years from 2016 onwards reuse the 2015 jobs figures (relabelled with the later year).
last_jobs_year = 2015
yearly_jobs = []
for i in range(2010,2018):
    source_year = min(i, last_jobs_year)
    jobs_i_df = (
        raw_jobs_df[sum_columns][raw_jobs_df['year'] == source_year]
        .groupby('census_tract', as_index=False)
        .sum()
        .assign(year = i)
    )
    yearly_jobs.append(jobs_i_df)
# Concatenate once instead of growing the frame inside the loop.
jobs_df = pandas.concat(yearly_jobs, axis=0)

jobs_df = jobs_df.rename(columns={'year': 'activity_year'})
print('removing jobs data from 2010...')
jobs_df = jobs_df[jobs_df['activity_year'] >= 2011]
jobs_df = jobs_df.set_index(['census_tract', 'activity_year'])

print('joining jobs data onto loans data...')
# Index-on-index join; assumes the loans frame is already indexed by
# (census_tract, activity_year), as set at the end of the working-loans cell.
baltimore_agg_loans_df = baltimore_agg_loans_df.join(jobs_df)
e = time.time()
g = g + (e-s)
print('Processed {0:,.0f} rows in {1:,.0f} seconds!'.format(len(baltimore_agg_loans_df), e-s))
print('Done!')
baltimore_agg_loans_df.sample(2)


reading job data...


  mask |= (ar1 == a)


grouping by census tract and year...
removing jobs data from 2010...
joining jobs data onto loans data...
Processed 1,388 rows in 31 seconds!
Done!


Unnamed: 0_level_0,Unnamed: 1_level_0,loan_type,action_taken_type,state,county,msa,split_county_indicator,population_classification,income_group_total,nLoans1,amtLoans1,...,CFA01,CFA02,CFA03,CFA04,CFA05,CFS01,CFS02,CFS03,CFS04,CFS05
census_tract,activity_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1205.0,2014,4,1,24,510,12580.0,N,L,8,41,799000,...,40.0,27.0,81.0,121.0,702.0,356.0,253.0,231.0,44.0,87.0
904.0,2015,4,1,24,510,12580.0,N,L,3,24,549000,...,34.0,6.0,5.0,5.0,521.0,112.0,77.0,11.0,162.0,209.0


# Which jobs columns do we want?

In [6]:
# Variable descriptions for the jobs data, re-keyed by variable name.
jobs_metadata_df = (
    pandas.read_csv(baltimore_dir+'wac_jobs_metadata.csv', encoding='utf-8', index_col='varnum')
    .set_index('variable')
)

# All jobs variables, listed in one place.
jobs_columns = [
    'C000', 'CA01', 'CA02', 'CA03', 'CE01', 'CE02', 'CE03',
    'CNS01', 'CNS02', 'CNS03', 'CNS04', 'CNS05', 'CNS06', 'CNS07', 'CNS08', 'CNS09', 'CNS10',
    'CNS11', 'CNS12', 'CNS13', 'CNS14', 'CNS15', 'CNS16', 'CNS17', 'CNS18', 'CNS19', 'CNS20',
    'CR01', 'CR02', 'CR03', 'CR04', 'CR05', 'CR07',
    'CT01', 'CT02',
    'CD01', 'CD02', 'CD03', 'CD04',
    'CS01', 'CS02',
    'CFA01', 'CFA02', 'CFA03', 'CFA04', 'CFA05',
    'CFS01', 'CFS02', 'CFS03', 'CFS04', 'CFS05',
]

#for x in baltimore_agg_loans_df[jobs_columns].columns:
#    print('variable: {0:}\t\tdescription:{1:}'.format(x, jobs_metadata_df['description'][jobs_metadata_df.index == x].tolist()[0]))

# The two variables used as denominators later on; print their descriptions.
jobs_columns_we_want = ['C000', 'CFS01']
for x in baltimore_agg_loans_df[jobs_columns_we_want].columns:
    matching_descriptions = jobs_metadata_df['description'][jobs_metadata_df.index == x].tolist()
    print('variable: {0:}\t\tdescription:{1:}'.format(x, matching_descriptions[0]))


variable: C000		description:Total number of jobs
variable: CFS01		description:Number of jobs for workers at firms with Firm Size: 0-19 Employees


# Calculate loans per job

In [7]:
#sbjobs_column = jobs_varnames_df[jobs_varnames_df['description'].apply(lambda x: '0-19' in x)].index.values[0]
#loans_columns = []
#baltimore_agg_loans_df[sbjobs_column]
print('Calulating loans per job (total and with firm size 0-19)...')

# Loan measures to normalise, in the order their ratio columns are created.
loan_measure_columns = [
    'nLoans1', 'amtLoans1',
    'nLoans100k', 'amtLoans100k',
    'nLoans250k', 'amtLoans250k',
    'nLoansToSmallest', 'amtLoansToSmallest',
    'nLoans', 'amtLoans',
    'nWorkingLoans', 'amtWorkingLoans',
]
# (suffix of the new column, jobs column used as the denominator)
job_denominators = [
    ('totaljob', 'C000'),   # total number of jobs
    ('sbjob', 'CFS01'),     # jobs at firms with 0-19 employees
]

# One "<loan measure>_per_<suffix>" column per combination; the names are collected
# in per_job_columns as they are created, so the list cannot drift out of sync.
per_job_columns = []
for job_suffix, job_column in job_denominators:
    for loan_column in loan_measure_columns:
        ratio_column = '{0:}_per_{1:}'.format(loan_column, job_suffix)
        baltimore_agg_loans_df = baltimore_agg_loans_df.assign(
            **{ratio_column: baltimore_agg_loans_df[loan_column] / baltimore_agg_loans_df[job_column]}
        )
        per_job_columns.append(ratio_column)

print('recoding infinite values to NaN...')
# A zero denominator produces +/-inf; treat both as missing.
baltimore_agg_loans_df[per_job_columns] = baltimore_agg_loans_df[per_job_columns].replace([np.inf, -np.inf], np.nan)
print('Done!')


    

Calulating loans per job (total and with firm size 0-19)...
recoding infinite values to NaN...
Done!


# Connect ACS 5-year census data

## Read ACS data

In [8]:
# Read the ACS 5-year extract, using the unnamed first CSV column as the index
# and labelling that index 'rownumber'.
acs5_csv_path = baltimore_dir+'acs5_2010_2017.csv'
acs5_df = pandas.read_csv(acs5_csv_path, encoding='utf-8', low_memory=False, index_col='Unnamed: 0').rename_axis('rownumber')
print('Done!')

Done!


In [9]:
# MAKE A LIST of all variables
metadata_df = pandas.read_csv(baltimore_dir+'acs5_metadata/acs5_metadata_2017.csv', encoding='utf-8', low_memory=False, index_col='variable')

# Estimate columns only: the first four and last two columns of acs5_df are skipped
# (presumably identifier/bookkeeping columns - verify), as is anything containing '_err'.
acs5_columns = [x for x in acs5_df.columns[4:-2].tolist() if ('_err' not in x)]

# One single-entry {variable: description} dict per column.
descriptions = [
    {column_name: metadata_df['description'].loc[column_name]}
    for column_name in acs5_columns
]
#acs5_df['description']
#for x in descriptions:
#    for k,v in x.items():
#        if ('education' in v.lower()):
#            print(k,v)
print('ok')


ok


In [10]:
print('\ncalculating and renaming estimates columns for IVs...')


def _add_columns(frame, column_names):
    """Return the element-wise ``+`` of the named columns of ``frame``."""
    running_total = frame[column_names[0]]
    for column_name in column_names[1:]:
        running_total = running_total + frame[column_name]
    return running_total


if (debug >= 1):
    print('...high school graduates or higher 25 years and older...')
# B15002 cells 011-018 and 028-035 are the components of the composite.
hs_grad_columns = ['B15002_{0:03d}'.format(n) for n in list(range(11, 19)) + list(range(28, 36))]
h = _add_columns(acs5_df, hs_grad_columns)
acs5_df = acs5_df.assign(hs_grad_25plus = pandas.to_numeric(h, errors='coerce'))

# Each step: (progress message, {ACS variable code: readable column name}).
rename_steps = [
    ('...householder sex & race, unempoyment, poverty, home value, home age...', {
        'B11001_006': 'female_householder',
        'B11001B_001': 'black_householder',
        'B11001H_001': 'white_householder',
        'B23025_005': 'unemployed_16plus',
        'B17001_002': 'poverty_past_12_months',
        'B25077_001': 'median_home_value',
        'B25035_001': 'median_year_built',
    }),
    ('...race, owner-occupied units, mfi...', {
        'B02001_002': 'pop_white',
        'B02001_003': 'pop_black',
        'B25003_002': 'owner_occ_housing_units',
        'B19113_001': 'mfi',
    }),
    ('....comparison variables: total population, total households, poverty status...', {
        'B01001_001': 'pop_total',
        'B11001_001': 'total_householders',
        'B23025_002': 'labor_force_16plus',
        'B17001_001': 'poverty_status_known',
    }),
]
for progress_message, column_mapping in rename_steps:
    if (debug >= 1):
        print(progress_message)
    acs5_df = acs5_df.rename(columns = column_mapping)

if (debug >= 1):
    print('...population 25plus...')
# B01001 cells 011-025 and 035-049 are the components of the composite.
pop_25plus_columns = ['B01001_{0:03d}'.format(n) for n in list(range(11, 26)) + list(range(35, 50))]
acs5_df = acs5_df.assign(
    pop_25plus = pandas.to_numeric(_add_columns(acs5_df, pop_25plus_columns), errors='coerce')
)
#acs5_df.sample(1).T
print('Done!')



calculating and renaming estimates columns for IVs...
...high school graduates or higher 25 years and older...
...householder sex & race, unempoyment, poverty, home value, home age...
...race, owner-occupied units, mfi...
....comparison variables: total population, total households, poverty status...
...population 25plus...
Done!


# Get errors for composite columns

### Create the functions we will need

In [11]:
### Guide on how to calculate errors in percentages:
# https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf

## Aggregating Data Across Population Subgroups: add error for each group in quadrature, divide by 1.645 for serr

# Margin-of-error columns that feed each composite standard error.
_HSGRAD25PLUS_ERR_COLUMNS = [
    'B15002_{0:03d}_err'.format(n) for n in list(range(11, 19)) + list(range(28, 36))
]
_POP25PLUS_ERR_COLUMNS = [
    'B01001_{0:03d}_err'.format(n) for n in list(range(11, 26)) + list(range(35, 50))
]


def _serr_from_margins(row, err_columns):
    """Combine margins of error in quadrature and convert to a standard error.

    Parameters
    ----------
    row : mapping indexed by column name (e.g. one row of the ACS frame)
        Must contain every name in ``err_columns``.
    err_columns : list of str
        Names of the margin-of-error entries to combine.

    Returns
    -------
    sqrt(sum of squared margins) / 1.645, passed through ``pandas.to_numeric``
    with ``errors='coerce'``.

    Raises
    ------
    KeyError
        If ``row`` lacks one of ``err_columns``.
    """
    sum_of_squares = sum(row[column]**2 for column in err_columns)
    return pandas.to_numeric(np.sqrt(sum_of_squares) / 1.645, errors='coerce')


def find_serr_hsgrad25plus(row):
    """Standard error of the hs_grad_25plus composite for one row.

    Combines the margins of error of B15002 cells 011-018 and 028-035.
    """
    return _serr_from_margins(row, _HSGRAD25PLUS_ERR_COLUMNS)


def find_serr_pop25plus(row):
    """Standard error of the pop_25plus composite for one row.

    Combines the margins of error of B01001 cells 011-025 and 035-049.
    """
    return _serr_from_margins(row, _POP25PLUS_ERR_COLUMNS)


print('Defined standard-error-calculating functions!')
print('ok')

Defined standard-error-calculating functions!
ok


## Calculate errors

In [12]:
print('\ncalculating and renaming margins of error columns for IVs...')

if (debug >= 1):
    print('...margins for race, owner-occupied units, mfi...')
acs5_df = acs5_df.rename(columns = {
    'B02001_002_err': 'pop_white_err',
    'B02001_003_err': 'pop_black_err',
    'B25003_002_err': 'owner_occ_housing_units_err',
    'B19113_001_err': 'mfi_err'    
})

if (debug >= 1):
    print('...standard errors for hs graduates 25 and older (using custom serr-finding function...')
# Row-wise: each row's component margins of error are combined into one standard error.
acs5_df = acs5_df.assign(hs_grad_25plus_serr = pandas.to_numeric(acs5_df.apply(find_serr_hsgrad25plus, axis=1), errors='coerce'))

if (debug >= 1):
    print('...margins of error for householder sex & race, unempoyment, poverty, home value, home age...')
acs5_df = acs5_df.rename(columns = {     
    'B11001_006_err': 'female_householder_err',
    'B11001B_001_err': 'black_householder_err',
    'B11001H_001_err': 'white_householder_err',
    'B23025_005_err': 'unemployed_16plus_err',
    'B17001_002_err': 'poverty_past_12_months_err',
    'B25077_001_err': 'median_home_value_err',
    'B25035_001_err': 'median_year_built_err'
})

print('\ncalculating and renaming margins of error for comparison variables...')
if (debug >= 1):
    # The original message repeated the "race, owner-occupied units, mfi" label from
    # the first step; these are the total population / households / poverty-status margins.
    print('...total population, total households, poverty status...')
acs5_df = acs5_df.rename(columns = {
    'B01001_001_err': 'pop_total_err',
    'B11001_001_err': 'total_householders_err',
    'B17001_001_err': 'poverty_status_known_err'
})
if (debug >= 1):
    print('...population 25plus...')
acs5_df = acs5_df.assign(pop_25plus_serr = pandas.to_numeric(acs5_df.apply(find_serr_pop25plus, axis=1), errors='coerce'))


if (debug >= 1):
    print('...labor force...')
# 'B17001_001_err' was already renamed in the comparison-variables step above, so
# only the labor-force margin is left to rename here.
acs5_df = acs5_df.rename(columns = {
    'B23025_002_err': 'labor_force_16plus_err'
})

print('Calculated errors for all columns!')
#reinvestment_df[['hs_grad_25plus', 'hs_grad_25plus_serr', 'pop_25plus', 'pop_25plus_serr']]



calculating and renaming margins of error columns for IVs...
...margins for race, owner-occupied units, mfi...
...standard errors for hs graduates 25 and older (using custom serr-finding function...
...margins of error for householder sex & race, unempoyment, poverty, home value, home age...

calculating and renaming margins of error for comparison variables...
...race, owner-occupied units, mfi...
...population 25plus...
...labor force, poverty status known...
Calculated errors for all columns!


## Join census data to loan+jobs data

In [13]:
print('dropping columns we do not care about...')
# The component cells of the two composites (B15002 011-018 / 028-035 and
# B01001 011-025 / 035-049), their margin-of-error columns, and STATE.
# Generated from the cell numbers so no column is listed twice.
hs_grad_cells = list(range(11, 19)) + list(range(28, 36))
pop_25plus_cells = list(range(11, 26)) + list(range(35, 50))
component_columns = ['B15002_{0:03d}'.format(n) for n in hs_grad_cells]
component_columns += ['B01001_{0:03d}'.format(n) for n in pop_25plus_cells]
columns_do_not_care = component_columns + [c + '_err' for c in component_columns] + ['STATE']
acs5_df = acs5_df.drop(columns_do_not_care, axis=1)

print('calculating census tract and block group numbers...')
geoid = acs5_df['GEOID']
# Characters 12-17 of GEOID hold the tract code; dividing by 100 puts the decimal
# point in the right place (e.g. '010100' -> 101.0).
acs5_df = acs5_df.assign(census_tract = pandas.to_numeric(geoid.str[12:18], errors='coerce') / 100)
# Only 19-character GEOIDs carry a block-group digit (character 18); every other
# row gets NaN.
block_group = pandas.to_numeric(geoid.str[18].where(geoid.str.len() == 19), errors='coerce')

print('dropping block groups to avoid double-counting...')
acs5_df = acs5_df[block_group.isnull()]  # select only census tracts (ignore block groups) to avoid double-counting

print('setting index of both tables to do a year-by-year join...')
acs5_df = acs5_df.set_index('census_tract')
baltimore_agg_loans_df = baltimore_agg_loans_df.reset_index().set_index('census_tract')

# For each loan year, join that year's ACS rows onto that year's loan rows by tract,
# then stack the per-year results in one concat instead of growing a frame in a loop.
theyears = baltimore_agg_loans_df['activity_year'].drop_duplicates().tolist()
yearly_joined_frames = [
    baltimore_agg_loans_df[baltimore_agg_loans_df['activity_year'] == x].join(acs5_df[acs5_df['year'] == x])
    for x in theyears
]
baltimore_tracts_years_df = pandas.concat(yearly_joined_frames, axis=0)
baltimore_tracts_years_df = baltimore_tracts_years_df.reset_index().set_index(['census_tract', 'activity_year'])
baltimore_tracts_years_df = baltimore_tracts_years_df.sort_index()

print('backing up...')
# Take a real copy: a plain assignment would only create a second name for the same
# frame, so any later in-place change would also change the "backup".
baltimore_tracts_years_df_bk = baltimore_tracts_years_df.copy()
print('Done!')


dropping columns we do not care about...
calculating census tract and block group numbers...
dropping block groups to avoid double-counting...
setting index of both tables to do a year-by-year join...
backing up...
Done!


# Calculate percentages for needed demographic variables

In [18]:
# Estimates that will be turned into percentages ...
estimate_vars = [
    'pop_white', 'pop_black', 'black_householder', 'white_householder',
    'owner_occ_housing_units', 'hs_grad_25plus', 'female_householder',
    'unemployed_16plus', 'poverty_past_12_months',
]
# ... their margin-of-error / standard-error columns ...
estimate_err_vars = [
    'pop_white_err', 'pop_black_err', 'black_householder_err', 'white_householder_err',
    'owner_occ_housing_units_err', 'hs_grad_25plus_serr', 'female_householder_err',
    'unemployed_16plus_err', 'poverty_past_12_months_err',
]
# ... the comparison (universe) variables ...
comparison_vars = [
    'pop_total', 'total_householders', 'pop_25plus', 'labor_force_16plus',
    'poverty_status_known',
]
# ... and the comparison variables' error columns.
comparison_err_vars = [
    'pop_total_err', 'total_householders_err', 'pop_25plus_serr', 'labor_force_16plus_err',
    'poverty_status_known_err',
]

vars_for_percentification = estimate_vars + estimate_err_vars + comparison_vars + comparison_err_vars

# Selecting the columns doubles as a check that every name exists in the joined frame.
baltimore_tracts_years_df[vars_for_percentification].columns.tolist()
#print('ok')

['pop_white',
 'pop_black',
 'black_householder',
 'white_householder',
 'owner_occ_housing_units',
 'hs_grad_25plus',
 'female_householder',
 'unemployed_16plus',
 'poverty_past_12_months',
 'pop_white_err',
 'pop_black_err',
 'black_householder_err',
 'white_householder_err',
 'owner_occ_housing_units_err',
 'hs_grad_25plus_serr',
 'female_householder_err',
 'unemployed_16plus_err',
 'poverty_past_12_months_err',
 'pop_total',
 'total_householders',
 'pop_25plus',
 'labor_force_16plus',
 'poverty_status_known',
 'pop_total_err',
 'total_householders_err',
 'pop_25plus_serr',
 'labor_force_16plus_err',
 'poverty_status_known_err']

In [17]:
print('getting from backup...')
# Work on a copy so the backup stays pristine and this cell can be re-run.
baltimore_tracts_years_df = baltimore_tracts_years_df_bk.copy()

# (new column, numerator column, denominator column).
# The householder shares are taken over total householders; they were
# previously computed from pop_white / pop_black over pop_total, which made
# them exact duplicates of pct_white / pct_black.
# Values are stored as fractions (0-1), not multiplied by 100.
pct_definitions = [
    ('pct_white', 'pop_white', 'pop_total'),
    ('pct_black', 'pop_black', 'pop_total'),
    ('pct_white_householders', 'white_householder', 'total_householders'),
    ('pct_black_householders', 'black_householder', 'total_householders'),
]

for pct_name, numerator_col, denominator_col in pct_definitions:
    ratio = baltimore_tracts_years_df[numerator_col] / baltimore_tracts_years_df[denominator_col]
    # to_numeric(..., errors='coerce') turns anything non-numeric into NaN
    # rather than raising.
    baltimore_tracts_years_df = baltimore_tracts_years_df.assign(
        **{pct_name: pandas.to_numeric(ratio, errors='coerce')}
    )

baltimore_tracts_years_df['pct_white']

getting from backup...


census_tract  activity_year
101.00        2011             0.924555
              2012             0.941853
              2013             0.944095
              2014             0.915238
              2015             0.906877
              2016             0.898929
              2017             0.884099
102.00        2011             0.831728
              2012             0.730096
              2013             0.754756
              2014             0.837133
              2015             0.804982
              2016             0.822929
              2017             0.882353
103.00        2011             0.881521
              2012             0.924419
              2013             0.920729
              2014             0.904184
              2015             0.907987
              2016             0.908834
              2017             0.893417
104.00        2011             0.931542
              2012             0.910345
              2013             0.906136
            

# Correct for inflation

In [None]:
# Every dollar-denominated column. Each one gets a companion '<name>_adj'
# column expressed in 2017 dollars; the original columns are left untouched.
money_columns = [
    'amtLoans1', 'amtLoans100k', 'amtLoans250k', 'amtLoansToSmallest',
    'amtLoans', 'amtWorkingLoans',
    'mfi', 'median_home_value',
    'amtLoans1_per_totaljob', 'amtLoans100k_per_totaljob',
    'amtLoans250k_per_totaljob', 'amtLoansToSmallest_per_totaljob',
    'amtLoans_per_totaljob', 'amtWorkingLoans_per_totaljob',
    'amtLoans1_per_sbjob', 'amtLoans100k_per_sbjob',
    'amtLoans250k_per_sbjob', 'amtLoansToSmallest_per_sbjob',
    'amtLoans_per_sbjob', 'amtWorkingLoans_per_sbjob',
]

print('getting inflation data...')
cpi_1913_2017_df = pandas.read_csv(inflation_dir+'cpi-1913-2017.csv', index_col='Year')
# The 'Jan' column is used as the CPI value for each year.
# NOTE(review): if the file also carries an annual-average column, that may
# be the more appropriate deflator -- confirm.
cpi_annual_s = cpi_1913_2017_df['Jan'].rename('rawfactor')
value_in_2017 = cpi_annual_s.loc[2017]

# Multiplier that converts year-Y dollars into 2017 dollars (1.0 for 2017).
annual_inflator_s = 1 / (cpi_annual_s / value_in_2017)

print('inflating pre-2017 monetary values...')
inflate_these_df = baltimore_tracts_years_df[money_columns].add_suffix('_adj')

# Level 1 of the row index is activity_year; look up one multiplier per row.
row_years = inflate_these_df.index.get_level_values(1)
missing_years = sorted(set(row_years) - set(annual_inflator_s.index))
if missing_years:
    # Fail loudly rather than silently producing NaN-adjusted amounts.
    raise KeyError('No CPI value for activity year(s): {0:}'.format(missing_years))
row_factors = pandas.Series(annual_inflator_s.reindex(row_years).values, index=inflate_these_df.index)

# A single row-wise multiply replaces the per-year slice / apply / concat loop.
inflated_df = inflate_these_df.mul(row_factors, axis=0)

print('joining inflated money columns to the rest of the columns...')
# Drop any '_adj' columns left over from a previous run of this cell so the
# join does not fail on overlapping column names when the cell is re-run.
baltimore_tracts_years_df = baltimore_tracts_years_df.drop(inflated_df.columns.tolist(), axis=1, errors='ignore')
baltimore_tracts_years_df = baltimore_tracts_years_df.join(inflated_df, how='left')


print('Done!')


## Add codes (e.g. CSA, income group...)

# Add community statistical area names
print('Matching community statistical areas...')
tract_to_csa_df = pandas.read_csv(baltimore_dir+'census_tract_to_neighborhood.csv', index_col='NAME10')
# Left merge keeps every (tract, year) row even when a tract has no entry in
# the lookup file; the lookup's key/ID columns are dropped afterwards.
# NOTE(review): if the lookup lists a tract more than once, this merge will
# duplicate that tract's rows -- confirm NAME10 is unique in the file.
baltimore_tracts_years_df = (
    baltimore_tracts_years_df.reset_index()
    .merge(tract_to_csa_df.reset_index(), how='left', left_on='census_tract', right_on='NAME10')
    .set_index(['census_tract', 'activity_year'])
    .drop(['NAME10', 'TRACTCE10', 'GEOID10'], axis=1)
)

print('Decoding income group names...')
# Income group code -> human-readable label. Codes without an entry
# (including 14) get NaN in income_group.
income_group_names = {
    1: '< 10% of Median Family Income (MFI)',
    2: '10% to 20% of MFI',
    3: '20% to 30% of MFI',
    4: '30% to 40% of MFI',
    5: '40% to 50% of MFI',
    6: '50% to 60% of MFI',
    7: '60% to 70% of MFI',
    8: '70% to 80% of MFI',
    9: '80% to 90% of MFI',
    10: '90% to 100% of MFI',
    11: '100% to 110% of MFI',
    12: '110% to 120% of MFI',
    13: '> 120% of MFI',
}
baltimore_tracts_years_df = baltimore_tracts_years_df.rename(columns = {'income_group_total': 'income_group_code'})
# A dict lookup via map() builds the label column in one step, instead of
# writing strings piecemeal into a float NaN column (which newer pandas
# flags as an incompatible-dtype assignment).
baltimore_tracts_years_df = baltimore_tracts_years_df.assign(
    income_group = baltimore_tracts_years_df['income_group_code'].map(income_group_names)
)

print('Decoding income levels (low/moderate/middle/upper/unknown)...')
# (level, first code, last code), inclusive on both ends.
cra_level_ranges = [
    ('low', 1, 5),
    ('moderate', 6, 8),
    ('middle', 9, 12),
    ('upper', 13, 13),
    ('unknown', 14, 14),
]
cra_level_by_code = {}
for level_name, first_code, last_code in cra_level_ranges:
    for code in range(first_code, last_code + 1):
        cra_level_by_code[code] = level_name
# Codes outside 1-14 are left as NaN.
baltimore_tracts_years_df = baltimore_tracts_years_df.assign(
    cra_level = baltimore_tracts_years_df['income_group_code'].map(cra_level_by_code)
)

print('Writing out...')
baltimore_tracts_years_df.to_csv(baltimore_dir+'baltimore_alldata.csv', encoding='utf-8')
print('Done!')
