In [1]:
import html
import io
import os
import time
from pprint import pprint

import geopandas as gpd
import numpy as np
import pandas
from IPython.display import display, HTML
from matplotlib import pyplot as plt
from matplotlib import ticker
# Work from the scratch directory: later cells open files by relative path.
os.chdir('/home/idies/workspace/Temporary/raddick/cra_scratch/')

# Root folders for CRA data/outputs and for the map shapefiles.
outdir = '/home/idies/workspace/Storage/raddick/Baltimore/community_reinvestment_act/'
shapefiledir = '/home/idies/workspace/Storage/raddick/Baltimore/shapefiles/'

# Echo the three locations, one per line, so the run log records them.
print(os.getcwd(), outdir, shapefiledir, sep='\n')
print('DONE')

/home/idies/workspace/Temporary/raddick/cra_scratch
/home/idies/workspace/Storage/raddick/Baltimore/community_reinvestment_act/
/home/idies/workspace/Storage/raddick/Baltimore/shapefiles/
DONE


## Load shapefiles

In [2]:
# ---- Census tracts ---------------------------------------------------------
s = time.time()
print('reading shapefile...')

tract_shapes_gdf = gpd.read_file(
    shapefiledir + 'census_tracts_2010/geo_export_c50bbe56-543e-4878-9c9f-c56be327600a.shp',
    encoding='utf-8',
)
# Index each tract by the last space-separated token of its 'name' field,
# converted to a number; tokens that are not numeric become NaN.
tract_shapes_gdf = tract_shapes_gdf.assign(
    tractname=pandas.to_numeric(
        tract_shapes_gdf['name'].apply(lambda label: label.rsplit(' ', 1)[-1]),
        errors='coerce',
    )
).set_index('tractname')
print('Read {0:,.0f} Maryland census tracts in {1:.3f} seconds.'.format(len(tract_shapes_gdf), time.time() - s))

print('Calculating percent white...')
white_share = tract_shapes_gdf['white'] / tract_shapes_gdf['population']
tract_shapes_gdf = tract_shapes_gdf.assign(percent_white=white_share)

# ---- City boundary ---------------------------------------------------------
print('\nreading city boundary...')
boundary_gdf = gpd.read_file(shapefiledir + 'baltimore_city_polygon/baltimore_city_polygon.shp')

# ---- Water features --------------------------------------------------------
print('\nreading water features...')
water_gdf = gpd.read_file(shapefiledir + 'water/water.shp', encoding='utf-8').set_index('OBJECTID')

# ---- Street centerlines ----------------------------------------------------
s = time.time()
print('\nreading street centerlines...')
streets_gdf = gpd.read_file(shapefiledir + 'streets/streetcl.shp', encoding='utf-8')
print('Read {0:,.0f} streets in {1:.3f} seconds.'.format(len(streets_gdf), time.time() - s))

streets_gdf = streets_gdf.drop('OBJECTID_1', axis=1).set_index('OBJECTID')

# Keep only the streets that lie within the city boundary polygon.
# NOTE(review): a saved run of this cell shows geopandas warning that the CRS
# of the two joined frames does not match -- confirm both layers really use
# the same coordinates. Two alternatives were tried and left disabled:
#   streets_gdf = gpd.overlay(streets_gdf, boundary_gdf, how='intersection')
#   for x in (boundary_gdf, water_gdf, streets_gdf):
#       x.crs = tract_shapes_gdf.crs
print('Cutting off streets at city boundary...')
s = time.time()
streets_gdf = gpd.sjoin(streets_gdf, boundary_gdf, op='within')
print('Remaining: {0:,.0f} streets after {1:,.0f} seconds of processing.'.format(len(streets_gdf), time.time() - s))

print('DONE')


reading shapefile...
Read 200 Maryland census tracts in 0.140 seconds.
Calculating percent white...

reading city boundary...

reading water features...

reading street centerlines...
Read 48,160 streets in 11.558 seconds.
Cutting off streets at city boundary...


  warn('CRS of frames being joined does not match!')


Remaining: 47,642 streets after 16 seconds of processing.
DONE


# Lending by geography of borrowers

## Load all data

In [None]:
# Read the processed tract-level CSV from the working directory, using its
# 'rownumber' column as the index.
load_started = time.time()
baltimore_tracts_df = pandas.read_csv(
    'tracts_processed.csv',
    encoding='utf-8',
    index_col='rownumber',
)
print('{0:,.0f} tract datas nationwide.'.format(len(baltimore_tracts_df)))

# The "backup" is a second name for the same DataFrame object (no copy is
# made); later cells start again from it.
print('backing up...')
baltimore_tracts_df_bk = baltimore_tracts_df
print('DONE in {0:,.0f} seconds.'.format(time.time() - load_started))

## Select Baltimore City, convert tract income groups to low/moderate/middle/upper

In [None]:
print('retrieving from backup...')
baltimore_tracts_df = baltimore_tracts_df_bk

# Keep rows with state code 24 and county code 510 (Baltimore City).
in_baltimore = (baltimore_tracts_df['state'] == 24) & (baltimore_tracts_df['county'] == 510)
baltimore_tracts_df = baltimore_tracts_df[in_baltimore]
print('{0:,.0f} tract datas in Baltimore.'.format(len(baltimore_tracts_df)))

# Of those, keep only rows flagged as having loans.
baltimore_tracts_df = baltimore_tracts_df[baltimore_tracts_df['loan_indicator'] == 'Y']
print('{0:,.0f} of them had loans.'.format(len(baltimore_tracts_df)))

print('Converting full level system to low/moderate/medium/upper...')
# Inclusive ranges of 'income_group_total' codes and the label each maps to.
# Codes outside every range (and missing codes) keep the empty-string default.
cra_level_ranges = (
    (1, 5, 'low'),
    (6, 8, 'moderate'),
    (9, 12, 'middle'),
    (13, 13, 'upper'),
    (14, 15, 'unknown'),
)
baltimore_tracts_df = baltimore_tracts_df.assign(cra_level='')
income_codes = baltimore_tracts_df['income_group_total']
for first_code, last_code, level_name in cra_level_ranges:
    baltimore_tracts_df.loc[income_codes.between(first_code, last_code), 'cra_level'] = level_name

print('backing up...')
baltimore_tracts_df_bk = baltimore_tracts_df
print('DONE')


# Aggregate all loans within census tracts

In [None]:
print('retrieving from backup...')
baltimore_tracts_df = baltimore_tracts_df_bk

print('grouping...')

# Count rows per (census tract, year, institution) among rows flagged as
# loans, then sort by the resulting three-level index.
loan_rows = baltimore_tracts_df[baltimore_tracts_df['loan_indicator'] == 'Y']
tractdata = (
    loan_rows
    .groupby(['census_tract', 'activity_year', 'institution_name'])
    .size()
    .sort_index()
)

# ---------------------------------------------------------------------------
# Reference recipes for exploring `tractdata` (all left disabled).
# The index levels are: census_tract, activity_year, institution_name.
# 'cra_level' is NOT one of them; add it to the groupby keys above before
# using any cra_level recipe.
#
# Show every index value on every row:
#   pandas.set_option('display.multi_sparse', False)
#
# All rows matching one value on one level:
#   tractdata.xs(101.00, level='census_tract')                             # loans in census tract 101
#   tractdata.xs(2016, level='activity_year')                              # loans made in 2016
#   tractdata.xs('WELLS FARGO BANK, N.A. (CA)', level='institution_name')  # loans by Wells Fargo
#
# Number of loans for each value of a level:
#   tractdata.groupby([pandas.Grouper(level='census_tract')]).sum()      # per census tract
#   tractdata.groupby([pandas.Grouper(level='activity_year')]).sum()     # per year
#   tractdata.groupby([pandas.Grouper(level='institution_name')]).sum()  # per institution
#
# Iterate over the groups formed by the values of one level:
#   grouped_by_census_tract = tractdata.groupby('census_tract')
#   for name, group in grouped_by_census_tract:
#       print(name)
#       print(group)
#       print('\n')
#
# Fetch the group for a specific value on one level...
#   grouped_by_census_tract = tractdata.groupby('census_tract')
#   grouped_by_census_tract.get_group(101.00)
# ...or on several levels (get_group takes a tuple):
#   tractdata.groupby(['census_tract', 'institution_name']).get_group((101, '1ST MARINER BANK (MD)'))
# ---------------------------------------------------------------------------

print('backing up...')
tractdata_bk = tractdata

print('OK')
# tractdata

# Connect census tract aggregates with shapefiles

In [None]:
print('getting tract loan data from backup...')
tractdata = tractdata_bk

# Collapse the year and institution levels: one total per census tract.
print('Grouping by tract only...')
grouped_by_census_tract = tractdata.groupby(level='census_tract').sum()

# assign() aligns the totals with the shapes on the index, so any tract that
# has no entry in the totals ends up with NaN in 'nLoans'.
print('\nAdding loan data to shape data...')
tract_shapes_gdf = tract_shapes_gdf.assign(nLoans=grouped_by_census_tract)

# Disabled: loans normalised by population.
#   print('\nCalculating loans per 10,000 people...')
#   tract_shapes_gdf = tract_shapes_gdf.assign(
#       nLoansPer10k=(10000 * tract_shapes_gdf['nLoans']) / tract_shapes_gdf['population'])

# Reproject the tract polygons into the water layer's coordinate reference
# system so the layers can share one set of axes.
print('matching up CSAs...')
water_crs = water_gdf.crs
tract_shapes_gdf = tract_shapes_gdf.to_crs(water_crs)

print('\nbacking up...')
tract_shapes_gdf_bk = tract_shapes_gdf

print('ok')

# For each tract, look up median family income (MFI) from the ACS 2012-2016 5-year estimates

In [None]:
print('retrieving from backup...')
tract_shapes_gdf = tract_shapes_gdf_bk

print('reading MFI data...')
# header=1: the second line of the file supplies the column names.
df = pandas.read_csv(outdir+'mfi_and_business_data/ACS_16_5YR_B19113_with_ann.csv', header=1)

# 'Geography' is comma-separated; the tract number is whatever follows the
# first 12 characters of its first field.
# presumably those 12 characters are a "Census Tract" prefix -- verify against the file
df = df.assign(census_tract = pandas.to_numeric(df['Geography'].apply(lambda x: x.split(',')[0][12:])))
df = df.rename(columns={'Estimate; Median family income in the past 12 months (in 2016 inflation-adjusted dollars)': 'mfi'})
df = df.set_index('census_tract')

# A bare '-' in the estimate column is treated as "no estimate".
df.loc[df['mfi'] == '-', 'mfi'] = np.nan

print('assigning MFI to census tracts...')
# Look up each tract's MFI by tract number and attach it by index alignment,
# the same way 'nLoans' is attached. The earlier version used
# tract_shapes_gdf.join(df['mfi']), which raises "columns overlap" on any
# re-run: the backup written below already carries an 'mfi' column, and this
# cell starts from that backup. assign() simply replaces an existing column,
# so the cell can now be run repeatedly. Only the looked-up tracts are
# converted to numbers, as before; tracts without a match get NaN.
mfi_by_tract = pandas.to_numeric(df['mfi'].reindex(tract_shapes_gdf.index))
tract_shapes_gdf = tract_shapes_gdf.assign(mfi = mfi_by_tract)

print('backing up...')
tract_shapes_gdf_bk = tract_shapes_gdf

print('Done!')

# Make maps

## Locator map

In [None]:
# Every size below is multiplied by `scale`; 0.25 was the alternative value
# noted by the author.
scale = 1
fig, ax = plt.subplots(figsize=(48 * scale, 48 * scale))
ax.set_aspect('equal')
ax.tick_params(axis='both', which='both',
               bottom=False, left=False, labelleft=False, labelbottom=False)

# Layers, bottom to top: tracts, harbor, city outline, streets.
tract_shapes_gdf.plot(ax=ax, alpha=0.15, color='red', edgecolor='black', linewidth=2 * scale)
harbor_gdf = water_gdf[water_gdf['NAME'] == 'Harbor']
harbor_gdf.plot(ax=ax)
boundary_gdf.plot(ax=ax, color='none', edgecolor='black', linewidth=10 * scale)
streets_gdf.plot(ax=ax, color='black', linewidth=0.5 * scale)

# Write each tract's index value at its centroid (tracts with a population
# value only).
tracts_with_population = tract_shapes_gdf[tract_shapes_gdf['population'].notnull()]
for tract_id, tract in tracts_with_population.iterrows():
    center = tract.geometry.centroid
    ax.annotate(
        str(tract_id),
        xy=(center.x, center.y),
        xytext=(center.x, center.y),
        ha='center', va='center', backgroundcolor='white', fontsize=20 * scale,
    )
plt.show()

## Simple map of MFI

In [None]:
# Quick-look choropleth of median family income; tracts without an MFI value
# are left out, and the harbor is drawn in gray on top.
fig, ax = plt.subplots()
has_mfi = tract_shapes_gdf['mfi'].notnull()
tract_shapes_gdf[has_mfi].plot(column='mfi', ax=ax)
is_harbor = water_gdf['NAME'] == 'Harbor'
water_gdf[is_harbor].plot(ax=ax, color='gray')
plt.show()

## Simple map of number of loans (NOT normalized by businesses)

In [None]:
# Quick-look choropleth of raw loan counts; tracts without a count are left
# out, and the harbor is drawn in gray on top.
fig, ax = plt.subplots()
has_loans = tract_shapes_gdf['nLoans'].notnull()
tract_shapes_gdf[has_loans].plot(column='nLoans', ax=ax)
is_harbor = water_gdf['NAME'] == 'Harbor'
water_gdf[is_harbor].plot(ax=ax, color='gray')
plt.show()
# (disabled) tract_shapes_gdf_bk = tract_shapes_gdf

## Full-size pretty map showing both

In [None]:
# Two-panel choropleth: number of CRA loans (left) and median family income
# (right), each with a horizontal colourbar underneath.
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(60,34))

# Colour ranges. Each map is drawn with the same vmin/vmax as its colourbar.
# Previously the maps were left to scale themselves to the data while the
# colourbars were built separately from 0 (and, for income, up to a fixed
# 220000), so the legend colours did not correspond to the tract colours.
loans_vmin, loans_vmax = 0, tract_shapes_gdf['nLoans'].max()
mfi_vmin, mfi_vmax = 0, 220000

harbor_gdf = water_gdf[water_gdf['NAME'] == 'Harbor']


def _label_tracts(ax, tracts, make_label):
    """Write make_label(tract_id, row) at the centroid of every row of `tracts`."""
    for tract_id, tract in tracts.iterrows():
        center = tract.geometry.centroid
        ax.annotate(make_label(tract_id, tract),
                    xy=(center.x, center.y),
                    xytext=(center.x, center.y),
                    ha='center', va='center', backgroundcolor='white', fontsize=11)


def _loan_label(tract_id, tract):
    """Tract index value, plus the loan count when the tract has residents."""
    label = str(tract_id)
    if tract['population'] > 0:
        label += '\n' + '{0:,.0f}'.format(tract['nLoans'])
        # (disabled) label += '\n' + '(n = {0:,.0f})'.format(tract['population'])
    return label


def _mfi_label(tract_id, tract):
    """Tract index value plus its median family income in dollars."""
    return str(tract_id) + '\n' + '${0:,.0f}'.format(tract['mfi'])


# --- Left panel: number of loans ---
ax1 = tract_shapes_gdf.plot(ax=ax1, column='nLoans', cmap='viridis', edgecolor='black',
                            vmin=loans_vmin, vmax=loans_vmax)
ax1.set_title('Number of CRA loans originated', fontsize=36)
ax1.set_aspect('equal')
harbor_gdf.plot(ax=ax1, color='w')
_label_tracts(ax1, tract_shapes_gdf[tract_shapes_gdf['population'].notnull()], _loan_label)

# --- Right panel: median family income (tracts with an MFI value only) ---
tracts_with_mfi = tract_shapes_gdf[tract_shapes_gdf['mfi'].notnull()]
ax2 = tracts_with_mfi.plot(ax=ax2, column='mfi', cmap='viridis', edgecolor='black',
                           vmin=mfi_vmin, vmax=mfi_vmax)
ax2.set_title('Median family income', fontsize=36)
ax2.set_aspect('equal')
harbor_gdf.plot(ax=ax2, color='w')
_label_tracts(ax2, tracts_with_mfi, _mfi_label)

for panel in (ax1, ax2):
    panel.tick_params(axis='both', which='both', bottom=False, left=False, labelleft=False, labelbottom=False)

# --- Colourbar for the loan counts ---
cax1 = fig.add_axes([0.12, 0.08, 0.36, 0.03])
sm1 = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(vmin=loans_vmin, vmax=loans_vmax))
# A standalone ScalarMappable needs some array set before it can feed a
# colourbar on older matplotlib; use the public setter, not the private _A.
sm1.set_array([])
cbar1 = fig.colorbar(sm1, cax=cax1, orientation='horizontal')
cax1.tick_params(labelsize=38)
cbar1.set_label('Number of CRA loans', fontsize=46)

# --- Colourbar for median family income ---
sm2 = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(vmin=mfi_vmin, vmax=mfi_vmax))
sm2.set_array([])
cax2 = fig.add_axes([0.545, 0.08, 0.36, 0.03])
cbar2 = fig.colorbar(sm2, cax=cax2, format='%.0f', ticks=np.arange(0, 300000, 50000), orientation='horizontal')
cax2.tick_params(labelsize=38)
cbar2.set_label('Median Family Income', fontsize=46)

plt.show()

# Disabled: write the figure to disk instead of showing it.
#print('Saving...')
#plt.savefig(outdir+'baltimore_loans_and_mfi_by_census_tract.svg', format='svg', dpi=300)
#print('DONE!')


# Examine two census tracts: 1207 (Charles Village) and 1403 (Druid Heights)

In [None]:
import matplotlib.patches as mpatches  # kept from the original cell; nothing below refers to it
# (disabled) tract_shapes_gdf = tract_shapes_gdf_bk   # tracts of interest: 1207 1403

# Every size below is multiplied by `scale`.
scale = 1
fig, ax = plt.subplots(figsize=(48 * scale, 48 * scale))
ax.set_aspect('equal')
ax.tick_params(axis='both', which='both',
               bottom=False, left=False, labelleft=False, labelbottom=False)

# Shade the two tracts of interest; both share everything but the colour.
highlight_style = dict(ax=ax, alpha=0.5, edgecolor='black', linewidth=2 * scale)
csa1 = tract_shapes_gdf[tract_shapes_gdf.index == 1207].plot(color='red', **highlight_style)
csa2 = tract_shapes_gdf[tract_shapes_gdf.index == 1403].plot(color='green', **highlight_style)

# Context layers: harbor, city outline, streets.
water_gdf[water_gdf['NAME'] == 'Harbor'].plot(ax=ax)
boundary_gdf.plot(ax=ax, color='none', edgecolor='black', linewidth=10 * scale)
streets_gdf.plot(ax=ax, color='black', linewidth=0.5 * scale)

# Custom legend built from empty plots drawn with large square markers, after
# https://stackoverflow.com/questions/44098362/using-mpatches-patch-for-a-custom-legend
colors = ["r", "g"]
texts = ["Census tract 1207", "Census tract 1403"]
patches = []
for swatch_color, swatch_text in zip(colors, texts):
    swatch = plt.plot([], [], marker="s", ms=100 * scale, ls="", mec=None,
                      color=swatch_color, label="{:s}".format(swatch_text))[0]
    patches.append(swatch)
plt.legend(handles=patches, bbox_to_anchor=(0.25, 0.15),
           loc='center', ncol=1, numpoints=1, fontsize=80 * scale, labelspacing=1 * scale)
plt.title('Locator map for Baltimore census tracts', fontsize=84 * scale)

# The figure is written to disk rather than shown inline.
plt.savefig(outdir + 'figures/locator_cv_dhu.svg', format='svg')
print('Figure saved')
# plt.show()

## Number of loans by assessment area for those census tracts

### Census tract 1207

In [None]:
# Rows for census tract 1207, counted per assessment area. Missing values are
# replaced with 'none or unknown' first so they form a group of their own.
rows_1207 = baltimore_tracts_df[baltimore_tracts_df['census_tract'] == 1207]
loans_1207_by_assessment_area_s = (
    rows_1207
    .fillna('none or unknown')
    .groupby('assessment_area_number')
    .size()
)
# One-column table ('nLoans') indexed by assessment area number.
loans_1207_by_assessment_area_df = loans_1207_by_assessment_area_s.to_frame(name='nLoans')
loans_1207_by_assessment_area_df

### Census tract 1403

In [None]:
# Rows for census tract 1403, counted per assessment area. Missing values are
# replaced with 'none or unknown' first so they form a group of their own.
rows_1403 = baltimore_tracts_df[baltimore_tracts_df['census_tract'] == 1403]
loans_1403_by_assessment_area_s = (
    rows_1403
    .fillna('none or unknown')
    .groupby('assessment_area_number')
    .size()
)
# One-column table ('nLoans') indexed by assessment area number.
loans_1403_by_assessment_area_df = loans_1403_by_assessment_area_s.to_frame(name='nLoans')
loans_1403_by_assessment_area_df

## Number of loans by income group for those census tracts

### Census tract 1207

In [None]:
# Rows for census tract 1207, counted per income group. Missing values are
# replaced with 'none or unknown' first so they form a group of their own.
rows_1207 = baltimore_tracts_df[baltimore_tracts_df['census_tract'] == 1207]
loans_1207_by_income_group_s = (
    rows_1207
    .fillna('none or unknown')
    .groupby('income_group')
    .size()
)
# One-column table ('nLoans') indexed by income group.
loans_1207_by_income_group_df = loans_1207_by_income_group_s.to_frame(name='nLoans')
loans_1207_by_income_group_df

### Census tract 1403

In [None]:
# Rows for census tract 1403, counted per income group. Missing values are
# replaced with 'none or unknown' first so they form a group of their own.
rows_1403 = baltimore_tracts_df[baltimore_tracts_df['census_tract'] == 1403]
loans_1403_by_income_group_s = (
    rows_1403
    .fillna('none or unknown')
    .groupby('income_group')
    .size()
)
# One-column table ('nLoans') indexed by income group.
loans_1403_by_income_group_df = loans_1403_by_income_group_s.to_frame(name='nLoans')
loans_1403_by_income_group_df

## List of all loans in those census tracts

### Census tract 1207

In [None]:
# Every row for census tract 1207, reduced to the five columns shown in the
# table, with blanks for missing values, ordered by income group.
output = baltimore_tracts_df.loc[
    baltimore_tracts_df['census_tract'] == 1207,
    ['census_tract', 'assessment_area_number', 'income_group', 'cra_level', 'institution_name'],
].fillna('').sort_values('income_group')

# Build the table as a list of fragments and join once at the end.
# Fixes over the earlier version:
#   * each row is now closed with </tr> (it used to emit a second <tr>);
#   * cell text is HTML-escaped, so characters such as '&' or '<' in an
#     institution name are displayed literally instead of being read as markup.
html_parts = ['<table>']
html_parts.append('<tr><th>Census tract</th><th>Assessment area</th><th>Income group</th><th>CRA level</th><th>Institution name</th></tr>')
for ix, thisrow in output.iterrows():
    cells = ''.join('<td>' + html.escape(str(thiscol)) + '</td>' for thiscol in thisrow)
    html_parts.append('<tr>' + cells + '</tr>')
html_parts.append('</table>')
htmlString = ''.join(html_parts)

display(HTML(htmlString))

### Census tract 1403

In [None]:
# Every row for census tract 1403, reduced to the five columns shown in the
# table, with blanks for missing values, ordered by income group.
output = baltimore_tracts_df.loc[
    baltimore_tracts_df['census_tract'] == 1403,
    ['census_tract', 'assessment_area_number', 'income_group', 'cra_level', 'institution_name'],
].fillna('').sort_values('income_group')

# Build the table as a list of fragments and join once at the end.
# Fixes over the earlier version:
#   * each row is now closed with </tr> (it used to emit a second <tr>);
#   * cell text is HTML-escaped, so characters such as '&' or '<' in an
#     institution name are displayed literally instead of being read as markup.
html_parts = ['<table>']
html_parts.append('<tr><th>Census tract</th><th>Assessment area</th><th>Income group</th><th>CRA level</th><th>Institution name</th></tr>')
for ix, thisrow in output.iterrows():
    cells = ''.join('<td>' + html.escape(str(thiscol)) + '</td>' for thiscol in thisrow)
    html_parts.append('<tr>' + cells + '</tr>')
html_parts.append('</table>')
htmlString = ''.join(html_parts)

display(HTML(htmlString))