In [3]:
print('Importing packages...')
import os
import urllib
import pandas
import zipfile
import time
from IPython.display import display, HTML

data_dir = '/home/idies/workspace/Temporary/raddick/cra_scratch'

os.chdir(data_dir)
os.getcwd()

Importing packages...


'/home/idies/workspace/Temporary/raddick/cra_scratch'

# Get 2016 data

## Get the files from ffiec.gov and unzip

In [4]:
# Get oroginal datafiles from ffiec.gov
print('Downloading master datafile...')
thatpath = 'https://www.ffiec.gov/cra/xls/'
theyears = list(range(96,100))
theyears += list(range(0,17))

filenames = []

for i in theyears:
    filenames.append('{:02d}exp_discl.zip'.format(i))
#filenames

for thisfile in filenames:
    print('Starting to download {:}...'.format(thisfile))
    with urllib.request.urlopen(thatpath+thisfile) as response:    
        it = response.read()
        with open(thisfile, 'wb') as f:
            f.write(it)
print('Unzipping individual datafiles...')

allfiles = os.listdir()
allfiles_df = pandas.DataFrame(allfiles)

allfiles_df.columns = ['filename']

allfiles_df['file_extension'] = allfiles_df['filename'][
    allfiles_df['filename'].apply(lambda x: 
                                      (len(str(x).split('.')) > 1)
                                 )
].apply(lambda x: str(x).split('.')[1])

zipfiles_df = allfiles_df[allfiles_df['file_extension'] == 'zip'].sort_values('filename')
zipfiles_df['yearstring'] = zipfiles_df['filename'].apply(lambda x: x[0:2])
zipfiles_df['thisyear'] = pandas.to_numeric(zipfiles_df['yearstring'])

zipfiles_df = zipfiles_df.set_index('thisyear')
zipfiles_df
sortorder = [96, 97, 98, 99, 0, 1, 2, 3]
sortorder += [4, 5, 6, 7, 8, 9, 10, 11]
sortorder += [12, 13, 14, 15, 16]
newindex = pandas.Index(sortorder)
zipfiles_df = zipfiles_df.reindex(newindex)

zipfiles_df['needs_rename'] = zipfiles_df.index.map(lambda x: (x >= 8) & (x <= 15))
zipfiles_df = zipfiles_df.drop(['file_extension', 'yearstring'], axis=1)
zipfiles_df
for idx, thisrow in zipfiles_df.iterrows():
    print('Extracting {:}...'.format(thisrow['filename']))
    thezipfile = zipfile.ZipFile(thisrow['filename'])
    thezipfile.extractall()
    thezipfile.close()
    if (thisrow['needs_rename']):
        print('Renaming...')
        os.rename('exp_discl.dat', '{:02d}exp_discl.dat'.format(idx))

# Delete the zipfiles, we don't need them anymore
for idx, thisrow in zipfiles_df.iterrows():
    os.remove(thisrow['filename'])
print('Done!')

Downloading master datafile...
Starting to download 96exp_discl.zip...
Starting to download 97exp_discl.zip...
Starting to download 98exp_discl.zip...
Starting to download 99exp_discl.zip...
Starting to download 00exp_discl.zip...
Starting to download 01exp_discl.zip...
Starting to download 02exp_discl.zip...
Starting to download 03exp_discl.zip...
Starting to download 04exp_discl.zip...
Starting to download 05exp_discl.zip...
Starting to download 06exp_discl.zip...
Starting to download 07exp_discl.zip...
Starting to download 08exp_discl.zip...
Starting to download 09exp_discl.zip...
Starting to download 10exp_discl.zip...
Starting to download 11exp_discl.zip...
Starting to download 12exp_discl.zip...
Starting to download 13exp_discl.zip...
Starting to download 14exp_discl.zip...
Starting to download 15exp_discl.zip...
Starting to download 16exp_discl.zip...
Unzipping individual datafiles...
Extracting 96exp_discl.zip...
Extracting 97exp_discl.zip...
Extracting 98exp_discl.zip...
Extra

## Parse the single rawdata string into separate columns of strings

Using the guides on their website (https://www.ffiec.gov/cra/pdf/16FlatDiscSpecs.pdf)

In [3]:
rdf = pandas.DataFrame()

s = time.time()

for i in range(1,3):
    for j in range(1,3):
        #
        thisfile = 'cra2016_Discl_D{0:.0f}{1:.0f}.dat'.format(i,j)
        print('Reading file {:}...'.format(thisfile))
        rdf = rdf.append(pandas.read_csv(thisfile, header=None))

for k in range(3,7):
    thisfile = 'cra2016_Discl_D{0:.0f}.dat'.format(k)
    print('Reading file {:}...'.format(thisfile))
    rdf = rdf.append(pandas.read_csv(thisfile, header=None))

e = time.time()

print('\n')
print('Read 2016 data: {0:,.0f} rows in {1:,.0f} seconds.'.format(len(rdf), e-s))

Reading file cra2016_Discl_D11.dat...
Reading file cra2016_Discl_D12.dat...
Reading file cra2016_Discl_D21.dat...
Reading file cra2016_Discl_D22.dat...
Reading file cra2016_Discl_D3.dat...
Reading file cra2016_Discl_D4.dat...
Reading file cra2016_Discl_D5.dat...
Reading file cra2016_Discl_D6.dat...


Read 2016 data: 3,593,829 rows in 13 seconds.


## Read files for 2004 to 2015

In [4]:
s = time.time()

filelist = []
for i in range(4,16):
    thisfile = '{:02d}exp_discl'.format(i)
    if i in [4,5,6]:
        thisfile = thisfile + '_new.dat'
    else:
        thisfile = thisfile + '.dat'
    filelist.append(thisfile)
print('Found filenames!')

for thisfile in filelist:
    print('Reading file {:}...'.format(thisfile))
    rdf = rdf.append(pandas.read_csv(thisfile, header=None))

e = time.time()
print('\n')
print('Read 2004-2015 data: {0:,.0f} rows in {1:,.0f} seconds.'.format(len(rdf), e-s))

rdf_bk = rdf
print('\n')
print('made backup')

Found filenames!
Reading file 04exp_discl_new.dat...
Reading file 05exp_discl_new.dat...
Reading file 06exp_discl_new.dat...
Reading file 07exp_discl.dat...
Reading file 08exp_discl.dat...
Reading file 09exp_discl.dat...
Reading file 10exp_discl.dat...
Reading file 11exp_discl.dat...
Reading file 12exp_discl.dat...
Reading file 13exp_discl.dat...
Reading file 14exp_discl.dat...
Reading file 15exp_discl.dat...


Read 2004-2015 data: 39,019,524 rows in 148 seconds.


made backup


In [5]:
rdf = rdf_bk
rdf.columns = ['thestring']
rdf.index.name = 'rownumber'

cols = ['tableID', 'respondentID', 'agency_code', 'activity_year']

c = 0
s = time.time()
print('constructing empty dataframe...')
df = pandas.DataFrame(data=None, columns=cols, index=rdf.index)
e = time.time()
c = c + (e-s)
print('Took {0:,.0f} seconds\n'.format(e-s))

print('parsing long-ass string data into columns...')
s = time.time()

print('tableID...')
df['tableID'] = rdf['thestring'].apply(lambda x: x[0:5])
print('respondentID...')
df['respondentID'] = rdf['thestring'].apply(lambda x: x[5:15])
print('agency_code...')
df['agency_code'] = rdf['thestring'].apply(lambda x: x[15])
print('activity_year...')
df['activity_year'] = rdf['thestring'].apply(lambda x: x[16:20])
e = time.time()
c = c + (e-s)
print('Converted {0:,.0f} rows in {1:,.0f} seconds.'.format(len(df), e-s))

print('\nTrimming strings in columns...')
s = time.time()
for thiscol in df.columns:
    print('{:}...'.format(thiscol))
    df[thiscol] = df[thiscol].apply(lambda x: x.strip())
e = time.time()
print('Trimmed {0:,.0f} rows in {1:,.0f} seconds.'.format(len(df), e-s))

s = time.time()
print('\nCoverting to numeric...')
print('respondentID...')
df['respondentID'] = pandas.to_numeric(df['respondentID'], errors='coerce')
print('agency_code...')
df['agency_code'] = pandas.to_numeric(df['agency_code'], errors='coerce')
print('activity_year...')
df['activity_year'] = pandas.to_numeric(df['activity_year'], errors='coerce')
e = time.time()
print('Converted columns in {:,.0f} seconds.'.format(e-s))
c = c + (e-s)

print('backing up...')
df_bk = df

print('Done! Total time: {:,.0f} seconds.'.format(c))


constructing empty dataframe...
Took 15 seconds

parsing long-ass string data into columns...
tableID...
respondentID...
agency_code...
activity_year...
Converted 39,019,524 rows in 157 seconds.

Trimming strings in columns...
tableID...
respondentID...
agency_code...
activity_year...
Trimmed 39,019,524 rows in 99 seconds.

Coverting to numeric...
respondentID...
agency_code...
activity_year...
Converted columns in 165 seconds.
backing up...
Done! Total time: 337 seconds.


In [None]:
df = df_bk

theloans = ['D1-1', 'D1-2', 'D2-1', 'D2-2']
theactivities = ['D3-0', 'D4-0']
thecras = ['D5-0']
thetracts = ['D6-0']

c = 0
s = time.time()
print('Assigning new columns to False...')
df = df.assign(isloan=False)
df = df.assign(isactivity=False)
df = df.assign(iscra=False)
df = df.assign(istract=False)
e = time.time()
c = c + (e-s)
print('Assigned, took {0:,.0f} seconds.'.format(e-s))

s = time.time()
df.loc[df['tableID'].isin(theloans), 'isloan'] = True
df.loc[df['tableID'].isin(theactivities), 'isactivity'] = True
df.loc[df['tableID'].isin(thecras), 'iscra'] = True
df.loc[df['tableID'].isin(thetracts), 'istract'] = True
e = time.time()

print('Loans/activities/cras/tracts data: processed {0:,.0f} rows to True in {1:,.0f} seconds.'.format(len(df), e-s))
c = c + (e-s)

s = time.time()
df_bk = df
e = time.time()
c = c + (e-s)
print('backed up in {0:,.0f} seconds\n'.format(e-s))
print('Done! Took {0:,.0f} seconds total.'.format(c))


In [None]:
print('Creating empty dataframe...')
c = 0
s = time.time()
loans_df = pandas.DataFrame(data=None, columns=df.columns, index=df.index)
e = time.time()
c = c + (e-s)
print('Dataframe created in {0:,.0f} seconds.\n'.format(e-s))

print('Getting data on loans (tables D1-1/D1-2/D2-1/D2-2) into loans_df...')
s = time.time()
#loans_df = loans_df.append(df[df['isloan'] == True])
loans_df = df[df['isloan'] == True]
e = time.time()
c = c + (e-s)

print('Loans data: copied {0:,.0f} rows in {1:,.0f} seconds.\n'.format(len(loans_df), e-s))

print('backing up')
loans_df_bk = loans_df

print('Done in {0:,.0f} seconds!'.format(c))


In [None]:
s = time.time()
print('Retrieving backup...')
loans_df = loans_df_bk
print('Creating new columns...')

print('loan_type...')
loans_df = loans_df.assign(loan_type = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[20]))
print('action_taken_type...')
loans_df = loans_df.assign(action_taken_type = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[21]))
print('state...')
loans_df = loans_df.assign(state = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[22:24]))
print('county...')
loans_df = loans_df.assign(county = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[24:27]))
print('msa...')
loans_df = loans_df.assign(msa = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[27:32]))
print('assessment_area_number...')
loans_df = loans_df.assign(assessment_area_number = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[33:36]))
print('partial_county_indicator...')
loans_df = loans_df.assign(partial_county_indicator = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[36]))
print('split_county_indicator...')
loans_df = loans_df.assign(split_county_indicator = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[37]))
print('population_classification...')
loans_df = loans_df.assign(population_classification = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[38]))
print('income_group_total...')
loans_df = loans_df.assign(income_group_total = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[39:42]))
print('report_level...')
loans_df = loans_df.assign(report_level = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[42:45]))
print('nLoans1...')
loans_df = loans_df.assign(nLoans1 = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[45:55]))
print('amtLoans1...')
loans_df = loans_df.assign(amtLoans1 = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[55:65]))
print('nLoans100k...')
loans_df = loans_df.assign(nLoans100k = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[65:75]))
print('amtLoans100k...')
loans_df = loans_df.assign(amtLoans100k = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[75:85]))
print('nLoans250k...')
loans_df = loans_df.assign(nLoans250k = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[85:95]))
print('amtLoans250k...')
loans_df = loans_df.assign(amtLoans250k = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[95:105]))
print('nLoansToSmallest...')
loans_df = loans_df.assign(nLoansToSmallest = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[105:115]))
print('amtLoansToSmallest...')
loans_df = loans_df.assign(amtLoansToSmallest = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[115:125]))
print('nLoansAff...')
loans_df = loans_df.assign(nLoansAff = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[125:135]))
print('amtLoansAff..\n')
loans_df = loans_df.assign(amtLoansAff = rdf['thestring'][df['isloan'] == True].apply(lambda x: x[135:145]))

print('Keep only lowest level of aggregation to avoid double-counting...')
loans_df = loans_df[loans_df['report_level'] == '   ']

print('Drop columns we do not need...')
loans_df = loans_df.drop(['isloan', 'isactivity', 'iscra', 'istract'], axis=1)
loans_df = loans_df.drop(['tableID','report_level'], axis=1)

print('backing up...')
loans_df_bk = loans_df
e = time.time()

print('Parsed all columns for {0:,.0f} rows in {1:,.0f} seconds'.format(len(loans_df), e-s))

print('Writing outfile...')
s = time.time()
loans_df.to_csv('loans_df.csv', encoding='utf-8')
e = time.time()
print('{0:,.0f} rows written in {1:,.0f} seconds.'.format(len(loans_df), e-s))
print('ok')
#os.getcwd()

#loans_df.sample(3).T
print('Done!')

In [None]:
df[df['isactivity'] == True].head(1)

In [None]:
print('Creating empty dataframe...')
c = 0
s = time.time()
activities_df = pandas.DataFrame(data=None, columns=df.columns, index=df.index)
e = time.time()
c = c + (e-s)
print('Dataframe created in {0:,.0f} seconds.\n'.format(e-s))

print('Getting data on activities (tables D3/D4) into activities_df...')
s = time.time()
activities_df = df[df['isactivity'] == True]
e = time.time()
c = c + (e-s)

print('Activities data: copied {0:,.0f} rows in {1:,.0f} seconds.\n'.format(len(activities_df), e-s))

print('backing up')
activities_df_bk = activities_df

print('Done in {0:,.0f} seconds!'.format(c))
#activities_df.columns

In [None]:
s = time.time()
print('Retrieving backup...')
activities_df = activities_df_bk

print('Creating new columns...')

print('state...')
activities_df = activities_df.assign(state = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[21:23]))
print('county...')
activities_df = activities_df.assign(county = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[23:26]))
print('msa...')
activities_df = activities_df.assign(msa = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[26:31]))
print('assessment_area_number...')
activities_df = activities_df.assign(assessment_area_number = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[31:35]))
print('partial_county_indicator...')
activities_df = activities_df.assign(partial_county_indicator = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[35]))
print('split_county_indicator...')
activities_df = activities_df.assign(split_county_indicator = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[36]))
print('report_level...')
activities_df = activities_df.assign(report_level = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[37:39]))
print('nLoansAll...')
activities_df = activities_df.assign(nLoansAll = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[39:49]))
print('amtLoansAll...')
activities_df = activities_df.assign(amtLoansAll = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[49:59]))
print('nLoans1Mbiz...')
activities_df = activities_df.assign(nLoans1Mbiz = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[59:69]))
print('amtLoans1Mbiz...')
activities_df = activities_df.assign(amtLoans1Mbiz = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[69:79]))
print('nLoanssmallbiz...')
activities_df = activities_df.assign(nLoanssmallbiz = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[79:89]))
print('amtLoanssmallbiz...')
activities_df = activities_df.assign(amtLoanssmallbiz = rdf['thestring'][df['isactivity'] == True].apply(lambda x: x[89:99]))

print('dropping columns no longer needed')
activities_df = activities_df.drop(['isloan', 'isactivity', 'iscra', 'istract'], axis=1)
activities_df = activities_df.drop(['tableID', 'report_level'], axis=1)

e = time.time()

print('Parsed all columns for {0:,.0f} rows in {1:,.0f} seconds'.format(len(activities_df), e-s))

print('Saving file...')
activities_df.to_csv('activities_df.csv', encoding='utf-8')
#activities_df.sample(3).T

print('Done!')

In [None]:
print('Creating empty dataframe...')
c = 0
s = time.time()
cra_df = pandas.DataFrame(data=None, columns=df.columns, index=df.index)
e = time.time()
c = c + (e-s)
print('Dataframe created in {0:,.0f} seconds.\n'.format(e-s))

print('Getting data on activities (tables D3/D4) into activities_df...')
s = time.time()
cra_df = df[df['iscra'] == True]
e = time.time()
c = c + (e-s)

print('CRA data: copied {0:,.0f} rows in {1:,.0f} seconds.\n'.format(len(cra_df), e-s))

print('backing up')
cra_df_bk = cra_df

print('Done in {0:,.0f} seconds!'.format(c))
#cra_df.columns

In [None]:
s = time.time()
print('Retrieving backup...')
cra_df = cra_df_bk
print('Creating new columns...')

print('nLoansAll...')
cra_df = cra_df.assign(nLoansAll = rdf['thestring'][df['iscra'] == True].apply(lambda x: x[21:31]))
print('amtLoansAll...')
cra_df = cra_df.assign(amtLoansAll = rdf['thestring'][df['iscra'] == True].apply(lambda x: x[31:41]))
print('nLoansAff...')
cra_df = cra_df.assign(nLoansAll = rdf['thestring'][df['iscra'] == True].apply(lambda x: x[41:51]))
print('amtLoansAff...')
cra_df = cra_df.assign(amtLoansAll = rdf['thestring'][df['iscra'] == True].apply(lambda x: x[51:61]))
print('action_taken_type...')
cra_df = cra_df.assign(amtLoansAll = rdf['thestring'][df['iscra'] == True].apply(lambda x: x[61]))

print('backing up...')
cra_df_bk = cra_df
e = time.time()

print('Parsed all columns for {0:,.0f} rows in {1:,.0f} seconds'.format(len(cra_df), e-s))

print('dropping columns no longer needed')
cra_df = cra_df.drop(['tableID', 'isloan', 'isactivity', 'iscra', 'istract'], axis=1)

print('Saving file...')
cra_df.to_csv('cra_df.csv', encoding='utf-8')
#cra_df.sample(3).T
print('Done!')

### Table 6: Assessment area(s) by tract

In [None]:
print('Creating empty dataframe...')
c = 0
s = time.time()
tracts_df = pandas.DataFrame(data=None, columns=df.columns, index=df.index)
e = time.time()
c = c + (e-s)
print('Dataframe created in {0:,.0f} seconds.\n'.format(e-s))

print('Getting data on tracts (table D6) into tracts_df...')
s = time.time()
tracts_df = df[df['istract'] == True]
e = time.time()
c = c + (e-s)

print('Tracts data: copied {0:,.0f} rows in {1:,.0f} seconds.\n'.format(len(tracts_df), e-s))

print('backing up')
tracts_df_bk = tracts_df

print('Done in {0:,.0f} seconds!'.format(c))


In [None]:
s = time.time()
print('Retrieving backup...')
tracts_df = tracts_df_bk
print('Creating new columns...')

print('state...')
tracts_df = tracts_df.assign(state = rdf['thestring'][df['istract'] == True].apply(lambda x: x[20:22]))
print('county...')
tracts_df = tracts_df.assign(county = rdf['thestring'][df['istract'] == True].apply(lambda x: x[22:25]))
print('msa...')
tracts_df = tracts_df.assign(msa = rdf['thestring'][df['istract'] == True].apply(lambda x: x[25:30]))
print('census tract...')
tracts_df = tracts_df.assign(census_tract = rdf['thestring'][df['istract'] == True].apply(lambda x: x[30:37]))
print('assessment_area_number...')
tracts_df = tracts_df.assign(assessment_area_number = rdf['thestring'][df['istract'] == True].apply(lambda x: x[37:41]))
print('partial_county_indicator...')
tracts_df = tracts_df.assign(partial_county_indicator = rdf['thestring'][df['istract'] == True].apply(lambda x: x[41]))
print('split_county_indicator...')
tracts_df = tracts_df.assign(split_county_indicator = rdf['thestring'][df['istract'] == True].apply(lambda x: x[42]))
print('population_classification...')
tracts_df = tracts_df.assign(population_classification = rdf['thestring'][df['istract'] == True].apply(lambda x: x[43]))
print('income_group_total...')
tracts_df = tracts_df.assign(income_group_total = rdf['thestring'][df['istract'] == True].apply(lambda x: x[44:47]))
print('loan_indicator...')
tracts_df = tracts_df.assign(loan_indicator = rdf['thestring'][df['istract'] == True].apply(lambda x: x[47]))

print('dropping columns no longer needed')
tracts_df = tracts_df.drop(['tableID', 'isloan', 'isactivity', 'iscra', 'istract'], axis=1)

e = time.time()

print('Parsed all columns for {0:,.0f} rows in {1:,.0f} seconds'.format(len(tracts_df), e-s))

print('Saving file...')
tracts_df.to_csv('tracts_df.csv', encoding='utf-8')
#tracts_df.sample(3).T
print('Done!')

# OLDER: Add historical data

In [None]:
filelist = []
for i in range(4,16):
    thisfile = '{:02d}exp_discl'.format(i)
    if i in [4,5,6]:
        thisfile = thisfile + '_new.dat'
    else:
        thisfile = thisfile + '.dat'
    filelist.append(thisfile)
        
print('Got list of data files from 2004 to 2015!')
print('\n')
print(filelist)

In [None]:

for thisfile in filelist:
    print('Reading {0:}exp_discl.dat...'.format(thisfile))
    thisyear_df = pandas.read_csv(thisfile, low_memory=False, header=None)
    print('Appending...')
    df = df.append(thisyear_df)
    #print('\n')
history_df.columns = ['thestring']
history_df.index.name = 'rownumber'
#history_df.head(1)
print('Done')

In [None]:
'{:,.0f}'.format(len(history_df))

In [None]:
# os.listdir()
# Original file is one huge giant string. Read that string.
print('Reading table D1-1...')
bizorig = pandas.read_csv('cra2016_Discl_D11.dat', header=None)
bizorig.name = 'bizorig'
bizorig.columns = ['thestring']

# Now parse the strings into individual values, following the data guide in the PDF
print('Parsing strings...')
bizorig['tableID'] = bizorig['thestring'].apply(lambda x: x[0:5])
bizorig['respondentID'] = bizorig['thestring'].apply(lambda x: x[5:15])
bizorig['agency_code'] = bizorig['thestring'].apply(lambda x: x[15])
bizorig['activity_year'] = bizorig['thestring'].apply(lambda x: x[16:20])
bizorig['loan_type'] = bizorig['thestring'].apply(lambda x: x[20])
bizorig['action_taken_type'] = bizorig['thestring'].apply(lambda x: x[21])
bizorig['state'] = bizorig['thestring'].apply(lambda x: x[22:24])
bizorig['county'] = bizorig['thestring'].apply(lambda x: x[24:27])
bizorig['msa'] = bizorig['thestring'].apply(lambda x: x[27:32])
bizorig['assessment_area_number'] = bizorig['thestring'].apply(lambda x: x[33:36])
bizorig['partial_county_indicator'] = bizorig['thestring'].apply(lambda x: x[36])
bizorig['split_county_indicator'] = bizorig['thestring'].apply(lambda x: x[37])
bizorig['population_classification'] = bizorig['thestring'].apply(lambda x: x[38])
bizorig['income_group_total'] = bizorig['thestring'].apply(lambda x: x[39:42])
bizorig['report_level'] = bizorig['thestring'].apply(lambda x: x[42:45])

bizorig['nLoans1'] = bizorig['thestring'].apply(lambda x: x[45:55])
bizorig['amtLoans1'] = bizorig['thestring'].apply(lambda x: x[55:65])
bizorig['nLoans100k'] = bizorig['thestring'].apply(lambda x: x[65:75])
bizorig['amtLoans100k'] = bizorig['thestring'].apply(lambda x: x[75:85])
bizorig['nLoans250k'] = bizorig['thestring'].apply(lambda x: x[85:95])
bizorig['amtLoans250k'] = bizorig['thestring'].apply(lambda x: x[95:105])
bizorig['nLoansTotal'] = bizorig['thestring'].apply(lambda x: x[105:115])
bizorig['amtLoansTotal'] = bizorig['thestring'].apply(lambda x: x[115:125])
bizorig['nLoansAff'] = bizorig['thestring'].apply(lambda x: x[125:135])
bizorig['amtLoansAff'] = bizorig['thestring'].apply(lambda x: x[135:145])

bizorig = bizorig.drop('thestring', axis=1)

print(bizorig.groupby('loan_type').size())

bizorig.sample(3).T

In [None]:
df = df_bk

print('Converting data to numeric...')

df['respondentID'] = pandas.to_numeric(df['respondentID'], errors='coerce')
df['agency_code'] = pandas.to_numeric(df['agency_code'], errors='coerce')
df['activity_year'] = pandas.to_numeric(df['activity_year'], errors='coerce')
df['loan_type'] = pandas.to_numeric(df['loan_type'], errors='coerce')
df['action_taken_type'] = pandas.to_numeric(df['action_taken_type'], errors='coerce')
df['state'] = pandas.to_numeric(df['state'], errors='coerce')
df['county'] = pandas.to_numeric(df['county'], errors='coerce')
df['msa'] = pandas.to_numeric(df['msa'], errors='coerce')
df['income_group_total'] = pandas.to_numeric(df['income_group_total'], errors='coerce')

df['nLoans1'] = pandas.to_numeric(df['msa'], errors='coerce')
df['amtLoans1'] = pandas.to_numeric(df['msa'], errors='coerce')
df['nLoans100k'] = pandas.to_numeric(df['msa'], errors='coerce')
df['amtLoans100k'] = pandas.to_numeric(df['msa'], errors='coerce')
df['nLoans250k'] = pandas.to_numeric(df['msa'], errors='coerce')
df['amtLoans250k'] = pandas.to_numeric(df['msa'], errors='coerce')
df['nLoansTotal'] = pandas.to_numeric(df['msa'], errors='coerce')
df['amtLoansTotal'] = pandas.to_numeric(df['msa'], errors='coerce')
df['nLoansAff'] = pandas.to_numeric(df['msa'], errors='coerce')
df['amtLoansAff'] = pandas.to_numeric(df['msa'], errors='coerce')

print('Assigning codes...')

df = df.rename(columns={'loan_type': 'loan_type_code'})
df = df.assign(loan_type = '')

#df.loc[df['loan_type_code'] == '4', 'loan_type'] = 'Small business'
#df.loc[df['loan_type_code'] == '5', 'loan_type'] = 'Small farm'
#df.loc[df['loan_type_code'] == 6, 'loan_type'] = 'Community Development'
#df.loc[df['loan_type_code'] == 7, 'loan_type'] = 'Consortium/Third-Party'

df.head(1).T

In [None]:
df.groupby('loan_type').size()

## Convert raw data columns to numbers and codes

In [None]:
print('Looking up data codes...')
df['agency'] = ''
df.loc[tracts['agency_code'] == 1, 'agency'] = 'OCC'
df.loc[tracts['agency_code'] == 2, 'agency'] = 'FRS'
df.loc[tracts['agency_code'] == 3, 'agency'] = 'FDIC'
df.loc[tracts['agency_code'] == 4, 'agency'] = 'OTS'



In [None]:

tracts = tracts.drop('thestring', axis=1)


#bizorig['assessment_area_number'] = pandas.to_numeric(bizorig['assessment_area_number'], downcast='integer', errors='coerce')

print('Looking up insitution names from respondentIDs...')
respondents = pandas.read_csv('respondentid.csv', index_col='respondentID')
tracts = tracts.join(respondents, how='left', on='respondentID')#[['respondentID', 'institution_name']].drop_duplicates()


tracts.index.name = 'row_number'

tracts_bk = tracts


print('done')

In [None]:
bizorig = bizorig_bk

print('Converting geography codes...')
bizorig['respondentID'] = pandas.to_numeric(bizorig['respondentID'], downcast='integer', errors='coerce')
bizorig['agency_code'] = pandas.to_numeric(bizorig['agency_code'], downcast='integer', errors='coerce')
bizorig['activity_year'] = pandas.to_numeric(bizorig['activity_year'], downcast='integer', errors='coerce')
bizorig['loan_type'] = pandas.to_numeric(bizorig['loan_type'], downcast='integer', errors='coerce')
bizorig['action_taken_type'] = pandas.to_numeric(bizorig['action_taken_type'], downcast='integer', errors='coerce')
bizorig['state'] = pandas.to_numeric(bizorig['state'], downcast='integer', errors='coerce')
bizorig['county'] = pandas.to_numeric(bizorig['county'], downcast='integer', errors='coerce')
bizorig['msa'] = pandas.to_numeric(bizorig['msa'], downcast='integer', errors='coerce')
#bizorig['assessment_area_number'] = pandas.to_numeric(bizorig['assessment_area_number'], downcast='integer', errors='coerce')

print('Converting numbers...')
bizorig['nBizLoans1'] = pandas.to_numeric(bizorig['nBizLoans1'])
bizorig['amtBizLoans1'] = pandas.to_numeric(bizorig['amtBizLoans1']) * 1000
bizorig['nBizLoans100k'] = pandas.to_numeric(bizorig['nBizLoans100k'])
bizorig['amtBizLoans100k'] = pandas.to_numeric(bizorig['amtBizLoans100k']) * 1000
bizorig['nBizLoans250k'] = pandas.to_numeric(bizorig['nBizLoans250k'])
bizorig['amtBizLoans250k'] = pandas.to_numeric(bizorig['amtBizLoans250k']) * 1000
bizorig['nBizLoans1M'] = pandas.to_numeric(bizorig['nBizLoans1M'])
bizorig['amtBizLoans1M'] = pandas.to_numeric(bizorig['amtBizLoans1M']) * 1000
bizorig['nBizLoansAff'] = pandas.to_numeric(bizorig['nBizLoansAff'])
bizorig['amtBizLoansAff'] = pandas.to_numeric(bizorig['amtBizLoansAff']) * 1000

bizorig_bk = bizorig
print('ok')


In [None]:
bizorig = bizorig_bk

print('Looking up data codes...')
# agency code
bizorig['agency'] = ''
bizorig.loc[bizorig['agency_code'] == 1, 'agency'] = 'OCC'
bizorig.loc[bizorig['agency_code'] == 2, 'agency'] = 'FRS'
bizorig.loc[bizorig['agency_code'] == 3, 'agency'] = 'FDIC'
bizorig.loc[bizorig['agency_code'] == 4, 'agency'] = 'OTS'

# income group code
bizorig['income_group'] = 'xxx'
bizorig.loc[bizorig['income_group_total'] == '001', 'income_group'] = '< 10% MFI'
bizorig.loc[bizorig['income_group_total'] == '002', 'income_group'] = '10-20% MFI'
bizorig.loc[bizorig['income_group_total'] == '003', 'income_group'] = '20-30% MFI'
bizorig.loc[bizorig['income_group_total'] == '004', 'income_group'] = '30-40% MFI'
bizorig.loc[bizorig['income_group_total'] == '005', 'income_group'] = '40-50% MFI'
bizorig.loc[bizorig['income_group_total'] == '006', 'income_group'] = '50-60% MFI'
bizorig.loc[bizorig['income_group_total'] == '007', 'income_group'] = '60-70% MFI'
bizorig.loc[bizorig['income_group_total'] == '008', 'income_group'] = '70-80% MFI'
bizorig.loc[bizorig['income_group_total'] == '009', 'income_group'] = '80-90% MFI'
bizorig.loc[bizorig['income_group_total'] == '010', 'income_group'] = '90-100% MFI'
bizorig.loc[bizorig['income_group_total'] == '011', 'income_group'] = '100-110% MFI'
bizorig.loc[bizorig['income_group_total'] == '012', 'income_group'] = '110-120% MFI'
bizorig.loc[bizorig['income_group_total'] == '013', 'income_group'] = '> 120% MFI'
bizorig.loc[bizorig['income_group_total'] == '013', 'income_group'] = '> 120% MFI'
bizorig.loc[bizorig['income_group_total'] == '014', 'income_group'] = 'MFI not known'
bizorig.loc[bizorig['income_group_total'] == '015', 'income_group'] = 'Tract not known'
bizorig.loc[bizorig['income_group_total'] == '101', 'income_group'] = 'Low income'
bizorig.loc[bizorig['income_group_total'] == '102', 'income_group'] = 'Moderate income'
bizorig.loc[bizorig['income_group_total'] == '103', 'income_group'] = 'Middle income'
bizorig.loc[bizorig['income_group_total'] == '104', 'income_group'] = 'Upper income'
bizorig.loc[bizorig['income_group_total'] == '105', 'income_group'] = 'Income not known'
bizorig.loc[bizorig['income_group_total'] == '106', 'income_group'] = 'Tract not known (via cra level)'

# Find institution names by respondent IDs
respondents = pandas.read_csv('respondentid.csv', index_col='respondentID')
bizorig = bizorig.join(respondents, how='left', on='respondentID')

print('Summing numbers...')
bizorig['nBizLoans'] = bizorig['nBizLoans1'
                              ] + bizorig['nBizLoans100k'
                                         ] + bizorig['nBizLoans250k'
                                                    ] #+ bizorig['nBizLoans1M'
                                                        #]# + bizorig['nBizLoansAff']
bizorig['amtBizLoans'] = bizorig['amtBizLoans1'
                                ] + bizorig['amtBizLoans100k'
                                           ] + bizorig['amtBizLoans250k'
                                                      ] #+ bizorig['amtBizLoans1M'
                                                            #]# + bizorig['amtBizLoansAff']
bizorig.index.name = 'row_number'

# Remove county totals (blank income group) so we don't double-count
bizorig = bizorig[(bizorig['income_group'] != 'xxx')]

#bizorig.groupby('income_group').size()

#baltimore_originators  = baltimore_originators.join(respondents, on='respondentID')
bizorig_bk = bizorig

print('done')

In [None]:
baltimore_originators = bizorig[(bizorig['state'] == 24) & (bizorig['county'] == 510)]

htmlstring = '<table>'
htmlstring += '<tr><th>Institution</th><th>Amount</th>'
for idx, amt in baltimore_originators[baltimore_originators['income_group_total'] != '   '].groupby('institution_name')['amtBizLoans'].sum().sort_values(ascending=False).iteritems():
    htmlstring += '<tr><td>{0:}</td><td>${1:,.0f}</td></tr>'.format(idx, amt)

#display(HTML(htmlstring))

#print(len(baltimore_originators))

baltimore_originators = baltimore_originators.assign(cra_level = '')

#baltimore_originators.add(pandas.Series(data=baltimore_originators['income_group'].values[0], name='cra_level'))#['income_group']
#baltimore_originators['cra_level']
#baltimore_originators.columns

baltimore_originators.loc[(baltimore_originators['income_group_total'].apply(lambda x: x in ['001', '002', '003', '004', '005'])), 'cra_level'] = 'low'
baltimore_originators.loc[(baltimore_originators['income_group_total'].apply(lambda x: x in ['006', '007', '008'])), 'cra_level'] = 'moderate'
baltimore_originators.loc[(baltimore_originators['income_group_total'].apply(lambda x: x in ['009', '010', '011', '012'])), 'cra_level'] = 'middle'
baltimore_originators.loc[(baltimore_originators['income_group_total'].apply(lambda x: x in ['013'])), 'cra_level'] = 'upper'
baltimore_originators.loc[(baltimore_originators['income_group_total'].apply(lambda x: x in ['014', '015'])), 'cra_level'] = 'unknown'

#neworder = 

#baltimore_originators = baltimore_originators.reindex(neworder)

baltimore_originators.groupby('cra_level').size().reindex(['low', 'moderate', 'middle', 'upper', 'unknown'])

#baltimore_originators[baltimore_originators['cra_level'] == '']

# Total lending in Baltimore City by institution

In [None]:
htmlstring = '<table>'
htmlstring += '<tr><th>Institution</th><th>Amount</th>'
for idx, amt in baltimore_originators[baltimore_originators['income_group_total'] != '   '].groupby('institution_name')['amtBizLoans'].sum().sort_values(ascending=False).iteritems():
    htmlstring += '<tr><td>{0:}</td><td>${1:,.0f}</td></tr>'.format(idx, amt)

display(HTML(htmlstring))


In [None]:
sortorderlist = baltimore_originators.groupby('institution_name')['amtBizLoans'].sum().sort_values(ascending=False).index.tolist()

sortorder = pandas.Index(sortorderlist)

lending = baltimore_originators.groupby(['institution_name', 'cra_level'])[['nBizLoans', 'amtBizLoans']].sum().unstack('cra_level').reindex(sortorder)#.sort_values(by=['amtBizLoans'], ascending=False)

lending.to_csv('lendingtable.csv', encoding='utf-8')
lending


# Get tract information

# Add neighborhoods (from census tracts)

In [None]:
#baltimore_tracts[baltimore_tracts['loan_indicator'] == 'Y'].groupby(['census_tract', 'institution_name']).size().sort_index()

tracts = tracts_bk

baltimore_tracts = tracts[(tracts['state'] == 24) & (tracts['county'] == 510)]

#baltimore_tracts['census_tract'].head(10)
tracts_to_neighborhoods = pandas.read_csv('neighborhoods/census_tract_to_neighborhood.csv')
tracts_to_neighborhoods = tracts_to_neighborhoods.set_index('NAME10')
#tracts_to_neighborhoods

baltimore_tracts = baltimore_tracts.join(tracts_to_neighborhoods, how='left', on='census_tract')
print('ok')

In [None]:
loans_by_neighborhood = pandas.Series(data=baltimore_tracts[baltimore_tracts['loan_indicator'] == 'Y'].groupby(['CSA2010', 'institution_name']).size(), name='nLoans')

loans_by_neighborhood.to_csv('loans_by_neighborhood.csv', index=True, header=True)
print('ok')
#loans_by_neighborhood

# Get historical data

In [None]:
filelist = []
for i in range(4,16):
    thisfile = '{:02d}exp_discl'.format(i)
    if i in [4,5,6]:
        thisfile = thisfile + '_new.dat'
    else:
        thisfile = thisfile + '.dat'
    filelist.append(thisfile)
        
for thisfile in filelist:
    print('Reading {0:}exp_discl.dat...'.format(thisfile))
    thisyear_df = pandas.read_csv(thisfile, low_memory=False, header=None)
thisyear_df.head(1)

#os.getcwd()

#print('\n')
#print(filelist)

In [None]:
print('read 2015 dat file...')
df.columns = ['thestring']

# Now parse the strings into individual values, following the data guide in the PDF
print('Parsing strings...')
df['tableID'] = df['thestring'].apply(lambda x: x[0:5].strip())

df['type'] = ''
df['respondentID'] = ''
df['agency_code'] = ''
df['activity_year'] = ''
df['loan_type'] = ''
df['action_taken_type'] = ''
df['state'] = ''
df['county'] = ''
df['msa'] = ''
df['assessment_area_number'] = ''
df['partial_county_indicator'] = ''
df['split_county_indicator'] = ''
df['population_classification'] = ''
df['income_group_total'] = ''
df['report_level'] = ''

df['nBizLoans1'] = ''
df['amtBizLoans1'] = ''
df['nBizLoans100k'] = ''
df['amtBizLoans100k'] = ''
df['nBizLoans250k'] = ''
df['amtBizLoans250k'] = ''
df['nBizLoans1M'] = ''
df['amtBizLoans1M'] = ''
df['nBizLoansAff'] = ''
df['amtBizLoansAff'] = ''

print('Total number of rows: {:,.0f}'.format(len(df)))

# Parse table D1-1 (business loan originators)
print('Parsing table D1-1 (business loan originators...)')
df.loc[df['tableID'] == 'D1-1', 'type'] = 'originator'
df.loc[df['tableID'] == 'D1-1', 'respondentID'] = df['thestring'].apply(lambda x: x[5:15])
df.loc[df['tableID'] == 'D1-1', 'agency_code'] = df['thestring'].apply(lambda x: x[15])
df.loc[df['tableID'] == 'D1-1', 'activity_year'] = df['thestring'].apply(lambda x: x[16:20])
df.loc[df['tableID'] == 'D1-1', 'agency_code'] = df['thestring'].apply(lambda x: x[20])
df.loc[df['tableID'] == 'D1-1', 'action_taken_type'] = df['thestring'].apply(lambda x: x[21])
df.loc[df['tableID'] == 'D1-1', 'state'] = df['thestring'].apply(lambda x: x[22:24])
df.loc[df['tableID'] == 'D1-1', 'county'] = df['thestring'].apply(lambda x: x[24:27])
df.loc[df['tableID'] == 'D1-1', 'msa'] = df['thestring'].apply(lambda x: x[27:32])
df.loc[df['tableID'] == 'D1-1', 'assessment_area_number'] = df['thestring'].apply(lambda x: x[33:36])
df.loc[df['tableID'] == 'D1-1', 'partial_county_indicator'] = df['thestring'].apply(lambda x: x[36])
df.loc[df['tableID'] == 'D1-1', 'split_county_indicator'] = df['thestring'].apply(lambda x: x[37])
df.loc[df['tableID'] == 'D1-1', 'population_classification'] = df['thestring'].apply(lambda x: x[38])
df.loc[df['tableID'] == 'D1-1', 'income_group_total'] = df['thestring'].apply(lambda x: x[39:42])
df.loc[df['tableID'] == 'D1-1', 'report_level'] = df['thestring'].apply(lambda x: x[42:45])
df.loc[df['tableID'] == 'D1-1', 'nBizLoans1'] = df['thestring'].apply(lambda x: x[45:55])
df.loc[df['tableID'] == 'D1-1', 'amtBizLoans1'] = df['thestring'].apply(lambda x: x[55:65])
df.loc[df['tableID'] == 'D1-1', 'nBizLoans100k'] = df['thestring'].apply(lambda x: x[65:75])
df.loc[df['tableID'] == 'D1-1', 'amtBizLoans100k'] = df['thestring'].apply(lambda x: x[75:85])
df.loc[df['tableID'] == 'D1-1', 'nBizLoans250k'] = df['thestring'].apply(lambda x: x[85:95])
df.loc[df['tableID'] == 'D1-1', 'amtBizLoans250k'] = df['thestring'].apply(lambda x: x[95:105])
df.loc[df['tableID'] == 'D1-1', 'nBizLoans1M'] = df['thestring'].apply(lambda x: x[105:115])
df.loc[df['tableID'] == 'D1-1', 'amtBizLoans1M'] = df['thestring'].apply(lambda x: x[115:125])
df.loc[df['tableID'] == 'D1-1', 'nBizLoansAff'] = df['thestring'].apply(lambda x: x[125:135])
df.loc[df['tableID'] == 'D1-1', 'amtBizLoansAff'] = df['thestring'].apply(lambda x: x[135:145])
#df[df['tableID'] == 'D1-1'].sample(1).T

# Parse table D1-2 (business loan purchasers)
print('Parsing table D1-2 (business loan purchasers...)')
df.loc[df['tableID'] == 'D1-2', 'type'] = 'purchaser'
df.loc[df['tableID'] == 'D1-2', 'respondentID'] = df['thestring'].apply(lambda x: x[5:15])
df.loc[df['tableID'] == 'D1-2', 'agency_code'] = df['thestring'].apply(lambda x: x[15])
df.loc[df['tableID'] == 'D1-2', 'activity_year'] = df['thestring'].apply(lambda x: x[16:20])
df.loc[df['tableID'] == 'D1-2', 'agency_code'] = df['thestring'].apply(lambda x: x[20])
df.loc[df['tableID'] == 'D1-2', 'action_taken_type'] = df['thestring'].apply(lambda x: x[21])
df.loc[df['tableID'] == 'D1-2', 'state'] = df['thestring'].apply(lambda x: x[22:24])
df.loc[df['tableID'] == 'D1-2', 'county'] = df['thestring'].apply(lambda x: x[24:27])
df.loc[df['tableID'] == 'D1-2', 'msa'] = df['thestring'].apply(lambda x: x[27:32])
df.loc[df['tableID'] == 'D1-2', 'assessment_area_number'] = df['thestring'].apply(lambda x: x[33:36])
df.loc[df['tableID'] == 'D1-2', 'partial_county_indicator'] = df['thestring'].apply(lambda x: x[36])
df.loc[df['tableID'] == 'D1-2', 'split_county_indicator'] = df['thestring'].apply(lambda x: x[37])
df.loc[df['tableID'] == 'D1-2', 'population_classification'] = df['thestring'].apply(lambda x: x[38])
df.loc[df['tableID'] == 'D1-2', 'income_group_total'] = df['thestring'].apply(lambda x: x[39:42])
df.loc[df['tableID'] == 'D1-2', 'report_level'] = df['thestring'].apply(lambda x: x[42:45])
df.loc[df['tableID'] == 'D1-2', 'nBizLoans1'] = df['thestring'].apply(lambda x: x[45:55])
df.loc[df['tableID'] == 'D1-2', 'amtBizLoans1'] = df['thestring'].apply(lambda x: x[55:65])
df.loc[df['tableID'] == 'D1-2', 'nBizLoans100k'] = df['thestring'].apply(lambda x: x[65:75])
df.loc[df['tableID'] == 'D1-2', 'amtBizLoans100k'] = df['thestring'].apply(lambda x: x[75:85])
df.loc[df['tableID'] == 'D1-2', 'nBizLoans250k'] = df['thestring'].apply(lambda x: x[85:95])
df.loc[df['tableID'] == 'D1-2', 'amtBizLoans250k'] = df['thestring'].apply(lambda x: x[95:105])
df.loc[df['tableID'] == 'D1-2', 'nBizLoans1M'] = df['thestring'].apply(lambda x: x[105:115])
df.loc[df['tableID'] == 'D1-2', 'amtBizLoans1M'] = df['thestring'].apply(lambda x: x[115:125])
df.loc[df['tableID'] == 'D1-2', 'nBizLoansAff'] = df['thestring'].apply(lambda x: x[125:135])
df.loc[df['tableID'] == 'D1-2', 'amtBizLoansAff'] = df['thestring'].apply(lambda x: x[135:145])
#df[df['tableID'] == 'D1-2'].sample(1).T

# Parse table D6 (tracts)
print('Parsing table D6 (tracts...)')
df.loc[df['tableID'] == 'D6-0', 'type'] = 'tract'
df.loc[df['tableID'] == 'D6-0', 'respondentID'] = df['thestring'].apply(lambda x: x[5:15])
df.loc[df['tableID'] == 'D6-0', 'agency_code'] = df['thestring'].apply(lambda x: x[15])
df.loc[df['tableID'] == 'D6-0', 'activity_year'] = df['thestring'].apply(lambda x: x[16:20])
df.loc[df['tableID'] == 'D6-0', 'state'] = df['thestring'].apply(lambda x: x[20:22])
df.loc[df['tableID'] == 'D6-0', 'county'] = df['thestring'].apply(lambda x: x[22:25])
df.loc[df['tableID'] == 'D6-0', 'msa'] = df['thestring'].apply(lambda x: x[25:30])
df.loc[df['tableID'] == 'D6-0', 'census_tract'] = df['thestring'].apply(lambda x: x[30:37])
df.loc[df['tableID'] == 'D6-0', 'assessment_area_number'] = df['thestring'].apply(lambda x: x[37:41])
df.loc[df['tableID'] == 'D6-0', 'partial_county_indicator'] = df['thestring'].apply(lambda x: x[41])
df.loc[df['tableID'] == 'D6-0', 'split_county_indicator'] = df['thestring'].apply(lambda x: x[42])
df.loc[df['tableID'] == 'D6-0', 'population_classification'] = df['thestring'].apply(lambda x: x[43])
df.loc[df['tableID'] == 'D6-0', 'income_group_total'] = df['thestring'].apply(lambda x: x[44:47])
df.loc[df['tableID'] == 'D6-0', 'loan_indicator'] = df['thestring'].apply(lambda x: x[47])
df.loc[df['tableID'] == 'D6-0', 'filter'] = df['thestring'].apply(lambda x: x[48:145])
#df[df['tableID'] == 'D6-0'].sample(1).T

print('Converting numbers...')
df['nBizLoans1'] = pandas.to_numeric(df['nBizLoans1'])
df['amtBizLoans1'] = pandas.to_numeric(df['amtBizLoans1']) * 1000
df['nBizLoans100k'] = pandas.to_numeric(df['nBizLoans100k'])
df['amtBizLoans100k'] = pandas.to_numeric(df['amtBizLoans100k']) * 1000
df['nBizLoans250k'] = pandas.to_numeric(df['nBizLoans250k'])
df['amtBizLoans250k'] = pandas.to_numeric(df['amtBizLoans250k']) * 1000
df['nBizLoans1M'] = pandas.to_numeric(df['nBizLoans1M'])
df['amtBizLoans1M'] = pandas.to_numeric(df['amtBizLoans1M']) * 1000
df['nBizLoansAff'] = pandas.to_numeric(df['nBizLoansAff'])
df['amtBizLoansAff'] = pandas.to_numeric(df['amtBizLoansAff']) * 1000
df['respondentID'] = pandas.to_numeric(df['respondentID'], downcast='integer', errors='coerce')
df['agency_code'] = pandas.to_numeric(df['agency_code'], downcast='integer', errors='coerce')
df['activity_year'] = pandas.to_numeric(df['activity_year'], downcast='integer', errors='coerce')

print('Looking up data codes...')
df['agency'] = ''
df.loc[df['agency_code'] == 1, 'agency'] = 'OCC'
df.loc[df['agency_code'] == 2, 'agency'] = 'FRS'
df.loc[df['agency_code'] == 3, 'agency'] = 'FDIC'
df.loc[df['agency_code'] == 4, 'agency'] = 'OTS'

print('Summing numbers...')
df['nBizLoans'] = df['nBizLoans1'] + df['nBizLoans100k'] + df['nBizLoans250k'] + df['nBizLoans1M'] + df['nBizLoansAff']
df['amtBizLoans'] = df['amtBizLoans1'] + df['amtBizLoans100k'] + df['amtBizLoans250k'] + df['amtBizLoans1M'] + df['amtBizLoansAff']

print('Converting geography codes...')
df['state'] = pandas.to_numeric(df['state'], downcast='integer', errors='coerce')
df['county'] = pandas.to_numeric(df['county'], downcast='integer', errors='coerce')
df['msa'] = pandas.to_numeric(df['msa'], downcast='integer', errors='coerce')
df['census_tract'] = pandas.to_numeric(df['census_tract'], downcast='integer', errors='coerce')

print('Looking up insitution names from respondentIDs...')
respondents = pandas.read_csv('respondentid.csv', index_col='respondentID')
df = df.join(respondents, how='left', on='respondentID')#[['respondentID', 'institution_name']].drop_duplicates()

print('Dropping raw string variable, no longer needed...')
df = df.drop('thestring', axis=1)

print('Keeping only tables D1-1, D1-2, and D6-0.')
df = df[df['type'] != '']

df.index.name = 'rownumber'

print('done')
df.sample(1).T


In [None]:


print(df.groupby('type').size())
#os.chdir('/home/idies/workspace/Storage/raddick/persistent/cra/')
os.chdir('/home/idies/workspace/Temporary/raddick/cra_scratch/')

os.getcwd()

df.to_csv('2015all.csv', encoding='utf-8')
print('ok')

In [None]:
df = pandas.read_csv('2015all.csv', encoding='utf-8', low_memory=False)
df = df.set_index('rownumber')
df.head(2)

In [None]:
baltimore_2015_df = df[(df['state'] == 24) & (df['county'] == 510)]

print('Keeping {:,.0f} rows from Baltimore!'.format(len(baltimore_2015_df)))

showcols = ['type', 'institution_name', 'activity_year', 'agency', 'assessment_area_number']
showcols += ['census_tract', 'income_group_total', 'report_level']
showcols += ['filter', 'nBizLoans', 'amtBizLoans']
baltimore_2015_df[showcols].sample(5)


## Add neighborhood info (from census tracts)

In [None]:
baltimore_2015_tracts = baltimore_2015_df[baltimore_2015_df['type'] == 'tract']

#baltimore_tracts['census_tract'].head(10)
tracts_to_neighborhoods = pandas.read_csv('neighborhoods/census_tract_to_neighborhood.csv')
tracts_to_neighborhoods = tracts_to_neighborhoods.set_index('NAME10')
#tracts_to_neighborhoods

baltimore_2015_tracts = baltimore_2015_tracts.join(tracts_to_neighborhoods, how='left', on='census_tract')
print('ok')
baltimore_2015_tracts.groupby('CSA2010').size()


In [None]:
baltimore_originators_2015 = baltimore_2015_df[baltimore_2015_df['type']=='originator']

baltimore_originators_2015 = baltimore_originators_2015.assign(cra_level = '')

baltimore_originators_2015.loc[(baltimore_originators_2015['income_group_total'].apply(lambda x: x in ['001', '002', '003', '004', '005'])), 'cra_level'] = 'low'
baltimore_originators_2015.loc[(baltimore_originators_2015['income_group_total'].apply(lambda x: x in ['006', '007', '008'])), 'cra_level'] = 'moderate'
baltimore_originators_2015.loc[(baltimore_originators_2015['income_group_total'].apply(lambda x: x in ['009', '010', '011', '012'])), 'cra_level'] = 'middle'
baltimore_originators_2015.loc[(baltimore_originators_2015['income_group_total'].apply(lambda x: x in ['013'])), 'cra_level'] = 'upper'
baltimore_originators_2015.loc[(baltimore_originators_2015['income_group_total'].apply(lambda x: x in ['014', '015'])), 'cra_level'] = 'unknown'

baltimore_originators_2015.groupby('cra_level').size().reindex(['low', 'moderate', 'middle', 'upper', 'unknown'])


## Total lending in Baltimore City by institution in 2015

In [None]:
htmlstring = '<table>'
htmlstring += '<tr><th>Institution</th><th>Amount</th>'
for idx, amt in baltimore_originators_2015[baltimore_originators_2015['income_group_total'] != '   '].groupby('institution_name')['amtBizLoans'].sum().sort_values(ascending=False).iteritems():
    htmlstring += '<tr><td>{0:}</td><td>${1:,.0f}</td></tr>'.format(idx, amt)

display(HTML(htmlstring))


In [None]:
df[(df['type'] == 'originator') &
   (df['state'] == 24) &
   (df['county'] == 510)
  ].to_csv('baltimore_cra2015_Discl_D11.csv', encoding='utf-8')

df[(df['type'] == 'tract') &
   (df['state'] == 24) &
   (df['county'] == 510)
  ].to_csv('baltimore_cra2015_Discl_D6.csv', encoding='utf-8')
print('saved!')