In [None]:
import os
import pandas
import time
import numpy as np
# Work from the scratch directory so the relative CSV names used below
# ('tracts_df.csv', 'tracts_processed.csv') resolve there.
SCRATCH_DIR = '/home/idies/workspace/Temporary/raddick/cra_scratch/'
os.chdir(SCRATCH_DIR)
os.getcwd()

In [None]:
# Load the raw tract table (indexed by its 'rownumber' column) and report
# how long the read took.
s = time.time()
tracts_df = pandas.read_csv('tracts_df.csv', encoding='utf-8', index_col='rownumber')
e = time.time()
print('Loaded {0:,.0f} rows in {1:,.0f} seconds.'.format(len(tracts_df), e-s))

# Keep a real copy as the backup. Plain assignment would only bind a second
# name to the same DataFrame, so any later in-place edit of tracts_df would
# silently change the "backup" too.
tracts_df_bk = tracts_df.copy()

# Show three random rows, transposed so every column is visible.
tracts_df.sample(3).T

In [None]:
# Restart from the backup frame and attach institution names by respondentID.
# NOTE(review): tracts_df_bk is overwritten at the end of this cell, so
# re-running this cell without first re-running the load cell starts from the
# already-processed frame.
print('reading from backup...')
c = 0  # running total of elapsed seconds for this cell
s = time.time()
tracts_df = tracts_df_bk
print('getting institution name from respondentID...')
respondents_df = pandas.read_csv('/home/idies/workspace/Storage/raddick/raddick_cra/respondentid.csv', encoding='utf-8', index_col='respondentID')
# The n reported here is the number of distinct institution_name values in the lookup.
print('Loaded respondentIDs (n = {0:,.0f}).'.format(len(respondents_df['institution_name'].drop_duplicates())))

print('\nAdding institution (from respondentID)...')
tracts_df = tracts_df.join(respondents_df, how='left', on='respondentID')

# Count matches BEFORE filling the gaps with 'Unknown'.
# .notnull() returns one boolean per row, so its len() is just the row count;
# summing it gives the number of rows that actually matched. nunique() ignores
# NaN, so unmatched rows are not counted as an institution.
# TODO: only 87 unique institutions are represented -- is that expected?
institution_names = tracts_df['institution_name']
print('Found institutions for {0:,.0f} ({1:,.0f} unique).'.format(institution_names.notnull().sum(), institution_names.nunique()))
tracts_df['institution_name'] = tracts_df['institution_name'].fillna('Unknown')

# Add this step to the running total; the timer started above was otherwise
# reset by the next step without ever being counted.
e = time.time()
c = c + (e-s)

print('\nAssigning codes...')
s = time.time()
print('agency_code -> agency...')

# agency_code value -> agency abbreviation.
agency_names = {1: 'OCC', 2: 'FRS', 3: 'FDIC', 4: 'OTS'}

# DataFrame.assign returns a new frame, so calling it without keeping the
# result never creates the column. Build the column directly instead: codes
# missing from the mapping become '' (not NaN), which is what the "!= ''"
# count below relies on.
tracts_df['agency'] = tracts_df['agency_code'].map(agency_names).fillna('')

e = time.time()
c = c + (e-s)
print('Assigned agency names for {0:,.0f} rows in {1:,.0f} seconds.'.format((tracts_df['agency'] != '').sum(), e-s))

print('income_group_total -> income_group')
s = time.time()
# Anything that cannot be parsed as a number becomes NaN and gets no label.
tracts_df['income_group_total'] = pandas.to_numeric(tracts_df['income_group_total'], errors='coerce')

# income_group_total value -> descriptive label.
# NOTE(review): the label for 101 has an unbalanced parenthesis; it is kept
# verbatim because downstream consumers may match on the exact text.
income_group_names = {
    1: '< 10% of Median Family Income (MFI)',
    2: '10% to 20% of MFI',
    3: '20% to 30% of MFI',
    4: '30% to 40% of MFI',
    5: '40% to 50% of MFI',
    6: '50% to 60% of MFI',
    7: '60% to 70% of MFI',
    8: '70% to 80% of MFI',
    9: '80% to 90% of MFI',
    10: '90% to 100% of MFI',
    11: '100% to 110% of MFI',
    12: '110% to 120% of MFI',
    13: '> 120% of MFI',
    14: 'MFI not known (income percentage = 0)',
    15: 'Tract not Known (reported as NA)',
    101: 'Low Income (< 50% of MFI) - excluding 0)',
    102: 'Moderate Income (50% to 80% of MFI)',
    103: 'Middle Income (80% to 120% of MFI)',
    104: 'Upper Income (> 120% of MFI)',
    105: 'Income Not Known (0)',
    106: 'Tract not Known (NA)',
}

# DataFrame.assign returns a new frame, so calling it without keeping the
# result never creates the column. Build the column directly: unmapped or
# missing codes become '' (not NaN) so the "!= ''" count below only includes
# rows that actually received a label.
tracts_df['income_group'] = tracts_df['income_group_total'].map(income_group_names).fillna('')

e = time.time()
c = c + (e-s)
print('Assigned income group names for {0:,.0f} rows in {1:,.0f} seconds.'.format((tracts_df['income_group'] != '').sum(), e-s))


print('state -> state_name...')
s = time.time()

# Lookup table keyed by the STATE column; its STUSAB column supplies state_name.
statecodes_path = '/home/idies/workspace/Storage/raddick/persistent/cra/metadata/statecodes.csv'
statecodes_df = pandas.read_csv(statecodes_path, encoding='utf-8', index_col='STATE')
statecodes_df = statecodes_df.rename(columns={'STUSAB': 'state_name'})
statecodes_df.index.name = 'state'

# Left join: every tract row is kept; rows without a matching state get NaN.
tracts_df = tracts_df.join(statecodes_df, how='left', on='state')

e = time.time()
c = c + (e-s)
rows_with_state_name = tracts_df['state_name'].notnull().sum()
print('Assigned state names for {0:,.0f} rows in {1:,.0f} seconds.'.format(rows_with_state_name, e-s))

print('county -> county_name...')
s = time.time()
countycodes_df = pandas.read_csv(
    '/home/idies/workspace/Storage/raddick/persistent/cra/metadata/countycodes.csv',
    encoding='utf-8')
# Drop the lookup's own state_name column; only county_name is joined below.
countycodes_df = countycodes_df.drop('state_name', axis=1)

# fips_class_code value -> description.
fips_class_descriptions = {
    'H1': 'active county',
    'H4': 'inactive county',
    'H5': 'Alaska census area',
    'H6': 'part of another entity',
    'C7': 'independent city',
}
# DataFrame.assign returns a new frame, so calling it without keeping the
# result never creates the column. Build the column directly, with '' for
# codes that have no description.
countycodes_df['fips_class_description'] = (
    countycodes_df['fips_class_code'].map(fips_class_descriptions).fillna(''))

countycodes_df = countycodes_df.set_index(['state', 'county'])

# Left join on the (state, county) pair; only county_name is brought across.
tracts_df = tracts_df.join(countycodes_df['county_name'], how='left', on=['state', 'county'])
e = time.time()
c = c + (e-s)
print('Assigned county names for {0:,.0f} rows in {1:,.0f} seconds.'.format(tracts_df['county_name'].notnull().sum(), e-s))

# Columns that use the text 'NA  ' (NA plus two spaces) for missing values.
# Each one goes through three passes: 'NA  ' -> '-1', text -> number, -1 -> NaN.
na_coded_columns = ['msa', 'assessment_area_number']

s = time.time()
print('\nreplacing NA with -1 in...')
for column_name in na_coded_columns:
    print(column_name + '...')
    is_na_text = tracts_df[column_name] == 'NA  '
    tracts_df.loc[is_na_text, column_name] = '-1'

print('\nconverting to numeric...')
for column_name in na_coded_columns:
    print(column_name + '...')
    # errors='coerce' turns anything unparseable into NaN.
    tracts_df[column_name] = pandas.to_numeric(tracts_df[column_name], errors='coerce')

print('\nReplacing -1 with NaN in...')
for column_name in na_coded_columns:
    print(column_name + '...')
    is_placeholder = tracts_df[column_name] == -1
    tracts_df.loc[is_placeholder, column_name] = np.nan
e = time.time()
c = c + (e-s)

s = time.time()
print('\nBacking up...')
# Take a real copy. Plain assignment would only bind a second name to the same
# DataFrame, so later in-place edits of tracts_df would also change the backup.
tracts_df_bk = tracts_df.copy()
e = time.time()
c = c + (e-s)

print('Done in {0:,.0f} seconds total.'.format(c))
# Show three random rows, transposed so every column is visible.
tracts_df.sample(3).T


In [None]:
print('MATCH TO METRO AREAS...')
c = 0  # running total of elapsed seconds for this cell
s = time.time()
print('Reading from backup...')
tracts_df = tracts_df_bk

# Report how many rows there are overall and how many carry an MSA value.
total_rows = len(tracts_df)
rows_with_msa = tracts_df['msa'].notnull().sum()
print('Overall {0:,.0f} rows.'.format(total_rows))
print('MSA values in {0:,.0f} rows.'.format(rows_with_msa))
e = time.time()
c = c + (e-s)

s = time.time()
msa_df = pandas.read_csv(
    '/home/idies/workspace/Storage/raddick/persistent/cra/metadata/msacodes.csv',
    encoding='utf-8', low_memory=False)

# Build the msa_code -> msa_title lookup used by the join below.
# De-duplicate on msa_code itself, not on (code, title) pairs: if one code
# appears with more than one title, a lookup with a repeated key would make
# the left join emit one output row per title and inflate the tract table.
# The first title listed for each code is kept.
has_msa_code = msa_df['msa_code'].notnull()
msa_mathcer_df = (
    msa_df.loc[has_msa_code, ['msa_code', 'msa_title']]
    .drop_duplicates(subset='msa_code')
    .set_index('msa_code')
)
e = time.time()
c = c + (e-s)

print('\nRead {0:,.0f} msacodes ({1:,.0f} distinct) in {2:,.0f} seconds.'.format(has_msa_code.sum(), len(msa_mathcer_df), e-s))

print('Will match to a list of {0:,.0f} distinct msa_codes...'.format(len(msa_mathcer_df)))

s = time.time()
# NOTE(review): only rows with an msa value are kept from here on; rows with
# a missing msa are dropped from tracts_df by this step. Confirm that is intended.
rows_with_msa = tracts_df['msa'].notnull()
tracts_df = tracts_df[rows_with_msa].join(msa_mathcer_df, how='left', on='msa')
e = time.time()
c = c + (e-s)

has_msa_title = tracts_df['msa_title'].notnull()
print('\nMatched {0:,.0f} rows to MSA codes in {1:,.0f} seconds.'.format(has_msa_title.sum(), e-s))

# Rows that have an msa value but found no title in the MSA lookup.
still_unmatched = (~has_msa_title) & tracts_df['msa'].notnull()
print('{0:,.0f} rows remain to be matched...'.format(still_unmatched.sum()))

# All rows of msa_df that carry a CBSA code are used. (An earlier variant of
# this step restricted the lookup to CBSA codes with no MSA equivalent, i.e.
# msa_code null; that restriction is currently switched off.)
has_cbsa_code = msa_df['cbsa_code'].notnull()
print('\nThere are {0:,.0f} CBSA codes found in msa_df ({1:,.0f} distinct).'.format(has_cbsa_code.sum(), len(msa_df.loc[has_cbsa_code, 'cbsa_code'].drop_duplicates())))

# Build the cbsa_code -> cbsa_title lookup. De-duplicate on cbsa_code itself,
# not on (code, title) pairs: a repeated key in the lookup would make the left
# join emit one output row per title and inflate the tract table. The first
# title listed for each code is kept.
cbsa_matcher_df = (
    msa_df.loc[has_cbsa_code, ['cbsa_code', 'cbsa_title']]
    .drop_duplicates(subset='cbsa_code')
    .set_index('cbsa_code')
)

print('Will match to a list of {0:,.0f} distinct cbsa_codes...'.format(len(cbsa_matcher_df)))

s = time.time()
# The tract msa value is looked up against cbsa_code as well, adding cbsa_title.
rows_with_msa = tracts_df['msa'].notnull()
tracts_df = tracts_df[rows_with_msa].join(cbsa_matcher_df, how='left', on='msa')
e = time.time()
c = c + (e-s)

has_msa_title = tracts_df['msa_title'].notnull()
has_cbsa_title = tracts_df['cbsa_title'].notnull()
print('\nMatched {0:,.0f} rows to CBSA codes in {1:,.0f} seconds.'.format(has_cbsa_title.sum(), e-s))

rows_with_either = (has_msa_title | has_cbsa_title).sum()
rows_with_both = (has_msa_title & has_cbsa_title).sum()
print('\nThere are {0:,.0f} rows with either MSA or CBSA titles, and {1:,.0f} rows with both.'.format(rows_with_either, rows_with_both))

# Restart the timer for this step. The previous timer interval (the CBSA join)
# has already been added to c, so reusing its start time here would count
# that join twice in the total.
s = time.time()
print('\nRenaming columns...')
tracts_df = tracts_df.rename(columns={'msa_title': 'msa_name', 'cbsa_title': 'cbsa_name'})

print('\nBacking up...')
# Take a real copy. Plain assignment would only bind a second name to the same
# DataFrame, so later in-place edits of tracts_df would also change the backup.
tracts_df_bk = tracts_df.copy()
e = time.time()
c = c + (e-s)
print('\nDONE in {0:,.0f} seconds.'.format(c))

In [None]:
# msa came back as object dtype after the matching step (cause not yet
# identified), so coerce it to numeric once more before writing the file.
OUTFILE = 'tracts_processed.csv'

s = time.time()
print('\nconverting to numeric...')
print('msa...')
msa_numeric = pandas.to_numeric(tracts_df['msa'], errors='coerce')
tracts_df['msa'] = msa_numeric

print('\nwriting outfile...')
tracts_df.to_csv(OUTFILE, encoding='utf-8')

e = time.time()
elapsed = e - s
c = c + elapsed

print('\nDONE in {0:,.0f} seconds.'.format(elapsed))
# Optional sanity check (row counts per MSA, largest first):
#tracts_df.groupby('msa').size().sort_values(ascending=False)

In [None]:
# Display the current working directory, i.e. where the relative path
# 'tracts_processed.csv' resolves.
os.getcwd()

In [None]:
# Read the processed file back in (indexed by 'rownumber') as a round-trip
# check, and preview the first rows.
processed_path = 'tracts_processed.csv'

s = time.time()
df = pandas.read_csv(processed_path, encoding='utf-8', index_col='rownumber')
e = time.time()
load_seconds = e - s
print('Loaded in {0:,.0f} seconds.'.format(load_seconds))
df.head(n=5)