This script matches the zip codes found in each file's zip code column (named "Zip" by default) against the crosswalk lookup generated by zcta_ej.ipynb, and appends the matched values to each file as new columns. Input files must be .csv files placed under data/to_match or in a subfolder under that folder.

In [1]:
import pandas, pyprojroot, warnings, numpy, os

def _clean_zip(raw):
    """Return the 5-digit zip code held in *raw* as an int, or None if it cannot be parsed.

    Handles hyphenated zip+4 entries ("12345-6789"), unhyphenated zip+4
    entries (123456789) and zip codes that pandas read as floats (12345.0).
    """
    if isinstance(raw, (float, numpy.floating)):
        #a numeric column containing blank cells is read as floats, and
        #int("12345.0") would fail, so convert whole floats to int first
        if numpy.isnan(raw) or not float(raw).is_integer():
            return None
        raw = int(raw)
    text = str(raw).split('-', 1)[0] #cleans any zip+4 entries
    try:
        value = int(text)
    except ValueError:
        return None
    digits = str(value)
    if 7 <= len(digits) <= 9:
        #unhyphenated zip+4; restore leading zeros dropped by the numeric
        #read before keeping the first five digits
        value = int(digits.zfill(9)[:5])
    return value


def match(subfolder='', zipcode_col='Zip'):
    """Append the crosswalk lookup values to every .csv in data/to_match/<subfolder>.

    For each .csv file directly inside the input folder, the zip code of every
    row is looked up in data/outputs/ZCTA_EJ.csv (indexed by its 'zip' column)
    and three columns are appended: 'Percent EJ', 'Average Categories Exceeded'
    and 'Average Criteria Exceeded', taken from the lookup columns
    'percent_ej', 'avg_cats' and 'avg_criteria'. Rows whose zip code cannot be
    parsed or is not in the lookup get NaN in all three columns. The result is
    written to data/matched/<subfolder>/ under the same file name.

    Args:
        subfolder: Folder under data/to_match to process; '' means
            data/to_match itself.
        zipcode_col: Title of the column that holds the zip codes.

    Raises:
        KeyError: If a file has no column named *zipcode_col* (columns that
            are entirely empty are dropped before this check).
    """
    #set paths
    loc = pyprojroot.here('./data/to_match/' + subfolder)
    zip_ej = pyprojroot.here('./data/outputs/ZCTA_EJ.csv')
    lookup = pandas.read_csv(zip_ej).set_index('zip')
    with warnings.catch_warnings():
        #presumably silences a pyprojroot warning about the output folder
        #not existing yet -- TODO confirm
        warnings.simplefilter("ignore")
        out_dir = pyprojroot.here('./data/matched/' + subfolder)

    lookup_cols = ('percent_ej', 'avg_cats', 'avg_criteria')
    output_cols = ('Percent EJ', 'Average Categories Exceeded', 'Average Criteria Exceeded')

    #loop through all csv's in folder
    for file in loc.iterdir():
        if not (file.is_file() and file.suffix == '.csv'):
            continue

        df = pandas.read_csv(file).dropna(axis='columns', how='all')
        if zipcode_col not in df.columns:
            raise KeyError(file.name + " has no column named " + repr(zipcode_col))

        #pull zip codes and collect the lookup values for each of them
        matched = {name: [] for name in output_cols}
        for raw in df[zipcode_col]:
            values = (numpy.nan,) * len(lookup_cols)
            z = _clean_zip(raw)
            if z is not None:
                try:
                    values = tuple(lookup.loc[z, col] for col in lookup_cols)
                except KeyError:
                    print("Could not find zip code " + str(z))
            for name, value in zip(output_cols, values):
                matched[name].append(value)

        #append the new columns and create the output file
        for name in output_cols:
            df.insert(len(df.columns), name, matched[name])
        out_dir.mkdir(parents=True, exist_ok=True)
        df.to_csv(out_dir / file.name)
        print(file.name + " matched!")
        print()

Calling the match function below with no arguments will match all .csv's in the root <code>/data/to_match</code> folder. A subfolder within that folder can be passed as a string argument to match all .csv's in that subfolder instead. For example, <code>match('tests')</code> will match all .csv's under <code>/data/to_match/tests</code>. All matched files will be placed under <code>/data/matched/</code>, in a subfolder of the same name when one was given.

In [3]:
# Match every .csv directly under data/to_match (no subfolder argument).
match()

Could not find zip code 96162
Could not find zip code 7509
Could not find zip code 96823
Could not find zip code 3302
Could not find zip code 87502
Could not find zip code 71501
Could not find zip code 87154
Could not find zip code 34106
Could not find zip code 33000
Could not find zip code 95518
Could not find zip code 97703
Could not find zip code 19395
Could not find zip code 85327
Could not find zip code 87502
Could not find zip code 85069
Could not find zip code 99708
Could not find zip code 0
Could not find zip code 75444
Could not find zip code 77399
Could not find zip code 90078
Could not find zip code 85378
Could not find zip code 90264
Could not find zip code 85733
Could not find zip code 4078
Could not find zip code 86340
Could not find zip code 108
Could not find zip code 38184
Could not find zip code 98291
Could not find zip code 7303
Could not find zip code 30357
Could not find zip code 84129
Could not find zip code 84130
Could not find zip code 93448
Could not find zip c