In [1]:
# import needed libraries
import pandas as pd
import json
import requests
import math
import numpy as np
import urllib
import os

In [2]:
import utilcalcs as calc
import cen_geo_agg as geo

In [3]:
# Census API key, read from the local environment variable 'Census_API'
# (evaluates to None when that variable is not set).
CensusAPI = os.getenv('Census_API')

In [4]:
# Search parameters used to build the Census API request URLs
year = '2018'

# County codes inserted into the `county:` part of the query (NYC extent)
counties = [
    '005',
    '047',
    '061',
    '081',
    '085',
]

# Comma-separated `get=` list; add other tables here for other demo variables
cols = 'B01001_001E,B01001_001M,group(B03002)'

# Dataset path segment that follows the year in the API URL
source = 'acs/acs5'

In [5]:
def get_data(geo):
    """Download the configured ACS columns for every county in `counties`.

    Builds one Census API request per county from the notebook-level
    settings `year`, `source`, `cols`, `counties` and `CensusAPI`, and stacks
    the responses into a single frame.

    Parameters
    ----------
    geo : str
        'cbg' requests every block group inside each county; any other value
        requests the county-level record itself.
        NOTE(review): inside this function the parameter name shadows the
        `cen_geo_agg as geo` module alias imported at the top of the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per returned geography, with the first row of each API
        response used as column names (columns sorted by `pd.concat`).
        Values are kept exactly as the API returned them. The row index is
        not reset, so it restarts at 0 for every county.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    ValueError
        If a response body is not valid JSON, or if no frames were collected
        (e.g. `counties` is empty).
    """
    base_url = f'https://api.census.gov/data/{year}/{source}?get={cols}'
    # Leave the key parameter out when the environment variable is missing
    # instead of sending the literal text "key=None".
    # NOTE(review): assumes the API tolerates keyless requests - confirm.
    key_param = f'&key={CensusAPI}' if CensusAPI else ''

    frames = []
    for c in counties:
        if geo == 'cbg':
            geography = f'&for=block%20group:*&in=state:36%20county:{c}'
        else:
            geography = f'&for=county:{c}&in=state:36'

        resp = requests.get(base_url + geography + key_param, timeout=120)
        resp.raise_for_status()  # fail loudly rather than parsing an error page

        rows = resp.json()  # parsed once: header row first, then data rows
        frames.append(pd.DataFrame(rows[1:], columns=rows[0]))

    return pd.concat(frames, sort=True)

def clean_data(df,var):
    """Return the `var` columns of `df` as numbers with flag codes zeroed.

    The first entry of `var` is kept as-is; every remaining column is cast
    to float. Any cell equal to one of the nine-digit repeated-digit codes
    below (positive or negative) is then replaced by 0.
    NOTE(review): these look like the ACS special-value codes - confirm
    that 0 is the intended substitute for each of them.

    A new frame is returned; `df` itself is not modified.
    """
    magnitudes = [999999999, 555555555, 333333333,
                  222222222, 666666666, 888888888]
    flag_codes = magnitudes + [-value for value in magnitudes]

    subset = df[var].copy()
    numeric = subset.astype({name: float for name in var[1:]})
    return numeric.replace(flag_codes, 0)

In [6]:
## Total Population
# Each two-item list below is [estimate column, margin-of-error column].
Pop = ['B01001_001E', 'B01001_001M']

## Race - Mutually Exclusive (i.e. "Alone")
White = ['B03002_003E', 'B03002_003M']
Black = ['B03002_004E', 'B03002_004M']
Hispanic = ['B03002_012E', 'B03002_012M']
Asian = ['B03002_006E', 'B03002_006M']

# "Other" combines four B03002 lines; estimates and MOEs are kept in
# separate lists because they are combined by different calculations.
Other_E = [f'B03002_{line}E' for line in ('005', '007', '008', '009')]
Other_M = [f'B03002_{line}M' for line in ('005', '007', '008', '009')]

Race = [*White, *Black, *Hispanic, *Asian, *Other_E, *Other_M]

#other socioeconomic variables here
##
##
##


# Columns kept from the API response: identifier first, then numeric columns
var_data = ['GEO_ID', *Pop, *Race]  # + other variables

### Make CBG level table for aggregations

In [7]:
# Download the block-group rows for all counties, then keep only the
# `var_data` columns in cleaned numeric form.
df_cbg = clean_data(get_data('cbg'), var_data)

In [10]:
# Variables taken straight from one ACS line: output prefix -> column stem.
# The estimate is stem + 'E', the margin of error is stem + 'M'.
direct_vars = {
    'Pop': 'B01001_001',       # Population
    'White': 'B03002_003',     # Race
    'Black': 'B03002_004',
    'Hispanic': 'B03002_012',
    'Asian': 'B03002_006',
}

for prefix, stem in direct_vars.items():
    est_col, moe_col, cv_col = f'{prefix}_E', f'{prefix}_M', f'{prefix}_C'
    df_cbg[est_col] = df_cbg[f'{stem}E']
    df_cbg[moe_col] = df_cbg[f'{stem}M']
    # _C is whatever calc.get_cv returns for the (estimate, MOE) pair
    df_cbg[cv_col] = df_cbg.apply(
        lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1
    )

# "Other" spans several ACS lines: estimates are summed, the MOEs are
# combined row by row with calc.get_moe.
df_cbg['Other_E'] = df_cbg[Other_E].sum(axis=1)
df_cbg['Other_M'] = df_cbg.apply(lambda row: calc.get_moe(row[Other_M]), axis=1)
df_cbg['Other_C'] = df_cbg.apply(
    lambda row: calc.get_cv(row['Other_E'], row['Other_M']), axis=1
)

#Other variables

In [11]:
# Derive the block-group key by dropping the first nine characters of GEO_ID
# (presumably the summary-level prefix - confirm), then discard the raw ACS
# columns now that the _E/_M/_C columns exist.
df_cbg = (
    df_cbg
    .assign(orig_cbg=df_cbg['GEO_ID'].str[9:])
    .drop(columns=var_data)
)
df_cbg.head()

Unnamed: 0,Pop_E,Pop_M,Pop_C,White_E,White_M,White_C,Black_E,Black_M,Black_C,Hispanic_E,Hispanic_M,Hispanic_C,Asian_E,Asian_M,Asian_C,Other_E,Other_M,Other_C,orig_cbg
0,1886.0,358.0,11.539193,11.0,23.0,127.106936,227.0,134.0,35.885007,1648.0,401.0,14.791808,0.0,12.0,0.0,0.0,24.0,0.0,360050245023
1,1318.0,204.0,9.409117,12.0,13.0,65.85613,630.0,180.0,17.36865,662.0,198.0,18.181985,6.0,9.0,91.18541,8.0,24.0,182.370821,360050247002
2,1222.0,298.0,14.824469,14.0,22.0,95.527573,45.0,51.0,68.895643,1155.0,299.0,15.737049,0.0,12.0,0.0,8.0,24.515301,186.286484,360050253004
3,1941.0,610.0,19.10462,464.0,291.0,38.124934,398.0,254.0,38.795803,891.0,504.0,34.386417,0.0,12.0,0.0,188.0,127.573508,41.251215,360050263005
4,1191.0,623.0,31.798774,0.0,12.0,0.0,211.0,295.0,84.991141,892.0,529.0,36.051631,88.0,147.0,101.547389,0.0,24.0,0.0,360050265004


### Make table for geo aggregations

In [12]:
# Work on an independent copy so df_cbg itself stays unchanged
dff = df_cbg.copy(deep=True)

In [13]:
# Load the geography crosswalk from the Excel workbook
geo_xwalk = pd.read_excel('../data/nyc_geo_xwalk.xlsx')

# The block-group key must be text so it can be matched on 'orig_cbg' later
geo_xwalk['orig_cbg'] = geo_xwalk['orig_cbg'].map(str)

In [16]:
# Attach the crosswalk's geography codes to every block-group row.
# The merge reads from df_cbg (which dff was copied from) rather than from
# dff itself, so re-running this cell rebuilds dff from scratch instead of
# merging an already-merged frame.
# Note: merge defaults to an inner join, so block groups without a crosswalk
# row (and crosswalk rows without data) are dropped.
dff = (
    geo_xwalk
    .merge(df_cbg, on='orig_cbg')
    .drop(columns=['Pop_10E', 'orig_st', 'orig_co', 'orig_stco'])
)

### Final CBG Table

In [None]:
# Block-group table indexed by its own geography key.
# NOTE(review): this cell previously called set_index() with no DataFrame
# (a NameError) and named 'orig_puma'; 'orig_cbg' is assumed to be the
# intended key for the CBG table - confirm.
# A new frame is created on purpose: dff must keep 'orig_cbg' and the other
# orig_* fields as ordinary columns for the aggregation cells that follow.
df_cbg_final = dff.set_index('orig_cbg')
df_cbg_final.head()

### NTA Table

In [17]:
# Keep orig_nta as the only geography key, aggregate to that level with
# geo.calculate_sumgeo, and index the result by NTA code.
nta_input = dff.drop(columns=['orig_cbg', 'orig_ctract', 'orig_puma', 'orig_subbor'])
df_nta = geo.calculate_sumgeo(nta_input, 'orig_nta').set_index('orig_nta')


In [18]:
# Preview the first five NTA rows
df_nta.head(5)

Unnamed: 0_level_0,Black_E,Black_M,Black_C,Hispanic_E,Hispanic_M,Hispanic_C,Other_E,Other_M,Other_C,White_E,White_M,White_C,Pop_E,Pop_M,Pop_C,Asian_E,Asian_M,Asian_C
orig_nta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
BX98,3984.0,270.266535,4.12389,2329.0,248.290153,6.480733,104.0,60.712437,35.487747,538.0,160.449369,18.129667,7080.0,290.24817,2.492128,125.0,52.392748,25.479756
BX09,18751.0,1923.851086,6.237077,33187.0,2416.239227,4.425945,809.0,357.158228,26.837758,1486.0,512.266532,20.956139,54925.0,2911.874654,3.222825,692.0,249.545587,21.921885
BX39,15147.0,1539.595726,6.178943,37263.0,2305.397146,3.760989,511.0,272.560085,32.424662,1005.0,339.255066,20.520804,54163.0,2623.718163,2.944751,237.0,148.801882,38.16754
BX99,324.0,170.334964,31.958979,296.0,142.765542,29.320123,120.0,171.735261,86.998613,88.0,117.736995,81.332547,831.0,324.450304,23.734564,3.0,36.345564,736.485586
BX55,7982.0,977.520844,7.444721,23717.0,1800.647939,4.615334,1078.0,434.427209,24.498097,1131.0,379.189926,20.381131,37130.0,2067.697512,3.385292,3222.0,808.643927,15.256886


### PUMA Table

In [19]:
# Keep orig_puma as the only geography key, aggregate to that level with
# geo.calculate_sumgeo, and index the result by PUMA code.
puma_input = dff.drop(columns=['orig_cbg', 'orig_ctract', 'orig_nta', 'orig_subbor'])
df_puma = geo.calculate_sumgeo(puma_input, 'orig_puma').set_index('orig_puma')

In [20]:
# Preview the first five PUMA rows
df_puma.head(5)

Unnamed: 0_level_0,Black_E,Black_M,Black_C,Hispanic_E,Hispanic_M,Hispanic_C,Other_E,Other_M,Other_C,White_E,White_M,White_C,Pop_E,Pop_M,Pop_C,Asian_E,Asian_M,Asian_C
orig_puma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3603710,46294.0,2712.900846,3.562405,105715.0,3847.142056,2.212258,1830.0,499.949997,16.607703,3195.0,567.685652,10.801179,158185.0,4467.479491,1.716846,1151.0,427.614312,22.584527
3603709,53643.0,3102.245799,3.515582,109131.0,4259.856218,2.372908,4899.0,873.902168,10.843999,5488.0,823.162803,9.118129,186915.0,5110.395582,1.662051,13754.0,1566.477896,6.923558
3603707,36875.0,2130.268528,3.511854,95667.0,3626.612331,2.304481,1865.0,549.542537,17.912515,1556.0,344.37044,13.453967,137655.0,3986.755699,1.760604,1692.0,467.823685,16.807996
3603708,41186.0,2659.223195,3.924996,93319.0,4013.726448,2.614639,2886.0,794.634507,16.738063,2776.0,536.087679,11.739523,142071.0,4570.768207,1.95577,1904.0,473.431093,15.115549
3603705,52686.0,2806.140944,3.237788,108391.0,4264.639492,2.391791,2125.0,561.514025,16.063337,6900.0,754.157145,6.644264,171832.0,4907.228851,1.736067,1730.0,465.324618,16.350989


### County Table

In [27]:
# Download one row per county, then keep only the `var_data` columns in
# cleaned numeric form.
df_county = clean_data(get_data('county'), var_data)

In [28]:
# Variables taken straight from one ACS line: output prefix -> column stem.
# The estimate is stem + 'E', the margin of error is stem + 'M'.
county_direct_vars = {
    'Pop': 'B01001_001',       # Population
    'White': 'B03002_003',     # Race
    'Black': 'B03002_004',
    'Hispanic': 'B03002_012',
    'Asian': 'B03002_006',
}

for prefix, stem in county_direct_vars.items():
    est_col, moe_col, cv_col = f'{prefix}_E', f'{prefix}_M', f'{prefix}_C'
    df_county[est_col] = df_county[f'{stem}E']
    df_county[moe_col] = df_county[f'{stem}M']
    # _C is whatever calc.get_cv returns for the (estimate, MOE) pair
    df_county[cv_col] = df_county.apply(
        lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1
    )

# "Other" spans several ACS lines: estimates are summed, the MOEs are
# combined row by row with calc.get_moe.
df_county['Other_E'] = df_county[Other_E].sum(axis=1)
df_county['Other_M'] = df_county.apply(lambda row: calc.get_moe(row[Other_M]), axis=1)
df_county['Other_C'] = df_county.apply(
    lambda row: calc.get_cv(row['Other_E'], row['Other_M']), axis=1
)

#Other variables

In [29]:
# Derive the county key by dropping the first nine characters of GEO_ID
# (presumably the summary-level prefix - confirm), discard the raw ACS
# columns, and index the table by that key.
df_county = (
    df_county
    .assign(orig_stco=df_county['GEO_ID'].str[9:])
    .drop(columns=var_data)
    .set_index('orig_stco')
)

Unnamed: 0,Pop_E,Pop_M,Pop_C,White_E,White_M,White_C,Black_E,Black_M,Black_C,Hispanic_E,Hispanic_M,Hispanic_C,Asian_E,Asian_M,Asian_C,Other_E,Other_M,Other_C,orig_stco
0,1437872.0,0.0,0.0,133874.0,501.0,0.227497,421275.0,1347.0,0.194373,803636.0,0.0,0.0,50906.0,1034.0,1.234769,28181.0,1916.658029,4.134494,36005
0,2600747.0,0.0,0.0,940759.0,611.0,0.039482,787705.0,1505.0,0.116147,499279.0,0.0,0.0,304870.0,1905.0,0.379852,68134.0,2590.299404,2.311108,36047
0,1632480.0,0.0,0.0,765564.0,806.0,0.064001,203849.0,1594.0,0.47535,423683.0,0.0,0.0,194346.0,1366.0,0.427277,45038.0,2244.111183,3.029001,36061
0,2298513.0,0.0,0.0,581373.0,1059.0,0.110733,396320.0,2152.0,0.330088,643563.0,0.0,0.0,576353.0,2388.0,0.251872,100904.0,3628.144154,2.185799,36081
0,474101.0,0.0,0.0,292360.0,206.0,0.042833,43906.0,850.0,1.176872,86976.0,0.0,0.0,40946.0,649.0,0.963535,9913.0,1202.47869,7.374055,36085


In [None]:
# Preview the first five county rows
df_county.head(5)