In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
API_key = '428db062383b0709dcdb4ad6b91a30671288235b'

In [6]:
us_call = 'https://api.census.gov/data/2020/dec/pl?get=P1_001N&for=us:*&key={}'.format(API_key)
call_return = requests.get(us_call).json()

In [20]:
us_call

'https://api.census.gov/data/2020/dec/pl?get=P1_001N&for=us:*&key=428db062383b0709dcdb4ad6b91a30671288235b'

## URL Call Generation

Use this section to generate the types of calls you want to make. 

If you're selecting, a state, do you want the breakdown to be by county and then tract. The function should take a number of arguments to define granularity and then other arguments to determine location and geography. 

Final arguments will be type of data to pull but for now focus on population: P1_001N

In [122]:
def census_2020(API_key, fields = ['P1_001N'], state_abr = "All", county_name = "*", county = True, tract = True):
    ''' Takes in API Key, Census fields, and geography and granularity specifications. Returns a dataframe'''
    
    state_codes = pd.read_csv('https://www2.census.gov/geo/docs/reference/state.txt', sep='|', dtype={'STATE': str})
    if state_abr == 'All':
        state = '*'
    if state_abr != "All":
        state = state_codes.loc[state_codes['STUSAB'] == state_abr]['STATE'].values[0]
    
    if county & (state_abr != "All"):
        county_url = 'https://www2.census.gov/geo/docs/reference/codes/files/st{}_{}_cou.txt'.format(state, state_abr.lower())
        print(county_url)
        county_codes = pd.read_csv(county_url, names=['State', 'State Code', 'County Code', 'County Name', 'Fips Class Code'], dtype={'County Code': str})
        
    if county_name != "*":
        county_name = county_codes.loc[county_codes['County Name'] == county_name]['County Code'].values[0]

    if tract:
        final_call = 'https://api.census.gov/data/2020/dec/pl?get={}&for=tract:*&in=state:{}&in=county:{}&key={}'.format(','.join(fields), state, county_name, API_key)
        call_return = requests.get(final_call).json()
        call_df = pd.DataFrame(call_return)
        call_df.columns = call_df.iloc[0]
        call_df = call_df.drop(call_df.index[0])

        call_df = call_df.merge(county_codes, left_on = 'county', right_on = 'County Code', how = 'left')

    elif county:
        final_call = 'https://api.census.gov/data/2020/dec/pl?get={}&for=county:{}&in=state:{}&key={}'.format(','.join(fields), county_name, state, API_key)
        call_return = requests.get(final_call).json()
        call_df = pd.DataFrame(call_return)
        call_df.columns = call_df.iloc[0]
        call_df = call_df.drop(call_df.index[0])
        call_df = call_df.astype({'county': str})

        call_df = call_df.merge(county_codes, left_on = 'county', right_on = 'County Code', how = 'left')
    else:
        final_call = 'https://api.census.gov/data/2020/dec/pl?get={}&for=state:{}&key={}'.format(fields[0], state, API_key)
        call_return = requests.get(final_call).json()
        call_df = pd.DataFrame(call_return)
        call_df.columns = call_df.iloc[0]
        call_df = call_df.drop(call_df.index[0])

        #merge with state codes table
        call_df = call_df.merge(state_codes, left_on = 'state', right_on = 'STATE', how = 'left')




    return(final_call, call_df)

In [131]:
fields = ['P1_001N', 'P1_003N', 'H1_001N', 'H1_002N', 'H1_003N']
url, results = census_2020(API_key, fields = fields, state_abr = 'NY', county_name='New York County', tract = True, county=True)

https://www2.census.gov/geo/docs/reference/codes/files/st36_ny_cou.txt


## Troubleshooting

In [3]:
url = 'https://api.census.gov/data/2020/dec/pl/variables.html'
html_text = requests.get(url).text


In [4]:
soup = BeautifulSoup(html_text, 'html.parser')
tables = []
for table in soup.find_all('table'):
    tables.append(table)

data_table = tables[0]

In [5]:
columns = []
header = data_table.find_all('thead')[0].find_all('th')

In [6]:
for i in header:
  curr_column = i.get_text()
  curr_column = curr_column.replace(' ', '_')
  columns.append(curr_column)

In [7]:
final_list = []
for i in data_table.find_all('tr'):
  curr_row = i.find_all('td')
  if len(curr_row) == len(columns):
    curr_list = []
    #parse date
    for val in range(0,len(columns)):
      to_add = curr_row[val].get_text().strip().replace(',','')
      curr_list.append(to_add)
    final_list.append(curr_list)

In [8]:
final_df = pd.DataFrame(final_list, columns=columns)

In [9]:
final_df = final_df.loc[final_df['Concept'] != ""]

In [60]:



def concept_dict_creation():
    concepts_fields = {}
    concepts = list(final_df['Concept'].unique())
    concepts.remove('Census API Geography Specification')
    concepts = concepts[1:]
    for c in concepts:
        curr_df = final_df.loc[final_df['Concept'] == c]
        labels = list(curr_df['Label'])
        field = list(curr_df['Name'])
        concepts_fields[c] = {}
        for l in range(len(labels)):
            split_val = labels[l].split("!")
            #  = split_val[]
            concepts_fields[c][labels[l]] = field[l]
    return(concepts_fields)
concepts_fields = concept_dict_creation() 

In [55]:
final_df.loc[final_df['Concept'] == 'OCCUPANCY STATUS']

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate_Type,Group
17,H1_001N,!!Total:,OCCUPANCY STATUS,not required,H1_001NA,0,int,H1
18,H1_002N,!!Total:!!Occupied,OCCUPANCY STATUS,not required,H1_002NA,0,int,H1
19,H1_003N,!!Total:!!Vacant,OCCUPANCY STATUS,not required,H1_003NA,0,int,H1


In [61]:
list(concepts_fields['OCCUPANCY STATUS'].keys())

['!!Total:', '!!Total:!!Occupied', '!!Total:!!Vacant']

In [65]:
concepts_fields['OCCUPANCY STATUS'].get('!!Total:', '!!Total:!!Occupied')

'H1_001N'

In [57]:
concepts_fields['OCCUPANCY STATUS'].values()

dict_values(['!!Total:', '!!Total:!!Occupied', '!!Total:!!Vacant'])

In [45]:
concepts_fields.keys()

dict_keys(['OCCUPANCY STATUS', 'RACE', 'HISPANIC OR LATINO AND NOT HISPANIC OR LATINO BY RACE', 'RACE FOR THE POPULATION 18 YEARS AND OVER', 'HISPANIC OR LATINO AND NOT HISPANIC OR LATINO BY RACE FOR THE POPULATION 18 YEARS AND OVER', 'GROUP QUARTERS POPULATION BY MAJOR GROUP QUARTERS TYPE'])

In [52]:
concepts_fields['OCCUPANCY STATUS']

{'H1_001N': '!!Total:',
 'H1_002N': '!!Total:!!Occupied',
 'H1_003N': '!!Total:!!Vacant'}