In [2]:
import pandas as pd
import requests

This notebook gathers American Community Survey (ACS) data from the Census API.

ACS API basic overview
 
The "get_data" function below builds a URL endpoint (a web address)
based on:
The year you want data from.
 
Which type of ACS data you want - 1 year, 3 year or 5 year estimates.
You can see the difference here - https://www.census.gov/programs-surveys/acs/guidance/estimates.html
 
State - this function takes two-digit state codes (for example "01"), not postal abbreviations, to get state level data (the function "run_the_states" runs through every state in the state list and combines them to get USA wide data). There are other geographies you can use, see the API handbook: https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_api_handbook_2020.pdf
 
List of variables you want. See this list for 5-year variable names (you want the code in the "Name" column): https://api.census.gov/data/2019/acs/acs5/variables.html
Note: 1 year and 3 year data will have slightly different variable names; again, see the API handbook.



In [3]:
#CSV of state ANSI/FIPS numbers, taken from this gist (pinned to one revision):
#https://gist.github.com/dantonnoriega/bf1acd2290e15b91e6710b6fd3be0a53
census_state_list_url = (
    "https://gist.githubusercontent.com/dantonnoriega/"
    "bf1acd2290e15b91e6710b6fd3be0a53/raw/"
    "11d15233327c8080c9646c7e1f23052659db251d/us-state-ansi-fips.csv"
)

In [4]:
#read the state-number column as text so codes such as "01" keep their leading zero
#(the column header in this file really is " st", with a leading space)
states = pd.read_csv(census_state_list_url, converters={" st": str})
#NOTE(review): the header is padded with a space, so the cells presumably are too;
#strip them so every code is a clean two-character string (no-op if they are already clean)
states_list = states[" st"].str.strip()

In [1]:
def get_data(year,acs,state,codes,geography="county",timeout=120):
    """
    Download one ACS table from the Census API and return it as a DataFrame.

    The JSON payload is loaded as-is, so the first row of the returned frame
    holds the column names sent back by the API (see make_header).
    If adding new items to call, also add them to the code -> label dictionary
    so they get proper column labels.

    inputs:
        year (str) year of the call, e.g. "2018"
        acs (str) 1 or 5 year acs, e.g. "1" (appended to ".../acs/acs" in the url)
        state (str) which state, two digit code. Only used when geography="tract";
            county level requests always ask for every state (state:*).
        codes (str) comma separated codes for the data you want to load
        geography (str) "county" (default) for all counties in all states,
            "tract" for all census tracts of the given state
        timeout (float) seconds to wait for the API before giving up
    returns:
        DataFrame built from the JSON response
    raises:
        ValueError if geography is not "county" or "tract"
        requests.HTTPError if the API answers with an error status
    """
    base_url = "https://api.census.gov/data/"+year+"/acs/acs"+acs+"?get=NAME," + codes

    if geography == "county":
        #state is deliberately not used here: one call returns every county
        url = base_url + "&for=county:*&in=state:*"
    elif geography == "tract":
        url = base_url + "&for=tract:*&in=state:" + str(state).strip() + "&in=county:*"
    else:
        raise ValueError("geography must be 'county' or 'tract', got " + repr(geography))

    #without a timeout requests waits forever on a stalled connection
    response = requests.get(url, timeout=timeout)
    #fail with the HTTP error itself rather than a JSON decode error further down
    response.raise_for_status()

    return pd.DataFrame(response.json())


In [5]:
def make_header(df,codes):
    """
    Promote the first row of a raw API frame to column labels and rename them.

    inputs:
        df (DataFrame) frame whose first row holds the column codes
        codes (dict) mapping of column code -> readable column name
    returns:
        new DataFrame without the header row. The input frame is left untouched
        and the remaining rows keep their original row labels.
    """
    raw_names = df.iloc[0] #grab the first row for the header
    body = df.iloc[1:].copy() #take the data less the header row
    #a code that is missing from the mapping keeps its raw name instead of raising KeyError
    body.columns = [codes.get(name, name) for name in raw_names]
    return body

In [6]:
#load the spreadsheet that lists the variable codes we want to request
df_key = pd.read_excel("ACS data dl.xlsx")

#split the key by its "survey" column, one subset per dataset we call
acs_5 = df_key.loc[df_key["survey"] == "ACS 5 year 2018"]
acs_1_2018 = df_key.loc[df_key["survey"] == "ACS 1 year 2018"]
acs_1_2014 = df_key.loc[df_key["survey"] == "ACS 1 year 2014"]

In [8]:
#preview the first five rows of the ACS 5 year key
acs_5.head(n=5)

Unnamed: 0,survey,geographic unit,code,label,concept,description
0,ACS 5 year 2018,census tract,B17020_001E,Estimate!!Total,POVERTY STATUS IN THE PAST 12 MONTHS BY AGE,poverty rate
1,ACS 5 year 2018,census tract,B17020_002E,Estimate!!Total!!Income in the past 12 months ...,POVERTY STATUS IN THE PAST 12 MONTHS BY AGE,poverty rate
2,ACS 5 year 2018,census tract,B17020_010E,Estimate!!Total!!Income in the past 12 months ...,POVERTY STATUS IN THE PAST 12 MONTHS BY AGE,poverty rate
3,ACS 5 year 2018,census tract,B03002_001E,Estimate!!Total,HISPANIC OR LATINO ORIGIN BY RACE,total hispanic or latino origin by race
4,ACS 5 year 2018,census tract,B03002_004E,Estimate!!Total!!Not Hispanic or Latino!!Black...,HISPANIC OR LATINO ORIGIN BY RACE,total NH Black


In [9]:
#column names for the acs_5 call: variable code -> label, plus the non-variable columns
#NOTE(review): labels are not unique in the key (B17020_001E and B03002_001E are both
#"Estimate!!Total"), so the renamed frame can end up with duplicate column names - confirm
tract_dict = {"NAME":"Name","state":"State","county":"County","tract":"Tract"}
codes = {code: label for code, label in zip(acs_5.code, acs_5.label)}
codes.update(tract_dict)

In [101]:
#column names for the acs_1_2018 call: variable code -> label, plus the non-variable columns
tract_dict = {"NAME":"Name","state":"State","county":"County"}
codes_1_2018 = {code: label for code, label in zip(acs_1_2018.code, acs_1_2018.label)}
codes_1_2018.update(tract_dict)

In [100]:
#column names for the acs_1_2014 call: variable code -> label, plus the non-variable columns
tract_dict = {"NAME":"Name","state":"State","county":"County"}
codes_1_2014 = {code: label for code, label in zip(acs_1_2014.code, acs_1_2014.label)}
codes_1_2014.update(tract_dict)

In [17]:
#make string of codes we want
def make_string(df):
    """
    Join the distinct values of the "code" column into one comma separated string.

    inputs:
        df (DataFrame) key table with a "code" column
    returns:
        (str) the unique codes in order of first appearance, e.g. "A_001E,B_002E".
        Empty string when there are no codes. Blank (missing) cells are skipped.
    """
    #dropna: a blank cell in the spreadsheet would otherwise break the join
    unique_codes = df["code"].dropna().astype(str).unique()
    return ",".join(unique_codes)

In [18]:
#one comma separated string of variable codes per dataset, ready to drop into the request url
string_acs_1_2018 = make_string(acs_1_2018)
string_acs_1_2014 = make_string(acs_1_2014)
string = make_string(acs_5)

In [106]:
def run_the_states(states_list,codes,year,acs,string):
    """
    Run through the list of states, fetch the acs data for each one and stack
    the results into a single DataFrame.

    inputs:
        states_list (list) list of strings of state codes (01, etc)
        codes (dict) dictionary mapping of ACS variable name codes to real names
        year (str) year of data
        acs (str) 1 or 5 year acs data
        string (str) single string of all variable codes
    returns:
        DataFrame with the rows of every state that could be downloaded,
        re-indexed from 0. Empty DataFrame if nothing could be downloaded.

    Best effort: a state whose download fails is reported and skipped. If only
    the column renaming fails, that is reported and the frame is kept with the
    labels it came with.
    """
    frames = []
    for state in states_list:
        try:
            new_state = get_data(year, acs, state, string)
        except Exception as e:
            print(state," doesn't work", e)
            #skip this state, otherwise the previous state's frame would be added a second time
            continue

        try:
            new_state = make_header(new_state, codes)
        except Exception as e:
            print(state," col name doesn't work", e)

        frames.append(new_state)

    #pd.concat refuses an empty list
    if not frames:
        return pd.DataFrame()
    #DataFrame.append no longer exists in pandas 2.x; concat stacks the frames in one go
    return pd.concat(frames, ignore_index=True)

In [111]:
def run_county_level(states_list,codes,year,acs,string):
    """
    Fetch county level acs data with a single API call and label the columns.

    inputs:
        states_list (list) not used by this function; kept so existing calls still work
        codes (dict) dictionary mapping of ACS variable name codes to real names
        year (str) year of data
        acs (str) 1 or 5 year acs data
        string (str) single string of all variable codes
    returns:
        DataFrame of the rows returned by the API. If the column renaming fails
        the problem is reported and the frame is returned with the labels it came with.
    raises:
        whatever get_data raises when the download fails (after reporting it)
    """
    try:
        #the county url built by get_data asks for state:*, so "*" stands in for the state argument
        final_df = get_data(year, acs, "*", string)
    except Exception as e:
        print("county level download doesn't work", e)
        #there is nothing to return without the download
        raise

    try:
        final_df = make_header(final_df, codes)
    except Exception as e:
        print("county level col name doesn't work", e)

    return final_df

In [None]:
#gather all acs5 data (ACS 5 year 2018, using the codes/string built from acs_5).
#NOTE(review): get_data requests county rows for every state on each call, so looping over
#states_list repeats the same download - confirm get_data is requesting tract level data before running
full_df = run_the_states(states_list, codes, "2018", "5", string)

In [113]:
#gather all ACS 1 year 2018 data
acs_1_2018_full_data = run_county_level(
    states_list=states_list,
    codes=codes_1_2018,
    year="2018",
    acs="1",
    string=string_acs_1_2018,
)

In [119]:
#gather all ACS 1 year 2014 data
acs_1_2014_full_data = run_county_level(
    states_list=states_list,
    codes=codes_1_2014,
    year="2014",
    acs="1",
    string=string_acs_1_2014,
)

In [116]:
#sanity check: number of distinct county names (each data set had 838, including PR)
len(pd.unique(acs_1_2018_full_data["Name"]))

838

In [117]:
#sanity check: number of distinct values in the State column
len(pd.unique(acs_1_2018_full_data["State"]))

52

In [74]:
#save the ACS 5 year frame to csv (row index included, which is the pandas default)
full_df.to_csv("ACS5Year_Tract_Level_Poverty.csv", index=True)

In [118]:
#save the ACS 1 year 2018 frame to csv (row index included, which is the pandas default)
acs_1_2018_full_data.to_csv("ACS1_2018_County_Level_Poverty.csv", index=True)

In [121]:
#save the ACS 1 year 2014 frame to csv (row index included, which is the pandas default)
acs_1_2014_full_data.to_csv("ACS1_2014_County_Level_Poverty.csv", index=True)