# Download census tract data from API

  - Available data: https://api.census.gov/data.html
  - Variables (aka, fields) you can query for: https://api.census.gov/data/2017/acs/acs5/profile/variables.html
  - Browse variables for a place here: https://www.census.gov/acs/www/data/data-tables-and-tools/data-profiles/2014/
  - Sample query: https://api.census.gov/data/2017/acs/acs5/profile?get=DP05_0001E&for=tract:400100&in=state:06+county:001
  - County FIPS codes: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013697
  
#### Variables name format

More info: https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html

variable name format: [TableID]_[RowNumber][VariableType]

Example: Variable DP02_0002PE, "Family households (families)", represents the percent estimate for table DP02 row number 2.

DP (Data Profile): Table type containing broad social, economic, housing, and demographic information in a total of four profiles.

  - DP02: Social Characteristics — includes Education, Marital Status, Relationships, Fertility, Grandparents... 
  - DP03: Economic Characteristics — includes Income, Employment, Occupation, Commuting to Work... 
  - DP04: Housing Characteristics — includes Occupancy and Structure, Housing Value and Costs, Utilities... 
  - DP05: Demographic Characteristics — includes Sex and Age, Race, Hispanic Origin, Housing Units... 

Variable suffixes:

  - E = estimate
  - M = margin of error
  - PE = percent estimate (of total)
  - PM = margin of error for corresponding PE
  - A = annotation

In [1]:
import geopandas as gpd
import getcensus as gc
import os
import pandas as pd
from shapely import geometry
# from keys import census_api_key

  shapely_geos_version, geos_capi_version_string


In [8]:
# which census dataset
dataset = 'acs/acs5'
census_api_key = '7679cb7920268e39c115f3c753fa2885aa9fb0cf'
# which vintage year
year = 2018

# which census variables to retrieve for each tract
variables = {'DP05_0001E':'total_pop',    #total pop
             'DP05_0018E':'median_age',    #median age
             'DP05_0071PE':'pct_hispanic',   #pct pop hispanic or latino
             'DP05_0077PE':'pct_white',   #pct pop non-hispanic white alone
             'DP05_0078PE':'pct_black',   #pct pop non-hispanic black
             'DP05_0080E':'pct_asian',   #pct pop non-hispanic asian
             'DP05_0002PE':'pct_male',   #pct pop male
             'DP04_0007PE':'pct_single_family_home',   #pct single family detached homes
             'DP04_0089E':'med_home_value',    #median value of owner occupied units (dollars)
             'DP04_0037E':'med_rooms_per_home',    #median number of rooms in house
             'DP04_0026PE':'pct_built_before_1940',   #pct structure built 1939 or earlier
             'DP04_0047PE':'pct_renting',   #pct renter-occupied housing units
             'DP04_0005E':'rental_vacancy_rate',    #rental vacancy rate
             'DP04_0049E':'avg_renter_household_size',    #average household size of renter-occupied housing units
             'DP04_0134E':'med_gross_rent',    #median gross rent (dollars)
             'DP03_0062E':'med_household_income',    #median household income
             'DP03_0025E':'mean_commute_time',    #mean travel time to work
             'DP03_0019PE':'pct_commute_drive_alone',   #pct commute drove alone
             'DP03_0128PE':'pct_below_poverty',   #pct people with income below povery level
             'DP02_0057PE':'pct_college_grad_student',   #pct who are students currently enrolled in college or grad school
             'DP02_0079PE':'pct_same_residence_year_ago',   #pct residence 1 year ago was same house
             'DP02_0067PE':'pct_bachelors_degree',   #pct bachelor's degree or higher
             'DP02_0111PE':'pct_english_only',   #pct with english only language spoken at home
             'DP02_0092PE':'pct_foreign_born'}   #pct of population foreign born

# data directories
tracts_path = '../tl_2019_48_tract'
output_path = 'census_tracts_data.geojson'

In [9]:
# download and display census descriptions of each variable
variable_descriptions = gc.get_census_variable_descriptions(dataset=dataset, 
                                                            year=year, 
                                                            variables=variables)
for v, d in variable_descriptions.items():
    print('{}\t{}'.format(variables[v], d['label']))

total_pop	Estimate!!SEX AND AGE!!Total population
median_age	Estimate!!SEX AND AGE!!Total population!!Median age (years)
pct_hispanic	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)
pct_white	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!White alone
pct_black	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Black or African American alone
pct_asian	Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Asian alone
pct_male	Percent Estimate!!SEX AND AGE!!Total population!!Male
pct_single_family_home	Percent Estimate!!UNITS IN STRUCTURE!!Total housing units!!1-unit, detached
med_home_value	Estimate!!VALUE!!Owner-occupied units!!Median (dollars)
med_rooms_per_home	Estimate!!ROOMS!!Total housing units!!Median rooms
pct_built_before_1940	Percent Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1939 or earlier
pct_renting	Per

## Get vars from ACS DP

In [10]:
# load the tracts in our study area
gdf = gpd.read_file(tracts_path).sort_values(by='GEOID')
len(gdf)

5265

In [11]:
%%time
df = gc.get_census_tracts_data(tract_fips=gdf['GEOID'], api_key=census_api_key, dataset=dataset,
                               year=year, variables=variables, clean=True)

Downloading 24 census vars in 48001 for 11 tracts.
Downloading 24 census vars in 48003 for 4 tracts.
Downloading 24 census vars in 48005 for 17 tracts.
Downloading 24 census vars in 48007 for 6 tracts.
Downloading 24 census vars in 48009 for 3 tracts.
Downloading 24 census vars in 48011 for 1 tracts.
Downloading 24 census vars in 48013 for 8 tracts.
Downloading 24 census vars in 48015 for 6 tracts.
Downloading 24 census vars in 48017 for 1 tracts.
Downloading 24 census vars in 48019 for 5 tracts.
Downloading 24 census vars in 48021 for 10 tracts.
Downloading 24 census vars in 48023 for 1 tracts.
Downloading 24 census vars in 48025 for 7 tracts.
Downloading 24 census vars in 48027 for 65 tracts.
Downloading 24 census vars in 48029 for 366 tracts.
Downloading 24 census vars in 48031 for 2 tracts.
Downloading 24 census vars in 48033 for 1 tracts.
Downloading 24 census vars in 48035 for 7 tracts.
Downloading 24 census vars in 48037 for 18 tracts.
Downloading 24 census vars in 48039 for 51 

Downloading 24 census vars in 48327 for 1 tracts.
Downloading 24 census vars in 48329 for 27 tracts.
Downloading 24 census vars in 48331 for 7 tracts.
Downloading 24 census vars in 48333 for 2 tracts.
Downloading 24 census vars in 48335 for 2 tracts.
Downloading 24 census vars in 48337 for 6 tracts.
Downloading 24 census vars in 48339 for 59 tracts.
Downloading 24 census vars in 48341 for 4 tracts.
Downloading 24 census vars in 48343 for 3 tracts.
Downloading 24 census vars in 48345 for 1 tracts.
Downloading 24 census vars in 48347 for 13 tracts.
Downloading 24 census vars in 48349 for 10 tracts.
Downloading 24 census vars in 48351 for 4 tracts.
Downloading 24 census vars in 48353 for 5 tracts.
Downloading 24 census vars in 48355 for 82 tracts.
Downloading 24 census vars in 48357 for 3 tracts.
Downloading 24 census vars in 48359 for 1 tracts.
Downloading 24 census vars in 48361 for 21 tracts.
Downloading 24 census vars in 48363 for 9 tracts.
Downloading 24 census vars in 48365 for 6 tr

In [12]:
# merge the tracts with the acs variables, rename columns, then make sure everything we merged is the same length
merged = pd.merge(left=gdf.set_index('GEOID'), right=df, how='inner', left_index=True, right_index=True)
merged = merged.rename(columns=variables)
assert len(gdf) == len(df) == len(merged)

In [13]:
merged.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,...,mean_commute_time,pct_commute_drive_alone,pct_below_poverty,pct_college_grad_student,pct_same_residence_year_ago,pct_bachelors_degree,pct_english_only,pct_foreign_born,state,county
48001950100,48,1,950100,9501.0,Census Tract 9501,G5020,S,483306619,7864313,31.971468,...,27.6,86.1,16.2,11.6,92.1,19.6,93.6,4.3,48,1
48001950401,48,1,950401,9504.01,Census Tract 9504.01,G5020,S,16549991,296734,31.7346372,...,13.0,100.0,1.0,55.3,58.4,3.9,77.9,6.9,48,1
48001950402,48,1,950402,9504.02,Census Tract 9504.02,G5020,S,72472039,2627857,31.8000515,...,5.0,100.0,20.3,61.8,76.9,2.8,78.5,5.5,48,1
48001950500,48,1,950500,9505.0,Census Tract 9505,G5020,S,23132052,99223,31.787885,...,17.8,91.1,21.1,5.4,87.8,11.8,66.3,16.5,48,1
48001950600,48,1,950600,9506.0,Census Tract 9506,G5020,S,20653883,329641,31.7502049,...,16.6,88.2,10.0,2.8,85.5,7.5,89.4,3.5,48,1


## Save to disk

In [14]:
upcast_dispatch = {geometry.Point: geometry.MultiPoint, 
                   geometry.LineString: geometry.MultiLineString, 
                   geometry.Polygon: geometry.MultiPolygon}

def maybe_cast_to_multigeometry(geom):
    caster = upcast_dispatch.get(type(geom), lambda x: x[0])
    return caster([geom])

merged['geometry'] = merged['geometry'].apply(maybe_cast_to_multigeometry)

In [15]:
%%time
merged.reset_index().to_file(output_path, driver='GeoJSON')
print(output_path)

census_tracts_data.geojson
CPU times: user 19.9 s, sys: 835 ms, total: 20.7 s
Wall time: 24.6 s
