Data source (US section): SafeGraph Open Census Data documentation — https://docs.safegraph.com/docs/open-census-data#section-census-demographic-data

In [13]:
import os
from requests import get
import pandas as pd
import geopandas as gpd
from timeit import default_timer as timer
import numpy as np

In [14]:
# Root data directory; the cells below read from and write to <path_to_data>/admin/<country>/...
path_to_data='/scratch/spf248/covid/data'

# US

In [None]:
# Country code; selects the admin/<country> subdirectory used by the following cells
country='US'

In [None]:
def download(url, file_name):
    """Download `url` and save the response body to `file_name`.

    The request is issued and its HTTP status checked *before* anything is
    written, and the bytes go to a temporary sibling file (`file_name` +
    '.part') that is renamed onto `file_name` only after the write finished.
    A failed request therefore never leaves an empty or partial `file_name`
    behind that a later "file already exists" check would mistake for a
    completed download.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    file_name : str
        Destination path; its parent directory must already exist.

    Raises
    ------
    Whatever `get` raises on connection problems, and whatever
    `response.raise_for_status()` raises when the server answers with an
    error status.
    """
    # get request; fail loudly on an error status instead of saving an error page
    response = get(url)
    response.raise_for_status()

    # write to a temporary file in binary mode, then move it into place
    tmp_name = file_name + '.part'
    with open(tmp_name, "wb") as file:
        file.write(response.content)
    os.replace(tmp_name, file_name)
        
# Download link for the archive that is stored locally as safegraph_open_census_data.tar
url='https://safegraph.apms5.com/anywhere/m?s=safegraph&m=tr_f6c66439-7622-4661-b5ac-6eede6fccafd&u=e1jq4wvfdtfkjha46n342e265mvk4d265mu42e1n5mwkcdtn5mrk8da18ct36hhr8n2mc&r2=d1u78w3k78qjywtk5nuq6bbqcntq8b9j5tgpurbudxq62xvk5thpyv9fedgpctb7e9gq0u1de1jq4v9fedgpctb7e9gq0u2zdxr6avjzcdjpwwvnedfp8rbmc4qq6rb6cnkq4rbgd1fpyw35dtfp6tbeeduq6qv4c5u62bkmc5t2wtvu&n=1'
# Local destination: <path_to_data>/admin/<country>/safegraph_open_census_data.tar
file_name=os.path.join(path_to_data,'admin',country,'safegraph_open_census_data.tar')

# Only fetch the archive when it is not on disk yet.
# NOTE(review): the next cell reads from the directory named like this file without
# the '.tar' suffix; no extraction step is visible here — presumably done by hand, confirm.
if not os.path.exists(file_name):
    download(url, file_name)

In [11]:
# Build the US admin table: one row per census block group with population,
# median age, median income, county FIPS metadata and geometry, written to
# <path_to_data>/admin/<country>/admin.csv.
print('Prepare Shapefile')
start = timer()

# All inputs are read from the directory named like the archive without its
# '.tar' suffix. NOTE(review): no extraction step is visible in this notebook;
# presumably the archive is unpacked beforehand — confirm.
census_dir = file_name.replace('.tar','')

df1 = pd.read_csv(os.path.join(census_dir,'data','cbg_b01.csv'))
df2 = pd.read_csv(os.path.join(census_dir,'data','cbg_b19.csv'))

# Merge dataset with population, median age and median income
df = df1.merge(df2, on='census_block_group')
df = df[['census_block_group','B01001e1','B01002e1','B19013e1']].copy()
df = df.rename(columns={"B01001e1": "population", "B01002e1": "median_age", "B19013e1": "median_income"})

# Merge dataset with geolocation - this needs a bit of preprocessing
df3 = pd.read_csv(os.path.join(census_dir,'metadata','cbg_fips_codes.csv'))
df3['state_fips'] = df3['state_fips'].astype(str).str.zfill(2) #add leading 0
df3['county_fips'] = df3['county_fips'].astype(str).str.zfill(3) #add leading 0
df3['county_fips_full'] = df3['state_fips'] + df3['county_fips'] # create unique FIPS county code (for merging) - geolocation dataframe
df['census_block_group'] = df['census_block_group'].astype(str).str.zfill(12) # restore leading 0 so the id is 12 characters wide
df['county_fips_full'] = df['census_block_group'].str[:5]  # create unique FIPS county code (for merging) - pop/income/age dataframe
df = df.merge(df3, on='county_fips_full')

# Attach the block-group geometry, matching on the (renamed) block-group id
df4 = gpd.read_file(os.path.join(census_dir,'geometry','cbg.geojson'))
df4.rename(columns={"CensusBlockGroup": "census_block_group"},inplace=True)
df = df.merge(df4[['census_block_group','geometry']], on='census_block_group')

# Cast with the builtin float: the np.float alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
for column in ('population','median_age','median_income'):
    df[column] = df[column].astype(float)

print('# Admin Units:', df.shape[0])

#Export to table
df.to_csv(os.path.join(path_to_data,'admin',country,'admin.csv'), index = False, header=True)

print("Done in", round(timer()-start), "sec")

Prepare Shapefile
Done in 294 sec


# MX

In [6]:
# Country code; selects the admin/<country> subdirectory used by the following cells
country='MX'

In [22]:
# Build the MX admin table from the asset-index CSV: load a fixed set of
# columns, harmonise two column names, discard rows without an `ageb` value
# and write the result to <path_to_data>/admin/<country>/admin.csv.
print('Prepare Shapefile')
start = timer()

file_name = os.path.join(path_to_data,'admin',country,'asset_index_ageb_MEX.csv')

# Columns to load from the CSV
cols=[
    'geometry', 'ageb', 'locality',
    'mun', 'municipality_name',
    'state_code', 'state_name',
    'total_population', 'pop0_14', 'pop15_64', 'pob65_mas',
    'wealth_index_10k',
]

df = pd.read_csv(file_name,usecols=cols)
print('# Admin Units:', len(df))  # row count as loaded

# Rename the municipality/state code columns, then keep only rows with an `ageb`
df = (
    df.rename(columns={'mun':'municipality','state_code':'state'})
      .dropna(subset=['ageb'])
)
print('# Admin Units:', len(df))  # row count after dropping missing `ageb`

admin_csv = os.path.join(path_to_data,'admin',country,'admin.csv')
df.to_csv(admin_csv, index = False, header=True)

print("Done in", round(timer()-start), "sec")

Prepare Shapefile
# Admin Units: 56195
# Admin Units: 56177
Done in 7 sec


In [11]:
# Preview the first rows of the current `df`
df.head()

Unnamed: 0,geometry,ageb,municipality,municipality_name,state,state_name,locality,total_population,pop0_14,pop15_64,pob65_mas,wealth_index_10k
0,"POLYGON ((-102.2429897033714 21.9054604847799,...",100100013204,1.0,Aguascalientes,1.0,Aguascalientes,1.0,1831.0,686.0,1119.0,23.0,0.497066
1,"POLYGON ((-102.2386086773723 21.9076445136008,...",100100013721,1.0,Aguascalientes,1.0,Aguascalientes,1.0,2820.0,1164.0,1635.0,21.0,0.4354
2,"POLYGON ((-102.284093386003 21.88241849062115,...",100100010657,1.0,Aguascalientes,1.0,Aguascalientes,1.0,2126.0,404.0,1339.0,328.0,0.57191
3,POLYGON ((-102.2848812827596 21.89669252671616...,100100011528,1.0,Aguascalientes,1.0,Aguascalientes,1.0,187.0,63.0,112.0,9.0,0.656236
4,POLYGON ((-102.3209134854875 21.87213891008895...,100100011674,1.0,Aguascalientes,1.0,Aguascalientes,1.0,1238.0,388.0,774.0,70.0,0.65143


# ID

In [12]:
# Country code; selects the admin/<country> subdirectory used by the following cells
country='ID'

In [55]:
# Build the ID admin table: admin-4 polygons from the shapefile joined with the
# index table (age, index_01_all_vars) and the population counts, written to
# <path_to_data>/admin/<country>/admin.csv.
print('Prepare Shapefile')
start = timer()

shp=gpd.read_file(os.path.join(path_to_data,'admin',country,'Indonesia admin4','idn_admbnda_adm4_bps_20200401.shp'))
print('# Admin Units:', shp.shape[0])
data=pd.read_csv(os.path.join(path_to_data,'admin',country,'indonesia_index.csv'))
pops=pd.read_csv(os.path.join(path_to_data,'admin',country,'pop.csv')).rename(columns={'count':'total_population'})

# Match polygons to the index table: ADM4_PCODE on the shapefile side, ID_village on the table side
df=shp[['ADM4_PCODE','ADM3_EN','ADM3_PCODE','ADM2_EN','ADM2_PCODE','ADM1_EN','ADM1_PCODE','geometry']].merge(
data[['ID_village','age','index_01_all_vars']],left_on='ADM4_PCODE',right_on='ID_village')
# ID_village is only needed as the join key. Drop it by keyword: passing the
# axis positionally (`drop('ID_village',1)`) was deprecated in pandas 1.3 and
# raises TypeError in pandas 2.x.
df=df.merge(pops,on='ID_village').drop(columns='ID_village')
df=df.rename(columns={'age':'median_age','index_01_all_vars':'wealth_index'})
print('# Admin Units:', df.shape[0])

df.to_csv(os.path.join(path_to_data,'admin',country,'admin.csv'), index=False, header=True)

print("Done in", round(timer()-start), "sec")

Prepare Shapefile
# Admin Units: 72300
Done in 56 sec
