Data source (US section): SafeGraph Open Census Data documentation — https://docs.safegraph.com/docs/open-census-data#section-census-demographic-data

In [1]:
import os
from requests import get
import pandas as pd
import geopandas as gpd
from timeit import default_timer as timer
import numpy as np

In [2]:
# Root directory of the project data. Defaults to the original cluster path;
# set the COVID_DATA_DIR environment variable to run the notebook elsewhere.
path_to_data = os.environ.get('COVID_DATA_DIR', '/scratch/spf248/covid/data')

# US

In [10]:
# Country code: names the sub-directory under <path_to_data>/admin used by the cells below.
country = "US"

In [11]:
def download(url, file_name):
    """Download ``url`` and save the response body to ``file_name``.

    The request is made *before* the destination is opened, so a failed
    request never leaves an empty file behind (an empty file would satisfy an
    ``os.path.exists`` check and make a caller skip the download forever).

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    file_name : str
        Destination path; missing parent directories are created.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # get request; fail loudly on HTTP errors instead of saving an error page
    response = get(url)
    response.raise_for_status()

    # make sure the destination directory exists before opening the file
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # open in binary mode and write to file
    with open(file_name, "wb") as file:
        file.write(response.content)
        
# Link to the archive and the local path it is saved to:
# <path_to_data>/admin/<country>/safegraph_open_census_data.tar
url='https://safegraph.apms5.com/anywhere/m?s=safegraph&m=tr_f6c66439-7622-4661-b5ac-6eede6fccafd&u=e1jq4wvfdtfkjha46n342e265mvk4d265mu42e1n5mwkcdtn5mrk8da18ct36hhr8n2mc&r2=d1u78w3k78qjywtk5nuq6bbqcntq8b9j5tgpurbudxq62xvk5thpyv9fedgpctb7e9gq0u1de1jq4v9fedgpctb7e9gq0u2zdxr6avjzcdjpwwvnedfp8rbmc4qq6rb6cnkq4rbgd1fpyw35dtfp6tbeeduq6qv4c5u62bkmc5t2wtvu&n=1'
file_name = os.path.join(
    path_to_data,
    'admin',
    country,
    'safegraph_open_census_data.tar',
)

# Fetch the archive only once; an existing file is reused as-is.
# NOTE(review): the next cell reads from the directory named like the archive
# minus ".tar"; no extraction step is visible here - presumably done by hand.
already_downloaded = os.path.exists(file_name)
if not already_downloaded:
    download(url, file_name)

In [30]:
# Build the US admin table: one row per census block group in the geometry
# file, with population, median age, median income and state/county labels.
print('Prepare Shapefile')
start = timer()

# Directory the census files are read from: the archive path without ".tar"
census_dir = file_name.replace('.tar','')

census_b01 = pd.read_csv(os.path.join(census_dir,'data','cbg_b01.csv'))
census_b19 = pd.read_csv(os.path.join(census_dir,'data','cbg_b19.csv'))

# Merge dataset with population, median age and median income
df = census_b01.merge(census_b19, on='census_block_group')
df = (
    df[['census_block_group','B01001e1','B01002e1','B19013e1']]
    .rename(columns={"B01001e1": "population", "B01002e1": "median_age", "B19013e1": "median_income"})
    .copy()
)

# Merge dataset with geolocation - this needs a bit of preprocessing
fips_codes = pd.read_csv(os.path.join(census_dir,'metadata','cbg_fips_codes.csv'))
fips_codes['state_fips'] = fips_codes['state_fips'].astype(str).str.zfill(2)    # add leading 0
fips_codes['county_fips'] = fips_codes['county_fips'].astype(str).str.zfill(3)  # add leading 0
# unique FIPS county code (for merging) - geolocation dataframe
fips_codes['county_fips_full'] = fips_codes['state_fips'] + fips_codes['county_fips']

# The block-group id is padded to 12 characters; its first 5 characters are
# used as the county code on the pop/income/age side of the merge.
df['census_block_group'] = df['census_block_group'].astype(str).str.zfill(12)
df['county_fips_full'] = df['census_block_group'].str[:5]
df = df.merge(fips_codes, on='county_fips_full')

# Merge with Shapefiles; how='right' keeps every block group of the geometry file
cbg_shapes = gpd.read_file(os.path.join(census_dir,'geometry','cbg.geojson'))
cbg_shapes = cbg_shapes.rename(columns={"CensusBlockGroup": "census_block_group"})
df = df.merge(cbg_shapes[['census_block_group','geometry']], on='census_block_group', how='right')

# Use the builtin float: the np.float alias was removed in NumPy 1.24
df = df.astype({'population': float, 'median_age': float, 'median_income': float})

print('# Admin Units:', df.shape[0])

# Keep only the columns that are saved, in their output order
df = df[[
    'census_block_group',
    'state_fips',
    'state',
    'county_fips',
    'county',
    'population',
    'median_age',
    'median_income',
    'geometry',
]].copy()

print("Done in", round(timer()-start), "sec")

Prepare Shapefile
# Admin Units: 220333
Done in 0 sec


In [34]:
# Write the admin table twice: as a flat CSV and as GeoJSON.
print('Save')
start = timer()

# Output folder for the current country
out_dir = os.path.join(path_to_data,'admin',country)

df.to_csv(os.path.join(out_dir,'admin.csv'), index=False, header=True)

# Wrap the table in a GeoDataFrame so it can be written with the GeoJSON driver
admin_gdf = gpd.GeoDataFrame(df, geometry=df.geometry)
admin_gdf.to_file(os.path.join(out_dir,'admin.geojson'), driver='GeoJSON')

print("Done in", round(timer()-start), "sec")

Save
Done in 1225 sec


# MX

In [11]:
# Country code: names the sub-directory under <path_to_data>/admin used by the cells below.
country = "MX"

In [None]:
# Boundaries: read the adm2 shapefile and keep the name/code/area/geometry columns
file_name = os.path.join(path_to_data,'admin',country,'MUNIC','mex_admbnda_adm2_govmex','mex_admbnda_adm2_govmex.shp')
shp = gpd.read_file(file_name)
shp = shp[['ADM1_ES','ADM1_PCODE','ADM2_ES','ADM2_PCODE','Shape_Area','geometry']].copy()

# Population table, keyed by 'adm2code'
file_name = os.path.join(path_to_data,'admin',country,'MUNIC','mex_popa_adm2_govmex.xlsx')
data = pd.read_excel(file_name, usecols=['adm2code','total_population','pop0_14','pop15_64','pop65_mas',])

# Join on the adm2 code, then drop the duplicated key column.
# drop(columns=...) replaces the positional axis argument drop('adm2code', 1),
# which pandas 2.0 no longer accepts.
df = shp.merge(data, left_on='ADM2_PCODE', right_on='adm2code').drop(columns='adm2code')
print('# Admin Units:', df.shape[0])

df.to_csv(os.path.join(path_to_data,'admin',country,'admin.csv'), index = False, header=True)


In [None]:
# Preview the first five rows of the current table.
df.head(n=5)

In [None]:
# Load the AGEB-level table, drop rows without an 'ageb' value and save it as CSV.
print('Prepare Shapefile')
start = timer()

file_name = os.path.join(path_to_data,'admin',country,'asset_index_ageb_MEX.csv')

# Columns read from the CSV
cols = [
    'geometry',
    'ageb', 'locality',
    'mun', 'municipality_name',
    'state_code', 'state_name',
    'total_population', 'pop0_14', 'pop15_64', 'pob65_mas',
    'wealth_index_10k',
]

df = pd.read_csv(file_name, usecols=cols)
print('# Admin Units:', df.shape[0])

df = df.rename(columns={'mun':'municipality','state_code':'state'})
# Keep only the rows whose 'ageb' is not missing
has_ageb = df['ageb'].notna()
df = df[has_ageb].copy()
print('# Admin Units:', df.shape[0])

out_path = os.path.join(path_to_data,'admin',country,'admin_ageb.csv')
df.to_csv(out_path, index=False, header=True)

print("Done in", round(timer()-start), "sec")

In [None]:
# Preview the first five rows of the current table.
df.head(n=5)

# ID

In [12]:
# Country code: names the sub-directory under <path_to_data>/admin used by the cells below.
country = "ID"

In [13]:
# Build the ID admin table: adm4 boundaries joined with the index table and
# the population counts.
print('Prepare Shapefile')
start = timer()

shp = gpd.read_file(os.path.join(path_to_data,'admin',country,'idn_adm_bps_20200401_area','idn_admbnda_adm4_bps_area.shp'))
shp = shp.to_crs("epsg:4326")
print('# Admin Units:', shp.shape[0])

data = pd.read_csv(os.path.join(path_to_data,'admin',country,'indonesia_index.csv'))
pops = pd.read_csv(os.path.join(path_to_data,'admin',country,'pop.csv')).rename(columns={'count':'total_population'})

# Left joins keep every boundary row, even without a match in data/pops
shp_cols = ['ADM4_EN','ADM4_PCODE','ADM3_EN','ADM3_PCODE','ADM2_EN','ADM2_PCODE','ADM1_EN','ADM1_PCODE','a_km2','geometry']
df = shp[shp_cols].merge(
    data[['ID_village','age','index_01_all_vars']],
    left_on='ADM4_PCODE',
    right_on='ID_village',
    how='left',
)
# drop(columns=...) replaces the positional axis argument drop('ID_village', 1),
# which pandas 2.0 no longer accepts.
df = df.merge(pops, on='ID_village', how='left').drop(columns='ID_village')
df = df.rename(columns={'age':'median_age','index_01_all_vars':'wealth_index'})
print('# Admin Units:', df.shape[0])

print("Done in", round(timer()-start), "sec")

Prepare Shapefile
# Admin Units: 81912
# Admin Units: 81912
Done in 58 sec


In [14]:
# Preview the first five rows of the current table.
df.head(n=5)

Unnamed: 0,ADM4_EN,ADM4_PCODE,ADM3_EN,ADM3_PCODE,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,a_km2,geometry,median_age,wealth_index,total_population
0,1 Ilir,ID1671060006,Ilir Timur II,ID1671060,Kota Palembang,ID1671,Sumatera Selatan,ID16,2.086247,"POLYGON ((104.82248 -2.98102, 104.82165 -2.985...",26.0,0.589971,5792.0
1,1 Ulu,ID1671020008,Seberang Ulu I,ID1671020,Kota Palembang,ID1671,Sumatera Selatan,ID16,0.209298,"POLYGON ((104.75516 -3.01316, 104.75480 -3.013...",25.0,0.606387,12058.0
2,10 Ilir,ID1671062001,Ilir Timur III,ID1671062,Kota Palembang,ID1671,Sumatera Selatan,ID16,0.156444,"POLYGON ((104.77081 -2.98185, 104.77082 -2.981...",,,
3,11 Ilir,ID1671062002,Ilir Timur III,ID1671062,Kota Palembang,ID1671,Sumatera Selatan,ID16,0.07704,"POLYGON ((104.77081 -2.98185, 104.77070 -2.981...",,,
4,11 Ulu,ID1671030014,Seberang Ulu II,ID1671030,Kota Palembang,ID1671,Sumatera Selatan,ID16,0.227913,"POLYGON ((104.77317 -2.99340, 104.77316 -2.993...",24.0,0.529232,7543.0


In [15]:
# Write the admin table as GeoJSON first, then as a flat CSV.
print('Save')
start = timer()

# Output folder for the current country
out_dir = os.path.join(path_to_data,'admin',country)

geojson_path = os.path.join(out_dir,'admin.geojson')
df.to_file(geojson_path, driver='GeoJSON')

csv_path = os.path.join(out_dir,'admin.csv')
df.to_csv(csv_path, index=False, header=True)

print("Done in", round(timer()-start), "sec")

Save
Done in 198 sec


# IN

In [None]:
# Country code: names the sub-directory under <path_to_data>/admin used by the cells below.
country = "IN"

In [None]:
# Read the shapefile stored under admin3/ and save a name/id/geometry table as CSV.
admin_dir = os.path.join(path_to_data,'admin',country)

shp = gpd.read_file(os.path.join(admin_dir,'admin3','IND_adm3.shp'))
print('# Admin Units:', shp.shape[0])

# Columns kept in the saved table
keep_cols = ['NAME_1','ID_1','NAME_2','ID_2','geometry']
df = shp[keep_cols].copy()

df.to_csv(os.path.join(admin_dir,'admin.csv'), index=False, header=True)