In [1]:
import sys
sys.path.append("../src")

import pandas as pd

from census import read_shapefile

In [2]:
# Zipped shapefile handed to read_shapefile() in the next cell.
# The path is relative, so it resolves against the notebook's working directory.
PATH_SHAPEFILE = "../../data/census_blocks_atlanta_shapefile.zip"

In [3]:
# Load the shapefile, cast the two coordinate columns to float, and key the
# frame on GEOID20 so the census tables can later be joined on the index.
# NOTE(review): assumes read_shapefile returns a pandas-compatible DataFrame
# exposing INTPTLAT20 / INTPTLON20 / GEOID20 columns -- confirm in ../src/census.
df_shapefile = (
    read_shapefile(PATH_SHAPEFILE)
    .astype({"INTPTLAT20": "float", "INTPTLON20": "float"})
    .set_index("GEOID20")
)

In [4]:
# Race table (P1 export): load, give the population columns short names, and
# index by block id so it can be joined onto the shapefile's GEOID20 index.

# Raw header label -> short column name used in the final output.
race_column_names = {
    "!!Total:": "pop_total",
    "!!Total:!!Population of one race:!!White alone": "pop_white",
    "!!Total:!!Population of one race:!!Black or African American alone": "pop_black",
    "!!Total:!!Population of one race:!!American Indian and Alaska Native alone": "pop_indian_alaskan",
    "!!Total:!!Population of one race:!!Asian alone": "pop_asian",
    "!!Total:!!Population of one race:!!Native Hawaiian and Other Pacific Islander alone": "pop_hawaiian_pacific",
    "!!Total:!!Population of one race:!!Some Other Race alone": "pop_other",
}

df_race = (
    # header=1: take the column labels from the file's second row.
    pd.read_csv("../../data/census_block_race/DECENNIALPL2020.P1_data_with_overlays_2021-12-09T172014.csv", low_memory=False, header=1)
    # Strip surrounding whitespace so the exact-match keys above line up.
    .rename(columns=str.strip)
    .rename(columns=race_column_names)
)

# rename() silently skips labels it cannot find, so check here instead of
# hitting an opaque KeyError later when the joined columns are selected.
missing_race_columns = sorted(set(race_column_names.values()) - set(df_race.columns))
if missing_race_columns:
    raise KeyError(f"race table is missing expected columns: {missing_race_columns}")

# Join key: characters 9..29 of `id`.
# NOTE(review): presumably this drops a fixed 9-character prefix so the rest
# matches the shapefile's GEOID20 -- confirm against the raw export.
df_race = df_race.assign(id_sub=df_race["id"].str.slice(9, 30)).set_index("id_sub")

# Discard every remaining raw "!!Total..." column that was not renamed above.
df_race = df_race.drop(columns=list(df_race.filter(regex="!!Total")))

In [5]:
# Housing-occupancy table (H1 export): load, shorten the two occupancy column
# names, and index by block id for the join onto the shapefile.
df_occupancy = (
    # header=1: take the column labels from the file's second row.
    pd.read_csv(
        "../../data/census_block_occupancy/DECENNIALPL2020.H1_data_with_overlays_2022-03-06T175643.csv",
        header=1,
        low_memory=False,
    )
    # Strip surrounding whitespace so the exact-match rename keys line up.
    .rename(columns=str.strip)
    .rename(columns={"!!Total:!!Occupied": "occ_occupied", "!!Total:!!Vacant": "occ_vacant"})
    # Join key: characters 9..29 of `id`.
    # NOTE(review): presumably drops a fixed prefix to match GEOID20 -- confirm.
    .assign(id_sub=lambda frame: frame["id"].str.slice(9, 30))
    .set_index("id_sub")
)

# Discard every remaining raw "!!Total..." column that was not renamed above.
leftover_total_columns = list(df_occupancy.filter(regex='!!Total'))
df_occupancy = df_occupancy.drop(columns=leftover_total_columns)

In [6]:
# Group-quarters table (P5 export): load, shorten the institutionalized-
# population column names, and index by block id for the join onto the shapefile.
df_institutionalization = (
    # header=1: take the column labels from the file's second row.
    pd.read_csv(
        "../../data/census_block_institutionalization/DECENNIALPL2020.P5_data_with_overlays_2022-03-07T214522.csv",
        header=1,
        low_memory=False,
    )
    # Strip surrounding whitespace so the exact-match rename keys line up.
    .rename(columns=str.strip)
    .rename(
        columns={
            "!!Total:!!Institutionalized population:!!Correctional facilities for adults": "inst_correctional_adults",
            "!!Total:!!Institutionalized population:!!Juvenile facilities": "inst_juvenile",
            "!!Total:!!Institutionalized population:!!Nursing facilities/Skilled-nursing facilities": "inst_nursing",
            "!!Total:!!Institutionalized population:!!Other institutional facilities": "inst_other",
        }
    )
    # Join key: characters 9..29 of `id`.
    # NOTE(review): presumably drops a fixed prefix to match GEOID20 -- confirm.
    .assign(id_sub=lambda frame: frame["id"].str.slice(9, 30))
    .set_index("id_sub")
)

# Discard every remaining raw "!!Total..." column that was not renamed above.
leftover_total_columns = list(df_institutionalization.filter(regex='!!Total'))
df_institutionalization = df_institutionalization.drop(columns=leftover_total_columns)

In [7]:
# Attach the three census tables to the shapefile rows by index
# (GEOID20 on the left, id_sub on the right). DataFrame.join defaults to a
# left join, so every shapefile row is kept. The rsuffix values disambiguate
# columns the later tables share with earlier ones (at least `id`).
df_joined = (
    df_shapefile
    .join(df_race)
    .join(df_occupancy, rsuffix="_occ")
    .join(df_institutionalization, rsuffix="_inst")
    # Turn the GEOID20 index back into an ordinary column.
    .reset_index()
)

In [8]:
# Columns kept in the output, listed by the names they have right after the join.
columns_retained = ["GEOID20", "INTPTLAT20", "INTPTLON20", "occ_occupied", "occ_vacant", "inst_correctional_adults", "inst_juvenile",
                   "inst_nursing", "inst_other", "pop_white", "pop_black", "pop_indian_alaskan",
                    "pop_asian", "pop_hawaiian_pacific", "pop_other", "pop_total"]

# Shorter output names for the shapefile key and coordinate columns.
columns_renamed = {
    "GEOID20": "geoid",
    "INTPTLAT20": "lat",
    "INTPTLON20": "lon",
}

# Rename first, then select by the *final* names. rename() ignores labels that
# are already gone, so re-running this cell is a no-op instead of raising
# KeyError on "GEOID20" (which selecting before renaming would do, because
# df_joined is overwritten in place). Column order and content on a first run
# are the same as select-then-rename.
columns_final = [columns_renamed.get(name, name) for name in columns_retained]
df_joined = df_joined.rename(columns=columns_renamed)[columns_final]

In [10]:
# Persist the joined table as CSV; index=False leaves the row index out of the file.
df_joined.to_csv(
    path_or_buf="../../data/lat_lon_to_census_data.csv",
    index=False,
)