In [9]:
import os
from importlib.resources import files
import pandas as pd
import numpy as np
import geojson
from shapely.geometry import shape
import requests
from warnings import warn

surveys = {'1' : '',
           '2' : '/profile',
           '3' : '/subject'}

######## HELPER FUNCTIONS

def _pull_raw_census_data(survey_key, acs_year, census_year, var_code_list, level, census_api_key):
    
    """
    Fetches American Community Survey (ACS) data from the U.S. Census Bureau API and processes it into a pandas DataFrame. Used
    by `_generate_bbl_estimates()`, `_generate_bbl_variances()`, and `generate_new_estimates()`. Can be aggregated to geographies
    in the census hierachy, including census tract, neighborhood tabulation area, borough, and New York City.

    Parameters:
    -----------
    acs_year : int 
        The year of the ACS dataset to fetch (e.g., 2019 for 2019 ACS 5-year data).
    census_year : int 
        The decennial census year to associate with the unique identifier for census tracts. Enter '2010' for ACS surveys from
        before 2020 but after 2010. Enter '2020' for surveys 2020 and above.
    var_code_list : list of str
        A list of variable codes to retrieve from the ACS dataset (e.g., ['DP03_0045E', 'DP03_0032E']).
    level : str
        The level of geographic aggregation desired (options include 'tract', 'nta', 'modzcta', 'borough', and 'city')
    census_api_key : str
        A valid API key for accessing the U.S. Census Bureau's API.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the requested variable estimates for the selected geography (pulled directly from the ACS itself).
    
    Notes:
    ------
    - NTA and MODZCTA estimates are aggregated from tracts and ZCTAs, respectively. Tract, borough, and city estimates are taken directly from the survey.
    - The unique identifier for each census tract is created by concatenating the tract number and county FIPS number.
    
    """

    # define parameters
    base_url = "https://api.census.gov/data"
    dataset = surveys[survey_key]  # ACS 5-year dataset
    variables = ",".join(var_code_list)  # Concatenate variables into a comma-separated string
    state = "36" # New York state
    
    # # setting path for pulling data from internal folder
    data_path = files("councilcount").joinpath("data") 

    # setting values based on geography level chosen
    if level == 'city': 
        for_code = f"place:51000&in=state:{state}" # NYC code
    elif level == 'borough': 
        for_code = f"county:005,047,061,081,085&in=state:{state}" # county codes for NYC boroughs
        # to match counties to boroughs
        conversion_dict = {'5': 'The Bronx', '47': 'Brooklyn', '61': 'Manhattan', '81': 'Queens', '85': 'Staten Island'} 
    elif level == 'modzcta':
        # defining NYC ZCTAs to pull data for
        nyc_zctas = pd.read_csv(f'{data_path}/nyc-zcta-list.csv')
        nyc_zctas_str = str(nyc_zctas['ZCTA5CE20'].to_list()).replace(' ', '').replace('[', '').replace(']', '')
        for_code =f"zip code tabulation area:{nyc_zctas_str}" # all NYC ZCTAs
        # converts zcta to modzcta
        file_path = f'{data_path}/modzcta-boundaries.geojson'
        with open(file_path) as f: df = geojson.load(f)
        features = df["features"]
        zcta_to_modzcta_df = pd.json_normalize([feature["properties"] for feature in features])
        modzcta_dict = dict(zip(zcta_to_modzcta_df['ZCTA'],zcta_to_modzcta_df['MODZCTA']))
        # exploding the dict
        conversion_dict = {key: v for k, v in modzcta_dict.items() for key in k.split(', ')}
    elif level in ['tract', 'nta']: 
        for_code = f'tract:*&in=county:005,047,061,081,085&in=state:{state}' # all NYC census tracts
        if level == 'nta':
            # to help build NTAs out of census tracts
            nta_conversion = pd.read_csv(f'{data_path}/2020_Census_Tracts_to_2020_NTAs_and_CDTAs_Equivalency_20240905.csv')
            conversion_dict = pd.Series(nta_conversion['NTACode'].values,index=nta_conversion['GEOID'].astype(str)).to_dict() 
            # need to pull in df with both names and codes to create column with NTA full names 
            file_path = f'{data_path}/nta-boundaries.geojson'
            with open(file_path) as f: df = geojson.load(f)
            features = df["features"]
            nta_name_df = pd.json_normalize([feature["properties"] for feature in features])
        
    url = f'{base_url}/{acs_year}/acs/acs5{dataset}/variables?get={variables}&for={for_code}&key={census_api_key}'
    print(url)
    response = requests.get(url)

    # check the response
    if response.status_code == 200:
        try:
            data = response.json()  # attempt to parse JSON response
            demo_df = pd.DataFrame(data[1:], columns=data[0]) # first row is the header
            demo_df.replace('-555555555', np.nan, inplace=True) # sometimes this number comes in when data is missing
            demo_df[var_code_list] = demo_df[var_code_list].astype(float) # setting dtype
                
            if level == 'tract':
                # create unique identifier for each tract (some counties have duplicate census tract names)
                demo_df[f'{census_year}_tract_id'] = demo_df['tract'].astype(int).astype(str) + '-' + demo_df['county'].astype(int).astype(str)
                demo_df = demo_df.drop(columns=['state', 'county', 'tract'])
            elif level == 'nta':
                # pair census tract GEOIDs to corresponding NTA
                demo_df['geoid'] = demo_df['state'] + demo_df['county'] + demo_df['tract']
                demo_df[level] = demo_df['geoid'].map(conversion_dict)
                demo_df = demo_df.drop(columns=['state', 'county', 'tract', 'geoid'])
                # census formula -> to aggregate multiple MOEs, sqrt the sum of all MOEs squared
                MOE_columns = [col for col in demo_df.columns if col[-1] == 'M'] # isolating the MOE columns
                demo_df[MOE_columns] = demo_df[MOE_columns]**2 # squaring MOE columns
                # aggregating estimates and MOE from tract-level to nta-level
                demo_df = demo_df.groupby(level).sum().reset_index()
                demo_df[MOE_columns] = np.sqrt(demo_df[MOE_columns]).round().astype(int) # sqrt the sum of all MOEs squared      
                # second conversion (adding full names using codes)
                nta_names = dict(zip(nta_name_df['nta2020'],nta_name_df['ntaname']))
                demo_df['ntaname'] = demo_df[level].map(nta_names)                      
                demo_df.insert(0, level, demo_df.pop(level)) # move region column to the beginning  
                demo_df.insert(1, 'ntaname', demo_df.pop('ntaname'))
            elif level == 'modzcta':
                demo_df[level] = demo_df['zip code tabulation area'].map(conversion_dict)
                demo_df = demo_df.groupby(level).sum().reset_index()
                demo_df = demo_df.drop(columns=['zip code tabulation area'])
                demo_df.insert(0, level, demo_df.pop(level)) # move region column to the beginning 
            elif level == 'borough':
                # pair county FIPS code to borough name
                demo_df['county'] = demo_df['county'].astype(int).astype(str)
                demo_df[level] = demo_df['county'].map(conversion_dict)
                demo_df = demo_df.drop(columns=['state', 'county'])
                demo_df.insert(0, level, demo_df.pop(level)) # move region column to the beginning  
            elif level == 'city':
                # renaming columns
                demo_df['place'] = 'New York City'
                demo_df = demo_df.drop(columns=['state']).rename(columns={'place':'city'})
                demo_df.insert(0, level, demo_df.pop(level)) # move region column to the beginning 
                
        except Exception as e:
            print("Error parsing JSON response:", e)
            print("Response text:", response.text)
    else:
        print(f"Error: {response.status_code}")
        print("Response text:", response.text)
        
    return demo_df

survey_key = '1'
acs_year = 2021
census_year = 2020
var_code_list = ['B06008_003E, B08201_006E']
level = 'borough'
census_api_key = '2f42b2b59ea3d5882dd01587b2e25769f000ed56'

g = _pull_raw_census_data(survey_key, acs_year, census_year, var_code_list, level, census_api_key)
g 

https://api.census.gov/data/2021/acs/acs5?get=B06008_003E, B08201_006E&for=county:005,047,061,081,085&in=state:36&key=2f42b2b59ea3d5882dd01587b2e25769f000ed56
Error: 400
Response text: error: unknown variable ''


UnboundLocalError: local variable 'demo_df' referenced before assignment