In [None]:
import pickle
import json
import pandas as pd
import os

RAW_DATA_DIR = 'demo_data_by_ward'

# Some of our target variables are grouped for readability and consolidation.
# Each key is the name of a column in the resulting dataframe; each value is the
# list of raw census variable codes that are summed together to fill that column.
# NOTE: the 'Female 65 + ' key (inner spaces and trailing blank included) is kept
# exactly as written because it is emitted as a column name.
variable_groups = {
    # --- totals by race / ethnicity ---
    'Total Population': ['B01001_001E'],
    'White': ['B01001A_001E'],
    'Black or African American': ['B01001B_001E'],
    'American Indian or Alaska Native': ['B01001C_001E'],
    'Asian': ['B01001D_001E'],
    'Native Hawaiian or Pacific Islander': ['B01001E_001E'],
    'Other': ['B01001F_001E'],
    'Multiracial': ['B01001G_001E'],
    'White Not Hispanic or Latino': ['B01001H_001E'],
    'Hispanic or Latino': ['B01001I_001E'],

    # --- income brackets ---
    'Under $25,000': [
        'B19101_002E', 'B19101_003E', 'B19101_004E', 'B19101_005E',
    ],
    '$25,000 to $49,999': [
        'B19101_006E', 'B19101_007E', 'B19101_008E', 'B19101_009E',
        'B19101_010E',
    ],
    '$50,000 to $74,999': ['B19101_011E', 'B19101_012E'],
    '$75,000 to $125,000': ['B19101_013E', 'B19101_014E'],
    '$125,000 +': ['B19101_015E', 'B19101_016E', 'B19101_017E'],

    # --- male age bands ---
    'Male 0 to 17': [
        'B01001_003E', 'B01001_004E', 'B01001_005E', 'B01001_006E',
    ],
    'Male 18 to 24': [
        'B01001_007E', 'B01001_008E', 'B01001_009E', 'B01001_010E',
    ],
    'Male 25 to 34': ['B01001_011E', 'B01001_012E'],
    'Male 35 to 49': ['B01001_013E', 'B01001_014E', 'B01001_015E'],
    'Male 50 to 64': [
        'B01001_016E', 'B01001_017E', 'B01001_018E', 'B01001_019E',
    ],
    'Male 65+': [
        'B01001_020E', 'B01001_021E', 'B01001_022E', 'B01001_023E',
        'B01001_024E', 'B01001_025E',
    ],

    # --- female age bands ---
    'Female 0 to 17': [
        'B01001_027E', 'B01001_028E', 'B01001_029E', 'B01001_030E',
    ],
    'Female 18 to 24': [
        'B01001_031E', 'B01001_032E', 'B01001_033E', 'B01001_034E',
    ],
    'Female 25 to 34': ['B01001_035E', 'B01001_036E'],
    'Female 35 to 49': ['B01001_037E', 'B01001_038E', 'B01001_039E'],
    'Female 50 to 64': [
        'B01001_040E', 'B01001_041E', 'B01001_042E', 'B01001_043E',
    ],
    'Female 65 + ': [
        'B01001_044E', 'B01001_045E', 'B01001_046E', 'B01001_047E',
        'B01001_048E', 'B01001_049E',
    ],
}

def unpack_variable_groups(tract, variable_groups = None):
    '''Flatten one census tract record into a dict of grouped, overlap-weighted totals.

    For every output column named in variable_groups, the values of its
    sub-variables are looked up in tract["out_variables"], summed, and then
    multiplied by tract['%_of_tract'].

    tract: JSON/dict (created in the get_raw_census_data workbook) that holds an
        "out_variables" mapping of variable code -> value and a "%_of_tract"
        multiplier (the % of overlap the tract has with the arbitrary geometry)
    variable_groups: dict with the output column as key and the list of variable
        codes that sum together to make that column as value, e.g.
            {'Total Population' : ['B01001_001E'],
             'White' : ['B01001A_001E'], ...}
        When omitted (None) the module-level variable_groups dict is used.

    returns: dict of {output column: weighted sum}, in the order of variable_groups
    raises: KeyError if a listed variable code is missing from the tract
    '''
    if variable_groups is None:
        #the parameter shadows the module-level dict of the same name, so the
        #default is fetched through globals(). Resolving it at call time (instead
        #of binding the dict as a default at definition time) avoids a mutable
        #default argument and always picks up the current mapping.
        variable_groups = globals()['variable_groups']

    raw_values = tract["out_variables"]
    #share of the tract that overlaps the arbitrary geometry
    overlap = tract['%_of_tract']

    #sum each group's sub-variables, then scale the total by the overlap
    return {
        column: sum(raw_values[code] for code in codes) * overlap
        for column, codes in variable_groups.items()
    }

def process_tracts_in_geo_area(data, geo_area_name, geo_area_taxonomy = 'Ward', variable_groups = None):
    '''Summarise every census tract of one geometry into a single-row dataframe.

    Each tract in the raw data is flattened with the unpack_variable_groups
    function, the flattened tracts are summed column by column, and the name of
    the geometry is stored under the geo_area_taxonomy column.

    data: dict of raw tract records for one geometry (only its values are used)
    geo_area_name: the specific name in a given taxonomy
    geo_area_taxonomy: a str naming the kind of arbitrary geometry; it is used as
        the label of the column that holds geo_area_name
    variable_groups: optional dict of {output column: [variable codes]} that is
        forwarded to unpack_variable_groups; when None that function falls back
        to its own default

    returns: pandas DataFrame with one row (the summed columns plus the
        geo_area_taxonomy column)
    '''
    #only forward variable_groups when the caller supplied one, so the helper's
    #own default stays in charge otherwise
    extra_args = () if variable_groups is None else (variable_groups,)

    #flatten every tract in the geometry
    unpacked_tract_list = [
        unpack_variable_groups(tract, *extra_args) for tract in data.values()
    ]

    #sum all of the tracts in a given geometry, name the geometry, and transpose
    #so the result is one row with one column per variable
    temp_totals = pd.DataFrame(unpacked_tract_list).sum()
    temp_totals[geo_area_taxonomy] = geo_area_name
    return pd.DataFrame(temp_totals).T

In [None]:
#empty list to store the temp data frames to concat at the end
ward_summaries = []
#os.listdir returns entries in arbitrary order, so sort them to keep the row
#order of the output file reproducible between runs
for file in sorted(os.listdir(RAW_DATA_DIR)):
    #sometimes, when working with jupyter notebooks, a .ipynb checkpoint is created in a working directory
    #this will ignore that
    if '.ipynb' in file:
        continue

    #open the raw JSON data for a given geometry
    with open(os.path.join(RAW_DATA_DIR, file)) as f:
        data = json.load(f)

    #flatten out and create a one-row frame for concatenating
    #NOTE(review): the file name exactly as listed is used as the geometry name - confirm this is intended
    ward_summaries.append(process_tracts_in_geo_area(data, file))
    print(file)

#concat the resulting geometry data frames
ward_frame = pd.concat(ward_summaries, axis = 0)

#iterate through the columns and round to the nearest whole number if desired,
#leaving the geometry label columns untouched
for column in ward_frame.columns:
    if column not in ['Ward','Community Area']:
        print(column)
        ward_frame[column] = ward_frame[column].apply(lambda x: round(x,0))

ward_frame['Survey_Year'] = 2023

ward_frame.to_csv('<write_a_file_name_for_your_data>', index = False)