In [5]:
import numpy as np
import pandas as pd

Generate Age, Gender, Race for all agents in an NTA

In [6]:
def compute_stats(nta_df, attr, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    if filter:
        filter_vars = ['E', 'P']
        estimation_vars = filter_vars
    stats = [nta_df['{}{}'.format(attr, var)].values for var in estimation_vars]
    
    return stats

def merge_nta_stats(nta_df, attr_vals):
    '''estimate and percentage'''
    args = [compute_stats(nta_df, attr, filter=True) for attr in attr_vals]
    ret_e, ret_p = 0, 0
    for val in args:
        e, p = val
        ret_e += e
        ret_p += p
    
    return [ret_e, ret_p]

def get_cumulative_stats(nta_df, attr_vals):
    total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
    return total_estimate, total_percentage

def get_gender_split(total_estimate, total_percentage, male_ratio):
    total_estimate = total_estimate.item()

    male_estimate = int(male_ratio*total_estimate)
    female_estimate = total_estimate - male_estimate

    male_percentage = male_ratio*total_percentage.item()
    female_percentage = total_percentage - male_percentage

    return_dict = {'estimate': {'male': male_estimate, 'female': female_estimate}, 'percentage': {'male': male_percentage, 'female': female_percentage}}

    return return_dict

    # return {'male': [male_estimate, male_percentage], 'female': [female_estimate, female_percentage]}

In [7]:
def get_nta_age_gender(df, nta_id, age_mapping, male_ratio=0.508):
    '''estimate, percentage'''
    nta_df = df[df['GeoID'] == nta_id]

    nta_age_gender = {}
    for key in age_mapping:
        attr_vals = age_mapping[key]

        total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
        if key == 'U19':
            male_ratio = compute_stats(nta_df, 'PopU18M')[-1] / 100.0 # percentage of male < 19
        if key == '65A':
            male_ratio = compute_stats(nta_df, 'Pop65plM')[-1] / 100.0 # percentage of male > 65
        
        age_gender_stats = get_gender_split(total_estimate, total_percentage, male_ratio=male_ratio)

        nta_age_gender[key] = age_gender_stats

    return nta_age_gender

In [8]:
nta_demographics_file = '../nta/demo_2021acs5yr_nta.xlsx'

df = pd.read_excel(nta_demographics_file)
# NTA_ID = 'BK0101'
NTA_ID = df['GeoID'].unique()

In [9]:
age_mapping = {'U19': ['PopU5', 'Pop5t9', 'Pop10t14', 'Pop15t19'], '20t29': ['Pop20t24', 'Pop25t29'],  
                '30t39': ['Pop30t34', 'Pop35t39'], '40t49': ['Pop40t44', 'Pop45t49'], 
                '50t64': ['Pop50t54', 'Pop55t59', 'Pop60t64'], '65A': ['Pop65t69', 'Pop70t74', 'Pop75t79','Pop80t84', 'Pop85pl']}

# age data
NTA_ID = 'BK0101'
nta_demographics_file = '../nta/demo_2021acs5yr_nta.xlsx'
df = pd.read_excel(nta_demographics_file)
nta_age_gender = get_nta_age_gender(df, NTA_ID, age_mapping)

  male_estimate = int(male_ratio*total_estimate)


In [10]:
nta_age_gender

{'U19': {'estimate': {'male': 2445, 'female': 2496},
  'percentage': {'male': array([6.534]), 'female': array([6.666])}},
 '20t29': {'estimate': {'male': 3327, 'female': 3396},
  'percentage': {'male': array([8.91]), 'female': array([9.09])}},
 '30t39': {'estimate': {'male': 5728, 'female': 5845},
  'percentage': {'male': array([15.2955]), 'female': array([15.6045])}},
 '40t49': {'estimate': {'male': 2833, 'female': 2891},
  'percentage': {'male': array([7.524]), 'female': array([7.676])}},
 '50t64': {'estimate': {'male': 2300, 'female': 2347},
  'percentage': {'male': array([6.138]), 'female': array([6.262])}},
 '65A': {'estimate': {'male': 1736, 'female': 2174},
  'percentage': {'male': array([4.6176]), 'female': array([5.7824])}}}

In [46]:
import pandas as pd
NTA_ID = df['GeoID'].unique()
# Initialize an empty DataFrame
dfa = pd.DataFrame()

# Iterate over the dictionary
nta_age_gender_df = []

for n in NTA_ID:
    try:
        nta_age_gender = get_nta_age_gender(df, n, age_mapping)
        nta_df = pd.DataFrame()
        for key, value in nta_age_gender.items():
            # Create a temporary DataFrame for each key-value pair
            temp_df = pd.DataFrame(value['estimate'], index=[0])
            # temp_df['area'] = n
            temp_df = temp_df.T
            temp_df.columns = [key]
            
            # Append the temporary DataFrame to the main DataFrame
            nta_df = pd.concat([nta_df, temp_df], axis=1)
        nta_df['area'] = n
        dfa = pd.concat([dfa, nta_df])
    except:
        pass
# Reset the index and rename the index column
dfa = dfa.reset_index()
dfa = dfa.rename(columns={'index': 'gender'})
dfa.set_index('area', inplace=True)
dfa.reset_index(inplace=True)
dfa = dfa.fillna(0)
dfa.to_pickle('gender.pkl')
dfa

  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = in

Unnamed: 0,area,gender,U19,20t29,30t39,40t49,50t64,65A
0,BK0101,male,2445,3327,5728,2833,2300,1736
1,BK0101,female,2496,3396,5845,2891,2347,2174
2,BK0102,male,5857,6867,8808,3493,3115,2297
3,BK0102,female,6321,7410,9505,3771,3363,2509
4,BK0103,male,13202,2549,2218,1741,2165,1835
...,...,...,...,...,...,...,...,...
399,SI9561,female,52,30,32,18,4,7
400,BX2891,male,9,7,0,0,4,1
401,BX2891,female,12,8,0,0,5,6
402,MN1191,male,19,36,84,71,80,91


In [13]:
# read an npy file
import numpy as np
import pandas as pd
file = np.load("/Users/shashankkumar/Documents/AgentTorch_Official/AgentTorch/AgentTorch/helpers/census_data/nyc/generate_data/all_nta_agents.npy", allow_pickle=True)
file_dict = file.item()
# df = pd.DataFrame.from_dict(dict(file_dict), orient='index')
len(file_dict.keys())

215

In [14]:
area_list = list(file_dict.keys())

In [None]:
area_list

In [25]:
file_dict['BK0101']['age_gender_prob'][1].item()

0.07200000000000001

In [14]:
dataFrames = {'age_gender_prob': pd.DataFrame(columns=['nta_id', 'U19', '20t29', '30t39', '40t49', '50t64', '65A']),
              'race_prob': pd.DataFrame(columns=['nta_id', 'hispanic', 'white', 'black', 'native', 'other', 'asian'])}
dicts = {'age_gender_prob': {},
         'race_prob': {}}
for d in file_dict.keys():
    try:
        in_dict_keys = file_dict[d].keys()
        
        for k in in_dict_keys:
            if isinstance(file_dict[d][k], list):
                if k == 'age_gender_prob':
                    dicts[k][d] = file_dict[d][k]
                    
                elif k == 'race_prob':
                    dicts[k][d] = file_dict[d][k]
    except:
        pass

    

In [45]:
dicts['race_prob'].keys()
df_r = pd.DataFrame.from_dict(dicts['race_prob'], columns=['hispanic', 'white', 'black', 'native', 'other', 'asian'],orient='index')
df_r.index.name = 'area'
df_r.reset_index(inplace=True)
df_r = df_r.melt(id_vars='area', value_vars=['hispanic', 'white', 'black', 'native', 'other', 'asian'], var_name='ethnicity', value_name='value', ignore_index=True)
df_r = df_r.sort_values(by=['area'], ignore_index=True)
columns_ag = ['U19M','U19F', '20t29M', '20t29F','30t39M','30t39F', '40t49M','40t49F', '50t64M','50t64F', '65AM','65AF'  ]
for i in range(12):
        df_r[f'{columns_ag[i]}'] = df_r['value']
df_r['U19'] = df_r['U19M'] + df_r['U19F']
df_r['20t29'] = df_r['20t29M'] + df_r['20t29F']
df_r['30t39'] = df_r['30t39M'] + df_r['30t39F']
df_r['40t49'] = df_r['40t49M'] + df_r['40t49F']
df_r['50t64'] = df_r['50t64M'] + df_r['50t64F']
df_r['65A'] = df_r['65AM'] + df_r['65AF']
df_r = df_r.drop(columns=['U19M','U19F', '20t29M', '20t29F','30t39M','30t39F', '40t49M','40t49F', '50t64M','50t64F', '65AM','65AF'])
df_r = df_r.drop(columns=['value'])
df_r.to_pickle('ethnicity.pkl')
df_r

Unnamed: 0,area,ethnicity,U19,20t29,30t39,40t49,50t64,65A
0,BK0101,hispanic,0.294,0.294,0.294,0.294,0.294,0.294
1,BK0101,other,0.124,0.124,0.124,0.124,0.124,0.124
2,BK0101,black,0.050,0.050,0.050,0.050,0.050,0.050
3,BK0101,native,0.000,0.000,0.000,0.000,0.000,0.000
4,BK0101,asian,0.078,0.078,0.078,0.078,0.078,0.078
...,...,...,...,...,...,...,...,...
1279,SI9592,native,0.000,0.000,0.000,0.000,0.000,0.000
1280,SI9592,white,0.108,0.108,0.108,0.108,0.108,0.108
1281,SI9592,hispanic,0.570,0.570,0.570,0.570,0.570,0.570
1282,SI9592,other,0.000,0.000,0.000,0.000,0.000,0.000


In [22]:
dfh = pd.read_excel('/Users/shashankkumar/Documents/AgentTorch_Official/AgentTorch/AgentTorch/helpers/census_data/nyc/nta/soc_2021acs5yr_nta.xlsx')

In [38]:
df_g = dfh[['GeoID','Borough']]
df_g['region'] = 'NYC'
df_g['super_area_code'] = df_g['Borough']
df_g['super_area_code'] = df_g['super_area_code'].map({'Manhattan':1, 'Bronx':2, 'Brooklyn':3, 'Queens':4, 'Staten Island':5})
df_g.rename(columns={'GeoID':'area', 'Borough':'super_area_name'}, inplace=True)
df_g

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['region'] = 'NYC'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['super_area_code'] = df_g['Borough']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['super_area_code'] = df_g['super_area_code'].map({'Manhattan':1, 'Bronx':2, 'Brooklyn':3, 'Queens':4, 'Staten Island':5})
A value is tr

Unnamed: 0,area,super_area_name,region,super_area_code
0,BK0101,Brooklyn,NYC,3
1,BK0102,Brooklyn,NYC,3
2,BK0103,Brooklyn,NYC,3
3,BK0104,Brooklyn,NYC,3
4,BK0201,Brooklyn,NYC,3
...,...,...,...,...
257,SI0291,Staten Island,NYC,5
258,SI0391,Staten Island,NYC,5
259,SI9591,Staten Island,NYC,5
260,SI9592,Staten Island,NYC,5


In [34]:
df_housing = dfh[['GeoID','HHPopE','Rshp_ChE','HH1E']]
df_housing.rename(columns={'GeoID':'area', 'HHPopE':'people_num', 'Rshp_ChE':'children_num', 'HH1E':'household_num'}, inplace=True)
df_housing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_housing.rename(columns={'GeoID':'area', 'HHPopE':'people_num', 'Rshp_ChE':'children_num', 'HH1E':'household_num'}, inplace=True)


Unnamed: 0,area,people_num,children_num,household_num
0,BK0101,37213,6226,17487
1,BK0102,63079,14809,26955
2,BK0103,46758,26853,10613
3,BK0104,52394,11856,22665
4,BK0201,22768,4499,11857
...,...,...,...,...
257,SI0291,0,0,0
258,SI0391,23,5,9
259,SI9591,0,0,0
260,SI9592,260,90,66


In [None]:
{'nta_id': 'BK0101',
 'num_agents': 37518,
 'race_prob': [0.147, 0.727, 0.025, 0.0, 0.06199999999999999, 0.039],
 'age_gender_prob': [0.06,
  array([0.072]),
  0.08,
  array([0.1]),
  0.15,
  array([0.159]),
  0.07,
  array([0.082]),
  0.06,
  array([0.064]),
  0.04,
  array([0.064])],
 'education_prob': [0.18614058906652628,
  0.48399878085949405,
  0.18192901277326756,
  0.14793161730071208],
 'insurance_employ_prob': [0.749547983310153,
  0.06442976356050069,
  0.05027816411682893,
  0.016307371349095965,
  0.11001390820584145,
  0.009422809457579973]}

In [None]:
input_data = {
    {'nta_id': 'BK0101',    
    'probabilities': {
            'ethnicity': {'hispanic': 0.147, 'white': 0.727, 'black': 0.025, 'native': 0.0, 'other': 0.06199999999999999, 'asian': 0.039},
            'age':{
                {'U19':{'male': 0.072, 'female': 0.08},
                '20t29':{'male': 0.1, 'female': 0.15},
                '30t39':{'male': 0.07, 'female': 0.07},
                '40t49': {'male': 0.082, 'female': 0.082},
                '50t64': {'male': 0.06, 'female': 0.064},
                '65A': {'male': 0.04, 'female': 0.064}
                }
            }
            
        }}
}

In [17]:
input_mapping = {
    'race': ['hispanic', 'white', 'black', 'native', 'other', 'asian'],
    'age': ['U19', '20t29', '30t39', '40t49', '50t64', '65A'],
    'gender': ['male', 'female']
}

In [18]:
len(input_mapping['age'])

6

In [19]:
def get_index(value, age_ranges):
    try:
        return age_ranges.index(value)
    except ValueError:
        return "Value not found in the list"

age_ranges = ['U19', '20t29', '30t39', '40t49', '50t64', '65A']

print(get_index('30t39', age_ranges))  # Output: 2
print(get_index('70t79', age_ranges)) 

2
Value not found in the list


In [20]:
age_index = get_index('U19', input_mapping['age'])

In [21]:
age_index

0

In [26]:
import random

value = random.randint(0, 30000)

print(value)

22958
