In [1]:
import numpy as np
import pandas as pd

Generate Age, Gender, Race for all agents in an NTA

In [4]:
def compute_stats(nta_df, attr, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    if filter:
        filter_vars = ['E', 'P']
        estimation_vars = filter_vars
    stats = [nta_df['{}{}'.format(attr, var)].values for var in estimation_vars]
    
    return stats

def merge_nta_stats(nta_df, attr_vals):
    '''estimate and percentage'''
    args = [compute_stats(nta_df, attr, filter=True) for attr in attr_vals]
    ret_e, ret_p = 0, 0
    for val in args:
        e, p = val
        ret_e += e
        ret_p += p
    
    return [ret_e, ret_p]

def get_cumulative_stats(nta_df, attr_vals):
    total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
    return total_estimate, total_percentage

def get_gender_split(total_estimate, total_percentage, male_ratio):
    total_estimate = total_estimate.item()

    male_estimate = int(male_ratio*total_estimate)
    female_estimate = total_estimate - male_estimate

    male_percentage = male_ratio*total_percentage.item()
    female_percentage = total_percentage - male_percentage

    return_dict = {'estimate': {'male': male_estimate, 'female': female_estimate}, 'percentage': {'male': male_percentage, 'female': female_percentage}}

    return return_dict

    # return {'male': [male_estimate, male_percentage], 'female': [female_estimate, female_percentage]}

In [5]:
def get_nta_race_legacy(df, nta_id, race_mapping):
    nta_df = df[df['GeoID'] == nta_id]

    nta_race = {}
    for key in race_mapping:
        nta_race[key] = {'percentage': 0.0, 'estimate': 0.0}

        for attr_value in race_mapping[key]:
            estimate, percentage = compute_stats(nta_df, attr_value, filter=True)

            nta_race[key]['percentage'] +=  max(0, percentage[0])
            nta_race[key]['estimate'] +=  max(0, estimate[0])

    return nta_race

In [6]:
def get_nta_race(df, nta_id, race_mapping):
    nta_df = df[df['GeoID'] == nta_id]

    nta_race = {}
    for key in race_mapping:
        nta_race[key] = {}

        estimate, percentage = merge_nta_stats(nta_df, race_mapping[key])

        nta_race[key]['estimate'] = max(0, estimate[0])
        nta_race[key]['percentage'] = max(0, percentage[0])

    return nta_race

In [7]:
def get_nta_age_gender(df, nta_id, age_mapping, male_ratio=0.508):
    '''estimate, percentage'''
    nta_df = df[df['GeoID'] == nta_id]

    nta_age_gender = {}
    for key in age_mapping:
        attr_vals = age_mapping[key]

        total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
        if key == 'U19':
            male_ratio = compute_stats(nta_df, 'PopU18M')[-1] / 100.0 # percentage of male < 19
        if key == '65A':
            male_ratio = compute_stats(nta_df, 'Pop65plM')[-1] / 100.0 # percentage of male > 65
        
        age_gender_stats = get_gender_split(total_estimate, total_percentage, male_ratio=male_ratio)

        nta_age_gender[key] = age_gender_stats

    return nta_age_gender

In [6]:
nta_demographics_file = '../nta/demo_2021acs5yr_nta.xlsx'

df = pd.read_excel(nta_demographics_file)
# NTA_ID = 'BK0101'
NTA_ID = df['GeoID'].unique()

In [None]:
race_mapping = {'hispanic': ['Hsp1'], 'white': ['WtNH'], 'black': ['AsnNH'], 'black': ['BlNH'], 
                'native': ['NHPINH', 'AIANNH'], 'other': ['OthNH', 'Rc2plNH'], 'asian': ['AsnNH']}

# race data
nta_race_df = []
for n in NTA_ID:
    nta_race = get_nta_race(df, n, race_mapping)
    ser = pd.Series(nta_race)
    nta_race_im_df = pd.DataFrame({"index":ser.index, "column2":ser.values})
    nta_race_im_df['area'] = n
    nta_race_df.append(nta_race_im_df)
    
nta_race_df = pd.concat(nta_race_df)
# nta_race_df.index = range(1, len(nta_race_df) + 1)  # Start from 1 for clarity

# Now the DataFrame uses serial numbers as the index
nta_race_df.head(19)

In [None]:
nta_race_df.groupby('area','ethnicity'
                    )
nta_race_df.set_index('area', inplace=True)
nta_race_df

In [None]:
nta_race = pd.DataFrame(nta_race)


In [None]:
nta_race

In [8]:
age_mapping = {'U19': ['PopU5', 'Pop5t9', 'Pop10t14', 'Pop15t19'], '20t29': ['Pop20t24', 'Pop25t29'],  
                '30t39': ['Pop30t34', 'Pop35t39'], '40t49': ['Pop40t44', 'Pop45t49'], 
                '50t64': ['Pop50t54', 'Pop55t59', 'Pop60t64'], '65A': ['Pop65t69', 'Pop70t74', 'Pop75t79','Pop80t84', 'Pop85pl']}

# age data
NTA_ID = 'BK0101'
nta_demographics_file = '../nta/demo_2021acs5yr_nta.xlsx'
df = pd.read_excel(nta_demographics_file)
nta_age_gender = get_nta_age_gender(df, NTA_ID, age_mapping)

  male_estimate = int(male_ratio*total_estimate)


In [None]:
nta_age_gender

In [None]:
import pandas as pd
NTA_ID = df['GeoID'].unique()
# Initialize an empty DataFrame
dfa = pd.DataFrame()

# Iterate over the dictionary
nta_age_gender_df = []

for n in NTA_ID:
    try:
        nta_age_gender = get_nta_age_gender(df, n, age_mapping)
        nta_df = pd.DataFrame()
        for key, value in nta_age_gender.items():
            # Create a temporary DataFrame for each key-value pair
            temp_df = pd.DataFrame(value['percentage'], index=[0])
            # temp_df['area'] = n
            temp_df = temp_df.T
            temp_df.columns = [key]
            
            # Append the temporary DataFrame to the main DataFrame
            nta_df = pd.concat([nta_df, temp_df], axis=1)
        nta_df['area'] = n
        dfa = pd.concat([dfa, nta_df])
    except:
        pass
# Reset the index and rename the index column
dfa = dfa.reset_index()
dfa = dfa.rename(columns={'index': 'gender'})
dfa.set_index('area', inplace=True)
dfa.reset_index(inplace=True)
dfa = dfa.fillna(0)



  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = int(male_ratio*total_estimate)
  male_estimate = in

Unnamed: 0,area,gender,U19,20t29,30t39,40t49,50t64,65A
0,BK0101,male,6.5340,8.9100,15.2955,7.5240,6.1380,4.6176
1,BK0101,female,6.6660,9.0900,15.6045,7.6760,6.2620,5.7824
2,BK0102,male,9.2833,10.8225,13.9009,5.5315,4.9062,3.5850
3,BK0102,female,10.0167,11.6775,14.9991,5.9685,5.2938,3.9150
4,BK0103,male,28.0371,5.3742,4.7151,3.7011,4.6137,3.8927
...,...,...,...,...,...,...,...,...
399,SI9561,female,9.7524,5.6760,5.9598,3.3024,0.6708,0.0000
400,BX2891,male,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
401,BX2891,female,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
402,MN1191,male,0.0000,1.9706,4.5591,3.9078,4.3754,0.0000


In [None]:
p = 0
for key in nta_race:
    p += nta_race[key]['estimate']
print(p)

In [None]:
val = 0
for key in nta_age_gender:
    val = val + nta_age_gender[key]['male'][0] + nta_age_gender[key]['female'][0]

print(val)

In [13]:
# read an npy file
import numpy as np
import pandas as pd
file = np.load("/Users/shashankkumar/Documents/AgentTorch_Official/AgentTorch/AgentTorch/helpers/census_data/nyc/generate_data/all_nta_agents.npy", allow_pickle=True)
file_dict = file.item()
# df = pd.DataFrame.from_dict(dict(file_dict), orient='index')
file_dict.keys()

dict_keys(['BK0101', 'BK0102', 'BK0103', 'BK0104', 'BK0201', 'BK0202', 'BK0203', 'BK0204', 'BK0301', 'BK0302', 'BK0401', 'BK0402', 'BK0501', 'BK0502', 'BK0503', 'BK0504', 'BK0505', 'BK0601', 'BK0602', 'BK0701', 'BK0702', 'BK0703', 'BK0801', 'BK0802', 'BK0901', 'BK0902', 'BK1001', 'BK1002', 'BK1101', 'BK1102', 'BK1103', 'BK1201', 'BK1202', 'BK1203', 'BK1204', 'BK1301', 'BK1302', 'BK1303', 'BK1401', 'BK1402', 'BK1403', 'BK1501', 'BK1502', 'BK1503', 'BK1601', 'BK1602', 'BK1701', 'BK1702', 'BK1703', 'BK1704', 'BK1801', 'BK1802', 'BK1803', 'BX0101', 'BX0102', 'BX0201', 'BX0202', 'BX0301', 'BX0302', 'BX0303', 'BX0401', 'BX0402', 'BX0403', 'BX0501', 'BX0502', 'BX0503', 'BX0601', 'BX0602', 'BX0603', 'BX0701', 'BX0702', 'BX0703', 'BX0801', 'BX0802', 'BX0803', 'BX0901', 'BX0902', 'BX0903', 'BX0904', 'BX1001', 'BX1002', 'BX1003', 'BX1004', 'BX1101', 'BX1102', 'BX1103', 'BX1104', 'BX1201', 'BX1202', 'BX1203', 'MN0101', 'MN0102', 'MN0201', 'MN0202', 'MN0203', 'MN0301', 'MN0302', 'MN0303', 'MN0401',

In [14]:
dataFrames = {'age_gender_prob': pd.DataFrame(columns=['nta_id', 'U19', '20t29', '30t39', '40t49', '50t64', '65A']),
              'race_prob': pd.DataFrame(columns=['nta_id', 'hispanic', 'white', 'black', 'native', 'other', 'asian'])}
dicts = {'age_gender_prob': {},
         'race_prob': {}}
for d in file_dict.keys():
    try:
        in_dict_keys = file_dict[d].keys()
        
        for k in in_dict_keys:
            if isinstance(file_dict[d][k], list):
                if k == 'age_gender_prob':
                    dicts[k][d] = file_dict[d][k]
                    
                elif k == 'race_prob':
                    dicts[k][d] = file_dict[d][k]
    except:
        pass

    

In [18]:
dicts['race_prob'].keys()
df_r = pd.DataFrame.from_dict(dicts['race_prob'], columns=['hispanic', 'white', 'black', 'native', 'other', 'asian'],orient='index')
df_r.index.name = 'area'
df_r.reset_index(inplace=True)
df_r = df_r.melt(id_vars='area', value_vars=['hispanic', 'white', 'black', 'native', 'other', 'asian'], var_name='ethnicity', value_name='value', ignore_index=True)
df_r = df_r.sort_values(by=['area'], ignore_index=True)
columns_ag = ['U19M','U19F', '20t29M', '20t29F','30t39M','30t39F', '40t49M','40t49F', '50t64M','50t64F', '65AM','65AF'  ]
for i in range(12):
        df_r[f'{columns_ag[i]}'] = df_r['value']
df_r

Unnamed: 0,area,ethnicity,value,U19M,U19F,20t29M,20t29F,30t39M,30t39F,40t49M,40t49F,50t64M,50t64F,65AM,65AF
0,BK0101,hispanic,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147,0.147
1,BK0101,other,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062,0.062
2,BK0101,black,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025
3,BK0101,native,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4,BK0101,asian,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039,0.039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,SI9592,native,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1280,SI9592,white,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054,0.054
1281,SI9592,hispanic,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285,0.285
1282,SI9592,other,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [2]:
dfh = pd.read_excel('/Users/shashankkumar/Documents/AgentTorch_Official/AgentTorch/AgentTorch/helpers/census_data/nyc/nta/soc_2021acs5yr_nta.xlsx')

In [5]:
df_g = dfh[['GeoID','Borough']]
df_g['region'] = 'NYC'
df_g['super_area_code'] = df_g['Borough']
df_g['super_area_code'] = df_g['super_area_code'].map({'Manhattan':1, 'Bronx':2, 'Brooklyn':3, 'Queens':4, 'Staten Island':5})
df_g.rename(columns={'GeoID':'area', 'Borough':'super_area_name'}, inplace=True)
df_g.to_pickle('geo.pkl')
df_g

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['region'] = 'NYC'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['super_area_code'] = df_g['Borough']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_g['super_area_code'] = df_g['super_area_code'].map({'Manhattan':1, 'Bronx':2, 'Brooklyn':3, 'Queens':4, 'Staten Island':5})
A value is tr

Unnamed: 0,area,super_area_name,region,super_area_code
0,BK0101,Brooklyn,NYC,3
1,BK0102,Brooklyn,NYC,3
2,BK0103,Brooklyn,NYC,3
3,BK0104,Brooklyn,NYC,3
4,BK0201,Brooklyn,NYC,3
...,...,...,...,...
257,SI0291,Staten Island,NYC,5
258,SI0391,Staten Island,NYC,5
259,SI9591,Staten Island,NYC,5
260,SI9592,Staten Island,NYC,5


In [4]:
df_housing = dfh[['GeoID','HHPopE','Rshp_ChE','HH1E']]
df_housing.rename(columns={'GeoID':'area', 'HHPopE':'people_num', 'Rshp_ChE':'children_num', 'HH1E':'household_num'}, inplace=True)
df_housing.to_pickle('housing.pkl')
df_housing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_housing.rename(columns={'GeoID':'area', 'HHPopE':'people_num', 'Rshp_ChE':'children_num', 'HH1E':'household_num'}, inplace=True)


Unnamed: 0,area,people_num,children_num,household_num
0,BK0101,37213,6226,17487
1,BK0102,63079,14809,26955
2,BK0103,46758,26853,10613
3,BK0104,52394,11856,22665
4,BK0201,22768,4499,11857
...,...,...,...,...
257,SI0291,0,0,0
258,SI0391,23,5,9
259,SI9591,0,0,0
260,SI9592,260,90,66


In [6]:
age_ranges = ['U19', '20t29', '30t39', '40t49', '50t64', '65A']

ages = []

for age_range in age_ranges:
    if age_range == 'U19':
        ages.extend(list(range(0, 19)))
    elif age_range == '65A':
        ages.extend(list(range(65, 101)))  # Assuming 100 as the upper limit
    else:
        start, end = map(int, age_range.split('t'))
        ages.extend(list(range(start, end + 1)))

print(ages)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
