In [13]:
# This script generates PUMS household and person files in the format required for VisionEval.

In [6]:
import pandas as pd
import csv

In [7]:
# Input and output locations used by the rest of the notebook.

# Standard VisionEval PUMS files (used as format templates)
_ve_extdata_url = r'https://raw.githubusercontent.com/VisionEval/VisionEval/master/sources/modules/VESimHouseholds/inst/extdata'
pums_hh_raw = _ve_extdata_url + r'/pums_households.csv'
pums_person_raw = _ve_extdata_url + r'/pums_persons.csv'

# PSRC data (outputs of populationsim): standard seed records and group quarters (gq) seed records
_psrc_keep_dir = r'R:\e2projects_two\SyntheticPopulation_2018\keep'
_psrc_seed_dir = _psrc_keep_dir + r'\2018\populationsim_files\data'
_psrc_gq_dir = _psrc_keep_dir + r'\group_quarters\input_files'

psrc_pums_hh_raw = _psrc_seed_dir + r'\seed_households.csv'
psrc_pums_hh_raw_gq = _psrc_gq_dir + r'\seed_households_gq.csv'

psrc_pums_person_raw = _psrc_seed_dir + r'\seed_persons.csv'
psrc_pums_person_raw_gq = _psrc_gq_dir + r'\seed_persons_gq.csv'

# Output: formatted files written by the household and person cells
_output_dir = r'C:\Workspace\VisionEval\input_creation\pums_data\formatted'
output_pums_hh = _output_dir + r'\pums_households.csv'
output_pums_person = _output_dir + r'\pums_persons.csv'

# Household

In [8]:
# Read the reference household and person PUMS tables published in the
# standard VisionEval repo.
df_template_hh = pd.read_csv(pums_hh_raw)
df_template_person = pd.read_csv(pums_person_raw)

# Attach household attributes to each person record (left join on SERIALNO,
# so every person row is kept).
# NOTE(review): df_template is not referenced again in the visible cells --
# presumably kept for manual inspection of the target format; confirm.
df_template = pd.merge(
    df_template_person,
    df_template_hh,
    how='left',
    on='SERIALNO',
)

In [9]:
# Build the VisionEval-formatted household file from the PSRC seed households
# (outputs of populationsim) plus the group quarters (GQ) seed households.
df_psrc_pums_hh = pd.read_csv(psrc_pums_hh_raw)
df_psrc_pums_hh_gq = pd.read_csv(psrc_pums_hh_raw_gq)

# Assign fixed attribute values to every group quarters record.
df_psrc_pums_hh_gq['TYPE'] = 3    # becomes UNITTYPE 2 after the -1 offset applied below
df_psrc_pums_hh_gq['BLD'] = 2
df_psrc_pums_hh_gq['HINCP'] = 0

# SERIALNO in PSRC is not unique, but hhnum is; use this instead as a unique id field
df_psrc_pums_hh['SERIALNO'] = df_psrc_pums_hh['hhnum'].astype('int').astype('str')

# For group quarters assign a unique id outside range of standard records
gq_id_offset = 9000000
df_psrc_pums_hh_gq['hh_id'] = df_psrc_pums_hh_gq['hh_id'] + gq_id_offset
df_psrc_pums_hh_gq['SERIALNO'] = df_psrc_pums_hh_gq['hh_id'].astype('int').astype('str')

# Remap column names to match VisionEval format
# Defined as follows: https://github.com/VisionEval/VisionEval/blob/master/sources/modules/VESimHouseholds/inst/extdata/pums_households.txt
col_map = {'SERIALNO': 'SERIALNO',
    'PUMA': 'PUMA5',
    'WGTP':'HWEIGHT',
    'TYPE': 'UNITTYPE',
    'NP': 'PERSONS',
    'BLD': 'BLDGSZ',
    'HINCP': 'HINC'}

# Keep only the mapped columns and rename them, for both standard and group
# quarters records.
df_psrc_pums_hh = df_psrc_pums_hh[list(col_map.keys())]
df_psrc_pums_hh = df_psrc_pums_hh.rename(columns=col_map)

df_psrc_pums_hh_gq = df_psrc_pums_hh_gq[list(col_map.keys())]
df_psrc_pums_hh_gq = df_psrc_pums_hh_gq.rename(columns=col_map)

# Combine both sources into a single table. DataFrame.append was removed in
# pandas 2.0, so use pd.concat; ignore_index gives the combined table a unique
# row index so the .loc assignment below is unambiguous.
df_psrc_pums_hh = pd.concat([df_psrc_pums_hh, df_psrc_pums_hh_gq], ignore_index=True)

# Unit Type is offset by 1 in the VisionEval format
df_psrc_pums_hh['UNITTYPE'] = df_psrc_pums_hh['UNITTYPE'] - 1
string_cols = ['SERIALNO','PUMA5','BLDGSZ']

# Convert income to 1999 $
conversion = 0.580    # CPI comparison between 1999 and 2018
df_psrc_pums_hh['HINC'] = df_psrc_pums_hh['HINC']*conversion

# Cast to int (truncates the scaled income), then store the id/code columns as strings.
df_psrc_pums_hh['HINC'] = df_psrc_pums_hh['HINC'].astype('int')
df_psrc_pums_hh['BLDGSZ'] = df_psrc_pums_hh['BLDGSZ'].astype('int')
df_psrc_pums_hh[string_cols] = df_psrc_pums_hh[string_cols].astype('str')

# Add leading 0 to BLDGSZ so it is a 2-character code
df_psrc_pums_hh['BLDGSZ'] = df_psrc_pums_hh['BLDGSZ'].str.zfill(2)
# If BLDGSZ == 0, replace it with a blank (two spaces) code
df_psrc_pums_hh.loc[df_psrc_pums_hh['BLDGSZ'] == '00', 'BLDGSZ'] = "  "

# Left-pad SERIALNO with zeros to at least 7 characters (longer ids are not truncated)
df_psrc_pums_hh['SERIALNO'] = df_psrc_pums_hh['SERIALNO'].str.zfill(7)

# Quote every non-numeric field so the zero-padded string codes are written as text.
df_psrc_pums_hh.to_csv(output_pums_hh, index=False,
                       quotechar='"', quoting=csv.QUOTE_NONNUMERIC)


# Person

In [10]:
# Build the VisionEval-formatted person file from the PSRC seed persons
# (outputs of populationsim) plus the group quarters (GQ) seed persons.
df_psrc_pums_person = pd.read_csv(psrc_pums_person_raw)
df_psrc_pums_person_gq = pd.read_csv(psrc_pums_person_raw_gq)
# Defined here: https://github.com/VisionEval/VisionEval/blob/master/sources/modules/VESimHouseholds/inst/extdata/pums_persons.txt

# Build SERIALNO the same way as in the household table: hhnum for standard
# records, hh_id shifted by a fixed offset for group quarters records.
df_psrc_pums_person['SERIALNO'] = df_psrc_pums_person['hhnum'].astype('int').astype('str')
gq_id_offset = 9000000
df_psrc_pums_person_gq['hh_id'] = df_psrc_pums_person_gq['hh_id'] + gq_id_offset
df_psrc_pums_person_gq['SERIALNO'] = df_psrc_pums_person_gq['hh_id'].astype('int').astype('str')

# Remap column names to match VisionEval format
col_map = {'SERIALNO': 'SERIALNO',
    'AGEP': 'AGE',
    'WRK':'WRKLYR',
    'PINCP': 'INCTOT'}
df_psrc_pums_person = df_psrc_pums_person[list(col_map.keys())]
df_psrc_pums_person_gq = df_psrc_pums_person_gq[list(col_map.keys())]

df_psrc_pums_person = df_psrc_pums_person.rename(columns=col_map)
df_psrc_pums_person_gq = df_psrc_pums_person_gq.rename(columns=col_map)

# Combine both sources into a single table. DataFrame.append was removed in
# pandas 2.0, so use pd.concat; ignore_index gives the combined table a unique
# row index so the .loc assignment below is unambiguous.
df_psrc_pums_person = pd.concat([df_psrc_pums_person, df_psrc_pums_person_gq], ignore_index=True)

# Convert income to 1999 $
conversion = 0.580    # same factor as applied to household income
df_psrc_pums_person['INCTOT'] = df_psrc_pums_person['INCTOT']*conversion

# Missing values become 0 before the integer cast (which truncates scaled income).
int_cols = ['AGE','WRKLYR','INCTOT']
df_psrc_pums_person[int_cols] = df_psrc_pums_person[int_cols].fillna(0).astype('int')

# If WRKLYR == 0, write the literal text NA (not 0) as personal income.
# Cast to object first so the int column is not implicitly upcast when the
# string is assigned (newer pandas warns on incompatible-dtype assignment).
df_psrc_pums_person['INCTOT'] = df_psrc_pums_person['INCTOT'].astype(object)
df_psrc_pums_person.loc[df_psrc_pums_person['WRKLYR'] == 0, 'INCTOT'] = 'NA'

# Left-pad SERIALNO with zeros to at least 7 characters, matching the household file
df_psrc_pums_person['SERIALNO'] = df_psrc_pums_person['SERIALNO'].str.zfill(7)

df_psrc_pums_person.to_csv(output_pums_person, index=False)

In [11]:
# Check that person and household records match up: every SERIALNO must appear
# in both tables. Comparing the id sets (rather than only the number of unique
# ids) also catches the case where the counts agree but the ids differ.
set(df_psrc_pums_person['SERIALNO']) == set(df_psrc_pums_hh['SERIALNO'])

True

In [12]:
# Sanity check: count person records for every WRKLYR x UNITTYPE pair.
# With WRKLYR in [0,1,2] and UNITTYPE in [0,1,2] there should be 9
# combinations, and each of them should have records.
df = pd.merge(
    df_psrc_pums_person,
    df_psrc_pums_hh,
    how='left',
    on='SERIALNO',
)
df.groupby(by=['WRKLYR', 'UNITTYPE']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,SERIALNO,AGE,INCTOT,PUMA5,HWEIGHT,PERSONS,BLDGSZ,HINC
WRKLYR,UNITTYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,45003,45003,45003,45003,45003,45003,45003,45003
0,1,2327,2327,2327,2327,2327,2327,2327,2327
0,2,708,708,708,708,708,708,708,708
1,0,86390,86390,86390,86390,86390,86390,86390,86390
1,1,170,170,170,170,170,170,170,170
1,2,2743,2743,2743,2743,2743,2743,2743,2743
2,0,50893,50893,50893,50893,50893,50893,50893,50893
2,1,30,30,30,30,30,30,30,30
2,2,5389,5389,5389,5389,5389,5389,5389,5389
