In [4]:
import pandas as pd
from collections import defaultdict
import os
import pickle
from tqdm import tqdm

In [6]:
def add_to_data_graph(df):
    industry_state_dict = defaultdict(list)
    nodes = []
    for index, row in df.iterrows():
        # if row['AGE'] >= 25 and row['AGE'] < 64 and row['is_full_time_full_year'] and row['winsor99']:    # filter by age above 18
        id = str(row['YEAR']) + '-' + str(row['SERIAL']) + '-' + str(row['PERNUM'])
        state_industry = str(row['STATECENSUS']) + '-' + str(row['IND'])
            
        entry = {'stateXindustry': state_industry, 'id': id, 'labor_income': row['labor_income'], 'capital_income': row['capital_income'], 'age': row['AGE'], 'full_time_full_year': row['is_full_time_full_year'], 'bottom_99': row['winsor99']}
        industry_state_dict[state_industry].append(entry)

    return industry_state_dict


def process_data(df, dir, filename, save=False):
    industry_state_dict = add_to_data_graph(df)
    
    if save:
        dict_filename = os.path.join(dir, f"{filename}.pkl")
        with open(dict_filename, 'wb') as f:
            pickle.dump(industry_state_dict, f)

    return industry_state_dict

In [32]:
df = pd.read_csv('./data/cps_data/2000.csv')

In [33]:
process_data(df, './data/cps_data/', '2000_dict', save=True)

Processing IPUMS data (might take a while)

Saving results...


defaultdict(list,
            {'11-651': [{'stateXindustry': '11-651',
               'id': '2001-3-1',
               'labor_income': 37108.625,
               'capital_income': 0.0},
              {'stateXindustry': '11-651',
               'id': '2001-352-1',
               'labor_income': 35624.28,
               'capital_income': 219.90296296296293},
              {'stateXindustry': '11-651',
               'id': '2001-352-2',
               'labor_income': 32655.59,
               'capital_income': 219.90296296296293},
              {'stateXindustry': '11-651',
               'id': '2001-1424-1',
               'labor_income': 44530.35,
               'capital_income': 4.398059259259258},
              {'stateXindustry': '11-651',
               'id': '2001-1525-2',
               'labor_income': 51952.075,
               'capital_income': 2097.9850091660005}],
             '11-732': [{'stateXindustry': '11-732',
               'id': '2001-4-1',
               'labor_income': 106

In [7]:
# for each year dictionary
# run through and process data into a dictionary
# pickle each dictionary and save them 
MIN_YEAR = 1975
MAX_YEAR = 2022
for year in tqdm(range(MIN_YEAR, MAX_YEAR + 1), desc="Processing years"):
    df = pd.read_csv(f'./data/cps_data/{year}.csv')
    process_data(df, './data/cps_data/', f'{year}_dict', save=True)
    


Processing years: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [03:07<00:00,  3.91s/it]
