In [125]:
import pandas as pd
import numpy as np
import os

In [126]:
ailabor_root = '/Users/sidsatya/dev/ailabor'
crosswalk_dir = '/data/bls/crosswalks'

In [127]:
# Load the crosswalk data
crosswalk_2000_to_2010 = pd.read_csv(os.path.join(ailabor_root + crosswalk_dir, 'crosswalk_2000_to_2010.csv'))
crosswalk_2010_to_2018 = pd.read_csv(os.path.join(ailabor_root + crosswalk_dir, 'crosswalk_2010_to_2018.csv'))
crosswalk_hybrid_2000_to_2010 = pd.read_csv(os.path.join(ailabor_root + crosswalk_dir, 'crosswalk_hybrid_2000_to_2010.csv'))
crosswalk_hybrid_2010_to_2018 = pd.read_csv(os.path.join(ailabor_root + crosswalk_dir, 'crosswalk_hybrid_2010_to_2018.csv'))

# Load in the 2012 May and 2021 May OES data
oes_2012 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/naics_4digit/nat4d_2012_may.csv'))
oes_2021 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/naics_4digit/nat4d_2021_may.csv'))

  oes_2012 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/naics_4digit/nat4d_2012_may.csv'))
  oes_2021 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/naics_4digit/nat4d_2021_may.csv'))


In [128]:
# Filter OES data and add year
for i, df in enumerate([oes_2012, oes_2021]):
    df.columns = [col.lower() for col in df.columns]
    # Harmonize column names for occupational groups across different file versions.
    if 'group' in df.columns:
        df.rename(columns={'group': 'occ_group'}, inplace=True)
    if 'o_group' in df.columns:
        df.rename(columns={'o_group': 'occ_group'}, inplace=True)
    # filter out rows with 'occ_code' == '00-0000' and 'occ_group' in ['major', 'minor', 'broad']
    mask = ~(df['occ_code'].str.endswith('0')) & ~(df['occ_group'].isin(['major', 'minor', 'broad']))
    df.drop(df.index[~mask], inplace=True)
    # Add year column
    if 'bls_release_year' not in df.columns:
        df['bls_release_year'] = 2012 if i == 0 else 2021

print(f"Loaded {oes_2012.shape[0]} rows from 2012 OES data"
      f" and {oes_2021.shape[0]} rows from 2021 OES data.")


Loaded 39727 rows from 2012 OES data and 36168 rows from 2021 OES data.


Create Parent-Child mappings for 2012 data

In [129]:
soc_2000_codes_with_children = crosswalk_2000_to_2010.groupby(['2000 SOC Code']).agg({'2010 SOC Code': ['unique', 'nunique']}).reset_index()
soc_2000_codes_with_children.columns = ['2000 SOC Code', 'Unique 2010 SOC Codes', 'Num Unique 2010 SOC Codes']
soc_2000_codes_with_children = soc_2000_codes_with_children[soc_2000_codes_with_children['Num Unique 2010 SOC Codes'] > 1]

oes_2012_merged = pd.merge(oes_2012, crosswalk_2000_to_2010, how='left', left_on='occ_code', right_on='2010 SOC Code').dropna(subset=['2000 SOC Code'])

# replace any '**' in tot_emp with 0
oes_2012_merged['tot_emp'] = oes_2012_merged['tot_emp'].replace({'\*\*': 0}, regex=True)
oes_2012_merged['tot_emp'] = oes_2012_merged['tot_emp'].str.replace(',','').astype(float)

within_naics_year_parent_sum = oes_2012_merged.groupby(['bls_release_year', 'naics', '2000 SOC Code'])['tot_emp'].sum().reset_index()

# Merge back to the original OES data
oes_2012_merged = pd.merge(oes_2012_merged, within_naics_year_parent_sum, how='left', on=['bls_release_year', 'naics', '2000 SOC Code'], suffixes=('', '_parent_sum'))
oes_2012_merged['naics_year_parent_emp_share'] = np.where(
    oes_2012_merged['tot_emp_parent_sum'] == 0,
    np.nan,
    oes_2012_merged['tot_emp'] / oes_2012_merged['tot_emp_parent_sum']
)

oes_2012_mapping_cw_2000_to_2010 = oes_2012_merged[['bls_release_year', 'naics', 'naics_title', 'occ_code', 'occ_title', '2000 SOC Code', '2000 SOC Title', 'tot_emp', 'tot_emp_parent_sum', 'naics_year_parent_emp_share']].copy()

In [130]:
oes_2012_mapping_cw_2000_to_2010.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_to_2010.csv'), index=False)
soc_2000_codes_with_children.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_2000_codes_with_children.csv'), index=False)

In [131]:
oes_2012_mapping_cw_2000_to_2010[oes_2012_mapping_cw_2000_to_2010['naics_year_parent_emp_share'] < 1]

Unnamed: 0,bls_release_year,naics,naics_title,occ_code,occ_title,2000 SOC Code,2000 SOC Title,tot_emp,tot_emp_parent_sum,naics_year_parent_emp_share
146,2012,115200,Support Activities for Animal Production,13-1161,Market Research Analysts and Marketing Special...,13-1199,"Business Operations Specialists, All Other*",30.0,170.0,0.176471
148,2012,115200,Support Activities for Animal Production,13-1161,Market Research Analysts and Marketing Special...,27-3031,Public Relations Specialists*,30.0,80.0,0.375000
149,2012,115200,Support Activities for Animal Production,13-1199,"Business Operations Specialists, All Other",13-1199,"Business Operations Specialists, All Other*",140.0,170.0,0.823529
159,2012,115200,Support Activities for Animal Production,27-3031,Public Relations Specialists,27-3031,Public Relations Specialists*,50.0,80.0,0.625000
223,2012,211100,Oil and Gas Extraction,13-1161,Market Research Analysts and Marketing Special...,13-1199,"Business Operations Specialists, All Other*",420.0,5260.0,0.079848
...,...,...,...,...,...,...,...,...,...,...
41195,2012,999300,Local Government (OES Designation),49-9021,"Heating, Air Conditioning, and Refrigeration M...",49-9021,"Heating, Air Conditioning, and Refrigeration M...",4590.0,4620.0,0.993506
41205,2012,999300,Local Government (OES Designation),49-9081,Wind Turbine Service Technicians,49-9099,"Installation, Maintenance, and Repair Workers,...",90.0,3710.0,0.024259
41212,2012,999300,Local Government (OES Designation),49-9099,"Installation, Maintenance, and Repair Workers,...",49-9099,"Installation, Maintenance, and Repair Workers,...",3590.0,3710.0,0.967655
41225,2012,999300,Local Government (OES Designation),51-5112,Printing Press Operators,51-5021,Job Printers*,750.0,830.0,0.903614


Create Parent-Child mappings for 2000 to 2010 hybrid mapping

In [132]:
hybrid_2000_with_children = crosswalk_hybrid_2000_to_2010.groupby('2000 SOC code').agg({'2010 SOC code': ['unique', 'nunique']}).reset_index()
hybrid_2000_with_children.columns = ['2000 SOC Code', 'Unique 2010 SOC Codes', 'Num Unique 2010 SOC Codes']
hybrid_2000_with_children.sort_values(by='Num Unique 2010 SOC Codes', ascending = False)

oes_2012_merged = pd.merge(oes_2012, crosswalk_hybrid_2000_to_2010, how='left', left_on='occ_code', right_on='2000 SOC code').dropna(subset=['2000 SOC code'])

# replace any '**' in tot_emp with 0
oes_2012_merged['tot_emp'] = oes_2012_merged['tot_emp'].replace({'\*\*': 0}, regex=True)
oes_2012_merged['tot_emp'] = oes_2012_merged['tot_emp'].str.replace(',','').astype(float)

within_naics_year_parent_sum = oes_2012_merged.groupby(['bls_release_year', 'naics', '2000 SOC code'])['tot_emp'].sum().reset_index()

# Merge back to the original OES data
oes_2012_merged = pd.merge(oes_2012_merged, within_naics_year_parent_sum, how='left', on=['bls_release_year', 'naics', '2000 SOC code'], suffixes=('', '_parent_sum'))
oes_2012_merged['naics_year_parent_emp_share'] = np.where(
    oes_2012_merged['tot_emp_parent_sum'] == 0,
    np.nan,
    oes_2012_merged['tot_emp'] / oes_2012_merged['tot_emp_parent_sum']
)

oes_2012_mapping_cw_2000_hybrid = oes_2012_merged[['bls_release_year', 'naics', 'naics_title', 'occ_code', 'occ_title', '2000 SOC code', 'SOC 2000 Title', 'tot_emp', 'tot_emp_parent_sum', 'naics_year_parent_emp_share']].copy()
oes_2012_mapping_cw_2000_hybrid.rename(columns={'2000 SOC code': '2000 SOC Code', 'SOC 2000 Title': '2000 SOC Title'}, inplace=True)

In [133]:
oes_2012_mapping_cw_2000_hybrid.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_hybrid.csv'), index=False)
hybrid_2000_with_children.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_hybrid_2000_codes_with_children.csv'), index=False)

In [134]:
oes_2012_mapping_cw_2000_hybrid[oes_2012_mapping_cw_2000_hybrid['naics_year_parent_emp_share'] < 1]

Unnamed: 0,bls_release_year,naics,naics_title,occ_code,occ_title,2000 SOC Code,2000 SOC Title,tot_emp,tot_emp_parent_sum,naics_year_parent_emp_share
110,2012,115100,Support Activities for Crop Production,49-9099,"Installation, Maintenance, and Repair Workers,...",49-9099,"Installation, Maintenance, and Repair Workers,...",50.0,100.0,0.5
111,2012,115100,Support Activities for Crop Production,49-9099,"Installation, Maintenance, and Repair Workers,...",49-9099,"Installation, Maintenance, and Repair Workers,...",50.0,100.0,0.5
122,2012,115100,Support Activities for Crop Production,51-9199,"Production Workers, All Other",51-9199,"Production Workers, All Other*",430.0,860.0,0.5
123,2012,115100,Support Activities for Crop Production,51-9199,"Production Workers, All Other",51-9199,"Production Workers, All Other*",430.0,860.0,0.5
275,2012,211100,Oil and Gas Extraction,43-9199,"Office and Administrative Support Workers, All...",43-9199,"Office and Administrative Support Workers, All...",160.0,320.0,0.5
...,...,...,...,...,...,...,...,...,...,...
36443,2012,999300,Local Government (OES Designation),47-4099,"Construction and Related Workers, All Other",47-4099,"Construction and Related Workers, All Other*",5000.0,10000.0,0.5
36483,2012,999300,Local Government (OES Designation),49-9099,"Installation, Maintenance, and Repair Workers,...",49-9099,"Installation, Maintenance, and Repair Workers,...",3590.0,7180.0,0.5
36484,2012,999300,Local Government (OES Designation),49-9099,"Installation, Maintenance, and Repair Workers,...",49-9099,"Installation, Maintenance, and Repair Workers,...",3590.0,7180.0,0.5
36519,2012,999300,Local Government (OES Designation),51-9199,"Production Workers, All Other",51-9199,"Production Workers, All Other*",680.0,1360.0,0.5


Create Parent-Child mappings for 2021 data

In [135]:
soc_2010_codes_with_children = crosswalk_2010_to_2018.groupby(['2010 SOC Code']).agg({'2018 SOC Code': ['unique', 'nunique']}).reset_index()
soc_2010_codes_with_children.columns = ['2010 SOC Code', 'Unique 2018 SOC Codes', 'Num Unique 2018 SOC Codes']
soc_2010_codes_with_children = soc_2010_codes_with_children[soc_2010_codes_with_children['Num Unique 2018 SOC Codes'] > 1]

oes_2021_merged = pd.merge(oes_2021, crosswalk_2010_to_2018, how='left', left_on='occ_code', right_on='2018 SOC Code').dropna(subset=['2010 SOC Code'])

# replace any '**' in tot_emp with 0
oes_2021_merged['tot_emp'] = oes_2021_merged['tot_emp'].replace({'\*\*': 0}, regex=True)
oes_2021_merged['tot_emp'] = oes_2021_merged['tot_emp'].str.replace(',','').astype(float)

within_naics_year_parent_sum = oes_2021_merged.groupby(['bls_release_year', 'naics', '2010 SOC Code'])['tot_emp'].sum().reset_index()

# Merge back to the original OES data
oes_2021_merged = pd.merge(oes_2021_merged, within_naics_year_parent_sum, how='left', on=['bls_release_year', 'naics', '2010 SOC Code'], suffixes=('', '_parent_sum'))
oes_2021_merged['naics_year_parent_emp_share'] = np.where(
    oes_2021_merged['tot_emp_parent_sum'] == 0,
    np.nan,
    oes_2021_merged['tot_emp'] / oes_2021_merged['tot_emp_parent_sum']
)

oes_2021_mapping_cw_2010_to_2018 = oes_2021_merged[['bls_release_year', 'naics', 'naics_title', 'occ_code', 'occ_title', '2010 SOC Code', '2010 SOC Title', 'tot_emp', 'tot_emp_parent_sum', 'naics_year_parent_emp_share']].copy()

In [136]:
oes_2021_mapping_cw_2010_to_2018.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2010_to_2018.csv'), index=False)
soc_2010_codes_with_children.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_2010_codes_with_children.csv'), index=False)

In [137]:
oes_2021_mapping_cw_2010_to_2018[oes_2021_mapping_cw_2010_to_2018['naics_year_parent_emp_share'] < 1]

Unnamed: 0,bls_release_year,naics,naics_title,occ_code,occ_title,2010 SOC Code,2010 SOC Title,tot_emp,tot_emp_parent_sum,naics_year_parent_emp_share
38,2021,115100,Support Activities for Crop Production,11-3012,Administrative Services Managers,11-3011,Administrative Services Managers (#),110.0,200.0,0.550000
39,2021,115100,Support Activities for Crop Production,11-3013,Facilities Managers,11-3011,Administrative Services Managers (#),90.0,200.0,0.450000
160,2021,115200,Support Activities for Animal Production,19-4012,Agricultural Technicians,19-4011,Agricultural and Food Science Technicians (#),940.0,1050.0,0.895238
161,2021,115200,Support Activities for Animal Production,19-4013,Food Science Technicians,19-4011,Agricultural and Food Science Technicians (#),110.0,1050.0,0.104762
170,2021,115200,Support Activities for Animal Production,39-1014,First-Line Supervisors of Entertainment and Re...,39-1021,First-Line Supervisors of Personal Service Wor...,40.0,210.0,0.190476
...,...,...,...,...,...,...,...,...,...,...
37828,2021,5320A1,"Rental and Leasing Services (5322, 5323, and 5...",15-1299,"Computer Occupations, All Other",15-1199,"Computer Occupations, All Other (#)",120.0,1090.0,0.110092
37848,2021,5320A1,"Rental and Leasing Services (5322, 5323, and 5...",27-4015,Lighting Technicians,27-4099,"Media and Communication Equipment Workers, All...",540.0,630.0,0.857143
37851,2021,5320A1,"Rental and Leasing Services (5322, 5323, and 5...",27-4099,"Media and Communication Equipment Workers, All...",27-4099,"Media and Communication Equipment Workers, All...",90.0,630.0,0.142857
37964,2021,5320A1,"Rental and Leasing Services (5322, 5323, and 5...",51-9161,Computer Numerically Controlled Tool Operators,51-9199,"Production Workers, All Other (#)",40.0,220.0,0.181818


Create Parent-Child mappings for 2010 to 2018 hybrid mapping

In [138]:
hybrid_2010_with_children = crosswalk_hybrid_2010_to_2018.groupby('2010 SOC Code').agg({'2018 SOC Code': ['unique', 'nunique']}).reset_index()
hybrid_2010_with_children.columns = ['2010 SOC Code', 'Unique 2018 SOC Codes', 'Num Unique 2018 SOC Codes']
hybrid_2010_with_children.sort_values(by='Num Unique 2018 SOC Codes', ascending = False)

oes_2021_merged = pd.merge(oes_2012, crosswalk_hybrid_2010_to_2018, how='left', left_on='occ_code', right_on='2018 SOC Code').dropna(subset=['2010 SOC Code'])

# replace any '**' in tot_emp with 0
oes_2021_merged['tot_emp'] = oes_2021_merged['tot_emp'].replace({'\*\*': 0}, regex=True)
oes_2021_merged['tot_emp'] = oes_2021_merged['tot_emp'].str.replace(',','').astype(float)

within_naics_year_parent_sum = oes_2021_merged.groupby(['bls_release_year', 'naics', '2010 SOC Code'])['tot_emp'].sum().reset_index()

# Merge back to the original OES data
oes_2021_merged = pd.merge(oes_2021_merged, within_naics_year_parent_sum, how='left', on=['bls_release_year', 'naics', '2010 SOC Code'], suffixes=('', '_parent_sum'))
oes_2021_merged['naics_year_parent_emp_share'] = np.where(
    oes_2021_merged['tot_emp_parent_sum'] == 0,
    np.nan,
    oes_2021_merged['tot_emp'] / oes_2021_merged['tot_emp_parent_sum']
)

oes_2021_mapping_cw_2010_hybrid = oes_2021_merged[['bls_release_year', 'naics', 'naics_title', 'occ_code', 'occ_title', '2010 SOC Code', '2010 SOC Title', 'tot_emp', 'tot_emp_parent_sum', 'naics_year_parent_emp_share']].copy()

In [139]:
oes_2021_mapping_cw_2010_hybrid.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2010_hybrid.csv'), index=False)
hybrid_2010_with_children.to_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_hybrid_2010_codes_with_children.csv'), index=False)

In [140]:
oes_2021_mapping_cw_2010_hybrid[oes_2021_mapping_cw_2010_hybrid['naics_year_parent_emp_share'] < 1]

Unnamed: 0,bls_release_year,naics,naics_title,occ_code,occ_title,2010 SOC Code,2010 SOC Title,tot_emp,tot_emp_parent_sum,naics_year_parent_emp_share
274,2012,211100,Oil and Gas Extraction,47-5049,"Mining Machine Operators, All Other",47-5049,"Mining Machine Operators, All Other",40.0,290.0,0.137931
277,2012,211100,Oil and Gas Extraction,47-5099,"Extraction Workers, All Other",47-5049,"Mining Machine Operators, All Other",250.0,290.0,0.862069
354,2012,212100,Coal Mining,47-5049,"Mining Machine Operators, All Other",47-5049,"Mining Machine Operators, All Other",610.0,2290.0,0.266376
356,2012,212100,Coal Mining,47-5099,"Extraction Workers, All Other",47-5049,"Mining Machine Operators, All Other",1680.0,2290.0,0.733624
446,2012,212200,Metal Ore Mining,47-5049,"Mining Machine Operators, All Other",47-5049,"Mining Machine Operators, All Other",780.0,1230.0,0.634146
448,2012,212200,Metal Ore Mining,47-5099,"Extraction Workers, All Other",47-5049,"Mining Machine Operators, All Other",450.0,1230.0,0.365854
540,2012,212300,Nonmetallic Mineral Mining and Quarrying,47-5049,"Mining Machine Operators, All Other",47-5049,"Mining Machine Operators, All Other",690.0,1200.0,0.575
543,2012,212300,Nonmetallic Mineral Mining and Quarrying,47-5099,"Extraction Workers, All Other",47-5049,"Mining Machine Operators, All Other",510.0,1200.0,0.425
698,2012,213100,Support Activities for Mining,47-5049,"Mining Machine Operators, All Other",47-5049,"Mining Machine Operators, All Other",470.0,5470.0,0.085923
701,2012,213100,Support Activities for Mining,47-5099,"Extraction Workers, All Other",47-5049,"Mining Machine Operators, All Other",5000.0,5470.0,0.914077


Testing the Mapping

In [4]:
# Need to be able to successfully convert all codes in 2003 oes to 2010
oes_2003 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/naics_4digit/nat4d_2003_may.csv'))
oes_2003.columns = [col.lower() for col in oes_2003.columns]
oes_2003.rename(columns={'group': 'occ_group'}, inplace=True)
oes_2003 = oes_2003[~(oes_2003['occ_code'].str.endswith('0')) &
                    ~(oes_2003['occ_group'].isin(['major', 'minor', 'broad']))].copy()
oes_2003['bls_release_year'] = 2003

# Load in the 2000 to 2010 crosswalk
crosswalk_2000_to_2010 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/crosswalks/crosswalk_2000_to_2010.csv'))

# Load in the 2000 to 2010 mapping matrix
crosswalk_2000_to_2010_mapping_matrix = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_to_2010.csv'))

# Load in the 2000 SOC codes with children
soc_2000_codes_with_children = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_2000_codes_with_children.csv'))

In [None]:
import re
def harmonize_2010_to_2011_hybrid_codes(row, mapping_matrix, soc_codes_with_children):
    pass

def harmonize_2000_to_2010_codes(row, mapping_matrix, soc_codes_with_children, crosswalk):
    naics = row['naics']
    naics_title = row['naics_title']
    occ_code = row['occ_code']
    occ_title = row['occ_title']
    bls_release_year = row['bls_release_year']
    tot_emp = row['tot_emp']
    # convert tot_emp to float if is a string and it is not nan or contains a '*'
    if isinstance(tot_emp, str):
        if tot_emp == 'nan' or '*' in tot_emp:
            tot_emp = 0
        else:
            tot_emp = float(tot_emp.replace(',', ''))

    if not occ_code in soc_codes_with_children.keys():
        # If the SOC code is not in the mapping matrix, return the original row
        soc_2010 = crosswalk.get(occ_code)
        if soc_2010 is None:
            print("No mapping found for SOC code:", occ_code, " employment count: ", row['tot_emp'])
            return pd.DataFrame()

        row['2010 SOC Code'] = soc_2010
        row['naics_year_parent_emp_share'] = 1
        return pd.DataFrame([row])
    else:
        returned_rows = pd.DataFrame()
        children = re.findall(r"'([^']+)'",soc_codes_with_children[occ_code])
        for child in children:
            # look up the emp share in the mapping matrix
            emp_share = 0
            mapping = mapping_matrix.loc[(mapping_matrix['occ_code'] == child) & (mapping_matrix['naics'] == naics) & mapping_matrix['200 SOC Code'] == occ_code, 'naics_year_parent_emp_share']
            if mapping.empty: 
                emp_share = 0
            else: 
                if len(mapping) > 1:
                    raise ValueError(f"Multiple mappings found for SOC code {child} in NAICS {naics}. The parent is {occ_code}.")
                mapping = mapping.values[0]

            print(emp_share)
            new_row = row.copy()
            new_row.index = row.index
            new_row['2010 SOC Code'] = child
            new_row['tot_emp'] = tot_emp * emp_share 
            new_row['naics_year_parent_emp_share'] = emp_share
            
            # vertically stack the new row with returned rows
            if returned_rows.empty:
                returned_rows = pd.DataFrame([new_row])
            else:
                returned_rows = pd.concat([returned_rows, pd.DataFrame([new_row])], ignore_index=True)
        
        print(f"{len(returned_rows)} rows returned for SOC code {occ_code} in NAICS {naics}. It has {len(children)} children.")
        
        return returned_rows
        
    
    

def harmonize_2000_to_2010_soc_codes_outer(df, crosswalk_dir): 
    print("6.1. Harmonizing SOC codes from 2000 to 2010...")
    expanded_df = df[df['bls_release_year'] >= 2012].copy()

    # load in relevant crosswalks
    crosswalk_2000_to_2010 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/crosswalks/crosswalk_2000_to_2010.csv'))
    cw_00_to_10 = crosswalk_2000_to_2010.set_index("2000 SOC Code")["2010 SOC Code"].to_dict()
    crosswalk_hybrid_2000_to_2010 = pd.read_csv(os.path.join(ailabor_root, 'data/bls/crosswalks/crosswalk_hybrid_2000_to_2010.csv'))
    cw_hybrid_00_to_10 = crosswalk_hybrid_2000_to_2010.set_index("2000 SOC code")["2010 SOC code"].to_dict()


    # load in relevant mapping matrices
    crosswalk_2000_to_2010_mapping_matrix = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_to_2010.csv'))
    crosswalk_hybrid_2000_to_2010_mapping_matrix = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_hybrid.csv'))
    
    soc_2000_codes_with_children = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_2000_codes_with_children.csv'))
    soc_2000_codes_with_children = soc_2000_codes_with_children.set_index('2000 SOC Code')['Unique 2010 SOC Codes'].to_dict()
    
    hybrid_2000_with_children = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/soc_hybrid_2000_codes_with_children.csv'))
    hybrid_2000_with_children = hybrid_2000_with_children.set_index('2000 SOC Code')['Unique 2010 SOC Codes'].to_dict()

    rows_to_harmonize = df[df['bls_release_year'] < 2012].copy()
    for _, row in rows_to_harmonize.iterrows():
        intermediate_df = pd.DataFrame()
        row_year = int(row['bls_release_year'])

        if 2003 <= row_year <= 2009:
            intermediate_df = harmonize_2000_to_2010_codes(row, crosswalk_2000_to_2010_mapping_matrix, soc_2000_codes_with_children, cw_00_to_10)
        elif 2010 <= row_year <= 2011:
            intermediate_df = harmonize_2010_to_2011_hybrid_codes(row, crosswalk_hybrid_2000_to_2010_mapping_matrix, hybrid_2000_with_children, cw_hybrid_00_to_10)
        else: 
            continue
        
        # print(intermediate_df)
        # Add the (potentially multiple) rows to the expanded dataframe
        if intermediate_df.empty:
            continue
        expanded_df = pd.concat([expanded_df, intermediate_df], ignore_index=True)

    return expanded_df 

In [142]:
from typing import Dict, Tuple

# ───────────────────────── helper builders ─────────────────────────────────────

def _build_split_weights(matrix: pd.DataFrame,
                         parent_col: str,
                         child_col: str,
                         weight_col: str = 'naics_year_parent_emp_share') -> pd.DataFrame:
    """Return a tidy DF with columns  ['naics','parent','child','share']  ."""
    out = (matrix[[
                'naics', parent_col, child_col, weight_col
            ]]
            .rename(columns={parent_col: 'parent', child_col: 'child', weight_col: 'share'})
            .dropna())
    return out


# ───────────────────────── harmonisers (vectorised) ────────────────────────────


def harmonize_2000_to_2010_codes(rows: pd.DataFrame,
                                 mapping_matrix: pd.DataFrame,
                                 parents_with_children: Dict[str, list],
                                 one2one_map: Dict[str, str]) -> pd.DataFrame:
    """Explode 2000‑SOC rows into 2010 codes.

    Parameters
    ----------
    rows  : subset of OES DF where occ_code is 2000‑SOC (2003‑11)
    mapping_matrix : DF with columns  ['naics','parent','child','share']
    parents_with_children : dict  parent -> list(children)
    one2one_map : dict  parent -> child  (for parents that did not split)
    """

    # 1. mark parents that split
    rows = rows.copy()
    rows['parent_has_split'] = rows['occ_code'].isin(parents_with_children)

    # ---------- 2. rows WITHOUT split → simple map ---------------------------
    simple = rows[~rows['parent_has_split']].copy()
    simple['2010 SOC Code'] = simple['occ_code'].map(one2one_map)

    # rows that failed to map → drop + warn
    missing = simple['2010 SOC Code'].isna()
    if missing.any():
        unmapped = simple.loc[missing, ['occ_code','tot_emp']]
        print("[WARN] Dropping {} unmapped 2000‑SOC codes:\n{}".format(
            len(unmapped), unmapped.head()))
        simple = simple[~missing]
    simple['naics_year_parent_emp_share'] = 1.0

    # ---------- 3. rows WITH split → merge to children -----------------------
    split_parents = rows[rows['parent_has_split']]
    if split_parents.empty:
        exploded = pd.DataFrame(columns=simple.columns)
    else:
        # left‑merge to get child + share; will duplicate rows where a parent
        # has multiple children.
        exploded = (split_parents
                     .merge(mapping_matrix,
                            left_on=['naics','occ_code'],
                            right_on=['naics','parent'],
                            how='left'))
        # fallback: equal shares if not found (very rare)
        mask_no_share = exploded['share'].isna()
        if mask_no_share.any():
            exploded.loc[mask_no_share, 'share'] = (
                1.0 / exploded.groupby(['naics','occ_code'])['child'].transform('count')
            )
        exploded['2010 SOC Code'] = exploded['child']
        exploded['naics_year_parent_emp_share'] = exploded['share']
        exploded['tot_emp'] = exploded['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float)
        exploded['tot_emp'] = exploded['tot_emp'] * exploded['share'] 
        exploded = exploded.drop(columns=['parent','child','share'])

    # ---------- 4. concat & return -------------------------------------------
    out = pd.concat([simple, exploded], ignore_index=True)
    return out


def harmonize_2010_to_2011_hybrid_codes(rows: pd.DataFrame,
                                         mapping_matrix: pd.DataFrame,
                                         parents_with_children: Dict[str, list],
                                         one2one_map: Dict[str, str]) -> pd.DataFrame:
    """Same idea as above, but for the 2010 hybrid → 2010 detailed split."""
    # re‑use the 2000 helper: everything is structurally identical
    return harmonize_2000_to_2010_codes(rows, mapping_matrix,
                                        parents_with_children, one2one_map)


# ───────────────────────── driver for 2000→2010 pass ─────────────────────────


def harmonize_2000_to_2010_soc_codes_outer(df: pd.DataFrame, crosswalk_dir: str,
                                           ailabor_root: str) -> pd.DataFrame:
    print("6.1 Harmonising SOC 2000 → 2010 …")

    df = df.copy()

    # ---------- load cross‑walks --------------------------------------------
    cw00 = pd.read_csv(os.path.join(crosswalk_dir, 'crosswalk_2000_to_2010.csv'), dtype=str)
    one2one_00_10 = dict(zip(cw00['2000 SOC Code'].str.strip(),
                             cw00['2010 SOC Code'].str.strip()))

    cw00h = pd.read_csv(os.path.join(crosswalk_dir, 'crosswalk_hybrid_2000_to_2010.csv'), dtype=str)
    one2one_00h_10 = dict(zip(cw00h['2000 SOC code'].str.strip(),
                              cw00h['2010 SOC code'].str.strip()))

    # mapping matrices tidy
    mm_split = _build_split_weights(
        pd.read_csv(os.path.join(ailabor_root,
                                 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_to_2010.csv')),
        '2000 SOC Code','occ_code')

    mm_split_h = _build_split_weights(
        pd.read_csv(os.path.join(ailabor_root,
                                 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_hybrid.csv')),
        '2000 SOC Code','occ_code')

    # Filter to get only codes with multiple children (len > 1)
    parents_children = (mm_split.groupby('parent')['child'].unique().loc[lambda x: x.apply(len) > 1]
                               .apply(list).to_dict())
    parents_children_h = (mm_split_h.groupby('parent')['child'].unique().loc[lambda x: x.apply(len) > 1]
                                   .apply(list).to_dict())

    # ---------- slice the incoming DF ---------------------------------------
    df_2000 = df[df['bls_release_year'] < 2012]
    df_post = df[df['bls_release_year'] >= 2012]  # untouched

    # ---------- pure 2000 (<=2009) -----------------------------------------
    df_pre09 = df_2000[df_2000['bls_release_year'] <= 2009]
    mapped_pre09 = harmonize_2000_to_2010_codes(df_pre09, mm_split,
                                                parents_children, one2one_00_10)

    # ---------- hybrid 2010–11 ---------------------------------------------
    df_1011 = df_2000[(df_2000['bls_release_year'] >= 2010) & (df_2000['bls_release_year'] <= 2011)]
    mapped_1011 = harmonize_2010_to_2011_hybrid_codes(df_1011, mm_split_h,
                                                      parents_children_h, one2one_00h_10)

    out = pd.concat([df_post, mapped_pre09, mapped_1011], ignore_index=True)
    return out

In [143]:
df = harmonize_2000_to_2010_soc_codes_outer(pd.concat([oes_2012,oes_2003],ignore_index=True), os.path.join(ailabor_root, 'data/bls/crosswalks'), ailabor_root)

6.1 Harmonising SOC 2000 → 2010 …


  out = pd.concat([df_post, mapped_pre09, mapped_1011], ignore_index=True)


In [144]:
df

Unnamed: 0,naics,naics_title,occ_code,occ_title,occ_group,tot_emp,emp_prse,pct_total,pct_rpt,h_mean,...,a_pct25,a_median,a_pct75,a_pct90,annual,hourly,bls_release_year,parent_has_split,2010 SOC Code,naics_year_parent_emp_share
0,113300,Logging,11-1021,General and Operations Managers,detailed,420,13.3,0.86,5,42.73,...,58180,76670,101570,149100,,,2012,,,
1,113300,Logging,11-9199,"Managers, All Other",detailed,**,**,**,1,76.83,...,67510,134150,#,#,,,2012,,,
2,113300,Logging,13-1023,"Purchasing Agents, Except Wholesale, Retail, a...",detailed,160,42.5,0.33,1,27.39,...,43870,59370,69500,75280,,,2012,,,
3,113300,Logging,13-2011,Accountants and Auditors,detailed,90,34.9,0.18,1,28.14,...,36700,54690,80920,91650,,,2012,,,
4,113300,Logging,19-1032,Foresters,detailed,430,25.7,0.87,3,29.43,...,48650,64180,73670,83470,,,2012,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79427,999300,Local Government (OES designation),47-2181,Roofers,,26.785714,7.8,0.00,0.36,20.13,...,30180,44450,52800,58130,,,2003,True,47-2231,0.107143
79428,999300,Local Government (OES designation),49-9021,"Heating, air conditioning, and refrigeration m...",,35.0,5.1,0.10,5.28,20.24,...,35170,42050,50320,56670,,,2003,True,47-2231,0.006494
79429,999300,Local Government (OES designation),49-9021,"Heating, air conditioning, and refrigeration m...",,5355.0,5.1,0.10,5.28,20.24,...,35170,42050,50320,56670,,,2003,True,49-9021,0.993506
79430,999300,Local Government (OES designation),51-5021,Job printers,,198.795181,5.4,0.00,0.79,17.30,...,28680,34810,44340,52510,,,2003,True,51-5112,0.903614


In [118]:
df[(df['2010 SOC Code'].isna())]['occ_code'].unique()

array(['33-9032', '47-2181', '15-1051', '49-9021', '47-2111', '29-1111',
       '13-1041', '51-5021', '31-1012', '27-3031', '15-1081', '11-9061',
       '21-1091', '29-2034', '25-2041'], dtype=object)

In [124]:
print(len(df), len(oes_2003))

39705 38838


In [109]:
oes_2003[(oes_2003['naics']==425100) & oes_2003['occ_code'].isin(['29-1111', '29-1112'])]

Unnamed: 0,naics,naics_title,occ_code,occ_title,occ_group,tot_emp,emp_prse,pct_total,pct_rpt,h_mean,...,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,bls_release_year
18886,425100,Wholesale Electronic Markets and Agents and Br...,29-1111,Registered nurses,,80,33.7,0.01,0.14,25.75,...,25.21,27.57,33.36,41940,47580,52440,57350,69380,,2003


In [123]:
for code in df[(df['2010 SOC Code'].isna())]['occ_code'].unique():
    for naics in df[(df['2010 SOC Code'].isna()) & (df['occ_code']==code)].groupby(['naics_title', 'naics', 'occ_code', 'occ_title']).agg('count').reset_index()['naics'].unique(): 
        numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
        denom = oes_2003[(oes_2003['naics']==naics)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
        if numer/denom >= 0.01: 
            print("issue with code: ", code, " in naics: ", naics, " with emp share: ", numer/denom)

  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = 

issue with code:  33-9032  in naics:  488400  with emp share:  0.010366051182377713


  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()


issue with code:  15-1051  in naics:  516100  with emp share:  0.017723513194170933
issue with code:  15-1051  in naics:  518100  with emp share:  0.03290929203539823
issue with code:  15-1051  in naics:  517300  with emp share:  0.016140974381756257


  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = 

issue with code:  47-2111  in naics:  517900  with emp share:  0.011976047904191617
issue with code:  47-2111  in naics:  312200  with emp share:  0.010926573426573426


  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = 

issue with code:  31-1012  in naics:  721300  with emp share:  0.020499108734402853
issue with code:  15-1081  in naics:  516100  with emp share:  0.05553367467506892
issue with code:  15-1081  in naics:  518100  with emp share:  0.04074483775811209
issue with code:  15-1081  in naics:  523200  with emp share:  0.020134228187919462
issue with code:  15-1081  in naics:  517300  with emp share:  0.01310528653931586


  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()
  numer = oes_2003[(oes_2003['naics']==naics) & (oes_2003['occ_code'] == code)]['tot_emp'].str.replace(',','').str.replace('*','').replace('',0).astype(float).sum()


In [99]:
crosswalk_2000_to_2010_mapping_matrix = pd.read_csv(os.path.join(ailabor_root, 'bls_transformations/intermediate_data/oes_2012_parent_child_mapping_cw_2000_to_2010.csv'))
crosswalk_2000_to_2010_mapping_matrix[(crosswalk_2000_to_2010_mapping_matrix['occ_code'] == '33-9032')&
                                      (crosswalk_2000_to_2010_mapping_matrix['naics']==211100)]

Unnamed: 0,bls_release_year,naics,naics_title,occ_code,occ_title,2000 SOC Code,2000 SOC Title,tot_emp,tot_emp_parent_sum,naics_year_parent_emp_share
281,2012,211100,Oil and Gas Extraction,33-9032,Security Guards,33-9032,Security Guards*,,0.0,


In [113]:
80/608090

0.0001315594731043102