In [1]:
import pandas as pd
import numpy as np
import os

In [7]:
oes_datapath = '/Users/sidsatya/dev/ailabor/data/bls/naics_4digit/'
# read every csv in the oes_datapath directory
oes_files = [f for f in os.listdir(oes_datapath) if f.endswith('.csv')]
all_df = []
for file in oes_files:
    file_path = os.path.join(oes_datapath, file)
    df = pd.read_csv(file_path, dtype=str)
    # convert all columns to lowercase
    df.columns = [col.lower() for col in df.columns]
    if 'group' in df.columns: 
        df.rename(columns={'group': 'occ_group'}, inplace=True)
    df['bls_release_year'] = file.split('_')[1]  # extract year from filename
    all_df.append(df)

oes_data = pd.concat(all_df, ignore_index=True)

oes_data

Unnamed: 0,naics,naics_title,occ_code,occ_title,occ_group,tot_emp,emp_prse,pct_total,pct_rpt,h_mean,...,area,area_title,area_type,prim_state,i_group,own_code,o_group,jobs_1000,loc_quotient,ownership
0,113300,Logging,00-0000,Industry Total,,65050,1.5,100.00,100.00,15.00,...,,,,,,,,,,
1,113300,Logging,11-0000,Management occupations,major,1300,12.5,2.00,20.24,36.62,...,,,,,,,,,,
2,113300,Logging,11-1011,Chief executives,,150,30.3,0.23,3.06,35.10,...,,,,,,,,,,
3,113300,Logging,11-1021,General and operations managers,,950,12.6,1.46,16.67,37.61,...,,,,,,,,,,
4,113300,Logging,11-9011,"Farm, ranch, and other agricultural managers",,**,**,**,1.79,29.89,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534056,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,,110,4.4,0.00,~,18.11,...,,,,,,,,,,
1534057,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",,240,10.6,0.00,~,19.08,...,,,,,,,,,,
1534058,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,,47690,1.3,0.85,7,16.66,...,,,,,,,,,,
1534059,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",,40,28.8,0.00,~,20.23,...,,,,,,,,,,


In [8]:
for col in oes_data.columns:
    print(col)

naics
naics_title
occ_code
occ_title
occ_group
tot_emp
emp_prse
pct_total
pct_rpt
h_mean
a_mean
mean_prse
h_pct10
h_pct25
h_median
h_pct75
h_pct90
a_pct10
a_pct25
a_median
a_pct75
a_pct90
annual
hourly
bls_release_year
area
area_title
area_type
prim_state
i_group
own_code
o_group
jobs_1000
loc_quotient
ownership


In [None]:
# Drop rows with 
# 1) occ_code == '00-0000'
# 2) group == 'major'
# 3) occ_group in ('major', 'minor, 'broad')

oes_data_filtered = oes_data[(oes_data['occ_code'] != '00-0000') &
                             ~(oes_data['occ_group'].isin(['major', 'minor', 'broad']))]

oes_data_filtered

Unnamed: 0,naics,naics_title,occ_code,occ_title,occ_group,tot_emp,emp_prse,pct_total,pct_rpt,h_mean,...,area,area_title,area_type,prim_state,i_group,own_code,o_group,jobs_1000,loc_quotient,ownership
2,113300,Logging,11-1011,Chief executives,,150,30.3,0.23,3.06,35.10,...,,,,,,,,,,
3,113300,Logging,11-1021,General and operations managers,,950,12.6,1.46,16.67,37.61,...,,,,,,,,,,
4,113300,Logging,11-9011,"Farm, ranch, and other agricultural managers",,**,**,**,1.79,29.89,...,,,,,,,,,,
6,113300,Logging,13-1021,"Purchasing agents and buyers, farm products",,30,37.8,0.05,0.51,19.69,...,,,,,,,,,,
7,113300,Logging,13-1023,"Purchasing agents, except wholesale, retail, a...",,220,44.4,0.34,2.47,16.46,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534056,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,,110,4.4,0.00,~,18.11,...,,,,,,,,,,
1534057,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",,240,10.6,0.00,~,19.08,...,,,,,,,,,,
1534058,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,,47690,1.3,0.85,7,16.66,...,,,,,,,,,,
1534059,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",,40,28.8,0.00,~,20.23,...,,,,,,,,,,


In [16]:
# Characterize the degree of suppression for each naics-occ pair 
# For each naics-occ pair, count the number of rows where tot_emp is '**' or NaN (suppressed)
suppression_counts = oes_data_filtered.groupby(['naics', 'occ_code'])['tot_emp'].agg(
    num_suppressed=lambda x: ((x == '**') | (x.isna())).sum(),
    num_rows='count'
).reset_index()

# Optionally, sort by most suppressed
suppression_counts['pct_years_suppressed'] = suppression_counts['num_suppressed'] / suppression_counts['num_rows']
suppression_counts.sort_values(by='pct_years_suppressed', ascending=False)

# Display the results
print("The number of naics-occ pairs with pct_years_suppressed > 0.25 is: ", suppression_counts[suppression_counts['pct_years_suppressed'] > 0.25].shape[0])

# Drop any observation where pct_years_suppressed > 0.25 for that occ-naics pair
oes_data_drop_highly_suppressed = oes_data_filtered.merge(
    suppression_counts[suppression_counts['pct_years_suppressed'] <= 0.25][['naics', 'occ_code']],
    on=['naics', 'occ_code'],
    how='inner'
)

# Now drop any observation with suppressed data
oes_data_drop_suppressed = oes_data_drop_highly_suppressed[(oes_data_drop_highly_suppressed['tot_emp'] != '**') & 
                                                           (oes_data_drop_highly_suppressed['tot_emp'].notna())].copy()      

print("The original shape of the data is: ", oes_data_filtered.shape)
print("The new shape of the data is: ", oes_data_drop_suppressed.shape)

oes_data_drop_suppressed

The number of naics-occ pairs with pct_years_suppressed > 0.25 is:  20535
The original shape of the data is:  (1148007, 35)
The new shape of the data is:  (1000375, 35)


Unnamed: 0,naics,naics_title,occ_code,occ_title,occ_group,tot_emp,emp_prse,pct_total,pct_rpt,h_mean,...,area,area_title,area_type,prim_state,i_group,own_code,o_group,jobs_1000,loc_quotient,ownership
0,113300,Logging,11-1011,Chief executives,,150,30.3,0.23,3.06,35.10,...,,,,,,,,,,
1,113300,Logging,11-1021,General and operations managers,,950,12.6,1.46,16.67,37.61,...,,,,,,,,,,
2,113300,Logging,13-1023,"Purchasing agents, except wholesale, retail, a...",,220,44.4,0.34,2.47,16.46,...,,,,,,,,,,
3,113300,Logging,13-2011,Accountants and auditors,,90,28.9,0.14,3.32,21.47,...,,,,,,,,,,
4,113300,Logging,19-1032,Foresters,,340,15.3,0.52,6.21,23.24,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028856,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,,110,4.4,0.00,~,18.11,...,,,,,,,,,,
1028857,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",,240,10.6,0.00,~,19.08,...,,,,,,,,,,
1028858,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,,47690,1.3,0.85,7,16.66,...,,,,,,,,,,
1028859,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",,40,28.8,0.00,~,20.23,...,,,,,,,,,,


In [48]:
# Convert tot_emp and other fields to numeric
def convert_to_float(x):
    if type(x) == str:
        # regex to replace ' ' or any number of *, #, %, ',', with empty string
        x = x.replace(' ', '').replace('*', '').replace('#', '').replace('%', '').replace(',', '')
        if x == '':
            return np.nan
    try:
        return float(x)
    except ValueError:
        print("ValueError encountered for value: ", x)
        return np.nan



oes_data_drop_suppressed['bls_release_year'] = pd.to_numeric(oes_data_drop_suppressed['bls_release_year'])
oes_data_drop_suppressed['tot_emp'] = oes_data_drop_suppressed['tot_emp'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['emp_prse'] = oes_data_drop_suppressed['emp_prse'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_mean'] = oes_data_drop_suppressed['h_mean'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_mean'] = oes_data_drop_suppressed['a_mean'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['mean_prse'] = oes_data_drop_suppressed['mean_prse'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_pct10'] = oes_data_drop_suppressed['h_pct10'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_pct25'] = oes_data_drop_suppressed['h_pct25'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_median'] = oes_data_drop_suppressed['h_median'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_pct75'] = oes_data_drop_suppressed['h_pct75'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['h_pct90'] = oes_data_drop_suppressed['h_pct90'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_pct10'] = oes_data_drop_suppressed['a_pct10'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_pct25'] = oes_data_drop_suppressed['a_pct25'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_median'] = oes_data_drop_suppressed['a_median'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_pct75'] = oes_data_drop_suppressed['a_pct75'].apply(lambda x: convert_to_float(x))
oes_data_drop_suppressed['a_pct90'] = oes_data_drop_suppressed['a_pct90'].apply(lambda x: convert_to_float(x))

# Get columns of interest 
columns_of_interest = ['naics', 'naics_title', 'occ_code', 'occ_title', 'bls_release_year', 'tot_emp',
                       'emp_prse', 'pct_total', 'h_mean', 'a_mean', 'mean_prse', 'h_pct10', 'h_pct25', 'h_median', 'h_pct75',
                       'h_pct90', 'a_pct10', 'a_pct25', 'a_median', 'a_pct75', 'a_pct90']


oes_data_filtered_final = oes_data_drop_suppressed[columns_of_interest].copy()

oes_data_filtered_final

Unnamed: 0,naics,naics_title,occ_code,occ_title,bls_release_year,tot_emp,emp_prse,pct_total,h_mean,a_mean,...,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90
0,113300,Logging,11-1011,Chief executives,2005,150.0,30.3,0.23,35.10,73010.0,...,15.56,20.17,30.86,41.63,60.39,32360.0,41960.0,64190.0,86580.0,125610.0
1,113300,Logging,11-1021,General and operations managers,2005,950.0,12.6,1.46,37.61,78230.0,...,19.01,24.07,31.45,47.92,60.48,39550.0,50070.0,65420.0,99680.0,125800.0
2,113300,Logging,13-1023,"Purchasing agents, except wholesale, retail, a...",2005,220.0,44.4,0.34,16.46,34250.0,...,9.09,11.72,13.41,20.12,26.75,18920.0,24380.0,27890.0,41840.0,55640.0
3,113300,Logging,13-2011,Accountants and auditors,2005,90.0,28.9,0.14,21.47,44670.0,...,12.77,14.71,20.39,26.83,33.06,26550.0,30590.0,42410.0,55820.0,68770.0
4,113300,Logging,19-1032,Foresters,2005,340.0,15.3,0.52,23.24,48340.0,...,12.26,16.00,22.53,30.94,34.97,25490.0,33270.0,46860.0,64350.0,72740.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1028856,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,2010,110.0,4.4,0.00,18.11,37670.0,...,12.74,14.91,17.73,21.45,25.00,26500.0,31010.0,36870.0,44610.0,51990.0
1028857,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",2010,240.0,10.6,0.00,19.08,39680.0,...,12.52,14.91,18.84,22.28,26.99,26040.0,31020.0,39190.0,46330.0,56140.0
1028858,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,2010,47690.0,1.3,0.85,16.66,34650.0,...,8.93,11.36,15.16,20.20,28.78,18560.0,23630.0,31520.0,42020.0,59860.0
1028859,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",2010,40.0,28.8,0.00,20.23,42080.0,...,8.89,14.44,18.06,27.30,33.44,18490.0,30030.0,37570.0,56780.0,69550.0


In [49]:
# Compute totals across industry-year and year
oes_year_totals = oes_data_filtered_final.groupby(['bls_release_year']).agg({'tot_emp': 'sum'}).reset_index()
oes_year_totals.rename(columns={'tot_emp': 'year_tot_emp'}, inplace=True)
oes_naics_year_totals = oes_data_filtered_final.groupby(['naics', 'bls_release_year']).agg({'tot_emp': 'sum'}).reset_index()
oes_naics_year_totals.rename(columns={'tot_emp': 'naics_year_tot_emp'}, inplace=True)

# Merge totals back into the main DataFrame
oes_data_merge = oes_data_filtered_final.merge(oes_year_totals, on='bls_release_year', how='left')
oes_data_merge = oes_data_merge.merge(oes_naics_year_totals, on=['naics', 'bls_release_year'], how='left')    

# Compute percent of totals 
oes_data_merge['pct_year_tot_emp'] = oes_data_merge['tot_emp'] / oes_data_merge['year_tot_emp']
oes_data_merge['pct_naics_year_tot_emp'] = oes_data_merge['tot_emp'] /oes_data_merge['naics_year_tot_emp']

oes_data_merge 

Unnamed: 0,naics,naics_title,occ_code,occ_title,bls_release_year,tot_emp,emp_prse,pct_total,h_mean,a_mean,...,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,year_tot_emp,naics_year_tot_emp,pct_year_tot_emp,pct_naics_year_tot_emp
0,113300,Logging,11-1011,Chief executives,2005,150.0,30.3,0.23,35.10,73010.0,...,60.39,32360.0,41960.0,64190.0,86580.0,125610.0,127081780.0,61560.0,1.180342e-06,0.002437
1,113300,Logging,11-1021,General and operations managers,2005,950.0,12.6,1.46,37.61,78230.0,...,60.48,39550.0,50070.0,65420.0,99680.0,125800.0,127081780.0,61560.0,7.475501e-06,0.015432
2,113300,Logging,13-1023,"Purchasing agents, except wholesale, retail, a...",2005,220.0,44.4,0.34,16.46,34250.0,...,26.75,18920.0,24380.0,27890.0,41840.0,55640.0,127081780.0,61560.0,1.731169e-06,0.003574
3,113300,Logging,13-2011,Accountants and auditors,2005,90.0,28.9,0.14,21.47,44670.0,...,33.06,26550.0,30590.0,42410.0,55820.0,68770.0,127081780.0,61560.0,7.082054e-07,0.001462
4,113300,Logging,19-1032,Foresters,2005,340.0,15.3,0.52,23.24,48340.0,...,34.97,25490.0,33270.0,46860.0,64350.0,72740.0,127081780.0,61560.0,2.675443e-06,0.005523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000370,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,2010,110.0,4.4,0.00,18.11,37670.0,...,25.00,26500.0,31010.0,36870.0,44610.0,51990.0,125231070.0,5580730.0,8.783763e-07,0.000020
1000371,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",2010,240.0,10.6,0.00,19.08,39680.0,...,26.99,26040.0,31020.0,39190.0,46330.0,56140.0,125231070.0,5580730.0,1.916457e-06,0.000043
1000372,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,2010,47690.0,1.3,0.85,16.66,34650.0,...,28.78,18560.0,23630.0,31520.0,42020.0,59860.0,125231070.0,5580730.0,3.808160e-04,0.008545
1000373,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",2010,40.0,28.8,0.00,20.23,42080.0,...,33.44,18490.0,30030.0,37570.0,56780.0,69550.0,125231070.0,5580730.0,3.194096e-07,0.000007


I want to add the following columns: 
1) 2010 SOC Code
2) 2018 SOC Code

In [50]:
# Load in crosswalk data
crosswalk_dir = '/Users/sidsatya/dev/ailabor/data/bls/crosswalks/'
crosswalk_2000_to_2010 = pd.read_csv(crosswalk_dir + 'crosswalk_2000_to_2010.csv')
crosswalk_2010_to_2018 = pd.read_csv(crosswalk_dir + 'crosswalk_2010_to_2018.csv')
crosswalk_hybrid_2010_to_2018 = pd.read_csv(crosswalk_dir + 'crosswalk_hybrid_2010_to_2018.csv')

In [51]:
# Convert crosswalks to dictionaries for faster lookups
crosswalk_2000_to_2010_dict = crosswalk_2000_to_2010.set_index('2000 SOC Code')['2010 SOC Code'].to_dict()
crosswalk_2010_to_2018_dict = crosswalk_2010_to_2018.set_index('2010 SOC Code')['2018 SOC Code'].to_dict()
crosswalk_2018_to_2010_dict = crosswalk_2010_to_2018.set_index('2018 SOC Code')['2010 SOC Code'].to_dict()
crosswalk_2019_hybrid_to_2010_dict = crosswalk_hybrid_2010_to_2018.set_index('OES 2019 Estimates Code')['2010 SOC Code'].to_dict()
crosswalk_2019_hybrid_to_2018_dict = crosswalk_hybrid_2010_to_2018.set_index('OES 2019 Estimates Code')['2018 SOC Code'].to_dict()

In [52]:
'''
May 2003 – May 2009: 2000 SOC (all detailed occupations coded under the 2000 Standard Occupational Classification)
May 2010 & May 2011: Should both be under the 2010 SOC for the May releases. 
May 2012 – May 2018: 2010 SOC (fully implemented across all detailed occupations)
May 2019 & May 2020: Hybrid 2010 → 2018 SOC (a mix of 2010‐coded and 2018‐coded occupations, with BLS-provided shares for splits/merges)
May 2021 onward: 2018 SOC (all detailed occupations reported under the 2018 taxonomy)
'''
# Create conversion functions
def convert_to_2010_soc(soc_code, year): 
    """Convert SOC code to 2010 SOC code using the 2000 to 2010 crosswalk."""
    if (year >= 2003 and year <= 2009): 
        return crosswalk_2000_to_2010_dict.get(soc_code, soc_code)
    elif (year >= 2010 and year <= 2018):
        return soc_code
    elif (year >= 2018 and year <= 2019): 
        return crosswalk_2019_hybrid_to_2010_dict.get(soc_code, soc_code)
    elif (year >= 2020 and year <= 2025): 
        return crosswalk_2018_to_2010_dict.get(soc_code, soc_code) 
    else: 
        raise ValueError(f"Year {year} is out of range for SOC code conversion.")
    
def convert_to_2018_soc(soc_code, year): 
    """Convert SOC code to 2010 SOC code using the 2000 to 2010 crosswalk."""
    if (year >= 2003 and year <= 2009): 
        code_2010 = crosswalk_2000_to_2010_dict.get(soc_code, soc_code)
        return crosswalk_2010_to_2018_dict.get(code_2010, code_2010)
    elif (year >= 2010 and year <= 2018):
        return crosswalk_2010_to_2018_dict.get(soc_code, soc_code)
    elif (year >= 2018 and year <= 2019): 
        return crosswalk_2019_hybrid_to_2018_dict.get(soc_code, soc_code)
    elif (year >= 2020 and year <= 2025): 
        return soc_code 
    else: 
        raise ValueError(f"Year {year} is out of range for SOC code conversion.")

In [53]:
# Apply conversion to the OES data
oes_data_merge['soc_2010'] = oes_data_merge.apply(lambda row: convert_to_2010_soc(row['occ_code'], int(row['bls_release_year'])), axis=1)
oes_data_merge['soc_2018'] = oes_data_merge.apply(lambda row: convert_to_2018_soc(row['occ_code'], int(row['bls_release_year'])), axis=1)

oes_data_merge

Unnamed: 0,naics,naics_title,occ_code,occ_title,bls_release_year,tot_emp,emp_prse,pct_total,h_mean,a_mean,...,a_pct25,a_median,a_pct75,a_pct90,year_tot_emp,naics_year_tot_emp,pct_year_tot_emp,pct_naics_year_tot_emp,soc_2010,soc_2018
0,113300,Logging,11-1011,Chief executives,2005,150.0,30.3,0.23,35.10,73010.0,...,41960.0,64190.0,86580.0,125610.0,127081780.0,61560.0,1.180342e-06,0.002437,11-1011,11-1011
1,113300,Logging,11-1021,General and operations managers,2005,950.0,12.6,1.46,37.61,78230.0,...,50070.0,65420.0,99680.0,125800.0,127081780.0,61560.0,7.475501e-06,0.015432,11-1021,11-1021
2,113300,Logging,13-1023,"Purchasing agents, except wholesale, retail, a...",2005,220.0,44.4,0.34,16.46,34250.0,...,24380.0,27890.0,41840.0,55640.0,127081780.0,61560.0,1.731169e-06,0.003574,13-1023,13-1023
3,113300,Logging,13-2011,Accountants and auditors,2005,90.0,28.9,0.14,21.47,44670.0,...,30590.0,42410.0,55820.0,68770.0,127081780.0,61560.0,7.082054e-07,0.001462,13-2011,13-2011
4,113300,Logging,19-1032,Foresters,2005,340.0,15.3,0.52,23.24,48340.0,...,33270.0,46860.0,64350.0,72740.0,127081780.0,61560.0,2.675443e-06,0.005523,19-1032,19-1032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000370,999300,Local Government (OES Designation),53-7071,Gas Compressor and Gas Pumping Station Operators,2010,110.0,4.4,0.00,18.11,37670.0,...,31010.0,36870.0,44610.0,51990.0,125231070.0,5580730.0,8.783763e-07,0.000020,53-7071,53-7071
1000371,999300,Local Government (OES Designation),53-7072,"Pump Operators, Except Wellhead Pumpers",2010,240.0,10.6,0.00,19.08,39680.0,...,31020.0,39190.0,46330.0,56140.0,125231070.0,5580730.0,1.916457e-06,0.000043,53-7072,53-7072
1000372,999300,Local Government (OES Designation),53-7081,Refuse and Recyclable Material Collectors,2010,47690.0,1.3,0.85,16.66,34650.0,...,23630.0,31520.0,42020.0,59860.0,125231070.0,5580730.0,3.808160e-04,0.008545,53-7081,53-7081
1000373,999300,Local Government (OES Designation),53-7121,"Tank Car, Truck, and Ship Loaders",2010,40.0,28.8,0.00,20.23,42080.0,...,30030.0,37570.0,56780.0,69550.0,125231070.0,5580730.0,3.194096e-07,0.000007,53-7121,53-7121


In [55]:
# Compute at the level of the 2018 soc code 
oes_data_2018 = oes_data_merge.groupby(['naics', 'soc_2018', 'bls_release_year']).agg({
    'tot_emp': 'sum',
    'pct_year_tot_emp': 'sum',
    'pct_naics_year_tot_emp': 'sum',
    'h_mean': 'mean',
    'a_mean': 'mean',
    'h_pct10': 'mean',
    'h_pct25': 'mean',
    'h_median': 'mean',
    'h_pct75': 'mean',
    'h_pct90': 'mean',
    'a_pct10': 'mean',
    'a_pct25': 'mean',
    'a_median': 'mean',
    'a_pct75': 'mean',
    'a_pct90': 'mean',
    'naics_year_tot_emp': 'sum',
    'year_tot_emp': 'sum',
}).reset_index()

oes_data_2018

Unnamed: 0,naics,soc_2018,bls_release_year,tot_emp,pct_year_tot_emp,pct_naics_year_tot_emp,h_mean,a_mean,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,naics_year_tot_emp,year_tot_emp
0,113300,11-0000,2019,750.0,0.000001,0.004000,46.73,97190.0,17.19,27.84,41.84,57.74,78.21,35750.0,57900.0,87030.0,120100.0,162670.0,187510.0,570291650.0
1,113300,11-0000,2020,890.0,0.000002,0.004789,70.06,145730.0,18.05,30.60,50.10,91.98,,37530.0,63650.0,104200.0,191330.0,,185860.0,540545990.0
2,113300,11-0000,2021,1060.0,0.000002,0.005888,56.96,118470.0,28.22,31.18,47.63,62.74,,58700.0,64860.0,99070.0,130500.0,,180040.0,555990610.0
3,113300,11-0000,2022,1030.0,0.000002,0.005782,49.59,103140.0,23.98,30.35,46.09,63.29,76.00,49880.0,63120.0,95860.0,131650.0,158080.0,178150.0,583232030.0
4,113300,11-0000,2023,1020.0,0.000002,0.005948,48.57,101020.0,24.97,33.44,45.03,60.59,74.44,51930.0,69540.0,93660.0,126020.0,154820.0,171490.0,598749430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989176,999300,53-7199,2020,1630.0,0.000003,0.000076,14.84,30870.0,9.75,11.04,13.28,15.95,23.47,20290.0,22970.0,27620.0,33170.0,48820.0,21324630.0,540545990.0
989177,999300,53-7199,2021,1760.0,0.000003,0.000082,15.72,32690.0,11.07,13.64,13.85,17.39,22.78,23030.0,28380.0,28820.0,36160.0,47380.0,21486650.0,555990610.0
989178,999300,53-7199,2022,1190.0,0.000002,0.000055,17.74,36900.0,12.30,13.01,15.02,19.99,28.30,25580.0,27050.0,31240.0,41570.0,58870.0,21489690.0,583232030.0
989179,999300,53-7199,2023,1200.0,0.000002,0.000055,19.28,40110.0,12.42,14.10,17.07,22.22,30.10,25820.0,29320.0,35500.0,46210.0,62600.0,22012730.0,598749430.0


In [58]:
# save the final DataFrame to a CSV file
savedir = '/Users/sidsatya/dev/ailabor/bls_transformations/output_data/'
oes_data_2018.to_csv(savedir + 'oes_data_filtered_soc_2018.csv', index=False)
oes_data_merge.to_csv(savedir + 'oes_data_filtered_full.csv', index=False)