In [9]:
import pandas as pd 
import numpy as np
import os

In [10]:
# Project root. Defaults to the original checkout location; set the
# AILABOR_ROOT environment variable to run this notebook from another machine
# without editing the cell.
ailabor_rootpath = os.environ.get('AILABOR_ROOT') or '/Users/sidsatya/dev/ailabor/'
transformations_path = os.path.join(ailabor_rootpath, 'onet_transformations/')

# Task rows, later joined to the canon-id table on the 'Task' text column.
all_task_data = pd.read_csv(os.path.join(transformations_path, 'intermediate_data/all_onet_data_mapped_soc_codes.csv'))
# Task statements table that is left-joined onto all_task_data via 'Task'.
task_statements_with_canon_id = pd.read_csv(os.path.join(transformations_path, 'intermediate_data/task_statements_with_canon_id.csv'))
# Per-year task ratings (Mean Importance, Importance Normalized All, Mean Frequency).
task_ratings = pd.read_csv(os.path.join(transformations_path, 'intermediate_data/task_ratings.csv'))

In [11]:
# Remove the 'Date' column and every rating row that has no Task ID.
# errors='ignore' keeps this cell re-runnable: task_ratings is reassigned here,
# so a second execution would otherwise raise KeyError on the already-removed column.
# NOTE(review): NaN Task IDs are presumably dropped because pandas merges match
# NaN keys to each other, which would attach ratings to task rows whose
# Task ID is missing when the frames are joined on 'Task ID' - confirm.
task_ratings = (
    task_ratings
    .drop(columns=['Date'], errors='ignore')
    .dropna(subset=['Task ID'])
)
task_ratings

Unnamed: 0,O*NET-SOC Code,Task ID,year,Mean Importance,Importance Normalized All,Mean Frequency
0,11-1011.00,8823.0,2008,4.51,0.037502,3.6369
1,11-1011.00,8823.0,2009,4.51,0.037502,3.6369
2,11-1011.00,8823.0,2010,4.51,0.037502,3.6369
3,11-1011.00,8823.0,2011,4.51,0.037502,3.6369
4,11-1011.00,8823.0,2012,4.51,0.037502,3.6369
...,...,...,...,...,...,...
328698,53-7121.00,12806.0,2007,4.04,0.051715,
328699,53-7121.00,12807.0,2007,4.00,0.051203,4.4290
328700,53-7121.00,12808.0,2007,3.99,0.051075,5.0694
328701,53-7121.00,12809.0,2007,3.92,0.050179,


In [12]:
# Left-join the canon-id table onto the task rows by matching the 'Task' text,
# so every row of all_task_data is kept whether or not it finds a match.
task_data_with_canons = all_task_data.merge(
    task_statements_with_canon_id,
    how='left',
    on='Task',
)
print(task_data_with_canons.shape)

(420543, 13)


In [13]:
# Left-join the ratings onto the task rows: a rating attaches when its Task ID
# matches and its 'year' equals the row's ONET_release_year. Both frames carry
# an 'O*NET-SOC Code' column, so the result holds '_x' (task side) and
# '_y' (ratings side) copies of it.
# NOTE(review): the join keys omit the occupation code, which assumes a
# Task ID never appears under two occupations in the same year - confirm.
task_data_with_ratings = task_data_with_canons.merge(
    task_ratings,
    how='left',
    left_on=['Task ID', 'ONET_release_year'],
    right_on=['Task ID', 'year'],
)
print(task_data_with_ratings.shape)

(420543, 18)


In [None]:
# Starting frame for importance normalization (importance / summed importance):
# keep only the rows that have both a Task ID and a Mean Importance value.
# This cell performs the filtering only; no normalized score is computed here.
# NOTE(review): normalize_imp_df is not referenced by any later cell shown in
# this notebook - confirm whether it is still needed.
has_id_and_importance = task_data_with_ratings[['Task ID', 'Mean Importance']].notna().all(axis=1)
normalize_imp_df = task_data_with_ratings.loc[has_id_and_importance].copy()

Unnamed: 0,O*NET-SOC Code_x,O*NET 2010 SOC Code,O*NET 2018 SOC Code,ONET_release_year,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,Count,task_clean,canon_id,O*NET-SOC Code_y,year,Mean Importance,Importance Normalized All,Mean Frequency
0,11-1011.01,11-1011.00,11-1011,2003,,Directs organization charged with administerin...,,,3/2002,Legacy Analyst,3,directs organization charged with administerin...,C00001,,,,,
1,11-1011.01,11-1011.00,11-1011,2003,,"Administers, interprets, and explains policies...",,,3/2002,Legacy Analyst,3,"administers, interprets, and explains policies...",C00002,,,,,
2,11-1011.01,11-1011.00,11-1011,2003,,"Develops, plans, organizes, and administers po...",,,3/2002,Legacy Analyst,3,"develops, plans, organizes, and administers po...",C00003,,,,,
3,11-1011.01,11-1011.00,11-1011,2003,,Directs and coordinates activities of workers ...,,,3/2002,Legacy Analyst,3,directs and coordinates activities of workers ...,C00004,,,,,
4,11-1011.01,11-1011.00,11-1011,2003,,Negotiates contracts and agreements with feder...,,,3/2002,Legacy Analyst,3,negotiates contracts and agreements with feder...,C00005,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420538,53-7121.00,53-7121.00,53-7121,2025,12807.0,Unload cars containing liquids by connecting h...,Supplemental,85.0,08/2019,Incumbent,22,unload cars containing liquids by connecting h...,C15090,53-7121.00,2025.0,4.08,0.050080,3.8512
420539,53-7121.00,53-7121.00,53-7121,2025,12804.0,"Clean interiors of tank cars or tank trucks, u...",Supplemental,85.0,08/2019,Incumbent,22,"clean interiors of tank cars or tank trucks, u...",C15092,53-7121.00,2025.0,4.02,0.049343,4.1856
420540,53-7121.00,53-7121.00,53-7121,2025,12803.0,Lower gauge rods into tanks or read meters to ...,Supplemental,85.0,08/2019,Incumbent,13,lower gauge rods into tanks or read meters to ...,C15085,53-7121.00,2025.0,3.88,0.047625,4.9404
420541,53-7121.00,53-7121.00,53-7121,2025,12805.0,Operate conveyors and equipment to transfer gr...,Supplemental,85.0,08/2019,Incumbent,22,operate conveyors and equipment to transfer gr...,C15087,53-7121.00,2025.0,3.87,0.047502,4.7736


In [14]:
# Within-occupation importance shares for Core tasks only.
#
# For each (occupation, release year) the Mean Importance of the rated Core
# tasks is summed, and every such task gets
#     Importance Normalized Core = Mean Importance / that sum.
# Rows that are not Core, or that have no Mean Importance, get NaN.
#
# The group sum is broadcast with groupby(...).transform('sum') and the share
# is attached by index alignment, so task_data_all always has exactly one row
# per row of task_data_with_ratings. Aggregating and then merging the result
# back on (Task ID, year, occupation) would multiply rows whenever that key
# combination repeats among the Core rows.

# Rated Core rows are the only ones that take part in the normalization.
is_rated_core = (
    (task_data_with_ratings['Task Type'] == 'Core')
    & task_data_with_ratings['Mean Importance'].notna()
)
task_data_core_IM = task_data_with_ratings[is_rated_core].copy()

# Summed importance of the occupation-year group, repeated on each member row.
task_data_core_IM['Sum Mean Importance'] = (
    task_data_core_IM
    .groupby(['O*NET-SOC Code_x', 'ONET_release_year'])['Mean Importance']
    .transform('sum')
)

# Share of the group's total importance carried by each task.
task_data_core_IM['Importance Normalized Core'] = (
    task_data_core_IM['Mean Importance'] / task_data_core_IM['Sum Mean Importance']
)

# Attach the share to the full table (index-aligned; unmatched rows become NaN),
# then keep a single occupation-code column under its plain name.
task_data_all = (
    task_data_with_ratings
    .assign(**{'Importance Normalized Core': task_data_core_IM['Importance Normalized Core']})
    .rename(columns={'O*NET-SOC Code_x': 'O*NET-SOC Code'})
    .drop(columns=['O*NET-SOC Code_y'])
)

task_data_all

Unnamed: 0,O*NET-SOC Code,O*NET 2010 SOC Code,O*NET 2018 SOC Code,ONET_release_year,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,Count,task_clean,canon_id,year,Mean Importance,Importance Normalized All,Mean Frequency,Importance Normalized Core
0,11-1011.01,11-1011.00,11-1011,2003,,Directs organization charged with administerin...,,,3/2002,Legacy Analyst,3,directs organization charged with administerin...,C00001,,,,,
1,11-1011.01,11-1011.00,11-1011,2003,,"Administers, interprets, and explains policies...",,,3/2002,Legacy Analyst,3,"administers, interprets, and explains policies...",C00002,,,,,
2,11-1011.01,11-1011.00,11-1011,2003,,"Develops, plans, organizes, and administers po...",,,3/2002,Legacy Analyst,3,"develops, plans, organizes, and administers po...",C00003,,,,,
3,11-1011.01,11-1011.00,11-1011,2003,,Directs and coordinates activities of workers ...,,,3/2002,Legacy Analyst,3,directs and coordinates activities of workers ...,C00004,,,,,
4,11-1011.01,11-1011.00,11-1011,2003,,Negotiates contracts and agreements with feder...,,,3/2002,Legacy Analyst,3,negotiates contracts and agreements with feder...,C00005,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420538,53-7121.00,53-7121.00,53-7121,2025,12807.0,Unload cars containing liquids by connecting h...,Supplemental,85.0,08/2019,Incumbent,22,unload cars containing liquids by connecting h...,C15090,2025.0,4.08,0.050080,3.8512,
420539,53-7121.00,53-7121.00,53-7121,2025,12804.0,"Clean interiors of tank cars or tank trucks, u...",Supplemental,85.0,08/2019,Incumbent,22,"clean interiors of tank cars or tank trucks, u...",C15092,2025.0,4.02,0.049343,4.1856,
420540,53-7121.00,53-7121.00,53-7121,2025,12803.0,Lower gauge rods into tanks or read meters to ...,Supplemental,85.0,08/2019,Incumbent,13,lower gauge rods into tanks or read meters to ...,C15085,2025.0,3.88,0.047625,4.9404,
420541,53-7121.00,53-7121.00,53-7121,2025,12805.0,Operate conveyors and equipment to transfer gr...,Supplemental,85.0,08/2019,Incumbent,22,operate conveyors and equipment to transfer gr...,C15087,2025.0,3.87,0.047502,4.7736,


In [None]:
# For every (occupation, canonical task) pair, record the smallest and largest
# value of 'year' and flag rows whose own 'year' lies inside that range.
# 'year' is the ratings-side column, so it is NaN on rows without a matched rating.
# NOTE(review): a non-missing 'year' always lies between its own group's min
# and max, so 'active' is True for every row that has a 'year' and non-missing
# group keys, and False otherwise - confirm this is the intended meaning.
year_by_occ_task = task_data_all.groupby(['O*NET-SOC Code', 'canon_id'])['year']
task_data_all['first_seen'] = year_by_occ_task.transform('min')
task_data_all['last_seen'] = year_by_occ_task.transform('max')
task_data_all['active'] = task_data_all['year'].between(
    task_data_all['first_seen'],
    task_data_all['last_seen'],
)

In [40]:
# Write task_data_all to the intermediate_data folder, leaving out the index.
merged_attributes_csv = os.path.join(transformations_path, 'intermediate_data/task_data_merged_attributes.csv')
task_data_all.to_csv(merged_attributes_csv, index=False)

In [6]:
# Reload the merged task table from disk.
task_data_all = pd.read_csv(os.path.join(transformations_path, 'intermediate_data/task_data_merged_attributes.csv'))

# Healthcare occupation codes from the IPUMS list, as a plain list of strings.
# - dtype=str keeps all-numeric codes as text, so they can match the string
#   SOC codes below instead of being parsed as integers.
# - squeeze('columns') collapses only the column axis, so a file with a single
#   code still yields a one-element list rather than a scalar.
ipums_healthcare_soc_codes = (
    pd.read_csv(
        os.path.join(ailabor_rootpath, 'data/ipums/ipums_unique_healthcare_occsoc_codes.csv'),
        dtype=str,
    )
    .dropna()
    .squeeze('columns')
    .tolist()
)

# Compare codes without the dash (e.g. '53-7121' -> '537121'). The vectorised
# .str accessor leaves missing codes as NaN, which isin() treats as "no match",
# whereas calling .replace on each value raises on NaN.
soc_2018_no_dash = task_data_all['O*NET 2018 SOC Code'].str.replace('-', '', regex=False)
is_healthcare = soc_2018_no_dash.isin(ipums_healthcare_soc_codes)

# Keep healthcare rows that have a task statement, a Mean Importance rating,
# and are Core tasks.
is_rated_core_task = (
    task_data_all['Task'].notna()
    & task_data_all['Mean Importance'].notna()
    & (task_data_all['Task Type'] == 'Core')
)
task_data_healthcare = task_data_all[is_healthcare & is_rated_core_task].copy()

In [8]:
# Write the filtered healthcare task rows to intermediate_data, leaving out the index.
healthcare_filtered_csv = os.path.join(transformations_path, 'intermediate_data/task_data_healthcare_filtered.csv')
task_data_healthcare.to_csv(healthcare_filtered_csv, index=False)