In [9]:
import pandas as pd
import os
import numpy as np

In [22]:
# Directory holding the historical O*NET task-ratings extracts.
onet_task_ratings_dir = '/Users/sidsatya/dev/ailabor/data/onet/historical_onet_task_ratings'

# File name of the task-ratings extract used for each year, annotated with
# the SOC taxonomy that the release follows.
# TODO: add in 2003 - 2007
_task_ratings_filenames = {
    2008: "task_ratings_2008_jun.csv",  # Follows SOC 2006
    2009: "task_ratings_2009_jun.csv",  # Follows SOC 2006
    2010: "task_ratings_2010_jul.csv",  # Follows SOC 2009
    2011: "task_ratings_2011_jul.csv",  # Follows SOC 2009
    2012: "task_ratings_2012_jul.csv",  # Follows SOC 2010
    2013: "task_ratings_2013_jul.csv",  # Follows SOC 2010
    2014: "task_ratings_2014_jul.csv",  # Follows SOC 2010
    2015: "task_ratings_2015_oct.csv",  # Follows SOC 2010
    2016: "task_ratings_2016_nov.csv",  # Follows SOC 2010
    2017: "task_ratings_2017_oct.csv",  # Follows SOC 2010
    2018: "task_ratings_2018_nov.csv",  # Follows SOC 2010
    2019: "task_ratings_2019_nov.csv",  # Follows SOC 2010
    2020: "task_ratings_2020_nov.csv",  # Follows SOC 2019
    2021: "task_ratings_2021_nov.csv",  # Follows SOC 2019
    2022: "task_ratings_2022_nov.csv",  # Follows SOC 2019
    2023: "task_ratings_2023_nov.csv",  # Follows SOC 2019
    2024: "task_ratings_2024_nov.csv",  # Follows SOC 2019
    2025: "task_ratings_2025_feb.csv",  # Follows SOC 2019
}

# year -> full path of that year's task-ratings CSV
onet_task_files = {
    year: os.path.join(onet_task_ratings_dir, filename)
    for year, filename in _task_ratings_filenames.items()
}


In [23]:
# Read every yearly O*NET task-ratings extract and stack them into one long dataframe.
def read_onet_task_data(directory):
    """Load all yearly task-ratings CSVs into a single DataFrame.

    The files that get read are the ones listed in the module-level
    ``onet_task_files`` mapping (year -> CSV path).

    Parameters
    ----------
    directory : str
        Currently unused: the paths stored in ``onet_task_files`` are read
        as-is. Kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of every file, in mapping order, with a fresh 0..n-1 index
        and an added ``year`` column holding the mapping key of the file
        each row came from. An empty DataFrame if the mapping has no entries.
    """
    yearly_frames = []
    for year, file_path in onet_task_files.items():
        df = pd.read_csv(file_path, encoding='latin1')
        df['year'] = year  # Use the key as the year value
        yearly_frames.append(df)

    # pd.concat raises on an empty list, so keep the "nothing to read" case
    # returning an empty frame.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once at the end: growing a frame inside the loop re-copies
    # all previously loaded rows on every iteration.
    return pd.concat(yearly_frames, ignore_index=True)

# Combined task ratings for every year listed in onet_task_files, one row per input CSV row plus a 'year' column
task_ratings_2008_onwards = read_onet_task_data(onet_task_ratings_dir)

In [27]:
# Keep only the rows rated on the importance ('IM') scale.
is_importance_row = task_ratings_2008_onwards['Scale ID'] == 'IM'
IMs = task_ratings_2008_onwards[is_importance_row]

# Count the non-null entries of every remaining column for each
# (occupation, task, year) combination.
# NOTE(review): presumably a duplicate check (a count above 1 would mean repeated ratings) - confirm.
IMs_grp = (
    IMs
    .groupby(['O*NET-SOC Code', 'Task ID', 'year'])
    .count()
    .reset_index()
)

In [28]:
# Show the per-group counts ordered by how many 'Scale ID' entries each group has
IMs_grp.sort_values(by='Scale ID')

Unnamed: 0,O*NET-SOC Code,Task ID,year,Scale ID,Category,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Date,Domain Source,Title,Task,Scale Name
0,11-1011.00,8823,2008,1,0,1,1,1,1,1,1,1,1,0,0,0
1,11-1011.00,8823,2009,1,0,1,1,1,1,1,1,1,1,0,0,0
2,11-1011.00,8823,2010,1,0,1,1,1,1,1,1,1,1,0,0,0
3,11-1011.00,8823,2011,1,0,1,1,1,1,1,1,1,1,0,0,0
4,11-1011.00,8823,2012,1,0,1,1,1,1,1,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320296,53-7121.00,12810,2021,1,0,1,1,1,1,1,1,1,1,1,1,1
320297,53-7121.00,12810,2022,1,0,1,1,1,1,1,1,1,1,1,1,1
320298,53-7121.00,12810,2023,1,0,1,1,1,1,1,1,1,1,1,1,1
320299,53-7121.00,12810,2024,1,0,1,1,1,1,1,1,1,1,1,1,1


In [38]:
# Calculate normalized IM score for each task within an occ-year group:
# a task's importance rating divided by the sum of importance ratings of
# all tasks in the same occupation and year.

# Total importance rating per occupation-year. 'Scale ID' is kept as a group
# key so the merge below only matches rows on the 'IM' scale.
task_ratings_2008_onwards_IM_grp = (
    task_ratings_2008_onwards[task_ratings_2008_onwards['Scale ID'] == 'IM']
    .groupby(['O*NET-SOC Code', 'year', 'Scale ID'])
    .agg({'Data Value': 'sum'})
    .reset_index()
)

# Merge the totals back onto task_ratings_2008_onwards so every task row
# carries its occupation-year total. The inner join on 'Scale ID' means only
# 'IM' rows survive; rows on other scales are dropped from the frame.
# Columns produced by an earlier run of this cell are removed first, so the
# cell can be re-run: otherwise the '_total' suffix would collide with the
# already existing 'Data Value_total' column.
task_ratings_2008_onwards = pd.merge(
    task_ratings_2008_onwards.drop(
        columns=['Data Value_total', 'IM_normalized'], errors='ignore'
    ),
    task_ratings_2008_onwards_IM_grp,
    on=['O*NET-SOC Code', 'year', 'Scale ID'],
    how='inner',
    suffixes=('', '_total'),
)
task_ratings_2008_onwards['IM_normalized'] = (
    task_ratings_2008_onwards['Data Value'] / task_ratings_2008_onwards['Data Value_total']
)
task_ratings_2008_onwards

Unnamed: 0,O*NET-SOC Code,Task ID,Scale ID,Category,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Date,Domain Source,year,Title,Task,Scale Name,Data Value_total,IM_normalized
0,11-1011.00,8823,IM,,4.51,93.0,0.1300,4.2600,4.7700,N,06/2006,Incumbent,2008,,,,120.26,0.037502
1,11-1011.00,8824,IM,,4.38,102.0,0.1200,4.1500,4.6100,N,06/2006,Incumbent,2008,,,,120.26,0.036421
2,11-1011.00,8825,IM,,4.34,96.0,0.1500,4.0300,4.6400,N,06/2006,Incumbent,2008,,,,120.26,0.036088
3,11-1011.00,8826,IM,,4.19,98.0,0.1400,3.9100,4.4700,N,06/2006,Incumbent,2008,,,,120.26,0.034841
4,11-1011.00,8827,IM,,4.13,96.0,0.1400,3.8400,4.4200,N,06/2006,Incumbent,2008,,,,120.26,0.034342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320296,53-7121.00,12807,IM,,4.08,53.0,0.4084,3.2562,4.8952,N,08/2019,Incumbent,2025,"Tank Car, Truck, and Ship Loaders",Unload cars containing liquids by connecting h...,Importance,81.47,0.050080
320297,53-7121.00,12804,IM,,4.02,28.0,0.1538,3.7006,4.3319,N,08/2019,Incumbent,2025,"Tank Car, Truck, and Ship Loaders","Clean interiors of tank cars or tank trucks, u...",Importance,81.47,0.049343
320298,53-7121.00,12803,IM,,3.88,56.0,0.2332,3.4129,4.3475,N,08/2019,Incumbent,2025,"Tank Car, Truck, and Ship Loaders",Lower gauge rods into tanks or read meters to ...,Importance,81.47,0.047625
320299,53-7121.00,12805,IM,,3.87,31.0,0.2604,3.3420,4.4058,N,08/2019,Incumbent,2025,"Tank Car, Truck, and Ship Loaders",Operate conveyors and equipment to transfer gr...,Importance,81.47,0.047502


In [37]:
# Display the per-occupation-year importance totals.
# NOTE(review): the saved output has no 'Scale ID' column, so it appears to
# come from an earlier version of the groupby - re-run to refresh.
task_ratings_2008_onwards_IM_grp

Unnamed: 0,O*NET-SOC Code,year,Data Value
0,11-1011.00,2008,120.26
1,11-1011.00,2009,120.26
2,11-1011.00,2010,120.26
3,11-1011.00,2011,120.26
4,11-1011.00,2012,120.26
...,...,...,...
16155,53-7121.00,2021,81.47
16156,53-7121.00,2022,81.47
16157,53-7121.00,2023,81.47
16158,53-7121.00,2024,81.47


In [30]:
# Spot check: importance ('IM') ratings of occupation 15-1132.00 in the 2016 data
in_2016 = task_ratings_2008_onwards['year'] == 2016
is_target_occupation = task_ratings_2008_onwards['O*NET-SOC Code'] == '15-1132.00'
on_importance_scale = task_ratings_2008_onwards['Scale ID'] == 'IM'
task_ratings_2008_onwards[in_2016 & is_target_occupation & on_importance_scale]

Unnamed: 0,O*NET-SOC Code,Task ID,Scale ID,Category,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Date,Domain Source,year,Title,Task,Scale Name
1244500,15-1132.00,3431,IM,,4.23,88.0,0.14,3.95,4.5,N,07/2016,Incumbent,2016,"Software Developers, Applications","Modify existing software to correct errors, al...",Importance
1244509,15-1132.00,3432,IM,,4.14,88.0,0.37,3.42,4.87,N,07/2016,Incumbent,2016,"Software Developers, Applications",Analyze user needs and software requirements t...,Importance
1244518,15-1132.00,3430,IM,,3.93,88.0,0.11,3.71,4.16,N,07/2016,Incumbent,2016,"Software Developers, Applications","Confer with systems analysts, engineers, progr...",Importance
1244527,15-1132.00,3442,IM,,3.88,81.0,0.09,3.7,4.07,N,07/2016,Incumbent,2016,"Software Developers, Applications","Store, retrieve, and manipulate data for analy...",Importance
1244536,15-1132.00,3435,IM,,3.86,73.0,0.36,3.15,4.57,N,07/2016,Incumbent,2016,"Software Developers, Applications","Design, develop and modify software systems, u...",Importance
1244545,15-1132.00,3436,IM,,3.84,86.0,0.27,3.29,4.38,N,07/2016,Incumbent,2016,"Software Developers, Applications",Develop and direct software system testing and...,Importance
1244554,15-1132.00,3438,IM,,3.31,58.0,0.21,2.89,3.72,N,07/2016,Incumbent,2016,"Software Developers, Applications","Supervise the work of programmers, technologis...",Importance
1244563,15-1132.00,3440,IM,,3.14,78.0,0.23,2.69,3.59,N,07/2016,Incumbent,2016,"Software Developers, Applications",Determine system performance standards.,Importance
1244572,15-1132.00,3434,IM,,3.03,76.0,0.38,2.27,3.78,N,07/2016,Incumbent,2016,"Software Developers, Applications",Coordinate software system installation and mo...,Importance
1244581,15-1132.00,3433,IM,,3.43,68.0,0.15,3.14,3.73,N,07/2016,Incumbent,2016,"Software Developers, Applications",Consult with customers about software system d...,Importance
