In [16]:
import numpy as np
import pandas as pd 
from sklearn.linear_model import LinearRegression

In [17]:
# Specify directories
# Every folder hangs off a single project root, so moving the project to
# another machine or location only requires editing `project_dir`.
project_dir = "C:/Users/singhy/Desktop/Chicago/cps_data/inflation"

data_dir = f"{project_dir}/raw_data"    # source workbooks are read from here
output_dir = f"{project_dir}/output"    # output folder
temp_dir = f"{project_dir}/temp"        # intermediate files (e.g. the shares CSV) are written here

In [18]:

# Load the "MachineReadable" sheet of the detailed-industries workbook
# stored under the raw-data directory.
df = pd.read_excel(
    f"{data_dir}/employment_by_industry/hours-employment-detailed-industries.xlsx",
    sheet_name="MachineReadable",
)


In [19]:
# Sample selection: keep a row only when it satisfies every criterion below.
#   * Year between 2016 and 2019 (both endpoints included)
#   * Measure is "Employment"
#   * Units is "Thousands of jobs"
#   * Digit is "4-Digit" (the industry mapping works on 4-digit NAICS sectors)
df = df[
    df["Year"].between(2016, 2019)
    & df["Measure"].eq("Employment")
    & df["Units"].eq("Thousands of jobs")
    & df["Digit"].eq("4-Digit")
]

In [20]:
def map_naics_code_to_industry(naics_code):
    """
    Map a NAICS code to a JOLTS-style industry category.

    The code is matched on its leading digits, most specific prefix first:
    the 4-digit prefix, then the 3-digit prefix, then the 2-digit prefix.
    The first table that contains the prefix decides the category.

    Parameters
    ----------
    naics_code : int, float or str
        NAICS code with two or more digits. Only the leading run of ASCII
        digits is used, so 3211, "3211", 3211.0 and "3361,2,3" are all
        accepted.

    Returns
    -------
    str
        The industry category, or "Unknown" when no prefix matches or the
        value does not start with at least two digits (e.g. NaN or None).
    """
    # Keep only the leading digits. No zero-padding: padding on the left
    # would shift the digits and corrupt the 2-/3-digit prefixes.
    digits = ""
    for ch in str(naics_code).strip():
        if not ("0" <= ch <= "9"):
            break
        digits += ch

    # 4-digit overrides
    four_digit_map = {
        1133: "Mining and Logging",
    }

    # 3-digit overrides: sector 32 mixes durable and nondurable subsectors,
    # so these take priority over the 2-digit default for 32.
    three_digit_map = {
        321: "Durable Goods Manufacturing",
        327: "Durable Goods Manufacturing",
        322: "Nondurable Goods Manufacturing",
        323: "Nondurable Goods Manufacturing",
        324: "Nondurable Goods Manufacturing",
        325: "Nondurable Goods Manufacturing",
        326: "Nondurable Goods Manufacturing",
    }

    # General 2-digit NAICS to JOLTS sector
    two_digit_map = {
        21: "Mining and Logging",
        22: "Utilities",
        23: "Construction",
        31: "Nondurable Goods Manufacturing",
        32: "Nondurable Goods Manufacturing",
        33: "Durable Goods Manufacturing",
        42: "Wholesale Trade",
        44: "Retail Trade",
        45: "Retail Trade",
        48: "Transportation and Warehousing",
        49: "Transportation and Warehousing",
        51: "Information",
        52: "Finance and Insurance",
        53: "Real Estate and Rental and Leasing",
        54: "Professional and Business Services",
        55: "Professional and Business Services",
        56: "Administrative and support and waste management",
        61: "Private Educational Services",
        62: "Health Care and Social Assistance",
        71: "Arts, Entertainment, and Recreation",
        72: "Accommodation and Food Services",
        81: "Other Services",
    }

    # Most specific prefix wins; fall through to broader prefixes.
    for width, table in ((4, four_digit_map), (3, three_digit_map), (2, two_digit_map)):
        if len(digits) < width:
            continue
        label = table.get(int(digits[:width]))
        if label is not None:
            return label

    return "Unknown"



In [21]:
# Label every row with the industry category derived from its NAICS code.
df = df.assign(industry=df['NAICS'].map(map_naics_code_to_industry))

In [22]:
def map_to_jolts_industry(industry):
    """
    Translate a detailed industry label into its standardized JOLTS label.

    Two pairs of detailed labels collapse into one JOLTS label each:
    'Utilities' joins 'Transportation and Warehousing', and
    'Administrative and support and waste management' joins
    'Professional and Business Services'. Any label that is not listed,
    including 'Unknown' itself, yields 'Unknown'.
    """
    # NOTE(review): output labels are kept exactly as written (including the
    # 'Accomodation' spelling); presumably they must match labels used
    # elsewhere - confirm before normalizing.
    jolts_labels = dict([
        ('Accommodation and Food Services', 'Accomodation and food services'),
        ('Arts, Entertainment, and Recreation', 'Arts, entertainment, and recreation'),
        ('Construction', 'Construction'),
        ('Durable Goods Manufacturing', 'Durable goods manufacturing'),
        ('Finance and Insurance', 'Finance and insurance'),
        ('Health Care and Social Assistance', 'Healthcare and Social Assistance'),
        ('Information', 'Information'),
        ('Mining and Logging', 'Mining and logging'),
        ('Nondurable Goods Manufacturing', 'Nondurable goods manufacturing'),
        ('Other Services', 'Other services'),
        ('Private Educational Services', 'Private education services'),
        ('Professional and Business Services', 'Professional and business services'),
        ('Real Estate and Rental and Leasing', 'Real estate and rental and leasing'),
        ('Retail Trade', 'Retail trade'),
        ('Wholesale Trade', 'Wholesale trade'),
        # Merged categories
        ('Transportation and Warehousing', 'Transportation, warehousing, and utilities'),
        ('Utilities', 'Transportation, warehousing, and utilities'),
        ('Administrative and support and waste management', 'Professional and business services'),
        # Explicit pass-through for rows already flagged as unmapped
        ('Unknown', 'Unknown'),
    ])

    try:
        return jolts_labels[industry]
    except KeyError:
        return 'Unknown'


In [23]:
# Discard rows whose NAICS code could not be assigned to an industry.
df = df.loc[df['industry'].ne("Unknown")]

In [24]:
# Convert the detailed industry label into the standardized JOLTS label.
df = df.assign(jolts_industry=df['industry'].map(map_to_jolts_industry))


In [25]:
# Collapse to one row per (Year, JOLTS industry) holding the summed 'Value'.
df = df.groupby(['Year', 'jolts_industry'], as_index=False).agg(Value=('Value', 'sum'))

In [26]:
# Share of each industry in its year's total: the total is the sum of 'Value'
# over all rows of the same Year, broadcast back onto every row.
total_emp_per_year = df['Value'].groupby(df['Year']).transform('sum')
df = df.assign(emp_share=df['Value'].div(total_emp_per_year))


In [27]:
# Average each industry's yearly share over the years present in the frame.
df = df.groupby('jolts_industry', as_index=False).agg(emp_share=('emp_share', 'mean'))

In [29]:
# Write the per-industry shares to the temp directory, without the row index.
df.to_csv(
    f"{temp_dir}/industry_employment_shares.csv",
    index=False,
)