In [54]:
import numpy as np
import pandas as pd 
from sklearn.linear_model import LinearRegression
import json 

In [55]:

# Directory locations used by the notebook.
# NOTE(review): this is an absolute, machine-specific location; to run the
# notebook elsewhere only `project_root` needs to change.
project_root = "C:/Users/singhy/Desktop/Chicago/cps_data/inflation"
data_dir = f"{project_root}/raw_data"
output_dir = f"{project_root}/output"
temp_dir = f"{project_root}/temp"

In [56]:
# Load the industry-level JOLTS workbook; the first two rows of the sheet are skipped
jolts_workbook = f"{data_dir}/JOLTS/jolts_industry_level.xlsx"
df_raw = pd.read_excel(jolts_workbook, skiprows=2)

  warn("Workbook contains no default style, apply openpyxl's default")


In [57]:

# Use the first row of df_raw as the column labels and keep only the rows below it
new_header = df_raw.iloc[0]
df_clean = df_raw.iloc[1:]
df_clean.columns = new_header

# Label the leading column "seriesid", then force every column label to str
first_label = df_clean.columns[0]
df_clean = df_clean.rename(columns={first_label: "seriesid"})
df_clean.columns = df_clean.columns.astype(str)

In [58]:
# Reshape from wide (one column per date label) to long (one row per series and date)
df_long = df_clean.melt(id_vars=["seriesid"], var_name="date", value_name="value")

# Date labels are expected to contain a line break (e.g. "Jan\n2001" -- TODO confirm
# against the workbook); flatten it so the "%b %Y" format can parse them.
# Labels that do not parse become NaT and are dropped below.
df_long["date"] = df_long["date"].str.replace("\n", " ", regex=False).str.strip()
df_long["date"] = pd.to_datetime(df_long["date"], format="%b %Y", errors="coerce")

# The header row was read in as a data row, so the value columns are likely
# object dtype. Coerce to numeric so that later sums/means are true numeric
# aggregations; cells that are not numbers become NaN and are dropped below.
df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")

df_long = df_long.dropna(subset=["date", "value"])
df_long = df_long.sort_values(by=["seriesid", "date"]).reset_index(drop=True)

# Extract codes from seriesid: characters 3..10 give the industry code,
# the last three characters give the flow type code
df_long["industry_code"] = df_long["seriesid"].str[3:11]
df_long["flow_type_code"] = df_long["seriesid"].str[-3:]

# Distinct codes present in the data (useful for checking the lookup tables)
industry_codes = df_long["industry_code"].unique()
flow_type_codes = df_long["flow_type_code"].unique()

# Define flow type mapping.
# Keys are the last three characters of a series id (the flow_type_code column).
flow_type_map = {
    "HIL": "Hires",
    "QUL": "Quits",
    "TSL": "Total Separations",
    "JOL": "Job Openings",
    "LDL": "Layoffs & Discharges",
    # BLS data element "UO" is the unemployed-persons-per-job-opening ratio,
    # not a separations measure; other separations are the "OS" element below.
    "UOL": "Unemployed Persons per Job Opening",
    "OSL": "Other Separations (Residual)"
}

# Define industry code mapping based on BLS JOLTS industry categories.
# Keys are the 8-character codes sliced from the series id (the industry_code
# column). Labels are matched verbatim by later filters, so their spelling
# (including "Accomodation") is kept exactly as is.
industry_map = {
    "00000000": "Total nonfarm",
    "10000000": "Total private",
    "11009900": "Mining and logging",
    "23000000": "Construction",
    "30000000": "Manufacturing",
    "32000000": "Durable goods manufacturing",
    "34000000": "Nondurable goods manufacturing",
    "40000000": "Trade, transportation, and utilities",
    "42000000": "Wholesale trade",
    "44000000": "Retail trade",
    "48009900": "Transportation, warehousing, and utilities",
    "51000000": "Information",
    "51009900": "Financial activities",
    "52000000": "Finance and insurance",
    "53000000": "Real estate and rental and leasing",
    "54009900": "Professional and business services",
    "60000000": "Private education and health services", 
    "61000000": "Private education services",
    "62000000": "Healthcare and Social Assistance",
    "70000000": "Leisure and Hospitality",
    "71000000": "Arts, entertainment, and recreation",
    "72000000": "Accomodation and food services",
    "81000000": "Other services",
    "90000000": "Government",
    "91000000": "Federal government",
    "92000000": "State and local government",
    "92300000": "State and local government education",
    "92900000": "State and local government, excluding education"
}

# Attach readable labels for both codes (codes absent from a lookup table
# become NaN) and rename the measurement column from "value" to "level"
df_long = df_long.assign(
    flow_type=df_long["flow_type_code"].map(flow_type_map),
    jolts_industry=df_long["industry_code"].map(industry_map),
).rename(columns={"value": "level"})


In [101]:
# Continue the analysis under the name `df`; this binds a second name to the
# same object as df_long (no copy is made here)
df = df_long

In [102]:
# Industry labels retained for the analysis. Each entry must equal a
# `jolts_industry` label exactly, since the filter below is an exact match.
industries_to_keep = [
    "Accomodation and food services",
    "Arts, entertainment, and recreation",
    "Construction",
    "Durable goods manufacturing",
    "Finance and insurance",
    "Healthcare and Social Assistance",
    "Information",
    "Mining and logging",
    "Nondurable goods manufacturing",
    "Other services",
    "Private education services",
    "Professional and business services",
    "Real estate and rental and leasing",
    "Retail trade",
    "Transportation, warehousing, and utilities",
    "Wholesale trade"
]

# Keep only the rows for those industries, as an independent copy
keep_mask = df['jolts_industry'].isin(industries_to_keep)
df = df[keep_mask].copy()


In [103]:
# Add the calendar year of each observation as its own column
df = df.assign(year=df['date'].dt.year)

In [104]:
# Keep only the rows whose flow type is job openings
is_job_openings = df['flow_type'] == 'Job Openings'
df = df[is_job_openings].copy()

In [105]:
# Sum the levels within each (year, industry) pair; the grouping keys stay as columns
group_keys = ['year', 'jolts_industry']
df = df.groupby(group_keys, as_index=False)['level'].sum()


In [106]:
# Keywords (matched case-insensitively) that mark industries to drop
exclude_keywords = ['government', 'Total']

# Filter to keep only industries that do NOT contain any exclude keyword.
# The keywords are joined into one regex alternation ("government|Total").
exclude_pattern = '|'.join(exclude_keywords)
is_excluded = df['jolts_industry'].str.contains(exclude_pattern, case=False)

# Take an explicit copy: later cells add columns to this frame, and writing to
# an un-copied filter result can raise SettingWithCopyWarning
df = df[~is_excluded].copy()

In [107]:
# Total level across all rows of the same year, repeated on each of that year's rows
df = df.assign(tot=df.groupby('year')['level'].transform('sum'))

In [108]:
# Each row's share of its year's total level
df = df.assign(vac_share=df['level'] / df['tot'])

In [109]:
# Restrict the sample to years up to and including the cutoff year
last_year = 2019
df = df[df['year'] <= last_year]

In [110]:
# Average each industry's yearly share over the years that remain in df
shares_by_industry = df.groupby('jolts_industry', as_index=False)['vac_share']
industry_avg = shares_by_industry.mean()


In [113]:
# Display the table of average vacancy shares by industry
industry_avg

Unnamed: 0,jolts_industry,vac_share
0,Accomodation and food services,0.133022
1,"Arts, entertainment, and recreation",0.01766
2,Construction,0.041271
3,Durable goods manufacturing,0.040687
4,Finance and insurance,0.047336
5,Healthcare and Social Assistance,0.184317
6,Information,0.019554
7,Mining and logging,0.004022
8,Nondurable goods manufacturing,0.027934
9,Other services,0.041386


In [114]:
# Write the per-industry average shares to the temp directory, without the index column
shares_csv = f"{temp_dir}/industry_vacancy_shares.csv"
industry_avg.to_csv(shares_csv, index=False)