In [17]:
import os
import pandas as pd
from difflib import get_close_matches

# ----------------------
# Define normalization function
# ----------------------
def normalize(text):
    """Return a canonical form of ``text`` for fuzzy string comparison.

    The value is lower-cased, periods are replaced by spaces, and every run
    of whitespace is collapsed to a single space with the ends trimmed, so
    that e.g. ``"U.S. Army"`` becomes ``"u s army"`` rather than
    ``"u s  army"`` (the stray double space would lower similarity ratios).

    Parameters
    ----------
    text : str or any
        The value to normalize. ``None`` and float NaN (how pandas represents
        missing cells) normalize to ``""``; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The normalized text.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN; guard on float so exotic
        # objects with custom comparison semantics are never compared.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join() yields single-spaced text.
    return " ".join(text.lower().replace(".", " ").split())

# ----------------------
# Load O*NET entity CSVs
# ----------------------
onet_dir = "../data/o_net_files"

# Maps a lower-cased file stem (e.g. "skills" for "Skills.csv") to the
# DataFrame read from that CSV.
entity_dfs = {}

# os.listdir() returns entries in arbitrary, platform-dependent order. Sort
# the names so categories are always loaded (and later iterated) in the same
# order, which keeps downstream match lists and saved CSVs reproducible.
for filename in sorted(os.listdir(onet_dir)):
    if not filename.endswith(".csv"):
        continue
    category = os.path.splitext(filename)[0].lower()
    entity_dfs[category] = pd.read_csv(os.path.join(onet_dir, filename))

print("✅ Loaded O*NET entity files:", list(entity_dfs.keys()))

# ----------------------
# Load VOLCANO job_trait file
# ----------------------
# Location of the VOLCANO data and the name of its job-trait table.
volcano_dir = "../data/volcano"
job_trait_file = "Job_traits.csv"

# Read the job-trait CSV into a DataFrame.
job_traits_path = os.path.join(volcano_dir, job_trait_file)
job_traits_df = pd.read_csv(job_traits_path)

# ----------------------
# Map each trait to possible KSA matches
# ----------------------
from collections import defaultdict

# Parameters passed to difflib.get_close_matches for every category.
MAX_MATCHES_PER_CATEGORY = 5   # keep at most this many candidates per category
MATCH_CUTOFF = 0.8             # minimum similarity ratio (0..1) to count as a match

# Normalize each category's entity names once, up front, instead of redoing
# the same work for every trait inside the matching loop.
normalized_ksas_by_category = {}
for category, entity_df in entity_dfs.items():
    entity_columns = [col for col in entity_df.columns if "_entity" in col]
    if not entity_columns:
        # A file without an "*_entity*" column has nothing to match against;
        # skip it rather than failing with an IndexError.
        print(f"⚠️ Skipping '{category}': no '_entity' column found")
        continue
    normalized_ksas_by_category[category] = (
        entity_df[entity_columns[0]].dropna().apply(normalize).unique()
    )

# Distinct, non-missing trait labels to be matched.
unique_traits = job_traits_df["Job_Trait"].dropna().unique()

# Results and stats. trait_to_ksas is created here so the cell runs on a
# fresh kernel and re-running it does not accumulate stale entries.
trait_to_ksas = {}                          # trait -> [(normalized entity text, category), ...]
no_match_traits = []                        # traits with no match in any category
match_counts_by_category = defaultdict(int) # category -> total number of matches
multi_match_traits = {}                     # traits matched in more than one category

for trait in unique_traits:
    norm_trait = normalize(trait)
    matches = []

    for category, norm_ksas in normalized_ksas_by_category.items():
        close_matches = get_close_matches(
            norm_trait, norm_ksas, n=MAX_MATCHES_PER_CATEGORY, cutoff=MATCH_CUTOFF
        )
        for match in close_matches:
            matches.append((match, category))
            match_counts_by_category[category] += 1

    if not matches:
        no_match_traits.append(trait)
    elif len({matched_category for _, matched_category in matches}) > 1:
        multi_match_traits[trait] = matches

    trait_to_ksas[trait] = matches

# Report. The denominator is the number of distinct traits that were actually
# matched, not the number of rows in the table (a trait may appear on many rows).
print(f"\n📊 Traits with no match: {len(no_match_traits)} / {len(unique_traits)}")
print("📦 Match counts by category:")
for k, v in match_counts_by_category.items():
    print(f"   {k}: {v}")

print(f"\n🔁 Traits that matched multiple categories: {len(multi_match_traits)}")



✅ Loaded O*NET entity files: ['abilities', 'skills', 'work_activities', 'knowledge']

📊 Traits with no match: 2 / 120
📦 Match counts by category:
   abilities: 58
   skills: 45
   knowledge: 32

🔁 Traits that matched multiple categories: 4


In [19]:
# Flatten the trait -> [(entity text, category), ...] mapping into one record
# per (trait, match) pair so it can be stored as a flat table.
trait_rows = [
    {
        "trait_text": trait_name,
        "matched_entity_text": entity_text,
        "matched_category": entity_category,
    }
    for trait_name, trait_matches in trait_to_ksas.items()
    for entity_text, entity_category in trait_matches
]

# Persist the flattened links next to the other VOLCANO data files.
df_trait_links = pd.DataFrame(trait_rows)
df_trait_links.to_csv("../data/volcano/trait_to_ksas_mapping.csv", index=False)
print("✅ Saved trait-to-KSA matches to CSV.")


✅ Saved trait-to-KSA matches to CSV.


In [23]:
import pandas as pd
import os
from difflib import get_close_matches

# ----------- Load Files -----------
volcano_path = "../data/volcano/Occupations.csv"
onet_dir = "../data/o_net_files"

# Load the volcano occupations file
volcano_df = pd.read_csv(volcano_path)

# Collect (onetsoc_code, job_title) pairs from every O*NET CSV that has both
# columns. Files are visited in sorted order because os.listdir() order is
# arbitrary, and the order decides which SOC code wins when the same title
# appears with several codes.
onet_jobtitle_rows = []
for file in sorted(os.listdir(onet_dir)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(onet_dir, file))
        if "onetsoc_code" in df.columns and "job_title" in df.columns:
            onet_jobtitle_rows.append(df[["onetsoc_code", "job_title"]])

if not onet_jobtitle_rows:
    # pd.concat([]) would raise a ValueError with a generic message; raise the
    # same exception type with a message that names the actual problem.
    raise ValueError(
        f"No CSV in {onet_dir!r} has both 'onetsoc_code' and 'job_title' columns"
    )

jobtitles_df = pd.concat(onet_jobtitle_rows).drop_duplicates()

# ----------- Matching Logic -----------

# Build the normalized-title -> SOC code lookup once. Rows with a missing
# title are dropped (normalizing them is meaningless), and when several rows
# share a normalized title the first one is kept.
title_lookup = (
    jobtitles_df.dropna(subset=["job_title"])
    .assign(norm_title=lambda frame: frame["job_title"].apply(normalize))
    .drop_duplicates(subset="norm_title", keep="first")
    .set_index("norm_title")["onetsoc_code"]
)
norm_jobs = title_lookup.index.to_numpy()

matches = []

# Rows with a missing occupation name cannot be matched and are skipped.
for occ_name in volcano_df["Occupation"].dropna():
    norm_occ = normalize(occ_name)
    match = get_close_matches(norm_occ, norm_jobs, n=1, cutoff=0.85)

    if match:
        # NOTE: the stored title is the *normalized* job title, not the
        # original O*NET spelling.
        matched_title = match[0]
        matches.append({
            "occupation_name": occ_name,
            "matched_job_title": matched_title,
            "onetsoc_code": title_lookup.loc[matched_title]
        })

# Save output. Columns are given explicitly so the CSV keeps its header even
# when no occupation matched.
mapped_df = pd.DataFrame(
    matches, columns=["occupation_name", "matched_job_title", "onetsoc_code"]
)
output_path = "../data/volcano/occupation_to_jobtitle_mapping.csv"
mapped_df.to_csv(output_path, index=False)
print(f"✅ Mapped {len(mapped_df)} occupations to job titles. Saved to:\n{output_path}")



✅ Mapped 317 occupations to job titles. Saved to:
../data/volcano/occupation_to_jobtitle_mapping.csv


### Option: Filter the job zones to job titles from research

In [1]:
# import os
# import json

# # -----------------------------
# # Paths
# # -----------------------------
# volcano_dir = "../data/volcano"
# onet_dir = "../data/o_net_files"
# mapping_path = os.path.join(volcano_dir, "occupation_to_jobtitle_mapping.csv")
# output_path = os.path.join(onet_dir, "filtered_occupation_mapping.csv")
# jobtitle_json_path = os.path.join(onet_dir, "jobs_titles.json")

# # -----------------------------
# # Step 1: Load JSON and extract SOC codes
# # -----------------------------
# with open(jobtitle_json_path, "r") as f:
#     jobtitle_map = json.load(f)

# focus_soc_codes = set([v[1] for v in jobtitle_map.values()])
# print(f"✅ Found {len(focus_soc_codes)} SOC codes in job_titles.json")

# # -----------------------------
# # Step 2: Load full occupation mapping
# # -----------------------------
# df_full = pd.read_csv(mapping_path)

# # -----------------------------
# # Step 3: Filter by SOC codes
# # -----------------------------
# df_filtered = df_full[df_full["onetsoc_code"].isin(focus_soc_codes)]

# # -----------------------------
# # Step 4: Save filtered version
# # -----------------------------
# df_filtered.to_csv(output_path, index=False)
# print(f"✅ Saved filtered mapping with {len(df_filtered)} rows to:\n{output_path}")
