In [None]:
# Creating taxonomy with jobs from O*Net
import os
import json
import pandas as pd
import openai
import tempfile
import shutil
import time

# -------------------------------
# Config
# -------------------------------
# Credentials / endpoint for the OpenAI-compatible API. They are blank in the
# notebook and have to be filled in before the cell is run.
API_KEY = ""
BASE_URL = ""
MODEL_NAME = "gpt-5"

INPUT_FILE = "job_tasks.csv"
OUTPUT_FILE = "job_tasks_with_skills.csv"
TAXONOMY_FILE = "taxonomy.json"

BATCH_SIZE = 20   # number of tasks sent to the model per request
MAX_RETRIES = 3   # attempts per batch before the batch is given up

# -------------------------------
# OpenAI Client
# -------------------------------
client = openai.OpenAI(api_key=API_KEY, base_url=BASE_URL)

# -------------------------------
# Load CSV
# -------------------------------
# NOTE(review): the run always starts from INPUT_FILE while progress is saved
# to OUTPUT_FILE, so earlier progress is only reused if OUTPUT_FILE is fed back
# in as INPUT_FILE - confirm this is the intended workflow.
df = pd.read_csv(INPUT_FILE)

# Add a persistent unique TaskID if not already there
if "TaskID" not in df.columns:
    df.insert(0, "TaskID", range(1, len(df) + 1))

# Ensure required columns exist, and say which ones are absent so the input
# file can be fixed without guessing.
required = {"Task", "Automation Desire Rating", "Job Security Rating", "Enjoyment Rating", "Occupation (O*NET-SOC Title)"}
missing = sorted(required - set(df.columns))
if missing:
    raise ValueError(f"CSV must contain columns: {required} (missing: {missing})")

# Add new column for skills if missing
if "generic_skill" not in df.columns:
    df["generic_skill"] = None

# -------------------------------
# Load or init taxonomy
# -------------------------------
if os.path.exists(TAXONOMY_FILE):
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(TAXONOMY_FILE, "r", encoding="utf-8") as f:
        taxonomy = json.load(f)
else:
    taxonomy = {"root": {}}
# -------------------------------
# Utility: atomic file write
# -------------------------------
def atomic_write_json(data, filename):
    """Write ``data`` as indented JSON to ``filename`` via a temp file + rename.

    The temporary file is created in the destination directory so that the
    final ``os.replace`` is a same-filesystem rename; a temp file in the
    system temp directory may live on another filesystem, where a move
    degrades to copy + delete and can leave a half-written target.

    Args:
        data: JSON-serialisable object.
        filename: Destination path; replaced only after the dump succeeded.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable (target left untouched).
        OSError: If the temp file cannot be created or renamed.
    """
    target_dir = os.path.dirname(os.path.abspath(filename))
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as tmp:
            json.dump(data, tmp, indent=2)
        os.replace(tmp_path, filename)
    finally:
        # After a successful replace the temp name is gone; on any failure
        # remove the leftover so repeated runs do not litter the directory.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def atomic_write_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV (no index) via a temp file + rename.

    The temp file is created next to the destination so ``os.replace`` is a
    same-filesystem rename, and its handle is closed before pandas writes to
    it by name (writing to a still-open temp file fails on Windows).

    Args:
        df: pandas DataFrame to serialise.
        filename: Destination path; replaced only after the write succeeded.

    Raises:
        OSError: If the temp file cannot be created, written or renamed.
    """
    target_dir = os.path.dirname(os.path.abspath(filename))
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".csv")
    os.close(fd)  # pandas re-opens the path itself
    try:
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, filename)
    finally:
        # Only present if something failed before the rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# -------------------------------
# Prompt function
# -------------------------------
def _parse_skill_lines(skills_section):
    """Turn the body of a ``<skills>`` section into ``{task_id: "skill (path)"}``.

    Lines are expected to look like ``- ID:12 → Generic skill (path)``.
    Lines that do not match, or whose ID is not an integer, are ignored.
    """
    skills_map = {}
    for raw_line in skills_section.splitlines():
        # Remove only the leading list bullet. Hyphens inside the skill text
        # or the taxonomy path (e.g. "Trouble-shoot") must be preserved.
        line = raw_line.strip().lstrip("-").strip()
        if not line.startswith("ID:"):
            continue
        id_part, arrow, skill_part = line.partition("→")
        if not arrow:
            continue
        try:
            task_id = int(id_part.split(":", 1)[1].strip())
        except ValueError:
            continue
        skills_map[task_id] = skill_part.strip()
    return skills_map


def classify_tasks(batch, taxonomy):
    """Send a batch of tasks to the LLM and return skill mappings + updated taxonomy.

    Args:
        batch: DataFrame rows providing ``TaskID``, ``Task`` and
            ``Occupation (O*NET-SOC Title)``.
        taxonomy: Current taxonomy (JSON-serialisable dict) shown to the model.

    Returns:
        ``(skills_map, taxonomy)`` where ``skills_map`` maps integer TaskIDs to
        the text the model placed after the arrow, and ``taxonomy`` is the
        model's updated taxonomy when it could be parsed as a JSON object,
        otherwise the taxonomy passed in. After ``MAX_RETRIES`` failed
        attempts ``({}, taxonomy)`` is returned.
    """

    tasks_str = "\n".join(
        [f"ID:{row['TaskID']} - {row['Task']} ({row['Occupation (O*NET-SOC Title)']})"
         for _, row in batch.iterrows()]
    )

    prompt = f"""
You are an expert in occupational task analysis.

Here is the current taxonomy (JSON):
<taxonomy>
{json.dumps(taxonomy, indent=2)}
</taxonomy>

Here are new job tasks with job titles and IDs:
{tasks_str}

Instructions:
1. For each task, provide a generic skill description that captures the core action, while preserving necessary domain nuance.
   - Example: "Maintain records of drilling operations" → "Maintain technical records"
   - Example: "Review legal case documents" → "Review long formal documents"
2. Place each skill under the most relevant existing category in the taxonomy.
3. If no suitable category exists, add a new subcategory under the most relevant parent.
4. Output two sections:
   <skills>
   - ID:xxx → Generic skill (taxonomy_path)
   </skills>

   <updated_taxonomy>
   {{...json updated taxonomy...}}
   </updated_taxonomy>
"""

    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are an expert at building skill taxonomies."},
                    {"role": "user", "content": prompt},
                ]
            )

            content = response.choices[0].message.content if response.choices else ""
            if not content:
                raise ValueError("Empty response")
            print(content)

            # ----------------- Parse skills -----------------
            skills_map = {}
            if "<skills>" in content and "</skills>" in content:
                skills_section = content.split("<skills>")[1].split("</skills>")[0].strip()
                skills_map = _parse_skill_lines(skills_section)

            # ----------------- Parse taxonomy -----------------
            updated_taxonomy = taxonomy  # keep last good taxonomy unless a valid one arrives
            if "<updated_taxonomy>" in content and "</updated_taxonomy>" in content:
                taxonomy_section = content.split("<updated_taxonomy>")[1].split("</updated_taxonomy>")[0].strip()
                try:
                    candidate = json.loads(taxonomy_section)
                except ValueError as e:
                    print("Warning: could not parse taxonomy JSON:", e)
                else:
                    # A list/string/number would silently destroy the taxonomy.
                    if isinstance(candidate, dict):
                        updated_taxonomy = candidate
                    else:
                        print("Warning: could not parse taxonomy JSON:", "top-level value is not an object")

            return skills_map, updated_taxonomy

        except Exception as e:
            # Retry boundary: API errors and malformed responses are both retried.
            print(f"Error on attempt {attempt+1}: {e}")
            if attempt + 1 < MAX_RETRIES:
                time.sleep(2 ** attempt)  # exponential backoff

    print("❌ Failed after max retries")
    return {}, taxonomy

# -------------------------------
# Process in batches
# -------------------------------
# Only rows that still lack a skill label are sent to the model.
needs_skill = df["generic_skill"].isna()
unprocessed = df.loc[needs_skill].copy()

for start in range(0, len(unprocessed), BATCH_SIZE):
    batch = unprocessed.iloc[start:start + BATCH_SIZE]
    last_position = start + len(batch) - 1
    print(f"Processing {start}–{last_position}...")

    # The taxonomy is threaded through every call so later batches see the
    # categories added by earlier ones.
    skills_map, taxonomy = classify_tasks(batch, taxonomy)

    # Copy each returned skill onto the matching TaskID row of the full frame.
    for task_id, skill in skills_map.items():
        row_mask = df["TaskID"] == task_id
        df.loc[row_mask, "generic_skill"] = skill

    # Persist both artefacts after every batch.
    atomic_write_csv(df, OUTPUT_FILE)
    atomic_write_json(taxonomy, TAXONOMY_FILE)

print("✅ Done! Results saved.")

In [None]:
# Using benchmark tasks, map them to the common taxonomy
import ast
import json
import math
import os
import random
import shutil
import tempfile
from collections import Counter

import openai

# -------------------------------
# Config
# -------------------------------
# Credentials / endpoint for the OpenAI-compatible API (blank here; must be set before running).
API_KEY = ""
BASE_URL = ""
MODEL_NAME = "gpt-5"

# One "<task_id>: <instruction>" entry per line.
BENCHMARK_FILE = "biocoder_tasks.txt"
# Taxonomy that is read at start and overwritten with the task-annotated version at the end.
TAXONOMY_FILE = "../taxonomy_restructured.json"
# One-time copy of TAXONOMY_FILE, created only if it does not exist yet.
TAXONOMY_BACKUP_FILE = "../taxonomy_restructured.json.backup"
OUTPUT_FILE = "biocoder_tasks_mapping_re.json"
# Written after every batch so an interrupted run can resume.
CHECKPOINT_FILE = "biocoder_tasks_progress_re.json"

# Tasks sent to the model per request.
BATCH_SIZE = 5
# Stopping rules are only evaluated once at least this many tasks are mapped.
INITIAL_SAMPLE_MIN = 20
# Stop when the estimated probability of discovering a new leaf drops to this value or below.
ALPHA = 0.1
# Stop when observed leaves / Chao1 estimate reaches this fraction.
DESIRED_COVERAGE = 0.90
# Stop after this many consecutive batches without a new leaf.
MAX_CONSEC_NO_NEW = 10
# Seed for the shuffle that fixes the sampling order.
SAMPLE_SEED = 42

client = openai.OpenAI(api_key=API_KEY, base_url=BASE_URL)

# -------------------------------
# Helper functions
# -------------------------------
def atomic_write_json(data, path):
    """Safely write JSON with temp + replace.

    The temp file is created in the destination directory so the final
    ``os.replace`` is a same-filesystem rename; a temp file in the system
    temp directory may sit on a different filesystem, where a move becomes
    copy + delete and is no longer atomic.

    Args:
        data: JSON-serialisable object.
        path: Destination path; replaced only after the dump succeeded.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable (target left untouched).
        OSError: If the temp file cannot be created or renamed.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".json.tmp")
    try:
        with os.fdopen(fd, "w") as tf:
            json.dump(data, tf, indent=2)
        os.replace(tmp, path)
    finally:
        # Still present only when the dump or the rename failed.
        if os.path.exists(tmp):
            os.remove(tmp)

def chao1(S_obs, freq_values):
    """Estimate total richness from observed richness and per-leaf counts.

    Args:
        S_obs: Number of distinct leaves observed so far.
        freq_values: Sequence with one observation count per observed leaf.

    Returns:
        ``inf`` when fewer than two leaves have been observed, ``S_obs`` when
        no leaf was seen exactly once, otherwise ``S_obs`` plus a correction
        derived from the number of leaves seen once and twice.
    """
    if len(freq_values) < 2:
        return float('inf')

    singletons = sum(count == 1 for count in freq_values)
    doubletons = sum(count == 2 for count in freq_values)

    if singletons == 0:
        return S_obs
    if doubletons:
        return S_obs + singletons * singletons / (2 * doubletons)
    # No doubletons: fall back to the f1*(f1-1)/2 form, or +f1 for a lone singleton.
    if singletons > 1:
        return S_obs + singletons * (singletons - 1) / 2
    return S_obs + singletons

# -------------------------------
# Load benchmark tasks
# -------------------------------
# Read explicitly as UTF-8 so the result does not depend on the platform
# locale (the similarity cell reads the same *_tasks.txt files as UTF-8).
with open(BENCHMARK_FILE, "r", encoding="utf-8") as f:
    lines = [l.strip() for l in f if l.strip()]

# Each usable line has the form "<task_id>: <instruction>"; lines without a
# colon are ignored.
benchmark_tasks = []
for line in lines:
    if ":" in line:
        task_id, instruction = line.split(":", 1)
        benchmark_tasks.append({
            "benchmark_task_id": task_id.strip(),
            "instruction": instruction.strip()
        })

# -------------------------------
# Load taxonomy
# -------------------------------
# Keep a pristine copy the first time, because TAXONOMY_FILE is overwritten
# at the end of the run.
if not os.path.exists(TAXONOMY_BACKUP_FILE) and os.path.exists(TAXONOMY_FILE):
    print(f"[BACKUP] Creating backup: {TAXONOMY_BACKUP_FILE}")
    shutil.copy2(TAXONOMY_FILE, TAXONOMY_BACKUP_FILE)

with open(TAXONOMY_FILE, "r", encoding="utf-8") as f:
    taxonomy = json.load(f)

def init_leaf_dict(node):
    """Replace every empty-dict child below ``node`` with ``{"tasks": []}``.

    The structure is modified in place; non-dict values are left alone and
    nothing is returned.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if not isinstance(current, dict):
            continue
        # Snapshot the keys because entries are reassigned while walking.
        for key in list(current):
            child = current[key]
            if child == {}:
                current[key] = {"tasks": []}
            else:
                pending.append(child)

init_leaf_dict(taxonomy["root"])

# -------------------------------
# State (resume if checkpoint exists)
# -------------------------------
def _leaf_key_from_checkpoint(raw_key):
    """Recover a leaf path tuple from a checkpoint key.

    The checkpoint stores each key as ``str(list(path))`` (JSON keys must be
    strings). Calling ``tuple()`` on that string would yield a tuple of single
    characters, so previously discovered leaves would never match again after
    a resume; the string has to be parsed back into a list first.
    """
    return tuple(ast.literal_eval(raw_key))


if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        state = json.load(f)
    remaining_indices = state["remaining_indices"]
    discovered_leaves = Counter(
        {_leaf_key_from_checkpoint(k): v for k, v in state["discovered_leaves"].items()}
    )
    consec_no_new = state["consec_no_new"]
    mapping_log = state["mapping_log"]
    # The checkpointed taxonomy already carries the tasks attached so far.
    taxonomy = state["taxonomy"]
    print(f"[RESUME] Sampled: {len(mapping_log)}, Remaining: {len(remaining_indices)}, Unique leaves: {len(discovered_leaves)}")
else:
    # Fixed seed keeps the sampling order reproducible across fresh runs.
    random.seed(SAMPLE_SEED)
    remaining_indices = list(range(len(benchmark_tasks)))
    random.shuffle(remaining_indices)
    discovered_leaves = Counter()
    consec_no_new = 0
    mapping_log = {}
    print(f"[START] Total tasks: {len(benchmark_tasks)}")

# -------------------------------
# LLM mapping function
# -------------------------------
def _parse_mapping_value(raw_value):
    """Convert the text after ``task_id:`` into a list of taxonomy path strings.

    ``N/A`` (quoted or not) yields an empty list, a single string yields a
    one-element list, and ``N/A`` entries inside a list are dropped.

    Raises:
        ValueError: If the text is not valid JSON or not a list/string.
    """
    text = raw_value.strip().rstrip(",").strip()
    if text.strip("\"'").upper() == "N/A":
        return []
    try:
        parsed = json.loads(text)
    except ValueError:
        # Tolerate Python-style single-quoted lists. Only tried as a fallback
        # so apostrophes inside valid JSON strings are not corrupted.
        parsed = json.loads(text.replace("'", '"'))
    if isinstance(parsed, str):
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError(f"expected a list of paths, got {type(parsed).__name__}")
    return [p for p in parsed if isinstance(p, str) and p.strip().upper() != "N/A"]


def _parse_task_mappings(mapping_section):
    """Parse the body of ``<task_mappings>`` into ``{task_id: [path, ...]}``.

    A line starting with ``-`` and containing ``:`` opens a new entry; any
    other non-empty line is treated as a continuation of the current value.
    Entries whose value cannot be parsed are reported and skipped.
    """
    entries = []  # [task_id, value_text] pairs in order of appearance
    for raw_line in mapping_section.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("-") and ":" in line:
            task_id, value = line[1:].split(":", 1)
            entries.append([task_id.strip().strip("\"'`"), value.strip()])
        elif entries:
            entries[-1][1] += " " + line

    task_map = {}
    for task_id, value in entries:
        try:
            task_map[task_id] = _parse_mapping_value(value)
        except ValueError as ex:
            print(f"⚠️ Failed parsing task {task_id}: {ex}")
    return task_map


def map_tasks_to_taxonomy(tasks_batch, taxonomy):
    """Ask the LLM to assign taxonomy leaves to a batch of benchmark tasks.

    Args:
        tasks_batch: List of dicts with ``benchmark_task_id`` and ``instruction``.
        taxonomy: Current taxonomy (JSON-serialisable dict) shown to the model.

    Returns:
        Dict mapping each task id found in the ``<task_mappings>`` section to
        a list of path strings (empty for ``N/A``). Tasks whose line could not
        be parsed are absent; the dict is empty if the section is missing.
    """
    tasks_str = "\n".join(
        [f"{i+1}. [{t['benchmark_task_id']}] {t['instruction']}" for i, t in enumerate(tasks_batch)]
    )

    prompt = f"""
You are an expert occupational skill mapper.

Here is the current taxonomy (JSON):
<taxonomy>
{json.dumps(taxonomy, indent=2)}
</taxonomy>

You are given the following benchmark tasks:
{tasks_str}

Instructions:
- For each benchmark task, assign one or more relevant leaves (skills) from the taxonomy. The skills must be directly the function of the task as described by the task description.
- Do NOT extrapolate the task description to related skills, but map exactly the skills required to the task.
- If there is no direct skill that is related to the task, or if the task is not sufficiently representative of the skill, output "N/A" for the task.
- Multiple leaves can be assigned per task.
- Only use exact existing categories, do NOT invent new ones.
- Output must include:
<task_mappings>
- benchmark_task_id: [ "root/Category/Subcategory", ... ]
</task_mappings>
"""

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are an expert in skill mapping and taxonomy assignment."},
            {"role": "user", "content": prompt},
        ]
    )

    # message.content can be None; normalise so the tag checks below cannot crash.
    content = (response.choices[0].message.content or "") if response.choices else ""
    print(content)

    if "<task_mappings>" not in content or "</task_mappings>" not in content:
        return {}

    mapping_section = content.split("<task_mappings>")[1].split("</task_mappings>")[0].strip()
    return _parse_task_mappings(mapping_section)

# -------------------------------
# Helper to check path exists
# -------------------------------
def path_exists(taxonomy, path):
    """Return True if ``path`` leads from ``taxonomy["root"]`` to a task-bearing node.

    Args:
        taxonomy: Dict with a ``"root"`` entry holding the nested categories.
        path: Iterable of category names (without the leading ``root``).

    Returns:
        True only when every component resolves through dict nodes and the
        final node is a dict containing a ``"tasks"`` key. Paths that wander
        into a non-dict value (for example a ``tasks`` list) yield False
        instead of raising ``TypeError``.
    """
    try:
        node = taxonomy["root"]
        for part in path:
            if not isinstance(node, dict):
                return False
            node = node[part]
    except KeyError:
        return False
    return isinstance(node, dict) and "tasks" in node

# -------------------------------
# Normalize and clean path
# -------------------------------
def normalize_path(path_str):
    """Reduce a model-supplied path string to a tuple of category names.

    Surrounding whitespace and slashes are trimmed, then a leading ``root/``
    and a trailing ``/tasks`` are dropped (in that order).

    Returns:
        Tuple of the ``/``-separated components, or ``None`` when nothing is
        left after trimming.
    """
    root_prefix = "root/"
    tasks_suffix = "/tasks"

    cleaned = path_str.strip().strip('/')
    if cleaned.startswith(root_prefix):
        cleaned = cleaned[len(root_prefix):]
    if cleaned.endswith(tasks_suffix):
        cleaned = cleaned[:-len(tasks_suffix)]

    return tuple(cleaned.split("/")) if cleaned else None

# -------------------------------
# Sampling loop
# -------------------------------
print("\n" + "="*80)
print("BATCH SAMPLING WITH COVERAGE-BASED STOPPING")
print("="*80)


def _sampling_stats(n_sampled, S_obs, freq_values):
    """Return ``(p_new, S_chao, coverage)`` for the leaves discovered so far.

    ``p_new`` is the share of sampled tasks that produced a once-seen leaf,
    ``S_chao`` the Chao1 richness estimate and ``coverage`` the ratio
    ``S_obs / S_chao`` (0 while the estimate is infinite or zero).
    """
    p_new = (sum(1 for v in freq_values if v == 1) / n_sampled) if n_sampled > 0 else 1
    S_chao = chao1(S_obs, freq_values) if n_sampled > 0 else float("inf")
    coverage = S_obs / S_chao if S_chao and math.isfinite(S_chao) else 0
    return p_new, S_chao, coverage


# Seed the statistics from the current (possibly resumed) state so the final
# report below is defined even when there is nothing left to sample.
n_sampled = len(mapping_log)
S_obs = len(discovered_leaves)
freq_values = list(discovered_leaves.values())
p_new, S_chao, coverage = _sampling_stats(n_sampled, S_obs, freq_values)

while remaining_indices:
    # Take the next batch off the front of the shuffled queue (in place, so a
    # checkpoint written later reflects what is really left).
    batch_indices = remaining_indices[:BATCH_SIZE]
    del remaining_indices[:BATCH_SIZE]
    batch_tasks = [benchmark_tasks[i] for i in batch_indices]

    print(f"\n[BATCH] Processing {len(batch_tasks)} tasks...")
    task_map = map_tasks_to_taxonomy(batch_tasks, taxonomy)

    new_leaf_discovered = False
    batch_new_leaves = []

    for task_id, paths in task_map.items():
        # Guard against a bare string (which would be iterated character by
        # character) or any other non-list value.
        if isinstance(paths, str):
            paths = [] if paths.strip().upper() == "N/A" else [paths]
        elif not isinstance(paths, list):
            paths = []
        mapping_log[task_id] = paths

        for path_str in paths:
            if not isinstance(path_str, str):
                print(f"⚠️ Invalid path suggested by LLM: {path_str}")
                continue

            path = normalize_path(path_str)

            if path is None:
                print(f"⚠️ Skipping empty path for task {task_id}")
                continue

            if path_exists(taxonomy, path):
                # Navigate to node and add task
                node = taxonomy["root"]
                for p in path:
                    node = node[p]

                if task_id not in node["tasks"]:
                    node["tasks"].append(task_id)

                # Track leaf discovery
                if path not in discovered_leaves:
                    discovered_leaves[path] = 1
                    new_leaf_discovered = True
                    batch_new_leaves.append(path)
                else:
                    discovered_leaves[path] += 1
            else:
                print(f"⚠️ Invalid path suggested by LLM: {path_str}")

    # Update consecutive no-new counter
    if new_leaf_discovered:
        consec_no_new = 0
        print(f"✅ Discovered {len(batch_new_leaves)} new leaf/leaves:")
        for leaf in batch_new_leaves:
            print(f"  - {'/'.join(leaf)}")
    else:
        consec_no_new += 1
        print(f"✅ No new leaves discovered ({consec_no_new} consecutive batches)")

    # Calculate statistics
    n_sampled = len(mapping_log)
    S_obs = len(discovered_leaves)
    freq_values = list(discovered_leaves.values())
    p_new, S_chao, coverage = _sampling_stats(n_sampled, S_obs, freq_values)

    print(f"\n[STATS] Sampled: {n_sampled}, Unique leaves: {S_obs}, Chao1: {S_chao:.1f}, Coverage: {coverage:.2%}, P(new): {p_new:.3f}")

    # Stopping conditions (only considered once the minimum sample is reached)
    stop_reason = None
    if n_sampled >= INITIAL_SAMPLE_MIN:
        if p_new <= ALPHA:
            stop_reason = f"Low new discovery probability: {p_new:.3f} <= {ALPHA}"
        elif coverage >= DESIRED_COVERAGE and math.isfinite(S_chao):
            stop_reason = f"Desired coverage reached: {coverage:.2%} >= {DESIRED_COVERAGE:.2%}"
        elif consec_no_new >= MAX_CONSEC_NO_NEW:
            stop_reason = f"No new leaves for {consec_no_new} consecutive batches"
    else:
        print(f"[INFO] Continuing to reach minimum sample size ({n_sampled}/{INITIAL_SAMPLE_MIN})")

    # Save checkpoint (tuple keys are stored as str(list(...)) because JSON
    # object keys must be strings)
    atomic_write_json({
        "remaining_indices": remaining_indices,
        "discovered_leaves": {str(list(k)): v for k, v in discovered_leaves.items()},
        "consec_no_new": consec_no_new,
        "mapping_log": mapping_log,
        "taxonomy": taxonomy
    }, CHECKPOINT_FILE)

    if stop_reason:
        print(f"\n[STOPPING] {stop_reason}")
        break

# -------------------------------
# Final save
# -------------------------------
atomic_write_json({
    "taxonomy": taxonomy,
    "mapping_log": mapping_log,
    "statistics": {
        "total_sampled": len(mapping_log),
        "unique_leaves": len(discovered_leaves),
        "chao1_estimate": S_chao if math.isfinite(S_chao) else None,
        "coverage": coverage
    }
}, OUTPUT_FILE)

# The source taxonomy is overwritten with the task-annotated version.
atomic_write_json(taxonomy, TAXONOMY_FILE)

print("\n" + "="*80)
print("FINAL SUMMARY:")
print(f"  Tasks sampled: {len(mapping_log)}/{len(benchmark_tasks)}")
print(f"  Unique leaves discovered: {len(discovered_leaves)}")
print(f"  Chao1 estimate: {S_chao:.1f}")
print(f"  Coverage: {coverage:.2%}")
print(f"  Mapping saved to: {OUTPUT_FILE}")
print(f"  Updated taxonomy: {TAXONOMY_FILE}")
print(f"  Checkpoint: {CHECKPOINT_FILE}")
print("="*80)

In [None]:
# Merging jobs and tasks into one taxonomy
import os
import json
import csv
from collections import defaultdict

# -------------------------------
# Config
# -------------------------------
# CSV of job tasks; read below with the columns TaskID, Task, occupation, the three ratings and generic_skill.
JOB_SKILLS_FILE = "../job_tasks_with_skills.csv"
# Taxonomy JSON that the jobs are attached to.
TAXONOMY_FILE = "taxonomy_with_instructions.json"
# JSON whose optional "task_mapping" entry remaps old paths to new ones.
TAXONOMY_INFO_FILE = "../taxonomy_restructure_info.json"
# Destination for the taxonomy after jobs were added.
OUTPUT_FILE = "taxonomy_with_jobs.json"

# -------------------------------
# Helper functions
# -------------------------------
def load_json(path):
    """Read and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 (the JSON interchange encoding) rather than
    the platform default, matching how the CSV in this cell is opened.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(data, path):
    """Serialise ``data`` as 2-space-indented JSON into the file at ``path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(path, "w") as out_file:
        json.dump(data, fp=out_file, indent=2)

def get_node(taxonomy, path_str):
    """Navigate to a node in taxonomy given a path string.

    Args:
        taxonomy: Dict with a ``"root"`` entry holding the nested categories.
        path_str: ``/``-separated path that must start with ``root``.

    Returns:
        The dict node the path points to, or ``None`` when the path does not
        start with ``root``, a component is missing, or the path runs into a
        non-dict value (for example a ``tasks``/``jobs`` list) - callers treat
        the result as a dict node, so a list is never handed back.

    Raises:
        KeyError: If ``taxonomy`` has no ``"root"`` entry.
    """
    parts = path_str.split("/")
    if parts[0] != "root":
        return None

    node = taxonomy["root"]
    for part in parts[1:]:
        if not isinstance(node, dict) or part not in node:
            return None
        node = node[part]
    return node if isinstance(node, dict) else None

# -------------------------------
# Load data
# -------------------------------
print("[LOAD] Loading taxonomy and job data...")

taxonomy = load_json(TAXONOMY_FILE)
taxonomy_info = load_json(TAXONOMY_INFO_FILE)
task_mapping = taxonomy_info.get("task_mapping", {})
# Job records grouped by the taxonomy path they belong to
jobs_by_path = defaultdict(list)

print("[LOAD] Reading job skills CSV...")
with open(JOB_SKILLS_FILE, "r", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        job_record = {
            "job_task_id": row["TaskID"],
            "task": row["Task"],
            "occupation": row["Occupation (O*NET-SOC Title)"],
            "automation_desire": row["Automation Desire Rating"],
            "job_security": row["Job Security Rating"],
            "enjoyment": row["Enjoyment Rating"],
        }
        generic_skill = row["generic_skill"]

        # generic_skill is expected as "Description (Path)"; rows without a
        # parenthesised part carry no path and are skipped.
        if "(" not in generic_skill or ")" not in generic_skill:
            continue

        # Text between the last "(" and the next ")" is the path, with ">"
        # separating the levels.
        path_part = generic_skill.split("(")[-1].split(")")[0]
        levels = [piece.strip() for piece in path_part.split(">")]
        candidate_path = "root/" + "/".join(levels)

        # The extracted path may already start with "root/"; drop the
        # duplicate prefix that the line above then produced.
        if candidate_path.startswith("root/root/"):
            candidate_path = candidate_path[5:]

        # Apply the restructuring map when it has an entry for this path.
        final_path = task_mapping.get(candidate_path.strip(), candidate_path)

        jobs_by_path[final_path].append(job_record)

print(f"[LOAD] Found {len(jobs_by_path)} unique paths with jobs")
print(f"[LOAD] Total job tasks: {sum(len(jobs) for jobs in jobs_by_path.values())}")

# -------------------------------
# Add jobs to taxonomy
# -------------------------------
print("\n[PROCESS] Adding jobs to taxonomy nodes...")

jobs_added = 0
paths_with_jobs = 0
paths_not_found = []

for path, jobs in jobs_by_path.items():
    node = get_node(taxonomy, path)
    # A missing path, or one that resolves to a non-dict value, cannot hold jobs.
    if not isinstance(node, dict):
        paths_not_found.append(path)
        print(f"⚠️ Path not found in taxonomy: {path}")
        continue

    node_jobs = node.setdefault("jobs", [])

    # Build the set of known ids once per node (instead of once per job) and
    # keep it in sync while appending, so duplicates are still rejected.
    existing_ids = {j["job_task_id"] for j in node_jobs if isinstance(j, dict)}
    for job in jobs:
        if job["job_task_id"] not in existing_ids:
            node_jobs.append(job)
            existing_ids.add(job["job_task_id"])
            jobs_added += 1

    paths_with_jobs += 1

print("\n[SUMMARY]")
print(f"  Paths with jobs added: {paths_with_jobs}")
print(f"  Total jobs added: {jobs_added}")
print(f"  Paths not found: {len(paths_not_found)}")

if paths_not_found:
    print("\n[WARNING] The following paths were not found:")
    for path in paths_not_found[:10]:  # Show first 10
        print(f"  - {path}")
    if len(paths_not_found) > 10:
        print(f"  ... and {len(paths_not_found) - 10} more")

# -------------------------------
# Save updated taxonomy
# -------------------------------
save_json(taxonomy, OUTPUT_FILE)
print(f"\n✅ Updated taxonomy saved to: {OUTPUT_FILE}")

# -------------------------------
# Generate statistics
# -------------------------------
def count_jobs_in_taxonomy(node):
    """Count dict-shaped job entries in ``node`` and all nested categories.

    Values stored under ``"tasks"`` and ``"jobs"`` are not descended into;
    non-dict nodes contribute zero.
    """
    if not isinstance(node, dict):
        return 0

    own_jobs = 0
    if "jobs" in node:
        own_jobs = sum(1 for entry in node["jobs"] if isinstance(entry, dict))

    nested_jobs = sum(
        count_jobs_in_taxonomy(child)
        for key, child in node.items()
        if key not in ("tasks", "jobs")
    )
    return own_jobs + nested_jobs

# Count the job records across the whole tree, starting at the root, and report the total.
total_jobs = count_jobs_in_taxonomy(taxonomy["root"])
print(f"\n[STATS] Total jobs in taxonomy: {total_jobs}")

In [None]:
# Check similarity between tasks and jobs on same taxonomy leaves
"""
Integrate new benchmark tasks into existing taxonomy with job similarities.
This will:
1. Add new benchmark tasks to taxonomy_with_similarity.json
2. Recalculate similarities only for affected leaves
"""

import os
import json
import tempfile
import shutil
from glob import glob
import openai

# -------------------------------
# Config
# -------------------------------
# Credentials / endpoint for the OpenAI-compatible API (blank here; must be set before running).
API_KEY = ""
BASE_URL = ""
MODEL_NAME = "gpt-5"

TAXONOMY_FILE = "taxonomy_with_similarity.json"  # Start from existing file with similarities
OUTPUT_FILE = "taxonomy_with_similarity_updated.json"
# Holds the "processed_leaves" list consulted when similarities are recalculated.
CHECKPOINT_FILE = "integration_progress.json"

# Number of benchmark tasks compared against a leaf's jobs per model request.
TASK_BATCH_SIZE = 10

client = openai.OpenAI(api_key=API_KEY, base_url=BASE_URL)

# -------------------------------
# Helper functions
# -------------------------------
def atomic_write_json(data, path):
    """Write ``data`` as indented JSON to ``path`` via a temp file + rename.

    The temp file lives in the destination directory so the final
    ``os.replace`` is a same-filesystem rename; a temp file in the system
    temp directory may be on another filesystem, where a move becomes
    copy + delete and can expose a partially written target.

    Args:
        data: JSON-serialisable object.
        path: Destination path; replaced only after the dump succeeded.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable (target left untouched).
        OSError: If the temp file cannot be created or renamed.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    fd, tmp = tempfile.mkstemp(dir=target_dir, suffix=".json.tmp")
    try:
        with os.fdopen(fd, "w") as tf:
            json.dump(data, tf, indent=2)
        os.replace(tmp, path)
    finally:
        # Still present only when the dump or the rename failed.
        if os.path.exists(tmp):
            os.remove(tmp)

def load_json(path):
    """Read and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 (the JSON interchange encoding) rather than
    the platform default, matching how the benchmark task files are read in
    this cell.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_node(taxonomy, path_list):
    """Follow ``path_list`` key by key starting at ``taxonomy`` itself.

    Returns:
        The value reached after the last key, ``taxonomy`` unchanged for an
        empty path, or ``None`` as soon as a key is not contained in the
        current node.
    """
    current = taxonomy
    for key in path_list:
        if key in current:
            current = current[key]
        else:
            return None
    return current

# -------------------------------
# Step 1: Merge new benchmark tasks
# -------------------------------
def merge_new_benchmarks(taxonomy):
    """Copy mapped benchmark tasks from per-benchmark mapping files into ``taxonomy``.

    For every ``<name>_tasks.txt`` in the working directory that has a
    matching ``<name>_tasks_mapping_re.json``, the mapping's taxonomy is
    walked; wherever a node lists task ids known from the task file, task
    objects are appended to the node at the same path in ``taxonomy``.
    Existing non-dict entries in the master node's ``tasks`` list are
    discarded, and tasks already present (by ``task_id``) are not duplicated.

    Args:
        taxonomy: Master taxonomy dict (with a top-level ``"root"``); modified
            in place.

    Returns:
        Set of path tuples (starting with ``"root"``) that received at least
        one new task object.
    """
    print("\n" + "="*60)
    print("STEP 1: MERGING NEW BENCHMARK TASKS")
    print("="*60)

    affected_leaves = set()

    suffix = "_tasks.txt"
    task_files = sorted(glob("*" + suffix))
    print(f"\nFound {len(task_files)} benchmark file(s)")

    for task_file in task_files:
        # Strip only the trailing suffix to obtain the benchmark name.
        base = task_file[:-len(suffix)]
        mapping_file = f"{base}_tasks_mapping_re.json"

        if not os.path.exists(mapping_file):
            print(f"Skipping {base}: no mapping file")
            continue

        benchmark = base
        print(f"\nProcessing: {benchmark}")

        # Load tasks: one "<task_id>: <instruction>" entry per line
        tasks = {}
        with open(task_file, "r", encoding="utf-8") as f:
            for line in f:
                if ":" in line:
                    tid, inst = line.split(":", 1)
                    tasks[tid.strip()] = inst.strip()

        print(f"  Loaded {len(tasks)} tasks")

        # Load mapping
        mapping = load_json(mapping_file)
        mapping_tax = mapping.get("taxonomy", {}).get("root", {})

        # Process mapping
        added = 0

        def traverse(node, path):
            nonlocal added
            if not isinstance(node, dict):
                return

            if "tasks" in node:
                task_ids = [t for t in node["tasks"] if isinstance(t, str) and t in tasks]
                if task_ids:
                    master = get_node(taxonomy, path)
                    # Test for a dict explicitly: an existing but empty leaf
                    # ({}) is falsy and would otherwise never receive tasks.
                    if isinstance(master, dict):
                        # Initialize
                        if "tasks" not in master:
                            master["tasks"] = []

                        # Clean string IDs: only task objects are kept
                        master["tasks"] = [t for t in master["tasks"] if isinstance(t, dict)]

                        # Get existing IDs
                        existing = {t.get("task_id") for t in master["tasks"]}

                        # Add new tasks
                        for tid in task_ids:
                            if tid not in existing:
                                master["tasks"].append({
                                    "benchmark": benchmark,
                                    "task_id": tid,
                                    "instruction": tasks[tid],
                                    "job_similarities": []  # Will be filled later
                                })
                                existing.add(tid)
                                added += 1
                                affected_leaves.add(tuple(path))

            for k, v in node.items():
                if k != "tasks":
                    traverse(v, path + [k])

        traverse(mapping_tax, ["root"])
        print(f"  Added {added} new task objects")

    print(f"\n✓ Total affected leaves: {len(affected_leaves)}")
    return affected_leaves

# -------------------------------
# Step 2: Recalculate similarities for affected leaves
# -------------------------------
def _parse_similarity_section(sim_section):
    """Parse the body of ``<similarities>`` into ``{benchmark_task_id: [entries]}``.

    An entry starts on a line that contains ``:`` and ends with ``[``; the
    following lines are concatenated until a line starting with ``]``.
    Entries that are not valid JSON are reported and skipped instead of being
    dropped silently.
    """
    similarity_map = {}

    def store(task_id, json_text):
        try:
            similarity_map[task_id] = json.loads(json_text)
        except ValueError as exc:
            print(f"⚠️ Could not parse similarities for task {task_id}: {exc}")

    def close_unterminated(json_text):
        # The array was never closed: drop a dangling comma and close it.
        return json_text if json_text.endswith("]") else json_text.rstrip(",") + "]"

    current_task_id = None
    current_json = ""

    for raw_line in sim_section.splitlines():
        line = raw_line.strip()

        if ":" in line and line.endswith("["):
            # A new entry begins; salvage the previous one if it was left open.
            if current_task_id and current_json:
                store(current_task_id, close_unterminated(current_json))
            # Ids are compared against raw task ids later, so drop any quotes.
            current_task_id = line.split(":")[0].strip().strip("\"'")
            current_json = "["
        elif line.startswith("]"):
            if current_task_id:
                store(current_task_id, current_json + "]")
            current_task_id = None
            current_json = ""
        elif current_task_id:
            current_json += line

    if current_task_id and current_json:
        store(current_task_id, close_unterminated(current_json))

    return similarity_map


def calculate_similarity_batch(benchmark_tasks, job_tasks, skill_path):
    """Ask the LLM to score each benchmark task against each job task (0-10).

    Args:
        benchmark_tasks: List of dicts with ``task_id`` and ``instruction``.
        job_tasks: List of dicts with ``job_task_id``, ``task`` and ``occupation``.
        skill_path: Taxonomy path string naming the skill category in the prompt.

    Returns:
        Dict mapping benchmark task ids (as written by the model) to the
        parsed JSON list of ``{"job_task_id", "score", "reasoning"}`` objects.
        Empty if the response has no ``<similarities>`` section.
    """
    tasks_str = "\n".join([
        f"{i+1}. [ID: {t['task_id']}] {t['instruction']}"
        for i, t in enumerate(benchmark_tasks)
    ])

    jobs_str = "\n".join([
        f"{i+1}. [ID: {j['job_task_id']}] {j['task']} (Occupation: {j['occupation']})"
        for i, j in enumerate(job_tasks)
    ])

    prompt = f"""You are an expert at comparing and rating task similarity in occupational contexts.

You are analyzing tasks within the skill category: {skill_path}

Here are BENCHMARK TASKS (from automated task benchmarks):
{tasks_str}

Here are JOB TASKS (from real occupations):
{jobs_str}

For each BENCHMARK TASK, evaluate its similarity to each JOB TASK on a scale of 0-10:
- 0: Completely unrelated
- 1-3: Minimal relation (shares very broad concepts only)
- 4-6: Moderate relation (overlapping skills but different contexts/goals)
- 7-9: Strong relation (similar skills and contexts, different specifics)
- 10: Nearly identical (same skills, same context, same goal)

Consider:
- What specific skills are required?
- How well is the benchmark task representative of the skills required for the job?
- How close in terms of complexity is the benchmark task to the job task?
- What is the context/domain?
- What is the end goal/outcome?
- How transferable are the skills?

Output format:
<similarities>
benchmark_task_id: [
  {{"job_task_id": "X", "score": Y, "reasoning": "brief explanation"}},
  ...
]
</similarities>
"""

    # NOTE(review): some model endpoints reject a non-default temperature -
    # confirm that MODEL_NAME behind BASE_URL accepts 0.3.
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are an expert in occupational task analysis and skill matching."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3
    )

    # message.content can be None; normalise so the tag checks cannot crash.
    content = (response.choices[0].message.content or "") if response.choices else ""

    if "<similarities>" not in content or "</similarities>" not in content:
        return {}

    sim_section = content.split("<similarities>")[1].split("</similarities>")[0].strip()
    return _parse_similarity_section(sim_section)

def recalculate_similarities(taxonomy, affected_leaves):
    """Fill in missing ``job_similarities`` for tasks in the affected leaves.

    For every leaf path in *affected_leaves* that is not already listed in the
    checkpoint file, tasks without a ``job_similarities`` entry are sent in
    batches of ``TASK_BATCH_SIZE`` to ``calculate_similarity_batch`` together
    with the leaf's jobs. Results are written onto the task dicts in place.

    Args:
        taxonomy: Taxonomy dict; task dicts inside it are mutated in place.
        affected_leaves: Iterable of leaf paths, each a sequence of path parts.

    A checkpoint (processed leaf names plus the current taxonomy) is written
    after each leaf that needed work. A leaf is only recorded as processed
    when all of its batches succeeded, so a failed leaf is retried on the
    next run instead of being skipped forever.
    """
    print("\n" + "="*60)
    print("STEP 2: RECALCULATING SIMILARITIES")
    print("="*60)

    # Resume support: leaves already finished by an earlier run.
    # NOTE(review): the checkpoint also stores the taxonomy, but it is not
    # restored here; confirm the caller reloads results for processed leaves.
    processed = set()
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint = load_json(CHECKPOINT_FILE)
        processed = set(checkpoint.get("processed_leaves", []))

    leaves_to_process = [
        leaf for leaf in affected_leaves
        if "/".join(leaf) not in processed
    ]

    print(f"\nTotal affected leaves: {len(affected_leaves)}")
    print(f"Already processed: {len(processed)}")
    print(f"Remaining: {len(leaves_to_process)}")

    total_comparisons = 0
    failed_leaves = []

    for idx, leaf_path in enumerate(leaves_to_process):
        leaf_str = "/".join(leaf_path)
        print(f"\n[{idx+1}/{len(leaves_to_process)}] {leaf_str}")

        node = get_node(taxonomy, list(leaf_path))
        if not node:
            continue

        tasks = [t for t in node.get("tasks", []) if isinstance(t, dict)]
        jobs = [j for j in node.get("jobs", []) if isinstance(j, dict)]

        # Only tasks that have no (or empty) similarity list need an API call.
        tasks_needing_sim = [t for t in tasks if not t.get("job_similarities")]

        print(f"  Tasks: {len(tasks)}, Jobs: {len(jobs)}, Need similarity: {len(tasks_needing_sim)}")

        if not tasks_needing_sim or not jobs:
            processed.add(leaf_str)
            continue

        leaf_ok = True
        for batch_start in range(0, len(tasks_needing_sim), TASK_BATCH_SIZE):
            batch_end = min(batch_start + TASK_BATCH_SIZE, len(tasks_needing_sim))
            task_batch = tasks_needing_sim[batch_start:batch_end]

            print(f"  Processing tasks {batch_start+1}-{batch_end}...")

            try:
                similarities = calculate_similarity_batch(task_batch, jobs, leaf_str)
            except Exception as e:
                # Best effort: keep going with the next batch, but remember
                # that this leaf is incomplete so it is not checkpointed as done.
                print(f"  ⚠️ Error: {e}")
                leaf_ok = False
                continue

            for task in task_batch:
                # Tasks missing from the response get an empty list.
                task["job_similarities"] = similarities.get(task["task_id"], [])

            total_comparisons += len(task_batch) * len(jobs)

        if leaf_ok:
            processed.add(leaf_str)
        else:
            failed_leaves.append(leaf_str)

        # Save checkpoint (also after a partial failure, to keep what succeeded).
        atomic_write_json({
            "processed_leaves": list(processed),
            "taxonomy": taxonomy
        }, CHECKPOINT_FILE)

    if failed_leaves:
        print(f"\n⚠️ {len(failed_leaves)} leaves had failed batches and will be retried on the next run")

    print(f"\n✓ Completed {total_comparisons} comparisons")

# -------------------------------
# Main
# -------------------------------
def main():
    """Merge new benchmark tasks into the taxonomy and refresh similarities.

    Loads ``TAXONOMY_FILE``, merges new benchmarks into it, recalculates
    similarities for the leaves the merge reports as affected (if any), and
    writes the resulting taxonomy to ``OUTPUT_FILE``.
    """
    rule = "="*60

    print(rule)
    print("INTEGRATING NEW BENCHMARK TASKS")
    print(rule)

    # Existing taxonomy, already carrying similarities from earlier runs.
    print(f"\nLoading: {TAXONOMY_FILE}")
    taxonomy = load_json(TAXONOMY_FILE)

    # Step 1: merge; the return value names the leaves that need work.
    affected_leaves = merge_new_benchmarks(taxonomy)

    # Step 2: similarities are recomputed only where something changed.
    if not affected_leaves:
        print("\nNo new tasks added, nothing to recalculate")
    else:
        recalculate_similarities(taxonomy, affected_leaves)

    # Persist the final taxonomy.
    atomic_write_json(taxonomy, OUTPUT_FILE)

    print("\n" + rule)
    print("COMPLETE")
    print(f"Output: {OUTPUT_FILE}")
    print(rule)

# Run the benchmark-integration pipeline only when executed as a script,
# not when this code is imported.
if __name__ == "__main__":
    main()

In [2]:
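"""
Calculate job coverage metrics from a taxonomy annotated with task-job similarities.
Metrics:
1. Coverage: % of leaves with jobs that also have tasks
2. Coverage Quality: for each job, the number of tasks with similarity >= threshold
3. Benchmark Coverage: greedy set-cover ranking of benchmarks by how many new jobs each covers
"""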

import json
from collections import defaultdict

# -------------------------------
# Config
# -------------------------------
# Input: taxonomy JSON read by main(); it is expected to contain a "root" key.
TAXONOMY_FILE = "taxonomy_with_similarity_updated.json"
# Output: JSON report written by main().
OUTPUT_FILE = "job_coverage_metrics.json"
# Minimum similarity score (inclusive) for a task to count as a high-quality
# match for a job.
SIMILARITY_THRESHOLD = 7  # Configurable: 5, 6, 7, etc.

# -------------------------------
# Helper functions
# -------------------------------
def load_json(path):
    """Read the JSON document stored at *path* and return the parsed object.

    The file is decoded as UTF-8 explicitly, matching what ``save_json``
    writes (UTF-8 with non-ASCII characters left unescaped), instead of
    relying on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(data, path):
    """Serialize *data* as indented JSON into the file at *path*.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are emitted as-is rather than escaped.
    """
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(
            data,
            out_file,
            ensure_ascii=False,
            indent=2,
        )

def find_all_leaves(node, path=None):
    """Recursively collect every node that has a "tasks" or "jobs" key.

    Args:
        node: Current subtree. Anything that is not a dict yields no leaves.
        path: List of keys leading to *node*. Defaults to a fresh empty list
            on each call (a shared mutable default would leak into results).

    Returns:
        A list of ``{"path", "tasks", "jobs"}`` dicts in depth-first order,
        where missing "tasks"/"jobs" entries default to ``[]``.
    """
    if path is None:
        path = []

    leaves = []
    if not isinstance(node, dict):
        return leaves

    if "tasks" in node or "jobs" in node:
        leaves.append({
            "path": path,
            "tasks": node.get("tasks", []),
            "jobs": node.get("jobs", [])
        })

    # Keep descending through every other key; "tasks"/"jobs" hold payload,
    # not child categories.
    for key, child in node.items():
        if key not in ("tasks", "jobs"):
            leaves.extend(find_all_leaves(child, path + [key]))

    return leaves

# -------------------------------
# Main analysis
# -------------------------------
def _dict_entries(items):
    """Return the entries of *items* that are dicts, preserving order."""
    return [entry for entry in items if isinstance(entry, dict)]


def _print_section(title):
    """Print a blank line followed by *title* framed by 80-character rules."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(title)
    print(rule)


def _collect_job_coverage(leaves, threshold):
    """Build one coverage record per job, listing its tasks scoring >= threshold."""
    details = []
    for leaf in leaves:
        path_str = "/".join(leaf["path"])
        jobs = _dict_entries(leaf["jobs"])
        tasks = _dict_entries(leaf["tasks"])

        for job in jobs:
            job_id = job["job_task_id"]
            high_sim_tasks = [
                {
                    "task_id": task["task_id"],
                    "benchmark": task["benchmark"],
                    "score": sim["score"],
                    "reasoning": sim.get("reasoning", ""),
                }
                for task in tasks
                for sim in task.get("job_similarities", [])
                if sim.get("job_task_id") == job_id
                and sim.get("score", 0) >= threshold
            ]
            details.append({
                "job_task_id": job_id,
                "occupation": job["occupation"],
                "job_task": job["task"],
                "skill_path": path_str,
                "num_high_quality_tasks": len(high_sim_tasks),
                "high_quality_tasks": high_sim_tasks,
            })
    return details


def _build_benchmark_job_sets(leaves, threshold):
    """Map each benchmark name to the set of leaf job ids it covers at >= threshold."""
    benchmark_to_jobs = defaultdict(set)
    for leaf in leaves:
        # Only ids of jobs that actually sit in this leaf may be counted;
        # similarity records with a missing or unknown id would otherwise
        # inflate coverage beyond the jobs that were analysed.
        known_job_ids = {job["job_task_id"] for job in _dict_entries(leaf["jobs"])}
        for task in _dict_entries(leaf["tasks"]):
            benchmark = task.get("benchmark", "unknown")
            for sim in task.get("job_similarities", []):
                job_id = sim.get("job_task_id")
                if job_id in known_job_ids and sim.get("score", 0) >= threshold:
                    benchmark_to_jobs[benchmark].add(job_id)
    return benchmark_to_jobs


def _greedy_benchmark_ranking(benchmark_to_jobs):
    """Rank benchmarks greedily by how many not-yet-covered jobs each adds."""
    covered_jobs = set()
    ranking = []
    remaining = dict(benchmark_to_jobs)

    while remaining:
        best_benchmark = None
        best_new_coverage = set()
        for bench, jobs_covered in remaining.items():
            new_jobs = jobs_covered - covered_jobs
            if len(new_jobs) > len(best_new_coverage):
                best_benchmark = bench
                best_new_coverage = new_jobs

        # Stop once no remaining benchmark adds a new job. Testing the
        # coverage set (not the name) keeps falsy benchmark names usable.
        if not best_new_coverage:
            break

        ranking.append({
            "rank": len(ranking) + 1,
            "benchmark": best_benchmark,
            "new_jobs_covered": len(best_new_coverage),
            "total_jobs_covered": len(remaining[best_benchmark]),
            "cumulative_coverage": len(covered_jobs) + len(best_new_coverage),
        })
        covered_jobs.update(best_new_coverage)
        del remaining[best_benchmark]

    return ranking


def main():
    """Compute and report job coverage metrics for the taxonomy.

    Loads ``TAXONOMY_FILE``, then prints and saves to ``OUTPUT_FILE``:
      1. Coverage: share of leaves with jobs that also have tasks.
      2. Coverage quality: per job, the tasks whose similarity score is at
         least ``SIMILARITY_THRESHOLD``, plus the distribution of those counts.
      3. Benchmark coverage: a greedy set-cover ranking of benchmarks and the
         shortest prefix of that ranking reaching 80% of the analysed jobs.
    """
    import math  # local import: only needed for the target-count ceiling

    print("=" * 80)
    print("JOB COVERAGE ANALYSIS")
    print("=" * 80)
    print(f"\nSimilarity threshold: {SIMILARITY_THRESHOLD}")

    # Load taxonomy
    print(f"Loading: {TAXONOMY_FILE}")
    taxonomy = load_json(TAXONOMY_FILE)

    # Find all leaves
    all_leaves = find_all_leaves(taxonomy["root"], ["root"])
    print(f"\nTotal leaves: {len(all_leaves)}")

    # Leaves that hold at least one job record
    leaves_with_jobs = [leaf for leaf in all_leaves if _dict_entries(leaf["jobs"])]
    print(f"Leaves with jobs: {len(leaves_with_jobs)}")

    # Metric 1: of those, leaves that also hold at least one task record
    leaves_with_both = [leaf for leaf in leaves_with_jobs if _dict_entries(leaf["tasks"])]
    coverage = len(leaves_with_both) / len(leaves_with_jobs) if leaves_with_jobs else 0

    _print_section("METRIC 1: COVERAGE")
    print(f"Leaves with jobs that also have tasks: {len(leaves_with_both)}/{len(leaves_with_jobs)}")
    print(f"Coverage: {coverage:.2%}")

    # Metric 2: job-level analysis
    _print_section("METRIC 2: COVERAGE QUALITY")

    job_coverage_details = _collect_job_coverage(leaves_with_both, SIMILARITY_THRESHOLD)
    total_jobs = len(job_coverage_details)
    jobs_with_high_quality_tasks = sum(
        1 for jc in job_coverage_details if jc["num_high_quality_tasks"]
    )
    quality_coverage = jobs_with_high_quality_tasks / total_jobs if total_jobs else 0

    print(f"Total jobs analyzed: {total_jobs}")
    print(f"Jobs with high-quality tasks (>={SIMILARITY_THRESHOLD}): {jobs_with_high_quality_tasks}")
    print(f"Quality coverage: {quality_coverage:.2%}")

    # Distribution: number of jobs per high-quality-task count
    task_count_dist = defaultdict(int)
    for jc in job_coverage_details:
        task_count_dist[jc["num_high_quality_tasks"]] += 1

    _print_section("DISTRIBUTION: Tasks per Job")
    for count in sorted(task_count_dist):
        pct = task_count_dist[count] / total_jobs * 100 if total_jobs else 0
        print(f"{count} tasks: {task_count_dist[count]} jobs ({pct:.1f}%)")

    # Jobs with no coverage
    jobs_no_coverage = [jc for jc in job_coverage_details if jc["num_high_quality_tasks"] == 0]

    if jobs_no_coverage:
        _print_section(f"JOBS WITH NO HIGH-QUALITY TASKS: {len(jobs_no_coverage)}")
        for jc in jobs_no_coverage[:10]:  # Show first 10
            print(f"\n[{jc['job_task_id']}] {jc['occupation']}")
            print(f"  Path: {jc['skill_path']}")
            print(f"  Task: {jc['job_task'][:100]}...")
        if len(jobs_no_coverage) > 10:
            print(f"\n... and {len(jobs_no_coverage) - 10} more")

    # Jobs with best coverage
    jobs_best_coverage = sorted(
        job_coverage_details,
        key=lambda x: x["num_high_quality_tasks"],
        reverse=True,
    )[:10]

    _print_section("JOBS WITH BEST COVERAGE (Top 10)")
    for jc in jobs_best_coverage:
        print(f"\n[{jc['job_task_id']}] {jc['occupation']} ({jc['num_high_quality_tasks']} tasks)")
        print(f"  Path: {jc['skill_path']}")
        print(f"  Task: {jc['job_task'][:100]}...")

    # Metric 3: greedy set cover over benchmarks
    _print_section("METRIC 3: BENCHMARK COVERAGE (Greedy Set Cover)")

    benchmark_to_jobs = _build_benchmark_job_sets(leaves_with_both, SIMILARITY_THRESHOLD)
    benchmark_ranking = _greedy_benchmark_ranking(benchmark_to_jobs)

    print("\nGreedy Benchmark Selection (by marginal job coverage):\n")
    print(f"{'Rank':<6} {'Benchmark':<25} {'New Jobs':<12} {'Total Jobs':<12} {'Cumulative':<12} {'Coverage %'}")
    print("-" * 85)

    for br in benchmark_ranking:
        coverage_pct = br['cumulative_coverage'] / total_jobs * 100 if total_jobs else 0
        # str() keeps the row printable even if a benchmark name is not a string.
        print(f"{br['rank']:<6} {str(br['benchmark']):<25} {br['new_jobs_covered']:<12} {br['total_jobs_covered']:<12} {br['cumulative_coverage']:<12} {coverage_pct:.1f}%")

    _print_section("RECOMMENDED BENCHMARK SELECTION")

    # Shortest ranking prefix that reaches the target share of jobs.
    target_coverage = 0.80
    # Round up so the target is a true ">= 80%" count (flooring could stop
    # just short of it); the epsilon absorbs floating-point noise.
    target_jobs = math.ceil(target_coverage * total_jobs - 1e-9)

    recommended_entries = []
    cumulative = 0
    for br in benchmark_ranking:
        recommended_entries.append(br)
        cumulative = br['cumulative_coverage']
        if cumulative >= target_jobs:
            break
    recommended = [br['benchmark'] for br in recommended_entries]

    print(f"\nTo cover {target_coverage:.0%} of jobs ({target_jobs}/{total_jobs}), use these benchmarks in order:")
    for i, br in enumerate(recommended_entries, 1):
        print(f"{i}. {br['benchmark']} (+{br['new_jobs_covered']} new jobs)")

    final_coverage = cumulative / total_jobs if total_jobs else 0
    print(f"\nFinal coverage: {cumulative}/{total_jobs} jobs ({final_coverage:.1%})")

    # Save results
    output = {
        "config": {
            "similarity_threshold": SIMILARITY_THRESHOLD
        },
        "summary": {
            "total_leaves": len(all_leaves),
            "leaves_with_jobs": len(leaves_with_jobs),
            "leaves_with_both": len(leaves_with_both),
            "coverage": coverage,
            "total_jobs": total_jobs,
            "jobs_with_high_quality_tasks": jobs_with_high_quality_tasks,
            "quality_coverage": quality_coverage
        },
        "distribution": dict(task_count_dist),
        "benchmark_ranking": benchmark_ranking,
        "recommended_benchmarks": recommended,
        "job_details": job_coverage_details
    }

    save_json(output, OUTPUT_FILE)

    _print_section(f"Results saved to: {OUTPUT_FILE}")

# Run the coverage analysis only when executed as a script, not on import.
if __name__ == "__main__":
    main()

JOB COVERAGE ANALYSIS

Similarity threshold: 7
Loading: taxonomy_with_similarity_updated.json

Total leaves: 322
Leaves with jobs: 275

METRIC 1: COVERAGE
Leaves with jobs that also have tasks: 187/275
Coverage: 68.00%

METRIC 2: COVERAGE QUALITY
Total jobs analyzed: 673
Jobs with high-quality tasks (>=7): 444
Quality coverage: 65.97%

DISTRIBUTION: Tasks per Job
0 tasks: 229 jobs (34.0%)
1 tasks: 128 jobs (19.0%)
2 tasks: 80 jobs (11.9%)
3 tasks: 46 jobs (6.8%)
4 tasks: 33 jobs (4.9%)
5 tasks: 24 jobs (3.6%)
6 tasks: 11 jobs (1.6%)
7 tasks: 17 jobs (2.5%)
8 tasks: 15 jobs (2.2%)
9 tasks: 8 jobs (1.2%)
10 tasks: 4 jobs (0.6%)
11 tasks: 4 jobs (0.6%)
12 tasks: 5 jobs (0.7%)
13 tasks: 7 jobs (1.0%)
14 tasks: 4 jobs (0.6%)
15 tasks: 4 jobs (0.6%)
16 tasks: 1 jobs (0.1%)
17 tasks: 2 jobs (0.3%)
18 tasks: 2 jobs (0.3%)
20 tasks: 7 jobs (1.0%)
21 tasks: 2 jobs (0.3%)
22 tasks: 1 jobs (0.1%)
23 tasks: 2 jobs (0.3%)
24 tasks: 1 jobs (0.1%)
25 tasks: 1 jobs (0.1%)
26 tasks: 1 jobs (0.1%)
27 tas

In [2]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# For embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score


class TaxonomyAnalyzer:
    """Analyze taxonomy quality using embeddings and statistical metrics."""
    
    def __init__(self, taxonomy_path: str, jobs_path: str = None):
        """Initialize analyzer with taxonomy and job data."""
        with open(taxonomy_path, 'r') as f:
            self.taxonomy = json.load(f)
        
        self.jobs_df = None
        self.embeddings = None
        self.embedding_model = None
        
        if jobs_path:
            self.jobs_df = pd.read_csv(jobs_path)
            print("Loading sentence embedding model (this may take a moment)...")
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self._prepare_embeddings()
    
    def _prepare_embeddings(self):
        """Filter the jobs table to leaf-assigned tasks and embed their text.

        Steps, each of which prints a warning and returns early (leaving
        ``self.embeddings`` and ``self.jobs_df`` untouched) when nothing is
        left to work on:
          1. Require the 'Task' and 'generic_skill' columns.
          2. Keep rows where both columns are non-null.
          3. Derive 'taxonomy_path' from 'generic_skill' and drop rows where
             no path could be extracted.
          4. Keep only rows whose path has no descendant in the taxonomy.
          5. Encode the remaining 'Task' texts with ``self.embedding_model``.

        On success ``self.embeddings`` holds the encoder output and
        ``self.jobs_df`` is replaced by the filtered frame, which gains the
        'taxonomy_path', 'is_leaf' and 'embedding_idx' columns.
        """
        if 'Task' not in self.jobs_df.columns or 'generic_skill' not in self.jobs_df.columns:
            print(f"Warning: Required columns not found. Available columns: {self.jobs_df.columns.tolist()}")
            return
        
        # Filter valid rows (work on a copy so new columns don't touch the original frame)
        valid_rows = self.jobs_df[
            self.jobs_df['Task'].notna() & 
            self.jobs_df['generic_skill'].notna()
        ].copy()
        
        if len(valid_rows) == 0:
            print("Warning: No valid task-skill pairs found")
            return
        
        # Extract taxonomy paths; rows whose skill text yields no path are dropped
        valid_rows['taxonomy_path'] = valid_rows['generic_skill'].apply(self._extract_taxonomy_path)
        valid_rows = valid_rows[valid_rows['taxonomy_path'].notna()]
        
        if len(valid_rows) == 0:
            print("Warning: No valid taxonomy paths extracted")
            return
        
        # Get all paths to determine leaf nodes (computed once, reused for every row)
        all_paths = self.get_all_paths()
        
        # Flag rows whose path has no descendant among the taxonomy paths
        valid_rows['is_leaf'] = valid_rows['taxonomy_path'].apply(
            lambda p: self._is_leaf_from_set(p, all_paths)
        )
        
        non_leaf_count = (~valid_rows['is_leaf']).sum()
        if non_leaf_count > 0:
            print(f"⚠️  Warning: {non_leaf_count} tasks assigned to non-leaf nodes (will be excluded)")
            # Show up to five distinct offending paths as examples
            non_leaf_examples = valid_rows[~valid_rows['is_leaf']]['taxonomy_path'].unique()[:5]
            for ex in non_leaf_examples:
                print(f"    Non-leaf: {ex}")
        
        # Keep only leaf node assignments
        valid_rows = valid_rows[valid_rows['is_leaf']].copy()
        
        if len(valid_rows) == 0:
            print("Warning: No tasks assigned to leaf nodes")
            return
        
        # Generate embeddings, one per remaining task text, in row order
        print(f"Generating embeddings for {len(valid_rows)} tasks (leaf nodes only)...")
        tasks = valid_rows['Task'].tolist()
        self.embeddings = self.embedding_model.encode(tasks, show_progress_bar=True)
        
        # Store with taxonomy mapping: embedding_idx is the row's position in self.embeddings
        valid_rows['embedding_idx'] = range(len(valid_rows))
        self.jobs_df = valid_rows
        
        print(f"✓ Embeddings ready for {len(valid_rows)} tasks in leaf nodes")
    
    def _is_leaf_from_set(self, path: str, all_paths: List[str]) -> bool:
        """Check if a path is a leaf node given a set of all paths."""
        return not any(p.startswith(path + ' > ') for p in all_paths if p != path)
    
    def _extract_taxonomy_path(self, skill_text: str) -> str:
        """Extract taxonomy path from generic skill column."""
        if pd.isna(skill_text) or '(' not in skill_text or ')' not in skill_text:
            return None
        try:
            path = skill_text.split('(')[1].split(')')[0].strip()
            # Normalize path: remove leading/trailing spaces, normalize separators
            path = ' > '.join([p.strip() for p in path.split('>')])
            return path
        except:
            return None
    
    def get_all_paths(self, node: Dict = None, current_path: str = "") -> List[str]:
        """
        Recursively get all paths in taxonomy.
        Returns paths WITHOUT 'root' prefix to match CSV format.
        """
        if node is None:
            node = self.taxonomy
            # Start recursion - handle root node
            if 'root' in node:
                return self.get_all_paths(node['root'], "")
            else:
                return self.get_all_paths(node, "")
        
        paths = []
        if current_path:  # Don't include empty string as a path
            paths.append(current_path)
        
        if isinstance(node, dict):
            for key, value in node.items():
                if key != "root":
                    new_path = f"{current_path} > {key}" if current_path else key
                    paths.extend(self.get_all_paths(value, new_path))
        
        return paths if paths else [""]  # Return at least empty for root
    
    def _is_leaf(self, path: str) -> bool:
        """Check if a path is a leaf node."""
        all_paths = self.get_all_paths()
        # A leaf has no children (no paths that start with "path > ")
        if not path:  # Empty path (root)
            return False
        return not any(p.startswith(path + ' > ') for p in all_paths if p != path)
    
    # ========== STRUCTURAL METRICS ==========
    
    def calculate_depth_metrics(self) -> Dict:
        """Calculate depth-related metrics."""
        all_paths = self.get_all_paths()
        depths = [path.count(' > ') for path in all_paths]
        
        return {
            'average_depth': np.mean(depths),
            'max_depth': max(depths),
            'min_depth': min(depths),
            'std_depth': np.std(depths),
            'total_nodes': len(all_paths),
            'leaf_nodes': sum(1 for p in all_paths if self._is_leaf(p))
        }
    
    def calculate_branching_factor(self) -> Dict:
        """Calculate branching factor statistics."""
        def get_children_counts(node, path="root"):
            counts = []
            if isinstance(node, dict):
                for key, value in node.items():
                    if key != "root":
                        child_count = len(value) if isinstance(value, dict) else 0
                        if child_count > 0:
                            counts.append(child_count)
                        counts.extend(get_children_counts(value, f"{path}/{key}"))
                    else:
                        counts.extend(get_children_counts(value, path))
            return counts
        
        children_counts = get_children_counts(self.taxonomy)
        if not children_counts:
            return {'avg_branching_factor': 0, 'max_branching_factor': 0}
        
        return {
            'avg_branching_factor': np.mean(children_counts),
            'max_branching_factor': max(children_counts),
            'min_branching_factor': min(children_counts),
            'std_branching_factor': np.std(children_counts)
        }
    
    def calculate_balance_metric(self) -> float:
        """Calculate tree balance (coefficient of variation: lower is more balanced)."""
        all_paths = self.get_all_paths()
        depths = [path.count('/') for path in all_paths]
        return np.std(depths) / np.mean(depths) if np.mean(depths) > 0 else 0
    
    # ========== TASK DISTRIBUTION METRICS ==========
    
    def calculate_task_distribution_metrics(self) -> Dict:
        """Calculate how tasks are distributed across taxonomy."""
        if self.jobs_df is None or len(self.jobs_df) == 0:
            return {}
        
        task_counts = self.jobs_df['taxonomy_path'].value_counts()
        all_leaf_paths = [p for p in self.get_all_paths() if self._is_leaf(p)]
        
        # Coverage
        coverage = len(task_counts) / len(all_leaf_paths) if len(all_leaf_paths) > 0 else 0
        
        # Distribution evenness (Gini coefficient)
        counts = task_counts.values
        gini = self._gini_coefficient(counts)
        
        return {
            'total_tasks': len(self.jobs_df),
            'categories_with_tasks': len(task_counts),
            'total_leaf_nodes': len(all_leaf_paths),
            'coverage_rate': coverage,
            'avg_tasks_per_category': np.mean(counts),
            'max_tasks_per_category': max(counts),
            'min_tasks_per_category': min(counts),
            'std_tasks_per_category': np.std(counts),
            'gini_coefficient': gini
        }
    
    def _gini_coefficient(self, values: np.ndarray) -> float:
        """Calculate Gini coefficient (0 = perfect equality, 1 = perfect inequality)."""
        sorted_values = sorted(values)
        n = len(sorted_values)
        cumsum = np.cumsum(sorted_values)
        return (2 * sum((i + 1) * val for i, val in enumerate(sorted_values))) / (n * sum(sorted_values)) - (n + 1) / n
    
    # ========== SEMANTIC COHERENCE METRICS ==========
    
    def calculate_semantic_coherence(self) -> Dict:
        """
        Calculate semantic coherence using embeddings.
        Measures if tasks in same category are similar, and different categories are distinct.
        """
        if self.embeddings is None or len(self.jobs_df) == 0:
            return {}
        
        # Group embeddings by category
        category_embeddings = defaultdict(list)
        category_indices = defaultdict(list)
        
        for idx, row in self.jobs_df.iterrows():
            path = row['taxonomy_path']
            emb_idx = row['embedding_idx']
            category_embeddings[path].append(self.embeddings[emb_idx])
            category_indices[path].append(emb_idx)
        
        # Filter categories with at least 2 tasks
        valid_categories = {k: v for k, v in category_embeddings.items() if len(v) >= 2}
        
        if len(valid_categories) < 2:
            return {'error': 'Need at least 2 categories with 2+ tasks each'}
        
        # Calculate intra-category coherence (within-category similarity)
        intra_coherence_scores = []
        for category, embeddings in valid_categories.items():
            embeddings_array = np.array(embeddings)
            sim_matrix = cosine_similarity(embeddings_array)
            # Average pairwise similarity (excluding diagonal)
            n = len(embeddings)
            intra_sim = (sim_matrix.sum() - n) / (n * (n - 1)) if n > 1 else 0
            intra_coherence_scores.append(intra_sim)
        
        # Calculate inter-category separation (between-category dissimilarity)
        category_centroids = {}
        for category, embeddings in valid_categories.items():
            category_centroids[category] = np.mean(embeddings, axis=0)
        
        inter_distances = []
        categories = list(category_centroids.keys())
        for i in range(len(categories)):
            for j in range(i + 1, len(categories)):
                sim = cosine_similarity([category_centroids[categories[i]]], 
                                       [category_centroids[categories[j]]])[0][0]
                inter_distances.append(1 - sim)  # Convert to distance
        
        # Clustering quality metrics
        all_embeddings = self.embeddings
        labels = []
        label_map = {cat: idx for idx, cat in enumerate(valid_categories.keys())}
        
        for _, row in self.jobs_df.iterrows():
            path = row['taxonomy_path']
            if path in label_map:
                labels.append(label_map[path])
        
        labels = np.array(labels)
        
        try:
            silhouette = silhouette_score(all_embeddings, labels, metric='cosine')
            davies_bouldin = davies_bouldin_score(all_embeddings, labels)
            calinski = calinski_harabasz_score(all_embeddings, labels)
        except:
            silhouette = davies_bouldin = calinski = None
        
        return {
            'intra_category_coherence': np.mean(intra_coherence_scores),
            'inter_category_separation': np.mean(inter_distances),
            'silhouette_score': silhouette,
            'davies_bouldin_index': davies_bouldin,
            'calinski_harabasz_score': calinski,
            'num_categories': len(valid_categories)
        }
    
    def calculate_hierarchical_consistency(self) -> Dict:
        """
        Check if parent-child relationships make semantic sense.
        Child categories should be semantically related to parent categories.
        """
        if self.embeddings is None or len(self.jobs_df) == 0:
            return {}
        
        # Group embeddings by category
        category_embeddings = defaultdict(list)
        for idx, row in self.jobs_df.iterrows():
            path = row['taxonomy_path']
            emb_idx = row['embedding_idx']
            category_embeddings[path].append(self.embeddings[emb_idx])
        
        # Find parent-child pairs
        parent_child_similarities = []
        
        for child_path in category_embeddings.keys():
            parts = child_path.split(' > ')
            if len(parts) > 1:
                parent_path = ' > '.join(parts[:-1])
                if parent_path in category_embeddings:
                    # Calculate centroid similarity
                    parent_centroid = np.mean(category_embeddings[parent_path], axis=0)
                    child_centroid = np.mean(category_embeddings[child_path], axis=0)
                    similarity = cosine_similarity([parent_centroid], [child_centroid])[0][0]
                    parent_child_similarities.append(similarity)
        
        if not parent_child_similarities:
            return {}
        
        return {
            'avg_parent_child_similarity': np.mean(parent_child_similarities),
            'std_parent_child_similarity': np.std(parent_child_similarities),
            'num_relationships': len(parent_child_similarities)
        }
    
    def calculate_sibling_coherence(self) -> Dict:
        """
        Measure if sibling categories (sharing same parent) are semantically similar.
        Higher similarity = better grouping under common parent.
        """
        if self.embeddings is None or len(self.jobs_df) == 0:
            return {}
        
        # Group embeddings by category
        category_embeddings = defaultdict(list)
        for idx, row in self.jobs_df.iterrows():
            path = row['taxonomy_path']
            emb_idx = row['embedding_idx']
            category_embeddings[path].append(self.embeddings[emb_idx])
        
        # Group categories by parent
        parent_to_children = defaultdict(list)
        for path in category_embeddings.keys():
            parts = path.split(' > ')
            if len(parts) > 1:
                parent_path = ' > '.join(parts[:-1])
                parent_to_children[parent_path].append(path)
        
        # Calculate sibling similarity
        sibling_similarities = []
        parents_with_multiple_children = 0
        
        for parent, children in parent_to_children.items():
            if len(children) >= 2:
                parents_with_multiple_children += 1
                # Calculate centroids for each child
                child_centroids = []
                for child in children:
                    if len(category_embeddings[child]) > 0:
                        centroid = np.mean(category_embeddings[child], axis=0)
                        child_centroids.append(centroid)
                
                # Calculate pairwise similarity between siblings
                if len(child_centroids) >= 2:
                    for i in range(len(child_centroids)):
                        for j in range(i + 1, len(child_centroids)):
                            sim = cosine_similarity([child_centroids[i]], [child_centroids[j]])[0][0]
                            sibling_similarities.append(sim)
        
        if not sibling_similarities:
            return {}
        
        return {
            'avg_sibling_similarity': np.mean(sibling_similarities),
            'std_sibling_similarity': np.std(sibling_similarities),
            'num_sibling_pairs': len(sibling_similarities),
            'parents_with_multiple_children': parents_with_multiple_children
        }
    
    def calculate_structural_complexity(self) -> Dict:
        """
        Summarize the shape of the taxonomy tree, ignoring task assignments.

        Returns:
            A dict with:
              - 'total_nodes': number of paths returned by get_all_paths()
              - 'leaf_nodes': number of paths for which _is_leaf() is true
              - 'intermediate_nodes': non-leaf paths other than 'root'
              - 'intermediate_to_leaf_ratio': intermediate / leaf count,
                or 0 when there are no leaves
              - 'depth_distribution': {depth: node count}, where depth is
                the number of ' > ' separators in the path
        """
        paths = self.get_all_paths()

        leaf_count = 0
        intermediate_count = 0
        nodes_per_depth = {}

        # One sweep: classify each path and tally its depth.
        for node_path in paths:
            if self._is_leaf(node_path):
                leaf_count += 1
            elif node_path != 'root':
                # A path literally named 'root' is not counted as intermediate.
                intermediate_count += 1

            level = node_path.count(' > ')
            nodes_per_depth.setdefault(level, 0)
            nodes_per_depth[level] += 1

        # Avoid dividing by zero for a taxonomy without leaves.
        if leaf_count > 0:
            ratio = intermediate_count / leaf_count
        else:
            ratio = 0

        return {
            'total_nodes': len(paths),
            'leaf_nodes': leaf_count,
            'intermediate_nodes': intermediate_count,
            'intermediate_to_leaf_ratio': ratio,
            'depth_distribution': nodes_per_depth
        }
    
    def calculate_intermediate_node_quality(self) -> Dict:
        """
        Measure quality of intermediate (non-leaf) nodes.

        Good intermediate nodes should group semantically similar children.
        For every non-leaf path, the embeddings of all tasks assigned to that
        path or to any path below it are pooled, and the node's cohesion is
        the mean pairwise cosine similarity of that pool (self-similarities
        excluded). Only nodes covering at least two assigned category paths
        are evaluated.

        Pools larger than 100 embeddings are subsampled without replacement
        using a fixed-seed generator, so repeated runs on the same data give
        the same numbers and small deltas between taxonomies are not noise.

        Returns:
            {} when embeddings or jobs are unavailable, or when no
            intermediate node has two or more assigned descendants;
            {'error': ...} when the taxonomy has no intermediate nodes;
            otherwise a dict of aggregate cohesion statistics.
        """
        if self.embeddings is None or self.jobs_df is None or len(self.jobs_df) == 0:
            return {}

        # Group task embeddings by the category path each task is assigned to.
        leaf_category_embeddings = defaultdict(list)
        for path, emb_idx in zip(self.jobs_df['taxonomy_path'],
                                 self.jobs_df['embedding_idx']):
            leaf_category_embeddings[path].append(self.embeddings[emb_idx])

        # Intermediate nodes: non-empty paths that are not leaves.
        intermediate_nodes = [
            path for path in self.get_all_paths()
            if path and not self._is_leaf(path)
        ]

        if not intermediate_nodes:
            return {'error': 'No intermediate nodes found'}

        print(f"  [Debug] Found {len(intermediate_nodes)} intermediate nodes")
        print(f"  [Debug] Sample intermediate: {intermediate_nodes[:3] if intermediate_nodes else 'none'}")

        assigned_paths = list(leaf_category_embeddings)

        def categories_under(prefix):
            """Assigned category paths equal to `prefix` or nested below it."""
            child_prefix = prefix + ' > '
            return [
                p for p in assigned_paths
                if p == prefix or p.startswith(child_prefix)
            ]

        max_sample = 100
        # Fixed seed keeps the subsample, and therefore the metric, reproducible.
        rng = np.random.default_rng(0)

        intermediate_quality_scores = []

        for intermediate_path in intermediate_nodes:
            # The CSV may store paths with or without a "root > " prefix:
            # try the bare path first, then fall back to the prefixed form.
            leaf_descendants = categories_under(intermediate_path)
            if len(leaf_descendants) < 2:
                leaf_descendants = categories_under(f"root > {intermediate_path}")
            if len(leaf_descendants) < 2:
                continue

            # Pool the embeddings of every task below this node.
            all_descendant_embeddings = []
            for leaf in leaf_descendants:
                all_descendant_embeddings.extend(leaf_category_embeddings[leaf])
            if len(all_descendant_embeddings) < 2:
                continue

            embeddings_array = np.array(all_descendant_embeddings)

            # Sample if too many (for efficiency).
            if len(embeddings_array) > max_sample:
                indices = rng.choice(len(embeddings_array), max_sample, replace=False)
                embeddings_array = embeddings_array[indices]

            sim_matrix = cosine_similarity(embeddings_array)
            n = len(embeddings_array)
            # Mean of the off-diagonal entries. Subtracting the actual trace
            # (rather than assuming every self-similarity is exactly 1) stays
            # correct for zero vectors, whose self-similarity is 0.
            cohesion = (sim_matrix.sum() - np.trace(sim_matrix)) / (n * (n - 1)) if n > 1 else 0

            intermediate_quality_scores.append({
                'node': intermediate_path.split(' > ')[-1],
                'full_path': intermediate_path,
                'depth': intermediate_path.count(' > '),
                'num_leaf_descendants': len(leaf_descendants),
                # Task count before any subsampling.
                'num_tasks': len(all_descendant_embeddings),
                'cohesion': cohesion
            })

        if not intermediate_quality_scores:
            print(f"  [Debug] No intermediate nodes with 2+ leaf descendants found")
            print(f"  [Debug] Leaf categories: {list(leaf_category_embeddings.keys())[:5]}")
            return {}

        print(f"  [Debug] Evaluated {len(intermediate_quality_scores)} intermediate nodes with descendants")

        cohesion_scores = [s['cohesion'] for s in intermediate_quality_scores]

        return {
            'avg_intermediate_node_cohesion': np.mean(cohesion_scores),
            'std_intermediate_node_cohesion': np.std(cohesion_scores),
            'num_intermediate_nodes_evaluated': len(intermediate_quality_scores),
            'avg_leaf_descendants': np.mean([s['num_leaf_descendants'] for s in intermediate_quality_scores]),
            'avg_tasks_under_intermediate': np.mean([s['num_tasks'] for s in intermediate_quality_scores]),
            'max_cohesion': max(cohesion_scores),
            'min_cohesion': min(cohesion_scores)
        }
    
    # ========== REPORTING ==========
    
    def generate_report(self) -> Dict:
        """
        Assemble every quality metric into one nested dict.

        The 'structural' section (depth, branching, balance, complexity) is
        always present. The task-based sections are added only when a
        non-empty jobs table is loaded.
        """
        structural = {}
        structural['depth'] = self.calculate_depth_metrics()
        structural['branching'] = self.calculate_branching_factor()
        structural['balance'] = self.calculate_balance_metric()
        structural['complexity'] = self.calculate_structural_complexity()

        report = {'structural': structural}

        # Without job/task data there is nothing more to measure.
        if self.jobs_df is None or not len(self.jobs_df) > 0:
            return report

        report['task_distribution'] = self.calculate_task_distribution_metrics()
        report['semantic_coherence'] = self.calculate_semantic_coherence()
        report['hierarchical_consistency'] = self.calculate_hierarchical_consistency()
        report['sibling_coherence'] = self.calculate_sibling_coherence()
        report['intermediate_node_quality'] = self.calculate_intermediate_node_quality()
        return report
    
    def print_report(self):
        """
        Print a formatted quality analysis report to stdout.

        The report is obtained from generate_report(). Structural metrics are
        always printed. The semantic, hierarchical, sibling and
        intermediate-node sections are printed only when the corresponding
        entry is present, non-empty and does not carry an 'error' key, so a
        metric that could not be computed is skipped instead of raising
        KeyError.
        """
        report = self.generate_report()

        def _ready(section):
            """Return the section's dict if it holds real results, else None."""
            data = report.get(section)
            if not data or 'error' in data:
                return None
            return data

        def _fmt(value, digits):
            """Format floats with a fixed number of decimals, others via str()."""
            return f"{value:.{digits}f}" if isinstance(value, float) else str(value)

        print("\n" + "=" * 70)
        print("TAXONOMY QUALITY ANALYSIS REPORT")
        print("=" * 70)

        # Structural metrics
        structural = report['structural']
        print("\n📊 STRUCTURAL METRICS")
        print("-" * 70)
        print("\nDepth Metrics:")
        for key, value in structural['depth'].items():
            print(f"  {key.replace('_', ' ').title()}: {_fmt(value, 2)}")

        print("\nBranching Metrics:")
        for key, value in structural['branching'].items():
            print(f"  {key.replace('_', ' ').title()}: {value:.2f}")

        print(f"\nBalance Coefficient (lower = better): {structural['balance']:.3f}")

        print("\nStructural Complexity:")
        for key, value in structural['complexity'].items():
            # The per-depth histogram is a nested dict; it is not printed here.
            if key != 'depth_distribution':
                print(f"  {key.replace('_', ' ').title()}: {_fmt(value, 2)}")

        # Task distribution
        if 'task_distribution' in report:
            print("\n📋 TASK DISTRIBUTION")
            print("-" * 70)
            for key, value in report['task_distribution'].items():
                print(f"  {key.replace('_', ' ').title()}: {_fmt(value, 3)}")

        # Semantic coherence
        sc = _ready('semantic_coherence')
        if sc is not None:
            print("\n🧠 SEMANTIC COHERENCE (Does the taxonomy make sense?)")
            print("-" * 70)

            print(f"  Intra-Category Coherence: {sc['intra_category_coherence']:.3f}")
            print("    → Higher = tasks within categories are more similar")
            print(f"  Inter-Category Separation: {sc['inter_category_separation']:.3f}")
            print("    → Higher = categories are more distinct")

            # These two scores may be None (or absent) when not computed.
            if sc.get('silhouette_score') is not None:
                print(f"  Silhouette Score: {sc['silhouette_score']:.3f}")
                print("    → Range [-1, 1], higher = better clustering")
            if sc.get('davies_bouldin_index') is not None:
                print(f"  Davies-Bouldin Index: {sc['davies_bouldin_index']:.3f}")
                print("    → Lower = better separated clusters")

            print(f"  Categories Analyzed: {sc['num_categories']}")

        # Hierarchical consistency
        hc = _ready('hierarchical_consistency')
        if hc is not None:
            print("\n🎯 HIERARCHICAL CONSISTENCY")
            print("-" * 70)
            print(f"  Avg Parent-Child Similarity: {hc['avg_parent_child_similarity']:.3f}")
            print("    → Higher = children are related to parents")
            print(f"  Parent-Child Relationships Analyzed: {hc['num_relationships']}")

        # Sibling coherence
        sib = _ready('sibling_coherence')
        if sib is not None:
            print("\n👥 SIBLING COHERENCE (Are categories grouped well?)")
            print("-" * 70)
            print(f"  Avg Sibling Similarity: {sib['avg_sibling_similarity']:.3f}")
            print("    → Higher = siblings under same parent are more related")
            print(f"  Sibling Pairs Analyzed: {sib['num_sibling_pairs']}")
            print(f"  Parents with Multiple Children: {sib['parents_with_multiple_children']}")

        # Intermediate node quality. This section may be {'error': ...} when
        # the taxonomy has no intermediate nodes, hence the _ready() guard.
        inq = _ready('intermediate_node_quality')
        if inq is not None:
            print("\n🌲 INTERMEDIATE NODE QUALITY")
            print("-" * 70)
            print(f"  Avg Intermediate Node Cohesion: {inq['avg_intermediate_node_cohesion']:.3f}")
            print("    → Higher = better grouping at intermediate levels")
            print(f"  Intermediate Nodes Evaluated: {inq['num_intermediate_nodes_evaluated']}")
            print(f"  Avg Leaf Descendants per Intermediate: {inq['avg_leaf_descendants']:.1f}")
            print(f"  Avg Tasks under Intermediate Nodes: {inq['avg_tasks_under_intermediate']:.1f}")

        print("\n" + "=" * 70)


def compare_taxonomies(original_taxonomy: str, restructured_taxonomy: str, 
                       original_jobs: str, restructured_jobs: str):
    """
    Compare two taxonomies with their respective job mappings.

    Builds a TaxonomyAnalyzer for each taxonomy/jobs pair, generates both
    reports and prints a before → after comparison to stdout.

    Args:
        original_taxonomy: First argument passed to TaxonomyAnalyzer for the
            original taxonomy.
        restructured_taxonomy: Same, for the restructured taxonomy.
        original_jobs: Second argument passed to TaxonomyAnalyzer for the
            original taxonomy.
        restructured_jobs: Same, for the restructured taxonomy.

    An optional section (semantic / sibling / intermediate-node) is compared
    only when BOTH reports contain real results for it; a metric that is
    missing, empty or flagged with 'error' on either side is skipped rather
    than compared against a made-up 0.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("ANALYZING ORIGINAL TAXONOMY")
    print(rule)
    original = TaxonomyAnalyzer(original_taxonomy, original_jobs)
    original_report = original.generate_report()

    print("\n" + rule)
    print("ANALYZING RESTRUCTURED TAXONOMY")
    print(rule)
    restructured = TaxonomyAnalyzer(restructured_taxonomy, restructured_jobs)
    restructured_report = restructured.generate_report()

    def _both(section):
        """Return (original, restructured) section dicts, or None unless both hold results."""
        before = original_report.get(section)
        after = restructured_report.get(section)
        for data in (before, after):
            if not data or 'error' in data:
                return None
        return before, after

    # Comparison
    print("\n" + rule)
    print("COMPARATIVE ANALYSIS")
    print(rule)

    orig_structural = original_report['structural']
    new_structural = restructured_report['structural']

    # Depth improvement
    orig_depth = orig_structural['depth']['average_depth']
    new_depth = new_structural['depth']['average_depth']
    print(f"\n✅ Average Depth: {orig_depth:.2f} → {new_depth:.2f} ({new_depth - orig_depth:+.2f})")

    # Structural complexity improvements
    orig_intermediate = orig_structural['complexity']['intermediate_nodes']
    new_intermediate = new_structural['complexity']['intermediate_nodes']
    print(f"✅ Intermediate Nodes: {orig_intermediate} → {new_intermediate} ({new_intermediate - orig_intermediate:+d})")

    orig_total = orig_structural['complexity']['total_nodes']
    new_total = new_structural['complexity']['total_nodes']
    print(f"✅ Total Nodes: {orig_total} → {new_total} ({new_total - orig_total:+d})")

    # Balance improvement. Lower is better, so the delta is original - new
    # and a positive number means the restructured taxonomy improved.
    orig_balance = orig_structural['balance']
    new_balance = new_structural['balance']
    print(f"✅ Balance (lower=better): {orig_balance:.3f} → {new_balance:.3f} ({orig_balance - new_balance:+.3f})")

    # Semantic improvements
    semantic = _both('semantic_coherence')
    if semantic is not None:
        orig_sem, new_sem = semantic

        orig_coherence = orig_sem['intra_category_coherence']
        new_coherence = new_sem['intra_category_coherence']
        print(f"✅ Intra-Category Coherence: {orig_coherence:.3f} → {new_coherence:.3f} ({new_coherence - orig_coherence:+.3f})")

        orig_sep = orig_sem['inter_category_separation']
        new_sep = new_sem['inter_category_separation']
        print(f"✅ Inter-Category Separation: {orig_sep:.3f} → {new_sep:.3f} ({new_sep - orig_sep:+.3f})")

        orig_sil = orig_sem.get('silhouette_score')
        new_sil = new_sem.get('silhouette_score')
        if orig_sil is not None and new_sil is not None:
            print(f"✅ Silhouette Score: {orig_sil:.3f} → {new_sil:.3f} ({new_sil - orig_sil:+.3f})")

    # Sibling coherence improvements
    sibling = _both('sibling_coherence')
    if sibling is not None:
        print("\n🔥 KEY IMPROVEMENT: Sibling Coherence")
        orig_sib = sibling[0]['avg_sibling_similarity']
        new_sib = sibling[1]['avg_sibling_similarity']
        print(f"✅ Avg Sibling Similarity: {orig_sib:.3f} → {new_sib:.3f} ({new_sib - orig_sib:+.3f})")
        # Only claim success when the number actually went up.
        if new_sib > orig_sib:
            print("   → This shows restructuring grouped related categories together!")

    # Intermediate node quality
    intermediate = _both('intermediate_node_quality')
    if intermediate is not None:
        print("\n🔥 KEY IMPROVEMENT: Intermediate Node Quality")
        orig_inq = intermediate[0]['avg_intermediate_node_cohesion']
        new_inq = intermediate[1]['avg_intermediate_node_cohesion']
        print(f"✅ Intermediate Node Cohesion: {orig_inq:.3f} → {new_inq:.3f} ({new_inq - orig_inq:+.3f})")

        orig_num_with_tasks = intermediate[0]['num_intermediate_nodes_evaluated']
        new_num_with_tasks = intermediate[1]['num_intermediate_nodes_evaluated']
        print(f"✅ Intermediate Nodes Evaluated: {orig_num_with_tasks} → {new_num_with_tasks} ({new_num_with_tasks - orig_num_with_tasks:+d})")
        if new_num_with_tasks > orig_num_with_tasks:
            print("   → More organizational structure with quality groupings!")

    # Summary
    print("\n" + rule)
    print("📊 SUMMARY")
    print(rule)
    print(f"✓ Added {new_intermediate - orig_intermediate} intermediate nodes for better organization")
    print(f"✓ Increased average depth by {new_depth - orig_depth:.2f} levels (more specificity)")

    if sibling is not None:
        orig_sib = sibling[0]['avg_sibling_similarity']
        new_sib = sibling[1]['avg_sibling_similarity']
        if new_sib > orig_sib:
            print(f"✓ Improved sibling coherence by {new_sib - orig_sib:.3f} (better groupings)")

    # A relative size change is undefined for an empty original taxonomy.
    if orig_total > 0:
        print(f"✓ Overall taxonomy size increased by {((new_total - orig_total) / orig_total * 100):.1f}%")
    else:
        print(f"✓ Overall taxonomy size: {orig_total} → {new_total} nodes (original taxonomy was empty)")

    print("\n" + rule)


if __name__ == "__main__":
    # sys is not referenced in this block; the import is kept so the names
    # bound by running the script stay the same.
    import sys

    # Step 1: full quality report for the restructured taxonomy alone.
    print("=" * 70, "ANALYZING RESTRUCTURED TAXONOMY", "=" * 70, sep="\n")
    analyzer = TaxonomyAnalyzer(
        '../taxonomy_restructured.json',
        '../job_tasks_with_skills_restructured.csv',
    )
    analyzer.print_report()

    # Step 2: side-by-side comparison of the original and restructured
    # taxonomies, each with its own job mapping.
    print("\n\n")
    print("=" * 70, "COMPARISON: ORIGINAL vs RESTRUCTURED", "=" * 70, sep="\n")
    compare_taxonomies(
        '../taxonomy.json',
        '../taxonomy_restructured.json',
        '../job_tasks_with_skills.csv',
        '../job_tasks_with_skills_restructured.csv',
    )

ANALYZING RESTRUCTURED TAXONOMY
Loading sentence embedding model (this may take a moment)...
Generating embeddings for 844 tasks (leaf nodes only)...


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

✓ Embeddings ready for 844 tasks in leaf nodes
  [Debug] Found 321 intermediate nodes
  [Debug] Sample intermediate: ['Operations & Scheduling', 'Operations & Scheduling > Scheduling & Calendar Management', 'Operations & Scheduling > Scheduling & Calendar Management > Workforce Scheduling']
  [Debug] Evaluated 61 intermediate nodes with descendants

TAXONOMY QUALITY ANALYSIS REPORT

📊 STRUCTURAL METRICS
----------------------------------------------------------------------

Depth Metrics:
  Average Depth: 2.35
  Max Depth: 5
  Min Depth: 0
  Std Depth: 0.81
  Total Nodes: 643
  Leaf Nodes: 322

Branching Metrics:
  Avg Branching Factor: 1.96
  Max Branching Factor: 19.00
  Min Branching Factor: 1.00
  Std Branching Factor: 2.69

Balance Coefficient (lower = better): 0.000

Structural Complexity:
  Total Nodes: 643
  Leaf Nodes: 322
  Intermediate Nodes: 321
  Intermediate To Leaf Ratio: 1.00

📋 TASK DISTRIBUTION
----------------------------------------------------------------------
  T

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

✓ Embeddings ready for 793 tasks in leaf nodes
  [Debug] Found 27 intermediate nodes
  [Debug] Sample intermediate: ['Scheduling', 'Record Keeping', 'Record Keeping > Insurance Claims Documentation']
  [Debug] Evaluated 19 intermediate nodes with descendants

ANALYZING RESTRUCTURED TAXONOMY
Loading sentence embedding model (this may take a moment)...
Generating embeddings for 844 tasks (leaf nodes only)...


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

✓ Embeddings ready for 844 tasks in leaf nodes
  [Debug] Found 321 intermediate nodes
  [Debug] Sample intermediate: ['Operations & Scheduling', 'Operations & Scheduling > Scheduling & Calendar Management', 'Operations & Scheduling > Scheduling & Calendar Management > Workforce Scheduling']
  [Debug] Evaluated 61 intermediate nodes with descendants

COMPARATIVE ANALYSIS

✅ Average Depth: 1.12 → 2.35 (+1.23)
✅ Intermediate Nodes: 27 → 321 (+294)
✅ Total Nodes: 286 → 643 (+357)
✅ Balance (lower=better): 0.000 → 0.000 (+0.000)
✅ Intra-Category Coherence: 0.494 → 0.494 (-0.000)
✅ Inter-Category Separation: 0.640 → 0.636 (-0.004)

🔥 KEY IMPROVEMENT: Sibling Coherence
✅ Avg Sibling Similarity: 0.386 → 0.450 (+0.064)
   → This shows restructuring grouped related categories together!

🔥 KEY IMPROVEMENT: Intermediate Node Quality
✅ Intermediate Node Cohesion: 0.397 → 0.408 (+0.011)
✅ Intermediate Nodes Evaluated: 19 → 61 (+42)
   → More organizational structure with quality groupings!

📊 SUMMAR