# SEEK Job Metadata Extraction — Case Study
This notebook implements a hybrid pipeline combining **traditional NLP** and **Generative AI** techniques to extract structured information (skills, responsibilities, requirements) from SEEK job advertisements.

**Author:** ChatGPT (generated for Xiaoshi Lu)

## Overview
- Load and inspect the SEEK job ad dataset
- Extract and clean text fields
- Apply NLP keyword extraction (KeyBERT, heuristics)
- Apply Generative AI structured extraction (LLM)
- Evaluate, summarize and export results


In [None]:
# Import libraries and define functions
# (Content copied from seek_job_metadata_analysis.py)
"""
seek_job_metadata_analysis.py

Hybrid pipeline for extracting structured metadata (skills, responsibilities, requirements)
from SEEK job ads (each row JSON in a CSV).

Designed for local execution with optional LLM API integration (OpenAI-compatible).
Produces:
 - structured JSON output per ad
 - evaluation utilities (manual labels or sampling)
 - simple PowerPoint export for presentation

Usage:
    1. Install required Python packages (examples provided below).
    2. Place your `random_rows.csv` in the same folder or pass the path.
    3. Export OPENAI_API_KEY in your environment if using OpenAI LLM calls.
    4. Run: python seek_job_metadata_analysis.py --input /mnt/data/random_rows.csv --output ./results.jsonl

Requirements (suggested):
    pip install -U spacy pandas tqdm python-dotenv sentence-transformers keybert openai python-pptx rake-nltk nltk
    python -m spacy download en_core_web_sm

Notes:
 - LLM usage is optional. The script contains an `llm_extract` function that calls OpenAI's chat completions API;
   the pipeline only invokes it when an API key is supplied. You can adapt it to other providers by replacing `call_openai_llm`.
 - The code is modular: replace or extend extraction methods as desired.

Author: ChatGPT (generated)
Date: 2025
"""

import argparse
import html
import json
import logging
import os
import re
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

from tqdm import tqdm

# --- Optional heavy imports guarded with try/except for graceful degradation ---
try:
    import pandas as pd
except Exception:
    pd = None

try:
    import spacy
    from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
except Exception:
    spacy = None

try:
    from sentence_transformers import SentenceTransformer, util as sbert_util
except Exception:
    SentenceTransformer = None
    sbert_util = None

try:
    from keybert import KeyBERT
except Exception:
    KeyBERT = None

try:
    import openai
except Exception:
    openai = None

try:
    from pptx import Presentation
    from pptx.util import Inches, Pt
except Exception:
    Presentation = None

# Fallback simple keyword extraction using regex and stopwords
import math
from collections import Counter, defaultdict

# --- Logging ---
# Configure root logging once at import time (INFO level, timestamped records) and
# create the named logger that the functions in this module write to.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("seek-extract")

# -------------------------
# Utilities and preprocessing
# -------------------------
def _read_json_lines(path: Path) -> List[Any]:
    """
    Parse *path* as JSON-lines when its content starts with "{".

    Returns the decoded entries, or an empty list when the file does not look
    like JSON-lines, cannot be read as UTF-8 text, or has no parseable line.
    """
    entries: List[Any] = []
    skipped = 0
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Only the first non-whitespace character decides the format.
            if not f.read(1000).lstrip().startswith("{"):
                return []
            f.seek(0)
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entries.append(json.loads(line))
                except (ValueError, RecursionError):
                    # Maybe a CSV-like row with the JSON payload after the first comma.
                    try:
                        entries.append(json.loads(line.split(",", 1)[-1]))
                    except (ValueError, RecursionError):
                        skipped += 1
    except (OSError, UnicodeDecodeError) as exc:
        # Best effort: report the problem and let the caller fall back to CSV parsing.
        logger.warning("Could not read %s as JSON-lines (%s); falling back to CSV parsing", path, exc)
        return []
    if skipped:
        logger.warning("Skipped %d unparseable line(s) in %s", skipped, path)
    return entries


def load_jsonl_csv(filepath: str, json_column: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Load job ads from a JSON-lines file or from a CSV file.

    Strategy:
      - If the file content starts with "{", every non-empty line is parsed as JSON
        (for lines that fail, the text after the first comma is tried as JSON).
      - Otherwise (or if no line could be parsed) the file is read with pandas, all
        cells as strings.
          * If `json_column` names an existing column, each cell of that column is
            parsed as JSON; cells that fail to parse become {"raw_text": <cell>}.
          * Else every row becomes a dict of its columns. Cells holding a JSON object
            are additionally flattened into the row: their keys are added to the
            record without overwriting existing columns.

    Args:
        filepath: Path to the input file.
        json_column: Optional name of the CSV column that holds one JSON document per row.

    Returns:
        List of job-ad records.

    Raises:
        FileNotFoundError: If `filepath` does not exist.
        RuntimeError: If the CSV fallback is needed but pandas is not installed.
    """
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(filepath)

    entries = _read_json_lines(path)
    if entries:
        logger.info("Loaded %d JSON-lines entries from %s", len(entries), filepath)
        return entries

    # Fallback: try pandas
    if pd is None:
        raise RuntimeError("pandas not available and file is not JSON-lines. Install pandas or provide JSON-lines file.")
    df = pd.read_csv(path, dtype=str, keep_default_na=False, na_filter=False)
    logger.info("CSV loaded with shape %s", df.shape)

    if json_column:
        if json_column in df.columns:
            records = []
            for val in df[json_column].astype(str).tolist():
                try:
                    records.append(json.loads(val))
                except Exception:
                    records.append({"raw_text": val})
            return records
        logger.warning("Column %r not found in %s; returning flat CSV rows instead", json_column, filepath)

    # Otherwise, convert each row to dict
    records = df.replace({pd.NA: None}).to_dict(orient="records")
    for record in records:
        # Snapshot the values because the record is extended while scanning it.
        for value in list(record.values()):
            if not (isinstance(value, str) and value.lstrip().startswith("{")):
                continue
            try:
                embedded = json.loads(value)
            except (ValueError, RecursionError):
                continue
            if isinstance(embedded, dict):
                for key, item in embedded.items():
                    # Never overwrite a real CSV column with an embedded key.
                    record.setdefault(key, item)
    return records

def basic_clean_text(text: str) -> str:
    """
    Turn raw ad markup into a single line of plain text.

    Strips HTML tags, decodes HTML entities (e.g. "&amp;", "&nbsp;"), replaces
    escaped "\\r", "\\n" and "\\t" sequences with spaces and collapses all
    whitespace. Returns "" for empty or None input.
    """
    if not text:
        return ""
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tag removal so an escaped "&lt;b&gt;" stays literal
    # text instead of being stripped as a tag.
    text = html.unescape(text)
    # Replace escape sequences that survived as literal backslash pairs
    for escaped in ("\\r", "\\n", "\\t"):
        text = text.replace(escaped, " ")
    # Normalize whitespace (also covers the non-breaking spaces produced by &nbsp;)
    return re.sub(r"\s+", " ", text).strip()

# -------------------------
# Baseline: simple keyword extraction (RAKE-like)
# -------------------------
# Default stopwords used when the caller does not supply a set.
_DEFAULT_STOPWORDS = frozenset({
    "and", "or", "the", "a", "an", "with", "to", "of", "in", "for", "on", "at", "by",
    "from", "as", "that", "is", "are", "be", "will", "experience",
})


def simple_keyword_candidates(text: str, stopwords: Optional[set] = None, max_ngram: int = 3, top_k: int = 30) -> List[str]:
    """
    Return the most frequent stopword-free n-grams of *text*.

    The text is lowercased and every character other than letters, digits,
    whitespace, "/", "+" and "-" is replaced by a space. All n-grams of 1 to
    `max_ngram` tokens are candidates, except those that contain a stopword or
    a token without any letter or digit (e.g. a stray "-" bullet).

    Args:
        text: Text to analyse.
        stopwords: Lowercase tokens that disqualify a phrase; a small built-in
            set is used when None.
        max_ngram: Maximum number of tokens per phrase.
        top_k: Maximum number of phrases returned (None returns all).

    Returns:
        Phrases ordered by descending frequency; ties keep first-seen order.
    """
    if not text or max_ngram < 1:
        return []
    if stopwords is None:
        stopwords = _DEFAULT_STOPWORDS
    # Lowercase and remove non-alphanum except spaces and / + -
    tokens = re.sub(r"[^A-Za-z0-9\s/+\-]", " ", text.lower()).split()
    # A blocked token disqualifies every phrase that contains it.
    blocked = [tok in stopwords or not any(ch.isalnum() for ch in tok) for tok in tokens]
    counts = Counter()
    n = len(tokens)
    for start in range(n):
        for size in range(1, min(max_ngram, n - start) + 1):
            if blocked[start + size - 1]:
                # Every longer phrase from this start would contain the blocked token too.
                break
            phrase = " ".join(tokens[start:start + size])
            if len(phrase) < 2:
                continue
            counts[phrase] += 1
    # Score by frequency
    return [kw for kw, _ in counts.most_common(top_k)]

# -------------------------
# KeyBERT wrapper (if available)
# -------------------------
# KeyBERT instances keyed by embedding model name, so the model is loaded only once.
_KEYBERT_MODELS: Dict[str, Any] = {}


def keybert_extract(text: str, top_n: int = 10, model_name: str = "all-MiniLM-L6-v2") -> List[str]:
    """
    Extract up to `top_n` keyphrases (1 to 3 words) from *text* with KeyBERT.

    The KeyBERT model for each `model_name` is created on first use and reused
    afterwards; building it for every call would reload the sentence-transformer
    weights once per job ad.

    Returns an empty list when the text is blank, when keybert or
    sentence-transformers is not installed, or when extraction fails (the
    failure is logged as a warning).
    """
    if not text or not text.strip():
        return []
    if KeyBERT is None or SentenceTransformer is None:
        return []
    try:
        kw_model = _KEYBERT_MODELS.get(model_name)
        if kw_model is None:
            kw_model = KeyBERT(SentenceTransformer(model_name))
            _KEYBERT_MODELS[model_name] = kw_model
        keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,3), stop_words="english", top_n=top_n)
        # Each result is a (phrase, score) pair; only the phrase is returned.
        return [k[0] for k in keywords]
    except Exception as e:
        logger.warning("KeyBERT extract failed: %s", e)
        return []

# -------------------------
# Embedding helpers
# -------------------------
def load_sbert_model(model_name: str = "all-MiniLM-L6-v2"):
    """
    Build a SentenceTransformer for `model_name`.

    Raises:
        RuntimeError: If sentence-transformers could not be imported.
    """
    if SentenceTransformer is not None:
        return SentenceTransformer(model_name)
    raise RuntimeError("sentence-transformers not installed.")

def embed_texts(model, texts: List[str]):
    """
    Encode `texts` with `model.encode`, asking for a tensor and no progress bar.

    Raises:
        RuntimeError: If `model` is None.
    """
    if model is not None:
        return model.encode(texts, convert_to_tensor=True, show_progress_bar=False)
    raise RuntimeError("Embedding model not loaded.")

# -------------------------
# LLM extraction (OpenAI-compatible)
# -------------------------
# Default prompt template for llm_extract. The `{job_text}` placeholder is filled in
# with str.format, which is why the braces of the JSON example are doubled ({{ }}).
DEFAULT_PROMPT = """
You are a helpful assistant that extracts structured information from job advertisements.

Input job ad:
{job_text}

Instruction:
Extract the following fields from the job ad. If a field is not present, return an empty list for that field.

- skills: short list of skills, tools, technologies (e.g., "Python", "SQL", "scikit-learn", "communication")
- responsibilities: short concise list of responsibilities / duties described in the ad
- requirements: candidate requirements such as experience, qualifications, degrees, certifications, years of experience

Return a JSON object ONLY with keys: "skills", "responsibilities", "requirements". Example:
{{"skills": ["Python", "SQL"], "responsibilities": ["build predictive models"], "requirements": ["3+ years experience", "Bachelor's degree"]}}
"""

def call_openai_llm(system_prompt: str, user_prompt: str, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
    """
    Send one system + user message pair to an OpenAI chat model and return the reply text.

    Works with both generations of the `openai` package: the client-based API of
    openai>=1.0 (`openai.OpenAI(...).chat.completions.create`) and the legacy
    module-level `openai.ChatCompletion.create`, which newer releases no longer provide.
    Replace with your own provider wrapper if needed.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        api_key: API key; read from the OPENAI_API_KEY environment variable when None.
        model: Chat model name.

    Returns:
        The text of the first choice ("" if the model returned no content).

    Raises:
        RuntimeError: If the openai package or an API key is missing.
        Exception: Any error raised by the API call is logged and re-raised.
    """
    if openai is None:
        raise RuntimeError("openai package not installed or available.")
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("No OpenAI API key found in environment or arguments.")
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: per-client credentials, attribute-style response objects.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=1000
            )
            text = response.choices[0].message.content
        else:
            # Legacy openai<1.0: module-level key and dict-style responses.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=1000
            )
            text = response["choices"][0]["message"]["content"]
        # Callers run string methods on the result, so never hand back None.
        return text or ""
    except Exception as e:
        logger.error("OpenAI call failed: %s", e)
        raise

# Fields requested from the LLM, in output order.
_LLM_FIELDS = ("skills", "responsibilities", "requirements")

# Fallback line format: optional bullet, optional quotes, field name, ":" or "-", then the items.
_LLM_FALLBACK_LINE = re.compile(
    r"\s*(?:[-*]\s*)?\"?(skills|responsibilities|requirements)\"?\s*[:\-]\s*(.*)", re.I
)


def _first_json_object(raw: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object that can be decoded from *raw*, or None."""
    decoder = json.JSONDecoder()
    pos = raw.find("{")
    while pos != -1:
        try:
            obj, _ = decoder.raw_decode(raw, pos)
        except ValueError:
            obj = None
        if isinstance(obj, dict):
            return obj
        pos = raw.find("{", pos + 1)
    return None


def llm_extract(job_text: str, model: str = "gpt-4o-mini", api_key: Optional[str] = None, prompt_template: str = DEFAULT_PROMPT, safe_mode: bool = True) -> Dict[str, List[str]]:
    """
    Extract structured fields from a job ad via the LLM.

    The reply is scanned for the first decodable JSON object, so text around the
    object (explanations, trailing braces) does not break parsing. When no JSON
    object is found, lines of the form "skills: a, b" (optionally bulleted or
    quoted) are parsed for all three fields instead.

    Args:
        job_text: Cleaned job ad text inserted into the prompt.
        model: Chat model name passed to `call_openai_llm`.
        api_key: API key passed to `call_openai_llm`.
        prompt_template: Template with a `{job_text}` placeholder (str.format syntax).
        safe_mode: Accepted for compatibility; not used by this function.

    Returns:
        Dict with the keys "skills", "responsibilities" and "requirements", each a
        list of strings (empty when nothing was extracted).
    """
    prompt = prompt_template.format(job_text=job_text)
    raw = call_openai_llm(system_prompt="You are a JSON-output assistant.", user_prompt=prompt, api_key=api_key, model=model) or ""
    parsed = {field: [] for field in _LLM_FIELDS}

    payload = _first_json_object(raw)
    if payload is not None:
        for field in _LLM_FIELDS:
            values = payload.get(field)
            # Only list-valued fields are trusted; falsy items are dropped.
            if isinstance(values, list):
                parsed[field] = [str(x).strip() for x in values if x]
        return parsed

    logger.warning("No JSON object found in LLM output; using line-based fallback. Raw output: %s", raw[:500])
    for line in raw.splitlines():
        m = _LLM_FALLBACK_LINE.match(line)
        if not m:
            continue
        # Strip list/quote punctuation left over from truncated or malformed JSON.
        items = [x.strip(" \t[]\"'") for x in re.split(r",|\;", m.group(2))]
        items = [x for x in items if x]
        if items:
            parsed[m.group(1).lower()] = items
    return parsed

# -------------------------
# High-level pipeline
# -------------------------
# Ad fields that may hold the long description text, in lookup order.
_TEXT_FIELD_KEYS = ("description", "jobDescription", "summary", "adText", "details")

# Sentence boundary: end punctuation followed by whitespace or end of text, or a line
# break. A bare "." is not a boundary, so "Node.js" and "3.5 years" stay intact.
_SENTENCE_SPLIT_RE = re.compile(r"[.!?](?:\s+|$)|[\r\n]+")

# Stems are followed by \w* so that inflected forms ("responsibilities", "required",
# "requirements") match; a bare stem between \b anchors never would.
_RESPONSIBILITY_RE = re.compile(r"\b(?:responsib\w*|you will)\b", re.I)
_REQUIREMENT_RE = re.compile(r"\b(?:requir\w*|must|preferred|experience|degree|years)\b", re.I)


def _heuristic_sentences(job_text: str) -> Tuple[List[str], List[str]]:
    """Split *job_text* into sentences and return (responsibilities, requirements) matches."""
    responsibilities: List[str] = []
    requirements: List[str] = []
    for sentence in _SENTENCE_SPLIT_RE.split(job_text):
        sentence = sentence.strip()
        # Very short fragments are headings or noise rather than statements.
        if len(sentence) < 20:
            continue
        if _RESPONSIBILITY_RE.search(sentence):
            responsibilities.append(sentence)
        if _REQUIREMENT_RE.search(sentence):
            requirements.append(sentence)
    return responsibilities, requirements


def extract_from_ad(ad: Dict[str, Any], sbert_model=None, use_keybert: bool = True, llm_api_key: Optional[str] = None, llm_model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """
    Extract metadata from a single job ad dict.

    Args:
        ad: Job ad record; text is taken from the description-like fields and the title.
        sbert_model: Accepted for compatibility; not used by this function.
        use_keybert: Add KeyBERT keyphrases to the baseline skills when the library is available.
        llm_api_key: When set, the LLM extraction is run and fills the `*_llm` fields.
        llm_model: Model name for the LLM extraction.

    Returns:
        Dict with "id", "title", "raw_text", the three `*_baseline` lists (keyword and
        sentence heuristics) and the three `*_llm` lists (empty without an API key or
        when the LLM call fails).
    """
    title = ad.get("title") or ad.get("jobTitle") or ad.get("headline") or ""
    # Long text fields that commonly contain responsibilities/skills
    text_fields = [str(ad[key]) for key in _TEXT_FIELD_KEYS if key in ad and ad[key]]
    # Also include title and other short fields
    text_fields.append(str(title))
    job_text = "\n".join([basic_clean_text(t) for t in text_fields if t])

    result = {
        "id": ad.get("id") or ad.get("jobId") or None,
        "title": title,
        "raw_text": job_text,
        "skills_baseline": [],
        "responsibilities_baseline": [],
        "requirements_baseline": [],
        "skills_llm": [],
        "responsibilities_llm": [],
        "requirements_llm": []
    }

    # Baseline extraction: simple keywords from job_text
    baseline_keywords = simple_keyword_candidates(job_text, stopwords=set(SPACY_STOPWORDS) if spacy else None, max_ngram=2)
    result["skills_baseline"] = baseline_keywords[:20]

    # KeyBERT if available
    if use_keybert and KeyBERT is not None and SentenceTransformer is not None:
        try:
            kb = keybert_extract(job_text, top_n=15)
            result["skills_baseline"] = list(dict.fromkeys(kb + result["skills_baseline"]))  # preserve order, de-dup
        except Exception as e:
            logger.debug("KeyBERT failed: %s", e)

    # LLM-based extraction (if API key is provided)
    if llm_api_key:
        try:
            parsed = llm_extract(job_text, model=llm_model, api_key=llm_api_key)
            result["skills_llm"] = parsed.get("skills", [])
            result["responsibilities_llm"] = parsed.get("responsibilities", [])
            result["requirements_llm"] = parsed.get("requirements", [])
        except Exception as e:
            logger.warning("LLM extraction failed for ad id %s: %s", result.get("id"), e)

    # Simple heuristic for responsibilities/requirements from sentences (fallback)
    responsibilities, requirements = _heuristic_sentences(job_text)
    result["responsibilities_baseline"] = responsibilities[:10]
    result["requirements_baseline"] = requirements[:10]

    # Embedding similarity optional: cluster or dedupe skills (not implemented fully here)
    return result

# -------------------------
# Batch processing and evaluation
# -------------------------
def process_ads(ads: List[Dict[str, Any]], output_path: str, llm_api_key: Optional[str] = None, max_records: Optional[int] = None):
    """
    Run `extract_from_ad` over the ads and write one JSON line per result.

    Only the first `max_records` ads are processed when that value is truthy.
    An ad whose extraction or serialisation fails is logged and skipped.
    Parent directories of `output_path` are created as needed.

    Returns:
        The output location as a `Path`.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    selected = ads[:max_records] if max_records else ads
    written = 0
    with open(destination, "w", encoding="utf-8") as sink:
        for job_ad in tqdm(selected, desc="Processing ads"):
            try:
                extracted = extract_from_ad(job_ad, llm_api_key=llm_api_key)
                sink.write(json.dumps(extracted, ensure_ascii=False) + "\n")
            except Exception as e:
                logger.error("Failed to process ad: %s", e)
            else:
                written += 1
    logger.info("Wrote %d extracted records to %s", written, destination)
    return destination

def sample_and_manual_label(output_jsonl: str, sample_n: int = 10) -> List[Dict[str, Any]]:
    """
    Return the first `sample_n` records of a results JSONL file.

    The records are only read and returned; labelling them is left to the
    caller (for example in a notebook).
    """
    records = []
    with open(output_jsonl, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # One record is appended per line read, so the list length is the line index.
            if len(records) >= sample_n:
                break
            records.append(json.loads(raw_line))
    return records

def evaluate_against_manual(predictions_jsonl: str, manual_labels: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Compute average precision/recall of skill extraction against manual labels.

    Predictions (lines of `predictions_jsonl`) and `manual_labels` are paired by
    position. For each pair the lowercased union of "skills_llm" and
    "skills_baseline" is compared with the label's "skills" by exact match.
    Pairs where both sides are empty carry no information and are excluded from
    the averages and from "n_evaluated".

    Returns:
        Dict with "n_evaluated", "precision_avg" and "recall_avg"
        (averages are 0.0 when nothing was evaluated).
    """
    with open(predictions_jsonl, "r", encoding="utf-8") as f:
        preds = [json.loads(line) for line in f if line.strip()]

    evaluated = 0
    total_precision = 0.0
    total_recall = 0.0
    # zip stops at the shorter sequence, i.e. surplus predictions or labels are ignored.
    for pred, gold in zip(preds, manual_labels):
        predicted = (pred.get("skills_llm") or []) + (pred.get("skills_baseline") or [])
        p_skills = {str(x).lower() for x in predicted}
        g_skills = {str(x).lower() for x in (gold.get("skills") or [])}
        if not p_skills and not g_skills:
            continue
        tp = len(p_skills & g_skills)
        if p_skills:
            total_precision += tp / len(p_skills)
        if g_skills:
            total_recall += tp / len(g_skills)
        evaluated += 1
    return {
        "n_evaluated": evaluated,
        "precision_avg": total_precision / evaluated if evaluated else 0.0,
        "recall_avg": total_recall / evaluated if evaluated else 0.0
    }

# -------------------------
# Simple PowerPoint creator
# -------------------------
def create_presentation(summary: Dict[str, Any], out_path: str = "seek_solution_presentation.pptx"):
    """
    Write a three-slide PowerPoint deck (title, approach, sample results).

    Up to six entries of `summary["sample_lines"]` are listed on the results
    slide; non-string entries are JSON-encoded and cut to 200 characters.

    Returns:
        `out_path` after saving, or None when python-pptx is not installed.
    """
    if Presentation is None:
        logger.warning("python-pptx not installed; skipping PPTX generation.")
        return None

    deck = Presentation()

    # Title slide
    cover = deck.slides.add_slide(deck.slide_layouts[0])
    heading = cover.shapes.title
    tagline = cover.placeholders[1]
    heading.text = "SEEK Job Metadata Extraction — Case Study"
    tagline.text = "Hybrid pipeline: Traditional NLP + Generative AI\nAuto-generated summary"

    # Approach slide
    approach = deck.slides.add_slide(deck.slide_layouts[1])
    approach.shapes.title.text = "Approach"
    approach_frame = approach.shapes.placeholders[1].text_frame
    approach_frame.text = "1) Data ingestion and cleaning\n2) Baseline keyword extraction (KeyBERT / heuristics)\n3) LLM-based structured extraction\n4) Evaluation & scaling considerations"

    # Results slide
    results = deck.slides.add_slide(deck.slide_layouts[1])
    results.shapes.title.text = "Sample Results"
    results_frame = results.shapes.placeholders[1].text_frame
    for entry in summary.get("sample_lines", ["No sample"])[:6]:
        bullet = results_frame.add_paragraph()
        bullet.level = 1
        if isinstance(entry, str):
            bullet.text = entry
        else:
            bullet.text = json.dumps(entry)[:200]

    deck.save(out_path)
    logger.info("Presentation saved to %s", out_path)
    return out_path

# -------------------------
# CLI and main
# -------------------------
def parse_args(argv: Optional[List[str]] = None):
    """
    Parse the pipeline's command-line options.

    Args:
        argv: Argument list to parse. When None (the default) argparse reads
            `sys.argv[1:]`; pass an explicit list to call this from a notebook
            or from tests.

    Returns:
        `argparse.Namespace` with `input`, `output`, `max`, `llm`, `llm_model` and `pptx`.
    """
    parser = argparse.ArgumentParser(description="SEEK job metadata extraction pipeline")
    parser.add_argument("--input", "-i", type=str, default="/mnt/data/random_rows.csv", help="Path to input CSV/JSONL")
    parser.add_argument("--output", "-o", type=str, default="./seek_extracted.jsonl", help="Path to output JSONL")
    parser.add_argument("--max", "-m", type=int, default=200, help="Max records to process (for testing)")
    parser.add_argument("--llm", action="store_true", help="Use LLM extraction (requires OPENAI_API_KEY in env)")
    parser.add_argument("--llm_model", type=str, default="gpt-4o-mini", help="LLM model name (OpenAI-compatible)")
    parser.add_argument("--pptx", action="store_true", help="Generate a simple PPTX summary")
    return parser.parse_args(argv)

def main():
    """
    Command-line entry point: load the ads, extract metadata, optionally build a PPTX.

    Returns early (after logging an error) when --llm is given but
    OPENAI_API_KEY is not set in the environment.
    """
    cli = parse_args()
    limit = cli.max

    # Load data
    job_ads = load_jsonl_csv(cli.input)
    logger.info("Loaded %d ads", len(job_ads))

    api_key = os.environ.get("OPENAI_API_KEY") if cli.llm else None
    if cli.llm and not api_key:
        logger.error("LLM extraction requested but OPENAI_API_KEY not found in environment. Exiting.")
        return

    results_file = process_ads(job_ads, cli.output, llm_api_key=api_key, max_records=limit)

    # Sample for manual labeling and quick summary
    preview = sample_and_manual_label(str(results_file), sample_n=6)
    selected = job_ads[:limit] if limit else job_ads
    summary = {
        "n_processed": len(selected),
        # Fall back to the start of the raw text when a record has no title.
        "sample_lines": [rec.get("title") or rec.get("raw_text")[:200] for rec in preview],
    }

    if cli.pptx:
        deck_path = create_presentation(summary, out_path="seek_solution_presentation.pptx")
        if deck_path:
            logger.info("PPTX created at %s", deck_path)

    logger.info("Done. Output written to %s", results_file)

# Run the pipeline only when this code is executed as the main program, not on import.
if __name__ == "__main__":
    main()


## Example usage
Below is an example of how to run the pipeline from this notebook:

```python
ads = load_jsonl_csv('/mnt/data/random_rows.csv')
results_path = process_ads(ads, './seek_extracted.jsonl', llm_api_key=os.getenv('OPENAI_API_KEY'), max_records=20)
sample = sample_and_manual_label(results_path, sample_n=3)
sample
```

You can modify the parameters and explore the structured outputs interactively.