# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [None]:
import subprocess

def is_running_in_colab():
    """Return True when the ``google.colab`` package can be imported, else False."""
    try:
        import google.colab
    except ImportError:
        # The package is only importable inside a Colab runtime.
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory of the first GPU reported by ``nvidia-smi``, in GB.

    Returns 0.0 when ``nvidia-smi`` cannot be run or its output cannot be parsed.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        # One line per GPU; only the first line is used.
        first_gpu_mb = int(report.strip().split("\n")[0])
    except Exception:
        return 0.0
    return first_gpu_mb / 1024


## Install dependencies

In [None]:
# Install the notebook's dependencies. The Colab branch shells out with `!pip`;
# the other branch uses the `%pip` magic and additionally installs a CUDA 12.8
# nightly build of torch and upgrades bitsandbytes.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Alternative torch builds (pinned 2.2.2 / stable cu121), kept for reference:
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes
    %pip install regex json5


## Log in to Hugging Face

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order: HUGGINGFACE_TOKEN environment variable -> Colab secret
# "HF_TOKEN" (Colab only) -> hidden interactive prompt (non-Colab only).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Read the token with getpass so the secret is not echoed into the
        # cell output (and therefore not saved with the notebook).
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# HF_TOKEN may still be None here (Colab without `userdata`); it is passed
# to login() unchanged in that case.
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure ``~/.kaggle/kaggle.json`` exists, uploading it through Colab if needed.

    If the file is already present nothing is changed. Otherwise the Colab
    upload widget is used and the uploaded file is moved into place with
    permissions ``0o600``.

    Raises:
        ImportError: the credentials are missing and ``google.colab`` is not
            importable (i.e. not running in Colab), so nothing can be uploaded.
        FileNotFoundError: the upload dialog returned no file.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError as err:
        # Same exception type as before, but with an actionable message.
        raise ImportError(
            f"Kaggle credentials not found at {kaggle_path} and the Colab upload "
            "helper (google.colab) is unavailable; place kaggle.json there manually."
        ) from err

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard os.chmod below fails on a path that was never created.
        raise FileNotFoundError("No file was uploaded; Kaggle credentials were not set up.")

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    # Every uploaded file is moved onto the same target, so the last one wins.
    for filename in uploaded.keys():
        shutil.move(filename, kaggle_path)
    os.chmod(kaggle_path, 0o600)
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Run the credential setup when this cell is executed.
setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Google Drive is only mounted inside Colab; elsewhere this cell is a no-op.
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')

## Load Nous-Hermes-2-Mistral-7B-DPO with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    The model is loaded with 4-bit NF4 quantization when CUDA is available and
    GPU 0 reports less than 24 GB of *total* memory; otherwise it is loaded in
    float16 without quantization (on GPU 0 if CUDA is available, else on CPU).

    Args:
        model_name: Hugging Face model identifier.
        hf_token: Access token passed to the tokenizer and model loaders.

    Returns:
        A ``transformers`` text-generation pipeline with ``batch_size=1``.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # Total (not free) memory of device 0, converted from bytes to GB.
        gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    else:
        gpu_mem_gb = 0
    print(f"💻 CUDA: {cuda_ready} | GPU Memory: {gpu_mem_gb:.2f} GB")

    placement = {"": 0} if cuda_ready else "cpu"
    quantize = cuda_ready and gpu_mem_gb < 24

    # Quantization settings are only built when they will actually be used.
    if quantize:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )
    else:
        bnb_config = None

    tok = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Reuse EOS as the padding token so generation does not warn about a missing pad token.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=placement,
        quantization_config=bnb_config,
        torch_dtype=None if bnb_config else torch.float16,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(lm.parameters()).device}")
    return pipeline("text-generation", model=lm, tokenizer=tok, batch_size=1)


In [None]:
# Module-level pipeline instance; generate_llm_output() reads this global.
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


# Global utilities

### Utility to merge normalized json files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append every JSON list found in ``source_dir`` to ``output_file``.

    Files matching ``pattern`` are read in sorted order. Each file whose
    top-level value is a list is appended to the records already stored in
    ``output_file`` (if any) and, once the combined output has been written,
    moved into ``merged_dir``. Files that fail to parse or are not lists are
    reported and left in ``source_dir``.

    Args:
        source_dir: Directory scanned for input files (created if missing).
        output_file: JSON file holding the combined list.
        pattern: Glob pattern, relative to ``source_dir``.
        merged_dir: Directory that receives merged inputs (created if missing).
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                existing = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")
            else:
                if isinstance(existing, list):
                    merged_data = existing
                else:
                    # A non-list payload cannot be extended; treat it like a corrupt file.
                    print(f"⚠️ Existing {output_file} is not a list, starting from scratch.")

    output_resolved = output_file.resolve()
    consumed = []

    for file_path in sorted(source_dir.glob(pattern)):
        # Never merge the output file into itself when it lives in source_dir.
        if file_path.resolve() == output_resolved:
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except Exception as e:
                print(f"⚠️ Failed to parse {file_path.name}: {e}")
                continue

        if not isinstance(data, list):
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)
        consumed.append(file_path)

    # Write combined output
    if merged_data:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

    # Move inputs only after the output is safely on disk, so a failed write
    # never leaves merged data stranded outside both locations.
    for file_path in consumed:
        shutil.move(str(file_path), str(merged_dir / file_path.name))
        print(f"✅ Merged and moved: {file_path.name}")




### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Namespace for the notebook's directory names and flags."""

    # Directory name for downloaded datasets.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Directory name for JSON outputs ("run3" is part of the literal name).
    JSON_OUTPUT_DIR = "json_outputs_run3"
    # Sub-directory of the JSON output directory for normalized files.
    JSON_OUTPUT_NORMALIZED_DIR = "json_outputs_run3/normalized"
    # NOTE(review): not read anywhere in the visible code — presumably toggles
    # removal of intermediate files; confirm against its users.
    AUTO_CLEANUP = True


## Utility to save json to a folder

In [None]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize ``data`` as UTF-8 JSON to ``output_path``.

    Parent directories are created as needed. Non-ASCII characters are
    written as-is (``ensure_ascii=False``).

    Args:
        data: JSON-serializable object.
        output_path: Destination file path; may be a bare filename.
        indent: Indentation passed to ``json.dump``.
        overwrite: When False, refuse to replace an existing file.

    Raises:
        FileExistsError: ``output_path`` exists and ``overwrite`` is False.
    """
    output_dir = os.path.dirname(output_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_path):
        if not overwrite:
            raise FileExistsError(f"File {output_path} already exists and overwrite=False.")
        os.remove(output_path)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)

    print(f"✅ Saved output to {output_path}")


## Utility to load file

### load_ndjson_file() (for resume/jd input)

In [None]:
from typing import List


def load_ndjson_file(file_path: Path) -> List[dict]:
    """Read a newline-delimited JSON file and return one parsed value per non-blank line.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace-only lines carry no record.
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records


### load_json_file() (for checkpoint & metadata)

In [None]:
def load_json_file(file_path: Path) -> dict:
    """Parse the UTF-8 JSON document stored at ``file_path`` and return it.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing regular file.
    """
    exists = os.path.isfile(file_path)
    if not exists:
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload


# Phase 2 - Parse resume/JD into a structured JSON schema

### define utilities for schemas

In [None]:
import re
from typing import List, Optional
from datetime import datetime, timezone
from dateutil import parser


def estimate_total_experience_years(
    experiences: List[dict],
    fallback_fields: Optional[List[str]] = None
) -> float:
    """Estimate total work experience in years from a list of experience entries.

    Entries with a parseable ``start_date`` contribute their date range;
    overlapping ranges are merged so concurrent jobs are not counted twice.
    An empty ``end_date`` or "current"/"present" means "until now". Entries
    whose dates cannot be parsed contribute ``duration_in_months`` instead.
    Entries without a ``start_date`` are ignored.

    If no months could be derived, ``fallback_fields`` is searched for
    phrases such as "5+ years of experience" and the largest number wins.

    Args:
        experiences: Dicts with optional ``start_date``, ``end_date`` and
            ``duration_in_months`` keys.
        fallback_fields: Optional free-text snippets used as a last resort.

    Returns:
        Years of experience as a float (rounded to one decimal when derived
        from dates/durations), or 0.0 when nothing could be determined.
    """
    # All datetimes are kept naive. Comparing an aware "now" with naive parsed
    # dates raises TypeError, which previously discarded every date range.
    now = datetime.now()
    total_months = 0
    date_ranges = []

    for exp in experiences:
        start_raw = (exp.get("start_date") or "").strip()
        end_raw = (exp.get("end_date") or "").strip().lower()
        if not start_raw:
            continue

        try:
            start = parser.parse(start_raw, default=datetime(1900, 1, 1)).replace(tzinfo=None)
            if end_raw in {"", "current", "present"}:
                end = now
            else:
                end = parser.parse(end_raw, default=now).replace(tzinfo=None)
        except Exception:
            # Unparseable dates: fall back to the explicit duration, if usable.
            try:
                months = int(exp.get("duration_in_months", 0))
            except (TypeError, ValueError):
                continue
            if months > 0:
                total_months += months
            continue

        # Ranges that end before they start are ignored.
        if end >= start:
            date_ranges.append((start, end))

    # Merge overlapping ranges before counting months.
    date_ranges.sort()
    merged = []
    for start, end in date_ranges:
        if not merged or start > merged[-1][1]:
            merged.append([start, end])
        else:
            merged[-1][1] = max(merged[-1][1], end)

    for start, end in merged:
        months = (end.year - start.year) * 12 + (end.month - start.month)
        total_months += max(0, months)

    if total_months > 0:
        return round(total_months / 12.0, 1)

    if fallback_fields:
        combined_text = " ".join(f for f in fallback_fields if isinstance(f, str)).lower()
        matches = re.findall(r"(\d{1,2})\+?\s+(?:years|yrs)\s+(?:of\s+)?experience", combined_text)
        if matches:
            # Honour the declared float return type.
            return float(max(int(m) for m in matches))

    return 0.0


In [None]:
from typing import List, Dict, Optional
from dateutil import parser
from datetime import datetime

def postprocess_resume(resume: dict) -> dict:
    """Fill in durations, drop duplicate jobs and recompute total experience.

    The ``resume`` dict is modified in place and also returned:
      * every experience entry lacking ``duration_in_months`` gets one derived
        from its start/end dates (``None`` when it cannot be derived);
      * entries with the same (job title, company, start, end) are deduplicated,
        keeping the first occurrence;
      * ``total_experience_years`` is recomputed from the remaining entries.

    A missing or ``None`` ``experience`` value is treated as an empty list.
    """

    def months_between(start: datetime, end: datetime) -> int:
        return (end.year - start.year) * 12 + (end.month - start.month)

    def calculate_duration_in_months(start_date: str, end_date: str) -> Optional[int]:
        if not isinstance(start_date, str) or not start_date.strip():
            return None
        # A missing end date means the job is ongoing.
        end_text = end_date.strip().lower() if isinstance(end_date, str) else ""
        try:
            start = parser.parse(start_date)
            if end_text in {"current", "present", ""}:
                end = datetime.now()
            else:
                end = parser.parse(end_date)
        except (ValueError, OverflowError, TypeError):
            return None
        return max(0, months_between(start, end))

    def remove_duplicate_experiences(experiences: List[Dict]) -> List[Dict]:
        seen = set()
        result = []
        for exp in experiences:
            # `or ""` guards against keys that are present but None.
            key = (
                (exp.get("job_title") or "").lower(),
                (exp.get("company") or "").lower(),
                exp.get("start_date") or "",
                exp.get("end_date") or ""
            )
            if key not in seen:
                seen.add(key)
                result.append(exp)
        return result

    # `.get("experience", [])` would still return None for an explicit null.
    experiences = resume.get("experience") or []

    # Fill in missing durations
    for exp in experiences:
        if exp.get("duration_in_months") in [None, ""]:
            exp["duration_in_months"] = calculate_duration_in_months(exp.get("start_date", ""), exp.get("end_date", ""))

    # Deduplicate experiences
    resume["experience"] = remove_duplicate_experiences(experiences)

    # Recalculate total experience
    resume["total_experience_years"] = estimate_total_experience_years(resume["experience"])

    return resume


## Define Pydantic Schemas

In [None]:
from typing import List, Optional
from pydantic import BaseModel

# One education entry. Every field is optional and defaults to None.
# `year` is kept as a string; `gpa` is numeric.
class Education(BaseModel):
    degree: Optional[str] = None
    field: Optional[str] = None
    institution: Optional[str] = None
    year: Optional[str] = None
    gpa: Optional[float] = None

# One work-experience entry. Every field is optional and defaults to None.
# Dates are free-form strings; `description` is a list of strings.
class Experience(BaseModel):
    job_title: Optional[str] = None
    company: Optional[str] = None
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    duration_in_months: Optional[int] = None
    description: Optional[List[str]] = None

# One certification entry: its name and an optional issue date (string).
class Certification(BaseModel):
    certification: Optional[str] = None
    date_issued: Optional[str] = None

# One project entry. Every field is an optional string defaulting to None.
class Project(BaseModel):
    project_title: Optional[str] = None
    description: Optional[str] = None
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    url: Optional[str] = None

# One spoken-language entry: language name and optional proficiency label.
class Language(BaseModel):
    language: Optional[str] = None
    proficiency: Optional[str] = None



In [None]:
import re
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel

def _resume_text(value) -> str:
    """Return ``value`` stripped when it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _resume_visible_or_none(value):
    """Return cleaned text, or None for the LLM's "not visible" placeholder."""
    text = _resume_text(value)
    return None if text.lower() == "not visible" else text


# Top-level resume record. Nested sections are kept as plain dicts/lists;
# `normalize()` cleans a raw (LLM-produced) dict before it is validated.
class ResumeSchema(BaseModel):
    resume_id: Optional[str] = None
    basics: Optional[dict] = None
    summary: Optional[str] = None
    education: Optional[List[dict]] = None
    experience: Optional[List[dict]] = None
    skills: Optional[List[str]] = None
    certifications: Optional[List[dict]] = None
    projects: Optional[List[dict]] = None
    languages: Optional[List[dict]] = None
    total_experience_years: Optional[float] = None

    @classmethod
    def normalize(cls, resume_dict: dict) -> dict:
        """Return a cleaned shallow copy of ``resume_dict``.

        Strips whitespace, removes "not visible" placeholders, deduplicates
        skills and experience entries, coerces section shapes, and finally
        runs ``postprocess_resume``. Values that are ``None`` or of an
        unexpected type are treated as empty rather than raising.
        """
        resume_dict = dict(resume_dict)

        # Clean 'basics' only if it is a dict
        basics = resume_dict.get("basics")
        if isinstance(basics, dict):
            location = _resume_text(basics.get("location"))
            resume_dict["basics"] = {
                "name": _resume_visible_or_none(basics.get("name")),
                "email": _resume_visible_or_none(basics.get("email")),
                "phone": _resume_visible_or_none(basics.get("phone")),
                "location": "" if location.lower() in {"city, state", "not visible"} else location,
                "current_title": _resume_text(basics.get("current_title") or basics.get("title")),
                "linkedin_url": _resume_visible_or_none(basics.get("linkedin_url"))
            }

        # Summary fallback
        summary = resume_dict.get("summary")
        if not isinstance(summary, str):
            resume_dict["summary"] = ""
        elif not summary.strip() and isinstance(resume_dict.get("text"), str):
            # Search the original text (the pattern is already case-insensitive)
            # so the extracted summary keeps its original casing.
            match = re.search(
                r"(professional summary|summary|profile)\s*[:\-]?\s*(.+?)(\n{2,}|$)",
                resume_dict["text"],
                re.IGNORECASE | re.DOTALL,
            )
            if match:
                resume_dict["summary"] = match.group(2).strip()

        # Clean and deduplicate skills
        skills_raw = resume_dict.get("skills")
        if skills_raw:
            if isinstance(skills_raw, str):
                # A single comma-separated string instead of a list.
                skills_raw = [skills_raw]
            skills = set()
            for skill in skills_raw:
                if isinstance(skill, str):
                    parts = re.split(r"[,/;]| and | or ", skill)
                    for part in parts:
                        cleaned = part.strip().lower()
                        # Keep 1-6 word entries; one-word skills ("python") are valid.
                        if cleaned and len(cleaned.split()) <= 6 and "not visible" not in cleaned:
                            skills.add(cleaned)
            resume_dict["skills"] = sorted(skills)

        # Normalize languages to {"language", "proficiency"} dicts
        langs_raw = resume_dict.get("languages")
        if langs_raw:
            langs = []
            for lang in langs_raw:
                if isinstance(lang, str):
                    language = lang.strip()
                    if language and language.lower() != "not visible":
                        langs.append({"language": language, "proficiency": ""})
                elif isinstance(lang, dict):
                    language = _resume_text(lang.get("language"))
                    if language and language.lower() != "not visible":
                        langs.append({
                            "language": language,
                            "proficiency": _resume_text(lang.get("proficiency"))
                        })
            resume_dict["languages"] = langs

        # Normalize certifications
        certs_raw = resume_dict.get("certifications")
        if certs_raw:
            certs = []
            for cert in certs_raw:
                if isinstance(cert, str):
                    name = cert.strip()
                    if name and name.lower() != "not visible":
                        certs.append({"certification": name})
                elif isinstance(cert, dict):
                    name = _resume_text(cert.get("certification"))
                    if name and name.lower() != "not visible":
                        issued = cert.get("date_issued")
                        certs.append({
                            "certification": name,
                            "date_issued": issued.strip() if isinstance(issued, str) else None
                        })
            resume_dict["certifications"] = certs

        # Normalize education
        edu_raw = resume_dict.get("education")
        if edu_raw:
            edu_clean = []
            for edu in edu_raw:
                if not isinstance(edu, dict):
                    continue
                degree = _resume_text(edu.get("degree"))
                field = _resume_text(edu.get("field"))
                institution = _resume_text(edu.get("institution"))
                year_raw = edu.get("year")
                # str(None) would otherwise yield the literal year "None".
                year = "" if year_raw is None else str(year_raw).strip()

                if not year:
                    # Fall back to a 4-digit year embedded in the degree text.
                    match_year = re.search(r"\b(19|20)\d{2}\b", degree)
                    if match_year:
                        year = match_year.group(0)

                try:
                    gpa = float(edu.get("gpa")) if edu.get("gpa") not in [None, ""] else None
                except (ValueError, TypeError):
                    gpa = None

                if degree.lower() != "not visible" and institution.lower() != "not visible":
                    edu_clean.append({
                        "degree": degree,
                        "field": field,
                        "institution": institution,
                        "year": year,
                        "gpa": gpa
                    })
            resume_dict["education"] = edu_clean

        # Normalize experience
        exp_raw = resume_dict.get("experience")
        if exp_raw:
            seen_exp = set()
            exp_clean = []
            for exp in exp_raw:
                if not isinstance(exp, dict):
                    continue
                title = _resume_text(exp.get("job_title") or exp.get("title"))
                company = _resume_text(exp.get("company"))
                start = _resume_text(exp.get("start_date"))
                end = _resume_text(exp.get("end_date"))
                key = (title.lower(), company.lower(), start, end)
                if key in seen_exp:
                    continue
                seen_exp.add(key)

                description = exp.get("description", [])
                if isinstance(description, str):
                    description = [line.strip() for line in re.split(r"[•\-\n\.]+", description) if line.strip()]
                elif not isinstance(description, list):
                    description = []

                duration = exp.get("duration_in_months")
                try:
                    duration = int(duration) if isinstance(duration, (int, float)) else None
                except (ValueError, OverflowError):
                    # NaN / infinity cannot be converted to int.
                    duration = None

                exp_clean.append({
                    "job_title": title,
                    "company": company,
                    "start_date": start,
                    "end_date": end,
                    "duration_in_months": duration,
                    "description": description
                })
            resume_dict["experience"] = exp_clean
        else:
            # postprocess_resume iterates this field, so never hand it None.
            resume_dict["experience"] = []

        # Keep only dict-shaped projects
        projects_raw = resume_dict.get("projects")
        if projects_raw:
            resume_dict["projects"] = [p for p in projects_raw if isinstance(p, dict)]

        resume_dict = postprocess_resume(resume_dict)

        return resume_dict


In [None]:
import re
from pydantic import BaseModel, field_validator
from typing import List, Optional


# Structured job description. `jd_id` is the only required field.
# `normalize()` cleans a raw (LLM-produced) dict before it is validated.
class JobDescriptionSchema(BaseModel):
    jd_id: str
    title: Optional[str] = ""
    summary: Optional[str] = ""
    required_experience_years: Optional[str] = None
    preferred_degrees: Optional[List[str]] = []
    required_skills: Optional[List[str]] = []
    optional_skills: Optional[List[str]] = []
    certifications: Optional[List[str]] = []
    soft_skills: Optional[List[str]] = []
    job_location: Optional[str] = ""
    remote_option: Optional[bool] = False
    employment_type: Optional[str] = None
    inferred_domain: str = "unknown"
    
    @field_validator("required_experience_years", mode="before")
    @classmethod
    def convert_experience_to_string(cls, v):
        """Coerce any non-None value to a stripped string before validation."""
        if v is None:
            return None
        return str(v).strip()

    @classmethod
    def normalize(cls, jd_dict: dict) -> dict:
        """Return a cleaned shallow copy of ``jd_dict``.

        Invalid or empty fields are removed from the copy rather than set to
        placeholders. ``required_experience_years`` ends up as a float when
        one can be determined and is removed otherwise.
        """
        jd_dict = dict(jd_dict)

        # Keep jd_id only when it is an int or an all-digit string
        jd_id = jd_dict.get("jd_id") or jd_dict.get("job_id")
        if isinstance(jd_id, str) and jd_id.strip().isdigit():
            jd_dict["jd_id"] = jd_id.strip()
        elif isinstance(jd_id, int):
            jd_dict["jd_id"] = str(jd_id)
        else:
            jd_dict.pop("jd_id", None)

        # Normalize string fields with deduplication protection
        for field in ["title", "summary", "job_location", "employment_type", "inferred_domain"]:
            val = jd_dict.get(field)
            if isinstance(val, str):
                val = val.strip()
                words = val.lower().split()
                # drop if repeated e.g. "marketing marketing"
                if len(words) > 1 and words[0] == words[-1]:
                    jd_dict.pop(field, None)
                elif val:
                    jd_dict[field] = val
                else:
                    jd_dict.pop(field, None)
            else:
                jd_dict.pop(field, None)

        # Deduplicate list fields (order-preserving), clean strings, drop if empty
        for field in [
            "preferred_degrees", "required_skills", "optional_skills", "certifications", "soft_skills"
        ]:
            value = jd_dict.get(field)
            if isinstance(value, list):
                cleaned = list(dict.fromkeys(v.strip() for v in value if isinstance(v, str) and v.strip()))
                if cleaned:
                    jd_dict[field] = cleaned
                else:
                    jd_dict.pop(field, None)
            else:
                jd_dict.pop(field, None)

        # Ensure remote_option is a real boolean
        remote = jd_dict.get("remote_option")
        if not isinstance(remote, bool):
            jd_dict.pop("remote_option", None)

        # Normalize experience years
        val = jd_dict.get("required_experience_years")
        years = None
        if val in [None, "", 0, 0.0]:
            # Nothing usable supplied: look for "N years" in summary/title.
            text = f"{jd_dict.get('summary', '')} {jd_dict.get('title', '')}".lower()
            match = re.search(r"(\d{1,2})\s*[-–]?\s*(\d{1,2})?\s*(\+)?\s*(years|yrs)", text)
            if match:
                years = float(match.group(1))
        else:
            try:
                years = float(val)
            except (TypeError, ValueError, OverflowError):
                if isinstance(val, str):
                    # Values such as "3-5 years" or "5+": keep the first
                    # (lower) number instead of discarding the field.
                    found = re.search(r"\d+(?:\.\d+)?", val)
                    if found:
                        years = float(found.group(0))

        if years is None:
            jd_dict.pop("required_experience_years", None)
        else:
            jd_dict["required_experience_years"] = years

        return jd_dict


In [None]:
from typing import get_origin, get_args, Union
from pydantic import BaseModel

def generate_example_structure(model_class) -> dict:
    """Build a dict of placeholder values mirroring a model's fields.

    Iterates ``model_class.model_fields`` and maps each field's annotation to
    an empty example value: ``[]`` for lists, ``{}`` for dicts, ``""`` for
    str, ``False`` for bool, ``0`` for int, ``0.0`` for float, a nested
    structure for ``BaseModel`` subclasses, and ``""`` for anything else.
    ``Optional[X]`` / ``X | None`` use the placeholder of the first non-None
    member.
    """
    import types as _types

    # typing.Union covers Optional[X]; types.UnionType (3.10+) covers `X | None`.
    union_origins = (Union, getattr(_types, "UnionType", Union))

    def default_for_type(field_type):
        origin = get_origin(field_type)
        args = get_args(field_type)

        if origin is list or field_type is list:
            return []
        if origin is dict or field_type is dict:
            return {}
        if origin in union_origins and type(None) in args:
            non_none_types = [arg for arg in args if arg is not type(None)]
            return default_for_type(non_none_types[0]) if non_none_types else ""
        if field_type is str:
            return ""
        if field_type is bool:
            return False
        if field_type is int:
            return 0
        if field_type is float:
            return 0.0
        if isinstance(field_type, type) and issubclass(field_type, BaseModel):
            return generate_example_structure(field_type)
        return ""

    return {
        field_name: default_for_type(field.annotation)
        for field_name, field in model_class.model_fields.items()
    }


In [None]:
from functools import lru_cache
import json
from typing import Optional, Type

@lru_cache(maxsize=4)
def get_schema_str(schema_model: Optional[Type]) -> str:
    """
    Return the example structure of a Pydantic model as an indented JSON string.

    Results are memoized per model class (up to four entries), so the
    structure is built once per model. ``None`` yields "{}".
    """
    if schema_model is not None:
        structure = generate_example_structure(schema_model)
        return json.dumps(structure, indent=2)
    return "{}"


In [None]:
from typing import Dict, Type
import json
import re
from pydantic import BaseModel

# Cache containers
SCHEMA_STR_CACHE: Dict[str, str] = {}
# name -> (text before "{text}", text after "{text}")
PROMPT_TEMPLATE_PARTS: Dict[str, tuple] = {}

def cache_prompt_parts(name: str, template: str):
    """Split ``template`` around its single ``{text}`` placeholder and cache the halves.

    A name that is already cached is left untouched.

    Raises:
        ValueError: the template does not contain exactly one ``{text}``.
    """
    if name in PROMPT_TEMPLATE_PARTS:
        return
    parts = template.split("{text}")
    if len(parts) != 2:
        raise ValueError(f"Invalid template: missing '{{text}}' placeholder → {template}")
    PROMPT_TEMPLATE_PARTS[name] = (parts[0], parts[1])

def render_prompt(name: str, text: str, schema_str: str):
    """Render the cached template ``name`` with ``text`` and ``schema_str``.

    ``{schema}`` is substituted wherever it occurs in the template (before or
    after ``{text}``), and the ``{{`` / ``}}`` escapes used in the templates
    are turned back into single braces. ``text`` and ``schema_str`` themselves
    are inserted verbatim.

    Raises:
        KeyError: ``name`` was never registered with ``cache_prompt_parts``.
    """
    prefix, suffix = PROMPT_TEMPLATE_PARTS[name]

    def fill(part: str) -> str:
        # Unescape only the static template pieces, so braces inside the
        # schema JSON (e.g. a trailing "}}") are never touched.
        pieces = part.split("{schema}")
        return schema_str.join(p.replace("{{", "{").replace("}}", "}") for p in pieces)

    return f"{fill(prefix)}{text}{fill(suffix)}"


##  Prompt Templates

In [None]:
# Prompt template for extracting resume fields from one text chunk as JSON.
# Placeholders: {schema} and {text}; literal braces are written doubled ({{}}).
# NOTE(review): presumably registered through cache_prompt_parts() and rendered
# with render_prompt() — confirm against the caller.
RESUME_PROMPT_TEMPLATE = """
You are a JSON extractor for a partial resume chunk.

Return a **VALID JSON object** with only fields that are **clearly visible and complete** in this chunk.

{schema}

❌ STRICTLY AVOID:
- Fabricating or guessing
- Assuming data from other chunks
- Placeholder/fake/default values:
  - "John Doe", "City, State", "555-555-5555", "johndoe@email.com", "resume_1", "12345"
  - "N/A", "Not Visible", "Unknown"

✅ INCLUDE ONLY IF CLEARLY VISIBLE:
- basics: Only valid fields like name, email, phone, title.
- summary: Include 1–3 sentences near the top (within first 30 lines) that describe the candidate’s background, qualifications, or career focus. Accept even if not labeled “Summary”. Skip if it’s clearly just a skill list or job bullet.
- experience: Include jobs only if job_title and company are present AND at least one of start_date, end_date, or a readable date string (e.g., "Aug 2013 – Current") is present. Description must be bullet-like lines.
- education: Include only if degree, field, and institution are present. Year is optional.
- skills: Real tools/tech only (1–3 words), no traits/fragments.
- certifications: Named credentials only (e.g., “CPA”).
- languages: Real languages only (e.g., “English”, “Spanish”).
- total_experience_years: Only if 2+ valid experience entries with job_title, company, and at least one date are visible.

⚠️ If no valid fields are visible, return: {{}}
⚠️ Output ONLY valid JSON (no markdown, explanation, or fallback placeholders)

====================
\"\"\"{text}\"\"\"
====================
"""


In [None]:
# Prompt template for extracting job-description fields from one text chunk as JSON.
# Placeholders: {schema} and {text}; literal braces are written doubled ({{ and }}).
# NOTE(review): presumably registered through cache_prompt_parts() and rendered
# with render_prompt() — confirm against the caller.
JD_PROMPT_TEMPLATE = """
You are a structured JSON parser working on **partial job descriptions** (chunked input).
Each chunk may contain incomplete or partial data — only return fields that are fully visible and valid in the current chunk.

Extract a single valid JSON object matching this schema:
{schema}

Instructions:
- ONLY include a field if you clearly see its valid value in the current chunk. Do NOT fabricate or repeat fields from earlier chunks.
- If a field like "jd_id", "job_location", or "employment_type" is not present in this chunk, leave it out entirely.
- NEVER guess values or default them unless explicitly instructed.

Field Guidance:
- "jd_id": Use only the exact job_id visible in the current chunk (e.g. 3899527256). Do NOT guess or reuse.
- "title": Use job title if available. No repetition or combining with roles.
- "summary": First 1–2 sentences introducing the role.
- "required_experience_years":
  • Extract from phrases like "3–5 years", "5+ years", etc.
  • Use the **lower value** if range.
  • If not found but title has "Senior", "Mid", or "Junior", infer as 5, 3, or 0.
  • Otherwise, do NOT include this field.

- "preferred_degrees": Only if clearly stated (e.g. "Bachelor’s in Marketing").
- "optional_skills": If marked as preferred or nice-to-have.
- "certifications": Only named ones like "PMP", "Google Ads Certified".
- "job_location": Only if explicitly stated in the chunk.
- "remote_option": true if this chunk says "remote", "WFH", "hybrid"; else leave it out.
- "employment_type": Use only if visible (e.g. "full-time", "contract").
- "inferred_domain": Return a **single domain noun** like "marketing", "software", "finance". Never guess if unsure.
- "required_skills":
  • Short verb–noun skills like "build provider relationships", "conduct territory planning". 
  • Include only multi-word task phrases, known tools (e.g. "TikTok", "Adobe Suite"), or specific techniques.
  • Do NOT include vague one-word entries like "developing", "implementing", "clients", "platforms", or "content" unless they're clearly paired with an action.
  • Avoid full sentences or overly broad/general phrases.

- "soft_skills":
  • Only extract actual personality or behavior traits (e.g., "team player", "detail-oriented").
  • Do NOT infer from vague phrases like "maintain relationships" or "excellent communication".
  
- Skip any hallucinated domain-generic skills like "optimize ad performance" unless chunk explicitly includes it.


Output Rules:
- Return ONLY a valid JSON object. No markdown, comments, labels, or extra text.
- Output must begin with {{ and end with }}.

====================
{text}
====================
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [None]:
def generate_llm_output(prompt: str, max_new_tokens: int = 1024) -> str:
    """
    Run the global ``llm_pipeline`` model on ``prompt`` and return the decoded text.

    The prompt is tokenized with truncation enabled, generation is greedy
    (``do_sample=False``), and the first returned sequence is decoded with
    special tokens removed.
    NOTE(review): for decoder-only models the decoded sequence presumably
    still starts with the prompt itself — confirm and strip downstream.

    Raises:
        RuntimeError: any failure during tokenization, generation or decoding;
            the original exception is attached as ``__cause__``.
    """
    try:
        inputs = llm_pipeline.tokenizer(prompt, return_tensors="pt", truncation=True).to(llm_pipeline.model.device)
        generated_ids = llm_pipeline.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=llm_pipeline.tokenizer.eos_token_id,
            do_sample=False
        )
        return llm_pipeline.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        # Chain explicitly so the root cause is visible in tracebacks.
        raise RuntimeError(f"LLM generation failed: {e}") from e


### Sanitize Output: Strip Prompt, Fix Cutoffs

In [None]:
import re
import unicodedata

# Typographic characters mapped to their ASCII equivalents (BOM is dropped).
_JSON_TYPOGRAPHY_TABLE = str.maketrans({
    "\ufeff": None,
    "“": '"', "”": '"', "‘": "'", "’": "'", "´": "'",
    "«": '"', "»": '"', "‐": "-", "–": "-", "—": "-",
    "…": "...", "\u00a0": " ",  # non-breaking space
})


def clean_json_string(raw: str) -> str:
    """
    Cleans LLM output to extract and sanitize the most likely valid JSON string.
    - Replaces ASCII control characters with spaces
    - Normalizes smart quotes, dashes, BOM, and other confusables
    - Removes remaining characters from Unicode "C" (control/format/...) categories
    - Returns the first brace-balanced ``{...}`` or ``[...]`` span, where a span
      runs from the first opening to the last closing bracket of its kind.
      The object span is tried first unless the array span encloses it, so
      ``[{...}, {...}]`` is returned whole instead of as an invalid inner slice.
    - Falls back to the whole cleaned string when no balanced span exists.
    """

    # 1. Replace invisible ASCII control characters
    raw = re.sub(r"[\x00-\x1F\x7F]", " ", raw)

    # 2. Strip BOM and normalize typographic characters in a single pass
    raw = raw.translate(_JSON_TYPOGRAPHY_TABLE)

    # 3. Remove any remaining control Unicode categories (Cc, Cf, etc.)
    raw = "".join(c for c in raw if unicodedata.category(c)[0] != "C")

    # 4. Collect candidate spans: (start, end) for {...} and [...]
    start_curly = raw.find("{")
    start_square = raw.find("[")
    end_curly = raw.rfind("}")
    end_square = raw.rfind("]")

    spans = []
    if 0 <= start_curly < end_curly:
        spans.append((start_curly, end_curly))
    if 0 <= start_square < end_square:
        spans.append((start_square, end_square))

    # Prefer the array only when it strictly wraps the object span.
    if len(spans) == 2 and spans[1][0] < spans[0][0] and spans[1][1] > spans[0][1]:
        spans.reverse()

    for start, end in spans:
        candidate = raw[start:end + 1]
        if is_brace_balanced(candidate):
            return candidate.strip()

    # Fallback: return whole cleaned string
    return raw.strip()

def is_brace_balanced(s: str) -> bool:
    """
    Returns True if the string has balanced {} and [] braces.

    Brackets inside double-quoted string literals (with backslash escapes)
    are ignored, so ``{"a": "}"}`` counts as balanced.
    """
    stack = []
    pair = {']': '[', '}': '{'}
    in_string = False
    escaped = False
    for c in s:
        if in_string:
            if escaped:
                escaped = False
            elif c == "\\":
                escaped = True
            elif c == '"':
                in_string = False
            continue
        if c == '"':
            in_string = True
        elif c in "{[":
            stack.append(c)
        elif c in "}]":
            if not stack or stack[-1] != pair[c]:
                return False
            stack.pop()
    return not stack



In [None]:
def sanitize_llm_output(raw_output: str, prompt: str = "") -> str:
    """
    Strips every verbatim occurrence of the prompt from the model output, narrows
    the remainder to the span between the first "{" and the last "}" (when
    present), and passes the result through clean_json_string.
    """
    body = raw_output.replace(prompt, "").strip()

    first_open = body.find("{")
    if first_open == -1:
        # No object start at all: clean the text as it is.
        return clean_json_string(body)

    body = body[first_open:]
    last_close = body.rfind("}")
    trimmed = body if last_close == -1 else body[:last_close + 1]

    return clean_json_string(trimmed)

### Regex-based JSON Block Extractor

In [None]:
def json_fix_fallback(raw_output: str) -> Optional[dict]:
    """
    Best-effort fallback to recover a JSON object from malformed text.

    The text is trimmed to the span from the first "{" to the last "}" and
    parsed with the standard `json` module.

    Returns the parsed dict, or None when the input is not a string, cannot be
    parsed, or parses to something other than a JSON object (callers merge the
    result as a dict, so lists and scalars are rejected).
    """
    if not isinstance(raw_output, str):
        return None

    cleaned = raw_output.strip()

    json_start = cleaned.find("{")
    if json_start != -1:
        cleaned = cleaned[json_start:]

    last_brace = cleaned.rfind("}")
    if last_brace != -1:
        cleaned = cleaned[:last_brace + 1]

    try:
        parsed = json.loads(cleaned)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; deeply nested junk can recurse.
        return None

    return parsed if isinstance(parsed, dict) else None


In [None]:
import regex
import json5

def extract_json_block(text: str) -> dict:
    """
    Extracts the first valid JSON object from a text using the `regex` module and parses with `json5`.
    This is more robust than standard `json` and can handle trailing commas, comments, etc.

    Raises:
        ValueError: if no balanced {...} span in the text parses to an object.
    """
    # Recursive pattern: (?R) re-enters the whole pattern, so each match is an
    # outermost balanced {...} span.
    pattern = r'(\{(?:[^{}]|(?R))*\})'

    for match in regex.finditer(pattern, text, flags=regex.DOTALL):
        json_candidate = match.group(1)
        try:
            parsed = json5.loads(json_candidate)
        except ValueError:
            # json5 reports parse failures as ValueError; try the next span.
            continue
        if isinstance(parsed, dict):
            return parsed

    raise ValueError("❌ No valid JSON object found using regex and json5.")


### Final Orchestrator: Fault-Tolerant Extraction

In [None]:
def count_tokens(text: str) -> int:
    """Returns the number of whitespace-separated words in the text."""
    return sum(1 for _ in text.split())


In [None]:
import re
from typing import List

def split_resume_text_into_chunks(
    text: str,
    chunk_size: int = 1500,
    overlap: int = 250,
    min_chunk_size: int = 800,
    max_sentences_per_chunk: int = 40
) -> List[str]:
    """
    Splits resume text into sentence-aligned chunks without dropping any text.

    Sentences are accumulated until adding another would exceed `chunk_size`
    characters or `max_sentences_per_chunk` sentences. A sentence that starts
    with a known section header also begins a new chunk, but only once the
    current chunk has reached `min_chunk_size` characters; shorter chunks keep
    growing instead of being discarded, so short resumes and short sections
    are preserved.

    When `overlap` > 0 and there is more than one chunk, every chunk after the
    first is prefixed with the last `overlap` characters of the previous chunk.

    Returns an empty list for empty or whitespace-only text.
    """
    SECTION_HEADERS = (
        "Skills", "Work History", "Experience", "Education", "Certifications",
        "Projects", "Professional Summary", "Summary", "Highlights"
    )

    sentences = re.split(r'(?<=[.?!])\s+', text.strip())
    chunks: List[str] = []
    current_chunk = ""
    sentence_count = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        at_section_boundary = (
            sentence.startswith(SECTION_HEADERS)
            and len(current_chunk) >= min_chunk_size
        )
        over_budget = (
            len(current_chunk) + len(sentence) + 1 > chunk_size
            or sentence_count >= max_sentences_per_chunk
        )

        if current_chunk and (at_section_boundary or over_budget):
            chunks.append(current_chunk)
            current_chunk = sentence
            sentence_count = 1
        else:
            current_chunk = f"{current_chunk} {sentence}".strip()
            sentence_count += 1

    # Always keep the remainder, even when it is shorter than min_chunk_size.
    if current_chunk:
        chunks.append(current_chunk)

    # Overlap
    if overlap > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for previous, current in zip(chunks, chunks[1:]):
            overlapped.append(f"{previous[-overlap:]} {current}".strip())
        return overlapped

    return chunks


In [None]:
import re
from typing import List

def split_jd_text_into_chunks(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """
    Splits JD text into coherent chunks based on line breaks and block sections.
    Uses character length, not sentence splitting.

    Blank lines are dropped and each remaining line is stripped. Lines are
    appended to the current chunk until the next line would push it past
    `chunk_size`; the chunk is then emitted and the next one starts with the
    last `overlap` characters of the emitted chunk followed by the new line.
    A single line longer than `chunk_size` is kept whole. No empty chunks are
    produced, and `overlap=0` carries nothing over.
    """
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    chunks: List[str] = []
    current_chunk = ""

    for line in lines:
        if current_chunk and len(current_chunk) + len(line) + 1 > chunk_size:
            chunks.append(current_chunk.strip())
            # current_chunk[-0:] would be the WHOLE chunk, so guard overlap == 0.
            carried = current_chunk[-overlap:] if overlap > 0 else ""
            current_chunk = f"{carried}\n{line}" if carried else line
        elif current_chunk:
            current_chunk = f"{current_chunk}\n{line}"
        else:
            current_chunk = line

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

In [None]:
from typing import List, Dict, Any
from collections import OrderedDict
from copy import deepcopy

def merge_jsons_resume(json_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Merges per-chunk resume JSON fragments into a single ordered record.

    Parts are processed in ascending `_chunk_index` order (a missing index
    counts as 0); entries of `json_list` that are not dicts are ignored.
    Merge rules per key:
      - resume_id: first value seen wins.
      - summary / basics.*: first non-placeholder string wins.
      - education: kept when degree, field and institution are all present;
        de-duplicated on that lower-cased triple.
      - experience: kept when job_title, company and at least one date are
        present; de-duplicated on (title, company, start, end).
      - skills / certifications / projects / languages: concatenated, then
        de-duplicated case-insensitively on the item's string form.
      - text: used only to infer a summary when none has been found yet.
      - any other scalar: first non-placeholder value wins.

    Field values that are None or numeric (e.g. JSON null, an integer year) are
    tolerated when building de-duplication keys instead of raising.

    Returns an OrderedDict containing exactly the FIELD_ORDER keys; list fields
    default to [] and every other field to "".
    """
    FIELD_ORDER = [
        "resume_id", "basics", "summary", "education", "experience",
        "skills", "certifications", "projects", "languages", "total_experience_years"
    ]
    list_fields = {"education", "experience", "skills", "certifications", "projects", "languages"}
    dedupe_list_keys = {"skills", "certifications", "projects", "languages"}
    preserve_keys = {"resume_id"}
    skip_values = {
        "", "not visible", "n/a", "unknown", "city, state",
        "john doe", "linkedin.com/in/johndoe", "johndoe@email.com", "555-555-5555", "et"
    }

    def is_clean(val: Any) -> bool:
        return isinstance(val, str) and val.strip().lower() not in skip_values

    def as_text(val: Any) -> str:
        # LLM output may carry null or numeric values where a string is expected.
        if isinstance(val, str):
            return val.strip()
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            return str(val)
        return ""

    def try_extract_summary(text: str) -> str:
        if not isinstance(text, str):
            return ""
        lines = text.strip().splitlines()
        candidate_lines = lines[:30]
        blob = " ".join(line.strip() for line in candidate_lines if line.strip())
        sentences = [s.strip() for s in blob.split(".") if len(s.strip().split()) > 6]
        if 1 <= len(sentences) <= 3:
            return ". ".join(sentences) + "."
        return ""

    merged: Dict[str, Any] = {}
    seen_edu = set()
    seen_exp = set()

    dict_parts = [part for part in json_list if isinstance(part, dict)]
    sorted_parts = sorted(dict_parts, key=lambda x: x.get("_chunk_index", 0))

    for part in sorted_parts:
        for key, value in part.items():
            if key == "_chunk_index":
                continue

            if key in preserve_keys:
                merged.setdefault(key, deepcopy(value))

            elif key == "summary":
                if is_clean(value) and not merged.get("summary"):
                    merged["summary"] = value.strip()

            elif key == "basics":
                if isinstance(value, dict):
                    merged.setdefault("basics", {})
                    for k, v in value.items():
                        if is_clean(v) and not merged["basics"].get(k):
                            merged["basics"][k] = v.strip()

            elif key == "education":
                if isinstance(value, list):
                    merged.setdefault("education", [])
                    for item in value:
                        if not isinstance(item, dict):
                            continue
                        degree = as_text(item.get("degree")).lower()
                        field = as_text(item.get("field")).lower()
                        institution = as_text(item.get("institution")).lower()
                        if not degree or not field or not institution:
                            continue
                        edu_key = (degree, field, institution)
                        if edu_key not in seen_edu:
                            seen_edu.add(edu_key)
                            merged["education"].append(deepcopy(item))

            elif key == "experience":
                if isinstance(value, list):
                    merged.setdefault("experience", [])
                    for exp in value:
                        if not isinstance(exp, dict):
                            continue
                        title = as_text(exp.get("job_title"))
                        company = as_text(exp.get("company"))
                        start = as_text(exp.get("start_date"))
                        end = as_text(exp.get("end_date"))
                        if not (title and company and (start or end)):
                            continue
                        exp_key = (title.lower(), company.lower(), start, end)
                        if exp_key not in seen_exp:
                            seen_exp.add(exp_key)
                            merged["experience"].append(deepcopy(exp))

            elif key in dedupe_list_keys:
                if isinstance(value, list):
                    merged.setdefault(key, [])
                    for item in value:
                        if isinstance(item, dict):
                            # Skip dicts whose values are all placeholders.
                            if all(not is_clean(str(v)) for v in item.values()):
                                continue
                            merged[key].append(deepcopy(item))
                        elif isinstance(item, str) and is_clean(item):
                            merged[key].append(item.strip())

            elif key == "text":
                # Fallback for summary from top of text
                if "summary" not in merged:
                    inferred = try_extract_summary(value)
                    if is_clean(inferred):
                        merged["summary"] = inferred

            elif isinstance(value, (str, int, float)) and is_clean(str(value)) and not merged.get(key):
                merged[key] = deepcopy(value)

    # Deduplicate flat list fields like skills
    for key in dedupe_list_keys:
        if key in merged and isinstance(merged[key], list):
            seen = set()
            deduped = []
            for item in merged[key]:
                fingerprint = str(item).strip().lower()
                if fingerprint and fingerprint not in seen:
                    seen.add(fingerprint)
                    deduped.append(item)
            merged[key] = deduped

    # Ensure all required fields are present in correct order
    ordered = OrderedDict()
    for field in FIELD_ORDER:
        default = [] if field in list_fields else ""
        ordered[field] = merged.get(field, deepcopy(default))

    return ordered


In [None]:
import re
from typing import Dict, List, Any

# Lower-cased literal values that models tend to emit when a field is unknown
# or when they copy an example from the prompt.
GENERIC_PLACEHOLDER_VALUES = {
    "not visible", "n/a", "unknown", "city, state", "resume_1", "resume_123", "resume123",
    "john doe", "jane smith", "linkedin.com/in/johndoe", "linkedin.com/in/example",
    "johndoe@email.com", "johndoe@example.com", "555-555-5555", "123-456-7890"
}

_PLACEHOLDER_EMAIL_SUFFIXES = ("@example.com", "@email.com")
_RESUME_ID_PATTERN = re.compile(r"resume_\d+")
# Canonical 8-4-4-4-12 hexadecimal UUID layout.
_UUID_PATTERN = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}")


def is_placeholder(value: str) -> bool:
    """
    Returns True when the value looks like a generic placeholder rather than
    real data: a known filler literal, an @example.com / @email.com address,
    a `resume_<digits>` id, or a UUID. Matching is case-insensitive and ignores
    surrounding whitespace. Non-string values are never placeholders.
    """
    if not isinstance(value, str):
        return False
    val = value.strip().lower()
    return bool(
        val in GENERIC_PLACEHOLDER_VALUES
        or val.endswith(_PLACEHOLDER_EMAIL_SUFFIXES)
        or _RESUME_ID_PATTERN.fullmatch(val)
        or _UUID_PATTERN.fullmatch(val)
    )

def cleanup_resume_json(resume: Dict[str, Any]) -> Dict[str, Any]:
    """
    Removes placeholder and low-quality values from a merged resume record.

    The dict is modified in place and also returned. A field that ends up empty
    after filtering is deleted from the dict rather than left blank.
    """
    # basics: keep only non-blank, non-placeholder string values
    basics = resume.get("basics")
    if isinstance(basics, dict):
        kept_basics = {}
        for name, entry in basics.items():
            if isinstance(entry, str) and entry.strip() and not is_placeholder(entry):
                kept_basics[name] = entry
        if kept_basics:
            resume["basics"] = kept_basics
        else:
            del resume["basics"]

    # summary: drop non-strings, placeholders and "N years of experience" boilerplate
    if "summary" in resume:
        summary = resume["summary"]
        drop_summary = (
            not isinstance(summary, str)
            or is_placeholder(summary)
            or re.search(r"\b\d+\+?\s+years?\s+of\s+experience\b", summary.lower())
        )
        if drop_summary:
            resume.pop("summary", None)

    # skills: lower-case, de-duplicate, sort
    if "skills" in resume:
        normalized = set()
        for skill in resume["skills"]:
            if not isinstance(skill, str):
                continue
            token = skill.strip().lower()
            if not token or is_placeholder(token):
                continue
            # NOTE(review): the 2-word minimum discards single-word skills such
            # as "python" -- confirm this is intended.
            if 2 <= len(token.split()) <= 6:
                normalized.add(token)
        if normalized:
            resume["skills"] = sorted(normalized)
        else:
            resume.pop("skills", None)

    # experience: drop entries whose job_title or company is a placeholder
    if "experience" in resume:
        real_jobs = [
            job for job in resume["experience"]
            if isinstance(job, dict)
            and not any(
                is_placeholder(str(job.get(attr, "")).lower())
                for attr in ("job_title", "company")
            )
        ]
        if real_jobs:
            resume["experience"] = real_jobs
        else:
            resume.pop("experience", None)

    # certifications: only dict entries with a non-placeholder "certification"
    if "certifications" in resume:
        real_certs = []
        for cert in resume["certifications"]:
            if isinstance(cert, dict) and not is_placeholder(cert.get("certification", "")):
                real_certs.append(cert)
        if real_certs:
            resume["certifications"] = real_certs
        else:
            resume.pop("certifications", None)

    # total_experience_years: must be a non-zero number
    if "total_experience_years" in resume:
        years = resume["total_experience_years"]
        if not isinstance(years, (int, float)) or years == 0:
            resume.pop("total_experience_years", None)

    return resume


In [None]:
def is_valid_jd_chunk(chunk: dict) -> bool:
    """
    Returns True when the chunk is a dict that has at least one of the keys
    "title", "summary" or "required_skills" and whose string form contains
    neither "You are a" nor "schema" (signs of an echoed prompt).
    """
    if not isinstance(chunk, dict):
        return False

    expected_keys = ("title", "summary", "required_skills")
    if not any(name in chunk for name in expected_keys):
        return False

    # NOTE(review): any chunk mentioning "schema" anywhere is rejected,
    # including legitimate text such as "database schema" -- confirm intent.
    rendered = str(chunk)
    return not ("You are a" in rendered or "schema" in rendered)


In [None]:
from typing import List, Dict
import copy

def merge_jsons_jd(json_list: List[Dict]) -> Dict:
    """
    Merges per-chunk job-description fragments into one dict.

    Chunks rejected by is_valid_jd_chunk are ignored. For the remaining chunks:
      - jd_id: first value seen wins.
      - job_location, employment_type, inferred_domain, summary, title: first
        non-blank string wins (stored stripped).
      - any other list value: concatenated across chunks.
      - any other value: first value seen wins.
    Afterwards the known skill/degree/certification lists keep only non-blank
    strings, stripped and de-duplicated in first-seen order.
    """
    preserve_keys = {"jd_id"}
    replace_once_keys = {"job_location", "employment_type", "inferred_domain", "summary", "title"}
    dedupe_list_keys = {
        "preferred_degrees", "required_skills", "optional_skills", "certifications", "soft_skills"
    }

    combined = {}

    for chunk in json_list:
        if not is_valid_jd_chunk(chunk):
            continue

        for field, content in chunk.items():
            if field in preserve_keys:
                if field not in combined:
                    combined[field] = copy.deepcopy(content)
                continue

            if field in replace_once_keys:
                # These keys never fall through to the list/default handling.
                if field not in combined and isinstance(content, str) and content.strip():
                    combined[field] = content.strip()
                continue

            if isinstance(content, list):
                combined.setdefault(field, []).extend(copy.deepcopy(content))
            elif field not in combined:
                combined[field] = copy.deepcopy(content)

    # Deduplicate lists
    for field in dedupe_list_keys:
        current = combined.get(field)
        if field not in combined or not isinstance(current, list):
            continue
        unique = []
        seen = set()
        for entry in current:
            if not isinstance(entry, str):
                continue
            trimmed = entry.strip()
            if trimmed and trimmed not in seen:
                seen.add(trimmed)
                unique.append(trimmed)
        combined[field] = unique

    return combined


In [None]:
def merge_jsons(json_list: List[Dict], is_resume: bool) -> Dict:
    """Dispatches to the resume merger when `is_resume` is true, else to the JD merger."""
    if is_resume:
        return merge_jsons_resume(json_list)
    return merge_jsons_jd(json_list)

In [None]:
def save_text_json_pair(output_dir: str, record_id: str, text: str, parsed_json: Dict, is_resume=True):
    """
    Writes the source text together with its parsed JSON to
    `<output_dir>/<record_id>_<resume|jd>_pair.json`, creating the directory
    if needed. The file is UTF-8 JSON indented by two spaces.
    """
    os.makedirs(output_dir, exist_ok=True)

    suffix = "resume" if is_resume else "jd"
    target = Path(output_dir) / f"{record_id}_{suffix}_pair.json"

    payload = {
        "text": text,
        "parsed_json": parsed_json
    }
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")

In [None]:
import uuid
from typing import Dict, Optional, Type

def inject_ids(parsed: Dict, schema_model: Optional[Type]) -> Dict:
    """
    Injects a UUID as `resume_id` or `jd_id` based on the schema model name.

    - Model name starting with "resume": `resume_id` is always replaced with a
      fresh UUID4 string.
    - Model name starting with "jobdescription": `jd_id` is set only when it is
      missing, None, or blank; an existing identifier is left untouched.
    - No model: a warning is printed and `parsed` is returned unchanged.

    `parsed` is modified in place and returned.
    """
    if not schema_model:
        print("⚠️ No schema model provided for ID injection.")
        return parsed

    schema_name = schema_model.__name__.lower()
    if schema_name.startswith("resume"):
        parsed["resume_id"] = str(uuid.uuid4())
    elif schema_name.startswith("jobdescription"):
        existing = parsed.get("jd_id")
        # jd_id may be JSON null or a number; str() avoids calling .strip() on those.
        has_id = existing is not None and bool(str(existing).strip())
        if not has_id:
            parsed["jd_id"] = str(uuid.uuid4())
    return parsed


In [None]:
import traceback

def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Optional[type] = None,
    max_new_tokens: int = 1024, 
    validate: bool = True,
    is_resume: bool = True,
    chunk_size: int = 1000,     # 🔧 Larger chunks
    overlap: int = 200,
    record_id: Optional[str] = None 
) -> dict:
    """
    Runs the LLM over each chunk of `text` and merges the per-chunk JSON.

    For every chunk the prompt is formatted with the chunk and the schema
    string, the model output is sanitized and parsed with `json.loads`, and
    the result is merged into the running record (resumes are additionally
    cleaned after each merge). If a chunk fails, `json_fix_fallback` is tried
    on that chunk's own output. If no chunk yields anything, the last
    sanitized output is passed to `extract_json_block` as a final fallback.

    Args:
        text: Raw resume or job-description text.
        prompt_template: Format string with `{text}` and `{schema}` fields.
        schema_model: Schema class used for the schema string, ID injection
            and (optionally) validation.
        max_new_tokens: Generation budget per chunk.
        validate: When true and a schema model is given, the merged result is
            normalized (if the model offers `normalize`) and validated.
        is_resume: Selects resume vs. JD chunking and merging.
        chunk_size: Chunk size for JDs; resumes use a fixed 1500 characters.
        overlap: Character overlap between consecutive chunks.
        record_id: Identifier used only in log output.

    Returns:
        The merged dict, or `{"raw_output": ..., "error": ...}` when every
        chunk and the final fallback failed.

    Raises:
        Whatever `schema_model.model_validate` raises on invalid data.
    """

    # ✅ Retrieve cached schema string
    schema_str = get_schema_str(schema_model)

    # ✅ Choose chunking strategy
    if is_resume:
        chunks = split_resume_text_into_chunks(text, chunk_size=1500, overlap=overlap)
    else:
        chunks = split_jd_text_into_chunks(text, chunk_size=chunk_size, overlap=overlap)
    merged_result = {}
    # Most recent sanitized model output; only used by the final fallback.
    raw_output = ""

    for i, chunk in enumerate(chunks):
        prompt = prompt_template.format(text=chunk, schema=schema_str)
        print(f"🔢 Chunk {i+1} ({record_id or 'no-id'}): {len(prompt.split())} tokens")

        # Reset per chunk so a generation failure never re-processes the
        # previous chunk's output in the fallback below.
        chunk_output = ""

        try:
            response = generate_llm_output(prompt, max_new_tokens=max_new_tokens)
            chunk_output = sanitize_llm_output(response, prompt)
            raw_output = chunk_output
            print(f"🔍 Chunk {i+1} raw output:\n{chunk_output}")
            
            # ✅ Robust guard: skip if output is empty or just repeats prompt
            stripped = chunk_output.strip()
            if not stripped or stripped == "{}" or stripped.lower().startswith("you are a structured"):
                print(f"🧪 Chunk {i+1} produced empty output:\nPrompt:\n{prompt}\nResponse:\n{response}")
                raise ValueError("Echoed prompt or empty response")

            parsed = json.loads(chunk_output)
            merged_result = merge_jsons([merged_result, parsed], is_resume=is_resume)
            if is_resume:
                merged_result = cleanup_resume_json(merged_result)

        except Exception as e:
            print(f"⚠️ Chunk {i+1}: Failed to parse – {e}")
            print("🧪 Raw output :\n", chunk_output)
            traceback.print_exc(limit=4)

            # 🔁 Try JSON fix fallback first
            fallback = json_fix_fallback(chunk_output)
            if fallback and isinstance(fallback, dict):
                try:
                    merged_result = merge_jsons([merged_result, fallback], is_resume=is_resume)
                    print(f"✅ Chunk {i+1}: Fallback JSON recovery successful.")
                except Exception as merge_error:
                    # We are already inside an error handler; a second failure
                    # must not abort the remaining chunks.
                    print(f"⚠️ Chunk {i+1}: Fallback merge failed – {merge_error}")

    if not merged_result:
        print("⚠️ All chunks failed. Attempting final fallback.")
        try:
            fallback = extract_json_block(raw_output)
            merged_result = fallback
        except Exception as e:
            return {
                "raw_output": raw_output.strip(),
                "error": f"Regex fallback failed: {e}"
            }

    merged_result = inject_ids(merged_result, schema_model)

    if validate and schema_model:
        if hasattr(schema_model, "normalize"):
            merged_result = schema_model.normalize(merged_result)
        schema_model.model_validate(merged_result)

    return merged_result


In [None]:
from pydantic import ValidationError

def pydantic_validate(model_class, data):
    """
    Version-safe validator that supports both Pydantic v1 and v2.

    Uses `model_class.model_validate` when the class provides it (Pydantic v2)
    and `model_class.parse_obj` otherwise (Pydantic v1). The method is chosen
    by attribute lookup rather than by catching AttributeError, so an
    AttributeError raised inside validation is not mistaken for a missing
    method and retried through the other API.

    Returns whatever the selected method returns; its exceptions propagate.
    """
    validator = getattr(model_class, "model_validate", None)
    if validator is None:
        # Fallback to Pydantic v1
        validator = model_class.parse_obj
    return validator(data)


def validate_entry(entry, is_resume):
    """
    Validates one entry against the resume or job-description schema.

    The entry is passed through the model's `normalize` hook when the model
    has one, then validated. Returns `(True, None)` on success and
    `(False, <error text>)` when a ValidationError is raised.
    """
    try:
        if is_resume:
            model = ResumeSchema
        else:
            model = JobDescriptionSchema
        candidate = model.normalize(entry) if hasattr(model, "normalize") else entry
        pydantic_validate(model, candidate)
    except ValidationError as ve:
        return False, str(ve)
    return True, None


## Normalize in Batches with Validation

In [None]:
def save_metadata_summary(
    output_dir: Path,
    is_resume: bool,
    input_file: str,
    total_records: int,
    total_valid: int,
    total_invalid: int,
    start_index: int,
    end_index: int,
    timestamp: str,
    batch_id: str
):
    """
    Builds a run summary (counts, index range, input file, output directory)
    and writes it via save_json_output to
    `meta_<resumes|jds>_<start>_<end>_<timestamp>_<batch_id>.json` inside
    `output_dir`, overwriting any existing file.
    """
    if is_resume:
        input_type, file_tag = "resume", "resumes"
    else:
        input_type, file_tag = "job_description", "jds"

    summary_name = f"meta_{file_tag}_{start_index}_{end_index}_{timestamp}_{batch_id}.json"
    summary_file = output_dir / summary_name

    summary = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_file,
        "input_type": input_type,
        "records_start_index": start_index,
        "records_end_index": end_index,
        "records_total": total_records,
        "records_valid": total_valid,
        "records_invalid": total_invalid,
        "output_dir": str(output_dir)
    }
    save_json_output(summary, str(summary_file), overwrite=True)


In [None]:
def render_jd_text(record: dict) -> str:
    """
    Constructs a rich text string from all non-empty fields of a JD record.

    Each rendered field becomes one `Label: value` line, where the label is
    the key with underscores replaced by spaces and title-cased. Lists are
    joined with ", " after dropping falsy and blank items; strings and numbers
    are stringified and stripped. None values, fields that render to an empty
    string, and values of any other type (e.g. dicts) are skipped.
    """
    jd_parts = []
    for key, value in record.items():
        if value is None:
            continue

        if isinstance(value, list):
            items = [str(item).strip() for item in value if item]
            value_str = ", ".join(item for item in items if item)
        elif isinstance(value, (str, int, float)):
            value_str = str(value).strip()
        else:
            continue

        # A list holding only blank items must not yield an empty "Label: " line.
        if value_str:
            label = key.replace('_', ' ').title()
            jd_parts.append(f"{label}: {value_str}")

    return "\n".join(jd_parts)


In [None]:
import uuid
from datetime import datetime
from pathlib import Path
from typing import List
import json
import os

def normalize_batch(
    records: List[dict],
    start_idx: int,
    end_idx: int,
    is_resume: bool,
    output_dir: Path,
    prompt_template,
    schema_model
):
    """
    Normalizes one batch of records and writes the batch outputs.

    For each record the text is taken from `Resume_str` (resumes) or rendered
    with render_jd_text (JDs) and passed to extract_structured_json. A record
    counts as invalid when extraction returns a dict containing `raw_output`
    or `error`, or when extraction raises (for example on schema validation);
    the failure is recorded so that one bad record does not abort the batch
    and discard the records already parsed. Note that such records are not
    retried on a later run once the caller advances its checkpoint.

    Valid records are also written as `<record_id>_pair.json` under
    `<output_dir>/pairs`. At the end the valid and invalid lists are each
    written (when non-empty) to a timestamped JSON file in `output_dir`.

    Returns:
        `(results, invalids)` for this batch.
    """
    results, invalids = [], []
    kind = "resume" if is_resume else "jd"
    id_field = "resume_id" if is_resume else "jd_id"

    pair_dir = output_dir / "pairs"
    pair_dir.mkdir(parents=True, exist_ok=True)

    for idx, record in enumerate(records, start=start_idx):
        text = record.get("Resume_str", "") if is_resume else render_jd_text(record)
        record_id = record.get(id_field) or f"{kind}_{idx}"

        try:
            parsed = extract_structured_json(
                text=text,
                prompt_template=prompt_template,
                schema_model=schema_model,
                validate=True,
                record_id=record_id,
                is_resume=is_resume
            )
        except Exception as exc:
            # Per-record boundary: record the failure and continue the batch.
            print(f"⚠️ Record {record_id}: extraction failed – {exc}")
            invalids.append({
                "record_id": record_id,
                "input": text,
                "output": None,
                "error": str(exc)
            })
            continue

        if "raw_output" in parsed or "error" in parsed:
            invalids.append({
                "record_id": record_id,
                "input": text,
                "output": parsed,
                "error": parsed.get("error", "Malformed or unstructured output")
            })
        else:
            results.append(parsed)
            pair_path = pair_dir / f"{record_id}_pair.json"
            with open(pair_path, "w", encoding="utf-8") as f:
                json.dump({"text": text, "parsed_json": parsed}, f, indent=2)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]
    prefix = "resumes" if is_resume else "jds"

    if results:
        save_json_output(
            results,
            output_path=output_dir / f"{prefix}_valid_{start_idx}_{end_idx}_{timestamp}_{batch_id}.json"
        )
    if invalids:
        save_json_output(
            invalids,
            output_path=output_dir / f"{prefix}_invalid_{start_idx}_{end_idx}_{timestamp}_{batch_id}.json"
        )

    return results, invalids


In [None]:
from tqdm import tqdm

def normalize_file_in_batches(
    input_filename: str,
    output_dir: Path,
    is_resume: bool = True,
    input_dir: Path = Path("json_outputs"),
    save_every: int = 5,
    limit: int = None
):
    """
    Normalizes a JSON Lines input file in batches of `save_every` records.

    Processing starts at the `last_index` stored in the checkpoint file in
    `output_dir` (0 when there is none) and covers at most `limit` records when
    a limit is given. After each batch the checkpoint is rewritten with the
    index following the batch, and a metadata summary is saved at the end.
    """
    kind = "resumes" if is_resume else "jds"

    # Input holds one JSON document per non-blank line.
    records = []
    with open(input_dir / input_filename, "r", encoding="utf-8") as source:
        for raw_line in source:
            if raw_line.strip():
                records.append(json.loads(raw_line))

    checkpoint_file = output_dir / f"checkpoint_{kind}.json"
    start_index = 0
    if checkpoint_file.exists():
        with open(checkpoint_file, "r", encoding="utf-8") as saved:
            start_index = json.load(saved).get("last_index", 0)
        print(f"🔁 Resuming from index {start_index}")

    pending = records[start_index:]
    if limit is not None:
        pending = pending[:limit]

    if is_resume:
        prompt_template = RESUME_PROMPT_TEMPLATE
        schema_model = ResumeSchema
    else:
        prompt_template = JD_PROMPT_TEMPLATE
        schema_model = JobDescriptionSchema

    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]

    valid_count = 0
    invalid_count = 0
    pending_total = len(pending)

    for offset in tqdm(range(0, pending_total, save_every)):
        batch = pending[offset:offset + save_every]
        batch_start = start_index + offset
        batch_end = batch_start + len(batch)

        results, invalids = normalize_batch(
            records=batch,
            start_idx=batch_start,
            end_idx=batch_end,
            is_resume=is_resume,
            output_dir=output_dir,
            prompt_template=prompt_template,
            schema_model=schema_model,
        )

        valid_count += len(results)
        invalid_count += len(invalids)

        # Persist progress so an interrupted run resumes after this batch.
        with open(checkpoint_file, "w", encoding="utf-8") as saved:
            json.dump({"last_index": batch_end}, saved)

    save_metadata_summary(
        output_dir=output_dir,
        is_resume=is_resume,
        input_file=input_filename,
        total_records=pending_total,
        total_valid=valid_count,
        total_invalid=invalid_count,
        start_index=start_index,
        end_index=start_index + pending_total,
        timestamp=timestamp,
        batch_id=batch_id
    )


## Run Phase 2 End-to-End

In [None]:
# Normalize the parsed job descriptions (is_resume=False) in batches of 5.
# The run resumes from the JD checkpoint in the output directory when one
# exists, so `limit` counts records from that checkpoint onward.
normalize_file_in_batches(
    input_filename="parsed_jds.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=False,
    save_every=5,
    limit=5  # Set to None to process all records, or specify a limit for testing
)


In [None]:
# Normalize the parsed resumes (is_resume=True); limit=1 processes a single
# record, starting from the resume checkpoint when one exists.
normalize_file_in_batches(
    input_filename="parsed_resumes.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=True,
    save_every=5,
    limit=1  
)


## Merge normalized files

In [None]:
# Paths
normalized_dir = Path(Config.JSON_OUTPUT_NORMALIZED_DIR)
merged_dir = normalized_dir / "merged"

# Combine the per-batch "jds_valid*.json" files into normalized_jds.json.
# NOTE(review): merge_json_files is defined outside this section; presumably
# `merged_dir` receives the already-merged batch files -- confirm there.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_jds.json",
    pattern="jds_valid*.json",
    merged_dir=merged_dir
)

# Same for the per-batch "resumes_valid*.json" files.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_resumes.json",
    pattern="resumes_valid*.json",
    merged_dir=merged_dir
)
