# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [1]:
import subprocess

def is_running_in_colab():
    """Report whether the `google.colab` package can be imported.

    Returns:
        bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Ask `nvidia-smi` for free GPU memory and return that figure divided by 1024.

    Only the first line of the tool's output is used. Any failure (tool
    missing, non-numeric output, ...) yields 0.0 instead of raising.

    Returns:
        float: first reported `memory.free` value / 1024, or 0.0 on any error.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        first_line = report.strip().split("\n")[0]
        return int(first_line) / 1024
    except Exception:
        return 0.0


## Install dependencies

In [2]:
# Install third-party dependencies, choosing the command style by runtime.
# NOTE(review): none of these installs pin versions, and the recorded output of this
# cell reports resolver conflicts (xformers, torchvision and torchaudio each require a
# torch version different from the one that ends up installed) -- confirm the intended
# torch/xformers combination.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
else:
    # Non-Colab kernel: use the %pip magic for every install.
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Earlier torch install variants, kept for reference:
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Pre-release torch build taken from the nightly cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes
    %pip install regex json5


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.6.0.dev20250421+cu128 requires torch==2.8.0.dev20250420+cu128, but you have torch 2.6.0 which is incompatible.
torchvision 0.22.0.dev20250421+cu128 requires torch==2.8.0.dev20250420+cu128, but you have torch 2.6.0 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/nightly/cu128
Collecting torch
  Using cached https://download.pytorch.org/whl/nightly/cu128/torch-2.8.0.dev20250420%2Bcu128-cp312-cp312-win_amd64.whl (3331.4 MB)
Collecting sympy>=1.13.3 (from torch)
  Using cached https://download.pytorch.org/whl/nightly/sympy-1.13.3-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
  Attempting uninstall: torch
    Found existing installation: torch 2.6.0
    Uninstalling torch-2.6.0:
      Successfully uninstalled torch-2.6.0
Successfully installed sympy-1.13.3 torch-2.8.0.dev20250420+cu128
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xformers 0.0.29.post3 requires torch==2.6.0, but you have torch 2.8.0.dev20250420+cu128 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Log in to Hugging Face

In [None]:
from getpass import getpass
from huggingface_hub import login
import os

# Token lookup order:
#   1. HUGGINGFACE_TOKEN environment variable
#   2. Colab secret named HF_TOKEN (only when running in Colab)
#   3. Hidden interactive prompt (outside Colab)
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                # Not caught below on purpose: a missing secret should stop the cell.
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass keeps the secret out of the terminal and of the saved cell output,
        # unlike input(), which echoes whatever is typed.
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# login() is called with whatever was found (possibly None after the ImportError branch).
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure `~/.kaggle/kaggle.json` exists, uploading it through Colab if needed.

    Behaviour:
      * If the file already exists, only a confirmation is printed.
      * Otherwise the Colab upload widget is used. When `google.colab` cannot be
        imported (not running in Colab) a warning is printed and the function
        returns instead of failing with an ImportError.
      * An empty upload leaves the filesystem untouched and prints a warning.
      * When several files are uploaded, the one named `kaggle.json` is preferred,
        otherwise the first uploaded file is used.

    The installed credential file is given mode 0o600.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError:
        # No upload widget outside Colab; tell the user what to do instead of crashing.
        print(
            f"⚠️ Kaggle credentials not found at {kaggle_path} and Colab upload is "
            "unavailable. Place kaggle.json there manually."
        )
        return

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        # Nothing to move; chmod on a missing file would raise.
        print("⚠️ No file uploaded; Kaggle credentials were not set up.")
        return

    # Moving every upload onto the same path would silently keep only the last one.
    chosen = "kaggle.json" if "kaggle.json" in uploaded else next(iter(uploaded))
    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(chosen, kaggle_path)
    os.chmod(kaggle_path, 0o600)
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Run the credential check/upload as soon as this cell executes.
setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; this cell does nothing outside Colab.
if is_running_in_colab():
   from google.colab import drive
   drive.mount('/content/drive')

## Load Nous-Hermes-2-Mistral-7B-DPO with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Build a text-generation pipeline for `model_name`.

    The model is placed on GPU 0 when CUDA is available, otherwise on the CPU.
    When CUDA is available and device 0 reports less than 24 GiB of *total*
    memory, the weights are loaded with 4-bit NF4 quantization; in every other
    case they are loaded as float16.

    Args:
        model_name: Hub identifier passed to `from_pretrained`.
        hf_token: access token forwarded to the tokenizer and model loaders.

    Returns:
        A `transformers` "text-generation" pipeline with `batch_size=1`.
    """
    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        # Total (not free) memory of device 0, expressed in GiB.
        gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    else:
        gpu_mem_gb = 0
    print(f"💻 CUDA: {cuda_ok} | GPU Memory: {gpu_mem_gb:.2f} GB")

    placement = {"": 0} if cuda_ok else "cpu"

    # Quantize only on GPUs below the 24 GiB threshold.
    bnb_config = None
    if cuda_ok and gpu_mem_gb < 24:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # The tokenizer reuses its EOS token for padding.
    tok = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    tok.pad_token = tok.eos_token

    # torch_dtype is only given when no quantization config is in play.
    weights_dtype = torch.float16 if not bnb_config else None
    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=placement,
        quantization_config=bnb_config,
        torch_dtype=weights_dtype,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(llm.parameters()).device}")
    return pipeline("text-generation", model=llm, tokenizer=tok, batch_size=1)


In [None]:
# Notebook-wide pipeline object; later cells call `llm_pipeline` directly as a global.
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


# Global utilities

### Utility to merge normalized json files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append the list contents of matching JSON files to one combined JSON file.

    Files in `source_dir` matching `pattern` are read in sorted order. Each file
    whose top-level value is a list is appended to the data already stored in
    `output_file` (if that file exists and decodes). Files that fail to parse or
    whose top-level value is not a list are reported and left where they are.

    Source files are moved into `merged_dir` only after the combined output has
    been written, so a failed write never strands data outside both locations.

    Args:
        source_dir: directory to scan; created if missing.
        output_file: combined JSON file to create or extend.
        pattern: glob pattern (relative to `source_dir`) selecting input files.
        merged_dir: directory that receives successfully merged inputs; created
            if missing.
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                merged_data = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")

    consumed = []  # inputs whose content actually went into merged_data
    for file_path in sorted(source_dir.glob(pattern)):
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except Exception as e:
                print(f"⚠️ Failed to parse {file_path.name}: {e}")
                continue

        if not isinstance(data, list):
            # Skipped files stay in place rather than being filed as "merged".
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)
        consumed.append(file_path)

    # Write combined output
    if merged_data:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

    # Archive the inputs only once their content is safely on disk.
    for file_path in consumed:
        shutil.move(str(file_path), merged_dir / file_path.name)
        print(f"✅ Merged and moved: {file_path.name}")

# === Usage ===
# Runs at cell execution time: merges the timestamped JD and resume batch files
# into one combined file each and archives the inputs under `merged/`.

# Paths
normalized_dir = Path("json_outputs_phase1_run1/normalized")
merged_dir = normalized_dir / "merged"

# Job descriptions: normalized_jds_*.json -> normalized_jds.json
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_jds.json",
    pattern="normalized_jds_*.json",
    merged_dir=merged_dir
)

# Resumes: normalized_resumes_*.json -> normalized_resumes.json
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_resumes.json",
    pattern="normalized_resumes_*.json",
    merged_dir=merged_dir
)


### Infer JD Domains

In [None]:
# Maps a domain label to the keywords that are looked for in a lower-cased job title.
# Entries are scanned in insertion order and the first domain with a matching keyword
# wins, so earlier entries take precedence when keywords overlap (e.g. 'designing'
# appears under both 'designing' and 'web designing').
domain_keywords_dict = {
    'advocate': ['advocate'],
    'agriculture': ['agriculture'],
    'apparel': ['apparel'],
    'arts': ['arts'],
    'automobile': ['automobile'],
    'aviation': ['aviation'],
    'banking': ['banking'],
    'bpo': ['bpo'],
    'business development': ['business', 'development', 'business development', 'business-development'],
    'chef': ['chef'],
    'construction': ['construction'],
    'consultant': ['consultant'],
    'data scientist': ['data', 'data analyst', 'data scientist', 'scientist'],
    'designing': ['designing', 'designer'],
    'digital media': ['digital', 'digital marketing executive', 'media', 'digital media', 'digital-media'],
    'engineering': ['engineering'],
    'finance': ['finance', 'financial analyst'],
    'healthcare': ['healthcare'],
    'hr': ['hr'],
    'information technology': ['information', 'technology', 'information technology', 'information-technology'],
    'public relations': ['public', 'relations', 'public relations', 'public-relations'],
    'marketing': ['marketing'],
    'sales': ['sales', 'sales executive'],
    'teacher': ['teacher'],
    'technician': ['technician'],
    'training': ['training'],
    'web designing': ['web', 'designing'],
    'fitness': ['fitness'],
    'accountant': ['accountant', 'accounting']
}


In [None]:
def infer_domain_from_title(title):
    """Return the first domain in `domain_keywords_dict` whose keyword starts a word in `title`.

    Matching is case-insensitive and anchored at the beginning of a word, so a
    keyword still matches plural or suffixed forms ("teacher" -> "Teachers") but
    no longer fires in the middle of an unrelated word ("hr" inside "three",
    "arts" inside "parts").

    Args:
        title: job title text. Non-string or blank values yield "unknown".

    Returns:
        str: the matching domain label, or "unknown" when nothing matches.
    """
    import re  # local import keeps this cell runnable regardless of cell order

    if not isinstance(title, str) or not title.strip():
        return "unknown"

    title_lower = title.lower()
    for domain, keywords in domain_keywords_dict.items():
        for kw in keywords:
            # \b requires the keyword to begin at a word boundary.
            if re.search(r"\b" + re.escape(kw), title_lower):
                return domain
    return "unknown"


In [None]:
import re
# no longer being used
def infer_job_domain_llm(description: str) -> str:
    """Ask the global `llm_pipeline` for a short domain label for a job description.

    The text after the last "Domain:" marker in the generated output is used;
    its first non-empty line is lower-cased and stripped of every character
    other than ASCII letters, spaces, '&' and '-'.

    Returns:
        str: the cleaned label, or "unknown" when it is empty or generation fails.
    """
    print("🔍 Inferring job domain using LLM...")

    prompt = f"""
Given a job description, return only the most likely domain of the job as a one-word or short noun phrase. Do not include explanation, punctuation, or label. Just return the domain.

Job Description:
{description.strip()}

Domain:"""

    try:
        generated = llm_pipeline(prompt)[0]['generated_text']

        # Keep only what follows the final "Domain:" (the model may echo the prompt);
        # rpartition returns the whole string when the marker is absent.
        tail = generated.rpartition("Domain:")[2]

        # First non-empty line, lower-cased; "unknown" when there is none.
        label = "unknown"
        for raw_line in tail.strip().splitlines():
            candidate = raw_line.strip()
            if candidate:
                label = candidate.lower()
                break

        # Drop everything except letters, spaces, '&' and '-'.
        label = re.sub(r"[^a-zA-Z &\-]", "", label)
        return label or "unknown"

    except Exception as e:
        print(f"⚠️ LLM inference failed: {e}")
        return "unknown"


### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Shared directory names and flags for the notebook.

    NOTE(review): the consumers of AUTO_CLEANUP are not visible in this section.
    """
    # Where downloaded datasets are stored.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Root folder for JSON outputs of this run.
    JSON_OUTPUT_DIR = "json_outputs_phase1_run1"
    # Normalized outputs live in a sub-folder of the JSON output root.
    JSON_OUTPUT_NORMALIZED_DIR = JSON_OUTPUT_DIR + "/normalized"
    AUTO_CLEANUP = True


## Utility to save json to a folder

In [None]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize `data` as UTF-8 JSON to `output_path`.

    Args:
        data: any JSON-serializable object.
        output_path: destination file; parent directories are created as needed.
            A bare filename (no directory part) writes to the current directory.
        indent: indentation passed to the JSON encoder.
        overwrite: when False, an existing file causes FileExistsError.

    Raises:
        FileExistsError: the file exists and `overwrite` is False.
        TypeError / ValueError: `data` is not JSON-serializable; in that case an
            existing file is left untouched.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # os.makedirs("") raises, so skip it for paths without a directory part.
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_path) and not overwrite:
        raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    # Serialize before opening the file so a failure cannot destroy a previous output.
    payload = json.dumps(data, indent=indent, ensure_ascii=False)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Saved output to {output_path}")


## Utility to load file

In [None]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    """Read and decode a UTF-8 JSON file.

    Args:
        file_path: path of the file to read.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: `file_path` does not point to an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    raise FileNotFoundError(f"File not found: {file_path}")

# Phase 2 - Parse resume/JD into a structured JSON schema

## Define Pydantic Schemas

In [None]:
from typing import Optional
from pydantic import BaseModel

class Education(BaseModel):
    """One education entry of a parsed resume; all fields are strings."""
    degree: str
    field: str
    institution: str
    year: str
    gpa: Optional[str] = None  # optional; None when not provided

class Experience(BaseModel):
    """One work-experience entry of a parsed resume."""
    job_title: str
    company: str
    # Dates are kept as free-form strings; both are optional.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    duration_in_months: Optional[int] = None  # optional; None when not provided
    description: str

class Basics(BaseModel):
    """Contact and headline details of a resume's owner."""
    name: str
    email: str
    phone: str
    location: str
    current_title: str
    linkedin_url: Optional[str] = None  # optional; None when not provided

In [None]:
from typing import List, Optional
from pydantic import BaseModel
import re


class ResumeSchema(BaseModel):
    """Structured resume record produced by the LLM parsing step.

    `normalize` reshapes a loosely structured dict (as returned by the LLM) into
    the layout this model validates: missing sections become empty containers,
    alternative key names are mapped, and nulls in required text fields become "".
    """
    resume_id: Optional[str] = None
    basics: Basics
    education: List[Education]
    experience: List[Experience]
    skills: List[str]
    certifications: List[str]
    projects: List[str]
    languages: Optional[List[str]] = []
    total_experience_years: Optional[float] = 0.0

    @classmethod
    def normalize(cls, resume_dict: dict) -> dict:
        """Return a shallow copy of `resume_dict` coerced towards this schema.

        Args:
            resume_dict: raw parsed resume; the input object is not modified at
                its top level.

        Returns:
            dict with `basics`, `education`, `experience`, the list sections and
            `total_experience_years` always present.
        """
        resume_dict = dict(resume_dict)

        # Basics: tolerate a missing, null or non-dict section.
        basics = resume_dict.get("basics")
        if not isinstance(basics, dict):
            basics = {}
        resume_dict["basics"] = {
            "name": basics.get("name") or "",
            "email": basics.get("email") or "",
            "phone": basics.get("phone") or "",
            "location": basics.get("location") or "",
            # "title" is accepted as an alias when "current_title" is absent or empty.
            "current_title": basics.get("current_title") or basics.get("title") or "",
            "linkedin_url": basics.get("linkedin_url") or ""
        }

        # List-valued sections: anything that is not a list becomes [].
        for key in ["skills", "certifications", "projects", "languages"]:
            if not isinstance(resume_dict.get(key), list):
                resume_dict[key] = []

        # Experience: keep only dict items; null/non-list sections yield [].
        raw_experience = resume_dict.get("experience")
        if not isinstance(raw_experience, list):
            raw_experience = []
        normalized_exp = []
        for item in raw_experience:
            if not isinstance(item, dict):
                continue
            normalized_exp.append({
                "job_title": item.get("job_title") or item.get("title") or "",
                "company": item.get("company") or "",
                "start_date": item.get("start_date", ""),
                "end_date": item.get("end_date", ""),
                "duration_in_months": item.get("duration_in_months", None),
                "description": item.get("description") or ""
            })
        resume_dict["experience"] = normalized_exp

        # Education: keep only dict items; null/non-list sections yield [].
        raw_education = resume_dict.get("education")
        if not isinstance(raw_education, list):
            raw_education = []
        normalized_edu = []
        for item in raw_education:
            if not isinstance(item, dict):
                continue
            degree = item.get("degree") or ""
            field = item.get("field") or ""
            if not field and isinstance(degree, str):
                # Derive the field from phrases like "Bachelor in Computer Science".
                # \b keeps "in" from matching the tail of a word such as "Admin".
                match = re.search(r"\bin\s+(.+)", degree, flags=re.IGNORECASE)
                field = match.group(1).strip() if match else ""
            year = str(item.get("year", "")) if item.get("year") else ""
            gpa = item.get("gpa", None)
            normalized_edu.append({
                "degree": degree,
                "field": field,
                "institution": item.get("institution") or "",
                "year": year,
                "gpa": gpa
            })
        resume_dict["education"] = normalized_edu

        # Total Experience fallback
        if "total_experience_years" not in resume_dict:
            resume_dict["total_experience_years"] = 0.0

        return resume_dict


In [None]:
class JobDescriptionSchema(BaseModel):
    """Structured job-description record produced by the LLM parsing step.

    `normalize` maps alternative key names, coerces the required experience to a
    float, fills missing sections with empty values and derives `remote_option`.
    """
    jd_id: Optional[str] = None
    title: str
    summary: str
    required_experience_years: float
    preferred_degrees: List[str]
    required_skills: List[str]
    optional_skills: List[str]
    certifications: List[str]
    soft_skills: List[str]
    job_location: str
    remote_option: Optional[bool] = False
    employment_type: Optional[str] = None
    inferred_domain: str = "unknown"

    @classmethod
    def normalize(cls, jd_dict: dict) -> dict:
        """Return a shallow copy of `jd_dict` coerced towards this schema.

        Args:
            jd_dict: raw parsed job description; not modified at its top level.

        Returns:
            dict with the experience, list, text and remote fields always present.
        """
        jd_dict = dict(jd_dict)

        # Alternative key names; an alias is only used when the canonical key is absent.
        aliases = {
            "years_required": "required_experience_years",
            "requirements": "required_skills",
            "degree_preferences": "preferred_degrees",
            "certs": "certifications",
            "skills_soft": "soft_skills",
            "job_summary": "summary"
        }
        for old, new in aliases.items():
            if old in jd_dict and new not in jd_dict:
                jd_dict[new] = jd_dict.pop(old)

        def extract_experience_years(text) -> float:
            """Find "<n> years"/"<n>+ yrs"/"<a>-<b> years" in text; ranges yield the lower bound."""
            if not isinstance(text, str):
                return 0.0
            match = re.search(
                r'(\d+(?:\.\d+)?)(?:\s*(?:-|–|to)\s*\d+(?:\.\d+)?)?\s*\+?\s*(?:years?|yrs?)',
                text.lower()
            )
            return float(match.group(1)) if match else 0.0

        # Required experience: accept numbers, numeric strings and phrases such as "5+ years".
        try:
            val = jd_dict.get("required_experience_years")
            if val is None:
                jd_dict["required_experience_years"] = extract_experience_years(jd_dict.get("summary", ""))
            elif isinstance(val, str):
                try:
                    years = float(val.split()[0])
                except (ValueError, IndexError):
                    # e.g. "5+ years" or "3-5 years": take the first number found.
                    number = re.search(r'\d+(?:\.\d+)?', val)
                    years = float(number.group()) if number else 0.0
                jd_dict["required_experience_years"] = years
            else:
                jd_dict["required_experience_years"] = float(val)
        except Exception:
            jd_dict["required_experience_years"] = 0.0

        # List-valued fields: anything that is not a list becomes [].
        for field in ["preferred_degrees", "required_skills", "optional_skills", "certifications", "soft_skills"]:
            if not isinstance(jd_dict.get(field), list):
                jd_dict[field] = []

        # Text fields: missing or falsy values become "".
        for field in ["title", "summary", "job_location", "employment_type"]:
            jd_dict[field] = jd_dict.get(field, "") or ""

        # Remote option: fall back to a keyword check on the summary when unset.
        remote_flag = jd_dict.get("remote_option", None)
        if remote_flag is None:
            summary_text = jd_dict.get("summary", "")
            remote_flag = isinstance(summary_text, str) and "remote" in summary_text.lower()
        elif isinstance(remote_flag, str):
            # bool("false") is True, so spell out the textual negatives.
            remote_flag = remote_flag.strip().lower() not in {"", "false", "no", "n", "0", "none", "null"}
        jd_dict["remote_option"] = bool(remote_flag)

        return jd_dict


In [None]:
def generate_example_structure(model_class) -> dict:
    """Build a placeholder dict mirroring the fields of a Pydantic model.

    Placeholders by annotation:
      * list / List[...]        -> []
      * Optional[X] / X | None  -> placeholder of X
      * str                     -> ""
      * bool                    -> False
      * int                     -> 0
      * float                   -> 0.0
      * nested BaseModel        -> recursive structure
      * anything else           -> ""

    Args:
        model_class: a Pydantic model class exposing `model_fields`.

    Returns:
        dict mapping each field name to its placeholder value.
    """
    import types
    from typing import get_origin, get_args, Union
    from pydantic import BaseModel

    # `X | None` annotations report types.UnionType as origin on Python 3.10+.
    pep604_union = getattr(types, "UnionType", None)

    def default_for_type(field_type):
        origin = get_origin(field_type)
        args = get_args(field_type)

        if origin is list or field_type is list:
            return []
        is_union = origin is Union or (pep604_union is not None and origin is pep604_union)
        if is_union and type(None) in args:
            # Optional[...] detected: describe the first non-None alternative.
            non_none_types = [arg for arg in args if arg is not type(None)]
            return default_for_type(non_none_types[0]) if non_none_types else ""
        if field_type is str:
            return ""
        if field_type is bool:
            return False
        if field_type is int:
            return 0
        if field_type is float:
            return 0.0
        if isinstance(field_type, type) and issubclass(field_type, BaseModel):
            return generate_example_structure(field_type)
        return ""

    structure = {}
    for field_name, field in model_class.model_fields.items():
        try:
            structure[field_name] = default_for_type(field.annotation)
        except Exception:
            # Unusual annotations fall back to an empty-string placeholder.
            structure[field_name] = ""
    return structure


##  Prompt Templates

In [None]:
# Prompt used for resume parsing. It is filled with str.format, which substitutes
# the `{schema}` and `{text}` placeholders; `{{` renders as a literal `{`.
RESUME_PROMPT_TEMPLATE = """
You are a JSON resume parser and experience calculator.

Given the following resume text, extract a structured JSON following this schema:

{schema}

Instructions:
- Parse education, experience, skills, certifications, and other fields exactly as described.
- In the "experience" list, if start_date and end_date are missing, try to infer them if mentioned anywhere.
- Accept various date formats such as "March 2007", "Mar 07", "03/2007", "Current", "Present" etc.
- Interpret "Current", "Present", "Today" as the current month and year.
- Calculate "total_experience_years" as the cumulative duration of professional work experience from all roles.
    - Overlapping durations should not be double-counted.
    - If start and end dates are missing or ambiguous, skip them for total experience calculation.
- If a field is missing in the resume, leave it empty ("") or an empty list [] depending on the field type.
- Return ONLY a valid JSON object. No extra text, no explanations, no markdown formatting.
- Your output MUST start with a {{.

Resume Text:
--------------------
{text}
--------------------
"""


In [None]:
# Prompt used for job-description parsing. It is filled with str.format, which
# substitutes the `{schema}` and `{text}` placeholders; `{{` renders as a literal `{`.
JD_PROMPT_TEMPLATE = """
You are a JSON job description parser and experience extractor.

Given the following job description text, extract a structured JSON following this schema:

{schema}

Instructions:
- Parse title, summary, skills, certifications, and other fields exactly as shown.
- Pay special attention to "required_experience_years":
    - If experience years are explicitly listed, extract that number.
    - Accept formats like "5+ years", "3-5 years", "8 years required", etc.
    - If multiple ranges are mentioned (e.g., "3-5 years"), use the lower value (3 years).
    - If no years are mentioned explicitly, infer from job title level:
        - "Senior", "Lead" → Assume 5+ years
        - "Mid-level", "Experienced" → Assume 3 years
        - "Entry level", "Junior" → Assume 0-1 years
    - If still ambiguous, default to 0 years.
- Handle remote/hybrid jobs:
    - Set "remote_option" = true if remote keywords are present (remote, work from home, hybrid, WFH).
- Infer the **inferred_domain** from the job description:
    - Return a short domain noun (e.g., "software", "marketing", "data science", "finance", "healthcare").
    - Use the title and summary to guide inference.
    - If uncertain, use "unknown".
- If a field is missing, leave it empty ("") or as an empty list [] depending on the field type.
- Return ONLY a valid JSON object. No extra text, no explanations, no markdown formatting.
- Your output MUST start with a {{.

Job Description Text:
--------------------
{text}
--------------------
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [None]:
def generate_llm_output(prompt: str, max_new_tokens: int = 1024) -> str:
    """Generate text for `prompt` with the global `llm_pipeline` (sampling disabled).

    Args:
        prompt: text handed to the pipeline.
        max_new_tokens: generation budget forwarded to the pipeline.

    Returns:
        str: the "generated_text" of the first result.

    Raises:
        RuntimeError: wraps any exception raised while generating.
    """
    try:
        outputs = llm_pipeline(prompt, max_new_tokens=max_new_tokens,  do_sample=False)
        first_result = outputs[0]
        return first_result["generated_text"]
    except Exception as err:
        raise RuntimeError(f"LLM generation failed: {err}")


### Sanitize Output: Strip Prompt, Fix Cutoffs

In [None]:
def sanitize_llm_output(response: str, prompt: str) -> str:
    """Strip the echoed prompt and trailing noise from raw LLM output.

    Steps, in order:
      1. remove every occurrence of `prompt` and trim surrounding whitespace;
      2. cut everything that follows the last closing brace;
      3. delete a trailing `---`, bullet or dash (with any whitespace) at line ends.

    Returns:
        str: the cleaned text.
    """
    cleaned = response.replace(prompt, "")
    cleaned = cleaned.strip()

    # Drop whatever comes after the final '}'.
    after_last_brace = re.compile(r'}[^}]*$')
    cleaned = after_last_brace.sub('}', cleaned)

    # Remove markdown bullets or --- rules left dangling at the end of a line.
    dangling_marker = re.compile(r'(---|•|–|-)\s*$', re.MULTILINE)
    cleaned = dangling_marker.sub('', cleaned)

    return cleaned


### Regex-based JSON Block Extractor

In [None]:
import regex
import json5

def extract_json_block(text: str) -> dict:
    """Return the first brace-balanced block in `text` that `json5` can parse.

    Candidates are found with a recursive pattern from the third-party `regex`
    module; `json5` is used for parsing so trailing commas, comments and similar
    relaxed syntax are tolerated.

    Args:
        text: raw text that may contain a JSON object.

    Returns:
        The parsed object of the first candidate that parses.

    Raises:
        ValueError: no candidate could be parsed.
    """
    # Recursive pattern: a '{', then any mix of non-brace characters and nested
    # balanced blocks, then the matching '}'.
    pattern = r'(\{(?:[^{}]|(?R))*\})'

    for match in regex.finditer(pattern, text, flags=regex.DOTALL):
        json_candidate = match.group(1)
        try:
            return json5.loads(json_candidate)
        except ValueError:
            # NOTE(review): json5 is expected to report malformed input with ValueError
            # (or a subclass); catching ValueError avoids depending on a
            # decoder-specific exception name -- confirm against the installed version.
            continue

    raise ValueError("❌ No valid JSON object found using regex and json5.")


### Final Orchestrator: Fault-Tolerant Extraction

In [None]:
def truncate_text(text: str, max_chars=1500) -> str:
    """Trim surrounding whitespace, then keep at most `max_chars` leading characters."""
    trimmed = text.strip()
    return trimmed[:max_chars]

In [None]:
import uuid
from typing import Dict, Optional, Type

def inject_ids(parsed: Dict, schema_model: Optional[Type]) -> Dict:
    """Stamp `parsed` with a fresh UUID under the key that fits `schema_model`.

    The lower-cased class name decides the key: names starting with "resume"
    get `resume_id`, names starting with "jobdescription" get `jd_id`; any
    other name leaves `parsed` untouched. Without a model a warning is printed.

    Returns:
        The same `parsed` dict (mutated in place when an id is added).
    """
    if not schema_model:
        print("⚠️ No schema model provided for ID injection.")
        return parsed

    lowered_name = schema_model.__name__.lower()
    for prefix, id_key in (("resume", "resume_id"), ("jobdescription", "jd_id")):
        if lowered_name.startswith(prefix):
            parsed[id_key] = str(uuid.uuid4())
            break
    return parsed


In [None]:
from typing import Union

def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Union[None, type] = None,
    max_new_tokens: int = 1024,
    retries: int = 0,
    validate: bool = True,
) -> dict:
    """Run the LLM on `text` and return the JSON object it produced.

    Flow:
      1. Build the prompt from `prompt_template` (placeholders `{schema}` and
         `{text}`); the schema example is `{}` when no `schema_model` is given.
      2. Up to `retries + 1` times: generate, sanitize, reject echoed schema or
         instructions, parse from the first `{` with `json`, then finalize.
      3. If every attempt fails, re-parse the last raw output with
         `extract_json_block` and finalize.

    Finalizing means injecting an id and, when `validate` is true and a model is
    given, normalizing (if the model offers `normalize`) and validating.

    Args:
        text: resume or job-description text (truncated before prompting).
        prompt_template: format string with `{schema}` and `{text}`.
        schema_model: optional model class used for the schema example and validation.
        max_new_tokens: generation budget per attempt.
        retries: number of additional attempts after the first.
        validate: whether to normalize/validate against `schema_model`.

    Returns:
        The parsed dict on success. On failure, a dict with `raw_output` and
        `error` keys, plus `last_attempt_error` when an attempt raised.
    """
    def _finalize(candidate):
        # Shared tail of the direct-parse path and the brace-matching fallback.
        candidate = inject_ids(candidate, schema_model)
        if validate and schema_model:
            if hasattr(schema_model, "normalize"):
                candidate = schema_model.normalize(candidate)
            schema_model.model_validate(candidate)
        return candidate

    # Without a model there are no fields to describe; use an empty example.
    example_schema = generate_example_structure(schema_model) if schema_model else {}
    schema_str = json.dumps(example_schema, indent=2)
    prompt = prompt_template.format(text=truncate_text(text), schema=schema_str)
    raw_output = ""
    last_error = None

    for attempt in range(retries + 1):
        try:
            # Step 1: Get LLM output
            response = generate_llm_output(prompt, max_new_tokens)
            raw_output = sanitize_llm_output(response, prompt)

            # Step 2: Detect schema echo or instruction echo
            if "$schema" in raw_output or "Ensure these rules" in raw_output:
                raise ValueError("LLM echoed schema or instruction block instead of generating JSON.")

            # Step 3: Try JSON load directly
            json_start = raw_output.find("{")
            if json_start == -1:
                raise ValueError("No opening '{' found in LLM output.")

            # Step 4: id injection + optional schema validation
            return _finalize(json.loads(raw_output[json_start:]))

        except Exception as e:
            last_error = e
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")
            print("🧪 Raw output was:\n", raw_output[:300])  # Preview first 300 chars

    # Step 5: Fallback using brace matching
    try:
        return _finalize(extract_json_block(raw_output))
    except Exception as e:
        failure = {
            "raw_output": raw_output.strip(),
            "error": f"Regex fallback failed: {e}"
        }
        if last_error is not None:
            # Keep the reason the direct parse failed (e.g. a generation error).
            failure["last_attempt_error"] = str(last_error)
        return failure

In [None]:
from pydantic import ValidationError

def pydantic_validate(model_class, data):
    """Validate `data` with `model_class` on either Pydantic v2 or v1.

    The API is chosen by checking which method the class offers rather than by
    catching AttributeError around the call, so an AttributeError raised *inside*
    validation is not mistaken for "this is Pydantic v1".

    Args:
        model_class: class exposing `model_validate` (v2) or `parse_obj` (v1).
        data: object to validate.

    Returns:
        Whatever the selected validation method returns.
    """
    validator = getattr(model_class, "model_validate", None)
    if validator is None:
        # Fallback to Pydantic v1
        validator = model_class.parse_obj
    return validator(data)


def validate_entry(entry, is_resume):
    """Normalize (when the model supports it) and validate one parsed record.

    Args:
        entry: parsed record to check.
        is_resume: True selects `ResumeSchema`, False `JobDescriptionSchema`.

    Returns:
        tuple: `(True, None)` when validation passes, otherwise
        `(False, <validation error text>)`. Only ValidationError is handled.
    """
    schema = ResumeSchema if is_resume else JobDescriptionSchema
    try:
        candidate = schema.normalize(entry) if hasattr(schema, "normalize") else entry
        pydantic_validate(schema, candidate)
    except ValidationError as problem:
        return False, str(problem)
    return True, None


##  Normalize in Batches with Validation

In [None]:
from datetime import datetime
import uuid
from pathlib import Path
import json
from tqdm import tqdm


def normalize_and_save(
    input_filename,
    output_filename_prefix,
    is_resume=True,
    input_dir=Path("json_outputs"),
    output_dir=Path("json_outputs/normalized"),
    limit: int = None,
    resume: bool = True,
    save_every: int = 5,
    checkpointing: bool = True,
    STRICT: bool = True
):
    """Parse a JSON-lines file record by record with the LLM and save the results.

    Each non-blank line of `input_dir / input_filename` is decoded as one JSON
    record. The record text (`Resume_str` for resumes, `description` otherwise)
    is sent through `extract_structured_json`; successes and failures are
    collected separately and written to timestamped files in `output_dir`,
    together with a checkpoint and a metadata summary.

    Args:
        input_filename: name of the JSON-lines input file.
        output_filename_prefix: prefix for the valid/invalid/meta/checkpoint files.
        is_resume: selects the resume or job-description prompt and schema.
        input_dir: directory holding the input file.
        output_dir: directory for all outputs; created if missing.
        limit: maximum number of records to process after the start index.
        resume: continue from the checkpoint's `last_index` when one exists.
        save_every: write outputs/checkpoint every N processed records
            (falsy disables periodic saves).
        checkpointing: whether checkpoints are read and written.
        STRICT: validate parsed records; failing ones go to the invalid file.
    """
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_file = output_dir / f"checkpoint_{output_filename_prefix}.json"

    # Generate output filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]
    valid_file = f"{output_filename_prefix}_{timestamp}_{batch_id}.json"
    invalid_file = f"invalid_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    metadata_file = f"meta_{output_filename_prefix}_{timestamp}_{batch_id}.json"

    # Load raw input (one JSON document per non-blank line)
    input_path = input_dir / input_filename
    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = [json.loads(line) for line in f if line.strip()]

    start_index = 0
    results, invalids = [], []

    if resume and checkpointing and checkpoint_file.exists():
        with open(checkpoint_file, "r") as ckpt:
            checkpoint = json.load(ckpt)
            start_index = checkpoint.get("last_index", 0)
            print(f"🔁 Resuming from record {start_index}")

    raw_data = raw_data[start_index:]
    if limit:
        raw_data = raw_data[:limit]

    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_model = ResumeSchema if is_resume else JobDescriptionSchema
    text_key = "Resume_str" if is_resume else "description"

    def _flush(next_index: int) -> None:
        # Persist everything collected so far and record where to resume.
        if results:
            with open(output_dir / valid_file, "w") as f:
                json.dump(results, f, indent=2)
        if invalids:
            with open(output_dir / invalid_file, "w") as f:
                json.dump(invalids, f, indent=2)
        if checkpointing:
            with open(checkpoint_file, "w") as f:
                json.dump({"last_index": next_index}, f)

    for idx, record in enumerate(tqdm(raw_data), start=start_index):
        text = record.get(text_key, "")
        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_model=schema_model,
            validate=STRICT
        )

        if "raw_output" in parsed or "error" in parsed:
            invalids.append({
                "input": text,
                "output": parsed,
                "error": parsed.get("error", "Malformed or unstructured output")
            })
        elif STRICT:
            is_valid, error_msg = validate_entry(parsed, is_resume)
            if is_valid:
                results.append(parsed)
            else:
                invalids.append({
                    "input": text,
                    "output": parsed,
                    "error": error_msg
                })
        else:
            results.append(parsed)

        # Save periodically. This must run for invalid records too, otherwise a
        # failed record landing on the save boundary skips the checkpoint.
        if save_every and ((idx + 1 - start_index) % save_every == 0):
            _flush(idx + 1)

    # Final save
    _flush(start_index + len(raw_data))

    # Metadata summary
    meta = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_filename,
        "valid_output_file": valid_file if results else None,
        "invalid_output_file": invalid_file if invalids else None,
        "count_total": len(raw_data),
        "count_valid": len(results),
        "count_invalid": len(invalids),
        "strict_validation": STRICT,
        "model": llm_pipeline.model.config.name_or_path,
        "device": str(next(llm_pipeline.model.parameters()).device)
    }
    with open(output_dir / metadata_file, "w") as f:
        json.dump(meta, f, indent=2)

    print(f"✅ Done. Valid: {len(results)} | Invalid: {len(invalids)}")


In [None]:
def save_metadata_summary(
    output_dir: Path,
    is_resume: bool,
    input_file: str,
    total_records: int,
    total_valid: int,
    total_invalid: int,
    start_index: int,
    end_index: int,
    timestamp: str,
    batch_id: str
):
    """Build the run summary dict and hand it to ``save_json_output``.

    The summary is written to
    ``output_dir / meta_<resumes|jds>_<start>_<end>_<timestamp>_<batch_id>.json``
    (the path is passed as a string, with ``overwrite=True``).

    Args:
        output_dir: Directory that receives the summary file.
        is_resume: True for resume runs, False for job-description runs;
            selects both the ``input_type`` value and the filename tag.
        input_file: Name of the input file, recorded verbatim.
        total_records: Stored as ``records_total``.
        total_valid: Stored as ``records_valid``.
        total_invalid: Stored as ``records_invalid``.
        start_index: Stored as ``records_start_index`` and used in the filename.
        end_index: Stored as ``records_end_index`` and used in the filename.
        timestamp: Run timestamp string, recorded and used in the filename.
        batch_id: Run identifier, recorded and used in the filename.
    """
    # Resolve the two labels that depend on the input kind in one place.
    if is_resume:
        input_type, file_tag = "resume", "resumes"
    else:
        input_type, file_tag = "job_description", "jds"

    # Key order is kept stable because it determines the layout of the dumped file.
    summary = dict(
        batch_id=batch_id,
        timestamp=timestamp,
        input_file=input_file,
        input_type=input_type,
        records_start_index=start_index,
        records_end_index=end_index,
        records_total=total_records,
        records_valid=total_valid,
        records_invalid=total_invalid,
        output_dir=str(output_dir),
    )

    file_name = f"meta_{file_tag}_{start_index}_{end_index}_{timestamp}_{batch_id}.json"
    save_json_output(summary, str(output_dir / file_name), overwrite=True)


In [None]:
import os
import json
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, List
from tqdm import tqdm


# 🔁 Normalize one batch of records
def normalize_batch(
    records: List[dict],
    start_idx: int,
    end_idx: int,
    is_resume: bool,
    output_dir: Path,
    prompt_template,
    schema_model
):
    """Run structured extraction over one batch and persist the outcome.

    Each record's text (key ``Resume_str`` for resumes, ``description``
    otherwise, defaulting to ``""``) is passed to ``extract_structured_json``
    with ``validate=False``. An extraction containing a ``raw_output`` or
    ``error`` key is collected as invalid; everything else is collected as
    valid. Non-empty collections are written through ``save_json_output`` to
    ``<resumes|jds>_<valid|invalid>_<start>_<end>_<timestamp>_<id>.json``
    inside ``output_dir``.

    Args:
        records: Batch of input records (dicts).
        start_idx: Start index of the batch; only used in the output filenames.
        end_idx: End index of the batch; only used in the output filenames.
        is_resume: Selects the text key and the filename prefix.
        output_dir: Directory for the output files.
        prompt_template: Forwarded to ``extract_structured_json``.
        schema_model: Forwarded to ``extract_structured_json``.

    Returns:
        Tuple ``(valid_entries, invalid_entries)``.
    """
    text_key = "Resume_str" if is_resume else "description"
    accepted: list = []
    rejected: list = []

    for entry in records:
        raw_text = entry.get(text_key, "")
        extraction = extract_structured_json(
            text=raw_text,
            prompt_template=prompt_template,
            schema_model=schema_model,
            validate=False,
        )

        # A clean extraction carries neither marker key.
        if "raw_output" not in extraction and "error" not in extraction:
            accepted.append(extraction)
            continue

        rejected.append(
            {
                "input": raw_text,
                "output": extraction,
                "error": extraction.get("error", "Malformed or unstructured output"),
            }
        )

    # 🧾 Both files of a batch share one timestamp + short random id suffix.
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    run_tag = uuid.uuid4().hex[:6]
    kind = "resumes" if is_resume else "jds"
    suffix = f"{start_idx}_{end_idx}_{stamp}_{run_tag}.json"

    if accepted:
        save_json_output(accepted, output_path=output_dir / f"{kind}_valid_{suffix}")
    if rejected:
        save_json_output(rejected, output_path=output_dir / f"{kind}_invalid_{suffix}")

    return accepted, rejected

In [None]:
# 📁 Main entry: Normalize in Batches with Limit
def normalize_file_in_batches(
    input_filename: str,
    output_dir: Path,
    is_resume: bool = True,
    input_dir: Path = Path("json_outputs"),
    save_every: int = 5,
    limit: int = None  # 🚀 Max records to process in this call (None = no cap)
):
    """Normalize a parsed-records file in batches, checkpointing after each batch.

    Loads ``input_dir / input_filename`` through ``load_json_file`` and, when
    ``checkpoint_<resumes|jds>.json`` exists in ``output_dir``, resumes from
    its ``last_index``. Consecutive slices of at most ``save_every`` records
    are handed to ``normalize_batch``; the checkpoint is rewritten after every
    batch and a summary is written once at the end via
    ``save_metadata_summary``.

    Args:
        input_filename: File name of the input, relative to ``input_dir``.
        output_dir: Directory for batch outputs, the checkpoint and the summary;
            created if missing.
        is_resume: True for resumes, False for job descriptions. Selects the
            prompt template, schema model and file-name tags.
        input_dir: Directory containing ``input_filename``.
        save_every: Batch size, and therefore how often the checkpoint is saved.
        limit: Maximum number of records to process in this call, counted from
            the resume point. ``None`` processes everything that is left.

    Raises:
        ValueError: If ``save_every`` is smaller than 1.
    """
    if save_every < 1:
        # range() raises an opaque error for 0 and silently yields nothing for
        # negatives; reject both up front, before touching the filesystem.
        raise ValueError(f"save_every must be >= 1, got {save_every!r}")

    os.makedirs(output_dir, exist_ok=True)
    total_valid, total_invalid = 0, 0

    input_path = input_dir / input_filename
    # NOTE(review): assumes load_json_file returns a sliceable sequence for the
    # input file and a dict for the checkpoint — confirm against its definition.
    data = load_json_file(input_path)

    checkpoint_file = output_dir / f"checkpoint_{'resumes' if is_resume else 'jds'}.json"
    start_index = 0
    if checkpoint_file.exists():
        checkpoint = load_json_file(checkpoint_file)
        start_index = checkpoint.get("last_index", 0)
        print(f"🔁 Resuming from index {start_index}")

    # 🧾 Metadata identifiers
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]

    # Respect limit if provided
    data_to_process = data[start_index:]
    if limit is not None:
        data_to_process = data_to_process[:limit]

    actual_end = start_index + len(data_to_process)

    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_model = ResumeSchema if is_resume else JobDescriptionSchema

    for offset in tqdm(range(0, len(data_to_process), save_every)):
        batch = data_to_process[offset:offset + save_every]
        batch_start = start_index + offset
        # The last slice may be shorter than save_every. Use its real length so
        # the checkpoint (and the batch file names) never claim records that
        # were not processed — otherwise a resumed run would skip them.
        batch_end = batch_start + len(batch)

        results, invalids = normalize_batch(
            records=batch,
            start_idx=batch_start,
            end_idx=batch_end,
            is_resume=is_resume,
            output_dir=output_dir,
            prompt_template=prompt_template,
            schema_model=schema_model
        )

        total_valid += len(results)
        total_invalid += len(invalids)

        # Checkpoint index is relative to the whole input file, not this batch.
        save_json_output({"last_index": batch_end}, str(checkpoint_file), overwrite=True)

    # 🧠 Save metadata after all batches
    save_metadata_summary(
        output_dir=output_dir,
        is_resume=is_resume,
        input_file=input_filename,
        total_records=len(data_to_process),
        total_valid=total_valid,
        total_invalid=total_invalid,
        start_index=start_index,
        end_index=actual_end,
        timestamp=timestamp,
        batch_id=batch_id
    )

## Run Phase 2 End-to-End

In [None]:
import json
def run_phase2_structured_normalization():
    """Run ``normalize_and_save`` for the parsed resumes, then the parsed job descriptions.

    Both runs read from ``Config.JSON_OUTPUT_DIR`` and write to
    ``Config.JSON_OUTPUT_NORMALIZED_DIR`` with ``limit=1``, ``resume=True``
    and ``STRICT=False``.
    """
    # (input file, output prefix, is_resume) — processed in this order.
    jobs = (
        ("parsed_resumes.json", "normalized_resumes", True),
        ("parsed_jds.json", "normalized_jds", False),
    )

    for source_name, prefix, resume_flag in jobs:
        normalize_and_save(
            input_filename=source_name,
            output_filename_prefix=prefix,
            is_resume=resume_flag,
            limit=1,
            resume=True,
            # STRICT=False is the schema-tolerant mode per the original note;
            # presumably True turns on strict validation — confirm in normalize_and_save.
            STRICT=False,
            input_dir=Path(Config.JSON_OUTPUT_DIR),
            output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
        )
    
# Run Phase 2 for both inputs: parsed resumes first, then parsed job descriptions.
run_phase2_structured_normalization()

In [None]:
# Batch-normalize the parsed job descriptions (is_resume=False).
normalize_file_in_batches(
    input_filename="parsed_jds.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=False,
    save_every=5,
    limit=1  # ✅ Process at most 1 record in this run
)


In [None]:
# Batch-normalize the parsed resumes (is_resume=True).
normalize_file_in_batches(
    input_filename="parsed_resumes.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=True,
    save_every=5,
    limit=1  # ✅ Process at most 1 record in this run
)
