# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [None]:
import subprocess

def is_running_in_colab():
    """Return True when the `google.colab` package can be imported, else False."""
    try:
        import google.colab  # noqa: F401  (imported only to probe availability)
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory of the first GPU listed by `nvidia-smi`, in GiB.

    Any failure (tool missing, non-zero exit, unparsable output) yields 0.0.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        # One line per GPU; only the first line is used.
        first_gpu_free_mb = int(report.strip().split("\n")[0])
    except Exception:
        return 0.0
    return first_gpu_free_mb / 1024


## Install dependencies

In [None]:
# Install the notebook's third-party dependencies.
# The Colab branch uses shell escapes (`!pip`); the local branch uses the
# `%pip` magic. Both are IPython syntax, so this cell only runs inside a
# notebook kernel, not as a plain Python script.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    # bitsandbytes is installed here together with the Hugging Face stack.
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    # `regex` and `json5` are used by the JSON extraction helpers further down.
    !pip install regex json5 
else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Alternative torch installs kept for reference (pinned 2.2.2 / stable cu121):
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Local runs install a pre-release (nightly) torch build from the cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # Locally, bitsandbytes is installed/upgraded separately, after torch.
    %pip install -U bitsandbytes
    %pip install regex json5 


## Log in to Hugging Face

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token, in order of preference:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. the Colab secret named HF_TOKEN (when running in Colab),
#   3. an interactive prompt (local runs).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Prompt once, without echoing the secret into the notebook output.
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# HF_TOKEN may still be None here (Colab import failure); it is passed through unchanged.
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Make sure `~/.kaggle/kaggle.json` exists, uploading it through Colab if needed.

    - If the file already exists, nothing is changed.
    - In Colab, the upload widget is shown and the uploaded file is moved into
      place with permissions 0o600.
    - Outside Colab (no `google.colab` module), a warning is printed instead of
      failing on the import.

    Raises:
        FileNotFoundError: if the upload dialog returns no file.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError:
        # The upload widget only exists in Colab; do not crash local runs.
        print(
            f"⚠️ Kaggle credentials not found at {kaggle_path} and the Colab upload "
            "widget is unavailable. Place kaggle.json there manually."
        )
        return

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; Kaggle credentials were not set up.")

    # Move exactly one file: prefer the one literally named kaggle.json,
    # otherwise the first upload. Moving every upload onto the same
    # destination would silently keep only the last one.
    uploaded_names = list(uploaded)
    chosen = "kaggle.json" if "kaggle.json" in uploaded_names else uploaded_names[0]

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(chosen, kaggle_path)
    os.chmod(kaggle_path, 0o600)  # credentials file: owner read/write only
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Google Drive is mounted only inside Colab; elsewhere this cell is a no-op.
if is_running_in_colab():
    from google.colab import drive

    drive.mount('/content/drive')

##  Load Qwen-Instruct with Fallback to Quantized

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal LM and tokenizer and wrap them in a text-generation pipeline.

    With CUDA available and less than 24 GB of *total* memory on GPU 0, the
    model is loaded 4-bit quantized (NF4, double quantization, fp16 compute).
    Otherwise it is loaded in float16, on GPU 0 or on CPU.

    Args:
        model_name: Hugging Face model identifier.
        hf_token: Access token forwarded to `from_pretrained`.

    Returns:
        A `transformers` "text-generation" pipeline with `batch_size=1`.
    """
    has_cuda = torch.cuda.is_available()
    # `total_memory` is the card's capacity, not the memory currently free.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {total_mem_gb:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and total_mem_gb < 24

    # Set quantization config (only for small GPUs)
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Fall back to EOS only when the tokenizer has no pad token of its own,
    # so a model-specific pad token is never overwritten.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model; the dtype is left to the quantization config when one is used.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        torch_dtype=torch.float16 if quant_config is None else None,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the shared text-generation pipeline once. `generate_llm_output` below
# reads this module-level `llm_pipeline` name, so this cell must run first.
llm_pipeline = load_model_pipeline(
    model_name="Qwen/Qwen2-7B-Instruct",
    hf_token=HF_TOKEN
)


# Global utilities

### Utility to merge normalized json files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append every JSON list found in `source_dir` to `output_file`.

    Files matching `pattern` are read in sorted order. Files that contain a
    JSON list are appended to the data already stored in `output_file` (if
    any) and, once the combined output has been written, moved to
    `merged_dir`. Files that cannot be parsed or do not contain a list are
    reported and left in place.

    Args:
        source_dir: Directory scanned for input files (created if missing).
        output_file: JSON file holding the accumulated list.
        pattern: Glob pattern, relative to `source_dir`.
        merged_dir: Destination for successfully merged files (created if missing).
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                merged_data = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")

    # The output file may live in source_dir and match the pattern; it must
    # never be merged into itself or moved away.
    output_resolved = output_file.resolve()
    merged_files = []

    for file_path in sorted(source_dir.glob(pattern)):
        if file_path.resolve() == output_resolved:
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:  # best effort: one bad file must not stop the batch
            print(f"⚠️ Failed to parse {file_path.name}: {e}")
            continue

        if not isinstance(data, list):
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)
        merged_files.append(file_path)

    # Write combined output before touching the sources, so a failed write
    # cannot leave data that exists only in the "merged" folder.
    if merged_data:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

    # Move only the files whose content was actually merged.
    for file_path in merged_files:
        shutil.move(str(file_path), str(merged_dir / file_path.name))
        print(f"✅ Merged and moved: {file_path.name}")




### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Directory names and switches shared by the notebook's phases."""

    # Folder for downloaded datasets.
    DATASET_DOWNLOAD_DIR: str = "datasets"
    # Folder for raw JSON outputs of this run.
    JSON_OUTPUT_DIR: str = "json_outputs_run3"
    # Sub-folder of JSON_OUTPUT_DIR for normalized outputs.
    JSON_OUTPUT_NORMALIZED_DIR: str = "json_outputs_run3/normalized"
    # Cleanup switch; its consumer is not visible in this part of the file.
    AUTO_CLEANUP: bool = True


## Utility to save json to a folder

In [None]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize `data` as UTF-8 JSON to `output_path`.

    Args:
        data: Any JSON-serializable object.
        output_path: Destination file; missing parent directories are created.
            A bare filename (no directory part) is written to the current directory.
        indent: Indentation passed to the JSON encoder.
        overwrite: When False, an existing file is left untouched and an error is raised.

    Raises:
        FileExistsError: if the file exists and `overwrite` is False.
        TypeError / ValueError: if `data` cannot be serialized; in that case
            an existing file is not modified.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs("") raises, so skip it for bare filenames
        os.makedirs(output_dir, exist_ok=True)

    if not overwrite and os.path.exists(output_path):
        raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    # Serialize before opening the file: a serialization error must not
    # truncate or delete a previously saved output.
    payload = json.dumps(data, indent=indent, ensure_ascii=False)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Saved output to {output_path}")


## Utility to load file

### load_ndjson_file() (for resume/jd input)

In [None]:
from typing import List


def load_ndjson_file(file_path: Path) -> List[dict]:
    """Read a newline-delimited JSON file and return one parsed object per non-blank line.

    Raises:
        FileNotFoundError: if `file_path` is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # blank / whitespace-only lines carry no record
            records.append(json.loads(raw_line))
    return records


### load_json_file() (for checkpoint & metadata)

In [None]:
def load_json_file(file_path: Path) -> dict:
    """Parse the JSON document stored at `file_path` and return it.

    Raises:
        FileNotFoundError: if `file_path` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        return payload
    raise FileNotFoundError(f"File not found: {file_path}")


# Phase 2 - Parse resume/JD into a structured JSON schema

## Define Pydantic Schemas

In [None]:
from typing import Optional
from pydantic import BaseModel

# One education entry of a resume. Every field is optional and defaults to None.
class Education(BaseModel):
    degree: Optional[str] = None
    field: Optional[str] = None
    institution: Optional[str] = None
    year: Optional[str] = None  # kept as text, not parsed into a number
    gpa: Optional[float] = None

# One work-experience entry of a resume. Every field is optional and defaults to None.
class Experience(BaseModel):
    job_title: Optional[str] = None
    company: Optional[str] = None
    # Dates are free-form strings; calculate_total_experience_years reads
    # "start_date" / "end_date" from the parsed entries.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    description: Optional[List[str]] = None  # a list of strings, unlike Project.description
    
# One certification entry of a resume; both fields are optional strings.
class Certification(BaseModel):
    certification: Optional[str] = None
    date_issued: Optional[str] = None

# One project entry of a resume. Every field is an optional string.
class Project(BaseModel):
    project_title: Optional[str] = None
    description: Optional[str] = None  # a single string, unlike Experience.description
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    url: Optional[str] = None
    
# One spoken/written language entry; both fields are optional strings.
class Language(BaseModel):
    language: Optional[str] = None
    proficiency: Optional[str] = None
    
# Catch-all section: the prompt templates tell the model to put content that
# fits no fixed field under "other" with these two keys.
class Other(BaseModel):
    section_name: Optional[str] = None
    content: Optional[str] = None

In [None]:
from typing import List, Optional
from pydantic import BaseModel
import re


# Target structure for a parsed resume. Every field is optional and defaults to None.
class ResumeSchema(BaseModel):
    resume_id: Optional[str] = None  # filled in by inject_ids after parsing
    summary: Optional[str] = None
    education: Optional[List[Education]] = None
    experience: Optional[List[Experience]] = None
    skills: Optional[List[str]] = None
    certifications: Optional[List[Certification]] = None
    projects: Optional[List[Project]] = None
    languages: Optional[List[Language]] = None
    other: Optional[List[Other]] = None
    # Derived value: inject_and_reorder_top_fields computes it from `experience`.
    total_experience_years: Optional[float] = None


In [None]:
# Target structure for a parsed job description.
class JobDescriptionSchema(BaseModel):
    jd_id: Optional[str] = None  # filled in by inject_ids after parsing
    inferred_domain: str = "unknown"  # the only non-Optional field
    title: Optional[str] = None
    summary: Optional[str] = None
    required_experience_years: Optional[str] = None  # text, not a number
    preferred_degrees: Optional[List[str]] = None
    required_skills: Optional[List[str]] = None
    optional_skills: Optional[List[str]] = None
    tools_and_technologies: Optional[List[str]] = None
    certifications: Optional[List[str]] = None
    soft_skills: Optional[List[str]] = None
    job_responsibilities: Optional[List[str]] = None
    job_location: Optional[str] = None
    # NOTE(review): typed as bool here, but JD_PROMPT_TEMPLATE asks the model
    # for "yes" / "no" / "" — confirm which representation is intended.
    remote_option: Optional[bool] = False
    employment_type: Optional[str] = None
    travel_requirements: Optional[str] = None
    physical_requirements: Optional[str] = None
    benefits: Optional[List[str]] = None
    company_information: Optional[str] = None
    equal_opportunity_policy: Optional[str] = None
    other: Optional[List[Other]] = None


### Generate schema string

In [None]:
from typing import get_origin, get_args, Union
from pydantic import BaseModel

def generate_example_structure(model_class) -> dict:
    """Build a placeholder dict mirroring the fields of a Pydantic model.

    Each entry of `model_class.model_fields` is mapped from its annotation:
    str -> "", float/int -> 0.0, List[X] -> [placeholder for X],
    Optional[X] -> placeholder for X, nested BaseModel -> nested dict,
    anything else -> "".
    """
    def placeholder(annotation):
        container = get_origin(annotation)
        params = get_args(annotation)

        if container is list and params:
            # One sample element shows the shape of the list items.
            return [placeholder(params[0])]
        if container is Union and type(None) in params:
            # Optional[...]: describe the first non-None alternative.
            for candidate in params:
                if candidate is not type(None):
                    return placeholder(candidate)
            return ""
        if annotation is str:
            return ""
        if annotation in [float, int]:
            return 0.0
        if isinstance(annotation, type) and issubclass(annotation, BaseModel):
            return generate_example_structure(annotation)
        return ""

    example = {}
    for field_name, field in model_class.model_fields.items():
        example[field_name] = placeholder(field.annotation)
    return example


In [None]:
from functools import lru_cache
import json
from typing import Optional, Type

@lru_cache(maxsize=4)
def get_schema_str(schema_model: Optional[Type]) -> str:
    """
    Render the example structure of a Pydantic model as an indented JSON string.

    Returns "{}" when no model is given. Keys listed in the exclusion
    collection are dropped at every nesting level; that collection is
    currently empty, so nothing is removed.
    """
    if schema_model is None:
        return "{}"

    # ⛔️ Keys to hide from the prompt (none at the moment).
    # Previously considered: "resume_id", "total_experience_years", "jd_id".
    hidden_keys = {}

    def prune(node):
        if isinstance(node, list):
            return [prune(item) for item in node]
        if isinstance(node, dict):
            kept = {}
            for key, value in node.items():
                if key not in hidden_keys:
                    kept[key] = prune(value)
            return kept
        return node

    return json.dumps(prune(generate_example_structure(schema_model)), indent=2)


In [None]:

# ✅ Render the example structure for ResumeSchema as a JSON string
# (switch to the commented line to preview the job-description schema instead).
schema_str = get_schema_str(ResumeSchema)
#schema_str = get_schema_str(JobDescriptionSchema)


# ✅ Print it for a manual check of what will be embedded in the prompt
print(schema_str)


##  Prompt Templates

In [None]:
# Prompt for resume parsing, written with <|im_start|>/<|im_end|> chat markers.
# It is filled with str.format, so {schema} and {text} are the only
# placeholders and no other braces may appear in the template itself.
# The template ends with an open assistant turn so the model continues with JSON.
RESUME_PROMPT_TEMPLATE = """
<|im_start|>system
You are a strict resume-to-JSON parser. Your job is to extract structured data from resumes using the exact schema and instructions provided.
- Follow the schema format and keys exactly.
- Nest values correctly under each section.
- Do not hallucinate or paraphrase.
- Only include information present in the resume.
- For any data not found, fill with "" or empty list [] or 0.0 as applicable.
- Any resume content in sections like "Affiliations", "Accomplishments", "Achievements", "Awards", "Honors", "Volunteer Work", "Memberships", "Leadership", or "Contributions" that does not map to the fixed schema must be included under "other" with appropriate "section_name" and "content".
- Your output must be valid JSON only — no explanation, no markdown, no extra formatting.
<|im_end|>

<|im_start|>user
Extract structured JSON from the resume below using this schema:

Schema:
{schema}

Resume:
<<<
{text}
>>>
<|im_end|>
<|im_start|>assistant
"""


In [None]:
# Prompt for job-description parsing, written with <|im_start|>/<|im_end|> chat markers.
# It is filled with str.format, so {schema} and {text} are the only
# placeholders and no other braces may appear in the template itself.
# NOTE(review): the "remote_option" rule asks for "yes"/"no"/"" while
# JobDescriptionSchema declares that field as Optional[bool] — confirm.
JD_PROMPT_TEMPLATE = """
<|im_start|>system
You are a strict job description to JSON converter. Your job is to extract structured JSON from job descriptions using the exact schema and rules below.

- Use only the content present in the input text.
- Do NOT hallucinate, guess, or paraphrase.
- Extract all values VERBATIM where possible, especially for "required_experience_years", "preferred_degrees", "soft_skills", and any bullet-style lists.
- For missing values, use "" for strings, [] for lists.
- Populate "inferred_domain" based on the content (e.g., marketing, software, healthcare).
- Extract soft skills into "soft_skills", NOT "required_skills" (e.g., communication, detail-oriented, go-getter mindset).
- Extract tools and platforms used (e.g., Microsoft Office, Adobe Creative Suite, Facebook, Salesforce) into "tools_and_technologies".
- Extract job responsibilities (duties, tasks, outcomes) as a list of bullet points or descriptive phrases into "job_responsibilities".
- Do NOT duplicate data across fields. For example, if a responsibility contains a tool, extract the tool separately into "tools_and_technologies".
- For remote roles, set "remote_option" to: "yes" if explicitly remote, "no" if clearly in-office, "" if not mentioned.
- Extract "travel_requirements", "physical_requirements", and "benefits" from job postings if mentioned.
- Extract "company_information" from About section and "equal_opportunity_policy" from EEO or DEI disclosures.
- Do not use role titles in "required_experience_years". Extract years explicitly.
- Ensure no placeholder symbols like "," are included in any fields.
- Move technical proficiencies (e.g., "Proficient in Excel") to "tools_and_technologies", not "optional_skills".
- Parse short company descriptions from the "About" section into "company_information".
- If any remaining information doesn’t fit the schema, include it under "other" with "section_name" and "content".
- Output MUST be a single valid JSON object — no explanations, no markdown, no extra formatting.
<|im_end|>

<|im_start|>user
Extract the following structured JSON from this job description.

Schema:
{schema}

Job Description:
<<<
{text}
>>>
<|im_end|>

<|im_start|>assistant
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [None]:
def generate_llm_output(prompt: str, max_new_tokens: int = 4096) -> str:
    """Run the shared `llm_pipeline` on `prompt` and return its generated text.

    Generation is greedy (`do_sample=False`; the sampling parameters are
    passed as None). Token counts are printed before and after generation.

    Args:
        prompt: Full prompt text sent to the pipeline.
        max_new_tokens: Upper bound on newly generated tokens.

    Returns:
        `outputs[0]["generated_text"]` exactly as returned by the pipeline.

    Raises:
        RuntimeError: wrapping any exception raised during tokenization or
            generation; the original exception is attached as `__cause__`.
    """
    try:
        tokenizer = llm_pipeline.tokenizer

        # 🔢 Print input token count
        prompt_len = len(tokenizer.encode(prompt, add_special_tokens=False))
        print(f"🧮 Prompt token count: {prompt_len} | Max new tokens: {max_new_tokens} | Estimated total: {prompt_len + max_new_tokens}")

        # 🔁 Generate
        outputs = llm_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False, temperature=None, top_p=None, top_k=None)
        generated_text = outputs[0]["generated_text"]

        # The text-generation pipeline returns prompt + completion unless
        # return_full_text=False is passed, so the raw count includes the
        # prompt; the difference approximates the newly generated tokens.
        total_len = len(tokenizer.encode(generated_text, add_special_tokens=False))
        new_len = max(total_len - prompt_len, 0)
        print(f"📝 Output token count: {total_len} (≈{new_len} newly generated)")

        return generated_text

    except Exception as e:
        raise RuntimeError(f"LLM generation failed: {e}") from e


### Sanitize Output: Strip Prompt, Fix Cutoffs

In [None]:
def sanitize_llm_output(response: str, prompt: str) -> str:
    """Strip the echoed prompt and trailing noise from a raw LLM response.

    Steps, in order: remove every occurrence of `prompt` and trim whitespace;
    cut everything after the last closing brace; remove a `---`, `•`, `–` or
    `-` marker (plus trailing whitespace) at the end of any line.
    """
    without_prompt = response.replace(prompt, "").strip()

    # Keep text up to and including the final '}'.
    up_to_last_brace = re.sub(r'}[^}]*$', '}', without_prompt)

    # MULTILINE: the marker is removed at the end of every line, not only the last.
    return re.sub(r'(---|•|–|-)\s*$', '', up_to_last_brace, flags=re.MULTILINE)


### Regex-based JSON Block Extractor and raw data processor

In [None]:
import regex
import json5

def extract_json_block(text: str) -> dict:
    """
    Extracts the first valid JSON object from a text using the `regex` module and parses with `json5`.
    This is more robust than standard `json` and can handle trailing commas, comments, etc.

    Candidates are the brace-balanced `{...}` spans found left to right; the
    first one that `json5` accepts is returned.

    Raises:
        ValueError: if no candidate span parses.
    """
    # Recursive regex pattern to find balanced curly braces
    pattern = r'(\{(?:[^{}]|(?R))*\})'

    for match in regex.finditer(pattern, text, flags=regex.DOTALL):
        json_candidate = match.group(1)
        try:
            return json5.loads(json_candidate)
        except ValueError:
            # json5 reports malformed input as ValueError; the package does not
            # expose a `JSONDecodeError` attribute, so naming one in the except
            # clause would itself fail with AttributeError.
            continue

    raise ValueError("❌ No valid JSON object found using regex and json5.")


In [None]:
def clean_raw_data(raw: str) -> str:
    """Extract a JSON object from raw LLM text and return it as a valid JSON string.

    The text between the first '{' and the last '}' is taken as the
    candidate. It is accepted as-is when it already parses; otherwise repairs
    are applied in increasingly invasive stages and the first stage that
    parses with the standard `json` module wins:

    1. structural fixes (trailing commas, doubled commas, "::", missing commas);
    2. typographic quotes replaced by straight double quotes.

    If nothing parses strictly, the fully repaired text is parsed with `json5`
    and re-serialized.

    Raises:
        ValueError: if no '{' ... '}' span can be located, or if `json5` also
            rejects the repaired text.
    """
    import json
    import re  # stdlib; every pattern below is plain `re` syntax

    raw = raw.strip().strip("`")

    # Remove code fences
    raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.IGNORECASE)

    # Extract content between first '{' and last '}'
    start_idx = raw.find('{')
    end_idx = raw.rfind('}')
    if start_idx == -1 or end_idx == -1 or end_idx <= start_idx:
        raise ValueError("Cannot locate complete JSON object in the output")

    json_body = raw[start_idx:end_idx + 1]

    # Stage 1: fix common structural issues
    structural = re.sub(r",\s*([\]}])", r"\1", json_body)          # remove trailing commas
    structural = re.sub(r",\s*,", ",", structural)                 # remove double commas
    structural = re.sub(r'"\s*:\s*:', '":', structural)            # fix "::"
    structural = re.sub(r'("\s*:[^,}\]]+")\s*(")', r'\1,\2', structural)  # fix missing commas

    # Stage 2 (last resort): force straight quotes. Applied only after the
    # earlier candidates fail, because curly quotes inside string values are
    # legal JSON and replacing them there would break a valid document.
    requoted = re.sub(r"‘|’", '"', structural)
    requoted = re.sub(r"“|”", '"', requoted)

    last_error = None
    for candidate in (json_body, structural, requoted):
        try:
            json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_error = exc
        else:
            return candidate

    # ✅ Strict parsing failed everywhere: fall back to the lenient json5 parser
    print(f"⚠️ Standard JSON decode failed: {last_error} — using json5 as fallback.")
    import json5
    parsed = json5.loads(requoted)
    return json.dumps(parsed, ensure_ascii=False, indent=2)  # return as clean valid JSON


### Format Resume string for LLM processing

In [None]:
import regex as re

# Regex fragments for resume section titles (experience, education, skills, ...).
# is_known_section_header embeds each fragment between `^\s*` and
# `\s*[:\-]?\s*$` and matches case-insensitively, so a fragment has to cover
# the whole header line apart from an optional trailing ':' or '-'.
CANONICAL_HEADER_PATTERNS = [
    r'\b(?:work|professional)?\s*experience\b',
    r'\b(?:education(?:\s+(?:and|&)\s+training)?|training(?:\s+(?:and|&)\s+(?:development|certifications?|programs?)|s)?|specialized\s+training)\b',
    r'\b(?:technical\s+)?skills?\b',
    r'\b(?:certifications?|licenses?|certifications?\s*(?:and|&)\s*licenses?)\b',
    r'\b(?:projects?|key\s+projects|project\s+highlights)\b',
    r'\blanguages?\b',
    r'\b(?:executive\s+)?summary\b',
    r'\b(?:professional\s+)?affiliations?\b',
    r'\b(?:awards?|accomplishments?|honors|achievements)\b',
    r'\bpublications?\b',
    r'\b(?:interests|hobbies|extracurricular\s+activities)\b',
    r'\b(?:volunteer\s+experience|community\s+involvement|volunteering)\b',
    r'\bobjective\b',
    r'\breferences\b',
    r'\bprofile\b'
]



def is_known_section_header(line):
    """Return True when the whole line is one of the canonical section titles.

    The stripped line has to match a pattern from CANONICAL_HEADER_PATTERNS
    in full (case-insensitive), optionally followed by ':' or '-'.
    """
    candidate = line.strip()
    for header_pattern in CANONICAL_HEADER_PATTERNS:
        if re.search(rf'^\s*{header_pattern}\s*[:\-]?\s*$', candidate, re.IGNORECASE):
            return True
    return False


def is_date_like(text):
    """Return True for a bare date: 'M/YYYY' or 'MM/YYYY', '<3-9 letters> YYYY', or 'YYYY'."""
    candidate = text.strip()
    return re.match(r'(?i)^\d{1,2}/\d{4}$|^[A-Za-z]{3,9} \d{4}$|^\d{4}$', candidate) is not None

def is_location_like(text):
    """Return True for 'City', 'State', or one capitalized word of 2-16 letters."""
    candidate = text.strip()
    return re.fullmatch(r'(City|State|[A-Z][a-z]{1,15})', candidate) is not None

def is_degree_fragment(text):
    """Return True when the stripped text is 2-5 ASCII letters (e.g. BBA, MSc)."""
    candidate = text.strip()
    return re.fullmatch(r'[A-Za-z]{2,5}', candidate) is not None  # BBA, MSc, etc.


def detect_real_headers(lines):
    """Return `(index, stripped_line)` for every line that is a canonical section header.

    Blank lines are ignored. Only canonical matches
    (`is_known_section_header`) are reported. The date/location/degree filter
    that used to follow the canonical check was removed: it sat at the end of
    the loop body and could never change the result.
    """
    section_lines = []

    for i, line in enumerate(lines):
        stripped = line.strip()
        # ✅ Canonical match only; empty lines short-circuit before the check
        if stripped and is_known_section_header(stripped):
            section_lines.append((i, stripped))

    return section_lines


In [None]:
import regex as re

def preprocess_resume_text(text: str) -> str:
    """Normalize raw resume text into one logical item per line.

    Un-escapes literal "\\n", "\\t" and "\\/" sequences, tightens spaced
    date slashes, isolates "MM/YYYY to MM/YYYY|current|present" ranges on
    their own lines, turns runs of 3+ spaces into line breaks, collapses
    repeated newlines and spaces, and surrounds known section headers with
    blank lines.
    """
    # Normalize escaped newlines and tabs into actual characters
    text = text.replace('\\n', '\n').replace('\\t', '\t')

    # Un-escape slashes and flatten tabs (this step used to be written twice)
    text = text.replace('\\/', '/').replace('\t', ' ')

    # ✅ Normalize: fix spaced slashes in dates (e.g. '08 / 2014' → '08/2014')
    text = re.sub(r'(?<=\d)\s*/\s*(?=\d{4})', '/', text)

    # ✅ Protect date ranges by marking them before splitting on 3+ spaces
    text = re.sub(
        r'(?i)(\d{1,2}/\d{4})\s*(to|-)\s*(\d{1,2}/\d{4}|current|present)',
        r'__DATERANGE__\1 to \3__ENDDATE__',
        text
    )

    # ✅ Collapse "Company — City, State" formatting
    text = re.sub(
        r'(?i)([A-Za-z0-9&.,()\-\' ]{2,})\s*[-－–—]{1,2}\s*([A-Z][a-z]+)\s*,\s*([A-Z][a-z]+)',
        r'\1 — \2 , \3',
        text
    )

    # ✅ Convert 3+ spaces into newlines
    text = re.sub(r' {3,}', '\n', text)

    # Normalize excessive newlines and spaces
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # ✅ Restore protected date ranges: each marker becomes a line break, so
    # the range ends up on a line of its own
    text = text.replace('__DATERANGE__', '\n').replace('__ENDDATE__', '\n')

    # Add breaks around known section headers
    def insert_header_breaks(line):
        stripped = line.strip()
        if is_known_section_header(stripped):
            return f"\n{stripped}\n"
        return stripped

    text = "\n".join(insert_header_breaks(line) for line in text.splitlines())

    return text.strip()


In [None]:
import regex as re

def split_run_on_sentences(text):
    """Return the stripped lines of `text`, with long multi-sentence lines split up.

    A line is split only when it has at least 50 characters, does not start
    with '-', and contains two or more periods. The split happens at a space
    that follows a period and precedes a capital letter, so each sentence
    keeps its period.
    """
    result = []

    for raw_line in text.splitlines():
        line = raw_line.strip()

        splittable = (
            bool(line)
            and not line.startswith('-')
            and len(line) >= 50
            and line.count('.') >= 2
        )
        if not splittable:
            result.append(line)
            continue

        result.extend(piece.strip() for piece in re.split(r'(?<=\.) (?=[A-Z])', line))

    return result


In [None]:
def format_resume_for_llm(text: str) -> str:
    """Turn raw resume text into the cleaned, sectioned layout used in the prompt.

    The text is preprocessed, long lines are split into sentences, canonical
    section headers are upper-cased and surrounded by blank lines, standalone
    date ranges get a blank line in front, purely numeric lines are dropped,
    and the first content line of a SKILLS section is rewritten as one bullet
    per comma-separated item.
    """
    text = preprocess_resume_text(text)

    # Sentence splitting (works on the text directly; returns a list of lines)
    lines = split_run_on_sentences(text)

    headers = detect_real_headers(lines)
    print("Detected Headers:", headers)

    header_indices = {idx for idx, _ in headers}

    output_lines = []
    for i, line in enumerate(lines):
        stripped = line.strip()

        if not stripped:
            continue

        # Skip lines that are isolated numbers (e.g. caused by bad splitting)
        if stripped.isdigit():
            continue

        # Insert a line break before date ranges (if not already done)
        if re.match(r'(?i)^\d{1,2}/\d{4} to (\d{1,2}/\d{4}|current|present)$', stripped):
            output_lines.append("")  # add a break before
            output_lines.append(stripped)
            continue

        # Header detection
        if i in header_indices:
            output_lines.append("")  # add a blank line before header
            output_lines.append(stripped.upper())
            output_lines.append("")  # and after
        else:
            output_lines.append(stripped)

    formatted = "\n".join(output_lines)

    # Normalize inline date ranges (if still embedded)
    formatted = re.sub(
        r'(?i)(\d{1,2}/\d{4})\s*(to|-)\s*(\d{1,2}/\d{4}|current|present)',
        r'\1 to \3',
        formatted
    )

    # Format SKILLS section as bullets. A blank line is always emitted after a
    # header, so the pattern must allow one or more newlines after "SKILLS";
    # with exactly one newline it could never match.
    formatted = re.sub(
        r'(?i)(\nSKILLS\n+)([^\n]+)',
        lambda m: m.group(1) + "\n".join(
            f"- {s.strip()}" for s in m.group(2).split(',') if s.strip()
        ),
        formatted
    )

    return formatted.strip()


### Format Job Description for LLM processing

In [None]:
def format_jd_for_llm(jd: dict) -> str:
    """Render a job-description record as labelled text blocks for the prompt.

    Reads "title", "location", "formatted_work_type", "skills_desc",
    "description" and "formatted_experience_level" from `jd`. Missing keys
    and explicit None values (JSON null) both render as an empty string, so
    the output never contains the literal text "None" and a null description
    cannot crash `.strip()`.

    Returns:
        The six labelled parts joined by blank lines.
    """
    def field(key: str) -> str:
        value = jd.get(key)
        return "" if value is None else str(value)

    # Extract basic fields with defaults
    title = field("title")
    description = field("description").strip()
    location = field("location")
    work_type = field("formatted_work_type")
    skills_desc = field("skills_desc")
    experience_level = field("formatted_experience_level")

    # Build human-readable prompt string for LLM
    prompt_parts = [
        f"Job Title: {title}",
        f"Location: {location}",
        f"Employment Type: {work_type}",
        f"Listed Skills: {skills_desc}",
        f"Description:\n{description}",
        f"Experience Level (if available): {experience_level}",
    ]

    return "\n\n".join(prompt_parts)


### Final Orchestrator: Fault-Tolerant Extraction

#### inject ID in parsed json

In [None]:
import uuid
from typing import Dict, Optional, Type

def inject_ids(parsed: Dict, schema_model: Optional[Type], record_id: str) -> Dict:
    """
    Store `record_id` in `parsed` under the ID key that belongs to the schema.

    The key is chosen from the lower-cased class name of `schema_model`:
    names starting with "resume" get `resume_id`, names starting with
    "jobdescription" get `jd_id`, any other name adds nothing. Without a
    schema model a warning is printed. `parsed` is modified in place and returned.
    """
    if not schema_model:
        print("⚠️ No schema model provided for ID injection.")
        return parsed

    lowered_name = schema_model.__name__.lower()
    for prefix, id_key in (("resume", "resume_id"), ("jobdescription", "jd_id")):
        if lowered_name.startswith(prefix):
            parsed[id_key] = record_id
            break
    return parsed


#### Inject total experience in parsed resume

In [None]:
from dateutil import parser
from datetime import datetime
from typing import Dict, List, Tuple, Set, Optional

def parse_date(date_str: str) -> Optional[datetime]:
    """Parse a date string into a datetime.

    Empty/None input and the markers "present", "current" and "till date"
    (any case, surrounding whitespace ignored) return today's date. Missing
    date components default to those of 2000-01-01. Non-string values such
    as a numeric year are converted to text first instead of raising.

    Returns:
        The parsed datetime, or None when the value cannot be parsed.
    """
    if not date_str:
        return datetime.today()

    # An LLM may emit a bare number (e.g. 2015) instead of a string.
    text = str(date_str).strip()
    if text.lower() in {"present", "current", "till date"}:
        return datetime.today()

    try:
        return parser.parse(text, default=datetime(2000, 1, 1))
    except (ValueError, TypeError, OverflowError):
        # dateutil documents OverflowError for values beyond the platform's integer range.
        return None

def get_months_between(start: datetime, end: datetime) -> List[Tuple[int, int]]:
    """List every (year, month) pair from start's month through end's month, inclusive.

    Days are ignored. The list is empty when `end` falls in an earlier month
    than `start`.
    """
    # Number the months consecutively (year * 12 + zero-based month) so the
    # range can be walked with plain integers.
    first_index = start.year * 12 + (start.month - 1)
    last_index = end.year * 12 + (end.month - 1)
    return [(index // 12, index % 12 + 1) for index in range(first_index, last_index + 1)]

def calculate_total_experience_years(resume: Dict) -> float:
    """Return the total work experience in years, rounded to one decimal.

    Every entry of `resume["experience"]` contributes the calendar months
    between its start and end date (inclusive). Months are collected in a
    set, so overlapping jobs are counted once.

    Entries are skipped when they are not dicts, have no start date, have a
    date that cannot be parsed, or start after they end. A missing or null
    "experience" value yields 0.0.
    """
    unique_months: Set[Tuple[int, int]] = set()
    # `or []` also covers an explicit null ("experience": None).
    experiences = resume.get("experience") or []

    for exp in experiences:
        if not isinstance(exp, dict):
            continue  # ❌ skip malformed entries

        start_str = exp.get("start_date")
        if not start_str:
            # parse_date maps an empty value to "today"; without a real start
            # date the entry would add a phantom month, so skip it.
            continue
        end_str = exp.get("end_date", "")

        start_date = parse_date(start_str)
        end_date = parse_date(end_str)

        if start_date is None or end_date is None:
            continue  # ❌ skip invalid/malformed dates
        if start_date > end_date:
            continue  # ❌ skip logically incorrect ranges

        unique_months.update(get_months_between(start_date, end_date))

    return round(len(unique_months) / 12.0, 1)


#### Inject and reorder fields

In [None]:
from collections import OrderedDict
from typing import Dict, Optional

def inject_and_reorder_top_fields(parsed: Dict, is_resume: Optional[bool] = None) -> Dict:
    """
    Return an OrderedDict copy of `parsed` with the ID-related keys first.

    - Resumes (`is_resume` truthy): `resume_id`, then a freshly computed
      `total_experience_years`, then the remaining keys in original order.
    - Otherwise (including `is_resume=None`): `jd_id`, then the remaining keys.

    A missing ID is emitted as an empty string.
    """
    if is_resume:
        leading = [
            ("resume_id", parsed.get("resume_id", "")),
            ("total_experience_years", calculate_total_experience_years(parsed)),
        ]
    else:
        leading = [("jd_id", parsed.get("jd_id", ""))]

    pinned_keys = {key for key, _ in leading}
    ordered = OrderedDict(leading)
    for key, value in parsed.items():
        if key not in pinned_keys:
            ordered[key] = value
    return ordered


#### Extract JSON from LLM

In [None]:

from typing import Union

def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Union[None, type] = None,
    max_new_tokens: int = 4096,
    retries: int = 0,
    record_id: str = "",
    is_resume: bool = None
) -> dict:
    """
    Runs the LLM on `text` and returns the extracted JSON as a dict.

    Flow:
      1. Build the prompt from `prompt_template` (placeholders `{text}` and
         `{schema}`) and the schema string for `schema_model`.
      2. Up to `retries + 1` times: generate, strip the prompt echo, reject
         outputs that look like an instruction echo, clean the JSON text,
         parse it, inject the record ID and reorder the top-level fields.
      3. If every attempt fails, try a brace-matching extraction on the last
         raw output.

    Args:
        text: Preformatted resume or job-description text.
        prompt_template: Template formatted with `text` and `schema`.
        schema_model: Model class used for the schema string and ID injection.
        max_new_tokens: Generation budget passed to `generate_llm_output`.
        retries: Number of additional attempts after the first one.
        record_id: ID stored in the result by `inject_ids`.
        is_resume: Selects resume vs. job-description field ordering.

    Returns:
        The parsed and post-processed dict on success. When the fallback also
        fails, a dict with the keys "raw_output" and "error" is returned;
        this function does not return None on any visible path.
    """
    schema_str = get_schema_str(schema_model)
    prompt = prompt_template.format(text=text, schema=schema_str)
    raw_output = ""  # keeps the last sanitized output for logging and the fallback
    attempt = 0

    while attempt <= retries:
        try:
            # Step 1: Get LLM output
            #print(f"\n prompt: \n {prompt} \n")
            response = generate_llm_output(prompt, max_new_tokens)
            #print("🧪 LLM output was:\n", response)  # Preview first 300 chars
            raw_output = sanitize_llm_output(response, prompt)


            # Step 2: Detect schema echo or instruction echo
            # NOTE(review): neither marker string occurs in the prompt
            # templates visible in this file — confirm they are still relevant.
            if "$schema" in raw_output or "Ensure these rules" in raw_output:
                raise ValueError("LLM echoed schema or instruction block instead of generating JSON.")

            # Step 3: Clean the text from the first '{' onwards and parse it
            json_start = raw_output.find("{")
            if json_start == -1:
                raise ValueError("No opening '{' found in LLM output.")

            cleaned_output = raw_output[json_start:]
            cleaned_output = clean_raw_data(cleaned_output)

            #print("🧪 Cleaned output to parse:\n", cleaned_output)

            # Step 4: Parse, then add the ID and derived/reordered top-level fields
            parsed = json.loads(cleaned_output)
            parsed = inject_ids(parsed, schema_model, record_id)
            parsed = inject_and_reorder_top_fields(parsed, is_resume)

            #print("\n🧪 Parsed output was:\n", parsed) 

            return parsed

        except Exception as e:
            # Any failure in the steps above (generation, echo check, cleaning,
            # parsing, post-processing) counts as one failed attempt.
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")
            print("🧪 Raw output was:\n", raw_output) 
            attempt += 1

    # Step 5: Fallback using brace matching on the last raw output
    try:
        parsed = extract_json_block(raw_output)
        parsed = inject_ids(parsed, schema_model, record_id)
        parsed = inject_and_reorder_top_fields(parsed, is_resume)
        print("🧪 Fallback parsed output was:\n", parsed)  
        return parsed
    except Exception as e:
        # Total failure is reported as data, not raised.
        return {
            "raw_output": raw_output.strip(),
            "error": f"Regex fallback failed: {e}"
        }

##  Normalize in Batches

In [None]:
def save_metadata_summary(
    output_dir: Path,
    is_resume: bool,
    input_file: str,
    total_records: int,
    total_valid: int,
    total_invalid: int,
    start_index: int,
    end_index: int,
    timestamp: str,
    batch_id: str
):
    """Write a JSON summary describing one normalization run.

    The summary is stored in ``output_dir`` under the name
    ``meta_<resumes|jds>_<start_index>_<end_index>_<timestamp>_<batch_id>.json``
    and is written through ``save_json_output`` with ``overwrite=True``.

    Args:
        output_dir: Directory that receives the summary file.
        is_resume: True for a resume run, False for a job-description run.
        input_file: Name of the input file that was processed.
        total_records: Number of records handled in this run.
        total_valid: Number of records reported as valid.
        total_invalid: Number of records reported as invalid.
        start_index: Index of the first record of the run.
        end_index: Index recorded as the end of the run.
        timestamp: Timestamp string identifying the run.
        batch_id: Short identifier of the run.
    """
    # Both labels depend only on the record type, so resolve them together.
    if is_resume:
        input_type, file_tag = "resume", "resumes"
    else:
        input_type, file_tag = "job_description", "jds"

    # Key order is kept stable because it is the order written to the file.
    run_summary = dict(
        batch_id=batch_id,
        timestamp=timestamp,
        input_file=input_file,
        input_type=input_type,
        records_start_index=start_index,
        records_end_index=end_index,
        records_total=total_records,
        records_valid=total_valid,
        records_invalid=total_invalid,
        output_dir=str(output_dir),
    )

    meta_name = f"meta_{file_tag}_{start_index}_{end_index}_{timestamp}_{batch_id}.json"
    save_json_output(run_summary, str(output_dir / meta_name), overwrite=True)


In [None]:
import uuid
from datetime import datetime
from pathlib import Path

def normalize_record(
    record: dict,
    is_resume: bool,
    output_dir: Path,
    prompt_template,
    schema_model
):
    """Normalize one raw record with the LLM and save the result as JSON.

    The record is turned into prompt text, passed to
    ``extract_structured_json`` and the outcome is written to ``output_dir``
    as a one-element list in a file named
    ``<resumes|jds>_<valid|invalid>_<record_id>_<timestamp>.json``.

    Args:
        record: Raw input record. Resumes read the ``"Resume_str"``, ``"ID"``
            and ``"Category"`` keys; job descriptions are passed whole to
            ``format_jd_for_llm`` and read the ``"job_id"`` key.
        is_resume: True to treat ``record`` as a resume, False as a job
            description.
        output_dir: Directory that receives the per-record output file.
        prompt_template: Prompt template forwarded to
            ``extract_structured_json``.
        schema_model: Schema model forwarded to ``extract_structured_json``.

    Returns:
        ``"valid"`` when extraction produced a result without an ``"error"``
        or ``"raw_output"`` key, otherwise ``"invalid"``.
    """
    if is_resume:
        raw_resume_text = record.get("Resume_str", "")
        text = format_resume_for_llm(raw_resume_text)
        # Resumes get a larger generation budget than job descriptions.
        max_new_tokens = 4096
        record_id = record.get("ID", str(uuid.uuid4()))
    else:
        text = format_jd_for_llm(record)
        max_new_tokens = 2048
        record_id = record.get("job_id", str(uuid.uuid4()))

    parsed = extract_structured_json(
        text=text,
        prompt_template=prompt_template,
        schema_model=schema_model,
        max_new_tokens=max_new_tokens,
        record_id=record_id,
        is_resume=is_resume
    )

    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    prefix = "resumes" if is_resume else "jds"

    if parsed is None:
        # Complete failure: nothing usable came back from extraction.
        output_type = "invalid"
        output_data = {
            "record_id": record_id,
            "domain": "unknown",
            "input_text": text,
            "output_json": None,
            "error": "LLM and fallback extraction both failed"
        }
    else:
        # Resumes carry their domain in the input record; for job
        # descriptions it is read from the extracted output.
        if is_resume:
            domain = record.get("Category", "unknown")
        else:
            domain = parsed.get("inferred_domain", "unknown")

        # An "error" or "raw_output" key marks the failure payload returned
        # when JSON extraction did not succeed.
        is_failure = "error" in parsed or "raw_output" in parsed
        output_type = "invalid" if is_failure else "valid"
        output_data = {
            "record_id": record_id,
            "domain": domain,
            "input_text": text,
            "output_json": parsed
        }

    # Use one naming scheme for every outcome, with the status directly after
    # the prefix. Previously the normal path put the record id first
    # ("jds_<id>_valid_..."), so its files did not start with "jds_valid" /
    # "resumes_valid" as the merge step's patterns ("jds_valid*.json",
    # "resumes_valid*.json") expect.
    output_filename = f"{prefix}_{output_type}_{record_id}_{timestamp}.json"
    save_json_output([output_data], output_path=output_dir / output_filename)
    return output_type


In [None]:
from tqdm import tqdm
import os
from datetime import datetime
import uuid
from pathlib import Path

def normalize_file_in_batches(
    input_filename: str,
    output_dir: Path,
    is_resume: bool = True,
    input_dir: Path = Path("json_outputs"),
    limit: int = None
):
    """Normalize the records of one input file, checkpointing after each one.

    Records are loaded with ``load_ndjson_file`` from
    ``input_dir / input_filename``. If a checkpoint file for this record type
    exists in ``output_dir``, processing starts at its ``"last_index"``
    value. Every record is handed to ``normalize_record``; after each one the
    checkpoint is rewritten, and a metadata summary is saved at the end.

    Args:
        input_filename: Name of the input file inside ``input_dir``.
        output_dir: Directory for per-record outputs, the checkpoint file and
            the run summary. Created if missing.
        is_resume: True to process resumes, False to process job
            descriptions.
        input_dir: Directory containing ``input_filename``.
        limit: Maximum number of not-yet-processed records to handle in this
            call; ``None`` processes all remaining records.
    """
    records = load_ndjson_file(input_dir / input_filename)

    kind = "resumes" if is_resume else "jds"
    checkpoint_file = output_dir / f"checkpoint_{kind}.json"

    # Continue after the last record a previous run completed, if any.
    start_index = 0
    if checkpoint_file.exists():
        start_index = load_json_file(checkpoint_file).get("last_index", 0)
        print(f"🔁 Resuming from index {start_index}")

    pending = records[start_index:]
    if limit is not None:
        pending = pending[:limit]

    if is_resume:
        prompt_template, schema_model = RESUME_PROMPT_TEMPLATE, ResumeSchema
    else:
        prompt_template, schema_model = JD_PROMPT_TEMPLATE, JobDescriptionSchema

    os.makedirs(output_dir, exist_ok=True)

    tally = {"valid": 0, "invalid": 0}
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]

    progress = tqdm(pending, desc="🔄 Normalizing records")
    # Enumerate from start_index so the counter is the record's position in
    # the full input file rather than in this run's slice.
    for absolute_idx, record in enumerate(progress, start=start_index):
        outcome = normalize_record(
            record=record,
            is_resume=is_resume,
            output_dir=output_dir,
            prompt_template=prompt_template,
            schema_model=schema_model
        )
        # Anything other than "valid" is counted as invalid.
        tally["valid" if outcome == "valid" else "invalid"] += 1

        # The checkpoint stores the index of the next record to process.
        save_json_output({"last_index": absolute_idx + 1}, str(checkpoint_file), overwrite=True)

    processed_count = len(pending)
    save_metadata_summary(
        output_dir=output_dir,
        is_resume=is_resume,
        input_file=input_filename,
        total_records=processed_count,
        total_valid=tally["valid"],
        total_invalid=tally["invalid"],
        start_index=start_index,
        end_index=start_index + processed_count,
        timestamp=timestamp,
        batch_id=batch_id
    )


## Run Phase 2 End-to-End

In [None]:
# Trial run on job descriptions: `limit` caps how many not-yet-processed
# records are handled, starting from the checkpointed index if a checkpoint
# file already exists in the output directory.
normalize_file_in_batches(
    input_filename="parsed_jds.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=False,
    limit=1  # ✅ Process at most 1 record in this run
)


In [None]:
# Trial run on resumes: `limit` caps how many not-yet-processed records are
# handled, starting from the checkpointed index if a checkpoint file already
# exists in the output directory.
normalize_file_in_batches(
    input_filename="parsed_resumes.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=True,
    limit=1  # ✅ Process at most 1 record in this run
)


In [None]:
# Full run over job descriptions: no `limit`, so every record from the
# checkpointed index (or 0 when there is no checkpoint) onward is processed.
normalize_file_in_batches(
    input_filename="parsed_jds.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=False,
)

# Full run over resumes, with the same resume-from-checkpoint behavior.
normalize_file_in_batches(
    input_filename="parsed_resumes.json",
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR),
    is_resume=True,
)

## Merge normalized files

In [None]:
# Paths: the directory holding the per-record normalized files, and a
# "merged" subdirectory that is handed to merge_json_files as `merged_dir`.
normalized_dir = Path(Config.JSON_OUTPUT_NORMALIZED_DIR)
merged_dir = normalized_dir / "merged"

# Merge job-description outputs into normalized_jds.json.
# NOTE(review): presumably `pattern` is a glob applied inside `source_dir`, in
# which case only files whose names start with "jds_valid" are picked up —
# confirm this matches the filenames written by normalize_record.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_jds.json",
    pattern="jds_valid*.json",
    merged_dir=merged_dir
)

# Merge resume outputs into normalized_resumes.json.
# NOTE(review): same assumption as above for names starting with
# "resumes_valid" — confirm against normalize_record's filenames.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_resumes.json",
    pattern="resumes_valid*.json",
    merged_dir=merged_dir
)
