# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Setup and Install Dependencies
2. Load Resume and JD datasets
3. Minimal Parsing into JSON Structure
4. Save structured JSON for Phase 2

## Setup and Install Dependencies

In [None]:
# Install the Phase 1 dependencies: the Kaggle CLI, the kagglehub client, and pandas.
%pip install kaggle kagglehub pandas


## Utility Classes and Methods

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
import os
import shutil
import zipfile
import pandas as pd
from pathlib import Path
from typing import List

class Config:
    """Static settings and credential bootstrap shared by the ingestion helpers."""

    # Folder that receives downloaded archives and their extracted contents.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Folder that receives the JSON exports.
    JSON_OUTPUT_DIR = "json_outputs"
    # Flag read by the main flow to decide whether downloaded artifacts are removed.
    AUTO_CLEANUP = True

    @staticmethod
    def setup_kaggle_credentials():
        """Ensure ``~/.kaggle/kaggle.json`` exists, asking for an upload if it does not.

        When the file is missing, the Colab upload dialog is opened, every
        uploaded file is moved onto the credentials path, and the file mode is
        restricted to ``0o600``.
        """
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')

        if os.path.exists(kaggle_path):
            print(f"✅ Kaggle credentials already exist at {kaggle_path}")
            return

        # Imported lazily: the module is only needed on the upload path.
        from google.colab import files
        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()

        credentials_dir = os.path.dirname(kaggle_path)
        os.makedirs(credentials_dir, exist_ok=True)
        for uploaded_name in uploaded:
            shutil.move(uploaded_name, kaggle_path)

        os.chmod(kaggle_path, 0o600)
        print(f"✅ Kaggle credentials setup at {kaggle_path}")



### Downloader 

In [None]:

# ==============================
# DOWNLOADER
# ==============================
class DatasetDownloader:
    """Downloads a Kaggle dataset archive with the Kaggle CLI and unpacks it locally."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` (``owner/slug``) and extract its zip archive.

        The download is skipped when the extraction folder already exists and
        contains at least one ``*.csv`` file.

        Args:
            dataset_path: Kaggle dataset handle; the part after the last ``/``
                names both the zip file and the extraction folder.

        Returns:
            ``(extract_folder_path, zip_filename)``.

        Raises:
            FileNotFoundError: if the expected zip file is missing after the
                download step (or the ``kaggle`` executable cannot be found).
        """
        # Local import so this cell does not depend on a later cell's imports.
        import subprocess

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) keeps this valid Python outside IPython and
        # prevents `dataset_path` from being interpreted by a shell.
        completed = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR],
            capture_output=True,
            text=True,
            check=False,
        )
        # Re-emit the CLI output so it still shows up in the notebook cell.
        for stream in (completed.stdout, completed.stderr):
            if stream and stream.strip():
                print(stream.strip())

        if not os.path.exists(zip_path):
            raise FileNotFoundError(
                f"❌ Zip file '{zip_filename}' not found after download! "
                f"(kaggle exit code: {completed.returncode})"
            )

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [None]:

# ==============================
# LOADER
# ==============================
class DatasetLoader:
    """Finds a CSV by file name inside a dataset folder and reads it with pandas."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Return the first CSV under ``dataset_folder`` whose name matches.

        The folder tree is walked with ``os.walk`` and file names are compared
        case-insensitively; the first hit in walk order is loaded.

        Raises:
            FileNotFoundError: if the folder does not exist or no file matches.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        # Lazily yields candidate paths in the same order os.walk visits them.
        candidates = (
            os.path.join(folder, entry)
            for folder, _, entries in os.walk(dataset_folder)
            for entry in entries
            if entry.lower() == target_csv_name.lower()
        )
        csv_path = next(candidates, None)
        if csv_path is None:
            raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")

        frame = pd.read_csv(csv_path)
        print(f"✅ Loaded CSV with shape {frame.shape}")
        return frame



### Data Processor

In [None]:

# ==============================
# PROCESSOR
# ==============================
class DatasetProcessor:
    """Column selection and JSON export helpers for loaded datasets."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the order given.

        Raises:
            ValueError: if any requested field is not a column of ``df``.
        """
        absent = [name for name in allowed_fields if name not in df.columns]
        if absent:
            raise ValueError(f"❌ Fields {absent} not found in dataset!")

        subset = df[allowed_fields]
        kept_columns = list(subset.columns)
        print(f"✅ Filtered columns: {kept_columns}")
        return subset

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` as JSON Lines (one record per line) into the JSON output folder.

        An existing file with the same name is deleted first.
        """
        target_dir = Config.JSON_OUTPUT_DIR
        os.makedirs(target_dir, exist_ok=True)
        output_path = os.path.join(target_dir, output_json_name)

        already_there = os.path.exists(output_path)
        if already_there:
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        # force_ascii=False keeps non-ASCII characters unescaped in the output.
        df.to_json(output_path, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{output_path}'")



### Cleanup

In [None]:


# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Removes the on-disk artifacts left behind by a dataset download."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Delete the extracted folder and the zip archive, when they exist.

        ``zip_filename`` is resolved relative to ``Config.DATASET_DOWNLOAD_DIR``.
        """
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        folder_present = os.path.exists(extracted_folder_path)
        if folder_present:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        archive_present = os.path.exists(zip_path)
        if archive_present:
            os.remove(zip_path)
            print(f"🗑️ Zip file '{zip_path}' has been deleted successfully.")



### Hybrid Data loader

In [None]:


# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Loads a dataset file through KaggleHub first, then through the zip download."""

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Return ``file_name`` from the Kaggle dataset ``dataset_path`` as a DataFrame.

        KaggleHub is tried only when the module imported successfully; any
        exception it raises is printed and the zip-based loader is used instead.
        """
        if kagglehub:
            try:
                print(f"📥 Trying KaggleHub for {dataset_path}...")
                frame = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
                print(f"✅ Loaded using KaggleHub: shape = {frame.shape}")
            except Exception as e:
                print(f"⚠️ KaggleHub failed: {e}\nFalling back to ZIP-based loader.")
            else:
                return frame

        download_result = DatasetDownloader.download_and_extract(dataset_path)
        extracted_folder = download_result[0]
        return DatasetLoader.load_csv(extracted_folder, file_name)



### Main flow

In [None]:

# ==============================
# MAIN FLOW
# ==============================
def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Load one dataset file, keep the requested columns, and export them as JSON.

    After the export, the download artifacts for the dataset are removed when
    ``Config.AUTO_CLEANUP`` is set.
    """
    raw_frame = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(raw_frame, allowed_fields),
        output_json_name,
    )

    if not Config.AUTO_CLEANUP:
        return

    # Artifact names are derived from the dataset slug (the text after the last "/").
    slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )

## Log In and Process the Resume and JD Datasets

In [None]:
# Make sure Kaggle credentials are in place before any dataset is requested.
Config.setup_kaggle_credentials()
# Process Resume Dataset: keep the category label and the raw resume text.
process_dataset(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)

# Process Job Postings Dataset: keep the columns listed in allowed_fields.
process_dataset(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location", "description", "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type"],
    output_json_name="parsed_jds.json"
)

# Phase 2 - Parse Resumes/JDs into a Structured JSON Schema

## Colab + GPU Detection Utilities

In [None]:
import subprocess


def is_running_in_colab():
    """Return True when the ``google.colab`` package can be imported, else False."""
    try:
        import google.colab  # noqa: F401 - imported only to probe availability
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory reported by ``nvidia-smi`` for its first line, in GB.

    Any failure (missing executable, unexpected output, ...) yields ``0.0``.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        # Only the first output line is used.
        first_line = report.strip().split("\n")[0]
        free_mem_mb = int(first_line)
    except Exception:
        return 0.0
    return free_mem_mb / 1024


##  Install Dependencies  & Login to Hugging Face Hub

In [None]:
# Install the Phase 2 dependencies; the package set differs between Colab and other environments.
if is_running_in_colab():
    # Install the required packages (this branch does not install torch itself).
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
else:
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Earlier torch install variants (cu121 wheels), kept for reference:
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Currently active: pre-release torch wheels from the nightly cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # bitsandbytes is installed/upgraded separately on this branch.
    %pip install -U bitsandbytes


In [None]:
from getpass import getpass
from huggingface_hub import login
import os

# Token lookup order:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. the 'HF_TOKEN' Colab secret (when running in Colab),
#   3. a hidden interactive prompt (everywhere else).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass does not echo the secret, so the token does not end up in the
        # notebook output; the prompt is shown once instead of twice.
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# An empty token is passed as None so that `login` falls back to its own
# interactive flow instead of trying to validate an empty string.
login(token=HF_TOKEN or None)


## Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; skipped entirely outside Colab.
if is_running_in_colab():
   from google.colab import drive
   drive.mount('/content/drive')

## Import Libraries

In [None]:
import json, os, uuid, subprocess, torch
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from typing import List
from pydantic import BaseModel, ValidationError
from huggingface_hub import login
import re




## Load Nous-Hermes-2-Mistral-7B-DPO with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load ``model_name`` and wrap it in a ``text-generation`` pipeline.

    The model is placed on GPU 0 when CUDA is available, otherwise on the CPU.
    4-bit NF4 quantization is enabled when CUDA is available and the total
    memory of device 0 is below 24 GB; otherwise the model is loaded in float16.

    Args:
        model_name: Hugging Face model identifier.
        hf_token: access token forwarded to the tokenizer and model loaders.

    Returns:
        The pipeline built with ``batch_size=1``.
    """
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # Total (not currently free) memory of device 0, in GB.
        gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
        device_map = {"": 0}
    else:
        gpu_mem_gb = 0
        device_map = "cpu"
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {gpu_mem_gb:.2f} GB")

    use_4bit = has_cuda and gpu_mem_gb < 24

    # Quantization settings exist only on the 4-bit path.
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Reuse the EOS token for padding (silences the missing pad_token warning).
    tokenizer.pad_token = tokenizer.eos_token

    # An explicit dtype is requested only when no quantization config is used.
    dtype = None if quant_config else torch.float16
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        torch_dtype=dtype,
        trust_remote_code=True,
        token=hf_token
    )

    model_device = next(model.parameters()).device
    print(f"✅ Model loaded on {model_device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Module-level pipeline instance; the generation helpers in later cells read this global.
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


## Define Pydantic Schemas

In [None]:
from typing import List, Dict
from pydantic import BaseModel

class Education(BaseModel):
    """One education entry of a parsed resume; every field is a required string."""

    degree: str
    field: str
    institution: str
    # Kept as a string rather than a number.
    year: str

class Experience(BaseModel):
    """One work-experience entry of a parsed resume; every field is a required string."""

    job_title: str
    company: str
    duration: str
    description: str


In [None]:
class ResumeSchema(BaseModel):
    """Target schema for a parsed resume."""

    basics: Dict
    education: List[Education]
    experience: List[Experience]
    skills: List[str]
    certifications: List[str]
    projects: List[str]

    @classmethod
    def normalize(cls, resume_dict: dict) -> dict:
        """Return a copy of ``resume_dict`` coerced towards this schema.

        * ``basics`` becomes ``{}`` when it is missing or not a dict.
        * The five list sections become ``[]`` when missing or not a list.
        * Experience entries accept ``title`` / ``summary`` as aliases for
          ``job_title`` / ``description``; non-dict entries are dropped.
        * Education entries get ``field`` inferred from a "... in <field>"
          degree when ``field`` is empty; non-dict entries are dropped.
        * Text fields tolerate JSON ``null`` (-> ``""``), numbers (-> ``str``)
          and lists (joined with spaces). Other value types are passed through
          unchanged so that schema validation can still reject them.

        The input dict itself is not modified (shallow copy).
        """
        def as_text(value):
            # LLM output frequently carries null, numeric or list values in
            # fields the schema declares as plain strings.
            if value is None:
                return ""
            if isinstance(value, list):
                return " ".join(str(part) for part in value if part is not None)
            if isinstance(value, (int, float)):
                return str(value)
            return value

        resume_dict = dict(resume_dict)  # safe copy

        if not isinstance(resume_dict.get("basics"), dict):
            resume_dict["basics"] = {}
        for key in ["education", "experience", "skills", "certifications", "projects"]:
            if not isinstance(resume_dict.get(key), list):
                resume_dict[key] = []

        # Normalize experience
        resume_dict["experience"] = [
            {
                "job_title": as_text(item.get("job_title") or item.get("title")),
                "company": as_text(item.get("company")),
                "duration": as_text(item.get("duration")),
                "description": as_text(item.get("description") or item.get("summary")),
            }
            for item in resume_dict["experience"]
            if isinstance(item, dict)
        ]

        # Normalize education
        normalized_edu = []
        for item in resume_dict["education"]:
            if not isinstance(item, dict):
                continue
            degree = as_text(item.get("degree"))
            field = as_text(item.get("field"))
            if not field and isinstance(degree, str):
                # \b keeps "in" from matching inside a word (e.g. "Admin Assistant").
                match = re.search(r"\bin\s+(.+)", degree, flags=re.IGNORECASE)
                field = match.group(1).strip() if match else ""
            normalized_edu.append({
                "degree": degree,
                "field": field,
                "institution": as_text(item.get("institution")),
                "year": as_text(item.get("year")),
            })
        resume_dict["education"] = normalized_edu

        return resume_dict


In [None]:
from typing import Union


class JobDescriptionSchema(BaseModel):
    """Target schema for a parsed job description."""

    title: str
    summary: str
    required_experience_years: Union[float, int] = 0.0
    preferred_degrees: List[str] = []
    required_skills: List[str] = []
    certifications: List[str] = []
    soft_skills: List[str] = []

    @classmethod
    def normalize(cls, jd_dict: dict) -> dict:
        """Return a copy of ``jd_dict`` coerced towards this schema.

        * Known alias keys are renamed when the canonical key is absent.
        * ``required_experience_years`` always ends up as a float: numbers are
          converted, strings are parsed ("5", "5 years", "3+ years",
          "minimum 3 years"), and a missing value is searched for in the
          summary text. Anything unparseable becomes ``0.0``.
        * The four list fields become ``[]`` when missing or not a list.
        * ``title`` and ``summary`` become ``""`` when missing or falsy.

        The input dict itself is not modified (shallow copy).
        """
        jd_dict = dict(jd_dict)  # copy

        # Alias cleanup
        aliases = {
            "years_required": "required_experience_years",
            "requirements": "required_skills",
            "degree_preferences": "preferred_degrees",
            "certs": "certifications",
            "skills_soft": "soft_skills",
            "job_summary": "summary"
        }
        for old, new in aliases.items():
            if old in jd_dict and new not in jd_dict:
                jd_dict[new] = jd_dict.pop(old)

        def extract_experience_years(text: str) -> float:
            # Looks for "<number> years/yrs" (optionally "<number>+") in free text.
            if not isinstance(text, str):
                return 0.0
            match = re.search(r'(\d+(\.\d+)?)\s*\+?\s*(years?|yrs?)', text.lower())
            return float(match.group(1)) if match else 0.0

        def parse_years(value) -> float:
            if value is None:
                return extract_experience_years(jd_dict.get("summary", ""))
            if isinstance(value, str):
                tokens = value.split()
                try:
                    # Fast path: the first token is already a plain number.
                    return float(tokens[0])
                except (IndexError, ValueError):
                    # "3+ years", "2-4 years", "minimum 3 years": take the
                    # first number that appears instead of giving up with 0.0.
                    number = re.search(r"\d+(?:\.\d+)?", value)
                    return float(number.group()) if number else 0.0
            try:
                return float(value)
            except (TypeError, ValueError, OverflowError):
                return 0.0

        jd_dict["required_experience_years"] = parse_years(
            jd_dict.get("required_experience_years")
        )

        # List fields
        for field in ["preferred_degrees", "required_skills", "certifications", "soft_skills"]:
            if not isinstance(jd_dict.get(field), list):
                jd_dict[field] = []

        # String fields
        for field in ["title", "summary"]:
            jd_dict[field] = jd_dict.get(field, "") or ""

        return jd_dict


In [None]:
def generate_example_structure(model_class) -> dict:
    """Generate an example JSON structure from a Pydantic model using placeholder values.

    Placeholders: ``""`` for str, ``0.0`` for int/float, ``[]`` for lists,
    ``{}`` for dicts, a nested example for model-typed fields, and ``""`` for
    anything else. ``Union`` / ``Optional`` annotations use the placeholder of
    their first non-None member, so ``Union[float, int]`` yields ``0.0``.

    Args:
        model_class: a class exposing ``model_fields`` (name -> field info with
            an ``annotation`` attribute). ``None`` or a class without
            ``model_fields`` yields ``{}``.

    Returns:
        Dict mapping each field name to its placeholder value.
    """
    import types
    from typing import Union, get_args, get_origin

    # PEP 604 unions (``int | None``) have their own origin type on 3.10+.
    pep604_union = getattr(types, "UnionType", None)

    def default_for_type(field_type):
        origin = get_origin(field_type)
        if origin is Union or (pep604_union is not None and origin is pep604_union):
            members = [arg for arg in get_args(field_type) if arg is not type(None)]
            return default_for_type(members[0]) if members else ""
        if origin is list or field_type is list:
            return []
        if origin is dict or field_type is dict:
            return {}
        if field_type is str:
            return ""
        if field_type in (float, int):
            return 0.0
        # Duck-typed nested model check: only real classes qualify, so typing
        # constructs can never reach a class-only operation and raise.
        if isinstance(field_type, type) and hasattr(field_type, "model_fields"):
            return generate_example_structure(field_type)
        return ""

    fields = getattr(model_class, "model_fields", None) or {}
    return {
        field_name: default_for_type(field.annotation)
        for field_name, field in fields.items()
    }


##  Prompt Templates

In [None]:
# Prompt used for resumes. It is a str.format template with two placeholders:
#   {schema} - the example JSON structure, {text} - the resume text.
# The doubled "{{" is an escaped literal "{" for str.format.
RESUME_PROMPT_TEMPLATE = """
You are a JSON resume parser.

Given the following resume text, extract only the structured JSON in this format:

{schema}

Each object in "experience" must have these fields:
- job_title
- company
- duration
- description

Return only the JSON object. Do not include explanations or markdown.

Begin your response with a {{
Resume:
--------------------
{text}
--------------------
"""


In [None]:
# Prompt used for job descriptions. It is a str.format template with two placeholders:
#   {schema} - the example JSON structure, {text} - the job description text.
# The doubled "{{" is an escaped literal "{" for str.format.
JD_PROMPT_TEMPLATE = """
You are a job description-to-JSON parser.

Given the job description below, return a structured JSON object following this format:

{schema}

💡 Return only a JSON object. No extra text. Do not include instructions or formatting.

Start your response with a {{.

Job Description:
--------------------
{text}
--------------------
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [None]:
def generate_llm_output(prompt: str, max_new_tokens: int = 1024) -> str:
    """Run the global pipeline on ``prompt`` without sampling and return the first generation's text."""
    generations = llm_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    first_generation = generations[0]
    return first_generation["generated_text"]


In [None]:
def generate_llm_output_old(prompt: str, max_new_tokens: int = 1024) -> str:
    """Run LLM and return the generated text.

    Raises:
        RuntimeError: if the pipeline call fails; the original exception is
            attached as ``__cause__``.
    """
    try:
        return llm_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
    except Exception as e:
        # `from e` keeps the underlying traceback attached for debugging.
        raise RuntimeError(f"LLM generation failed: {e}") from e

### Sanitize Output: Strip Prompt, Fix Cutoffs

In [None]:
def sanitize_llm_output(response: str, prompt: str) -> str:
    """Strip the echoed prompt and trailing noise from a raw LLM response.

    Steps: remove every occurrence of ``prompt`` and trim whitespace, cut
    everything after the last ``}`` (when there is one), then drop a trailing
    ``---`` / bullet / dash at the end of any line.
    """
    cleaned = response.replace(prompt, "").strip()

    # Keep the text only up to (and including) the final closing brace.
    last_brace = cleaned.rfind('}')
    if last_brace != -1:
        cleaned = cleaned[:last_brace + 1]

    # Remove markdown bullets or --- markers that end a line.
    trailing_marker = r'(---|•|–|-)\s*$'
    return re.sub(trailing_marker, '', cleaned, flags=re.MULTILINE)


### Brace-Balanced JSON Block Extractor

In [None]:
def extract_json_block(text: str) -> dict:
    """
    Regex-free fallback JSON block extractor.

    Scans ``text`` for top-level ``{...}`` candidates from left to right and
    returns the first one that decodes as JSON. Decoding is done by the JSON
    parser itself, so braces that appear inside string values (for example
    ``{"a": "}"}``) no longer break the extraction. When a candidate does not
    decode, its brace-balanced span is skipped and the scan continues.

    Raises:
        ValueError: if no candidate decodes.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")

    while start != -1:
        try:
            # raw_decode tolerates trailing text after the object.
            parsed, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            pass
        else:
            return parsed

        # Skip the brace-balanced span of the failed candidate so that its
        # nested objects are not returned as if they were top-level results.
        depth = 0
        block_end = -1
        for index in range(start, len(text)):
            char = text[index]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    block_end = index
                    break
        if block_end == -1:
            break  # the candidate never closes; nothing further to try
        start = text.find("{", block_end + 1)

    raise ValueError("No valid JSON object found in fallback.")

### Final Orchestrator: Fault-Tolerant Extraction

In [None]:
def truncate_text(text: str, max_chars=1500) -> str:
    """Trims long resumes/JDs to prevent LLM overload.

    Surrounding whitespace is stripped before cutting to ``max_chars``
    characters. ``None`` yields ``""`` and other non-string values are
    converted with ``str`` first, so records with a null or numeric text
    field do not raise.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.strip()[:max_chars]

In [None]:
def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Union[None, type] = None,
    max_new_tokens: int = 1024,
    retries: int = 0,
    validate: bool = True,
) -> dict:
    """
    Runs LLM to extract structured JSON and validates against schema.
    Includes: prompt sanitization, retry, echo detection, brace parser fallback, schema validation.

    Args:
        text: source document; non-string values are treated as empty text.
        prompt_template: ``str.format`` template with ``{text}`` and ``{schema}``.
        schema_model: model class used for the example schema and validation;
            when omitted, an empty schema is shown and validation is skipped.
        max_new_tokens: generation budget passed to the LLM.
        retries: number of extra attempts after the first one.
        validate: when True (and a model is given), normalize and validate.

    Returns:
        The parsed (and, when validating, normalized) dict. If every attempt
        and the fallback fail, a dict with ``raw_output`` and ``error`` keys.
    """
    def _finalize(candidate: dict) -> dict:
        # Shared by the direct path and the fallback path.
        if validate and schema_model:
            if hasattr(schema_model, "normalize"):
                candidate = schema_model.normalize(candidate)
            schema_model.model_validate(candidate)
        return candidate

    # Without a model there is nothing to describe; show an empty schema.
    example_schema = generate_example_structure(schema_model) if schema_model else {}
    schema_str = json.dumps(example_schema, indent=2)
    source_text = text if isinstance(text, str) else ""
    prompt = prompt_template.format(text=truncate_text(source_text), schema=schema_str)
    raw_output = ""

    for attempt in range(retries + 1):
        try:
            # Step 1: Get LLM output
            response = generate_llm_output(prompt, max_new_tokens)
            raw_output = sanitize_llm_output(response, prompt)

            # Step 2: Detect schema echo or instruction echo
            if "$schema" in raw_output or "Ensure these rules" in raw_output:
                raise ValueError("LLM echoed schema or instruction block instead of generating JSON.")

            # Step 3: Try JSON load directly
            json_start = raw_output.find("{")
            if json_start == -1:
                raise ValueError("No opening '{' found in LLM output.")

            # Step 4: Optional schema validation
            return _finalize(json.loads(raw_output[json_start:]))

        except Exception as e:
            # Deliberately broad: a failed attempt must never abort the batch.
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")
            print("🧪 Raw output was:\n", raw_output[:300])  # Preview first 300 chars

    # Step 5: Fallback using brace matching
    try:
        return _finalize(extract_json_block(raw_output))
    except Exception as e:
        return {
            "raw_output": raw_output.strip(),
            "error": f"Regex fallback failed: {e}"
        }

In [None]:

def pydantic_validate(model_class, data):
    """
    Version-safe validator that supports both Pydantic v1 and v2.

    Uses ``model_validate`` when the class provides it (v2) and ``parse_obj``
    otherwise (v1). The entry point is chosen by attribute lookup rather than
    by catching ``AttributeError`` around the call, so an ``AttributeError``
    raised while validating is propagated instead of silently triggering a
    second validation through ``parse_obj``.
    """
    validate = getattr(model_class, "model_validate", None)
    if validate is None:
        # Fallback to Pydantic v1
        validate = model_class.parse_obj
    return validate(data)


def validate_entry(entry, is_resume):
    """Validate ``entry`` against the resume or job-description schema.

    Returns:
        ``(True, None)`` when validation passes, otherwise ``(False, message)``
        where ``message`` is the text of the ``ValidationError``.
    """
    model = ResumeSchema if is_resume else JobDescriptionSchema
    try:
        # Schemas may expose an optional normalize() pre-processing hook.
        candidate = model.normalize(entry) if hasattr(model, "normalize") else entry
        pydantic_validate(model, candidate)
    except ValidationError as ve:
        return False, str(ve)
    return True, None


##  Normalize in Batches with Validation

In [None]:
def normalize_and_save(
    input_filename,
    output_filename_prefix,
    is_resume=True,
    input_dir=Path("json_outputs"),
    output_dir=Path("json_outputs/normalized"),
    limit: int = None,
    resume: bool = True,
    save_every: int = 5,
    checkpointing: bool = True,
    STRICT: bool = True
):
    """Run LLM extraction over a JSON Lines file and save valid/invalid results.

    Args:
        input_filename: JSON Lines file inside ``input_dir``.
        output_filename_prefix: prefix for the output, invalid, metadata and
            checkpoint file names.
        is_resume: True for resumes (text in ``Resume_str``), False for job
            descriptions (text in ``description``).
        input_dir: folder containing the input file.
        output_dir: folder for every file written here (created if needed).
        limit: maximum number of records to process in this call; a falsy
            value means no limit.
        resume: continue from the checkpoint when one exists.
        save_every: write partial results every N processed records; a falsy
            value disables periodic saves.
        checkpointing: read/write the checkpoint file.
        STRICT: validate each parsed record against the schema.

    Side effects:
        Writes the valid/invalid result files (when non-empty), the checkpoint
        file (when enabled) and a metadata summary, then prints the counts.
    """
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_file = output_dir / f"checkpoint_{output_filename_prefix}.json"

    # Generate output filenames (unique per call, so earlier batches are never overwritten)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]
    valid_file = f"{output_filename_prefix}_{timestamp}_{batch_id}.json"
    invalid_file = f"invalid_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    metadata_file = f"meta_{output_filename_prefix}_{timestamp}_{batch_id}.json"

    # Load raw input
    input_path = input_dir / input_filename
    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = [json.loads(line) for line in f if line.strip()]

    start_index = 0
    results, invalids = [], []

    if resume and checkpointing and checkpoint_file.exists():
        with open(checkpoint_file, "r", encoding="utf-8") as ckpt:
            start_index = json.load(ckpt).get("last_index", 0)
        print(f"🔁 Resuming from record {start_index}")

    raw_data = raw_data[start_index:]
    if limit:
        raw_data = raw_data[:limit]

    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_model = ResumeSchema if is_resume else JobDescriptionSchema
    text_key = "Resume_str" if is_resume else "description"

    def _flush(next_index: int) -> None:
        # Persist everything collected so far plus the resume position.
        if results:
            with open(output_dir / valid_file, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)
        if invalids:
            with open(output_dir / invalid_file, "w", encoding="utf-8") as f:
                json.dump(invalids, f, indent=2)
        if checkpointing:
            with open(checkpoint_file, "w", encoding="utf-8") as f:
                json.dump({"last_index": next_index}, f)

    for idx, record in enumerate(tqdm(raw_data), start=start_index):
        # A null/missing field in the export must not abort the whole batch.
        raw_text = record.get(text_key)
        text = raw_text if isinstance(raw_text, str) else ""

        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_model=schema_model,
            validate=STRICT
        )

        if "raw_output" in parsed or "error" in parsed:
            invalids.append({
                "input": text,
                "output": parsed,
                "error": parsed.get("error", "Malformed or unstructured output")
            })
        elif STRICT:
            is_valid, error_msg = validate_entry(parsed, is_resume)
            if is_valid:
                results.append(parsed)
            else:
                invalids.append({
                    "input": text,
                    "output": parsed,
                    "error": error_msg
                })
        else:
            results.append(parsed)

        # Save periodically. This runs for invalid records too, so a streak of
        # failures can no longer postpone the checkpoint.
        if save_every and ((idx + 1 - start_index) % save_every == 0):
            _flush(idx + 1)

    # Final save
    _flush(start_index + len(raw_data))

    # Metadata summary
    meta = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_filename,
        "valid_output_file": valid_file if results else None,
        "invalid_output_file": invalid_file if invalids else None,
        "count_total": len(raw_data),
        "count_valid": len(results),
        "count_invalid": len(invalids),
        "strict_validation": STRICT,
        "model": llm_pipeline.model.config.name_or_path,
        "device": str(next(llm_pipeline.model.parameters()).device)
    }
    with open(output_dir / metadata_file, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"✅ Done. Valid: {len(results)} | Invalid: {len(invalids)}")


## Run Phase 2 End-to-End

In [None]:
def run_phase2_structured_normalization(include_resumes: bool = True, include_jds: bool = False):
    """Run the Phase 2 normalization for the selected datasets.

    Called without arguments, only the resume batch runs (up to 1500 records,
    resuming from the checkpoint, schema-tolerant mode). The job-description
    batch (up to 50 records) is opt-in through ``include_jds``.

    Args:
        include_resumes: process ``parsed_resumes.json``.
        include_jds: process ``parsed_jds.json``.
    """
    if include_resumes:
        normalize_and_save(
            input_filename="parsed_resumes.json",
            output_filename_prefix="normalized_resumes",
            is_resume=True,
            limit=1500,
            resume=True,
            STRICT=False,  # schema-tolerant mode; set to True to validate against the schema
        )

    if include_jds:
        normalize_and_save(
            input_filename="parsed_jds.json",
            output_filename_prefix="normalized_jds",
            is_resume=False,
            limit=50,
            resume=True,
            STRICT=False,  # schema-tolerant mode; set to True to validate against the schema
        )

# Run the Phase 2 normalization when this cell executes.
run_phase2_structured_normalization()




### Utility to Merge Normalized JSON Files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append the JSON lists from matching files to ``output_file``.

    Files in ``source_dir`` matching ``pattern`` are read in sorted order. Each
    file holding a JSON list is appended to the records already present in
    ``output_file`` and, once the combined output has been written, moved to
    ``merged_dir``. Files that cannot be parsed or do not hold a list are
    reported and left where they are.

    Args:
        source_dir: folder scanned for input files (created if missing).
        output_file: combined JSON file; existing list content is kept.
        pattern: glob pattern selecting the input files.
        merged_dir: destination for successfully merged files (created if missing).
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ Could not decode {output_file}, starting from scratch.")
        else:
            if isinstance(existing, list):
                merged_data = existing
            else:
                # Extending with a non-list would fail further down.
                print(f"⚠️ {output_file} does not contain a list, starting from scratch.")

    consumed = []
    output_resolved = output_file.resolve()

    for file_path in sorted(source_dir.glob(pattern)):
        # Never treat the combined output itself as an input part.
        if file_path.resolve() == output_resolved:
            continue
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"⚠️ Failed to parse {file_path.name}: {e}")
            continue

        if not isinstance(data, list):
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)
        consumed.append(file_path)

    # Write combined output
    if merged_data:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

    # Move sources only after the combined output is safely on disk, so a
    # failed write cannot leave records stranded in the merged folder.
    for file_path in consumed:
        shutil.move(str(file_path), merged_dir / file_path.name)
        print(f"✅ Merged and moved: {file_path.name}")

# === Usage ===

# Paths
# Batch files are read from the normalized output folder; merged parts are archived in its "merged" subfolder.
normalized_dir = Path("json_outputs/normalized")
merged_dir = normalized_dir / "merged"

# Combine all job-description batch files into normalized_jds.json.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_jds.json",
    pattern="normalized_jds_*.json",
    merged_dir=merged_dir
)

# Combine all resume batch files into normalized_resumes.json.
merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_resumes.json",
    pattern="normalized_resumes_*.json",
    merged_dir=merged_dir
)
