# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [17]:
import subprocess

def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

def get_available_gpu_memory_gb():
    try:
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"],
            encoding="utf-8"
        )
        free_mem_mb = int(output.strip().split("\n")[0])
        return free_mem_mb / 1024
    except Exception:
        return 0.0


## install dependencies

In [None]:
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes


## Login to huggingface

In [18]:
from huggingface_hub import login
import os

# Set your token here securely or prompt for it in Colab
# Recommended: store in Colab secrets or environment variable
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Prompt for token if not set in environment
        print("🔑 Please enter your Hugging Face token:")
        # For Colab or local prompt input
        HF_TOKEN = input("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [19]:
import shutil

def setup_kaggle_credentials():
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if not os.path.exists(kaggle_path):
        from google.colab import files
        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()
        os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
        for filename in uploaded.keys():
            shutil.move(filename, kaggle_path)
        os.chmod(kaggle_path, 0o600)
        print(f"✅ Kaggle credentials setup at {kaggle_path}")
    else:
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")

setup_kaggle_credentials()

✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json


## Mount Google Drive (Colab)

In [6]:
if is_running_in_colab():
   from google.colab import drive
   drive.mount('/content/drive')

##  Load Nous-Hermes-mistral-Instruct with Fallback to Quantized

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    has_cuda = torch.cuda.is_available()
    free_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {free_mem:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and free_mem < 24

    # Set quantization config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True if use_4bit else False,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4"
    ) if use_4bit else None

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # ✅ Fix warning about pad_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        torch_dtype=torch.float16 if not quant_config else None,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [21]:
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


💻 CUDA: True | GPU Memory: 15.92 GB


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


✅ Model loaded on cuda:0


# Global utilities

### Utility to merge normalized json files

In [90]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                merged_data = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")

    # Identify matching files
    files_to_merge = sorted(source_dir.glob(pattern))

    for file_path in files_to_merge:
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                if isinstance(data, list):
                    merged_data.extend(data)
                else:
                    print(f"⚠️ Skipping {file_path.name}: not a list.")
            except Exception as e:
                print(f"⚠️ Failed to parse {file_path.name}: {e}")
                continue

        # Move to merged folder
        shutil.move(str(file_path), merged_dir / file_path.name)
        print(f"✅ Merged and moved: {file_path.name}")

    # Write combined output
    if merged_data:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

# === Usage ===

# Paths
normalized_dir = Path("json_outputs_phase1_run3/normalized")
merged_dir = normalized_dir / "merged"

merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_jds.json",
    pattern="normalized_jds_*.json",
    merged_dir=merged_dir
)

merge_json_files(
    source_dir=normalized_dir,
    output_file=normalized_dir / "normalized_resumes.json",
    pattern="normalized_resumes_*.json",
    merged_dir=merged_dir
)


✅ Merged and moved: normalized_jds_20250426_0212_2f0933.json
💾 Saved to: json_outputs_phase1_run3\normalized\normalized_jds.json
✅ Merged and moved: normalized_resumes_20250426_0208_e5a1bc.json
💾 Saved to: json_outputs_phase1_run3\normalized\normalized_resumes.json


### Utility to save json to a folder

In [22]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_path):
        if overwrite:
            os.remove(output_path)
        else:
            raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)

    print(f"✅ Saved output to {output_path}")


# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Load Resume and JD datasets
2. Minimal Parsing into JSON Structure
3. Save structured JSON for Phase 2

## Util Classes and methods

### Configurations  

In [9]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    DATASET_DOWNLOAD_DIR = "datasets"
    JSON_OUTPUT_DIR = "json_outputs_phase1_run3"
    JSON_OUTPUT_NORMALIZED_DIR = "json_outputs_phase1_run3/normalized"
    AUTO_CLEANUP = True


### Downloader 

In [10]:
# ==============================
# DOWNLOADER
# ==============================
import zipfile
from pathlib import Path

class DatasetDownloader:
    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        !kaggle datasets download -d {dataset_path} -p {Config.DATASET_DOWNLOAD_DIR}

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [11]:
# ==============================
# LOADER
# ==============================
import pandas as pd


class DatasetLoader:
    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        for root, _, files in os.walk(dataset_folder):
            for file in files:
                if file.lower() == target_csv_name.lower():
                    csv_path = os.path.join(root, file)
                    df = pd.read_csv(csv_path)
                    print(f"✅ Loaded CSV with shape {df.shape}")
                    return df

        raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")



### Data Processor

In [12]:
# ==============================
# PROCESSOR
# ==============================
from typing import List


class DatasetProcessor:
    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        missing_fields = [field for field in allowed_fields if field not in df.columns]
        if missing_fields:
            raise ValueError(f"❌ Fields {missing_fields} not found in dataset!")

        filtered_df = df[allowed_fields]
        print(f"✅ Filtered columns: {list(filtered_df.columns)}")
        return filtered_df

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        os.makedirs(Config.JSON_OUTPUT_DIR, exist_ok=True)
        output_path = os.path.join(Config.JSON_OUTPUT_DIR, output_json_name)

        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        df.to_json(output_path, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{output_path}'")



### Cleanup

In [13]:
# ==============================
# CLEANER
# ==============================
class Cleaner:
    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        if os.path.exists(extracted_folder_path):
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print(f"🗑️ Zip file '{zip_path}' has been deleted successfully.")

### Hybrid Data loader

In [14]:

# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        if kagglehub:
            try:
                print(f"📥 Trying KaggleHub for {dataset_path}...")
                df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
                print(f"✅ Loaded using KaggleHub: shape = {df.shape}")
                return df
            except Exception as e:
                print(f"⚠️ KaggleHub failed: {e}\nFalling back to ZIP-based loader.")

        extracted_folder, _ = DatasetDownloader.download_and_extract(dataset_path)
        return DatasetLoader.load_csv(extracted_folder, file_name)



### Infer JD Domains

In [15]:
domain_keywords_dict = {
    'advocate': ['advocate'],
    'agriculture': ['agriculture'],
    'apparel': ['apparel'],
    'arts': ['arts'],
    'automobile': ['automobile'],
    'aviation': ['aviation'],
    'banking': ['banking'],
    'bpo': ['bpo'],
    'business development': ['business', 'development', 'business development', 'business-development'],
    'chef': ['chef'],
    'construction': ['construction'],
    'consultant': ['consultant'],
    'data scientist': ['data', 'data analyst', 'data scientist', 'scientist'],
    'designing': ['designing', 'designer'],
    'digital media': ['digital', 'digital marketing executive', 'media', 'digital media', 'digital-media'],
    'engineering': ['engineering'],
    'finance': ['finance', 'financial analyst'],
    'healthcare': ['healthcare'],
    'hr': ['hr'],
    'information technology': ['information', 'technology', 'information technology', 'information-technology'],
    'public relations': ['public', 'relations', 'public relations', 'public-relations'],
    'marketing': ['marketing'],
    'sales': ['sales', 'sales executive'],
    'teacher': ['teacher'],
    'technician': ['technician'],
    'training': ['training'],
    'web designing': ['web', 'designing'],
    'fitness': ['fitness'],
    'accountant': ['accountant', 'accounting']
}


In [16]:
def infer_domain_from_title(title):
    title_lower = title.lower()
    for domain, keywords in domain_keywords_dict.items():
        if any(kw in title_lower for kw in keywords):
            return domain
    return "unknown"


In [17]:
# ==============================
# Efficient LLM Inference in Batches for JD Domains
# ==============================
from tqdm import tqdm

def infer_domains_in_batches(texts, batch_size=8):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="🧠 Inferring JD domains"):
        batch = texts[i:i+batch_size]
        prompts = [
            f"Given this job description:\n\n{desc}\n\nWhat is the most likely job function or domain?" for desc in batch
        ]
        try:
            responses = llm_pipeline(prompts)
            for r in responses:
                results.append(r[0]['generated_text'].strip().split("\n")[-1])
        except Exception:
            results.extend(["unknown"] * len(batch))
    return results


### Filter and Rank JDs

In [18]:
# ==============================
# JD Filtering and Ranking (with batched domain inference)
# ==============================
def filter_and_rank_jds(jd_df, resume_domains, max_total=50, top_n_per_domain=2):
    # Ensure necessary columns exist
    for col in ['title', 'description']:
        if col not in jd_df.columns:
            raise ValueError(f"❌ Column '{col}' not found in JD dataset")
        jd_df[col] = jd_df[col].fillna('').astype(str)

    # Infer domains 
    print("🧠 Inferring JD domains from title using keyword matcing...")
    #jd_df['inferred_domain'] = infer_domains_in_batches(jd_df['description'].tolist())
    jd_df['inferred_domain'] = jd_df['title'].fillna("").apply(infer_domain_from_title)


    all_ranked = []

    for domain in resume_domains:
        matches = jd_df[
            jd_df['title'].str.contains(domain, na=False, case=False) |
            jd_df['inferred_domain'].str.contains(domain, na=False, case=False)
        ].copy()

        if matches.empty:
            print(f"⚠️ No JDs matched domain: '{domain}'")
            continue

        matches['richness_score'] = matches['description'].str.len()
        top = matches.sort_values(by='richness_score', ascending=False).head(top_n_per_domain)
        all_ranked.append(top)

    if not all_ranked:
        raise ValueError("❌ No job descriptions matched any resume domains.")

    final_jds_df = pd.concat(all_ranked, ignore_index=True)
    final_jds_df = final_jds_df.drop_duplicates().sort_values(by='richness_score', ascending=False).head(max_total)

    print(f"✅ Filtered and ranked {len(final_jds_df)} job descriptions across {len(resume_domains)} domains.")
    return final_jds_df


### Load Resume and JD datasets

In [19]:
# ==============================
# Resume Dataset Loader (with caching)
# ==============================
def load_resume_dataset(dataset_path: str = "snehaanbhawal/resume-dataset", target_csv_name: str = "Resume.csv") -> pd.DataFrame:
    if not hasattr(load_resume_dataset, "_cache"):
        print("📥 Loading resume dataset for the first time...")
        load_resume_dataset._cache = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    else:
        print("✅ Using cached resume dataset.")
    
    return load_resume_dataset._cache

# ==============================
# Job Description Dataset Loader (with caching)
# ==============================
def load_job_description_dataset(dataset_path: str = "arshkon/linkedin-job-postings", target_csv_name: str = "postings.csv") -> pd.DataFrame:
    if not hasattr(load_job_description_dataset, "_cache"):
        print("📥 Loading job description dataset for the first time...")
        load_job_description_dataset._cache = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    else:
        print("✅ Using cached job description dataset.")
    
    return load_job_description_dataset._cache



### JD Dataset Processing Function

In [20]:
# ==============================
# JD Processing Function
# ==============================
def process_dataset_jd(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    jd_df = load_job_description_dataset(dataset_path, target_csv_name)
    resume_df = load_resume_dataset()
    resume_domains = resume_df['Category'].dropna().str.lower().unique().tolist()
    ranked_jds_df = filter_and_rank_jds(jd_df, resume_domains)
    filtered_df = DatasetProcessor.filter_fields(ranked_jds_df, allowed_fields)
    DatasetProcessor.save_to_json(filtered_df, output_json_name)
    
    # cleanup dataset
    dataset_slug = dataset_path.split("/")[-1]
    extracted_folder = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
    zip_filename = f"{dataset_slug}.zip"
    if Config.AUTO_CLEANUP:
        Cleaner.cleanup_dataset_artifacts(extracted_folder, zip_filename)
    

### Resume Dataset Processing Function

In [21]:
# ==============================
# Resume Filtering (5 per category)
# ==============================
def filter_resumes_by_category(resume_df: pd.DataFrame, top_n: int = 1) -> pd.DataFrame:
    if 'Category' not in resume_df.columns:
        raise ValueError("❌ Resume dataset does not contain 'Category' column.")

    filtered_resumes = (
        resume_df
        .dropna(subset=['Category'])
        .groupby('Category', group_keys=False)
        .apply(lambda group: group.head(top_n))
        .reset_index(drop=True)
    )

    print(f"✅ Filtered {len(filtered_resumes)} resumes (top {top_n} from each category).")
    return filtered_resumes


In [22]:

# ==============================
# MAIN FLOW
# ==============================

def process_dataset_resume(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    df = load_resume_dataset(dataset_path, target_csv_name)
    filtered_df = DatasetProcessor.filter_fields(df, allowed_fields)
    DatasetProcessor.save_to_json(filtered_df, output_json_name)

    dataset_slug = dataset_path.split("/")[-1]
    extracted_folder = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
    zip_filename = f"{dataset_slug}.zip"
    if Config.AUTO_CLEANUP:
        Cleaner.cleanup_dataset_artifacts(extracted_folder, zip_filename)


# ==============================
# Save Filtered Resumes
# ==============================
def process_and_save_filtered_resumes(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    resume_df = load_resume_dataset(dataset_path, target_csv_name)
    df = filter_resumes_by_category(resume_df)
    filtered_df = DatasetProcessor.filter_fields(df, allowed_fields)
    DatasetProcessor.save_to_json(filtered_df, output_json_name)
    
    dataset_slug = dataset_path.split("/")[-1]
    extracted_folder = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
    zip_filename = f"{dataset_slug}.zip"
    if Config.AUTO_CLEANUP:
        Cleaner.cleanup_dataset_artifacts(extracted_folder, zip_filename)


## Login and do the processing of Resume and JD dataset

In [None]:
process_and_save_filtered_resumes(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)

process_dataset_jd(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location",  "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type", "description"], #"description",
    output_json_name="parsed_jds.json"
)


In [None]:

# Process Resume Dataset
process_dataset_resume(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)


# Process Job Postings Dataset
process_dataset_jd(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location",  "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type", "description"], #"description",
    output_json_name="parsed_jds.json"
)

# Phase 2 -	Parse resume/JD into JSON structured scheme

## Define Pydantic Schemas

In [58]:
from typing import Optional
from pydantic import BaseModel

class Education(BaseModel):
    degree: str
    field: str
    institution: str
    year: str
    gpa: Optional[str] = None

class Experience(BaseModel):
    job_title: str
    company: str
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    duration_in_months: Optional[int] = None
    description: str

class Basics(BaseModel):
    name: str
    email: str
    phone: str
    location: str
    current_title: str
    linkedin_url: Optional[str] = None

In [59]:
from typing import List, Optional
from pydantic import BaseModel
import re


class ResumeSchema(BaseModel):
    basics: Basics
    education: List[Education]
    experience: List[Experience]
    skills: List[str]
    certifications: List[str]
    projects: List[str]
    languages: Optional[List[str]] = []
    total_experience_years: Optional[float] = 0.0  # ✅ New field added

    @classmethod
    def normalize(cls, resume_dict: dict) -> dict:
        resume_dict = dict(resume_dict)

        # Basics
        basics = resume_dict.get("basics", {})
        resume_dict["basics"] = {
            "name": basics.get("name", ""),
            "email": basics.get("email", ""),
            "phone": basics.get("phone", ""),
            "location": basics.get("location", ""),
            "current_title": basics.get("current_title", basics.get("title", "")),
            "linkedin_url": basics.get("linkedin_url", "")
        }

        # Normalize sections
        for key in ["skills", "certifications", "projects", "languages"]:
            if not isinstance(resume_dict.get(key), list):
                resume_dict[key] = []

        # Normalize Experience
        normalized_exp = []
        for item in resume_dict.get("experience", []):
            if not isinstance(item, dict):
                continue
            normalized_exp.append({
                "job_title": item.get("job_title", item.get("title", "")),
                "company": item.get("company", ""),
                "start_date": item.get("start_date", ""),
                "end_date": item.get("end_date", ""),
                "duration_in_months": item.get("duration_in_months", None),
                "description": item.get("description", "")
            })
        resume_dict["experience"] = normalized_exp

        # Normalize Education
        normalized_edu = []
        for item in resume_dict.get("education", []):
            if not isinstance(item, dict):
                continue
            degree = item.get("degree", "")
            field = item.get("field", "")
            if not field:
                match = re.search(r"in\\s+(.+)", degree, flags=re.IGNORECASE)
                field = match.group(1).strip() if match else ""
            year = str(item.get("year", "")) if item.get("year") else ""
            gpa = item.get("gpa", None)
            normalized_edu.append({
                "degree": degree,
                "field": field,
                "institution": item.get("institution", ""),
                "year": year,
                "gpa": gpa
            })
        resume_dict["education"] = normalized_edu

        # Total Experience fallback
        if "total_experience_years" not in resume_dict:
            resume_dict["total_experience_years"] = 0.0

        return resume_dict


In [60]:
class JobDescriptionSchema(BaseModel):
    title: str
    summary: str
    required_experience_years: float
    preferred_degrees: List[str]
    required_skills: List[str]
    optional_skills: List[str]
    certifications: List[str]
    soft_skills: List[str]
    job_location: str
    remote_option: Optional[bool] = False
    employment_type: Optional[str] = None

    @classmethod
    def normalize(cls, jd_dict: dict) -> dict:
        jd_dict = dict(jd_dict)

        aliases = {
            "years_required": "required_experience_years",
            "requirements": "required_skills",
            "degree_preferences": "preferred_degrees",
            "certs": "certifications",
            "skills_soft": "soft_skills",
            "job_summary": "summary"
        }
        for old, new in aliases.items():
            if old in jd_dict and new not in jd_dict:
                jd_dict[new] = jd_dict.pop(old)

        # Required Experience Extraction
        def extract_experience_years(text: str) -> float:
            if not isinstance(text, str):
                return 0.0
            match = re.search(r'(\\d+(\\.\\d+)?)\\s*\\+?\\s*(years?|yrs?)', text.lower())
            return float(match.group(1)) if match else 0.0

        try:
            val = jd_dict.get("required_experience_years")
            if val is None:
                jd_dict["required_experience_years"] = extract_experience_years(jd_dict.get("summary", ""))
            elif isinstance(val, str):
                jd_dict["required_experience_years"] = float(val.split()[0])
            else:
                jd_dict["required_experience_years"] = float(val)
        except Exception:
            jd_dict["required_experience_years"] = 0.0

        # Normalize fields
        for field in ["preferred_degrees", "required_skills", "optional_skills", "certifications", "soft_skills"]:
            if not isinstance(jd_dict.get(field), list):
                jd_dict[field] = []

        for field in ["title", "summary", "job_location", "employment_type"]:
            jd_dict[field] = jd_dict.get(field, "") or ""

        # Remote Option
        remote_flag = jd_dict.get("remote_option", None)
        if remote_flag is None:
            remote_flag = "remote" in jd_dict.get("summary", "").lower()
        jd_dict["remote_option"] = bool(remote_flag)

        return jd_dict


In [61]:
def generate_example_structure(model_class) -> dict:
    """Generate a JSON structure from a Pydantic model using placeholder values, handling Optional fields better."""
    from typing import get_origin, get_args, Union
    from pydantic import BaseModel

    def default_for_type(field_type):
        origin = get_origin(field_type)
        args = get_args(field_type)

        if origin is list:
            return []
        elif origin is Union and type(None) in args:
            # Optional[...] detected
            non_none_types = [arg for arg in args if arg is not type(None)]
            return default_for_type(non_none_types[0]) if non_none_types else ""
        elif field_type is str:
            return ""
        elif field_type in [float, int]:
            return 0.0
        elif isinstance(field_type, type) and issubclass(field_type, BaseModel):
            return generate_example_structure(field_type)
        else:
            return ""

    structure = {}
    for field_name, field in model_class.model_fields.items():
        try:
            structure[field_name] = default_for_type(field.annotation)
        except Exception:
            structure[field_name] = ""
    return structure


##  Prompt Templates

In [87]:
RESUME_PROMPT_TEMPLATE = """
You are a JSON resume parser and experience calculator.

Given the following resume text, extract a structured JSON following this schema:

{schema}

Instructions:
- Parse education, experience, skills, certifications, and other fields exactly as described.
- In the "experience" list, if start_date and end_date are missing, try to infer them if mentioned anywhere.
- Accept various date formats such as "March 2007", "Mar 07", "03/2007", "Current", "Present" etc.
- Interpret "Current", "Present", "Today" as the current month and year.
- Calculate "total_experience_years" as the cumulative duration of professional work experience from all roles.
    - Overlapping durations should not be double-counted.
    - If start and end dates are missing or ambiguous, skip them for total experience calculation.
- If a field is missing in the resume, leave it empty ("") or an empty list [] depending on the field type.
- Return ONLY a valid JSON object. No extra text, no explanations, no markdown formatting.
- Your output MUST start with a {{.

Resume Text:
--------------------
{text}
--------------------
"""


In [88]:
JD_PROMPT_TEMPLATE = """
You are a JSON job description parser and experience extractor.

Given the following job description text, extract a structured JSON following this schema:

{schema}

Instructions:
- Parse title, summary, skills, certifications, and other fields exactly as shown.
- Pay special attention to "required_experience_years":
    - If experience years are explicitly listed, extract that number.
    - Accept formats like "5+ years", "3-5 years", "8 years required", etc.
    - If multiple ranges are mentioned (e.g., "3-5 years"), use the lower value (3 years).
    - If no years are mentioned explicitly, infer from job title level:
        - "Senior", "Lead" → Assume 5+ years
        - "Mid-level", "Experienced" → Assume 3 years
        - "Entry level", "Junior" → Assume 0-1 years
    - If still ambiguous, default to 0 years.
- Handle remote/hybrid jobs:
    - Set "remote_option" = true if remote keywords are present (remote, work from home, hybrid, WFH).
- If a field is missing, leave it empty ("") or as an empty list [] depending on the field type.
- Return ONLY a valid JSON object. No extra text, no explanations, no markdown formatting.
- Your output MUST start with a {{.

Job Description Text:
--------------------
{text}
--------------------
"""


##  Inference + Validation Functions

### Generate Raw LLM Output

In [72]:
def generate_llm_output(prompt: str, max_new_tokens: int = 1024) -> str:
    """Run LLM and return the generated text."""
    try:
        return llm_pipeline(prompt, max_new_tokens=max_new_tokens,  do_sample=False)[0]["generated_text"]
    except Exception as e:
        raise RuntimeError(f"LLM generation failed: {e}")


### Sanitize Output: Strip Prompt, Fix Cutoffs

In [65]:
def sanitize_llm_output(response: str, prompt: str) -> str:
    raw = response.replace(prompt, "").strip()

    # Truncate garbage after the last closing brace
    raw = re.sub(r'}[^}]*$', '}', raw)

    # Remove markdown bullets or --- headers at end
    raw = re.sub(r'(---|•|–|-)\s*$', '', raw, flags=re.MULTILINE)

    return raw


### Regex-based JSON Block Extractor

In [77]:
import json

def extract_json_block(text: str) -> dict:
    """
    Regex-free fallback JSON block extractor using brace balance.
    Finds first balanced {} block.
    """
    stack = []
    start = None
    for i, char in enumerate(text):
        if char == '{':
            if not stack:
                start = i
            stack.append(char)
        elif char == '}':
            if stack:
                stack.pop()
                if not stack:
                    try:
                        return json.loads(text[start:i+1])
                    except json.JSONDecodeError:
                        continue
    raise ValueError("No valid JSON object found in fallback.")

### Final Orchestrator: Fault-Tolerant Extraction

In [78]:
def truncate_text(text: str, max_chars=1500) -> str:
    """Trims long resumes/JDs to prevent LLM overload."""
    return text.strip()[:max_chars]

In [79]:
from typing import Union

def extract_structured_json(
    text: str,
    prompt_template: str,
    schema_model: Union[None, type] = None,
    max_new_tokens: int = 1024,
    retries: int = 0,
    validate: bool = True,
) -> dict:
    """
    Runs LLM to extract structured JSON and validates against schema.
    Includes: prompt sanitization, retry, echo detection, brace parser fallback, schema validation.
    """
    example_schema = generate_example_structure(schema_model)
    schema_str = json.dumps(example_schema, indent=2)
    prompt = prompt_template.format(text=truncate_text(text), schema=schema_str)
    raw_output = ""
    attempt = 0

    while attempt <= retries:
        try:
            # Step 1: Get LLM output
            response = generate_llm_output(prompt, max_new_tokens)
            raw_output = sanitize_llm_output(response, prompt)

            # Step 2: Detect schema echo or instruction echo
            if "$schema" in raw_output or "Ensure these rules" in raw_output:
                raise ValueError("LLM echoed schema or instruction block instead of generating JSON.")

            # Step 3: Try JSON load directly
            json_start = raw_output.find("{")
            if json_start == -1:
                raise ValueError("No opening '{' found in LLM output.")

            parsed = json.loads(raw_output[json_start:])

            # Step 4: Optional schema validation
            if validate and schema_model:
                if hasattr(schema_model, "normalize"):
                    parsed = schema_model.normalize(parsed)
                schema_model.model_validate(parsed)

            return parsed

        except Exception as e:
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")
            print("🧪 Raw output was:\n", raw_output[:300])  # Preview first 300 chars
            attempt += 1

    # Step 5: Fallback using brace matching
    try:
        parsed = extract_json_block(raw_output)
        if validate and schema_model:
            if hasattr(schema_model, "normalize"):
                parsed = schema_model.normalize(parsed)
            schema_model.model_validate(parsed)
        return parsed
    except Exception as e:
        return {
            "raw_output": raw_output.strip(),
            "error": f"Regex fallback failed: {e}"
        }

In [80]:
from pydantic import ValidationError

def pydantic_validate(model_class, data):
    """
    Version-safe validator that supports both Pydantic v1 and v2.
    """
    try:
        # Pydantic v2
        return model_class.model_validate(data)
    except AttributeError:
        # Fallback to Pydantic v1
        return model_class.parse_obj(data)


def validate_entry(entry, is_resume):
    try:
        model = ResumeSchema if is_resume else JobDescriptionSchema
        if hasattr(model, "normalize"):
            normalized = model.normalize(entry)
        else:
            normalized = entry
        pydantic_validate(model, normalized)
        return True, None
    except ValidationError as ve:
        return False, str(ve)


##  Normalize in Batches with Validation

In [81]:
from datetime import datetime
import uuid
from pathlib import Path
import json


def normalize_and_save(
    input_filename,
    output_filename_prefix,
    is_resume=True,
    input_dir=Path("json_outputs"),
    output_dir=Path("json_outputs/normalized"),
    limit: int = None,
    resume: bool = True,
    save_every: int = 5,
    checkpointing: bool = True,
    STRICT: bool = True
):
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_file = output_dir / f"checkpoint_{output_filename_prefix}.json"

    # Generate output filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    batch_id = uuid.uuid4().hex[:6]
    valid_file = f"{output_filename_prefix}_{timestamp}_{batch_id}.json"
    invalid_file = f"invalid_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    metadata_file = f"meta_{output_filename_prefix}_{timestamp}_{batch_id}.json"

    # Load raw input
    input_path = input_dir / input_filename
    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = [json.loads(line) for line in f.readlines() if line.strip()]

    start_index = 0
    results, invalids = [], []

    if resume and checkpointing and checkpoint_file.exists():
        with open(checkpoint_file, "r") as ckpt:
            checkpoint = json.load(ckpt)
            start_index = checkpoint.get("last_index", 0)
            print(f"🔁 Resuming from record {start_index}")

    raw_data = raw_data[start_index:]
    if limit:
        raw_data = raw_data[:limit]

    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_model = ResumeSchema if is_resume else JobDescriptionSchema

    for idx, record in enumerate(tqdm(raw_data), start=start_index):
        text = record.get("Resume_str" if is_resume else "description", "")
        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_model=schema_model,
            validate=STRICT
        )

        if "raw_output" in parsed or "error" in parsed:
            invalids.append({
                "input": text,
                "output": parsed,
                "error": parsed.get("error", "Malformed or unstructured output")
            })
            continue

        if STRICT:
            is_valid, error_msg = validate_entry(parsed, is_resume)
            if is_valid:
                results.append(parsed)
            else:
                invalids.append({
                    "input": text,
                    "output": parsed,
                    "error": error_msg
                })
        else:
            results.append(parsed)

        # Save periodically
        if save_every and ((idx + 1 - start_index) % save_every == 0):
            if results:
                with open(output_dir / valid_file, "w") as f:
                    json.dump(results, f, indent=2)
            if invalids:
                with open(output_dir / invalid_file, "w") as f:
                    json.dump(invalids, f, indent=2)
            if checkpointing:
                with open(checkpoint_file, "w") as f:
                    json.dump({"last_index": idx + 1}, f)

    # Final save
    if results:
        with open(output_dir / valid_file, "w") as f:
            json.dump(results, f, indent=2)
    if invalids:
        with open(output_dir / invalid_file, "w") as f:
            json.dump(invalids, f, indent=2)
    if checkpointing:
        with open(checkpoint_file, "w") as f:
            json.dump({"last_index": start_index + len(raw_data)}, f)

    # Metadata summary
    meta = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_filename,
        "valid_output_file": valid_file if results else None,
        "invalid_output_file": invalid_file if invalids else None,
        "count_total": len(raw_data),
        "count_valid": len(results),
        "count_invalid": len(invalids),
        "strict_validation": STRICT,
        "model": llm_pipeline.model.config.name_or_path,
        "device": str(next(llm_pipeline.model.parameters()).device)
    }
    with open(output_dir / metadata_file, "w") as f:
        json.dump(meta, f, indent=2)

    print(f"✅ Done. Valid: {len(results)} | Invalid: {len(invalids)}")


## Run Phase 2 End-to-End

In [None]:
import json
def run_phase2_structured_normalization():
    
    normalize_and_save(
    input_filename="parsed_resumes.json",
    output_filename_prefix="normalized_resumes",
    is_resume=True,
    limit=50,
    resume=True,
    STRICT=False,  # or set to False for schema-tolerant mode
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR)
    )
    
    normalize_and_save(
    input_filename="parsed_jds.json",
    output_filename_prefix="normalized_jds",
    is_resume=False,
    limit=50,
    resume=True,
    STRICT=False,  # or set to False for schema-tolerant mode
    input_dir=Path(Config.JSON_OUTPUT_DIR),
    output_dir=Path(Config.JSON_OUTPUT_NORMALIZED_DIR)
    )
    
run_phase2_structured_normalization()

# Phase 3 Rubric-Based Scoring Engine

### Imports & Setup

In [None]:
# 📥 Imports
#import json
#import pandas as pd
#import random
#from tqdm import tqdm
#import matplotlib.pyplot as plt



## Load Normalized Resumes and JDs

In [23]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

### Load Resumes

In [None]:
import os

resumes_path = os.path.join(Config.JSON_OUTPUT_NORMALIZED_DIR, 'normalized_resumes.json')
resumes = load_json_file(resumes_path)

### Load JDs

In [None]:
import os

jds_path = os.path.join(Config.JSON_OUTPUT_NORMALIZED_DIR, 'normalized_jds.json')
jds = load_json_file(jds_path)

In [None]:
print(f"Loaded {len(resumes)} resumes and {len(jds)} job descriptions.")

## Rule-Based Scoring Functions

In [24]:
# ⚙️ Rule-Based Scoring

def skills_match(resume, jd):
    resume_skills = set(resume.get("skills", []))
    jd_skills = set(jd.get("required_skills", []))
    
    if not jd_skills:
        return {"score": 1.0, "details": "JD has no skill requirement."}
    
    matched = resume_skills.intersection(jd_skills)
    score = len(matched) / len(jd_skills)
    details = f"Matched skills: {list(matched)}"
    return {"score": round(score, 2), "details": details}

def experience_match(resume, jd):
    resume_exp = resume.get("total_experience_years", 0.0)
    required_exp = jd.get("required_experience_years", 0.0)
    
    if required_exp == 0:
        return {"score": 1.0, "details": "JD has no experience requirement."}
    
    ratio = min(resume_exp / required_exp, 1.0)
    details = f"Resume experience: {resume_exp} yrs vs JD requirement: {required_exp} yrs"
    return {"score": round(ratio, 2), "details": details}

def education_match(resume, jd):
    resume_edu = [edu["degree"].lower() for edu in resume.get("education", [])]
    preferred_degrees = [deg.lower() for deg in jd.get("preferred_degrees", [])]

    if not preferred_degrees:
        return {"score": 1.0, "details": "No preferred degrees in JD."}

    matched = set()
    for deg in preferred_degrees:
        for edu in resume_edu:
            if deg in edu:
                matched.add(deg)

    score = len(matched) / len(preferred_degrees)
    details = f"Matched degrees: {list(matched)}"
    return {"score": round(score, 2), "details": details}

def certifications_match(resume, jd):
    resume_certs = set(resume.get("certifications", []))
    required_certs = set(jd.get("certifications", []))
    
    if not required_certs:
        return {"score": 1.0, "details": "JD has no certification requirement."}
    
    matched = resume_certs.intersection(required_certs)
    score = len(matched) / len(required_certs)
    details = f"Matched certifications: {list(matched)}"
    return {"score": round(score, 2), "details": details}

def rule_based_scoring(resume, jd):
    return {
        "skills_match": skills_match(resume, jd),
        "experience_alignment": experience_match(resume, jd),
        "education_alignment": education_match(resume, jd),
        "certifications_alignment": certifications_match(resume, jd)
    }


## LLM-Based Scoring Functions (Structured Prompt)

In [30]:
def create_structured_prompt(resume, jd):
    prompt = f"""
You are an ATS Resume-JD Matcher Assistant.

Given these parsed fields:

Resume:
Title: {resume.get('basics', {}).get('current_title', '')}
Skills: {', '.join(resume.get('skills', []))}
Certifications: {', '.join(resume.get('certifications', []))}
Experience Years: {resume.get('total_experience_years', 0.0)}

Job Description:
Title: {jd.get('title', '')}
Required Skills: {', '.join(jd.get('required_skills', []))}
Required Experience: {jd.get('required_experience_years', 0.0)}
Required Certifications: {', '.join(jd.get('certifications', []))}

Evaluate and respond ONLY as JSON:
{{
    "soft_skills": {{"score": float, "details": "string"}},
    "transferable_skills": {{"score": float, "details": "string"}}
}}
"""
    return prompt


In [31]:
# 🤖 LLM-Based Scoring
import json


def llm_based_scoring(resume, jd):
    prompt = create_structured_prompt(resume, jd)
    try:
        response = llm_pipeline(prompt, max_new_tokens=300)[0]['generated_text']
        parsed_response = json.loads(response.split('{', 1)[1].rsplit('}', 1)[0].join(['{', '}']))
    except Exception as e:
        print(f"LLM Scoring Error: {e}")
        parsed_response = {
            "soft_skills": {"score": 0.5, "details": "LLM fallback"},
            "transferable_skills": {"score": 0.5, "details": "LLM fallback"}
        }
    return parsed_response


## Combine Section Scores

In [32]:
# 🔗 Combine all section scores

def combine_sections(rule_sections, llm_sections):
    all_sections = {**rule_sections, **llm_sections}
    
    total_weight = 0
    weighted_score_sum = 0
    
    section_weights = {
        "skills_match": 4.0,
        "experience_alignment": 2.0,
        "education_alignment": 1.0,
        "certifications_alignment": 1.0,
        "soft_skills": 1.0,
        "transferable_skills": 1.0
    }
    
    for section, content in all_sections.items():
        weight = section_weights.get(section, 1.0)
        weighted_score_sum += content['score'] * weight
        total_weight += weight
    
    overall_score = weighted_score_sum / total_weight if total_weight else 0.0
    return round(overall_score, 4), all_sections


## Main Scoring Loop

In [None]:
# 🔁 Main Scoring Loop (Corrected)
from tqdm import tqdm
from datetime import datetime


final_scores = []

for resume_idx, resume in tqdm(enumerate(resumes), desc="Scoring Resumes", total=len(resumes)):
    resume_id = f"resume_{resume_idx}"

    for jd_idx, jd in enumerate(jds):
        jd_id = f"jd_{jd_idx}"

        rule_sections = rule_based_scoring(resume, jd)
        llm_sections = llm_based_scoring(resume, jd)

        overall_score, merged_sections = combine_sections(rule_sections, llm_sections)

        final_scores.append({
            "resume_id": resume_id,
            "job_id": jd_id,
            "overall_score": overall_score,
            "sections": merged_sections,
            "scoring_timestamp": datetime.now().isoformat(),
            "model_used": "Nous-Hermes-2-Mistral-7B-DPO"
        })


## Save Final Rich JSON Output

In [None]:
# 💾 Save Final ATS Scores
save_json_output(final_scores, output_path ="results/final_ats_scores_rich.json")


## Visualize Score Distribution

In [None]:
# 📊 Score Distribution Plot
import matplotlib.pyplot as plt
scores = [item['overall_score'] for item in final_scores]

plt.figure(figsize=(10,6))
plt.hist(scores, bins=20, edgecolor='black')
plt.title('Distribution of Final ATS Scores')
plt.xlabel('Overall ATS Score')
plt.ylabel('Number of Resume-JD Pairs')
plt.grid(True)
plt.show()


## Test phase 3

#### Load Test Resumes and JDs

In [33]:
# 📂 Mini Test Resumes
test_resumes = [
    {
        "basics": {
            "name": "Alice Smith",
            "email": "alice@example.com",
            "phone": "123-456-7890",
            "location": "New York, NY",
            "current_title": "Software Engineer",
            "linkedin_url": ""
        },
        "education": [
            {"degree": "B.Sc. Computer Science", "field": "Computer Science", "institution": "NYU", "year": "2018", "gpa": "3.7"}
        ],
        "experience": [
            {"job_title": "Software Developer", "company": "ABC Corp", "start_date": "06/2018", "end_date": "08/2021", "duration_in_months": 38, "description": "Developed web applications."}
        ],
        "skills": ["Python", "Django", "SQL"],
        "certifications": ["AWS Certified Developer"],
        "projects": ["E-commerce platform"],
        "languages": ["English"],
        "total_experience_years": 3.2
    },
    {
        "basics": {
            "name": "Bob Johnson",
            "email": "bob@example.com",
            "phone": "987-654-3210",
            "location": "San Francisco, CA",
            "current_title": "Data Analyst",
            "linkedin_url": ""
        },
        "education": [
            {"degree": "B.A. Statistics", "field": "Statistics", "institution": "UCLA", "year": "2017", "gpa": "3.5"}
        ],
        "experience": [
            {"job_title": "Data Analyst", "company": "XYZ Inc", "start_date": "01/2018", "end_date": "12/2020", "duration_in_months": 36, "description": "Analyzed data trends."}
        ],
        "skills": ["SQL", "Tableau", "Python"],
        "certifications": [],
        "projects": ["Sales analytics dashboard"],
        "languages": ["English"],
        "total_experience_years": 3.0
    }
]

# 📂 Mini Test JDs
test_jds = [
    {
        "title": "Backend Engineer",
        "summary": "Looking for a backend engineer with 3+ years experience in Python and SQL. AWS certification preferred.",
        "required_experience_years": 3.0,
        "preferred_degrees": ["B.Sc. Computer Science"],
        "required_skills": ["Python", "SQL"],
        "optional_skills": ["Django"],
        "certifications": ["AWS Certified Developer"],
        "soft_skills": ["Teamwork", "Communication"],
        "job_location": "New York, NY",
        "remote_option": True,
        "employment_type": "Full-time"
    },
    {
        "title": "Business Data Analyst",
        "summary": "Seeking a Data Analyst with 2+ years experience in SQL, Excel, and data visualization tools.",
        "required_experience_years": 2.0,
        "preferred_degrees": ["B.A. Statistics"],
        "required_skills": ["SQL", "Excel"],
        "optional_skills": ["Tableau"],
        "certifications": [],
        "soft_skills": ["Analytical thinking", "Attention to detail"],
        "job_location": "San Francisco, CA",
        "remote_option": False,
        "employment_type": "Full-time"
    }
]


#### Scoring Loop (Test Batch)

In [35]:
# 🔁 Mini Scoring Loop (2x2 = 4 combinations)
from tqdm import tqdm
from datetime import datetime

test_final_scores = []

for resume_idx, resume in tqdm(enumerate(test_resumes), desc="Scoring Test Resumes", total=len(test_resumes)):
    resume_id = f"resume_{resume_idx}"

    for jd_idx, jd in enumerate(test_jds):
        jd_id = f"jd_{jd_idx}"

        rule_sections = rule_based_scoring(resume, jd)
        llm_sections = llm_based_scoring(resume, jd)

        overall_score, merged_sections = combine_sections(rule_sections, llm_sections)

        test_final_scores.append({
            "resume_id": resume_id,
            "job_id": jd_id,
            "overall_score": overall_score,
            "sections": merged_sections,
            "scoring_timestamp": datetime.now().isoformat(),
            "model_used": "Nous-Hermes-2-Mistral-7B-DPO"
        })


Scoring Test Resumes:   0%|          | 0/2 [00:00<?, ?it/s]

LLM Scoring Error: Expecting value: line 2 column 30 (char 31)


Scoring Test Resumes:  50%|█████     | 1/2 [00:12<00:12, 12.29s/it]

LLM Scoring Error: Expecting value: line 2 column 30 (char 31)
LLM Scoring Error: Expecting value: line 2 column 30 (char 31)


Scoring Test Resumes: 100%|██████████| 2/2 [00:21<00:00, 10.65s/it]

LLM Scoring Error: Expecting value: line 2 column 30 (char 31)





#### Save Mini Test Output

In [36]:
# 💾 Save Mini Test Output

save_json_output(test_final_scores, "results/test_final_ats_scores_rich.json")


✅ Saved output to results/test_final_ats_scores_rich.json


#### View Sample Output

In [37]:
# 📜 View first few records

for record in test_final_scores:
    print(json.dumps(record, indent=2))
    print("-" * 80)


{
  "resume_id": "resume_0",
  "job_id": "jd_0",
  "overall_score": 0.9,
  "sections": {
    "skills_match": {
      "score": 1.0,
      "details": "Matched skills: ['Python', 'SQL']"
    },
    "experience_alignment": {
      "score": 1.0,
      "details": "Resume experience: 3.2 yrs vs JD requirement: 3.0 yrs"
    },
    "education_alignment": {
      "score": 1.0,
      "details": "Matched degrees: ['b.sc. computer science']"
    },
    "certifications_alignment": {
      "score": 1.0,
      "details": "Matched certifications: ['AWS Certified Developer']"
    },
    "soft_skills": {
      "score": 0.5,
      "details": "LLM fallback"
    },
    "transferable_skills": {
      "score": 0.5,
      "details": "LLM fallback"
    }
  },
  "scoring_timestamp": "2025-04-26T03:01:13.826513",
  "model_used": "Nous-Hermes-2-Mistral-7B-DPO"
}
--------------------------------------------------------------------------------
{
  "resume_id": "resume_0",
  "job_id": "jd_1",
  "overall_score": 0.6,
