# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [None]:
import subprocess

def is_running_in_colab():
    """Report whether this notebook is executing inside Google Colab.

    Detection is done by attempting to import the ``google.colab`` package.

    Returns:
        bool: True when ``google.colab`` can be imported, False otherwise.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory of the first GPU listed by ``nvidia-smi``.

    The first line of the ``memory.free`` query is parsed as an integer and
    divided by 1024.

    Returns:
        float: The converted value, or 0.0 if the query or parsing fails for
        any reason.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        raw = subprocess.check_output(query, encoding="utf-8")
        first_line = raw.strip().split("\n")[0]
        return int(first_line) / 1024
    except Exception:
        # Best-effort probe: any failure is reported as "no memory available".
        return 0.0


## Install dependencies

In [None]:
# Install every third-party package the notebook uses. The Colab branch shells
# out with `!pip`; the local branch uses the `%pip` magic and additionally
# installs a PyTorch build from the PyTorch wheel index.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
    !pip install sentence-transformers scikit-learn
    !pip install rapidfuzz unidecode

else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Earlier pinned/stable torch installs, kept for reference:
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Currently active: pre-release torch from the nightly cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # bitsandbytes is installed after torch here (and upgraded with -U).
    %pip install -U bitsandbytes
    %pip install regex json5
    %pip install sentence-transformers scikit-learn
    %pip install rapidfuzz unidecode


## Log in to Hugging Face

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order: the HUGGINGFACE_TOKEN environment variable, then the Colab
# secret named 'HF_TOKEN' (only when running in Colab), then a hidden prompt.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass does not echo what is typed, so the secret does not end up
        # in the cell output the way it would with input().
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# NOTE(review): HF_TOKEN can still be None here (Colab ImportError path);
# login() then decides how to obtain credentials -- confirm that is intended.
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure ``~/.kaggle/kaggle.json`` exists, uploading it via Colab if needed.

    If the credentials file is already present nothing is changed. Otherwise
    the Colab upload widget is opened, one uploaded file is moved to
    ``~/.kaggle/kaggle.json`` and its permissions are set to ``0o600``.

    Raises:
        ImportError: The credentials file is missing and ``google.colab`` is
            not available, so there is no upload widget to fall back on.
        FileNotFoundError: The upload dialog returned no files.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError as err:
        # Same exception type as before, but with an actionable message.
        raise ImportError(
            f"Kaggle credentials not found at {kaggle_path} and the Colab upload "
            "widget is unavailable; place kaggle.json there manually."
        ) from err

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; kaggle.json is required.")

    # Only one file can become kaggle.json: prefer an upload with that exact
    # name, otherwise take the first one instead of overwriting repeatedly.
    filename = "kaggle.json" if "kaggle.json" in uploaded else next(iter(uploaded))
    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(filename, kaggle_path)
    os.chmod(kaggle_path, 0o600)
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; skipped entirely outside Colab.
if is_running_in_colab():
   from google.colab import drive
   drive.mount('/content/drive')

## Load Qwen-Instruct with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal LM and wrap it in a ``text-generation`` pipeline.

    On CUDA devices with less than 24 GB of total memory the model is loaded
    with 4-bit NF4 quantization; otherwise it is loaded in float16. Without
    CUDA the model is placed on the CPU.

    Args:
        model_name: Hugging Face model identifier or local path.
        hf_token: Access token forwarded to ``from_pretrained``.

    Returns:
        A ``transformers`` text-generation pipeline with ``batch_size=1``.
    """
    has_cuda = torch.cuda.is_available()
    # total_memory is the capacity of device 0, not the memory currently free.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {total_mem_gb:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and total_mem_gb < 24

    # Set quantization config (only built when 4-bit loading is selected)
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Fall back to EOS only when the tokenizer ships without a pad token,
        # so a model-defined pad token is not clobbered.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        # dtype is left unset when a quantization config is supplied
        torch_dtype=None if quant_config else torch.float16,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the text-generation pipeline once at notebook level; score_with_llm
# reads it through the global name `llm_pipeline`.
llm_pipeline = load_model_pipeline(
    model_name="Qwen/Qwen2-7B-Instruct",
    hf_token=HF_TOKEN
)

# Global utilities

### Utility to merge JSON files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append the list contents of matching JSON files to ``output_file``.

    Files in ``source_dir`` matching ``pattern`` are read in sorted order.
    Each file that contains a JSON list is appended to the records already in
    ``output_file`` (if any) and, once the combined output has been written,
    moved into ``merged_dir``. Files that cannot be parsed or that do not
    contain a list are reported and left where they are.

    Args:
        source_dir: Directory scanned for input files (created if missing).
        output_file: JSON file holding the combined list.
        pattern: Glob pattern, relative to ``source_dir``.
        merged_dir: Directory that receives successfully merged files
            (created if missing).
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                existing = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")
            else:
                # Keep a non-list payload as a single record so extend()
                # below always operates on a list.
                merged_data = existing if isinstance(existing, list) else [existing]

    output_resolved = output_file.resolve()
    pending_moves = []

    for file_path in sorted(source_dir.glob(pattern)):
        if file_path.resolve() == output_resolved:
            # Never merge the output file into itself.
            continue
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"⚠️ Failed to parse {file_path.name}: {e}")
            continue

        if not isinstance(data, list):
            # Left in place: nothing from this file was merged.
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)
        pending_moves.append(file_path)

    # Write combined output before touching the sources, so a failed write
    # cannot leave records stranded in merged_dir only.
    if merged_data:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

    # Move to merged folder
    for file_path in pending_moves:
        shutil.move(str(file_path), str(merged_dir / file_path.name))
        print(f"✅ Merged and moved: {file_path.name}")

# === Usage ===



### Utility to save JSON to a folder

In [None]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize ``data`` as UTF-8 JSON to ``output_path``.

    Args:
        data: JSON-serializable object.
        output_path: Destination file; missing parent directories are created.
        indent: Indentation passed to the JSON encoder.
        overwrite: When False, refuse to replace an existing file.

    Raises:
        FileExistsError: ``output_path`` exists and ``overwrite`` is False.
        TypeError: ``data`` is not JSON-serializable (raised before any
            existing file is touched).
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # A bare filename has no directory part; makedirs("") would raise.
        os.makedirs(output_dir, exist_ok=True)

    if not overwrite and os.path.exists(output_path):
        raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    # Serialize first so a failure cannot destroy the previous file contents.
    payload = json.dumps(data, indent=indent, ensure_ascii=False)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Saved output to {output_path}")


### Utility to load a JSON file

In [None]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    """Parse and return the JSON document stored at ``file_path``.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        return parsed
    raise FileNotFoundError(f"File not found: {file_path}")

In [None]:
from typing import List, Dict
# ─────────────────────────────────────────────────────────────
# Utility: Load structured JSON records from one or more files
# ─────────────────────────────────────────────────────────────
def load_json_records(paths: List[Path]) -> List[Dict]:
    """Read several JSON files and flatten them into one list of records.

    A file whose top-level value is a list contributes each of its items; any
    other top-level value is added as a single record. File order is kept.
    """
    collected: List[Dict] = []
    for source in paths:
        with open(source, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        if isinstance(payload, list):
            collected += payload
        else:
            collected.append(payload)
    return collected

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Directory names and flags shared by the notebook's phases."""

    DATASET_DOWNLOAD_DIR = "datasets"

    # Everything below JSON_OUTPUT_DIR is derived from it so the run folder
    # only has to be renamed in one place.
    JSON_OUTPUT_DIR = "json_outputs_run3"
    JSON_OUTPUT_NORMALIZED_DIR = JSON_OUTPUT_DIR + "/normalized"
    JSON_OUTPUT_NORMALIZED_JD = JSON_OUTPUT_NORMALIZED_DIR + "/jd"
    JSON_OUTPUT_NORMALIZED_RESUME = JSON_OUTPUT_NORMALIZED_DIR + "/resume"
    JSON_OUTPUT_SCORING_DIR = JSON_OUTPUT_DIR + "/scoring"

    AUTO_CLEANUP = True


# Pre-Phase 3: Embedding-Based Resume-JD Relevance Generator

In [None]:
from sentence_transformers import SentenceTransformer, util
from typing import Tuple

# Load embedding model once (lightweight)
domain_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_domain_similarity(domain1: str, domain2: str, threshold: float = 0.5) -> Tuple[float, bool]:
    """Compare two domain labels by the cosine similarity of their embeddings.

    Both strings are encoded with the module-level ``domain_model``.

    Args:
        domain1: First domain label.
        domain2: Second domain label.
        threshold: Minimum cosine similarity for the pair to count as similar.

    Returns:
        ``(similarity_score, is_similar)``: the cosine similarity rounded to
        four decimals, and whether the unrounded similarity is at least
        ``threshold``. If either label is empty or missing the pair is
        treated as matching and ``(1.0, True)`` is returned.
    """
    if domain1 and domain2:
        pair = domain_model.encode([domain1, domain2], convert_to_tensor=True)
        cosine = util.cos_sim(pair[0], pair[1]).item()
        return round(cosine, 4), cosine >= threshold

    # Treat empty/missing domain as matching
    return 1.0, True


In [None]:
# Smoke test: compare two related domain labels and show the score and verdict.
sim_score, is_similar = compute_domain_similarity("healthcare analytics", "medical data science")
print(f"Similarity: {sim_score}, Match: {is_similar}")


In [None]:
from typing import List, Dict
from pathlib import Path
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from collections import defaultdict

def generate_relevance_map(
    resume_files: List[Path],
    jd_files: List[Path],
    model_name: str = "all-MiniLM-L6-v2",
    min_score: float = 0.2,
    top_k: int = 3,
) -> Dict[str, object]:
    """Score every resume against every JD by embedding similarity.

    Records are loaded with ``load_json_records`` and kept only if they have
    truthy ``input_text`` and ``output_json`` fields. A resume-JD pair is
    reported when the cosine similarity of the two ``input_text`` embeddings
    is at least ``min_score`` and ``compute_domain_similarity`` accepts the
    pair's lower-cased, stripped ``domain`` values (threshold 0.5).

    Args:
        resume_files: JSON files containing resume records.
        jd_files: JSON files containing job-description records.
        model_name: SentenceTransformer model used for the text embeddings.
        min_score: Pairs scoring below this similarity are dropped.
        top_k: Number of best-scoring JDs kept per resume.

    Returns:
        A dict with ``"semantic_relevance_scores"`` (list of all retained
        pairs) and ``"resume_top_matches"`` (resume id -> its ``top_k`` pairs
        sorted by descending similarity).
    """
    def usable(records):
        return [rec for rec in records if rec.get("input_text") and rec.get("output_json")]

    resumes = usable(load_json_records(resume_files))
    jds = usable(load_json_records(jd_files))

    results = []
    if resumes and jds:
        model = SentenceTransformer(model_name)
        resume_embeds = model.encode([r["input_text"] for r in resumes], convert_to_tensor=True)
        jd_embeds = model.encode([j["input_text"] for j in jds], convert_to_tensor=True)
        scores = util.cos_sim(resume_embeds, jd_embeds).cpu().numpy()

        # Domain strings repeat heavily across pairs; cache one result per
        # distinct (resume_domain, jd_domain) combination instead of encoding
        # the same two strings for every pair.
        domain_cache = {}
        jd_domains = [str(jd.get("domain") or "").strip().lower() for jd in jds]

        for i, resume in enumerate(resumes):
            r_domain = str(resume.get("domain") or "").strip().lower()
            for j, jd in enumerate(jds):
                score = float(scores[i][j])
                if score < min_score:
                    continue  # cheap filter first; both filters must pass anyway

                domain_key = (r_domain, jd_domains[j])
                if domain_key not in domain_cache:
                    domain_cache[domain_key] = compute_domain_similarity(
                        r_domain, jd_domains[j], threshold=0.5
                    )
                domain_score, domain_match = domain_cache[domain_key]
                if not domain_match:
                    continue  # Skip this resume–JD pair

                label = (
                    "strong" if score >= 0.65 else
                    "medium" if score >= 0.4 else
                    "weak"
                )

                # Append result with extended metadata
                results.append({
                    "resume_id": resume.get("record_id"),
                    "jd_id": jd.get("record_id"),
                    "resume_domain": resume.get("domain"),
                    "jd_domain": jd.get("domain"),
                    "domain_similarity": round(domain_score, 3),
                    "resume_jd_similarity": round(score, 3),
                    "semantic_match_label": label
                })

    print(f"Generated relevance map with {len(results)} pairs.")
    label_counts = {"weak": 0, "medium": 0, "strong": 0}
    for entry in results:
        label_counts[entry["semantic_match_label"]] += 1
    print(
        f"Weak matches: {label_counts['weak']}, "
        f"Medium matches: {label_counts['medium']}, "
        f"Strong matches: {label_counts['strong']}"
    )

    # Group by resume_id, then keep the top_k highest-similarity JDs for each
    by_resume = defaultdict(list)
    for entry in results:
        by_resume[entry["resume_id"]].append(entry)

    resume_top_matches = {
        resume_id: sorted(matches, key=lambda x: x["resume_jd_similarity"], reverse=True)[:top_k]
        for resume_id, matches in by_resume.items()
    }

    return {
        "semantic_relevance_scores": results,
        "resume_top_matches": resume_top_matches
    }



In [None]:
from pathlib import Path

# Collect the normalized resume/JD files, compute the relevance map, and save
# it as semantic_relevance_scores.json in the scoring output directory.
resume_paths = list(Path(Config.JSON_OUTPUT_NORMALIZED_RESUME).glob("resumes_*.json"))
jd_paths = list(Path(Config.JSON_OUTPUT_NORMALIZED_JD).glob("jds_*.json"))
relevance_data = generate_relevance_map(resume_paths, jd_paths)
relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
save_json_output(relevance_data, relevance_map_file)


# Phase 3 Rubric-Based Scoring Engine

## Rule-Based Scoring Functions

In [None]:
# Tunables for the rule-based scorers.
#   "matching": fuzzy-match cut-offs read by hybrid_match (a per-section value
#               wins over the default).
#   "weights":  per-section credit given to each match type when match counts
#               are turned into a score.
scoring_config = {
    "matching": {
        # Global fuzzy threshold for fuzzy matching (0–100)
        "fuzzy_threshold_default": 85,
        # Per-section overrides of the default threshold
        "section_thresholds": {
            "skills": 85,
            "tools": 80,
            "certifications": 88,
            "responsibilities": 83,
            "education": 87
        }
    },
    "weights": {
        "skills": {
            "exact": 1.0,
            "substring": 0.8,
            "fuzzy": 0.5
        },
        "tools": {
            "exact": 1.0,
            "substring": 0.7,
            "fuzzy": 0.4
        },
        "certifications": {
            "exact": 1.0,
            "substring": 0.9,
            "fuzzy": 0.5
        },
        "responsibilities": {
            "exact": 1.0,
            "substring": 0.85,
            "fuzzy": 0.5
        },
        "education": {
            "exact": 1.0,
            "substring": 0.75,
            "fuzzy": 0.5
        }
    }
}


In [None]:
from rapidfuzz import fuzz
import re
from typing import List, Tuple, Dict, Any
from unidecode import unidecode


In [None]:
def normalize(text: str) -> str:
    """Canonicalize text for matching.

    Lower-cases the input, transliterates it with ``unidecode``, replaces
    every character that is neither a word character nor whitespace with a
    space, collapses whitespace runs to single spaces and trims the ends.
    """
    transliterated = unidecode(text.lower())
    without_punctuation = re.sub(r"[^\w\s]", " ", transliterated)
    collapsed = re.sub(r"\s+", " ", without_punctuation)
    return collapsed.strip()

def normalize_score(score: float) -> float:
    """Round ``score`` to three decimals and clamp it to the range [0.0, 1.0]."""
    rounded = round(score, 3)
    upper_bounded = 1.0 if 1.0 < rounded else rounded
    return upper_bounded if upper_bounded > 0.0 else 0.0

In [None]:
def hybrid_match(jd_terms: List[str], resume_text: str, section: str) -> Tuple[Dict[str, int], List[Dict[str, str]]]:
    """Classify each JD term by how it is found in ``resume_text``.

    For every term, the first rule that applies decides its type:
      * ``exact``     - the normalized term equals one of the resume's tokens
        (runs of 3+ word/``+``/``-``/``.``/``#`` characters in the
        lower-cased text);
      * ``substring`` - the normalized term occurs inside the normalized text;
      * ``fuzzy``     - ``fuzz.partial_ratio`` against the normalized text
        reaches the section's threshold from ``scoring_config``.
    Terms matching none of the rules are not reported.

    Args:
        jd_terms: Terms taken from the job description.
        resume_text: Free text assembled from the resume.
        section: Section name used to look up the fuzzy threshold.

    Returns:
        ``(counts, matched_terms)``: a dict with the number of ``exact``,
        ``substring`` and ``fuzzy`` hits, and a list of
        ``{"term": ..., "type": ...}`` entries in input order.
    """
    tally = {"exact": 0, "substring": 0, "fuzzy": 0}
    breakdown = []
    if not jd_terms:
        return tally, breakdown

    # Per-section threshold, falling back to the global default (then 85).
    matching_cfg = scoring_config["matching"]
    default_cutoff = matching_cfg.get("fuzzy_threshold_default", 85)
    cutoff = matching_cfg.get("section_thresholds", {}).get(section, default_cutoff)

    word_set = set(re.findall(r"\b[\w\+\-\.#]{3,}\b", resume_text.lower()))
    haystack = normalize(resume_text)

    for term in jd_terms:
        needle = normalize(term)

        if needle in word_set:
            kind = "exact"
        elif needle in haystack:
            kind = "substring"
        elif fuzz.partial_ratio(needle, haystack) >= cutoff:
            kind = "fuzzy"
        else:
            continue

        tally[kind] += 1
        breakdown.append({"term": term, "type": kind})

    return tally, breakdown

In [None]:
def score_from_match_counts(counts: Dict[str, int], total: int, weights: Dict[str, float]) -> float:
    """Convert match counts into a score via a weighted average.

    Args:
        counts: Number of ``exact``, ``substring`` and ``fuzzy`` matches.
        total: Number of terms that were looked for.
        weights: Credit per match type; missing entries default to 1.0
            (exact), 0.8 (substring) and 0.5 (fuzzy).

    Returns:
        The weighted sum divided by ``total``, passed through
        ``normalize_score``; 1.0 when ``total`` is zero.
    """
    defaults = (("exact", 1.0), ("substring", 0.8), ("fuzzy", 0.5))
    weighted_sum = sum(weights.get(kind, fallback) * counts[kind] for kind, fallback in defaults)
    if not total:
        return 1.0
    return normalize_score(weighted_sum / total)

In [None]:
def score_skills_rule(resume_skills, resume_other, jd_required, jd_optional):
    """Score how well the resume covers the JD's required and optional skills.

    The resume text searched is the skills list plus the ``content`` of every
    "other" section. Required and optional skills are matched separately with
    ``hybrid_match`` and combined as 80% required + 20% optional.

    Args:
        resume_skills: List of skill strings from the resume (may be None).
        resume_other: List of dicts with a ``content`` field (may be None).
        jd_required: Required skills from the JD (may be None).
        jd_optional: Optional skills from the JD (may be None).

    Returns:
        ``(score, reason)``: the combined score and a text summary of the
        match counts and matched terms.
    """
    # Explicit JSON nulls arrive as None; treat them as "nothing listed" so
    # len() below cannot fail (the other scorers already tolerate None).
    jd_required = jd_required or []
    jd_optional = jd_optional or []

    resume_text = (
        " ".join(resume_skills or []) + " " +
        " ".join((section.get("content") or "") for section in resume_other or [])
    )

    weights = scoring_config["weights"].get("skills", {"exact": 1.0, "substring": 0.8, "fuzzy": 0.5})

    r_counts, r_matches = hybrid_match(jd_required, resume_text, section="skills")
    o_counts, o_matches = hybrid_match(jd_optional, resume_text, section="skills")

    r_score = score_from_match_counts(r_counts, len(jd_required), weights)
    o_score = score_from_match_counts(o_counts, len(jd_optional), weights)

    final_score = normalize_score(0.8 * r_score + 0.2 * o_score)
    reason = (
        f"Required: {r_counts}, Optional: {o_counts}. "
        f"Matched skills: {[m['term'] + ' (' + m['type'] + ')' for m in r_matches + o_matches]}"
    )
    return final_score, reason


In [None]:
def score_certifications_rule(resume_certs, resume_other, jd_certs):
    """Score how many of the JD's certifications appear in the resume.

    The text searched is every ``certification`` value plus the ``content``
    of every "other" section.

    Args:
        resume_certs: List of dicts with a ``certification`` field (may be None).
        resume_other: List of dicts with a ``content`` field (may be None).
        jd_certs: Certification names requested by the JD.

    Returns:
        ``(score, reason)``; ``(1.0, ...)`` when the JD lists no
        certifications.
    """
    if not jd_certs:
        return 1.0, "No certifications required by JD."

    # `or ""` guards against explicit null field values, which would make
    # str.join raise (score_education_rule handles degrees the same way).
    cert_text = (
        " ".join((cert.get("certification") or "") for cert in resume_certs or []) + " " +
        " ".join((section.get("content") or "") for section in resume_other or [])
    )

    weights = scoring_config["weights"].get("certifications", {"exact": 1.0, "substring": 0.9, "fuzzy": 0.5})

    counts, matches = hybrid_match(jd_certs, cert_text, section="certifications")
    score = score_from_match_counts(counts, len(jd_certs), weights)

    reason = f"Certifications matched: {counts}. Matched: {[m['term'] + ' (' + m['type'] + ')' for m in matches]}"
    return score, reason


In [None]:
def score_education_rule(resume_education, jd_degrees):
    """Score how many of the JD's preferred degrees appear in the resume.

    Only the ``degree`` field of each education entry is searched.

    Args:
        resume_education: List of dicts with a ``degree`` field.
        jd_degrees: Degree names preferred by the JD.

    Returns:
        ``(score, reason)``; ``(1.0, ...)`` when the JD lists no degrees and
        ``(0.0, ...)`` when the resume has no education entries.
    """
    if not jd_degrees:
        return 1.0, "No preferred degrees listed in JD."
    if not resume_education:
        return 0.0, "No education information found in resume."

    degree_names = [(entry.get("degree", "") or "") for entry in resume_education]
    resume_text = " ".join(degree_names)

    weights = scoring_config["weights"].get("education", {"exact": 1.0, "substring": 0.75, "fuzzy": 0.5})

    counts, matches = hybrid_match(jd_degrees, resume_text, section="education")
    score = score_from_match_counts(counts, len(jd_degrees), weights)

    labelled = [m['term'] + ' (' + m['type'] + ')' for m in matches]
    reason = f"Degrees matched: {counts}. Matched: {labelled}"
    return score, reason


In [None]:
def score_experience_rule(resume_years, jd_required_years, cap: float = 40.0):
    """Score resume experience against the JD's required years.

    The requirement is the smallest number found in ``jd_required_years``
    (so "3-5 years" requires 3). The score is the resume's years, capped at
    ``cap``, divided by that requirement and normalized with
    ``normalize_score``.

    Args:
        resume_years: Total years of experience on the resume, or None.
        jd_required_years: Requirement as text (e.g. "3+ years") or a number.
        cap: Upper bound applied to ``resume_years`` before scoring.

    Returns:
        ``(score, reason)``. 0.5 when either side is missing; 1.0 when the JD
        text holds no number or requires 0 years.
    """
    # Accept numeric requirements too; re.findall needs a string.
    jd_text = "" if jd_required_years is None else str(jd_required_years).strip()
    if not jd_text or resume_years is None:
        return 0.5, "Missing required or actual experience data."

    # Convert before taking the minimum: comparing the raw strings is
    # lexicographic and would rank "10" below "5".
    numbers = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', jd_text)]
    if not numbers:
        return 1.0, "JD experience string did not specify clear years."

    required_years = min(numbers)
    if required_years == 0:
        return 1.0, "JD required years = 0."

    resume_years_capped = min(resume_years, cap)
    score = resume_years_capped / required_years
    reason = f"Resume: {resume_years} yrs (capped to {resume_years_capped}), JD requires: {required_years} yrs."
    return normalize_score(score), reason

In [None]:
def score_tools_rule(resume_skills, resume_experience, resume_other, resume_projects, jd_tools):
    """Score how many of the JD's tools and technologies appear in the resume.

    The text searched is the skills list, every experience ``description``
    (a list of strings), every "other" section's ``content`` and every
    project ``description``.

    Args:
        resume_skills: List of skill strings (may be None).
        resume_experience: List of dicts with a list-valued ``description``.
        resume_other: List of dicts with a ``content`` field (may be None).
        resume_projects: List of dicts with a string ``description``.
        jd_tools: Tool names requested by the JD.

    Returns:
        ``(score, reason)``; ``(1.0, ...)`` when the JD lists no tools.
    """
    if not jd_tools:
        return 1.0, "No tools required by JD."

    # `or []` / `or ""` guard against explicit null field values, which would
    # otherwise make str.join raise.
    resume_text = (
        " ".join(resume_skills or []) + " " +
        " ".join(" ".join(exp.get("description") or []) for exp in resume_experience or []) + " " +
        " ".join((section.get("content") or "") for section in resume_other or []) + " " +
        " ".join((project.get("description") or "") for project in resume_projects or [])
    )

    weights = scoring_config["weights"].get("tools", {"exact": 1.0, "substring": 0.7, "fuzzy": 0.4})

    counts, matches = hybrid_match(jd_tools, resume_text, section="tools")
    score = score_from_match_counts(counts, len(jd_tools), weights)

    reason = f"Tools matched: {counts}. Matched: {[m['term'] + ' (' + m['type'] + ')' for m in matches]}"
    return score, reason


In [None]:
def score_responsibilities_rule(resume_experience, resume_other, resume_projects, jd_responsibilities):
    """Score how many of the JD's responsibilities appear in the resume.

    The text searched is every experience ``description`` (a list of
    strings), every "other" section's ``content`` and every project
    ``description``.

    Args:
        resume_experience: List of dicts with a list-valued ``description``.
        resume_other: List of dicts with a ``content`` field (may be None).
        resume_projects: List of dicts with a string ``description``.
        jd_responsibilities: Responsibility statements from the JD.

    Returns:
        ``(score, reason)``; ``(1.0, ...)`` when the JD lists none.
    """
    if not jd_responsibilities:
        return 1.0, "No responsibilities listed in JD."

    # `or []` / `or ""` guard against explicit null field values, which would
    # otherwise make str.join raise.
    resume_text = (
        " ".join(" ".join(exp.get("description") or []) for exp in resume_experience or []) + " " +
        " ".join((section.get("content") or "") for section in resume_other or []) + " " +
        " ".join((project.get("description") or "") for project in resume_projects or [])
    )

    # Fallback mirrors the "responsibilities" entry in scoring_config.
    weights = scoring_config["weights"].get("responsibilities", {"exact": 1.0, "substring": 0.85, "fuzzy": 0.5})

    counts, matches = hybrid_match(jd_responsibilities, resume_text, section="responsibilities")
    score = score_from_match_counts(counts, len(jd_responsibilities), weights)

    reason = f"Responsibilities matched: {counts}. Matched: {[m['term'] + ' (' + m['type'] + ')' for m in matches]}"
    return score, reason


In [None]:
def compute_all_rule_scores(resume_json: Dict[str, Any], jd_json: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Run every rule-based scorer for one resume/JD pair.

    Args:
        resume_json: Structured resume; the keys read are ``skills``,
            ``other``, ``certifications``, ``education``,
            ``total_experience_years``, ``experience`` and ``projects``.
        jd_json: Structured job description; the keys read are
            ``required_skills``, ``optional_skills``, ``certifications``,
            ``preferred_degrees``, ``required_experience_years``,
            ``tools_and_technologies`` and ``job_responsibilities``.

    Returns:
        A dict mapping each section name (``skills``, ``certifications``,
        ``education``, ``experience``, ``tools``, ``responsibilities``) to
        ``{"score": ..., "reason": ...}``.
    """
    def as_entry(outcome):
        value, why = outcome
        return {"score": value, "reason": why}

    # Resume fields shared by several scorers.
    skills = resume_json.get("skills", [])
    other_sections = resume_json.get("other", [])
    experience = resume_json.get("experience", [])
    projects = resume_json.get("projects", [])

    return {
        "skills": as_entry(score_skills_rule(
            skills,
            other_sections,
            jd_json.get("required_skills", []),
            jd_json.get("optional_skills", []),
        )),
        "certifications": as_entry(score_certifications_rule(
            resume_json.get("certifications", []),
            other_sections,
            jd_json.get("certifications", []),
        )),
        "education": as_entry(score_education_rule(
            resume_json.get("education", []),
            jd_json.get("preferred_degrees", []),
        )),
        "experience": as_entry(score_experience_rule(
            resume_json.get("total_experience_years", 0.0),
            jd_json.get("required_experience_years", ""),
        )),
        "tools": as_entry(score_tools_rule(
            skills,
            experience,
            other_sections,
            projects,
            jd_json.get("tools_and_technologies", []),
        )),
        "responsibilities": as_entry(score_responsibilities_rule(
            experience,
            other_sections,
            projects,
            jd_json.get("job_responsibilities", []),
        )),
    }

### Unit tests for each scorer

In [None]:
# Manual check of score_skills_rule: "MongoDB" and "Tableau" appear only in
# the "other" section, and "Java" appears nowhere in the resume.
print("=== Testing: score_skills_rule ===")
resume_skills = ["Python", "SQL", "Excel"]
resume_other = [{"section_name": "Training", "content": "Completed MongoDB, Tableau, Excel"}]
jd_required_skills = ["Python", "MongoDB"]
jd_optional_skills = ["Tableau", "Java"]

score, reason = score_skills_rule(resume_skills, resume_other, jd_required_skills, jd_optional_skills)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
# Manual check of score_certifications_rule: one JD certification is in the
# certifications list, the other only in an "other" section.
print("=== Testing: score_certifications_rule ===")
resume_certs = [{"certification": "AWS Certified"}, {"certification": "Azure"}]
resume_other = [{"section_name": "Achievements", "content": "Google Cloud certified"}]
jd_certs = ["AWS Certified", "Google Cloud"]

score, reason = score_certifications_rule(resume_certs, resume_other, jd_certs)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
# Manual check of score_education_rule: "Computer Science" is contained in a
# resume degree, "Information Technology" is not.
print("=== Testing: score_education_rule ===")
resume_education = [{"degree": "Bachelor of Computer Science"}, {"degree": "MBA"}]
jd_degrees = ["Computer Science", "Information Technology"]

score, reason = score_education_rule(resume_education, jd_degrees)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
# Manual check of score_experience_rule with a range-style requirement string.
print("=== Testing: score_experience_rule ===")
resume_years = 6.0
jd_experience = "3–5 years"

score, reason = score_experience_rule(resume_years, jd_experience)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
# Manual check of score_tools_rule: the JD tools are spread over skills,
# experience, "other" and projects; "GCP" appears nowhere in the resume.
print("=== Testing: score_tools_rule ===")
resume_skills = ["Python", "Docker"]
resume_experience = [
    {"job_title": "DevOps Engineer", "description": ["Used AWS, Docker, and Jenkins"]},
    {"job_title": "Software Engineer", "description": ["Built APIs with Flask"]}
]
resume_other = [{"section_name": "Misc", "content": "Worked on Kubernetes and Terraform"}]
resume_projects = [{"description": "Built ML model with Scikit-learn and deployed on AWS"}]
jd_tools = ["AWS", "Docker", "Kubernetes", "GCP"]

score, reason = score_tools_rule(resume_skills, resume_experience, resume_other, resume_projects, jd_tools)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
# Manual check of score_responsibilities_rule: two JD responsibilities are
# copied verbatim from the resume, "Built ETL workflows" is not in it.
print("=== Testing: score_responsibilities_rule ===")
resume_experience = [
    {"job_title": "Data Analyst", "description": ["Created dashboards using Power BI", "Cleaned large datasets"]},
]
resume_other = [{"section_name": "Leadership", "content": "Led team of 5 analysts"}]
resume_projects = [{"description": "Automated data pipeline using Python"}]
jd_responsibilities = [
    "Created dashboards using Power BI",
    "Automated data pipeline using Python",
    "Built ETL workflows"
]

score, reason = score_responsibilities_rule(resume_experience, resume_other, resume_projects, jd_responsibilities)
print(f"Score: {score}\nReason: {reason}\n")


In [None]:
from pprint import pprint

# End-to-end manual check: run all rule-based scorers on one sample
# resume/JD pair and pretty-print the per-section results.
print("=== Testing: compute_all_rule_scores ===")

resume_json = {
    "skills": ["Python", "Docker"],
    "certifications": [{"certification": "AWS Certified"}, {"certification": "Azure"}],
    "education": [{"degree": "Bachelor of Computer Science"}, {"degree": "MBA"}],
    "total_experience_years": 4.5,
    "experience": [
        {"job_title": "DevOps Engineer", "description": ["Used AWS, Docker, and Jenkins"]},
        {"job_title": "Data Analyst", "description": ["Created dashboards using Power BI"]}
    ],
    "other": [
        {"section_name": "Leadership", "content": "Led team of 5 analysts"},
        {"section_name": "Achievements", "content": "Google Cloud certified"}
    ],
    "projects": [
        {"description": "Built ML model with Scikit-learn and deployed on AWS"}
    ]
}

jd_json = {
    "required_skills": ["Python", "MongoDB"],
    "optional_skills": ["Tableau", "Java"],
    "certifications": ["AWS Certified", "Google Cloud"],
    "preferred_degrees": ["Computer Science", "Information Technology"],
    "required_experience_years": "3+ years",
    "tools_and_technologies": ["AWS", "Docker", "Kubernetes", "GCP"],
    "job_responsibilities": [
        "Created dashboards using Power BI",
        "Automated data pipeline using Python",
        "Built ETL workflows"
    ]
}

results = compute_all_rule_scores(resume_json, jd_json)


pprint(results)


## LLM-Based Scoring Functions (Structured Prompt)

In [None]:
# Output shape shown to the LLM: one {"score", "reason"} object per section.
# This is prompt text, not machine-validated JSON (`float`/`str` are type
# placeholders); it is substituted into LLM_SCORING_PROMPT_TEMPLATE.
LLM_SCORING_SCHEMA = """{
  "skills": {
    "score": float,
    "reason": str
  },
  "certifications": {
    "score": float,
    "reason": str
  },
  "education": {
    "score": float,
    "reason": str
  },
  "experience": {
    "score": float,
    "reason": str
  },
  "tools": {
    "score": float,
    "reason": str
  },
  "responsibilities": {
    "score": float,
    "reason": str
  },
  "soft_skills": {
    "score": float,
    "reason": str
  },
  "transferable_skills": {
    "score": float,
    "reason": str
  },
  "leadership": {
    "score": float,
    "reason": str
  },
  "grammar_cleanliness": {
    "score": float,
    "reason": str
  }
}"""


In [None]:
# Prompt for LLM-based scoring. Filled with str.format, so the only braces in
# the template are the three placeholders: {resume_json}, {jd_json}, {schema}.
LLM_SCORING_PROMPT_TEMPLATE = """
You are an expert resume evaluator.

Your task is to **compare** a candidate's resume and a job description and assign **section-wise ATS scores**. Each section receives:
- a score between 0.0 and 1.0
- a short reason explaining why

You must return a valid JSON object. Do not return the resume. Do not repeat input. Do not include markdown or explanations.

RESUME:
{resume_json}

JOB DESCRIPTION:
{jd_json}

Output format (STRICTLY FOLLOW THIS STRUCTURE):
{schema}

Now respond ONLY with a JSON object in this format:
"""


In [None]:
import regex
import json5
from typing import Dict

def extract_json_block(text: str) -> Dict:
    """Extract the last scoring object from LLM output.

    Brace-balanced blocks are located with a recursive regex and parsed with
    ``json5`` (which tolerates trailing commas and single quotes). Blocks are
    tried from last to first. A block is accepted only if it is a dict in
    which at least one of the keys ``skills``, ``experience``, ``education``
    or ``certifications`` maps to an object containing ``"score"``. Checking
    the value shape, not just the key name, prevents an echoed resume or job
    description (which use the same key names) from being returned as scores.

    Args:
        text: Raw text produced by the LLM.

    Returns:
        The parsed scoring dict.

    Raises:
        ValueError: No block in ``text`` parses into a scoring object.
    """
    # Normalize smart quotes
    text = text.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")

    # Match all nested JSON-like blocks
    matches = regex.findall(r"\{(?:[^{}]|(?R))*\}", text, flags=regex.DOTALL)

    expected_keys = {"skills", "experience", "education", "certifications"}

    for block in reversed(matches):
        try:
            parsed = json5.loads(block)
        except Exception:
            # Not parseable even with json5's relaxed grammar; try the next one.
            continue

        if not isinstance(parsed, dict):
            continue

        has_scored_section = any(
            isinstance(parsed.get(key), dict) and "score" in parsed[key]
            for key in expected_keys
        )
        if has_scored_section:
            return parsed

    print("❌ No valid JSON block found in LLM output.")
    print("🔎 Last few lines:\n", text[-500:])
    raise ValueError("No valid JSON block found.")


In [None]:
def score_with_llm(resume_json: dict, jd_json: dict, resume_id="resume", jd_id="jd") -> dict:
    """
    Use an LLM pipeline to compute ATS scores with reasoning per section.

    The prompt is built from ``LLM_SCORING_PROMPT_TEMPLATE`` and sent to the
    global ``llm_pipeline`` with greedy decoding; the scoring object is then
    pulled out of the completion with ``extract_json_block``.

    Args:
        resume_json: Structured resume to evaluate.
        jd_json: Structured job description to compare against.
        resume_id: Label used only in the error message.
        jd_id: Label used only in the error message.

    Returns:
        The parsed scoring dict, or ``{}`` if generation or parsing fails.
    """
    prompt = LLM_SCORING_PROMPT_TEMPLATE.format(
        schema=LLM_SCORING_SCHEMA,
        resume_json=json.dumps(resume_json, indent=2),
        jd_json=json.dumps(jd_json, indent=2)
    )

    # Defined up front so the error handler can always print it, even when
    # the pipeline call itself is what failed.
    response_text = ""

    try:
        outputs = llm_pipeline(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None,
            pad_token_id=llm_pipeline.tokenizer.pad_token_id,
            # Return only the completion; otherwise the prompt (with the
            # resume/JD JSON and the schema) is part of the text being parsed.
            return_full_text=False
        )
        response_text = outputs[0]["generated_text"]
        #print("💬 LLM response preview:\n", response_text)

        return extract_json_block(response_text)

    except Exception as e:
        print(f"❌ LLM inference failed for {resume_id} x {jd_id}: {str(e)}")
        print("🧪 Raw output preview:\n", response_text)
        return {}


In [None]:
from pprint import pprint

# Manual check of score_with_llm on a small hand-written resume/JD pair.
print("=== Testing: score_with_llm ===")

# Sample resume JSON
resume_json = {
    "skills": ["Python", "Docker"],
    "certifications": [{"certification": "AWS Certified"}, {"certification": "Azure"}],
    "education": [{"degree": "Bachelor of Computer Science"}, {"degree": "MBA"}],
    "total_experience_years": 4.5,
    "experience": [
        {"job_title": "DevOps Engineer", "description": ["Used AWS, Docker, and Jenkins"]},
        {"job_title": "Data Analyst", "description": ["Created dashboards using Power BI"]}
    ],
    "other": [
        {"section_name": "Leadership", "content": "Led team of 5 analysts"},
        {"section_name": "Achievements", "content": "Google Cloud certified"}
    ],
    "projects": [
        {"description": "Built ML model with Scikit-learn and deployed on AWS"}
    ]
}

# Sample job description JSON
jd_json = {
    "required_skills": ["Python", "MongoDB"],
    "optional_skills": ["Tableau", "Java"],
    "certifications": ["AWS Certified", "Google Cloud"],
    "preferred_degrees": ["Computer Science", "Information Technology"],
    "required_experience_years": "3+ years",
    "tools_and_technologies": ["AWS", "Docker", "Kubernetes", "GCP"],
    "job_responsibilities": [
        "Created dashboards using Power BI",
        "Automated data pipeline using Python",
        "Built ETL workflows"
    ]
}

# Run LLM-based scoring
llm_scores = score_with_llm(resume_json, jd_json)

print("=== LLM Scoring Output ===")
pprint(llm_scores)


In [None]:
from pprint import pprint

# Second smoke test: run score_with_llm on a full-size resume/JD record pair.
# This cell rebinds the notebook globals resume_json, jd_json and llm_scores.
print("=== Testing: score_with_llm ===")

# Sample resume JSON
# NOTE(review): the fixture is kept verbatim, including values that look like
# parsing artifacts ("year": ",") and repeated entries in "skills"
# (e.g. "Client" and "Data Analysis" each appear twice) — presumably raw
# pipeline output; confirm before cleaning it up.
resume_json = {
            "resume_id": 88907739,
            "total_experience_years": 11.3,
            "summary": "High-achieving management professional and effective consultant possessing excellent communication, organizational and analytical capabilities with about 4 years of experience in devising innovative strategies and solutions to resolve complex business challenges.",
            "education": [
                {
                    "degree": "Master of Science",
                    "field": "Software Management",
                    "institution": "Carnegie Mellon University",
                    "year": ",",
                    "gpa": 3.8
                },
                {
                    "degree": "MBA",
                    "field": "International Business",
                    "institution": "Institute of Technology & Management",
                    "year": ",",
                    "gpa": 4.0
                },
                {
                    "degree": "MBA",
                    "field": "International Business",
                    "institution": "International Business Institute of Technology and Management India",
                    "year": ",",
                    "gpa": 4.0
                }
            ],
            "experience": [
                {
                    "job_title": "Consultant",
                    "company": "Company Name",
                    "start_date": "06/2015",
                    "end_date": "Current",
                    "description": [
                        "Managed and delivered a project to implement and integrate a new content management platform to create a unified brand experience, support scalability, growth and enhance digital presence for client's business - post acquisition",
                        "Led cross-functional global teams consisting of technical, business and functional representatives and achieved key milestones on time with quality deliverables",
                        "Prioritized, escalated and resolved issues with internal and external stakeholders",
                        "Directly managed 3rd party vendor and offshore teams."
                    ]
                },
                {
                    "job_title": "Product Strategy Intern",
                    "company": "Company Name",
                    "start_date": "09/2015",
                    "end_date": "12/2015",
                    "description": [
                        "Led a practicum team at Carnegie Mellon University to understand IBM Bluemix (PaaS), cloud based solution and use business frameworks to perform market, competitor and customer journey analysis",
                        "Liaised with cross functional teams to assess opportunities in marketplace, determine synergies and align business unit goals with corporate strategy",
                        "Worked with senior management and stakeholders to develop strategy for to enhance awareness, increase conversion and explore new market opportunities to scale the client's user base."
                    ]
                },
                {
                    "job_title": "Assistant Operations Manager",
                    "company": "Company Name",
                    "start_date": "07/2012",
                    "end_date": "10/2013",
                    "description": [
                        "Business Strategy & Vendor Management: Automation of Hub, typical model and replication",
                        "Reported to Chief Operating Officer to recommend company wide automation strategies and vendor selection",
                        "Conducted gap analysis, market research, competitor and financial analysis to propose short, mid and long term strategies to the Executive team",
                        "Project Management: RFID Project Member of the core project management team responsible for coordinated of cross-functional teams to achieve project milestones",
                        "Focused on process improvement and optimization to enhance team productivity",
                        "Defined the Key Performance Indicator's to evaluate vendors."
                    ]
                }
            ],
            "skills": [
                "Strategy & Operations Process Optimization",
                "Digital Transformation",
                "Cross Functional Team Management",
                "Project/Product Management",
                "Agile/Lean Methodologies",
                "Work History",
                "Client",
                "Data Analysis",
                "E-Commerce",
                "senior management",
                "Financial",
                "financial analysis",
                "functional",
                "Google Analytics",
                "Government",
                "Hub",
                "IBM",
                "International Business",
                "investments",
                "IP",
                "Marketing plan",
                "market research",
                "Market Strategy",
                "marketing",
                "market",
                "MBA",
                ".NET",
                "academic",
                "ADA",
                "Adobe",
                "Apple",
                "approach",
                "Automation",
                "business development",
                "Business Process",
                "Business Strategy",
                "Consulting",
                "content management",
                "Conversion",
                "Client",
                "Data Analysis",
                "E-Commerce",
                "senior management",
                "Financial",
                "financial analysis",
                "functional",
                "Google Analytics",
                "Government",
                "Hub",
                "IBM",
                "International Business",
                "investments",
                "IP",
                "Marketing plan",
                "market research",
                "Market Strategy",
                "marketing",
                "market",
                "MBA",
                "C#",
                "Excel",
                "Microsoft Office Suite",
                "Power Point",
                "Word",
                "Network",
                "Object Oriented Analysis and Design",
                "optimization",
                "policies",
                "process improvement",
                "Project Management",
                "proposals",
                "quality",
                "Requirement",
                "Research",
                "RFP",
                "Scrum",
                "SDLC",
                "Speech",
                "MS SQL",
                "Strategy",
                "Strategy Development",
                "Vendor Management",
                "Management",
                "Visio",
                "websites"
            ],
            "certifications": [],
            "projects": [
                {
                    "project_title": "Online E-commerce store",
                    "description": "Conceptualized and launched Online E-commerce store, developed Product Strategy and Roadmap, and produced Engineering, Financial and Marketing plan",
                    "start_date": "08/2014",
                    "end_date": "12/2015"
                },
                {
                    "project_title": "Commercialization of IP",
                    "description": "Developed Go-to-Market Strategy, Product Roadmap and proposed Business Model to launch CMU's Automatic Speech Recognition Technology and presented to Sand Hill Angel Investors",
                    "start_date": "08/2014",
                    "end_date": "12/2015"
                },
                {
                    "project_title": "Survivable Social Network on Chip",
                    "description": "Performed Object Oriented Analysis and Design along with the estimation, planning, development, measurement and tracking of the software project using the hybrid development approach",
                    "start_date": "08/2014",
                    "end_date": "12/2015"
                }
            ],
            "languages": [],
            "other": []
        }

# Sample job description JSON (a tax-consulting director role; most of its
# required certifications and responsibilities have no counterpart in the
# resume above)
jd_json = {
            "jd_id": 3906094741,
            "inferred_domain": "consulting",
            "title": "Director, Property Tax",
            "summary": "Director, Property Tax role at Kroll, focusing on tax consulting and valuation projects.",
            "required_experience_years": "7",
            "preferred_degrees": [
                "Accounting",
                "Economics",
                "Finance",
                "Management",
                "Real Estate"
            ],
            "required_skills": [
                "Management",
                "Sales"
            ],
            "optional_skills": [],
            "tools_and_technologies": [
                "Excel",
                "Word",
                "PowerPoint"
            ],
            "certifications": [
                "ASA",
                "CPA",
                "CFA",
                "MAI"
            ],
            "soft_skills": [
                "Leadership",
                "Client Relationship Management",
                "Analytical Skills",
                "Independence",
                "Teamwork",
                "Communication",
                "Diversity Awareness"
            ],
            "job_responsibilities": [
                "Client Research",
                "Data Analysis",
                "Presentation Development",
                "Valuation Techniques",
                "Tax Hearing Preparation",
                "Project Reporting",
                "Tax Projection Scenarios",
                "Business Solution Implementation",
                "Junior Staff Development",
                "Practice Growth"
            ],
            "job_location": "Atlanta, GA",
            "remote_option": ",",
            "employment_type": "full-time",
            "travel_requirements": "N/A",
            "physical_requirements": "N/A",
            "benefits": [],
            "company_information": "Kroll is a global firm providing services in governance, risk, and transparency.",
            "equal_opportunity_policy": "Kroll is committed to creating an inclusive work environment and is an equal opportunity employer.",
            "other": [
                {
                    "section_name": "Experience Level",
                    "content": "Director"
                }
            ]
        }

# Run LLM-based scoring (resume_id / jd_id are not passed as arguments here,
# even though both fixtures carry an id field)
llm_scores = score_with_llm(resume_json, jd_json)

print("=== LLM Scoring Output ===")
pprint(llm_scores)


## Combine Section Scores

In [None]:
# Per-section blend between the rule-based score and the LLM score, consumed
# by merge_scores as weights["rule"] and weights["llm"].
# Every pair below sums to 1.0. Sections with rule=0.0 / llm=1.0 take the LLM
# score as-is. Sections absent from this mapping fall back to 0.5 / 0.5 inside
# merge_scores.
merge_weights_config = {
    "skills": {"rule": 0.6, "llm": 0.4},
    "certifications": {"rule": 0.5, "llm": 0.5},
    "education": {"rule": 0.5, "llm": 0.5},
    "experience": {"rule": 0.5, "llm": 0.5},
    "tools": {"rule": 0.6, "llm": 0.4},
    "responsibilities": {"rule": 0.5, "llm": 0.5},
    "soft_skills": {"rule": 0.0, "llm": 1.0},
    "transferable_skills": {"rule": 0.0, "llm": 1.0},
    "leadership": {"rule": 0.0, "llm": 1.0},
    "grammar_cleanliness": {"rule": 0.0, "llm": 1.0}
}


In [None]:
from typing import Dict, Any

def _coerce_section_score(entry: Any) -> float:
    """Return the numeric score of a section entry, or 0.0 when it is unusable.

    Accepts the regular ``{"score": ..., "reason": ...}`` dict as well as a
    bare number/string, because LLM-produced JSON does not always follow the
    requested schema.
    """
    raw = entry.get("score", 0.0) if isinstance(entry, dict) else entry
    try:
        return float(raw)
    except (TypeError, ValueError):
        # None, non-numeric strings, nested containers, ...
        return 0.0


def merge_scores(rule_scores: Dict[str, Dict[str, Any]],
                 llm_scores: Dict[str, Dict[str, Any]],
                 weights_config: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, Any]]:
    """
    Merges rule-based and LLM-based scores using weighted averages and combines reasons.

    Args:
        rule_scores: section name -> {"score": number, "reason": str} from the rule engine.
        llm_scores: same shape, produced by the LLM. Anything that is not a
            dict (e.g. None) is treated as "no scores".
        weights_config: section name -> {"rule": float, "llm": float}. Sections
            missing from it are blended 0.5 / 0.5.

    Returns:
        section name -> {"score": weighted score rounded to 3 decimals,
        "reason": both reasons joined with their weights}, covering every
        section present in either input. A section missing from one side, or
        whose score is missing/non-numeric, contributes 0.0 for that side.

    Raises:
        KeyError: if a weights_config entry lacks the "rule" or "llm" key.
    """
    # Tolerate a failed/garbled scoring step instead of crashing on set()/.get().
    if not isinstance(rule_scores, dict):
        rule_scores = {}
    if not isinstance(llm_scores, dict):
        llm_scores = {}

    merged = {}
    all_sections = set(rule_scores) | set(llm_scores)

    # Sorted so the output key order is deterministic.
    for section in sorted(all_sections):
        rule = rule_scores.get(section, {})
        llm = llm_scores.get(section, {})

        rule_score = _coerce_section_score(rule)
        llm_score = _coerce_section_score(llm)
        rule_reason = rule.get("reason", "") if isinstance(rule, dict) else ""
        llm_reason = llm.get("reason", "") if isinstance(llm, dict) else ""

        weights = weights_config.get(section, {"rule": 0.5, "llm": 0.5})
        final_score = (rule_score * weights["rule"]) + (llm_score * weights["llm"])

        merged_reason = f"(Rule {weights['rule']:.1f}): {rule_reason} | (LLM {weights['llm']:.1f}): {llm_reason}"

        merged[section] = {
            "score": round(final_score, 3),
            "reason": merged_reason
        }

    return merged


In [None]:
def compute_total_ats_score(merged_scores: Dict[str, Dict[str, Any]],
                            section_weights: Dict[str, float]) -> Dict[str, Any]:
    """
    Collapse merged per-section scores into one weighted ATS score.

    Only the sections named in ``section_weights`` take part; a section that
    is missing from ``merged_scores`` counts as 0.0. The weighted sum is
    divided by the sum of the weights, so the weights need not add up to 1.

    Returns:
        {"final_ats_score": score rounded to 3 decimals,
         "explanation": human-readable "section: score × weight" breakdown}

    Raises:
        ValueError: if the weights sum to zero (including an empty mapping).
    """
    weight_total = sum(section_weights.values())
    if weight_total == 0:
        raise ValueError("Total section weights must be greater than zero.")

    accumulated = 0.0
    parts = []
    for name in section_weights:
        section_weight = section_weights[name]
        section_score = merged_scores.get(name, {}).get("score", 0.0)
        accumulated += section_score * section_weight
        parts.append(f"{name}: {section_score:.2f} × {section_weight:.2f}")

    joined_parts = " | ".join(parts)
    return {
        "final_ats_score": round(accumulated / weight_total, 3),
        "explanation": f"Weighted average across sections → {joined_parts}"
    }


In [None]:
# Relative importance of each section in the final ATS score, consumed by
# compute_total_ats_score. The values below add up to 1.0, but that function
# divides by the sum of the weights, so they act as relative weights.
# Sections not listed here do not contribute to the final score.
section_weights_config = {
    "skills": 0.15,
    "certifications": 0.10,
    "education": 0.10,
    "experience": 0.20,
    "tools": 0.10,
    "responsibilities": 0.15,
    "soft_skills": 0.05,
    "transferable_skills": 0.05,
    "leadership": 0.05,
    "grammar_cleanliness": 0.05
}


In [None]:
from pprint import pprint

# Smoke test for merge_scores + compute_total_ats_score with hand-written
# inputs. This cell rebinds the notebook globals rule_scores and llm_scores.
print("=== Testing: merge_scores and compute_total_ats_score ===")

# Sample rule-based scores (six sections only; the remaining four sections
# are LLM-only and have rule weight 0.0 in merge_weights_config)
rule_scores = {
    "skills": {"score": 0.6, "reason": "Rule: Matched some required skills."},
    "certifications": {"score": 0.4, "reason": "Rule: No direct certification match."},
    "education": {"score": 0.9, "reason": "Rule: Degree aligned well."},
    "experience": {"score": 1.0, "reason": "Rule: Resume years > JD years."},
    "tools": {"score": 0.5, "reason": "Rule: Partial tool match."},
    "responsibilities": {"score": 0.3, "reason": "Rule: Low overlap on tasks."}
}

# Sample LLM-based scores (all ten sections)
llm_scores = {
    "skills": {"score": 0.8, "reason": "LLM: Python and Docker match JD."},
    "certifications": {"score": 0.6, "reason": "LLM: AWS match, missing GCP."},
    "education": {"score": 0.7, "reason": "LLM: One degree matches preferred list."},
    "experience": {"score": 0.7, "reason": "LLM: Related roles, less domain alignment."},
    "tools": {"score": 0.4, "reason": "LLM: Excel mentioned, others missing."},
    "responsibilities": {"score": 0.2, "reason": "LLM: Few relevant duties matched."},
    "soft_skills": {"score": 0.5, "reason": "LLM: Communication and teamwork evident."},
    "transferable_skills": {"score": 0.4, "reason": "LLM: PM and cross-functional skills."},
    "leadership": {"score": 0.6, "reason": "LLM: Led global teams in resume."},
    "grammar_cleanliness": {"score": 0.9, "reason": "LLM: Clean formatting and language."}
}


# Merge section scores using the per-section rule/LLM blend
merged_scores = merge_scores(rule_scores, llm_scores, merge_weights_config)

# Print merged scores
print("\n=== Merged Scores ===")
pprint(merged_scores)

# Compute total ATS score as the weighted average over section_weights_config
final_result = compute_total_ats_score(merged_scores, section_weights_config)

# Print final ATS score
print("\n=== Final ATS Score ===")
pprint(final_result)


## Utilities to load semantic Resume–JD relevance scores and find matching pairs

In [None]:
import os
from typing import List, Optional, Tuple

# ✅ Updated: Load and filter relevant resume–JD pairs
def find_matching_pairs(
    relevance_json_path: str,
    match_labels: Optional[List[str]] = None
) -> List[Tuple[int, int]]:
    """
    Load the relevance map and return (resume_id, jd_id) tuples for selected match labels.

    Args:
        relevance_json_path: path of the JSON file read through load_json_file.
            Its "semantic_relevance_scores" key is expected to hold a list of
            entries with "resume_id", "jd_id" and "semantic_match_label".
        match_labels: labels to keep. None (the default) keeps
            "strong", "medium" and "weak"; an explicit empty list keeps nothing.

    Returns:
        One (resume_id, jd_id) tuple per kept entry, in file order. An empty
        list when the "semantic_relevance_scores" key is absent.

    Raises:
        KeyError: if a kept entry lacks "resume_id" or "jd_id".
    """
    # None sentinel instead of a list literal in the signature: a mutable
    # default is shared between calls.
    if match_labels is None:
        match_labels = ["strong", "medium", "weak"]

    data = load_json_file(relevance_json_path)
    results = data.get("semantic_relevance_scores", [])

    return [
        (item["resume_id"], item["jd_id"])
        for item in results
        if item.get("semantic_match_label") in match_labels
    ]


In [None]:
# Locate the semantic relevance output in the scoring directory and collect
# every (resume_id, jd_id) pair carrying one of the three match labels.
relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
matching_pairs = find_matching_pairs(relevance_map_file, match_labels=["strong", "medium", "weak"])

# Report the total and preview the first five pairs.
print(f"✅ Found {len(matching_pairs)} matching resume–JD pairs")
print(matching_pairs[:5])


In [None]:
def get_match_metadata(resume_id: str, jd_id: str, relevance_map: Dict[str, List[Dict]]) -> Dict:
    """
    Look up the semantic-relevance entry for one resume/JD pair.

    Ids are compared as strings, so int and str ids match each other. The
    first matching entry under "semantic_relevance_scores" wins.

    Returns:
        {"domain", "resume_jd_similarity", "semantic_match_label"} taken from
        the entry, defaulting to "", 0.0 and "weak" for missing fields.

    Raises:
        ValueError: if no entry matches the pair.
    """
    for record in relevance_map.get("semantic_relevance_scores", []):
        # Skip anything that is not the requested pair (resume id checked first).
        if str(record["resume_id"]) != str(resume_id) or str(record["jd_id"]) != str(jd_id):
            continue

        domain = record.get("resume_domain", "")
        similarity = record.get("resume_jd_similarity", 0.0)
        label = record.get("semantic_match_label", "weak")
        return {
            "domain": domain,
            "resume_jd_similarity": similarity,
            "semantic_match_label": label
        }

    raise ValueError(f"No semantic match metadata found for resume_id={resume_id}, jd_id={jd_id}")


## Phase 3: Scoring Loop

### Checkpoint Handling (JSON)

In [None]:
from datetime import datetime, timezone

def load_resume_checkpoint(path: str) -> int:
    """
    Read the resume index stored in a JSON checkpoint file.

    Returns the file's "last_index" value, or 0 when the file does not exist
    or has no "last_index" key.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            state = json.load(handle)
        return state.get("last_index", 0)
    return 0

def save_resume_checkpoint(path: str, index: int):
    """
    Persist the given index as "last_index" together with the current UTC
    time (ISO 8601) by handing the payload to save_json_output.
    """
    saved_at = datetime.now(timezone.utc).isoformat()
    save_json_output({"last_index": index, "timestamp": saved_at}, path)
   

### Scoring a Single Resume-JD Pair

In [None]:
from typing import List, Dict, Optional, Any
from pathlib import Path

def find_record_by_id(
    records: List[Dict[str, Any]],
    record_id: str,
    id_field: str = "record_id"
) -> Optional[Dict[str, Any]]:
    """
    Return the first record whose ``id_field`` value equals ``record_id``.

    Both sides are compared as strings, so 42 and "42" are the same id.
    Returns None when nothing matches.
    """
    for candidate in records:
        if str(candidate.get(id_field)) == str(record_id):
            return candidate
    return None


In [None]:
from pathlib import Path

def score_resume_vs_jd(
    resume_id: str,
    jd_id: str,
    resume_json_dir: str,
    jd_json_dir: str,
    relevance_map: Dict[str, List[Dict]],
    rule_weight: float = 0.5,
    llm_weight: float = 0.5
) -> Dict:
    """
    Score one resume against one job description and return the full result.

    Loads the parsed resume/JD records from disk, computes rule-based and
    LLM-based section scores, blends them with merge_weights_config, and
    reduces them to a final ATS score with section_weights_config.

    Args:
        resume_id: id used both in the file pattern "resumes_{resume_id}_*.json"
            and as the "record_id" looked up inside that file.
        jd_id: same for "jds_{jd_id}_*.json".
        resume_json_dir: directory searched for the resume file.
        jd_json_dir: directory searched for the JD file.
        relevance_map: semantic relevance data passed to get_match_metadata.
        rule_weight: currently unused — the blend comes from merge_weights_config.
        llm_weight: currently unused — see rule_weight.

    Returns:
        Dict with resume_id, jd_id, domain, resume_jd_similarity,
        semantic_match_label, section_scores, match_quality
        ("strong" >= 0.75, "medium" >= 0.5, otherwise "weak") and final_ats_score.

    Raises:
        FileNotFoundError: no file matches the resume or JD pattern.
        ValueError: the record is not in the file, or the pair has no entry
            in the relevance map.
    """
    # Load resume + JD parsed JSONs.
    # Sorted so the same file is chosen on every run when several match;
    # glob itself gives no ordering guarantee.
    resume_files = sorted(Path(resume_json_dir).glob(f"resumes_{resume_id}_*.json"))
    jd_files = sorted(Path(jd_json_dir).glob(f"jds_{jd_id}_*.json"))

    if not resume_files:
        raise FileNotFoundError(f"No resume file found for resume_id: {resume_id}")
    if not jd_files:
        raise FileNotFoundError(f"No JD file found for jd_id: {jd_id}")

    resume_records = load_json_file(str(resume_files[0]))
    jd_records = load_json_file(str(jd_files[0]))

    resume_record = find_record_by_id(resume_records, resume_id)
    jd_record = find_record_by_id(jd_records, jd_id)

    if not resume_record:
        raise ValueError(f"Resume record_id {resume_id} not found in file {resume_files[0].name}")
    if not jd_record:
        raise ValueError(f"JD record_id {jd_id} not found in file {jd_files[0].name}")

    resume_data = resume_record.get("output_json", {})
    jd_data = jd_record.get("output_json", {})

    # Resolve the semantic-map metadata BEFORE scoring: this lookup raises
    # ValueError for unknown pairs, and failing here avoids paying for an LLM
    # call whose result would be thrown away.
    match_meta = get_match_metadata(resume_id, jd_id, relevance_map)

    # Rule-based scores
    rule_scores = compute_all_rule_scores(resume_data, jd_data)

    # LLM-based scores
    llm_scores = score_with_llm(resume_data, jd_data, resume_id=resume_id, jd_id=jd_id)

    # Merge section-wise scores
    section_scores = merge_scores(rule_scores, llm_scores, merge_weights_config)

    # Compute final ATS score
    final_score_result = compute_total_ats_score(section_scores, section_weights_config)
    final_score = final_score_result["final_ats_score"]

    # Derive match quality from final ATS score
    if final_score >= 0.75:
        match_quality = "strong"
    elif final_score >= 0.5:
        match_quality = "medium"
    else:
        match_quality = "weak"

    return {
        "resume_id": resume_id,
        "jd_id": jd_id,
        "domain": match_meta["domain"],
        "resume_jd_similarity": match_meta["resume_jd_similarity"],
        "semantic_match_label": match_meta["semantic_match_label"],  # Relevance from Phase 1
        "section_scores": section_scores,
        "match_quality": match_quality,  # ATS score-based match
        "final_ats_score": final_score
    }


### Test Single Resume-JD Pair

In [None]:
from pprint import pprint
import os

# End-to-end check of score_resume_vs_jd on the first "strong" pair found in
# the relevance map; the result is printed and written to a test JSON file.

# === Setup file paths ===
relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
resume_json_dir = Config.JSON_OUTPUT_NORMALIZED_RESUME
jd_json_dir = Config.JSON_OUTPUT_NORMALIZED_JD

# === Load relevance map ===
# Kept as a notebook global: later cells pass this same relevance_map on.
relevance_map = load_json_file(relevance_map_file)

# === Pick a strong match from the top for testing ===
# Takes the first entry labelled "strong", in file order.
sample_pair = None
for item in relevance_map["semantic_relevance_scores"]:
    if item["semantic_match_label"] == "strong":
        sample_pair = (item["resume_id"], item["jd_id"])
        break

if not sample_pair:
    raise ValueError("No strong match found in relevance map to test.")

resume_id, jd_id = sample_pair
print(f"=== Testing score_resume_vs_jd() for resume_id={resume_id}, jd_id={jd_id} ===")

# === Run scoring ===
# rule_weight / llm_weight are left at their defaults.
result = score_resume_vs_jd(
    resume_id=resume_id,
    jd_id=jd_id,
    resume_json_dir=resume_json_dir,
    jd_json_dir=jd_json_dir,
    relevance_map=relevance_map
)

# === Show result ===
pprint(result)


# Persist the single-pair result next to the other scoring outputs.
test_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'test_phase3_scoring.json')

save_json_output(result, test_file)


### Main Modular Function

In [None]:
from tqdm import tqdm
from typing import List, Dict
import os
import time

def score_and_save_in_batches(
    resumes: List[Dict],
    jd_lookup: Dict[str, Dict],
    relevance_map: Dict[str, List[Dict]],
    output_dir: str = Config.JSON_OUTPUT_SCORING_DIR,
    save_every: int = 5,
    limit: int = 20,
    relevance_threshold: float = 0.4,
    rule_weight: float = 0.5,
    llm_weight: float = 0.5,
    resume_from_checkpoint: bool = True
):
    """
    Score a window of resumes against their relevant JDs, saving as it goes.

    Processes resumes[start:start + limit], where start comes from the
    checkpoint file in ``output_dir`` (or 0 when ``resume_from_checkpoint`` is
    False). Each resume is scored against every JD returned by
    get_relevant_jds; pair-level exceptions are recorded as failures instead
    of aborting the run.

    Args:
        resumes: resume dicts; "resume_id" is read from each, with
            "resume_{idx}" as the fallback.
        jd_lookup: JD data handed to get_relevant_jds.
        relevance_map: relevance data handed to get_relevant_jds and used to
            look up the per-pair relevance score.
        output_dir: directory for result, failure and checkpoint files
            (created if missing).
        save_every: write partial files and the checkpoint after every
            ``save_every`` processed resumes.
        limit: maximum number of resumes handled in this call.
        relevance_threshold: threshold forwarded to get_relevant_jds.
        rule_weight: forwarded to score_resume_jd_pair.
        llm_weight: forwarded to score_resume_jd_pair.
        resume_from_checkpoint: continue from the stored index when True.

    Side effects:
        Writes scored_part_*/failed_part_* files at each save point,
        scored_final_*/failed_final_* at the end, and updates checkpoint.json.
        The result lists are never cleared, so every file holds all results
        collected since ``start``.
    """
    os.makedirs(output_dir, exist_ok=True)
    checkpoint_file = os.path.join(output_dir, "checkpoint.json")

    # Determine starting index
    start = load_resume_checkpoint(checkpoint_file) if resume_from_checkpoint else 0
    end = min(start + limit, len(resumes))

    successes, failures = [], []
    timestamp = int(time.time())

    def _write_results(stage: str, upper: int) -> None:
        """Save the accumulated successes/failures under a stage-specific name."""
        suffix = f"{stage}_{start}_{upper}_{timestamp}.json"
        if successes:
            save_json_output(successes, os.path.join(output_dir, f"scored_{suffix}"))
        if failures:
            save_json_output(failures, os.path.join(output_dir, f"failed_{suffix}"))

    for idx in tqdm(range(start, end), desc="Scoring resumes"):
        resume = resumes[idx]
        resume_id = resume.get("resume_id", f"resume_{idx}")

        relevant_jds = get_relevant_jds(resume_id, relevance_map, jd_lookup, threshold=relevance_threshold)
        if not relevant_jds:
            print(f"⚠️ No relevant JDs found for {resume_id}")

        # No `continue` above: a resume without relevant JDs must still reach
        # the periodic save below, otherwise a checkpoint boundary is skipped.
        for jd in relevant_jds or []:
            jd_id = jd.get("jd_id", "")
            try:
                # NOTE(review): this indexes relevance_map by resume_id and reads
                # a "score" field, whereas get_match_metadata reads a list under
                # "semantic_relevance_scores" — confirm which shape callers pass.
                # With the latter shape the KeyError lands in `failures`.
                relevance_score = next((m["score"] for m in relevance_map[resume_id] if m["jd_id"] == jd_id), 0.0)
                result = score_resume_jd_pair(
                    resume, jd, resume_id, jd_id,
                    rule_weight, llm_weight,
                    relevance_score
                )
                successes.append(result)
            except Exception as e:
                # Best effort by design: record the failing pair and keep going.
                failures.append({
                    "resume_id": resume_id,
                    "jd_id": jd_id,
                    "error": str(e)
                })

        if (idx - start + 1) % save_every == 0:
            _write_results("part", idx)
            save_resume_checkpoint(checkpoint_file, idx + 1)

    # Final save
    _write_results("final", end)
    save_resume_checkpoint(checkpoint_file, end)


## Execute Scoring

In [None]:

# Index the job descriptions by their id for the batch scorer.
# NOTE(review): `jds` and `resumes` are not defined in this cell — presumably
# loaded by an earlier cell; confirm before running.
jd_lookup = {jd["jd_id"]: jd for jd in jds}

# With limit=1 at most one resume is processed per run, so the save_every=5
# periodic save never triggers; output is written by the final save only.
score_and_save_in_batches(
    resumes=resumes,
    jd_lookup=jd_lookup,
    relevance_map=relevance_map,
    output_dir=Config.JSON_OUTPUT_SCORING_DIR,
    save_every=5,
    limit=1,
    relevance_threshold=0.45,
    rule_weight=0.5,
    llm_weight=0.5,
    resume_from_checkpoint=True  # set True to resume, False to start fresh
)
