# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [None]:
import subprocess

def is_running_in_colab():
    """Report whether the `google.colab` package can be imported.

    Returns:
        bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Query `nvidia-smi` for the free memory of the first listed GPU.

    The first line of the `memory.free` query output is parsed as an integer
    (treated as megabytes) and divided by 1024.

    Returns:
        float: The converted value, or 0.0 if the command or parsing fails
        for any reason.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        raw = subprocess.check_output(query, encoding="utf-8")
        first_line = raw.strip().split("\n")[0]
        return int(first_line) / 1024
    except Exception:
        # Best-effort probe: any failure is reported as "no memory available".
        return 0.0


## Install Dependencies

In [None]:
# Install the packages used by the notebook. The Colab branch uses `!pip`
# shell escapes; the other branch uses the `%pip` magic.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
    !pip install sentence-transformers scikit-learn

else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Alternative torch installs kept for reference (pinned cu121 / latest cu121):
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Active choice: pre-release torch wheels from the nightly cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # On this branch bitsandbytes is installed separately, with -U.
    %pip install -U bitsandbytes
    %pip install regex json5
    %pip install sentence-transformers scikit-learn



## Log in to Hugging Face

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it, in this order:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. the 'HF_TOKEN' Colab secret (when running in Colab),
#   3. an interactive, non-echoing prompt (everywhere else).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass keeps the secret out of the notebook output, unlike input().
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# An empty string is normalized to None so login() is never handed a blank token.
HF_TOKEN = HF_TOKEN or None
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure `~/.kaggle/kaggle.json` exists, uploading it through Colab if needed.

    If the file already exists nothing is changed. Otherwise the Colab upload
    widget is used; outside Colab (where `google.colab` cannot be imported) a
    hint is printed instead of raising, so the cell does not abort the notebook.

    Returns:
        None
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError:
        # The upload widget is Colab-only; tell the user where to put the file.
        print(f"⚠️ Kaggle credentials not found. Place your kaggle.json at {kaggle_path}")
        return

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        print("⚠️ No file uploaded; Kaggle credentials were not set up.")
        return

    # Move exactly one file: prefer the one named kaggle.json, otherwise the
    # first upload, instead of moving every upload onto the same destination.
    filename = "kaggle.json" if "kaggle.json" in uploaded else next(iter(uploaded))
    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(filename, kaggle_path)
    os.chmod(kaggle_path, 0o600)  # owner read/write only
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Google Drive is mounted at /content/drive only when running in Colab.
if is_running_in_colab():
    from google.colab import drive as colab_drive

    colab_drive.mount('/content/drive')

## Load Nous-Hermes-2-Mistral-7B-DPO with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    When CUDA is available and device 0 reports less than 24 GB of total
    memory, the model is loaded with a 4-bit NF4 BitsAndBytes configuration;
    otherwise it is loaded in float16 without quantization.

    Args:
        model_name: Model identifier passed to `from_pretrained`.
        hf_token: Token forwarded to the tokenizer and model loaders.

    Returns:
        A `pipeline("text-generation", ...)` object with `batch_size=1`.
    """
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # NOTE: this is the device's *total* memory, not the currently free amount.
        gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    else:
        gpu_mem_gb = 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {gpu_mem_gb:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and gpu_mem_gb < 24

    # Quantization settings exist only on the 4-bit path.
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # Tokenizer: the EOS token doubles as the padding token.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # ✅ Fix warning about pad_token

    # An explicit dtype is passed only when no quantization config is used.
    model_dtype = None if quant_config else torch.float16
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        torch_dtype=model_dtype,
        trust_remote_code=True,
        token=hf_token
    )

    first_param_device = next(model.parameters()).device
    print(f"✅ Model loaded on {first_param_device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the shared text-generation pipeline once; `llm_based_scoring` reads
# this module-level `llm_pipeline` directly.
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


# Global utilities

### Utility to merge JSON files

In [None]:
import json
import shutil
from pathlib import Path

def merge_json_files(
    source_dir: Path,
    output_file: Path,
    pattern: str,
    merged_dir: Path
):
    """Append the list contents of matching JSON files to `output_file`.

    Files in `source_dir` matching `pattern` are processed in sorted order.
    Only files that parse as a JSON list are merged and then moved into
    `merged_dir`; unparsable or non-list files are reported and left in place
    so nothing is archived without having been merged. `output_file` itself
    is never treated as an input, even if it matches `pattern`.

    Args:
        source_dir: Directory scanned for input files (created if missing).
        output_file: Combined JSON list; existing content is extended.
        pattern: Glob pattern evaluated inside `source_dir`.
        merged_dir: Destination for successfully merged files (created if missing).
    """
    source_dir.mkdir(parents=True, exist_ok=True)
    merged_dir.mkdir(parents=True, exist_ok=True)

    merged_data = []

    # Load existing output if it exists
    if output_file.exists():
        with open(output_file, "r", encoding="utf-8") as f:
            try:
                merged_data = json.load(f)
            except json.JSONDecodeError:
                print(f"⚠️ Could not decode {output_file}, starting from scratch.")
        if not isinstance(merged_data, list):
            # Cannot extend a non-list; leave every file untouched.
            print(f"⚠️ Existing {output_file} is not a JSON list; nothing merged.")
            return

    # Identify matching files, excluding the output file so it is never merged into itself
    output_resolved = output_file.resolve()
    files_to_merge = sorted(
        candidate
        for candidate in source_dir.glob(pattern)
        if candidate.is_file() and candidate.resolve() != output_resolved
    )

    for file_path in files_to_merge:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            print(f"⚠️ Failed to parse {file_path.name}: {e}")
            continue

        if not isinstance(data, list):
            print(f"⚠️ Skipping {file_path.name}: not a list.")
            continue

        merged_data.extend(data)

        # Move to merged folder only after the content was actually merged
        shutil.move(str(file_path), str(merged_dir / file_path.name))
        print(f"✅ Merged and moved: {file_path.name}")

    # Write combined output
    if merged_data:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2)
        print(f"💾 Saved to: {output_file}")
    else:
        print("📭 No valid data to merge.")

# === Usage ===



### Utility to save json to a folder

In [None]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize `data` as UTF-8 JSON to `output_path`, creating parent folders.

    Args:
        data: JSON-serializable object.
        output_path: Destination file; may be a bare filename in the current directory.
        indent: Indentation passed to the JSON encoder.
        overwrite: When False, an existing file is never replaced.

    Raises:
        FileExistsError: If the file exists and `overwrite` is False.
        TypeError: If `data` is not JSON-serializable (any existing file is
            left untouched in that case).
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # dirname is '' for a bare filename, and os.makedirs('') would raise.
        os.makedirs(output_dir, exist_ok=True)

    if not overwrite and os.path.exists(output_path):
        raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    # Serialize before opening the file so a failure cannot truncate old content.
    payload = json.dumps(data, indent=indent, ensure_ascii=False)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Saved output to {output_path}")


### Utility to load a JSON file

In [None]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    """Read and parse the UTF-8 JSON file at `file_path`.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If `file_path` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.loads(handle.read())
    raise FileNotFoundError(f"File not found: {file_path}")

### Text truncation utility

In [None]:
def truncate_text(text: str, max_chars=1500) -> str:
    """Strip surrounding whitespace from `text`, then slice it to `[:max_chars]`."""
    stripped = text.strip()
    return stripped[:max_chars]

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Directory names and flags shared by the notebook cells."""

    # Folder for downloaded datasets.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Root folder for JSON outputs; the two folders below live inside it.
    JSON_OUTPUT_DIR = "json_outputs_run1"
    JSON_OUTPUT_NORMALIZED_DIR = JSON_OUTPUT_DIR + "/normalized"
    JSON_OUTPUT_SCORING_DIR = JSON_OUTPUT_DIR + "/scoring"
    AUTO_CLEANUP = True


# Phase 3 Rubric-Based Scoring Engine

## Rule-Based Scoring Functions

In [None]:
from typing import Dict


def rule_based_scoring(resume: Dict, jd: Dict) -> Dict[str, Dict]:
    """
    Rule-based scoring for each section. Each section returns:
    {
        "score": float (0.0 to 1.0),
        "details": str
    }

    Skills and certifications are compared case-insensitively after trimming
    whitespace (the same way degrees are compared in lowercase), and matched
    items are listed in sorted order so the details string is deterministic.
    Missing, None, or malformed fields are treated as empty instead of raising.

    Args:
        resume: Normalized resume dict (reads "skills", "total_experience_years",
            "education", "certifications", "projects").
        jd: Normalized job description dict (reads "required_skills",
            "required_experience_years", "preferred_degrees", "certifications").

    Returns:
        Dict with keys "skills_match", "experience_alignment",
        "education_alignment" and "certs_projects".
    """

    def _label(item) -> str:
        # Entries may be plain strings or {"name": ...} dicts.
        if isinstance(item, dict):
            item = item.get("name")
        return "" if item is None else str(item).strip()

    def _index(items) -> Dict[str, str]:
        # Case-folded key -> first-seen display label; blank entries are dropped.
        index: Dict[str, str] = {}
        for item in items or []:
            label = _label(item)
            if label:
                index.setdefault(label.casefold(), label)
        return index

    def _overlap(resume_items, jd_items):
        # Fraction of JD items found in the resume; 1.0 when the JD asks for nothing.
        wanted = _index(jd_items)
        have = _index(resume_items)
        matched = sorted(label for key, label in wanted.items() if key in have)
        score = len(matched) / len(wanted) if wanted else 1.0
        return score, matched

    def _years(value) -> float:
        # None, non-numeric or negative values count as zero years.
        try:
            return max(float(value), 0.0)
        except (TypeError, ValueError):
            return 0.0

    def skills_match():
        score, matched = _overlap(resume.get("skills"), jd.get("required_skills"))
        return {"score": round(score, 2), "details": f"Matched skills: {matched}"}

    def experience_alignment():
        r = _years(resume.get("total_experience_years", 0.0))
        j = _years(jd.get("required_experience_years", 0.0))
        score = min(r / j, 1.0) if j else 1.0
        return {"score": round(score, 2), "details": f"{r} years vs required {j}"}

    def education_alignment():
        resume_degrees = [
            str(edu.get("degree") or "").lower()
            for edu in resume.get("education") or []
            if isinstance(edu, dict)
        ]
        jd_degrees = [str(deg).lower() for deg in jd.get("preferred_degrees") or [] if deg]
        # A JD degree matches when it is a substring of any resume degree.
        match = [deg for deg in jd_degrees if any(deg in r for r in resume_degrees)]
        score = len(match) / len(jd_degrees) if jd_degrees else 1.0
        return {"score": round(score, 2), "details": f"Matched degrees: {match}"}

    def certs_projects():
        cert_score, matched = _overlap(resume.get("certifications"), jd.get("certifications"))
        has_projects = bool(resume.get("projects"))

        # Having any project adds a flat 0.1 bonus, capped at 1.0 overall.
        project_bonus = 0.1 if has_projects else 0.0
        final_score = min(cert_score + project_bonus, 1.0)

        details = f"Certs matched: {matched}; Projects: {'Yes' if has_projects else 'No'}"
        return {"score": round(final_score, 2), "details": details}

    return {
        "skills_match": skills_match(),
        "experience_alignment": experience_alignment(),
        "education_alignment": education_alignment(),
        "certs_projects": certs_projects()
    }


## LLM-Based Scoring Functions (Structured Prompt)

In [None]:
def flatten_text_list(field):
    """Collapse `field` into one string.

    A string is returned unchanged; a list is reduced to its string items
    joined by single spaces (non-string items are dropped); anything else is
    converted with `str()`.
    """
    if isinstance(field, str):
        return field
    if not isinstance(field, list):
        return str(field)
    text_items = [str(item) for item in field if isinstance(item, str)]
    return " ".join(text_items)

def format_experience_descriptions(resume: Dict) -> str:
    """Join the flattened "description" of every entry in resume["experience"] with "; "."""
    return "; ".join(
        flatten_text_list(entry.get("description", ""))
        for entry in resume.get("experience", [])
    )

def format_projects(resume: Dict) -> str:
    """Join the flattened entries of resume["projects"] with "; "."""
    flattened = []
    for project in resume.get("projects", []):
        flattened.append(flatten_text_list(project))
    return "; ".join(flattened)

def format_education(resume: Dict) -> str:
    """Render each education entry as "<degree> from <institution>", joined by "; "."""
    rendered = []
    for edu in resume.get("education", []):
        degree = edu.get('degree', '')
        institution = edu.get('institution', '')
        rendered.append(f"{degree} from {institution}")
    return "; ".join(rendered)

def format_certifications(resume: Dict) -> str:
    """Join the flattened entries of resume["certifications"] with ", "."""
    flattened = [flatten_text_list(cert) for cert in resume.get("certifications", [])]
    return ", ".join(flattened)

def create_llm_prompt(resume: Dict, jd: Dict) -> str:
    resume_summary = f"""
Resume:
- Title: {resume.get("basics", {}).get("current_title", "")}
- Skills: {', '.join(resume.get("skills", []))}
- Certifications: {format_certifications(resume)}
- Projects: {format_projects(resume)}
- Experience: {format_experience_descriptions(resume)}
- Education: {format_education(resume)}
"""

    jd_summary = f"""
Job Description:
- Title: {jd.get("title", "")}
- Required Skills: {', '.join(jd.get("required_skills", []))}
- Required Certifications: {', '.join(jd.get("certifications", []))}
- Preferred Degrees: {', '.join(jd.get("preferred_degrees", []))}
- Soft Skills Required: {', '.join(jd.get("soft_skills", []))}
"""

    return f"""
You are an ATS scoring expert.

Evaluate the following resume against the job description.

Respond ONLY with valid JSON using this exact structure:

{{
  "skills_match": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "experience_alignment": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "education_alignment": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "certs_projects": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "soft_skills": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "transferable_skills": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "grammar_and_cleanliness": {{
    "score": float between 0 and 1,
    "details": "reasoning"
  }},
  "leadership": {{
    "score": float between 0 and 1,
    "details": "only if leadership inferred or requested by JD"
  }}
}}

{resume_summary}
{jd_summary}

Assistant:"""


In [None]:
def extract_last_json_block_with_regex(text: str) -> Dict:
    """Return the last brace-delimited block in `text` that parses and has "skills_match".

    Curly quotes are first replaced by straight quotes. Balanced `{...}`
    blocks are found with a recursive pattern and tried from last to first
    with the lenient `json5` parser.

    Raises:
        ValueError: If no block parses and contains the "skills_match" key.
    """
    import regex
    import json5

    straight_quotes = str.maketrans({"“": "\"", "”": "\"", "‘": "'", "’": "'"})
    normalized = text.translate(straight_quotes)
    candidates = regex.findall(r"\{(?:[^{}]|(?R))*\}", normalized, flags=regex.DOTALL)

    for candidate in candidates[::-1]:
        try:
            parsed = json5.loads(candidate)
            has_scores = "skills_match" in parsed
        except Exception:
            # Unparsable candidate: fall through to the previous block.
            continue
        if has_scores:
            return parsed
    raise ValueError("No valid JSON block found.")


In [None]:
def llm_based_scoring(resume, jd, resume_id="resume", jd_id="jd", retries=2):
    """Score a resume against a JD with the LLM, retrying on failure.

    The prompt is generated once. Each attempt runs `llm_pipeline` and parses
    the last valid JSON block of the completion. If every attempt fails, a
    neutral fallback (score 0.5 for each section) is returned.

    Args:
        resume: Resume dict passed to `create_llm_prompt`.
        jd: Job description dict passed to `create_llm_prompt`.
        resume_id: Identifier used only in log messages.
        jd_id: Identifier used only in log messages.
        retries: Number of extra attempts after the first one.

    Returns:
        Dict mapping section names to {"score": ..., "details": ...}.
    """
    prompt = create_llm_prompt(resume, jd)

    for attempt in range(retries + 1):
        # Reset per attempt so the preview below is never unbound or stale
        # when the pipeline call itself raises.
        response = ""
        try:
            # return_full_text=False keeps the prompt out of the parsed text,
            # so the preview shows the model's output rather than the prompt.
            response = llm_pipeline(
                prompt, max_new_tokens=512, return_full_text=False
            )[0]['generated_text']
            parsed = extract_last_json_block_with_regex(response)
            return parsed
        except Exception as e:
            print(f"⚠️ LLM attempt {attempt + 1} failed for {resume_id} x {jd_id}: {e}")
            print("🧪 Raw output preview:\n", response[:512])

    # Fallback structure
    return {
        section: {"score": 0.5, "details": "LLM fallback"}
        for section in [
            "skills_match", "experience_alignment", "education_alignment", "certs_projects",
            "soft_skills", "transferable_skills", "grammar_and_cleanliness", "leadership"
        ]
    }


## Combine Section Scores

In [None]:
def merge_section_scores(
    rule_scores: Dict[str, Dict],
    llm_scores: Dict[str, Dict],
    rule_weight: float = 0.5,
    llm_weight: float = 0.5
) -> Dict:
    """
    Merge rule-based and LLM-based section scores using weighted average.
    Each section includes:
    - score (merged)
    - details: { "rule": ..., "llm": ... }

    Weights that do not sum to 1 are normalized so the merged score stays a
    true weighted average. Scores are coerced to float and clamped to
    [0, 1]; entries that are not dicts or whose score is not numeric are
    treated as missing. Sections keep a stable order: rule sections first,
    then LLM-only sections. Empty input yields a final score of 0.0.

    Returns:
    {
      "final_section_scores": {
         section_name: {
            "score": float,
            "details": { "rule": str, "llm": str }
         }
      },
      "final_ats_score": float,
      "weights_used": { section: { "rule": float, "llm": float } }
    }
    """

    def _split(entry):
        # -> (score or None, details or None) for one section entry.
        if not isinstance(entry, dict):
            return None, None
        details = entry.get("details")
        try:
            score = float(entry.get("score"))
        except (TypeError, ValueError):
            return None, details
        if score != score:  # NaN never compares equal to itself
            return None, details
        return min(max(score, 0.0), 1.0), details

    rule_scores = rule_scores or {}
    llm_scores = llm_scores or {}

    # Normalize only when needed so the common 0.5/0.5 case is left bit-exact.
    total_weight = rule_weight + llm_weight
    if total_weight > 0 and abs(total_weight - 1.0) > 1e-9:
        rule_weight, llm_weight = rule_weight / total_weight, llm_weight / total_weight

    merged_scores = {}
    section_weights_used = {}
    # dict.fromkeys de-duplicates while preserving first-seen order.
    all_sections = dict.fromkeys([*rule_scores, *llm_scores])

    for section in all_sections:
        rule_score, rule_details = _split(rule_scores.get(section))
        llm_score, llm_details = _split(llm_scores.get(section))

        if rule_score is not None and llm_score is not None:
            score = (rule_score * rule_weight) + (llm_score * llm_weight)
            weight_info = {"rule": rule_weight, "llm": llm_weight}
        elif rule_score is not None:
            score = rule_score
            weight_info = {"rule": 1.0, "llm": 0.0}
        elif llm_score is not None:
            score = llm_score
            weight_info = {"rule": 0.0, "llm": 1.0}
        else:
            score = 0.0
            weight_info = {"rule": 0.0, "llm": 0.0}

        merged_scores[section] = {
            "score": round(score, 4),
            "details": {
                "rule": rule_details or "N/A",
                "llm": llm_details or "N/A"
            }
        }
        section_weights_used[section] = weight_info

    if merged_scores:
        final_ats_score = round(
            sum(sec["score"] for sec in merged_scores.values()) / len(merged_scores),
            4
        )
    else:
        # Nothing to average; avoid dividing by zero.
        final_ats_score = 0.0

    return {
        "final_section_scores": merged_scores,
        "final_ats_score": final_ats_score,
        "weights_used": section_weights_used
    }


## Test phase 3 - scoring logic

In [None]:
# 📂 Mini Test Resumes
# Two hand-written resumes used to exercise the scoring functions. Note that
# experience entries name the role under "job_title".
test_resumes = [
    {
        "resume_id": "61e6bdda-548c-4d24-87f3-5d97fdef032b",
        "basics": {
            "name": "Alice Smith",
            "email": "alice@example.com",
            "phone": "123-456-7890",
            "location": "New York, NY",
            "current_title": "Software Engineer",
            "linkedin_url": ""
        },
        "education": [
            {"degree": "B.Sc. Computer Science", "field": "Computer Science", "institution": "NYU", "year": "2018", "gpa": "3.7"}
        ],
        "experience": [
            {"job_title": "Software Developer", "company": "ABC Corp", "start_date": "06/2018", "end_date": "08/2021", "duration_in_months": 38, "description": "Developed web applications."}
        ],
        "skills": ["Python", "Django", "SQL"],
        "certifications": ["AWS Certified Developer"],
        "projects": ["E-commerce platform"],
        "languages": ["English"],
        "total_experience_years": 3.2
    },
    {
        "resume_id": "f14f29c5-8ed9-493a-975d-c210655ff0aa",
        "basics": {
            "name": "Bob Johnson",
            "email": "bob@example.com",
            "phone": "987-654-3210",
            "location": "San Francisco, CA",
            "current_title": "Data Analyst",
            "linkedin_url": ""
        },
        "education": [
            {"degree": "B.A. Statistics", "field": "Statistics", "institution": "UCLA", "year": "2017", "gpa": "3.5"}
        ],
        "experience": [
            {"job_title": "Data Analyst", "company": "XYZ Inc", "start_date": "01/2018", "end_date": "12/2020", "duration_in_months": 36, "description": "Analyzed data trends."}
        ],
        "skills": ["SQL", "Tableau", "Python"],
        "certifications": [],
        "projects": ["Sales analytics dashboard"],
        "languages": ["English"],
        "total_experience_years": 3.0
    }
]

# 📂 Mini Test JDs
# Two hand-written job descriptions paired with the test resumes: a backend
# engineering role and a data analyst role.
test_jds = [
    {
        "jd_id": "9a62f845-94f2-40fe-a63b-e6f5cbd765c5",
        "title": "Backend Engineer",
        "summary": "Looking for a backend engineer with 3+ years experience in Python and SQL. AWS certification preferred.",
        "required_experience_years": 3.0,
        "preferred_degrees": ["B.Sc. Computer Science"],
        "required_skills": ["Python", "SQL"],
        "optional_skills": ["Django"],
        "certifications": ["AWS Certified Developer"],
        "soft_skills": ["Teamwork", "Communication"],
        "job_location": "New York, NY",
        "remote_option": True,
        "employment_type": "Full-time",
        "inferred_domain": "engineering"
    },
    {
        "jd_id": "c890e8d6-9f04-429b-9a27-c4f5fcb59ce5",
        "title": "Business Data Analyst",
        "summary": "Seeking a Data Analyst with 2+ years experience in SQL, Excel, and data visualization tools.",
        "required_experience_years": 2.0,
        "preferred_degrees": ["B.A. Statistics"],
        "required_skills": ["SQL", "Excel"],
        "optional_skills": ["Tableau"],
        "certifications": [],
        "soft_skills": ["Analytical thinking", "Attention to detail"],
        "job_location": "San Francisco, CA",
        "remote_option": False,
        "employment_type": "Full-time",
        "inferred_domain": "technology"
    }
]


#### Scoring Loop (Test Batch)

In [None]:
from tqdm import tqdm
from datetime import datetime
# Replace with your test resumes and JDs
#test_resumes = resumes[:2]  # or load from a separate test dataset
#test_jds = jds[:2]

# Score every test resume against every test JD with both engines and
# collect one record per pair.
test_results = []

for resume_idx, resume in tqdm(enumerate(test_resumes), desc="Scoring Test Batch", total=len(test_resumes)):
    resume_id = resume.get("resume_id", f"resume_{resume_idx}")

    for jd_idx, jd in enumerate(test_jds):
        jd_id = jd.get("jd_id", f"jd_{jd_idx}")

        # Rule-based and LLM-based engines run independently on the same pair.
        rule_scores = rule_based_scoring(resume, jd)
        llm_scores = llm_based_scoring(resume, jd, resume_id, jd_id)

        # Equal-weight merge yields per-section scores and the final ATS score.
        merged_result = merge_section_scores(rule_scores, llm_scores, rule_weight=0.5, llm_weight=0.5)

        test_results.append({
            "resume_id": resume_id,
            "job_id": jd_id,
            "rule_based": rule_scores,
            "llm_based": llm_scores,
            **{
                key: merged_result[key]
                for key in ("final_section_scores", "final_ats_score", "weights_used")
            },
            "timestamp": datetime.now().isoformat(),
            "model_used": "Nous-Hermes-2-Mistral-7B-DPO"
        })


#### Save Mini Test Output

In [None]:
# 💾 Save Mini Test Output
import os
# The test batch results are written into the scoring output folder.
test_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'test_phase3_hybrid_parallel_scoring.json')

save_json_output(test_results, test_file)


## Embedding-Based Relevance Generator

### Text Construction Functions

In [None]:
def resume_to_text(resume: Dict) -> str:
    """Flatten a resume dict into a labelled multi-line text block for embedding.

    Experience entries read the role from "job_title" (the key used by the
    resume records in this notebook), falling back to "title". The date range
    is included only when at least one date is present. Missing or None
    sections are rendered as empty values.

    Args:
        resume: Resume dict; reads basics, skills, certifications, education,
            experience, projects and languages.

    Returns:
        Text with one "Label: value" line per section, stripped of
        surrounding whitespace.
    """
    def safe_join(items):
        return ", ".join(str(i) for i in items if i)

    def extract_certifications(cert_list):
        # Certifications may be plain strings or {"name": ...} dicts.
        return [
            c.get("name", "") if isinstance(c, dict) else str(c)
            for c in cert_list
        ]

    def extract_education(edus):
        return [
            f"{e.get('degree', '')} from {e.get('institution', '')} ({e.get('year', '')})"
            for e in edus if isinstance(e, dict)
        ]

    def extract_experience(exps):
        results = []
        for exp in exps:
            if not isinstance(exp, dict):
                continue
            start = exp.get("start_date", "")
            end = exp.get("end_date", "")
            parts = [
                exp.get("job_title") or exp.get("title", ""),
                exp.get("company", ""),
                exp.get("location", ""),
                # Skip the range entirely rather than emitting a bare " to ".
                f"{start} to {end}" if (start or end) else "",
            ]
            results.append(" | ".join(str(p) for p in parts if p))
        return results

    basics = resume.get("basics") or {}
    title = basics.get("current_title", "")
    location = basics.get("location", "")

    skills = safe_join(resume.get("skills") or [])
    certs = safe_join(extract_certifications(resume.get("certifications") or []))
    education = safe_join(extract_education(resume.get("education") or []))
    experience = safe_join(extract_experience(resume.get("experience") or []))
    projects = safe_join(resume.get("projects") or [])
    languages = safe_join(resume.get("languages") or [])

    summary = f"""
Title: {title}
Location: {location}
Skills: {skills}
Certifications: {certs}
Education: {education}
Experience: {experience}
Projects: {projects}
Languages: {languages}
"""
    return summary.strip()


In [None]:
def jd_to_text(jd: Dict) -> str:
    """Flatten a job description dict into a labelled multi-line text block.

    The summary comes from "summary", or from "description" when "summary"
    is absent. The location line is tagged "Remote" when "remote_option" is
    truthy and "Onsite" otherwise; a missing domain is shown as "unknown".

    Returns:
        Text with one "Label: value" line per field, stripped of surrounding
        whitespace.
    """
    def joined(key):
        # Comma-join the truthy entries of one list field.
        return ", ".join(str(entry) for entry in jd.get(key, []) if entry)

    summary = jd["summary"] if "summary" in jd else jd.get("description", "")
    work_mode = "Remote" if jd.get("remote_option") else "Onsite"

    lines = [
        f"Title: {jd.get('title', '')}",
        f"Domain: {jd.get('inferred_domain', 'unknown')}",
        f"Location: {jd.get('job_location', '')} ({work_mode})",
        f"Employment Type: {jd.get('employment_type', '')}",
        f"Summary: {summary}",
        f"Required Skills: {joined('required_skills')}",
        f"Optional Skills: {joined('optional_skills')}",
        f"Soft Skills: {joined('soft_skills')}",
        f"Preferred Degrees: {joined('preferred_degrees')}",
        f"Certifications: {joined('certifications')}",
    ]
    return "\n".join(lines).strip()


### Embedding + Relevance Generator

In [None]:
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def generate_semantic_relevance_map(
    resumes: List[Dict],
    jds: List[Dict],
    top_n: int = 10,
    model_name: str = "all-MiniLM-L6-v2"
) -> Dict[str, List[Dict]]:
    """
    Generates a relevance map using cosine similarity of sentence embeddings
    between each resume and all job descriptions.

    JDs without a "jd_id" or without text are skipped, as are resumes without
    a "resume_id" or without text. When no usable JD remains, an empty map is
    returned immediately, before the embedding model is loaded, because there
    is nothing to compare against.

    Args:
        resumes: Resume dicts.
        jds: Job description dicts.
        top_n: Maximum number of best-matching JDs kept per resume.
        model_name: SentenceTransformer model to load.

    Returns:
        Dict[resume_id, List[{jd_id, score}]]
    """
    print("🧠 Encoding job descriptions...")
    jd_ids, jd_texts = [], []
    for jd in jds:
        try:
            if not jd.get("jd_id"):
                continue
            text = jd_to_text(jd)
            if text.strip():
                jd_ids.append(jd["jd_id"])
                jd_texts.append(text)
        except Exception as e:
            print(f"⚠️ Skipping JD due to error: {e}")
            continue

    if not jd_texts:
        # Without JD embeddings every similarity call would fail per resume.
        print("⚠️ No usable job descriptions; returning an empty relevance map.")
        return {}

    model = SentenceTransformer(model_name)
    jd_embeddings = model.encode(jd_texts, show_progress_bar=True, batch_size=32)

    relevance_map = {}

    print("📄 Encoding resumes and computing similarities...")
    for resume in tqdm(resumes, desc="Generating semantic relevance map"):
        try:
            resume_id = resume.get("resume_id")
            if not resume_id:
                continue

            resume_text = resume_to_text(resume)
            if not resume_text.strip():
                continue

            resume_emb = model.encode([resume_text])[0]
            sim_scores = cosine_similarity([resume_emb], jd_embeddings)[0]
            # Indices of the highest similarities first, limited to top_n.
            top_indices = np.argsort(sim_scores)[::-1][:top_n]

            top_matches = [
                {"jd_id": jd_ids[i], "score": round(float(sim_scores[i]), 4)}
                for i in top_indices
            ]

            relevance_map[resume_id] = top_matches
        except Exception as e:
            print(f"⚠️ Error processing resume {resume.get('resume_id', 'unknown')}: {e}")
            continue

    return relevance_map


## Phase 3: Scoring Loop

### Checkpoint Handling (JSON)

In [None]:
from datetime import datetime, timezone

def load_resume_checkpoint(path: str) -> int:
    """Return the "last_index" stored in the checkpoint JSON at `path`.

    Returns 0 when the file does not exist or has no "last_index" key.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as file:
            state = json.load(file)
        return state.get("last_index", 0)
    return 0

def save_resume_checkpoint(path: str, index: int):
    """Write `index` as "last_index" plus a UTC ISO-8601 timestamp to `path`."""
    saved_at = datetime.now(timezone.utc).isoformat()
    save_json_output({"last_index": index, "timestamp": saved_at}, path)
   

### Get Relevant JDs

In [None]:
def get_relevant_jds(
    resume_id: str,
    relevance_map: Dict[str, List[Dict]],
    jd_lookup: Dict[str, Dict],
    threshold: float = 0.5
) -> List[Dict]:
    """Return the JD dicts matched to `resume_id` with a score of at least `threshold`.

    Matches keep the order they have in `relevance_map`; matches whose
    "jd_id" is not in `jd_lookup` are dropped. An unknown `resume_id` yields
    an empty list.
    """
    selected = []
    for match in relevance_map.get(resume_id, []):
        if not match["score"] >= threshold:
            continue
        if match["jd_id"] in jd_lookup:
            selected.append(jd_lookup[match["jd_id"]])
    return selected


### Scoring a Single Resume-JD Pair

In [None]:
def score_resume_jd_pair(
    resume: Dict,
    jd: Dict,
    resume_id: str,
    jd_id: str,
    rule_weight: float,
    llm_weight: float,
    relevance_score: float,
) -> Dict:
    """Score one resume/JD pair with both engines and return the annotated merge.

    The merged result is extended with the ids, a UTC timestamp, the JD's
    inferred domain, the given relevance score, and a "match_quality" label
    ("strong" from 0.75, "medium" from 0.5, otherwise "weak") derived from the
    final ATS score.
    """
    merged = merge_section_scores(
        rule_based_scoring(resume, jd),
        llm_based_scoring(resume, jd, resume_id, jd_id),
        rule_weight,
        llm_weight,
    )

    final_score = merged["final_ats_score"]
    if final_score >= 0.75:
        quality = "strong"
    elif final_score >= 0.5:
        quality = "medium"
    else:
        quality = "weak"

    merged["resume_id"] = resume_id
    merged["job_id"] = jd_id
    merged["timestamp"] = datetime.now(timezone.utc).isoformat()
    merged["inferred_domain"] = jd.get("inferred_domain", "unknown")
    merged["relevance_score"] = relevance_score
    merged["match_quality"] = quality
    return merged


### Main Modular Function

In [None]:
from tqdm import tqdm
from typing import List, Dict
import os
import time

def score_and_save_in_batches(
    resumes: List[Dict],
    jd_lookup: Dict[str, Dict],
    relevance_map: Dict[str, List[Dict]],
    output_dir: str = Config.JSON_OUTPUT_SCORING_DIR,
    save_every: int = 5,
    limit: int = 20,
    relevance_threshold: float = 0.4,
    rule_weight: float = 0.5,
    llm_weight: float = 0.5,
    resume_from_checkpoint: bool = True
):
    """Score a window of resumes against their relevant JDs, saving as it goes.

    Processing starts at the checkpointed index (or 0) and covers at most
    `limit` resumes. After every `save_every` processed resumes, including
    resumes that had no relevant JDs, the accumulated successes and failures
    are written and the checkpoint is advanced. A final save always runs at
    the end. Saved files hold the cumulative results of this call.

    Args:
        resumes: Resume dicts, indexed by position for checkpointing.
        jd_lookup: Mapping of jd_id to JD dict.
        relevance_map: Mapping of resume_id to a list of {jd_id, score}.
        output_dir: Folder for result files and `checkpoint.json`.
        save_every: Periodic save interval; values <= 0 disable periodic saves.
        limit: Maximum number of resumes processed by this call.
        relevance_threshold: Minimum relevance score for a JD to be scored.
        rule_weight: Weight of the rule-based score.
        llm_weight: Weight of the LLM-based score.
        resume_from_checkpoint: Start from the stored index instead of 0.
    """
    os.makedirs(output_dir, exist_ok=True)
    checkpoint_file = os.path.join(output_dir, "checkpoint.json")

    # Determine starting index
    start = load_resume_checkpoint(checkpoint_file) if resume_from_checkpoint else 0
    # max() keeps a checkpoint that lies beyond the list from moving backwards.
    end = max(start, min(start + limit, len(resumes)))

    successes, failures = [], []
    timestamp = int(time.time())

    def _save(label: str, upto: int, next_index: int):
        # Write the cumulative results, then record where to resume.
        if successes:
            save_json_output(successes, f"{output_dir}/scored_{label}_{start}_{upto}_{timestamp}.json")
        if failures:
            save_json_output(failures, f"{output_dir}/failed_{label}_{start}_{upto}_{timestamp}.json")
        save_resume_checkpoint(checkpoint_file, next_index)

    for idx in tqdm(range(start, end), desc="Scoring resumes"):
        resume = resumes[idx]
        resume_id = resume.get("resume_id", f"resume_{idx}")

        relevant_jds = get_relevant_jds(resume_id, relevance_map, jd_lookup, threshold=relevance_threshold)
        if not relevant_jds:
            # No `continue` here: the periodic save below must still run.
            print(f"⚠️ No relevant JDs found for {resume_id}")

        for jd in relevant_jds:
            jd_id = jd.get("jd_id", "")
            try:
                relevance_score = next((m["score"] for m in relevance_map[resume_id] if m["jd_id"] == jd_id), 0.0)
                result = score_resume_jd_pair(
                    resume, jd, resume_id, jd_id,
                    rule_weight, llm_weight,
                    relevance_score
                )
                successes.append(result)
            except Exception as e:
                # One failing pair must not abort the batch; record and move on.
                failures.append({
                    "resume_id": resume_id,
                    "jd_id": jd_id,
                    "error": str(e)
                })

        if save_every > 0 and (idx - start + 1) % save_every == 0:
            _save("part", idx, idx + 1)

    # Final save
    _save("final", end, end)


## Load Normalized Resumes and JDs, and Build the Relevance Map

### Load Resumes

In [None]:
import os

# Normalized resumes are read from the normalized-output folder.
resumes_path = os.path.join(Config.JSON_OUTPUT_NORMALIZED_DIR, 'normalized_resumes.json')
resumes = load_json_file(resumes_path)

### Load JDs

In [None]:
import os

# Normalized job descriptions are read from the normalized-output folder.
jds_path = os.path.join(Config.JSON_OUTPUT_NORMALIZED_DIR, 'normalized_jds.json')
jds = load_json_file(jds_path)


### Create and save the relevance map

In [None]:
import os

# Keep the 10 most similar JDs per resume, then save the map to the scoring folder.
relevance_map = generate_semantic_relevance_map(resumes, jds, top_n=10)

relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
save_json_output(relevance_map, relevance_map_file)

In [None]:
# Sanity check: report how many records were loaded.
print(f"Loaded {len(resumes)} resumes and {len(jds)} job descriptions.")

## Execute Scoring

In [None]:

# Index JDs by id. Entries without a "jd_id" are skipped, as in
# generate_semantic_relevance_map, instead of raising KeyError.
jd_lookup = {jd["jd_id"]: jd for jd in jds if jd.get("jd_id")}

score_and_save_in_batches(
    resumes=resumes,
    jd_lookup=jd_lookup,
    relevance_map=relevance_map,
    output_dir=Config.JSON_OUTPUT_SCORING_DIR,
    save_every=5,
    limit=1,
    relevance_threshold=0.45,
    rule_weight=0.5,
    llm_weight=0.5,
    resume_from_checkpoint=True  # set True to resume, False to start fresh
)
