# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [2]:
import subprocess

def is_running_in_colab():
    """Report whether this notebook is executing inside Google Colab.

    Detection is done by attempting to import ``google.colab``.

    Returns:
        bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Query ``nvidia-smi`` for the free memory of the first listed GPU.

    The first line of the ``memory.free`` query output is parsed as an integer
    (treated as megabytes) and divided by 1024.

    Returns:
        float: The converted free-memory figure, or 0.0 if the command cannot
        be run or its output cannot be parsed.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        raw = subprocess.check_output(query, encoding="utf-8")
        first_gpu_line = raw.strip().split("\n")[0]
        return int(first_gpu_line) / 1024
    except Exception:
        # Best effort: any failure (no driver, no GPU, odd output) means "no memory".
        return 0.0


## Install dependencies

In [3]:
# Install the packages this notebook relies on.
# - In Colab the packages are installed with `!pip` shell commands.
# - Elsewhere the `%pip` magic is used, and a CUDA 12.1 build of torch plus an
#   upgraded bitsandbytes are installed explicitly.
# NOTE(review): no versions are pinned, so a re-run may pull newer releases.
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
    !pip install sentence-transformers scikit-learn
    !pip install rapidfuzz unidecode

else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Alternative torch builds kept for reference (pinned 2.2.2 / nightly cu128):
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    #%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes
    %pip install regex json5
    %pip install sentence-transformers scikit-learn
    %pip install rapidfuzz unidecode


Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.0/68.0 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyyaml
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (751 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m751.2/751.2 KB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

## Log in to Hugging Face

In [4]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it. Lookup order:
#   1. the HUGGINGFACE_TOKEN environment variable
#   2. the Colab secret named HF_TOKEN (only when running in Colab)
#   3. a hidden interactive prompt (everywhere else)
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Prompt with getpass rather than input() so the secret is not echoed
        # into the cell output, where it would be saved with the notebook.
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


  from .autonotebook import tqdm as notebook_tqdm


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure ``~/.kaggle/kaggle.json`` exists, uploading it via Colab if missing.

    When the file is absent, the Colab upload widget is opened and one uploaded
    file is moved into place with permissions ``0o600``. If several files are
    uploaded, the one named ``kaggle.json`` is used; otherwise the last uploaded
    file is used (the previous behaviour moved every file onto the same
    destination, so the last one silently won).

    Raises:
        ImportError: The credentials are missing and ``google.colab`` cannot be
            imported (i.e. not running in Colab).
        FileNotFoundError: The upload dialog returned no files.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    from google.colab import files
    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; cannot set up Kaggle credentials.")

    # Prefer the correctly named file; fall back to the last upload otherwise.
    uploaded_names = list(uploaded.keys())
    source_name = "kaggle.json" if "kaggle.json" in uploaded else uploaded_names[-1]

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(source_name, kaggle_path)
    os.chmod(kaggle_path, 0o600)  # owner read/write only
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Make sure the Kaggle credentials file is in place before continuing.
setup_kaggle_credentials()

##  Load Qwen-Instruct with Fallback to Quantized

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal language model and wrap it in a text-generation pipeline.

    On a CUDA device whose total memory is below 24 GB the model is loaded with
    4-bit NF4 quantization; on larger GPUs and on CPU it is loaded in float16
    without quantization.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        hf_token: Access token passed to ``from_pretrained`` as ``token``.

    Returns:
        A ``text-generation`` pipeline built from the loaded model and
        tokenizer, with ``batch_size=1``.
    """
    has_cuda = torch.cuda.is_available()
    # This is the device's *total* memory, not the amount currently free.
    total_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {total_mem:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and total_mem < 24

    # Quantization config is only built when 4-bit loading is requested.
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Only fill in a pad token when the tokenizer has none; overwriting an
    # existing pad token with EOS would make the two indistinguishable.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        torch_dtype=None if quant_config else torch.float16,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the notebook-wide text-generation pipeline for Qwen2-7B-Instruct,
# passing the token resolved in the login cell.
llm_pipeline = load_model_pipeline(
    model_name="Qwen/Qwen2-7B-Instruct",
    hf_token=HF_TOKEN
)

# Global utilities

### Utility to save json to a folder

In [5]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Write ``data`` to ``output_path`` as UTF-8 JSON, creating parent folders.

    Args:
        data: Any JSON-serializable object.
        output_path: Destination file. May be a bare filename, in which case
            the file is written to the current working directory.
        indent: Indentation passed to ``json.dump``.
        overwrite: When False, refuse to replace an existing file.

    Raises:
        FileExistsError: ``output_path`` exists and ``overwrite`` is False.
    """
    output_dir = os.path.dirname(output_path)
    # A bare filename has an empty directory part, and os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_path):
        if not overwrite:
            raise FileExistsError(f"File {output_path} already exists and overwrite=False.")
        os.remove(output_path)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, indent=indent, ensure_ascii=False)

    print(f"✅ Saved output to {output_path}")


### Utility to load file

In [6]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    """Parse a UTF-8 JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    raise FileNotFoundError(f"File not found: {file_path}")

### Utility to save jsonl file

In [7]:
def save_jsonl_line(record: dict, output_file: str):
    """Append a single JSON object as one line of a ``.jsonl`` file.

    The record is serialized *before* the file is opened, so a record that is
    not JSON-serializable raises without leaving a partial line behind
    (streaming ``json.dump`` would have written the leading part of the object).

    Args:
        record: The object to serialize.
        output_file: Path of the JSONL file; created if it does not exist.

    Raises:
        TypeError: ``record`` contains a value that JSON cannot encode.
    """
    line = json.dumps(record, ensure_ascii=False)
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(line + "\n")


### Configurations  

In [8]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Relative output directories used throughout the notebook.

    Every path lives under the ``json_outputs_all_data`` root and uses forward
    slashes.
    """

    JSON_OUTPUT_DIR = "json_outputs_all_data"

    # Normalized resume / job-description inputs
    JSON_OUTPUT_NORMALIZED_DIR = JSON_OUTPUT_DIR + "/normalized"
    JSON_OUTPUT_NORMALIZED_JD = JSON_OUTPUT_NORMALIZED_DIR + "/jd"
    JSON_OUTPUT_NORMALIZED_RESUME = JSON_OUTPUT_NORMALIZED_DIR + "/resume"

    # Scoring outputs
    JSON_OUTPUT_SCORING_DIR = JSON_OUTPUT_DIR + "/scoring"
    JSON_OUTPUT_SCORING_SPLIT_DIR = JSON_OUTPUT_SCORING_DIR + "/split"
    JSON_OUTPUT_SCORING_FT_DATA = JSON_OUTPUT_SCORING_DIR + "/FT_data"

    # Fine-tuning artifacts
    JSON_OUTPUT_FINE_TUNE_SCORE = JSON_OUTPUT_DIR + "/fine-tune/scored"
    JSON_OUTPUT_FINE_TUNE_RECORD = JSON_OUTPUT_DIR + "/fine-tune/record"
    JSON_OUTPUT_FINE_TUNE_TEST_DATA = JSON_OUTPUT_DIR + "/fine-tune/test-data"

  


# Generate 30K core sample semantic data file

In [None]:
from typing import List, Dict, Tuple, Set
from collections import defaultdict, Counter
import random

# === 1. FILTERING ===
def filter_pairs_by_score(pairs: List[Dict], threshold: float) -> List[Dict]:
    """Keep the pairs whose ``resume_jd_similarity`` is at least ``threshold``.

    Pairs without a ``resume_jd_similarity`` key are treated as scoring 0.0.
    Input order is preserved.
    """
    kept = []
    for pair in pairs:
        if pair.get("resume_jd_similarity", 0.0) >= threshold:
            kept.append(pair)
    return kept


# === 2. GROUPING ===
def group_pairs_by_quality_and_resume(pairs: List[Dict]) -> Tuple[
    Dict[str, List[Dict]],
    Dict[int, Dict[str, List[Dict]]],
    Dict[str, List[Dict]]
]:
    """Index the pairs three ways: by label, by resume and label, and by domain.

    Labels and domains are lower-cased; a missing ``semantic_match_label``,
    ``resume_domain`` or ``jd_domain`` is recorded as ``"unknown"``.

    Returns:
        A tuple ``(by_quality, by_resume, by_domain)`` of defaultdicts:
        label -> pairs; resume_id -> label -> pairs; domain -> pairs. A pair is
        appended to ``by_domain`` once for its resume domain and once for its JD
        domain, so it appears twice under one key when both domains are equal.
    """
    by_quality = defaultdict(list)
    by_resume = defaultdict(lambda: defaultdict(list))
    by_domain = defaultdict(list)

    for pair in pairs:
        label = pair.get("semantic_match_label", "unknown").lower()
        by_quality[label].append(pair)
        by_resume[pair["resume_id"]][label].append(pair)
        for domain_field in ("resume_domain", "jd_domain"):
            by_domain[pair.get(domain_field, "unknown").lower()].append(pair)

    return by_quality, by_resume, by_domain


# === 3. RESUME-BALANCED SAMPLING ===
def resume_balanced_sampling(
    resume_to_qualities: Dict[int, Dict[str, List[Dict]]],
    target_qualities: List[str],
    seen_pairs: Set[Tuple[int, int]]
) -> Tuple[Dict[str, List[Dict]], int]:
    """Pick, for every resume, one random pair per target quality it has.

    A pair whose ``(resume_id, jd_id)`` key is already in ``seen_pairs`` is
    skipped; every selected key is added to ``seen_pairs`` (mutated in place).
    Target counts are not consulted here, so the result may hold more pairs
    for a quality than a later stage wants.

    Args:
        resume_to_qualities: resume_id -> quality label -> candidate pairs.
        target_qualities: Quality labels to sample, in priority order.
        seen_pairs: Keys of pairs that are already selected.

    Returns:
        ``(selected, resume_count)`` where ``selected`` maps quality label to
        the chosen pairs and ``resume_count`` is the number of resumes that
        contributed at least one pair.
    """
    selected = defaultdict(list)
    resume_count = 0

    for qmap in resume_to_qualities.values():
        used = 0
        for q in target_qualities:
            if q not in qmap:
                continue
            pair = random.choice(qmap[q])
            key = (pair["resume_id"], pair["jd_id"])
            if key in seen_pairs:
                continue
            selected[q].append(pair)
            seen_pairs.add(key)
            used += 1
        if used > 0:
            resume_count += 1

    return selected, resume_count


# === 4. DOMAIN QUOTA ENFORCEMENT ===
def enforce_domain_quota(
    quality_buckets: Dict[str, List[Dict]],
    seen_pairs: Set[Tuple[int, int]],
    target_counts: Dict[str, int],
    current_counts: Dict[str, int],
    min_per_domain: int
) -> Dict[str, List[Dict]]:
    """Sample up to ``min_per_domain`` unseen pairs per (quality, domain) group.

    Pairs are grouped under both their resume domain and their JD domain
    (lower-cased, ``"unknown"`` when missing). Sampling for a quality stops
    once its running count reaches the target.

    Quality labels present in ``quality_buckets`` but absent from
    ``target_counts`` (for example ``"unknown"``) are ignored; previously they
    caused a ``KeyError``.

    Args:
        quality_buckets: quality label -> candidate pairs.
        seen_pairs: Keys of already selected pairs; mutated in place.
        target_counts: quality label -> desired number of pairs.
        current_counts: quality label -> pairs selected so far; mutated in place.
        min_per_domain: Maximum number of pairs taken per (quality, domain) group.

    Returns:
        quality label -> newly selected pairs.
    """
    selected = defaultdict(list)
    domain_pair_cache = defaultdict(list)

    for q, bucket in quality_buckets.items():
        if q not in target_counts:
            continue  # no quota defined for this label, nothing to enforce
        current_counts.setdefault(q, 0)
        for p in bucket:
            d1 = p.get("resume_domain", "unknown").lower()
            d2 = p.get("jd_domain", "unknown").lower()
            domain_pair_cache[(q, d1)].append(p)
            domain_pair_cache[(q, d2)].append(p)

    for (q, _domain), group in domain_pair_cache.items():
        if current_counts[q] >= target_counts[q]:
            continue  # already filled this quality

        candidates = [p for p in group if (p["resume_id"], p["jd_id"]) not in seen_pairs]
        room_left = target_counts[q] - current_counts[q]
        take_count = min(min_per_domain, len(candidates), room_left)
        sampled = random.sample(candidates, take_count)

        for p in sampled:
            key = (p["resume_id"], p["jd_id"])
            # A pair whose two domains are equal is listed twice in its group,
            # so the same key can be drawn twice in one sample.
            if key not in seen_pairs:
                selected[q].append(p)
                seen_pairs.add(key)
                current_counts[q] += 1
                if current_counts[q] >= target_counts[q]:
                    break  # stop sampling more of this quality

    return selected



# === 5. FILL REMAINING WITH DIVERSITY ===
def fill_remaining_by_diverse_domains(
    quality_buckets: Dict[str, List[Dict]],
    selected_by_quality: Dict[str, List[Dict]],
    target_counts: Dict[str, int],
    seen_pairs: Set[Tuple[int, int]]
) -> Dict[str, List[Dict]]:
    """Top up each quality to its target by rotating through domains.

    For every quality still below target, the unseen pairs are grouped by
    resume domain and JD domain (lower-cased, ``"unknown"`` when missing). The
    domains are shuffled and visited in rotation; each visit picks one random
    unseen pair from that domain. A domain with no unseen pairs left drops out
    of the rotation.

    ``selected_by_quality`` and ``seen_pairs`` are updated in place; the same
    ``selected_by_quality`` object is returned.
    """
    for quality, target in target_counts.items():
        shortfall = target - len(selected_by_quality[quality])
        if shortfall > 0:
            # Group the still-unseen pairs of this quality by domain.
            per_domain = defaultdict(list)
            for pair in quality_buckets[quality]:
                if (pair["resume_id"], pair["jd_id"]) in seen_pairs:
                    continue
                for domain_field in ("resume_domain", "jd_domain"):
                    per_domain[pair.get(domain_field, "unknown").lower()].append(pair)

            rotation = list(per_domain)
            random.shuffle(rotation)

            picks = []
            while shortfall > 0 and rotation:
                domain = rotation.pop(0)
                # Re-filter each visit: a pair may have been taken via its other domain.
                unseen = [
                    pair for pair in per_domain[domain]
                    if (pair["resume_id"], pair["jd_id"]) not in seen_pairs
                ]
                if unseen:
                    pick = random.choice(unseen)
                    picks.append(pick)
                    seen_pairs.add((pick["resume_id"], pick["jd_id"]))
                    per_domain[domain].remove(pick)
                    rotation.append(domain)  # domain stays in play
                    shortfall -= 1

            selected_by_quality[quality].extend(picks)

    return selected_by_quality


In [None]:
# === 6. MAIN WRAPPER ===
def generate_balanced_sample(
    all_pairs: List[Dict],
    target_counts: Dict[str, int],
    score_threshold: float = 0.1,
    min_per_domain: int = 50
) -> List[Dict]:
    """Run the full sampling pipeline and return the selected pairs.

    Stages, in order: score filtering, grouping, resume-balanced sampling,
    per-domain quota sampling, and a diversity-driven top-up. Progress and the
    final distribution are printed along the way.

    Args:
        all_pairs: Candidate resume/JD pairs.
        target_counts: quality label -> desired number of pairs.
        score_threshold: Minimum ``resume_jd_similarity`` to keep a pair.
        min_per_domain: Per (quality, domain) cap used in the quota stage.

    Returns:
        The selected pairs, concatenated in the key order of ``target_counts``.
    """
    print(f"🔍 Total pairs in input: {len(all_pairs)}")
    filtered = filter_pairs_by_score(all_pairs, score_threshold)
    print(f"✅ After score ≥ {score_threshold}: {len(filtered)}")

    quality_buckets, resume_to_qualities, _ = group_pairs_by_quality_and_resume(filtered)
    print("📊 Match quality counts:")
    for label in target_counts:
        print(f"  {label.upper():<7}: {len(quality_buckets[label])}")
    print(f"👥 Unique resumes: {len(resume_to_qualities)}")

    # Shared across all stages so no pair is selected twice.
    seen_pairs = set()

    # Stage 1: resume-balanced
    selected_by_quality, resume_count = resume_balanced_sampling(
        resume_to_qualities, list(target_counts.keys()), seen_pairs
    )
    print(f"👤 Resume-balanced resumes: {resume_count}")

    # Stage 2: domain quotas
    current_counts = {label: len(selected_by_quality[label]) for label in target_counts}
    quota_picks = enforce_domain_quota(
        quality_buckets,
        seen_pairs,
        target_counts,
        current_counts,
        min_per_domain
    )
    for label in target_counts:
        selected_by_quality[label].extend(quota_picks.get(label, []))

    # Stage 3: fill whatever is still missing
    selected_by_quality = fill_remaining_by_diverse_domains(
        quality_buckets, selected_by_quality, target_counts, seen_pairs
    )

    # Merge the per-quality selections into one list
    final_sample = []
    print("\n📦 Final sampled count:")
    for label in target_counts:
        chosen = selected_by_quality[label]
        print(f"  {label.upper():<7}: {len(chosen)}")
        final_sample.extend(chosen)

    print(f"\n🎯 Total selected: {len(final_sample)}")

    # Domain coverage report
    resume_tally = Counter(p.get("resume_domain", "unknown").lower() for p in final_sample)
    jd_tally = Counter(p.get("jd_domain", "unknown").lower() for p in final_sample)

    print("\n📊 Resume domain coverage (top 10):")
    for dom, count in resume_tally.most_common(10):
        print(f"  {dom:<30} {count}")

    print("\n📊 JD domain coverage (top 10):")
    for dom, count in jd_tally.most_common(10):
        print(f"  {dom:<30} {count}")

    return final_sample

In [None]:
# 🎯 Define how many samples per match type
# Number of pairs requested per match label.
# NOTE(review): these targets sum to 60,000, while the section heading says
# "30K" - confirm which figure is intended.
target_counts = {
    "strong": 20000,
    "medium": 25000,
    "weak": 15000
}

# Load the scored pairs; the list is expected under the
# "semantic_relevance_scores" key (an empty list is used if the key is absent).
input_path = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
full_data  = load_json_file(input_path)
relevance_data = full_data.get("semantic_relevance_scores", [])

# Generate the balanced subset, keeping only pairs scoring at least 0.2.
sampled = generate_balanced_sample(
    all_pairs=relevance_data,
    target_counts=target_counts,
    score_threshold=0.2
)

# Wrap the list under the same key the input file uses.
relevance_wrapped = {
    "semantic_relevance_scores": sampled
}

# Save the sampled subset next to the input file.
relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'relevant_pairs.json')
save_json_output(relevance_wrapped, relevance_map_file)


# Generate a normally distributed 30K-sample file from the score files

In [None]:
import os
import re
import json
import random
from collections import defaultdict, Counter

In [None]:
# Settings for selecting fine-tuning files.
# SCORING_FOLDER: directory scanned for per-pair scoring JSON files.
# OUTPUT_FILE: where the list of selected filenames is written.
# TARGET_COUNTS: number of files wanted per match quality (30,000 in total).
# QUALITY_BUCKETS: the quality labels, in the key order of TARGET_COUNTS.
SCORING_FOLDER = Config.JSON_OUTPUT_SCORING_DIR  # path to scoring files
OUTPUT_FILE = os.path.join(Config.JSON_OUTPUT_SCORING_FT_DATA, "fine_tuning_filenames.json")  # JSON_OUTPUT_SCORING_FT_DATA
TARGET_COUNTS = {"strong": 10000, "medium": 15000, "weak": 5000}
QUALITY_BUCKETS = list(TARGET_COUNTS.keys())


In [None]:
def parse_filename(filename):
    """Extract pair metadata from a scoring filename.

    Expected layout:
    ``<resume_id>_<jd_id>_<strong|medium|weak>_<YYYYMMDD>_<HHMMSS>_<score>.json``
    (for example a timestamp part of ``20250616_160025``).

    Args:
        filename: Bare filename, without any directory part.

    Returns:
        A dict with ``filename``, ``resume_id`` and ``jd_id`` (strings),
        ``quality`` and ``score`` (float), or None when the name does not match
        the layout or the score part is not a valid number (e.g. ``0.8.1``).
    """
    match = re.match(r"^(\d+)_(\d+)_(strong|medium|weak)_\d{8}_\d{6}_([\d.]+)\.json$", filename)
    if not match:
        return None

    resume_id, jd_id, quality, score_text = match.groups()
    try:
        score = float(score_text)
    except ValueError:
        # The pattern accepts any mix of digits and dots; not all are numbers.
        return None

    return {
        "filename": filename,
        "resume_id": resume_id,
        "jd_id": jd_id,
        "quality": quality,
        "score": score
    }


In [None]:
from collections import defaultdict

def load_scoring_files(folder):
    """Collect metadata for every recognizable scoring file in ``folder``.

    Filenames are parsed with ``parse_filename``; each matching file is then
    opened to read ``domain``, ``match_quality`` and ``semantic_match_label``.
    The two labels fall back to the quality encoded in the filename and are
    stripped and lower-cased. Files are visited in sorted name order so the
    result does not depend on the filesystem's listing order.

    Non-``.json`` entries are ignored silently. Unrecognized ``.json`` names
    and files that fail to load are reported with a printed message and
    skipped.

    Args:
        folder: Directory containing the scoring JSON files.

    Returns:
        A list of metadata dicts, one per successfully loaded file.
    """
    all_meta = []

    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".json"):
            continue

        # Core metadata comes from the filename itself.
        meta = parse_filename(fname)
        if not meta:
            print(f"⚠️ Skipping unrecognized filename format: {fname}")
            continue

        path = os.path.join(folder, fname)
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Extra fields come from inside the JSON document.
            meta["domain"] = data.get("domain", "UNKNOWN")

            # The filename's quality is the fallback for missing label fields.
            match_quality = data.get("match_quality", meta["quality"])
            semantic_match_label = data.get("semantic_match_label", meta["quality"])

            meta["match_quality"] = str(match_quality).strip().lower()
            meta["semantic_match_label"] = str(semantic_match_label).strip().lower()

            all_meta.append(meta)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"❌ Error loading {fname}: {e}")

    return all_meta


In [None]:
def group_by_label(meta_list):
    """Split metadata records into the buckets used for selection.

    Returns:
        A dict with keys ``strong``, ``strong_semantic``, ``medium`` and
        ``weak`` (in that order). ``strong``/``medium``/``weak`` hold records
        with that ``match_quality``. ``strong_semantic`` holds records whose
        ``semantic_match_label`` is ``"strong"`` while ``match_quality`` is
        not, so such a record can also appear under ``medium`` or ``weak``.
    """
    groups = {"strong": [], "strong_semantic": [], "medium": [], "weak": []}
    for meta in meta_list:
        quality = meta["match_quality"]
        semantic = meta["semantic_match_label"]
        if semantic == "strong" and quality != "strong":
            groups["strong_semantic"].append(meta)
        if quality in ("strong", "medium", "weak"):
            groups[quality].append(meta)
    return groups

In [None]:
def select_by_domain_distribution(meta_list, limit):
    """Pick up to ``limit`` records, spreading the picks across domains.

    Each domain first contributes up to ``limit // number_of_domains`` (at
    least one) randomly chosen records. If that leaves the selection short,
    the gap is filled with random records from the rest of ``meta_list``.

    Args:
        meta_list: Records with at least ``domain`` and ``filename`` keys.
        limit: Maximum number of records to return.

    Returns:
        A list of at most ``limit`` records; empty when ``meta_list`` is empty
        or ``limit`` is not positive.
    """
    if not meta_list:
        print(f"⚠️ No data available to select from for limit {limit}")
        return []
    if limit <= 0:
        # A negative limit would otherwise slice from the end of the list.
        return []

    domain_buckets = defaultdict(list)
    for m in meta_list:
        domain_buckets[m["domain"]].append(m)

    per_domain = max(1, limit // len(domain_buckets))
    selected = []
    for bucket in domain_buckets.values():
        random.shuffle(bucket)
        selected.extend(bucket[:per_domain])

    # Fill remaining if needed
    if len(selected) < limit:
        # Build the lookup set once instead of once per candidate.
        chosen_names = {m["filename"] for m in selected}
        remaining = [m for m in meta_list if m["filename"] not in chosen_names]
        random.shuffle(remaining)
        selected.extend(remaining[:limit - len(selected)])
    return selected[:limit]


In [None]:
def log_distribution(selected_list, label):
    """Print the size of a selection and its per-domain breakdown.

    Domains are listed from most to least frequent.

    Args:
        selected_list: Records that each have a ``domain`` key.
        label: Name of the selection, used in the heading line.
    """
    domain_counts = Counter(entry["domain"] for entry in selected_list)
    print(f"\n📊 Final count for {label}: {len(selected_list)}")
    for domain, count in domain_counts.most_common():
        print(f"  {domain}: {count}")


In [None]:

def main():
    """Select fine-tuning files per quality bucket and save their filenames.

    Steps:
      1. Load metadata for every scoring file and print label distributions.
      2. Strong: sample by domain when enough strong records exist; otherwise
         take all of them and top up from records that are strong only by
         semantic label.
      3. Medium, then weak: sample by domain from records not already chosen.
      4. Write the selected filenames to ``OUTPUT_FILE``.
    """
    all_meta = load_scoring_files(SCORING_FOLDER)

    # Debug: label distributions across all loaded files
    quality_tally = Counter(m.get("match_quality", "MISSING") for m in all_meta)
    semantic_tally = Counter(m.get("semantic_match_label", "MISSING") for m in all_meta)
    print("\n📋 match_quality field distribution:", quality_tally)
    print("📋 semantic_match_label field distribution:", semantic_tally)

    grouped = group_by_label(all_meta)

    # Step 1: strong group, with semantic fallback when there are too few
    strong_target = TARGET_COUNTS["strong"]
    strong_actual = grouped["strong"]

    if len(strong_actual) < strong_target:
        shortfall = strong_target - len(strong_actual)
        fallback_selected = select_by_domain_distribution(grouped["strong_semantic"], shortfall)

        strong_selected = strong_actual + fallback_selected
        print(f"\n⚙️ Strong fallback logic:")
        print(f"  ✅ Actual strong available: {len(strong_actual)}")
        print(f"  ➕ Needed fallback: {shortfall}")
        print(f"  🧩 Fallback added from semantic match: {len(fallback_selected)}")
        print(f"  ❌ Still missing: {shortfall - len(fallback_selected)} (if > 0, this means we are under target)")
    else:
        strong_selected = select_by_domain_distribution(strong_actual, strong_target)

    log_distribution(strong_selected, "strong")
    final_selected = list(strong_selected)
    selected_filenames = {m["filename"] for m in final_selected}

    # Steps 2 and 3: medium, then weak, always excluding files already used
    for bucket in ("medium", "weak"):
        pool = [m for m in grouped[bucket] if m["filename"] not in selected_filenames]
        picked = select_by_domain_distribution(pool, TARGET_COUNTS[bucket])
        log_distribution(picked, bucket)
        final_selected.extend(picked)
        selected_filenames.update(m["filename"] for m in picked)

    # Save output
    filenames = [m["filename"] for m in final_selected]
    save_json_output(filenames, OUTPUT_FILE, indent=2)
    print(f"\n✅ Saved final list to {OUTPUT_FILE}")

# Run the selection when this cell executes as the main module.
if __name__ == "__main__":
    main()


# Copy scored files to fine-tune/score folder

In [None]:
import os
import json
import shutil
from pathlib import Path

# Paths
# Copy every file named in fine_tuning_filenames.json from the scoring folder
# into the fine-tune "scored" folder.
# NOTE: SCORING_FOLDER is rebound here as a Path.
SCORING_FOLDER = Path(Config.JSON_OUTPUT_SCORING_DIR)  # Source folder
FT_JSON_FILE =  os.path.join(Config.JSON_OUTPUT_SCORING_FT_DATA, "fine_tuning_filenames.json") #Path("fine_tuning_filenames.json")  # File with selected filenames
TARGET_FOLDER = Path(Config.JSON_OUTPUT_FINE_TUNE_SCORE)  # Destination

# Ensure output folder exists
TARGET_FOLDER.mkdir(parents=True, exist_ok=True)

# Load filenames
#with open(FT_JSON_FILE, "r", encoding="utf-8") as f:
#    filenames = json.load(f)
    
filenames = load_json_file(FT_JSON_FILE)

print(f"📄 Loaded {len(filenames)} filenames from {FT_JSON_FILE}")

# Copy files; names missing from the source folder are collected, not fatal.
copied = 0
missing = []
for fname in filenames:
    src_path = SCORING_FOLDER / fname
    dest_path = TARGET_FOLDER / fname

    if src_path.exists():
        # copy2 also copies file metadata such as timestamps
        shutil.copy2(src_path, dest_path)
        copied += 1
    else:
        missing.append(fname)

# Summary: number copied, plus at most the first five missing names.
print(f"\n✅ Copied {copied} files to {TARGET_FOLDER}")
if missing:
    print(f"⚠️ {len(missing)} files were missing in {SCORING_FOLDER}:")
    for m in missing[:5]:
        print("  -", m)
    if len(missing) > 5:
        print(f"  ...and {len(missing) - 5} more.")


# Create SFT Record

In [None]:
import os
from pathlib import Path
from typing import Set, Dict
from glob import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# 🔁 Find resume/JD JSON by prefix
def find_file_with_prefix(directory: Path, prefix: str) -> Path:
    """Return the first file in ``directory`` named ``<prefix>_*.json``.

    When several files match, whichever one the directory listing yields first
    is returned.

    Raises:
        FileNotFoundError: No file matches the prefix.
    """
    first_match = next(iter(directory.glob(f"{prefix}_*.json")), None)
    if first_match is None:
        raise FileNotFoundError(f"No matching file for prefix: {prefix}")
    return first_match

In [None]:
import os

# 🧠 Determine optimal max_workers for I/O-bound tasks
def get_optimal_max_workers(io_bound: bool = True, cap: int = 32) -> int:
    """Suggest a worker count based on the number of CPU cores.

    Args:
        io_bound: True for I/O-bound work (four workers per core), False for
            CPU-bound work (one worker per core).
        cap: Upper bound on the returned value.

    Returns:
        The suggested worker count, never above ``cap``. Four cores are
        assumed when the core count cannot be determined.
    """
    cores = os.cpu_count() or 4
    wanted = 4 * cores if io_bound else cores
    return min(cap, wanted)


In [None]:
from typing import Any

def load_first_record_from_file(path: str) -> dict:
    """Load the single record stored in a JSON file.

    The file may contain either a dictionary, which is returned as is, or a
    list holding exactly one item, which is unwrapped.

    Raises:
        ValueError: The file holds a list whose length is not exactly one
            (this includes an empty list).
        TypeError: The file holds neither a list nor a dict.
    """
    data = load_json_file(path)

    if isinstance(data, dict):
        return data

    if not isinstance(data, list):
        raise TypeError(f"File {path} must contain a dict or single-record list, got {type(data)}")

    if len(data) != 1:
        raise ValueError(f"File {path} contains list with {len(data)} items — expected only one.")

    return data[0]


In [None]:
def process_single_file(
    fname: str,
    scoring_dir: Path,
    resume_dir: Path,
    jd_dir: Path,
    output_dir: Path,
    checkpoint_set: Set[str]
) -> str:
    """Turn one scoring file into an SFT record and write it to ``output_dir``.

    The record's ``input`` is the domain, resume text and JD text wrapped in
    marker tags; its ``output`` is the final ATS score plus each section score
    rounded to three decimals. The output file is named
    ``<resume_id>_<jd_id>_<semantic_match_label>.json``.

    Args:
        fname: Scoring filename, relative to ``scoring_dir``.
        scoring_dir: Folder containing the scoring files.
        resume_dir: Folder searched for ``resumes_<resume_id>_*.json``.
        jd_dir: Folder searched for ``jds_<jd_id>_*.json``.
        output_dir: Folder that receives the SFT record.
        checkpoint_set: ``<resume_id>_<jd_id>`` ids that are already processed.

    Returns:
        The ``<resume_id>_<jd_id>`` id on success. None when the id is already
        in ``checkpoint_set`` or when any error occurs (the error is printed).
    """
    try:
        record = load_json_file(str(scoring_dir / fname))
        resume_id = record['resume_id']
        jd_id = record['jd_id']

        uid = f"{resume_id}_{jd_id}"
        if uid in checkpoint_set:
            return None

        # Locate and load the normalized resume and job description.
        resume_path = find_file_with_prefix(resume_dir, f"resumes_{resume_id}")
        jd_path = find_file_with_prefix(jd_dir, f"jds_{jd_id}")
        resume_data = load_first_record_from_file(str(resume_path))
        jd_data = load_first_record_from_file(str(jd_path))

        resume_input = resume_data["input_text"]
        jd_input = jd_data["input_text"]

        # Model input: tagged domain, resume and JD blocks.
        input_text = (
            f"<DOMAIN>{record.get('domain', '').strip()}</DOMAIN>\n"
            f"<RESUME_START>\n{resume_input.strip()}\n<RESUME_END>\n\n"
            f"<JD_START>\n{jd_input.strip()}\n<JD_END>"
        )

        # Model target: overall score first, then one entry per section.
        section_scores = record["section_scores"]
        output_scores = {"final_ats_score": record["final_ats_score"]}
        for section, value in section_scores.items():
            output_scores[section] = round(value["score"], 3)

        sft_data = {
            "input": input_text,
            "output": output_scores
        }

        match_quality = record.get("semantic_match_label", "unknown")
        output_path = output_dir / f"{resume_id}_{jd_id}_{match_quality}.json"
        save_json_output(sft_data, str(output_path))

        return uid

    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"❌ Error processing {fname}: {e}")
        return None


In [None]:
def create_sft_record_parallel(
    scoring_file: Path,
    scoring_dir: Path,
    resume_dir: Path,
    jd_dir: Path,
    output_dir: Path,
    checkpoint_file: Path,
    max_workers: int = None
):
    """Build SFT records for all listed scoring files using a thread pool.

    Ids found in an existing checkpoint are skipped. The checkpoint is
    rewritten in a ``finally`` block, so ids completed before an interruption
    or unexpected error are still recorded and a re-run can resume.

    Args:
        scoring_file: JSON file containing the list of scoring filenames.
        scoring_dir: Folder containing those scoring files.
        resume_dir: Folder with normalized resume files.
        jd_dir: Folder with normalized job-description files.
        output_dir: Folder that receives the SFT records (created if missing).
        checkpoint_file: JSON file of the form ``{"processed": [...]}``.
        max_workers: Thread count; defaults to ``get_optimal_max_workers``
            for I/O-bound work.
    """
    os.makedirs(output_dir, exist_ok=True)
    filenames = load_json_file(str(scoring_file))  # list of filenames
    checkpoint_set = set()

    if os.path.exists(checkpoint_file):
        checkpoint_set = set(load_json_file(str(checkpoint_file)).get("processed", []))

    updated_checkpoint = list(checkpoint_set)
    max_workers = max_workers or get_optimal_max_workers(io_bound=True)

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    process_single_file, fname, scoring_dir, resume_dir, jd_dir, output_dir, checkpoint_set
                )
                for fname in filenames
            ]
            for future in as_completed(futures):
                result = future.result()
                if result:
                    updated_checkpoint.append(result)
    finally:
        # Persist progress even on failure, so finished work is not repeated.
        save_json_output({"processed": updated_checkpoint}, str(checkpoint_file))

    print(f"✅ All records processed with {max_workers} workers.")


In [None]:
# Build one SFT record per selected scoring file. Inputs come from the
# fine-tune "scored" folder and the normalized resume/JD folders; records and
# the checkpoint file are both written to the fine-tune "record" folder.
create_sft_record_parallel(
    scoring_file=os.path.join(Config.JSON_OUTPUT_SCORING_FT_DATA, "fine_tuning_filenames.json"),
    scoring_dir=Path(Config.JSON_OUTPUT_FINE_TUNE_SCORE),
    resume_dir=Path(Config.JSON_OUTPUT_NORMALIZED_RESUME),
    jd_dir=Path(Config.JSON_OUTPUT_NORMALIZED_JD),
    output_dir=Path(Config.JSON_OUTPUT_FINE_TUNE_RECORD),
    checkpoint_file=os.path.join(Config.JSON_OUTPUT_FINE_TUNE_RECORD, "sft_checkpoint.json"),
)


# Create training_data.jsonl from individual records

In [11]:
from pathlib import Path
from tqdm import tqdm

def merge_sft_json_to_jsonl_notebook(input_dir: str, output_file: str):
    """Combine the individual SFT JSON files of a folder into one JSONL file.

    Files are processed in sorted order. A file is written only when it has
    both an ``input`` and an ``output`` field; other files, and files that
    fail to load, are reported and skipped. An existing output file is
    replaced.

    Relies on the notebook utilities ``load_json_file`` (reads one JSON file)
    and ``save_jsonl_line`` (appends one record to the JSONL file).

    Raises:
        FileNotFoundError: ``input_dir`` does not exist.
    """
    source_dir = Path(input_dir)
    target = Path(output_file)

    if not source_dir.exists():
        raise FileNotFoundError(f"❌ Input directory not found: {input_dir}")

    # Make sure the destination folder exists.
    target.parent.mkdir(parents=True, exist_ok=True)

    # Start from a clean file, since records are appended one by one.
    if target.exists():
        os.remove(target)

    total_written = 0
    for file in tqdm(sorted(source_dir.glob("*.json")), desc="🔄 Merging JSON files"):
        try:
            record = load_json_file(str(file))
            if "input" not in record or "output" not in record:
                print(f"⚠️ Skipped {file.name}: missing 'input' or 'output' field")
                continue
            save_jsonl_line(record, str(target))
            total_written += 1

        except Exception as e:
            print(f"❌ Error reading {file.name}: {e}")

    print(f"\n✅ Successfully merged {total_written} files into {output_file}")


In [12]:
# Merge every SFT record in the fine-tune "record" folder into a single
# training_data.jsonl file in the fine-tune "test-data" folder.
merge_sft_json_to_jsonl_notebook(
    input_dir=Path(Config.JSON_OUTPUT_FINE_TUNE_RECORD), # "json_outputs_all_data/fine-tune/record",
    output_file=os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "training_data.jsonl"), #"json_outputs_all_data/fine-tune/training_data.jsonl"
)


🔄 Merging JSON files: 100%|██████████| 29994/29994 [00:07<00:00, 3840.52it/s]


✅ Successfully merged 29994 files into json_outputs_all_data/fine-tune/test-data/training_data.jsonl





##  Split Merged .jsonl into 80/20 Train/Eval

In [15]:
import re
import json
import random
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

def extract_match_quality(input_text: str) -> str:
    """Read the label inside the first ``<MATCH_QUALITY>...</MATCH_QUALITY>`` tag.

    Returns:
        The tag content, stripped and lower-cased, or ``"unknown"`` when the
        text contains no such tag on a single line.
    """
    found = re.search(r"<MATCH_QUALITY>(.*?)</MATCH_QUALITY>", input_text)
    if found is None:
        return "unknown"
    return found.group(1).strip().lower()

def stratified_split_jsonl(input_jsonl: str, train_path: str, eval_path: str, split_ratio: float = 0.8, seed: int = 42) -> None:
    """
    Split a JSONL file into train/eval files, stratified by the
    <MATCH_QUALITY> tag found in each record's 'input' field.

    Records are grouped by the label returned from extract_match_quality().
    Each group is shuffled and cut at int(len(group) * split_ratio); the first
    part goes to the train file, the remainder to the eval file. Groups are
    written one after another in the order their label first appears in the
    input, so the output files are not shuffled across labels.

    Args:
        input_jsonl: Path of the JSONL file to split.
        train_path: Destination path of the train split.
        eval_path: Destination path of the eval split.
        split_ratio: Fraction of every group assigned to the train split
            (0.8 gives an 80/20 split). Must lie within [0, 1].
        seed: Seed used for shuffling.

    Raises:
        FileNotFoundError: If `input_jsonl` does not exist.
        ValueError: If `split_ratio` is outside [0, 1].
    """
    input_path = Path(input_jsonl)
    if not input_path.exists():
        raise FileNotFoundError(f"❌ File not found: {input_jsonl}")
    if not 0.0 <= split_ratio <= 1.0:
        # A ratio outside [0, 1] would yield nonsensical slices and negative counts.
        raise ValueError(f"❌ split_ratio must be between 0 and 1, got {split_ratio}")

    stratified_data = defaultdict(list)

    # Group records by match quality
    with open(input_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="📂 Reading and grouping records"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
                match_quality = extract_match_quality(record["input"])
                # Keep the raw line so records are written back unchanged.
                stratified_data[match_quality].append(line)
            except Exception as e:
                # Best effort: malformed lines are reported and left out of both splits.
                print(f"⚠️ Failed to process line: {e}")

    # Split each group and collect
    train_records, eval_records = [], []
    # NOTE: this reseeds the module-level `random` generator, so random calls
    # made after this function returns are affected by `seed` as well.
    random.seed(seed)

    for label, records in stratified_data.items():
        random.shuffle(records)
        split_index = int(len(records) * split_ratio)
        train_records.extend(records[:split_index])
        eval_records.extend(records[split_index:])
        print(f"🔹 {label}: {len(records)} → train={split_index}, eval={len(records) - split_index}")

    # Save outputs
    for out_path, out_records in ((train_path, train_records), (eval_path, eval_records)):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f_out:
            # One record per line; an empty split yields an empty file rather
            # than a single blank line, which is not a valid JSONL record.
            f_out.writelines(f"{out_record}\n" for out_record in out_records)

    print(f"\n✅ Stratified split complete: {len(train_records)} train / {len(eval_records)} eval")


In [16]:
# Split the merged JSONL into train/eval files using the function defaults
# (split_ratio=0.8, seed=42).
# NOTE(review): in the recorded output below every record was grouped under
# "unknown", i.e. no <MATCH_QUALITY> tag was found in any 'input' field, so
# this run was effectively a plain shuffled split rather than a stratified one.
stratified_split_jsonl(
    input_jsonl=os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "training_data.jsonl"), # merged file; "json_outputs_all_data/fine-tune/test-data/training_data.jsonl" in the recorded run
    train_path=os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "train.jsonl"), # train split, written next to the merged file
    eval_path=os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "eval.jsonl")  # eval split, written next to the merged file
)



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
📂 Reading and grouping records: 29994it [00:02, 13817.53it/s]


🔹 unknown: 29994 → train=23995, eval=5999

✅ Stratified split complete: 23995 train / 5999 eval


# Create a local debugging dataset of 100 records (80 train / 20 eval)

In [None]:
# 📦 Setup
import os
import random
from pathlib import Path
from collections import defaultdict
from typing import Any
import json

# === CONFIG ===
# Directory whose *.json record files are bucketed and sampled by the cells below.
input_dir = Path(Config.JSON_OUTPUT_FINE_TUNE_RECORD)  # e.g. Path("json_outputs_all_data/fine-tune/record") per the original note; confirm against Config
# JSONL outputs of the small local train/eval sample.
train_file =os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "train_local.jsonl") 
eval_file = os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "eval_local.jsonl")  
# Total number of records to sample across both splits.
sample_count = 100


In [None]:
# Reset output files so the sampling cell starts from empty train/eval outputs
for out_file in (train_file, eval_file):
    if os.path.exists(out_file):
        os.remove(out_file)

# Collect labeled files.
# The label is the third "_"-separated token of the file stem (lower-cased);
# files with fewer tokens or any other label are ignored.
KNOWN_LABELS = ("strong", "medium", "weak")
buckets = defaultdict(list)
# sorted(): glob order depends on the filesystem; a fixed order keeps the
# bucket contents (and any seeded sampling from them) reproducible.
for json_path in sorted(input_dir.glob("*.json")):
    parts = json_path.stem.split("_")
    if len(parts) >= 3:
        label = parts[2].lower()
        if label in KNOWN_LABELS:
            buckets[label].append(json_path)

# Show counts for every expected label, including empty buckets, so a label
# with no files is visible here instead of silently producing no samples.
for label in KNOWN_LABELS:
    print(f"{label}: {len(buckets.get(label, []))} files")


In [None]:
# 🎯 Stratified Sampling: Split into Train and Eval
# Share of the sample drawn from each match-quality bucket.
target_ratios = {"strong": 0.3, "medium": 0.5, "weak": 0.2}
train_count = int(sample_count * 0.8)
eval_count = sample_count - train_count

split_ratio = train_count / sample_count  # 0.8

# Dedicated seeded generator: the drawn sample no longer depends on whatever
# state earlier cells left in the global `random` module.
sample_seed = 42
sample_rng = random.Random(sample_seed)

# The records are appended to the outputs, so start from empty files; this
# keeps the cell idempotent when it is re-run on its own.
for out_file in (train_file, eval_file):
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    if os.path.exists(out_file):
        os.remove(out_file)


def _append_samples(paths, out_file, split_name):
    """
    Append each usable record among `paths` to the JSONL file `out_file`.

    A record is written only when it has both an 'input' and an 'output'
    field; other files are reported and skipped, as are files that fail to
    load. `split_name` ("Train" / "Eval") only labels the error message.

    Returns:
        Number of records written.
    """
    written = 0
    for path in paths:
        try:
            data = load_json_file(str(path))
            if "input" in data and "output" in data:
                save_jsonl_line(data, out_file)
                written += 1
            else:
                print(f"⚠️ Skipped {path.name}: missing 'input' or 'output' field")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole sample.
            print(f"⚠️ {split_name} error in {path.name}: {e}")
    return written


train_written = 0
eval_written = 0

for label, label_ratio in target_ratios.items():
    n_total = int(sample_count * label_ratio)

    # Sorted so the seeded draw is reproducible whatever order the bucket was
    # filled in; .get() avoids inserting empty buckets into the defaultdict.
    candidates = sorted(buckets.get(label, []))
    selected = sample_rng.sample(candidates, min(n_total, len(candidates)))
    if len(selected) < n_total:
        print(f"⚠️ {label}: only {len(selected)} of {n_total} requested files available")

    # Split what was actually drawn, so a short bucket still feeds both files
    # instead of sending everything to train.
    n_train = int(len(selected) * split_ratio)
    n_eval = len(selected) - n_train
    train_samples = selected[:n_train]
    eval_samples = selected[n_train:]

    train_written += _append_samples(train_samples, train_file, "Train")
    eval_written += _append_samples(eval_samples, eval_file, "Eval")

print(f"✅ Wrote {train_written} examples to {train_file}")
print(f"✅ Wrote {eval_written} examples to {eval_file}")
