# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [2]:
import subprocess

def is_running_in_colab():
    """Report whether this code is executing inside Google Colab.

    Detection works by attempting to import ``google.colab``.

    Returns:
        bool: True if the import succeeds, False if it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        in_colab = False
    else:
        in_colab = True
    return in_colab

def get_available_gpu_memory_gb():
    """Query ``nvidia-smi`` for the free memory of the first listed GPU.

    The first line of the ``memory.free`` query output is parsed as an integer
    (the code treats it as megabytes) and divided by 1024.

    Returns:
        float: The converted value, or 0.0 if running the command or parsing
        its output fails for any reason.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        raw = subprocess.check_output(query, encoding="utf-8")
        first_line = raw.strip().split("\n")[0]  # only the first GPU is considered
        return int(first_line) / 1024
    except Exception:
        # Best effort: a missing binary, a non-zero exit, or unparsable output
        # are all reported as "no memory available".
        return 0.0


## Install dependencies

In [None]:
if is_running_in_colab():
    # Install the required packages
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install regex json5
    !pip install sentence-transformers scikit-learn
    !pip install rapidfuzz unidecode

else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    %pip install -U bitsandbytes
    %pip install regex json5
    %pip install sentence-transformers scikit-learn
    %pip install rapidfuzz unidecode


## Log in to Hugging Face

In [3]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. the Colab secret named "HF_TOKEN" (Colab only),
#   3. a hidden interactive prompt (non-Colab only).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                # Not caught below (only ImportError is), so this aborts the cell.
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            # HF_TOKEN stays unset here, so login() below receives token=None.
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass hides the typed value, so the secret is not echoed into the
        # cell output (and therefore not stored in the saved notebook).
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [4]:
import shutil

def setup_kaggle_credentials():
    """Ensure ``~/.kaggle/kaggle.json`` exists, uploading it through Colab if needed.

    If the file is already present nothing is changed. Otherwise the Colab
    upload dialog is opened, the uploaded file is moved to
    ``~/.kaggle/kaggle.json`` and its mode is set to ``0o600``.

    Raises:
        ImportError: The credentials file is missing and ``google.colab`` cannot
            be imported, so there is no upload dialog to fall back on.
        FileNotFoundError: The upload dialog returned no file.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError as exc:
        # Same exception type as before, but with an actionable message.
        raise ImportError(
            f"Kaggle credentials not found at {kaggle_path} and the Colab upload "
            "helper is unavailable; place kaggle.json there manually."
        ) from exc

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; Kaggle credentials were not set up.")

    # Move exactly one file into place: prefer the one literally named
    # kaggle.json, otherwise the last upload (the file that previously ended up
    # at the destination when several were uploaded).
    uploaded_names = list(uploaded.keys())
    source_name = "kaggle.json" if "kaggle.json" in uploaded_names else uploaded_names[-1]

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(source_name, kaggle_path)
    os.chmod(kaggle_path, 0o600)  # owner read/write only
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Run the Kaggle credential check when this cell executes.
setup_kaggle_credentials()

✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json


## Load Qwen-Instruct with fallback to 4-bit quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str):
    """Load a causal LM and wrap it in a ``text-generation`` pipeline.

    When CUDA is available and GPU 0 has less than 24 GB of *total* memory the
    model is loaded with 4-bit NF4 quantization; otherwise it is loaded in
    float16. Without CUDA the model is placed on the CPU.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        hf_token: Hugging Face access token passed to ``from_pretrained``.

    Returns:
        The object returned by ``pipeline("text-generation", ...)`` with
        ``batch_size=1``.
    """
    has_cuda = torch.cuda.is_available()
    # This is the card's total memory, not the currently free amount.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {total_mem_gb:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"  # {"": 0} puts the whole model on GPU 0
    use_4bit = has_cuda and total_mem_gb < 24

    # Quantization config is only built when 4-bit loading is actually wanted.
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Only fall back to EOS for padding when the tokenizer defines no pad token;
    # a pad token shipped with the tokenizer is left untouched.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        # dtype is left unset when quantizing, float16 otherwise
        torch_dtype=None if use_4bit else torch.float16,
        trust_remote_code=True,
        token=hf_token
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the notebook-wide text-generation pipeline for Qwen2-7B-Instruct,
# authenticating with the HF_TOKEN resolved in the login cell.
llm_pipeline = load_model_pipeline(
    model_name="Qwen/Qwen2-7B-Instruct",
    hf_token=HF_TOKEN
)

# Global utilities

### Utility to save json to a folder

In [5]:
import json
import os
# 📦 Save JSON Output with Safety
def save_json_output(data, output_path: str, indent: int = 4, overwrite: bool = True):
    """Serialize ``data`` as UTF-8 JSON to ``output_path``.

    Missing parent directories are created. Non-ASCII characters are written
    as-is (``ensure_ascii=False``).

    Args:
        data: JSON-serializable object to write.
        output_path: Destination file; may be a bare filename.
        indent: Indentation passed to ``json.dump``.
        overwrite: When False, refuse to replace an existing file.

    Raises:
        FileExistsError: ``output_path`` exists and ``overwrite`` is False.
    """
    output_dir = os.path.dirname(output_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(output_path) and not overwrite:
        raise FileExistsError(f"File {output_path} already exists and overwrite=False.")

    # Mode "w" truncates an existing file, so no explicit removal is needed.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)

    print(f"✅ Saved output to {output_path}")


### Utility to load file

In [6]:
from typing import Any
import json

# 📂 Load normalized JSON data
def load_json_file(file_path: str) -> Any:
    """Read and parse a UTF-8 JSON file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        return parsed
    raise FileNotFoundError(f"File not found: {file_path}")

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Output directory paths used across the notebook.

    All paths are relative, '/'-separated strings rooted at ``JSON_OUTPUT_DIR``.
    """

    # Root folder for every JSON artefact.
    JSON_OUTPUT_DIR = "json_outputs_all_data"

    # Normalized data, with one sub-folder for JDs and one for resumes.
    JSON_OUTPUT_NORMALIZED_DIR = f"{JSON_OUTPUT_DIR}/normalized"
    JSON_OUTPUT_NORMALIZED_JD = f"{JSON_OUTPUT_NORMALIZED_DIR}/jd"
    JSON_OUTPUT_NORMALIZED_RESUME = f"{JSON_OUTPUT_NORMALIZED_DIR}/resume"

    # Scoring outputs, with sub-folders for splits and fine-tuning data.
    JSON_OUTPUT_SCORING_DIR = f"{JSON_OUTPUT_DIR}/scoring"
    JSON_OUTPUT_SCORING_SPLIT_DIR = f"{JSON_OUTPUT_SCORING_DIR}/split"
    JSON_OUTPUT_SCORING_FT_DATA = f"{JSON_OUTPUT_SCORING_DIR}/FT_data"
  


# Generate 30K core sample semantic data file

In [8]:
from typing import List, Dict, Tuple, Set
from collections import defaultdict, Counter
import random

# === 1. FILTERING ===
def filter_pairs_by_score(pairs: List[Dict], threshold: float) -> List[Dict]:
    """Keep the pairs whose ``resume_jd_similarity`` is at least ``threshold``.

    A pair without that key is treated as having a similarity of 0.0.
    Input order is preserved.
    """
    kept = []
    for pair in pairs:
        if pair.get("resume_jd_similarity", 0.0) >= threshold:
            kept.append(pair)
    return kept


# === 2. GROUPING ===
def group_pairs_by_quality_and_resume(pairs: List[Dict]) -> Tuple[
    Dict[str, List[Dict]],
    Dict[int, Dict[str, List[Dict]]],
    Dict[str, List[Dict]]
]:
    """Index pairs by match label, by resume, and by domain.

    Labels and domains are lower-cased; a missing label or domain becomes
    ``"unknown"``. Every pair must carry a ``resume_id``.

    Returns:
        A 3-tuple of ``defaultdict`` objects:
        * label -> pairs with that ``semantic_match_label``
        * resume_id -> label -> pairs
        * domain -> pairs, where each pair is appended once under its
          ``resume_domain`` and once under its ``jd_domain``
    """
    by_quality = defaultdict(list)
    by_resume = defaultdict(lambda: defaultdict(list))
    by_domain = defaultdict(list)

    for pair in pairs:
        label = pair.get("semantic_match_label", "unknown").lower()
        resume_id = pair["resume_id"]
        domains = (
            pair.get("resume_domain", "unknown").lower(),
            pair.get("jd_domain", "unknown").lower(),
        )

        by_quality[label].append(pair)
        by_resume[resume_id][label].append(pair)
        for domain in domains:
            by_domain[domain].append(pair)

    return by_quality, by_resume, by_domain


# === 3. RESUME-BALANCED SAMPLING ===
def resume_balanced_sampling(
    resume_to_qualities: Dict[int, Dict[str, List[Dict]]],
    target_qualities: List[str],
    seen_pairs: Set[Tuple[int, int]]
) -> Tuple[Dict[str, List[Dict]], int]:
    """Pick at most one random pair per (resume, quality) combination.

    For every resume and every quality in ``target_qualities`` that the resume
    has candidates for, one candidate is drawn with ``random.choice``. The draw
    is kept only if its ``(resume_id, jd_id)`` key is not already in
    ``seen_pairs``; a rejected draw is not retried.

    Args:
        resume_to_qualities: resume_id -> quality -> candidate pairs.
        target_qualities: Qualities to sample, in the order they are tried.
        seen_pairs: Keys already selected; updated in place with new picks.

    Returns:
        ``(selected, resume_count)`` where ``selected`` maps quality -> picked
        pairs (a ``defaultdict(list)``) and ``resume_count`` is the number of
        resumes that contributed at least one pair.
    """
    selected = defaultdict(list)
    resume_count = 0

    for resume_id, qmap in resume_to_qualities.items():
        used = 0
        for q in target_qualities:
            # .get avoids creating keys on a defaultdict; an empty candidate
            # list is skipped because random.choice([]) would raise.
            candidates = qmap.get(q)
            if not candidates:
                continue
            pair = random.choice(candidates)
            key = (pair["resume_id"], pair["jd_id"])
            if key not in seen_pairs:
                selected[q].append(pair)
                seen_pairs.add(key)
                used += 1
        if used > 0:
            resume_count += 1

    return selected, resume_count


# === 4. DOMAIN QUOTA ENFORCEMENT ===
def enforce_domain_quota(
    quality_buckets: Dict[str, List[Dict]],
    seen_pairs: Set[Tuple[int, int]],
    target_counts: Dict[str, int],
    current_counts: Dict[str, int],
    min_per_domain: int
) -> Dict[str, List[Dict]]:
    """Sample up to ``min_per_domain`` unseen pairs per (quality, domain) group.

    Pairs are grouped under both their ``resume_domain`` and ``jd_domain``
    (lower-cased, ``"unknown"`` when missing). Groups are visited in insertion
    order and sampling for a quality stops once its target is reached.

    Args:
        quality_buckets: quality -> candidate pairs. Qualities that are not in
            ``target_counts`` are ignored.
        seen_pairs: ``(resume_id, jd_id)`` keys already selected; updated in place.
        target_counts: quality -> total number of pairs wanted.
        current_counts: quality -> number already selected; updated in place.
            A quality missing from this mapping counts as 0.
        min_per_domain: Maximum number of pairs taken from one group.

    Returns:
        ``defaultdict(list)`` mapping quality -> newly selected pairs.
    """
    selected = defaultdict(list)
    domain_pair_cache = defaultdict(list)

    for q, bucket in quality_buckets.items():
        if q not in target_counts:
            # No quota is defined for this label, so there is nothing to fill.
            continue
        for p in bucket:
            d1 = p.get("resume_domain", "unknown").lower()
            d2 = p.get("jd_domain", "unknown").lower()
            domain_pair_cache[(q, d1)].append(p)
            if d2 != d1:
                # Listing a pair twice in one group would let random.sample
                # return it twice and waste a slot.
                domain_pair_cache[(q, d2)].append(p)

    for (q, domain), group in domain_pair_cache.items():
        filled = current_counts.get(q, 0)
        if filled >= target_counts[q]:
            continue  # already filled this quality

        candidates = [p for p in group if (p["resume_id"], p["jd_id"]) not in seen_pairs]
        room_left = target_counts[q] - filled
        take_count = min(min_per_domain, len(candidates), room_left)
        sampled = random.sample(candidates, take_count)

        for p in sampled:
            key = (p["resume_id"], p["jd_id"])
            if key not in seen_pairs:
                selected[q].append(p)
                seen_pairs.add(key)
                current_counts[q] = current_counts.get(q, 0) + 1
                if current_counts[q] >= target_counts[q]:
                    break  # stop sampling more of this quality

    return selected



# === 5. FILL REMAINING WITH DIVERSITY ===
def fill_remaining_by_diverse_domains(
    quality_buckets: Dict[str, List[Dict]],
    selected_by_quality: Dict[str, List[Dict]],
    target_counts: Dict[str, int],
    seen_pairs: Set[Tuple[int, int]]
) -> Dict[str, List[Dict]]:
    """Top up each quality to its target by cycling through domains.

    For every quality still short of its target, the unseen pairs are grouped
    by domain (each pair under both its resume and JD domain). Domains are
    shuffled and then visited round-robin, taking one random unseen pair per
    visit; a domain with nothing left is dropped from the rotation.

    Args:
        quality_buckets: quality -> all candidate pairs.
        selected_by_quality: quality -> pairs already chosen; extended in place.
        target_counts: quality -> number of pairs wanted.
        seen_pairs: ``(resume_id, jd_id)`` keys already chosen; updated in place.

    Returns:
        The same ``selected_by_quality`` object that was passed in.
    """
    def _key(pair):
        return (pair["resume_id"], pair["jd_id"])

    for quality in target_counts:
        shortfall = target_counts[quality] - len(selected_by_quality[quality])
        if shortfall <= 0:
            continue

        unseen_pairs = [pair for pair in quality_buckets[quality] if _key(pair) not in seen_pairs]

        # Group the unseen pairs under both of their domains.
        by_domain = defaultdict(list)
        for pair in unseen_pairs:
            for field in ("resume_domain", "jd_domain"):
                by_domain[pair.get(field, "unknown").lower()].append(pair)

        rotation = list(by_domain)
        random.shuffle(rotation)

        picks = []
        while shortfall > 0 and rotation:
            domain = rotation.pop(0)
            # Re-filter on every visit: a pair may have been taken via its other domain.
            still_free = [pair for pair in by_domain[domain] if _key(pair) not in seen_pairs]
            if still_free:
                pick = random.choice(still_free)
                picks.append(pick)
                seen_pairs.add(_key(pick))
                by_domain[domain].remove(pick)
                rotation.append(domain)  # back of the queue for the next round
                shortfall -= 1
            # An exhausted domain is simply not re-queued.

        selected_by_quality[quality].extend(picks)

    return selected_by_quality


In [10]:
# === 6. MAIN WRAPPER ===
def generate_balanced_sample(
    all_pairs: List[Dict],
    target_counts: Dict[str, int],
    score_threshold: float = 0.1,
    min_per_domain: int = 50
) -> List[Dict]:
    """Run the full sampling pipeline and return the selected pairs.

    Stages, in order: score filter, grouping, resume-balanced sampling,
    per-domain quota sampling, and a domain-diverse fill of whatever is still
    missing. Progress and summary statistics are printed along the way.

    Args:
        all_pairs: Candidate resume/JD pairs.
        target_counts: quality label -> number of pairs wanted.
        score_threshold: Minimum ``resume_jd_similarity`` to keep a pair.
        min_per_domain: Per-group cap used by the domain quota stage.

    Returns:
        The selected pairs, concatenated quality by quality in the key order
        of ``target_counts``.
    """
    print(f"🔍 Total pairs in input: {len(all_pairs)}")
    eligible = filter_pairs_by_score(all_pairs, score_threshold)
    print(f"✅ After score ≥ {score_threshold}: {len(eligible)}")

    # The third grouping (domain -> pairs) is not used by this function.
    quality_buckets, resume_to_qualities, _domain_coverage = group_pairs_by_quality_and_resume(eligible)
    print("📊 Match quality counts:")
    for quality in target_counts:
        print(f"  {quality.upper():<7}: {len(quality_buckets[quality])}")
    print(f"👥 Unique resumes: {len(resume_to_qualities)}")

    # Shared across all stages so no (resume_id, jd_id) pair is picked twice.
    seen_pairs = set()

    # Stage 1: resume-balanced sampling
    selected_by_quality, resume_count = resume_balanced_sampling(
        resume_to_qualities,
        list(target_counts.keys()),
        seen_pairs,
    )
    print(f"👤 Resume-balanced resumes: {resume_count}")

    # Stage 2: domain quotas, starting from what stage 1 already selected
    current_counts = {quality: len(selected_by_quality[quality]) for quality in target_counts}
    quota_picks = enforce_domain_quota(
        quality_buckets,
        seen_pairs,
        target_counts,
        current_counts,
        min_per_domain,
    )
    for quality in target_counts:
        selected_by_quality[quality].extend(quota_picks.get(quality, []))

    # Stage 3: fill whatever is still missing
    selected_by_quality = fill_remaining_by_diverse_domains(
        quality_buckets, selected_by_quality, target_counts, seen_pairs
    )

    # Merge the per-quality selections into one list
    final_sample = []
    print("\n📦 Final sampled count:")
    for quality in target_counts:
        chosen = selected_by_quality[quality]
        print(f"  {quality.upper():<7}: {len(chosen)}")
        final_sample.extend(chosen)

    print(f"\n🎯 Total selected: {len(final_sample)}")

    # Domain coverage report: both tallies are built before anything is printed.
    coverage = [
        ("\n📊 Resume domain coverage (top 10):",
         Counter([p.get("resume_domain", "unknown").lower() for p in final_sample])),
        ("\n📊 JD domain coverage (top 10):",
         Counter([p.get("jd_domain", "unknown").lower() for p in final_sample])),
    ]
    for header, tally in coverage:
        print(header)
        for dom, count in tally.most_common(10):
            print(f"  {dom:<30} {count}")

    return final_sample

In [11]:
# 🎯 Define how many samples per match type.
# These targets add up to 60,000 pairs (20,000 + 25,000 + 15,000).
target_counts = {
    "strong": 20000,
    "medium": 25000,
    "weak": 15000
}

# 📥 Load the scored pairs; only the "semantic_relevance_scores" list is used
# (an empty list if that key is absent).
input_path = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'semantic_relevance_scores.json')
full_data  = load_json_file(input_path)
relevance_data = full_data.get("semantic_relevance_scores", [])

# 🧠 Generate the balanced subset.
# score_threshold=0.2 overrides the function default of 0.1; min_per_domain
# keeps its default.
sampled = generate_balanced_sample(
    all_pairs=relevance_data,
    target_counts=target_counts,
    score_threshold=0.2
)

# Wrap the list under the same top-level key the input file uses
relevance_wrapped = {
    "semantic_relevance_scores": sampled
}

# 💾 Save the sampled subset next to the input file
relevance_map_file = os.path.join(Config.JSON_OUTPUT_SCORING_DIR, 'relevant_pairs.json')
save_json_output(relevance_wrapped, relevance_map_file)


🔍 Total pairs in input: 2463477
✅ After score ≥ 0.2: 2463477
📊 Match quality counts:
  STRONG : 15108
  MEDIUM : 998172
  WEAK   : 1450197
👥 Unique resumes: 2079
👤 Resume-balanced resumes: 2079

📦 Final sampled count:
  STRONG : 14993
  MEDIUM : 25000
  WEAK   : 15000

🎯 Total selected: 54993

📊 Resume domain coverage (top 10):
  chef                           5943
  accountant                     5854
  sales                          5482
  engineering                    5401
  finance                        4787
  construction                   3686
  business-development           3322
  banking                        2632
  information-technology         2476
  consultant                     1929

📊 JD domain coverage (top 10):
  finance                        7052
  food_service                   2497
  retail                         2097
  construction                   1631
  manufacturing                  1087
  education                      1077
  business                    

## Generate a normally distributed 30K-sample file from the scoring files

In [None]:
import os
import re
import json
import random
from collections import defaultdict, Counter

# Folder that is scanned for per-pair scoring JSON files.
SCORING_FOLDER = Config.JSON_OUTPUT_SCORING_DIR  # path to scoring files
# Destination of the selected filename list (inside the FT_data folder).
OUTPUT_FILE = os.path.join(Config.JSON_OUTPUT_SCORING_FT_DATA, "fine_tuning_filenames.json")  # JSON_OUTPUT_SCORING_FT_DATA
# Number of files wanted per match quality; the three targets total 30,000.
TARGET_COUNTS = {"strong": 10000, "medium": 15000, "weak": 5000}
# Quality labels, in the key order of TARGET_COUNTS.
QUALITY_BUCKETS = list(TARGET_COUNTS.keys())

def parse_filename(fname):
    """Extract metadata from a scoring filename.

    Expected shape: ``<resume_id>_<jd_id>_<quality>_<tag>_<score>.json`` where
    the ids are digits, quality is ``strong``/``medium``/``weak``, the tag
    contains no underscore, and the score consists of digits and dots.

    Returns:
        A dict with ``filename``, ``resume_id`` and ``jd_id`` (strings),
        ``quality`` and ``score`` (float), or ``None`` when the name does not
        match the pattern.
    """
    parsed = re.match(r"^(\d+)_(\d+)_(strong|medium|weak)_[^_]+_([\d.]+)\.json$", fname)
    if parsed is None:
        return None
    return {
        "filename": fname,
        "resume_id": parsed.group(1),
        "jd_id": parsed.group(2),
        "quality": parsed.group(3),
        "score": float(parsed.group(4)),
    }

def load_scoring_files(folder):
    """Collect metadata for every parsable scoring file in ``folder``.

    Only ``.json`` files whose name is accepted by ``parse_filename`` are
    considered. Each file is opened and three fields are copied from its JSON
    content into the filename metadata: ``domain`` (default ``"UNKNOWN"``),
    and the lower-cased ``match_quality`` and ``semantic_match_label``
    (default ``""``). Files that cannot be read or processed are reported and
    skipped.

    Returns:
        list[dict]: One metadata dict per successfully loaded file.
    """
    records = []
    for entry in os.listdir(folder):
        meta = parse_filename(entry) if entry.endswith(".json") else None
        if not meta:
            continue
        try:
            with open(os.path.join(folder, entry), "r", encoding="utf-8") as handle:
                payload = json.load(handle)
            extra = {
                "domain": payload.get("domain", "UNKNOWN"),
                "match_quality": payload.get("match_quality", "").lower(),
                "semantic_match_label": payload.get("semantic_match_label", "").lower(),
            }
            meta.update(extra)
            records.append(meta)
        except Exception as e:
            # Best effort: one bad file must not abort the whole scan.
            print(f"⚠️ Skipping {entry}: {e}")
    return records

def group_by_label(meta_list):
    """Split file metadata into the pools used for selection.

    Returns:
        dict with four lists (input order preserved):
        * ``strong`` / ``medium`` / ``weak``: entries whose ``match_quality``
          equals that label
        * ``strong_semantic``: entries whose ``semantic_match_label`` is
          ``"strong"`` while their ``match_quality`` is not
    """
    def _where(predicate):
        return [m for m in meta_list if predicate(m)]

    def _quality_is(label):
        return lambda m: m["match_quality"] == label

    def _semantic_only_strong(m):
        return m["semantic_match_label"] == "strong" and m["match_quality"] != "strong"

    return {
        "strong": _where(_quality_is("strong")),
        "strong_semantic": _where(_semantic_only_strong),
        "medium": _where(_quality_is("medium")),
        "weak": _where(_quality_is("weak")),
    }

def select_by_domain_distribution(meta_list, limit):
    """Select up to ``limit`` entries, spreading them evenly across domains.

    Each domain first contributes up to ``limit // number_of_domains`` randomly
    chosen entries (at least one). If that leaves the selection short, the gap
    is filled with random entries that were not picked yet, regardless of
    domain.

    Args:
        meta_list: Metadata dicts carrying ``domain`` and ``filename`` keys.
        limit: Maximum number of entries to return.

    Returns:
        list: At most ``limit`` entries; empty when ``meta_list`` is empty or
        ``limit`` is not positive.
    """
    # Guard first: an empty pool would make the per-domain share divide by zero.
    if limit <= 0 or not meta_list:
        return []

    domain_buckets = defaultdict(list)
    for m in meta_list:
        domain_buckets[m["domain"]].append(m)

    # Equal share per domain, never less than one
    per_domain = max(1, limit // len(domain_buckets))
    selected = []
    for bucket in domain_buckets.values():
        random.shuffle(bucket)
        selected.extend(bucket[:per_domain])

    if len(selected) < limit:
        # Build the lookup set once instead of once per candidate.
        already_chosen = {m["filename"] for m in selected}
        remaining = [m for m in meta_list if m["filename"] not in already_chosen]
        random.shuffle(remaining)
        selected.extend(remaining[:limit - len(selected)])
    return selected[:limit]

def log_distribution(selected_list, label):
    """Print how many entries were selected for ``label``, broken down by domain.

    Domains are listed from most to least frequent.
    """
    total = len(selected_list)
    domain_tally = Counter(entry["domain"] for entry in selected_list)
    print(f"\n📊 Final count for {label}: {total}")
    for domain, count in domain_tally.most_common():
        print(f"  {domain}: {count}")

def main():
    """Select scoring files per match quality and save their filenames as JSON.

    Steps:
      1. Strong: domain-balanced sample of the ``strong`` pool; if that pool is
         smaller than the target, all of it is used and the shortfall is taken
         from the ``strong_semantic`` pool.
      2. Medium: domain-balanced sample, excluding files already selected.
      3. Weak: domain-balanced sample, excluding files already selected.

    The resulting filename list is written to ``OUTPUT_FILE``.
    """
    def _pick(pool, limit):
        # An empty pool has no domains to balance across; skip the selector.
        return select_by_domain_distribution(pool, limit) if pool else []

    all_meta = load_scoring_files(SCORING_FOLDER)
    grouped = group_by_label(all_meta)
    final_selected = []

    # Step 1: Strong group
    strong_actual = grouped["strong"]
    if len(strong_actual) >= TARGET_COUNTS["strong"]:
        strong_selected = _pick(strong_actual, TARGET_COUNTS["strong"])
    else:
        shortfall = TARGET_COUNTS["strong"] - len(strong_actual)
        fallback_selected = _pick(grouped["strong_semantic"], shortfall)
        strong_selected = strong_actual + fallback_selected
    log_distribution(strong_selected, "strong")
    final_selected.extend(strong_selected)
    selected_filenames = {m["filename"] for m in final_selected}

    # Step 2: Medium group (exclude used -- fallback picks may also be "medium")
    medium_pool = [m for m in grouped["medium"] if m["filename"] not in selected_filenames]
    medium_selected = _pick(medium_pool, TARGET_COUNTS["medium"])
    log_distribution(medium_selected, "medium")
    final_selected.extend(medium_selected)
    selected_filenames.update(m["filename"] for m in medium_selected)

    # Step 3: Weak group (exclude used)
    weak_pool = [m for m in grouped["weak"] if m["filename"] not in selected_filenames]
    weak_selected = _pick(weak_pool, TARGET_COUNTS["weak"])
    log_distribution(weak_selected, "weak")
    final_selected.extend(weak_selected)

    # Save output; create the destination folder first so open() cannot fail
    # just because the folder has not been created yet.
    filenames = [m["filename"] for m in final_selected]
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(filenames, f, indent=2)
    print(f"\n✅ Saved final list to {OUTPUT_FILE}")

# Run the selection only when this code executes as the main module.
if __name__ == "__main__":
    main()
