# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [None]:
import subprocess

def is_running_in_colab():
    """Report whether the ``google.colab`` package can be imported.

    Returns:
        bool: True when ``import google.colab`` succeeds, False when it
        raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only as an availability probe
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory reported by ``nvidia-smi`` for the first GPU, in GB.

    Returns:
        float: the integer on the first line of the ``memory.free`` query
        (treated as MB) divided by 1024, or 0.0 when the command or the
        parsing fails for any reason.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=memory.free",
        "--format=csv,nounits,noheader",
    ]
    try:
        raw = subprocess.check_output(query, encoding="utf-8")
        first_line = raw.strip().split("\n")[0]
        return int(first_line) / 1024
    except Exception:
        # Best-effort probe: any failure is reported as "no free GPU memory".
        return 0.0


## Install dependencies

In [None]:
# Install the packages used by the rest of the notebook. The two branches
# differ in how bitsandbytes and torch are installed.
if is_running_in_colab():
    # Colab branch: no torch install here; bitsandbytes is installed
    # together with the other libraries.
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
else:
    # Non-Colab branch: %pip is used instead of !pip.
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Earlier torch install variants (cu121 index), kept commented out for reference:
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Currently active: pre-release torch from the nightly cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # bitsandbytes is installed last, upgraded to the newest release.
    %pip install -U bitsandbytes


## Log in to Hugging Face

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. on Colab, the 'HF_TOKEN' entry in Colab secrets,
#   3. otherwise, a hidden interactive prompt.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # getpass reads the token without echoing it, so the secret does not
        # end up in the saved cell output the way it does with input().
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# HF_TOKEN can still be None here (the ImportError branch above); login() is
# then called with token=None.
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [None]:
import shutil

def setup_kaggle_credentials():
    """Ensure ``~/.kaggle/kaggle.json`` exists, uploading it on Colab if needed.

    If the file is already present nothing is changed. Otherwise the Colab
    upload widget is used to obtain it; the uploaded file is moved to
    ``~/.kaggle/kaggle.json`` and its mode is set to 0o600.

    When ``google.colab`` cannot be imported, or when nothing is uploaded, a
    warning is printed and the function returns without creating the file
    instead of raising.

    Returns:
        None
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError:
        # Outside Colab there is no upload widget; tell the user what to do
        # rather than failing the whole cell with an import error.
        print(
            f"⚠️ Kaggle credentials not found at {kaggle_path} and the Colab "
            "upload widget is unavailable. Place kaggle.json there manually."
        )
        return

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard os.chmod below would fail on a missing file.
        print("⚠️ No file was uploaded; Kaggle credentials were not set up.")
        return

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    # Every uploaded file is moved onto the same destination, so when several
    # files are uploaded the last one wins.
    for filename in uploaded:
        shutil.move(filename, kaggle_path)
    os.chmod(kaggle_path, 0o600)
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Run the Kaggle credential check as soon as this cell executes.
setup_kaggle_credentials()

## Mount Google Drive (Colab)

In [None]:
# Google Drive is only mounted on Colab; elsewhere this cell does nothing.
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')

## Load Nous-Hermes-2-Mistral-7B-DPO with Fallback to 4-bit Quantization

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

def load_model_pipeline(model_name: str, hf_token: str, *, quantize_below_gb: float = 24):
    """Load a causal language model and wrap it in a text-generation pipeline.

    With CUDA available and the first GPU's total memory below
    ``quantize_below_gb``, the model is loaded with a 4-bit NF4
    bitsandbytes configuration. Otherwise it is loaded in float16 without
    quantization (on GPU 0 when CUDA is available, on CPU when it is not).

    Args:
        model_name: model identifier passed to ``from_pretrained``.
        hf_token: Hugging Face access token passed to ``from_pretrained``.
        quantize_below_gb: total-GPU-memory threshold (GB) under which 4-bit
            loading is used. Defaults to 24, the previously hard-coded value.

    Returns:
        The ``pipeline("text-generation", ...)`` object built with batch_size=1.
    """
    has_cuda = torch.cuda.is_available()
    # total_memory is the capacity of device 0, not the memory currently free.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) if has_cuda else 0
    print(f"💻 CUDA: {has_cuda} | GPU Memory: {total_mem_gb:.2f} GB")

    device_map = {"": 0} if has_cuda else "cpu"
    use_4bit = has_cuda and total_mem_gb < quantize_below_gb

    # Quantization config is only built when 4-bit loading is selected.
    quant_config = None
    if use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
    # Fall back to EOS only when the tokenizer has no pad token of its own,
    # so a pad token shipped with the model is not overwritten.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
        # No explicit dtype when quantizing; float16 otherwise.
        torch_dtype=torch.float16 if quant_config is None else None,
        trust_remote_code=True,
        token=hf_token,
    )

    print(f"✅ Model loaded on {next(model.parameters()).device}")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=1)


In [None]:
# Build the shared text-generation pipeline once; `llm_pipeline` is the global
# that infer_domains_in_batches() calls.
llm_pipeline = load_model_pipeline(
    model_name="NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    hf_token=HF_TOKEN
)


# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Load Resume and JD datasets
2. Minimal Parsing into JSON Structure
3. Save structured JSON for Phase 2

## Util Classes and methods

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Notebook-wide settings shared by the download, export and cleanup helpers."""

    # Directory where dataset archives are downloaded and extracted.
    DATASET_DOWNLOAD_DIR: str = "datasets"
    # Directory that receives the JSON files written by the processor.
    JSON_OUTPUT_DIR: str = "json_outputs_phase1_run2"
    # When True, the processing functions delete downloaded artifacts afterwards.
    AUTO_CLEANUP: bool = True




### Downloader 

In [None]:

# ==============================
# DOWNLOADER
# ==============================
import zipfile
from pathlib import Path

class DatasetDownloader:
    """Download a Kaggle dataset archive with the ``kaggle`` CLI and unpack it."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` into the download directory and extract it.

        The download is skipped when the extraction folder already exists and
        contains at least one ``*.csv`` file (searched recursively).

        Args:
            dataset_path: Kaggle dataset reference such as ``owner/slug``; the
                part after the last ``/`` names the zip file and the folder.

        Returns:
            ``(extract_folder_path, zip_filename)``.

        Raises:
            FileNotFoundError: if the expected zip file is not present after
                the download attempt (also raised by ``subprocess`` when the
                ``kaggle`` executable cannot be found).
        """
        # Local import keeps this cell runnable on its own.
        import subprocess

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) replaces the IPython-only `!kaggle ...`
        # magic: valid Python everywhere and no shell quoting of dataset_path.
        # check=False mirrors the magic's behaviour; a failed download is
        # detected by the missing-zip check below.
        completed = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR],
            capture_output=True,
            text=True,
            check=False,
        )
        # Relay the CLI output so it still shows up in the notebook cell.
        for stream in (completed.stdout, completed.stderr):
            if stream:
                print(stream.strip())

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [None]:

# ==============================
# LOADER
# ==============================
import pandas as pd


class DatasetLoader:
    """Locate a CSV file inside an extracted dataset folder and read it."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Read the first file under ``dataset_folder`` named ``target_csv_name``.

        The folder is walked recursively and file names are compared
        case-insensitively.

        Raises:
            FileNotFoundError: if the folder does not exist or no file with
                that name is found.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        wanted = target_csv_name.lower()
        for current_dir, _subdirs, filenames in os.walk(dataset_folder):
            hit = next((name for name in filenames if name.lower() == wanted), None)
            if hit is None:
                continue
            frame = pd.read_csv(os.path.join(current_dir, hit))
            print(f"✅ Loaded CSV with shape {frame.shape}")
            return frame

        raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")



### Data Processor

In [None]:

# ==============================
# PROCESSOR
# ==============================
from typing import List


class DatasetProcessor:
    """Column selection and JSON export helpers for loaded datasets."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in that order.

        Raises:
            ValueError: if any requested field is not a column of ``df``.
        """
        available = df.columns
        missing_fields = [name for name in allowed_fields if name not in available]
        if missing_fields:
            raise ValueError(f"❌ Fields {missing_fields} not found in dataset!")

        subset = df[allowed_fields]
        print(f"✅ Filtered columns: {list(subset.columns)}")
        return subset

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` to the JSON output directory, one record per line.

        The output directory is created if needed and an existing file with
        the same name is deleted before writing.
        """
        target_dir = Config.JSON_OUTPUT_DIR
        os.makedirs(target_dir, exist_ok=True)
        output_path = os.path.join(target_dir, output_json_name)

        # Remove any earlier export so the message below reports it explicitly.
        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        df.to_json(output_path, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{output_path}'")



### Cleanup

In [None]:
# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Remove downloaded dataset artifacts from disk."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Delete the extracted folder and the zip in the download directory.

        Each artifact is removed only if it exists; a confirmation line is
        printed for every deletion.
        """
        archive_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        folder_present = os.path.exists(extracted_folder_path)
        if folder_present:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        archive_present = os.path.exists(archive_path)
        if archive_present:
            os.remove(archive_path)
            print(f"🗑️ Zip file '{archive_path}' has been deleted successfully.")

### Hybrid Data loader

In [None]:

# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Load a dataset file via KaggleHub, falling back to the zip download path."""

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Return ``file_name`` from ``dataset_path`` as a DataFrame.

        KaggleHub is tried first when the module was imported successfully.
        Any exception from that attempt is printed and the download-and-extract
        loader is used instead.
        """
        kagglehub_available = bool(kagglehub)
        if kagglehub_available:
            try:
                print(f"📥 Trying KaggleHub for {dataset_path}...")
                frame = kagglehub.dataset_load(
                    KaggleDatasetAdapter.PANDAS,
                    dataset_path,
                    file_name,
                )
                print(f"✅ Loaded using KaggleHub: shape = {frame.shape}")
                return frame
            except Exception as err:
                # Deliberate best-effort: any KaggleHub failure triggers the fallback.
                print(f"⚠️ KaggleHub failed: {err}\nFalling back to ZIP-based loader.")

        folder_and_zip = DatasetDownloader.download_and_extract(dataset_path)
        extracted_folder = folder_and_zip[0]
        return DatasetLoader.load_csv(extracted_folder, file_name)



### Infer JD Domains

In [None]:
# Maps a domain label to the keywords that indicate it in a job title.
# infer_domain_from_title() walks this dict in insertion order and returns the
# first domain with a keyword contained in the lowercased title, so the order
# of entries decides ties (e.g. 'designing' appears under both 'designing' and
# 'web designing'; the earlier entry wins).
domain_keywords_dict = {
    'advocate': ['advocate'],
    'agriculture': ['agriculture'],
    'apparel': ['apparel'],
    'arts': ['arts'],
    'automobile': ['automobile'],
    'aviation': ['aviation'],
    'banking': ['banking'],
    'bpo': ['bpo'],
    'business development': ['business', 'development', 'business development', 'business-development'],
    'chef': ['chef'],
    'construction': ['construction'],
    'consultant': ['consultant'],
    'data scientist': ['data', 'data analyst', 'data scientist', 'scientist'],
    'designing': ['designing', 'designer'],
    'digital media': ['digital', 'digital marketing executive', 'media', 'digital media', 'digital-media'],
    'engineering': ['engineering'],
    'finance': ['finance', 'financial analyst'],
    'healthcare': ['healthcare'],
    'hr': ['hr'],
    'information technology': ['information', 'technology', 'information technology', 'information-technology'],
    'public relations': ['public', 'relations', 'public relations', 'public-relations'],
    'marketing': ['marketing'],
    'sales': ['sales', 'sales executive'],
    'teacher': ['teacher'],
    'technician': ['technician'],
    'training': ['training'],
    'web designing': ['web', 'designing'],
    'fitness': ['fitness'],
    'accountant': ['accountant', 'accounting']
}


In [None]:
def infer_domain_from_title(title, keywords_by_domain=None):
    """Map a job title to a domain label by keyword lookup.

    Args:
        title: the job title. Anything that is not a string (None, NaN, ...)
            is treated as having no match.
        keywords_by_domain: mapping of domain label to a list of keywords.
            Defaults to the module-level ``domain_keywords_dict``.

    Returns:
        The first domain (in mapping order) with a keyword contained in the
        lowercased title, or "unknown" when nothing matches.
    """
    if not isinstance(title, str):
        return "unknown"
    if keywords_by_domain is None:
        keywords_by_domain = domain_keywords_dict

    title_lower = title.lower()
    # NOTE(review): this is plain substring containment, so a short keyword
    # such as 'hr' also matches inside longer words (e.g. 'three').
    for domain, keywords in keywords_by_domain.items():
        if any(kw in title_lower for kw in keywords):
            return domain
    return "unknown"


In [None]:
# ==============================
# Efficient LLM Inference in Batches for JD Domains
# ==============================
from tqdm import tqdm

def infer_domains_in_batches(texts, batch_size=8, generator=None):
    """Ask the LLM for the job domain of each description, batch by batch.

    Args:
        texts: sequence of job-description strings.
        batch_size: number of descriptions sent to the generator per call.
        generator: callable taking a list of prompts and returning, for each
            prompt, a list whose first item has a 'generated_text' key.
            Defaults to the module-level ``llm_pipeline``.

    Returns:
        A list with exactly one entry per input text: the last line of the
        generated text, or "unknown" for every item of a batch that failed.
    """
    if generator is None:
        generator = llm_pipeline

    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="🧠 Inferring JD domains"):
        batch = texts[i:i+batch_size]
        prompts = [
            f"Given this job description:\n\n{desc}\n\nWhat is the most likely job function or domain?" for desc in batch
        ]
        try:
            responses = generator(prompts)
            # Collect into a per-batch list first: a failure midway must not
            # leave partial answers in `results`, or the output would no
            # longer line up with `texts`.
            batch_results = [
                r[0]['generated_text'].strip().split("\n")[-1] for r in responses
            ]
            if len(batch_results) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} responses, got {len(batch_results)}"
                )
        except Exception as exc:
            # Best-effort: the batch is marked unknown, but the cause is reported.
            print(f"⚠️ Domain inference failed for batch starting at index {i}: {exc}")
            batch_results = ["unknown"] * len(batch)
        results.extend(batch_results)
    return results


### Filter and Rank JDs

In [None]:
# ==============================
# JD Filtering and Ranking (with batched domain inference)
# ==============================
def filter_and_rank_jds(jd_df, resume_domains, max_total=1000, top_n_per_domain=10):
    """Select the longest job descriptions matching each resume domain.

    A posting matches a domain when the domain string occurs
    (case-insensitively, as literal text) in its title or in the domain
    inferred from its title. Per domain the ``top_n_per_domain`` postings with
    the longest description are kept; the combined result is de-duplicated,
    sorted by description length and truncated to ``max_total`` rows.

    The input frame is not modified.

    Args:
        jd_df: job postings; must contain 'title' and 'description' columns.
        resume_domains: iterable of domain strings to match.
        max_total: maximum number of rows returned.
        top_n_per_domain: maximum number of postings kept per domain.

    Returns:
        A new DataFrame with the columns of ``jd_df`` (title and description
        with missing values replaced by '' and cast to str) plus
        'inferred_domain' and 'richness_score'.

    Raises:
        ValueError: if a required column is missing or no posting matches any
            domain.
    """
    # Ensure necessary columns exist
    for col in ['title', 'description']:
        if col not in jd_df.columns:
            raise ValueError(f"❌ Column '{col}' not found in JD dataset")

    # Normalised working copies of the two text columns. They are kept as
    # separate Series so the caller's frame (a cached dataset in this
    # notebook) is never written to.
    titles = jd_df['title'].fillna('').astype(str)
    descriptions = jd_df['description'].fillna('').astype(str)

    # Infer domains
    print("🧠 Inferring JD domains from title using keyword matcing...")
    inferred_domains = titles.apply(infer_domain_from_title)

    title_values = titles.to_numpy()
    description_values = descriptions.to_numpy()
    inferred_values = inferred_domains.to_numpy()

    all_ranked = []

    # NOTE(review): inferred domain labels use spaces ('information
    # technology'); if resume domains are hyphenated they can only match via
    # the title -- confirm this is intended.
    for domain in resume_domains:
        # regex=False: the domain is literal text, not a pattern.
        mask = (
            titles.str.contains(domain, na=False, case=False, regex=False)
            | inferred_domains.str.contains(domain, na=False, case=False, regex=False)
        ).to_numpy()

        if not mask.any():
            print(f"⚠️ No JDs matched domain: '{domain}'")
            continue

        matches = jd_df.loc[mask].copy()
        # Positional (array) assignment avoids index alignment, so frames with
        # duplicate index labels work too.
        matches['title'] = title_values[mask]
        matches['description'] = description_values[mask]
        matches['inferred_domain'] = inferred_values[mask]
        matches['richness_score'] = matches['description'].str.len()

        top = matches.sort_values(by='richness_score', ascending=False).head(top_n_per_domain)
        all_ranked.append(top)

    if not all_ranked:
        raise ValueError("❌ No job descriptions matched any resume domains.")

    final_jds_df = pd.concat(all_ranked, ignore_index=True)
    final_jds_df = final_jds_df.drop_duplicates().sort_values(by='richness_score', ascending=False).head(max_total)

    print(f"✅ Filtered and ranked {len(final_jds_df)} job descriptions across {len(resume_domains)} domains.")
    return final_jds_df


### Load Resume and JD datasets

In [None]:
# ==============================
# Resume Dataset Loader (with caching)
# ==============================
def load_resume_dataset(dataset_path: str = "snehaanbhawal/resume-dataset", target_csv_name: str = "Resume.csv") -> pd.DataFrame:
    """Load the resume dataset, caching the result per (dataset, file) pair.

    The cache is a dict stored on the function object and keyed by the two
    arguments, so asking for a different dataset or file triggers a fresh
    load instead of returning the frame loaded first.

    Returns:
        The cached DataFrame for the requested dataset and file. The same
        object is returned on every call with the same arguments.
    """
    if not hasattr(load_resume_dataset, "_cache"):
        load_resume_dataset._cache = {}
    cache = load_resume_dataset._cache

    key = (dataset_path, target_csv_name)
    if key not in cache:
        print("📥 Loading resume dataset for the first time...")
        cache[key] = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    else:
        print("✅ Using cached resume dataset.")

    return cache[key]

# ==============================
# Job Description Dataset Loader (with caching)
# ==============================
def load_job_description_dataset(dataset_path: str = "arshkon/linkedin-job-postings", target_csv_name: str = "postings.csv") -> pd.DataFrame:
    """Load the job-description dataset, caching the result per (dataset, file) pair.

    The cache is a dict stored on the function object and keyed by the two
    arguments, so asking for a different dataset or file triggers a fresh
    load instead of returning the frame loaded first.

    Returns:
        The cached DataFrame for the requested dataset and file. The same
        object is returned on every call with the same arguments.
    """
    if not hasattr(load_job_description_dataset, "_cache"):
        load_job_description_dataset._cache = {}
    cache = load_job_description_dataset._cache

    key = (dataset_path, target_csv_name)
    if key not in cache:
        print("📥 Loading job description dataset for the first time...")
        cache[key] = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    else:
        print("✅ Using cached job description dataset.")

    return cache[key]



### JD Dataset Processing Function

In [None]:
# ==============================
# JD Processing Function
# ==============================
def process_dataset_jd(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Rank job postings against the resume categories and export them to JSON.

    Loads the postings and the resume dataset (default arguments), derives
    the lowercased unique resume categories, filters and ranks the postings
    against them, keeps ``allowed_fields`` and saves the result as
    ``output_json_name``. When ``Config.AUTO_CLEANUP`` is set, the extracted
    folder and zip for ``dataset_path`` are removed afterwards.
    """
    postings = load_job_description_dataset(dataset_path, target_csv_name)
    categories = load_resume_dataset()['Category']
    resume_domains = categories.dropna().str.lower().unique().tolist()

    ranked = filter_and_rank_jds(postings, resume_domains)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(ranked, allowed_fields),
        output_json_name,
    )

    # Cleanup of downloaded artifacts is optional.
    if not Config.AUTO_CLEANUP:
        return
    slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )
    

### Resume Dataset Processing Function

In [None]:
# ==============================
# Resume Filtering (5 per category)
# ==============================
def filter_resumes_by_category(resume_df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Keep the first ``top_n`` resumes of every category.

    Rows without a category are dropped. The result is ordered by category,
    with rows inside a category in their original order, and has a fresh
    0..n-1 index.

    Args:
        resume_df: resume data containing a 'Category' column.
        top_n: number of rows kept per category.

    Returns:
        A new DataFrame with the same columns as ``resume_df``.

    Raises:
        ValueError: if ``resume_df`` has no 'Category' column.
    """
    if 'Category' not in resume_df.columns:
        raise ValueError("❌ Resume dataset does not contain 'Category' column.")

    # groupby().head() replaces groupby().apply(lambda g: g.head(n)): apply's
    # handling of the grouping column is deprecated in recent pandas, whereas
    # head() always keeps 'Category' in the result.
    first_rows = (
        resume_df
        .dropna(subset=['Category'])
        .groupby('Category')
        .head(top_n)
    )
    # head() preserves the input row order; the stable sort restores the
    # category-by-category ordering the apply-based version produced.
    filtered_resumes = (
        first_rows
        .sort_values('Category', kind='mergesort')
        .reset_index(drop=True)
    )

    print(f"✅ Filtered {len(filtered_resumes)} resumes (top {top_n} from each category).")
    return filtered_resumes


In [None]:

# ==============================
# MAIN FLOW
# ==============================

def process_dataset_resume(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Export the selected columns of the whole resume dataset to JSON.

    Loads the dataset, keeps ``allowed_fields`` and saves the result as
    ``output_json_name``. When ``Config.AUTO_CLEANUP`` is set, the extracted
    folder and zip for ``dataset_path`` are removed afterwards.
    """
    resumes = load_resume_dataset(dataset_path, target_csv_name)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(resumes, allowed_fields),
        output_json_name,
    )

    # Cleanup of downloaded artifacts is optional.
    if not Config.AUTO_CLEANUP:
        return
    slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )


# ==============================
# Save Filtered Resumes
# ==============================
def process_and_save_filtered_resumes(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Export a per-category sample of the resume dataset to JSON.

    Loads the dataset, keeps the first rows of each category (default
    ``top_n`` of ``filter_resumes_by_category``), restricts the columns to
    ``allowed_fields`` and saves the result as ``output_json_name``. When
    ``Config.AUTO_CLEANUP`` is set, the extracted folder and zip for
    ``dataset_path`` are removed afterwards.
    """
    all_resumes = load_resume_dataset(dataset_path, target_csv_name)
    sampled = filter_resumes_by_category(all_resumes)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(sampled, allowed_fields),
        output_json_name,
    )

    # Cleanup of downloaded artifacts is optional.
    if not Config.AUTO_CLEANUP:
        return
    slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )


## Login and do the processing of Resume and JD dataset

In [None]:
# Export a per-category sample of the resumes to parsed_resumes.json.
process_and_save_filtered_resumes(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)

# Export the job postings ranked against the resume categories to parsed_jds.json.
process_dataset_jd(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location",  "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type", "description"],
    output_json_name="parsed_jds.json"
)


In [None]:

# Process Resume Dataset (all rows, no per-category sampling).
# NOTE(review): this writes to the same file name, parsed_resumes.json, as the
# process_and_save_filtered_resumes() call in the previous cell, and
# save_to_json() deletes an existing file first -- running both cells leaves
# only this unfiltered export. Confirm that is intended.
process_dataset_resume(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    allowed_fields=["Category", "Resume_str"],
    output_json_name="parsed_resumes.json"
)


# Process Job Postings Dataset (same arguments as the call in the previous cell).
process_dataset_jd(
    dataset_path="arshkon/linkedin-job-postings",
    target_csv_name="postings.csv",
    allowed_fields=["title", "company_name", "location",  "skills_desc", "job_id" , "formatted_experience_level", "formatted_work_type", "description"],
    output_json_name="parsed_jds.json"
)