# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Setup and Install Dependencies
2. Load Resume and JD datasets
3. Minimal Parsing into JSON Structure
4. Save structured JSON for Phase 2

## Setup and Install Dependencies

In [1]:
%pip install kaggle kagglehub pandas


Collecting kaggle
  Using cached kaggle-1.7.4.2-py3-none-any.whl.metadata (16 kB)
Collecting kagglehub
  Using cached kagglehub-0.3.11-py3-none-any.whl.metadata (32 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting bleach (from kaggle)
  Using cached bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting certifi>=14.05.14 (from kaggle)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Collecting charset-normalizer (from kaggle)
  Using cached charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna (from kaggle)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting protobuf (from kaggle)
  Using cached protobuf-6.30.2-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle)
  Using cached python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting requests (from kaggle)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6

## Utility Classes and Methods

### Configurations  

In [2]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
import os
import shutil
import zipfile
import pandas as pd
from pathlib import Path
from typing import List

class Config:
    """Notebook-wide settings plus a helper that provisions Kaggle API credentials."""

    # Directory (relative to the working directory) that receives downloaded archives.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Directory that receives the JSON files written by the processing steps.
    JSON_OUTPUT_DIR = "json_outputs"
    # When True, process_dataset() deletes the downloaded archive and extracted folder.
    AUTO_CLEANUP = True

    @staticmethod
    def setup_kaggle_credentials():
        """Ensure ``~/.kaggle/kaggle.json`` exists, asking for an upload in Colab if not.

        When the credentials file is already present nothing is changed.
        Otherwise the Colab upload widget is opened and the uploaded file is
        moved into place with owner-only permissions.

        Raises:
            ImportError: the credentials are missing and ``google.colab`` is
                not importable (i.e. not running inside Colab).
            FileNotFoundError: the upload dialog was closed without a file.
        """
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if os.path.exists(kaggle_path):
            print(f"✅ Kaggle credentials already exist at {kaggle_path}")
            return

        # Imported lazily: the module only exists inside Colab runtimes.
        from google.colab import files
        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()
        if not uploaded:
            # Without this guard the chmod below fails with a misleading error.
            raise FileNotFoundError("❌ No file was uploaded; cannot set up Kaggle credentials.")

        # Move exactly one file: prefer the expected name, otherwise the first
        # upload, so several uploads never overwrite each other at the target.
        source = "kaggle.json" if "kaggle.json" in uploaded else next(iter(uploaded))
        os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
        shutil.move(source, kaggle_path)
        os.chmod(kaggle_path, 0o600)
        print(f"✅ Kaggle credentials setup at {kaggle_path}")



### Downloader 

In [4]:

# ==============================
# DOWNLOADER
# ==============================
class DatasetDownloader:
    """Fetches a Kaggle dataset archive with the ``kaggle`` CLI and unpacks it."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` (``owner/slug``) and extract it locally.

        The archive is stored in ``Config.DATASET_DOWNLOAD_DIR`` and unpacked
        into a sub-folder named after the dataset slug. If that folder already
        contains at least one CSV file, the download is skipped.

        Args:
            dataset_path: Kaggle dataset identifier in ``owner/slug`` form.

        Returns:
            ``(extract_folder_path, zip_filename)``.

        Raises:
            FileNotFoundError: the ``kaggle`` executable is not on PATH, or the
                expected zip archive is missing after the download attempt.
        """
        # Local import keeps this cell runnable on its own.
        import subprocess

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) means dataset_path is never interpreted
        # as shell syntax, and the call also works outside IPython.
        completed = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR],
            capture_output=True,
            text=True,
            errors="replace",
            check=False,
        )
        # Echo the CLI output so it shows up in the notebook cell.
        if completed.stdout.strip():
            print(completed.stdout.strip())

        if not os.path.exists(zip_path):
            detail = completed.stderr.strip()
            message = f"❌ Zip file '{zip_filename}' not found after download!"
            if detail:
                message = f"{message} kaggle exited with code {completed.returncode}: {detail}"
            raise FileNotFoundError(message)

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [5]:

# ==============================
# LOADER
# ==============================
class DatasetLoader:
    """Finds a CSV file inside an extracted dataset folder and reads it with pandas."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Return the first CSV under ``dataset_folder`` whose name matches.

        The folder tree is walked with ``os.walk`` and file names are compared
        case-insensitively against ``target_csv_name``.

        Raises:
            FileNotFoundError: the folder does not exist or no file matches.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        wanted = target_csv_name.lower()
        # Lazy search: stops walking as soon as the first match is produced.
        matches = (
            os.path.join(parent, name)
            for parent, _subdirs, names in os.walk(dataset_folder)
            for name in names
            if name.lower() == wanted
        )
        csv_path = next(matches, None)
        if csv_path is None:
            raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")

        df = pd.read_csv(csv_path)
        print(f"✅ Loaded CSV with shape {df.shape}")
        return df



### Data Processor

In [6]:

# ==============================
# PROCESSOR
# ==============================
class DatasetProcessor:
    """Column selection and JSON export helpers for a loaded dataset."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the order given.

        Raises:
            ValueError: one or more requested fields are not columns of ``df``.
        """
        missing_fields = []
        for field in allowed_fields:
            if field not in df.columns:
                missing_fields.append(field)
        if missing_fields:
            raise ValueError(f"❌ Fields {missing_fields} not found in dataset!")

        filtered_df = df[allowed_fields]
        print(f"✅ Filtered columns: {list(filtered_df.columns)}")
        return filtered_df

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` as JSON Lines (one record per line) into ``Config.JSON_OUTPUT_DIR``.

        A file already present at the target path is removed first.
        """
        output_path = os.path.join(Config.JSON_OUTPUT_DIR, output_json_name)
        os.makedirs(Config.JSON_OUTPUT_DIR, exist_ok=True)

        already_there = os.path.exists(output_path)
        if already_there:
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        df.to_json(
            output_path,
            orient='records',
            lines=True,
            force_ascii=False,
        )
        print(f"✅ Data saved to JSON at '{output_path}'")



### Cleanup

In [7]:


# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Removes the on-disk leftovers of a downloaded dataset."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Delete the extracted folder and the zip archive, whichever exist.

        Args:
            extracted_folder_path: folder to remove recursively.
            zip_filename: archive name, resolved inside
                ``Config.DATASET_DOWNLOAD_DIR``.
        """
        folder_is_present = os.path.exists(extracted_folder_path)
        if folder_is_present:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if not os.path.exists(zip_path):
            return

        os.remove(zip_path)
        print(f"🗑️ Zip file '{zip_path}' has been deleted successfully.")



### Hybrid Data loader

In [8]:


# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Loads a dataset file via KaggleHub when possible, else via the zip downloader."""

    @staticmethod
    def _load_with_kagglehub(dataset_path: str, file_name: str):
        """Return the DataFrame from KaggleHub, or ``None`` when that attempt fails."""
        try:
            print(f"📥 Trying KaggleHub for {dataset_path}...")
            df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
            print(f"✅ Loaded using KaggleHub: shape = {df.shape}")
        except Exception as e:
            # Deliberate best-effort: any KaggleHub problem triggers the fallback.
            print(f"⚠️ KaggleHub failed: {e}\nFalling back to ZIP-based loader.")
            return None
        return df

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Load ``file_name`` from the Kaggle dataset ``dataset_path``.

        KaggleHub is tried first when the module was importable; on any failure
        the dataset is downloaded as a zip, extracted, and the CSV is read
        from the extracted folder.
        """
        if kagglehub:
            hub_frame = HybridDatasetLoader._load_with_kagglehub(dataset_path, file_name)
            if hub_frame is not None:
                return hub_frame

        extracted_folder = DatasetDownloader.download_and_extract(dataset_path)[0]
        return DatasetLoader.load_csv(extracted_folder, file_name)



  from .autonotebook import tqdm as notebook_tqdm


### Main flow

In [9]:

# ==============================
# MAIN FLOW
# ==============================
def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Load one Kaggle dataset, keep the chosen columns, and export them as JSON.

    Args:
        dataset_path: Kaggle identifier in ``owner/slug`` form.
        target_csv_name: CSV file to load from the dataset.
        allowed_fields: columns to keep in the export.
        output_json_name: file name of the export inside the JSON output dir.

    When ``Config.AUTO_CLEANUP`` is set, the downloaded archive and extracted
    folder for this dataset are removed afterwards.
    """
    raw_frame = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(raw_frame, allowed_fields),
        output_json_name,
    )

    if not Config.AUTO_CLEANUP:
        return

    # Same slug/path convention the zip downloader uses for its artifacts.
    dataset_slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug),
        f"{dataset_slug}.zip",
    )

## Log in and process the Resume and JD datasets

In [10]:
# Make sure the Kaggle API credentials are in place before any download.
Config.setup_kaggle_credentials()

# One entry per dataset, processed in order: resumes first, then job postings.
_phase1_jobs = (
    # Resume dataset
    dict(
        dataset_path="snehaanbhawal/resume-dataset",
        target_csv_name="Resume.csv",
        allowed_fields=["Category", "Resume_str"],
        output_json_name="parsed_resumes.json",
    ),
    # Job postings dataset
    dict(
        dataset_path="arshkon/linkedin-job-postings",
        target_csv_name="postings.csv",
        allowed_fields=[
            "title",
            "company_name",
            "location",
            "description",
            "skills_desc",
            "job_id",
            "formatted_experience_level",
            "formatted_work_type",
        ],
        output_json_name="parsed_jds.json",
    ),
)

for _job in _phase1_jobs:
    process_dataset(**_job)

✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json
📥 Trying KaggleHub for snehaanbhawal/resume-dataset...
⚠️ KaggleHub failed: 404 Client Error.

Resource not found at URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset/versions/1
The server reported the following issues: Data not found
Please make sure you specified the correct resource identifiers.
Falling back to ZIP-based loader.
⬇️ Downloading dataset: snehaanbhawal/resume-dataset ...
Dataset URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
License(s): CC0-1.0
✅ Downloaded and extracted to 'datasets\resume-dataset'.
🔍 Searching for 'Resume.csv' inside datasets\resume-dataset...
✅ Loaded CSV with shape (2484, 4)
✅ Filtered columns: ['Category', 'Resume_str']
🗑️ Existing JSON 'json_outputs\parsed_resumes.json' deleted.
✅ Data saved to JSON at 'json_outputs\parsed_resumes.json'
🧹 Folder 'datasets\resume-dataset' has been deleted successfully.
🗑️ Zip file 'datasets\resume-dataset.zi

# Phase 2 - Parse resume/JD into a structured JSON schema

## Install Dependencies & Log in to Hugging Face Hub

In [11]:
%pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub

#%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
#%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
%pip install -U bitsandbytes


Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/nightly/cu128
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
from getpass import getpass
import os

from huggingface_hub import login

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order: environment variable -> Colab secret -> hidden interactive prompt.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

if not HF_TOKEN:
    try:
        # Only importable inside a Colab runtime; absent on local Jupyter.
        from google.colab import userdata
    except ImportError:
        userdata = None
    if userdata is not None:
        try:
            HF_TOKEN = userdata.get('HF_TOKEN')
        except Exception as exc:
            # Best-effort lookup: a missing or inaccessible secret must not
            # prevent the interactive prompt below from running.
            print(f"⚠️ Could not read 'HF_TOKEN' from Colab secrets: {exc}")

if not HF_TOKEN:
    # getpass keeps the token out of the notebook's stored cell output.
    HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

if not HF_TOKEN:
    raise ValueError("❌ No Hugging Face token provided.")

login(token=HF_TOKEN)


## Import Libraries

In [13]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch


##  Load Mistral-7B-Instruct with Fallback to Quantized

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

# Hugging Face Hub repo id; used as the default `model_name` of
# load_mistral_pipeline_dynamic defined further down in this cell.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# The string literal below is a bare expression statement, so the loader code
# inside it is never executed. NOTE(review): it opens with four quote
# characters, so the string's first character is a literal '"'; presumably this
# is a disabled earlier version of the loader kept for reference.
""""
def load_mistral_pipeline():
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        print("✅ Loaded full precision model.")
    except Exception:
        print("⚠️ Full model failed. Loading quantized 8-bit version.")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=HF_TOKEN,
            device_map="auto",
            load_in_8bit=True,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            trust_remote_code=True
        )
        print("✅ Loaded 8-bit quantized model.")

    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=2)

llm_pipeline = load_mistral_pipeline()
"""



def load_mistral_pipeline_dynamic(model_name=MODEL_NAME, hf_token=None):
    """Load a causal LM with 4-bit NF4 quantization and wrap it in a pipeline.

    Args:
        model_name: Hugging Face Hub repo id of the model to load.
        hf_token: access token passed to the tokenizer and model loaders.

    Returns:
        A ``text-generation`` pipeline built with ``batch_size=2``.
    """
    print("🔍 Checking system resources...")
    has_cuda = torch.cuda.is_available()

    print(f"🧠 CUDA available: {has_cuda}")

    print(f"🔧 Loading tokenizer: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True
    )

    # The configuration below is 4-bit NF4, so the log messages say so
    # (they previously claimed 8-bit).
    print("🔁 Trying 4-bit (NF4) quantized loading...")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,            # store the weights in 4-bit precision
        bnb_4bit_quant_type="nf4",    # NormalFloat4 quantization data type
        bnb_4bit_use_double_quant=True,  # double quantization, as described in the QLoRA paper
        bnb_4bit_compute_dtype=torch.bfloat16,  # matrix multiplications are computed in BF16
        # NOTE(review): presumably lets layers that device_map="auto" places on
        # the CPU stay in fp32 instead of failing - confirm against the docs.
        llm_int8_enable_fp32_cpu_offload=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        quantization_config=bnb_config,
        device_map="auto",  # let Accelerate decide where each layer is placed
        trust_remote_code=True,  # allows custom modelling code shipped in the model repo to run
    )
    print("✅ Quantized 4-bit model loaded.")

    print("🎯 Model is on device:", next(model.parameters()).device)
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=2)

m_pipeline = load_mistral_pipeline_dynamic(hf_token=HF_TOKEN)

🔍 Checking system resources...
🧠 CUDA available: True
🔧 Loading tokenizer: mistralai/Mistral-7B-Instruct-v0.1
🔁 Trying 8-bit quantized loading...


Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.58s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


✅ Quantized 8-bit model loaded.
🎯 Model is on device: cuda:0


In [15]:
import torch

# Print the CUDA runtime and GPU that this PyTorch build can see.
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

# Querying the compute capability raises on a machine without a usable CUDA
# device, so it is guarded the same way as the device-name lookup above.
if torch.cuda.is_available():
    print(torch.cuda.get_device_capability(0))

CUDA available: True
CUDA version: 12.8
GPU: NVIDIA GeForce RTX 5080
(12, 0)


In [22]:
import subprocess
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

def get_available_gpu_memory_gb():
    """Report the free memory of the first GPU listed by ``nvidia-smi``.

    The value printed by ``nvidia-smi --query-gpu=memory.free`` is divided by
    1024. Any failure (tool missing, unexpected output, ...) is reported on
    stdout and yields ``0.0``.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        output = subprocess.check_output(query, encoding="utf-8")
        # One line per GPU; only the first one is considered.
        first_gpu_line, _, _ = output.strip().partition("\n")
        return int(first_gpu_line) / 1024
    except Exception as e:
        print(f"⚠️ Could not fetch GPU memory: {str(e)}")
        return 0.0

def load_mistral_pipeline_dynamic(
    model_name="mistralai/Mistral-7B-Instruct-v0.1",
    hf_token=None,
    min_fp16_vram_gb=14.0,
):
    """Load the model in FP16 or 8-bit depending on free GPU memory.

    Args:
        model_name: Hugging Face Hub repo id of the model to load.
        hf_token: access token passed to the tokenizer and model loaders.
        min_fp16_vram_gb: free GPU memory (as reported by
            ``get_available_gpu_memory_gb``) required to load unquantized
            FP16 weights; below it, or without CUDA, 8-bit loading is used.
            The default of 14 matches the previously hard-coded threshold.

    Returns:
        A ``text-generation`` pipeline built with ``batch_size=2``.

    Raises:
        RuntimeError: the model could not be loaded; the original error is
            attached as ``__cause__`` and included in the message.
    """
    print("🔍 Detecting system resources...")
    has_cuda = torch.cuda.is_available()
    free_vram_gb = get_available_gpu_memory_gb() if has_cuda else 0

    print(f"🧠 CUDA available: {has_cuda}")
    print(f"📊 Free GPU memory: {free_vram_gb:.2f} GB")

    # Put the whole model on GPU 0 when CUDA is present, otherwise on the CPU.
    device_map = {"": 0} if has_cuda else "cpu"
    # NOTE(review): without CUDA this still selects the bitsandbytes 8-bit
    # path; confirm the installed bitsandbytes build supports CPU execution.
    load_quantized = (not has_cuda) or (free_vram_gb < min_fp16_vram_gb)

    print("🔧 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True
    )

    try:
        if load_quantized:
            print("⚙️ Using 8-bit quantized model (low VRAM or CPU fallback)...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                token=hf_token,
                device_map=device_map,
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                trust_remote_code=True
            )
            print("✅ Loaded 8-bit model.")
        else:
            print("⚙️ Using full precision FP16 model...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                token=hf_token,
                device_map=device_map,
                torch_dtype=torch.float16,
                trust_remote_code=True
            )
            print("✅ Loaded full precision model.")
    except Exception as e:
        print(f"❌ Error loading model: {str(e)}")
        # Chain explicitly so the root cause is kept in the traceback and message.
        raise RuntimeError(f"Model load failed: {e}") from e

    print("🎯 Model on device:", next(model.parameters()).device)
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=2)

llm_pipeline = load_mistral_pipeline_dynamic(hf_token=HF_TOKEN)

🔍 Detecting system resources...
🧠 CUDA available: True
📊 Free GPU memory: 7.54 GB
🔧 Loading tokenizer...
⚙️ Using 8-bit quantized model (low VRAM or CPU fallback)...


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.31s/it]
Device set to use cuda:0


✅ Loaded 8-bit model.
🎯 Model on device: cuda:0


## JSON Schemas

In [16]:
# Target structure for a parsed resume. Leaf values name the expected JSON
# type; a one-element list describes the shape of each list item.
resume_schema = dict(
    basics=dict(
        name="string",
        email="string",
        phone="string",
        location="string",
        summary="string",
    ),
    education=[
        dict(degree="string", field="string", institution="string", year="string"),
    ],
    experience=[
        dict(job_title="string", company="string", duration="string", description="string"),
    ],
    skills=["string"],
    certifications=["string"],
    projects=["string"],
)

# Target structure for a parsed job description, using the same conventions.
jd_schema = dict(
    title="string",
    summary="string",
    required_experience_years="float",
    preferred_degrees=["string"],
    required_skills=["string"],
    certifications=["string"],
    soft_skills=["string"],
)


##  Prompt Templates

In [17]:
# Prompt for resumes; `{text}` and `{schema}` are filled in with str.format.
RESUME_PROMPT_TEMPLATE = (
    "Extract the structured resume JSON from the raw resume below:\n"
    "--------------------\n"
    "{text}\n"
    "--------------------\n"
    "The output should match this schema (no extra fields):\n"
    "{schema}\n"
    "Return a valid JSON object only.\n"
)

# Prompt for job descriptions; same placeholders as the resume prompt.
JD_PROMPT_TEMPLATE = (
    "Extract structured job description JSON from the raw JD below:\n"
    "--------------------\n"
    "{text}\n"
    "--------------------\n"
    "The output should match this schema:\n"
    "{schema}\n"
    "Return a valid JSON object only.\n"
)


## Inference Function

In [18]:
def _decode_first_json_object(blob):
    """Return the JSON object starting at the first ``{`` in ``blob``, else ``None``."""
    start = blob.find("{")
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # text the model appends after the object does not break parsing.
        value, _end = json.JSONDecoder().raw_decode(blob, start)
    except json.JSONDecodeError:
        return None
    return value if isinstance(value, dict) else None


def extract_structured_json(text, prompt_template, schema_dict, max_new_tokens=512):
    """Ask the LLM to convert ``text`` into JSON that follows ``schema_dict``.

    Args:
        text: raw resume / job-description text; ``None`` is treated as empty.
            Only the first 1500 characters (after stripping) are sent.
        prompt_template: template with ``{text}`` and ``{schema}`` placeholders.
        schema_dict: schema rendered into the prompt as indented JSON.
        max_new_tokens: generation budget passed to the pipeline.

    Returns:
        The parsed JSON object, or ``{"raw_output": <model completion>}`` when
        the completion contains no decodable JSON object.
    """
    schema_str = json.dumps(schema_dict, indent=2)
    prompt = prompt_template.format(text=(text or "").strip()[:1500], schema=schema_str)
    # return_full_text=False: by default the pipeline echoes the prompt, whose
    # embedded schema holds the first "{" and would make parsing always fail.
    response = llm_pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]
    # Defensive: drop the prompt if the pipeline echoed it anyway.
    if response.startswith(prompt):
        response = response[len(prompt):]

    parsed = _decode_first_json_object(response)
    if parsed is None:
        return {"raw_output": response}
    return parsed


##  Normalize & Save

In [23]:
def normalize_and_save(input_filename, output_filename, is_resume=True):
    """Run LLM extraction over a JSON Lines file and save the results as a JSON array.

    Args:
        input_filename: JSON Lines file inside ``Config.JSON_OUTPUT_DIR``.
        output_filename: destination file name inside the same directory.
        is_resume: ``True`` reads ``Resume_str`` and uses the resume
            prompt/schema; ``False`` reads ``description`` and uses the
            job-description prompt/schema.

    An existing output file is replaced only after all entries have been
    processed, so a failed run leaves the previous results intact.
    """
    input_path = Path(Config.JSON_OUTPUT_DIR) / input_filename
    output_path = Path(Config.JSON_OUTPUT_DIR) / output_filename
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(input_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    text_key = "Resume_str" if is_resume else "description"
    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_dict = resume_schema if is_resume else jd_schema

    print(f"⏳ Normalizing: {input_filename} → {output_filename}")
    results = []
    for entry in tqdm(data):
        # Missing values are exported as JSON null, so the key can exist with
        # a None value; fall back to an empty string in that case.
        text = entry.get(text_key) or ""
        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_dict=schema_dict,
        )
        results.append(parsed)

    # Write to a sibling temp file first, then swap it in, so an interrupted
    # run never destroys or truncates a previous result.
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    tmp_path.replace(output_path)
    print(f"✅ Done: Saved {len(results)} entries to {output_path.name}")


## Phase 2 Main Method

In [None]:
def run_phase2_structured_normalization():
    """Normalize the Phase 1 resume export, then the job-description export."""
    jobs = (
        ("parsed_resumes.json", "normalized_resumes.json", True),
        ("parsed_jds.json", "normalized_jds.json", False),
    )
    for source_name, target_name, resume_flag in jobs:
        normalize_and_save(source_name, target_name, is_resume=resume_flag)


run_phase2_structured_normalization()


⏳ Normalizing: parsed_resumes.json → normalized_resumes.json


  0%|          | 0/2484 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/2484 [00:18<12:59:42, 18.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/2484 [00:35<12:08:26, 17.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/2484 [00:51<11:44:14, 17.03s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/2484 [01:30<17:38:04, 25.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/2484 [02:12<21:36:05, 31.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 6/2484 [02:29<18:11:59, 26.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 7/2484 [02:56<18:29:39, 26.88s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 8/2484 [03:13<16:15:06, 23.63s/it]Setting 