# Global utility

In [None]:
def is_running_in_colab():
    """Report whether this notebook is executing inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` package can be imported,
        False when that import raises ImportError.
    """
    try:
        import google.colab  # imported purely as an availability probe
    except ImportError:
        in_colab = False
    else:
        in_colab = True
    return in_colab

# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Setup and Install Dependencies
2. Load Resume and JD datasets
3. Minimal Parsing into JSON Structure
4. Save structured JSON for Phase 2

## Setup and Install Dependencies

In [None]:
%pip install kaggle kagglehub pandas


## Utility Classes and Methods

### Configurations  

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
import os
import shutil
import zipfile
import pandas as pd
from pathlib import Path
from typing import List

class Config:
    """Shared settings and Kaggle credential bootstrap for the Phase 1 cells."""

    # Directory that receives downloaded Kaggle archives and their extracted contents.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Directory where the filtered JSON exports are written.
    JSON_OUTPUT_DIR = "json_outputs"
    # When True, process_dataset removes the downloaded artifacts after saving.
    AUTO_CLEANUP = True

    @staticmethod
    def setup_kaggle_credentials():
        """Ensure ``~/.kaggle/kaggle.json`` exists, asking for an upload in Colab.

        If the credentials file is already present nothing is changed.
        Otherwise the Colab upload dialog is opened, one uploaded file is
        moved into place and its permissions are restricted to the owner.

        Raises:
            ImportError: the credentials file is missing and ``google.colab``
                is not importable (i.e. there is no upload dialog to fall
                back on).
            FileNotFoundError: the upload dialog returned no files.
        """
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if os.path.exists(kaggle_path):
            print(f"✅ Kaggle credentials already exist at {kaggle_path}")
            return

        try:
            from google.colab import files
        except ImportError as err:
            # Same exception type as before, but with an actionable message.
            raise ImportError(
                f"Kaggle credentials not found at {kaggle_path} and the Colab "
                "upload helper is unavailable; place kaggle.json there manually."
            ) from err

        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()
        if not uploaded:
            # Previously this surfaced as an opaque os.chmod failure.
            raise FileNotFoundError("❌ No file was uploaded; kaggle.json is required.")

        # Move exactly one file: prefer the one actually named kaggle.json,
        # otherwise fall back to the last upload (the file that used to win
        # when every upload was moved onto the same destination).
        uploaded_names = list(uploaded)
        source = next(
            (name for name in uploaded_names if os.path.basename(name) == "kaggle.json"),
            uploaded_names[-1],
        )

        os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
        shutil.move(source, kaggle_path)
        # Owner read/write only.
        os.chmod(kaggle_path, 0o600)
        print(f"✅ Kaggle credentials setup at {kaggle_path}")



### Downloader 

In [None]:

# ==============================
# DOWNLOADER
# ==============================
class DatasetDownloader:
    """Fetches a Kaggle dataset archive with the ``kaggle`` CLI and unpacks it."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` from Kaggle and extract it locally.

        The archive is expected at ``<DATASET_DOWNLOAD_DIR>/<slug>.zip`` and is
        extracted into ``<DATASET_DOWNLOAD_DIR>/<slug>``, where ``slug`` is the
        part of ``dataset_path`` after the last ``/``. If that folder already
        contains at least one ``*.csv`` file, nothing is downloaded.

        Args:
            dataset_path: Kaggle dataset handle, e.g. ``"owner/dataset-name"``.

        Returns:
            ``(extract_folder_path, zip_filename)`` — the extraction folder and
            the bare archive file name (not a full path).

        Raises:
            FileNotFoundError: the ``kaggle`` executable is missing, or the
                archive is not present after the download step.
        """
        import subprocess  # local import keeps this cell self-contained

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # Argument-list form: plain Python (no IPython `!` magic) and no shell,
        # so dataset_path is never interpreted as shell syntax.
        completed = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR],
            capture_output=True,
            text=True,
            check=False,
        )
        # Relay the CLI output so it still shows up in the notebook cell.
        if completed.stdout:
            print(completed.stdout, end="")
        if completed.stderr:
            print(completed.stderr, end="")
        if completed.returncode != 0:
            print(f"⚠️ kaggle CLI exited with status {completed.returncode}.")

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [None]:

# ==============================
# LOADER
# ==============================
class DatasetLoader:
    """Locates a CSV inside an extracted dataset folder and reads it with pandas."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Find ``target_csv_name`` under ``dataset_folder`` and load it.

        The folder tree is walked with ``os.walk``; file names are compared
        case-insensitively and the first match encountered is loaded.

        Raises:
            FileNotFoundError: the folder does not exist, or no file with the
                requested name is found beneath it.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        for current_dir, _subdirs, filenames in os.walk(dataset_folder):
            match = next(
                (name for name in filenames if name.lower() == target_csv_name.lower()),
                None,
            )
            if match is None:
                continue
            frame = pd.read_csv(os.path.join(current_dir, match))
            print(f"✅ Loaded CSV with shape {frame.shape}")
            return frame

        raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")



### Data Processor

In [None]:

# ==============================
# PROCESSOR
# ==============================
class DatasetProcessor:
    """Column selection and JSON export helpers for a loaded dataset."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the given order.

        Raises:
            ValueError: one or more requested fields are not columns of ``df``.
        """
        available = df.columns
        absent = [name for name in allowed_fields if name not in available]
        if absent:
            raise ValueError(f"❌ Fields {absent} not found in dataset!")

        subset = df[allowed_fields]
        print(f"✅ Filtered columns: {list(subset.columns)}")
        return subset

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` as JSON Lines (one record per line) under the output dir.

        The target is ``Config.JSON_OUTPUT_DIR / output_json_name``; a file
        already at that path is deleted before the new one is written.
        """
        target_dir = Config.JSON_OUTPUT_DIR
        os.makedirs(target_dir, exist_ok=True)
        destination = os.path.join(target_dir, output_json_name)

        already_there = os.path.exists(destination)
        if already_there:
            os.remove(destination)
            print(f"🗑️ Existing JSON '{destination}' deleted.")

        # force_ascii=False keeps non-ASCII characters unescaped in the output.
        df.to_json(destination, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{destination}'")



### Cleanup

In [None]:


# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Removes the on-disk artifacts left behind by a dataset download."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Delete the extracted folder and the downloaded archive, if present.

        Args:
            extracted_folder_path: folder to remove recursively.
            zip_filename: archive file name, resolved relative to
                ``Config.DATASET_DOWNLOAD_DIR``.
        """
        folder_present = os.path.exists(extracted_folder_path)
        if folder_present:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        archive = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if not os.path.exists(archive):
            return
        os.remove(archive)
        print(f"🗑️ Zip file '{archive}' has been deleted successfully.")



### Hybrid Data loader

In [None]:


# ==============================
# HYBRID LOADER
# ==============================
# kagglehub is optional: when it (or its adapter enum) cannot be imported, the
# name is bound to None so HybridDatasetLoader goes straight to the ZIP-based path.
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Loads a dataset file via KaggleHub when possible, else via the ZIP path."""

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Return ``file_name`` from the Kaggle dataset ``dataset_path``.

        KaggleHub is tried first when the module is available. Any exception
        raised by that attempt is reported and triggers the fallback:
        download and extract the archive, then read the CSV from disk.
        """
        hub_usable = bool(kagglehub)
        if hub_usable:
            try:
                print(f"📥 Trying KaggleHub for {dataset_path}...")
                frame = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
                print(f"✅ Loaded using KaggleHub: shape = {frame.shape}")
                return frame
            except Exception as err:  # deliberate best-effort: any failure falls back
                print(f"⚠️ KaggleHub failed: {err}\nFalling back to ZIP-based loader.")

        # Only the folder is needed here; the archive name is ignored.
        extraction = DatasetDownloader.download_and_extract(dataset_path)
        return DatasetLoader.load_csv(extraction[0], file_name)



### Main flow

In [None]:

# ==============================
# MAIN FLOW
# ==============================
def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Load one Kaggle dataset file, keep selected columns and export them.

    Args:
        dataset_path: Kaggle dataset handle (``owner/slug``).
        target_csv_name: name of the CSV inside the dataset to load.
        allowed_fields: columns to keep, in output order.
        output_json_name: file name for the export inside the JSON output dir.

    When ``Config.AUTO_CLEANUP`` is set, the extracted folder and archive for
    this dataset are removed afterwards.
    """
    raw_frame = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(raw_frame, allowed_fields),
        output_json_name,
    )

    if not Config.AUTO_CLEANUP:
        return

    # Recreate the paths the downloader would have used for this dataset.
    slug = dataset_path.split("/")[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )

## Log in and process the Resume and JD datasets

In [None]:
# Kaggle credentials must be in place before any download is attempted.
Config.setup_kaggle_credentials()

# One entry per dataset, processed in order: resumes first, then job postings.
_PHASE1_JOBS = (
    {
        "dataset_path": "snehaanbhawal/resume-dataset",
        "target_csv_name": "Resume.csv",
        "allowed_fields": ["Category", "Resume_str"],
        "output_json_name": "parsed_resumes.json",
    },
    {
        "dataset_path": "arshkon/linkedin-job-postings",
        "target_csv_name": "postings.csv",
        "allowed_fields": [
            "title",
            "company_name",
            "location",
            "description",
            "skills_desc",
            "job_id",
            "formatted_experience_level",
            "formatted_work_type",
        ],
        "output_json_name": "parsed_jds.json",
    },
)

for _job in _PHASE1_JOBS:
    process_dataset(**_job)

# Phase 2 - Parse resume/JD into a structured JSON schema

## Install Dependencies & Log in to Hugging Face Hub

In [None]:
%pip install -q transformers accelerate sentencepiece pydantic huggingface_hub

#%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
#%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
%pip install -U bitsandbytes


In [None]:
from huggingface_hub import login
import os

# Set your token here securely or prompt for it in Colab
# Recommended: store in Colab secrets or environment variable
from getpass import getpass

# Resolve the token from the environment first. HUGGINGFACE_TOKEN is this
# notebook's own variable; HF_TOKEN is accepted as well so the same name
# works for the environment and for the Colab secret below.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")

if not HF_TOKEN:
    if is_running_in_colab():
        # In Colab, read the token from the notebook's secrets.
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            # HF_TOKEN stays unset here; login() below receives token=None.
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Outside Colab, prompt once. getpass keeps the secret out of the
        # notebook output, unlike input(), which echoes what is typed.
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


## Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; nothing happens outside Colab.
if is_running_in_colab():
    from google.colab import drive

    drive.mount('/content/drive')

## Import Libraries

In [None]:
import json
import os
import time
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
from pydantic import BaseModel, ValidationError



##  Load Mistral-7B-Instruct with Fallback to Quantized

In [None]:
import subprocess
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

def get_available_gpu_memory_gb():
    """Ask ``nvidia-smi`` how much memory is free on the first listed GPU.

    Returns:
        float: the reported free-memory figure divided by 1024, or ``0.0``
        when the query fails for any reason (the failure is printed).
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        # One line per GPU; only the first is used (single-GPU assumption).
        first_line, _, _ = report.strip().partition("\n")
        return int(first_line) / 1024
    except Exception as err:
        print(f"⚠️ Could not fetch GPU memory: {err}")
        return 0.0

def load_mistral_pipeline_dynamic(model_name="mistralai/Mistral-7B-Instruct-v0.1", hf_token=None):
    """Load a causal LM and wrap it in a text-generation pipeline.

    The FP16 weights are used only when CUDA is available and at least
    14 GB of GPU memory is reported free; otherwise the model is loaded with
    an 8-bit ``BitsAndBytesConfig``.

    Args:
        model_name: Hugging Face model id to load.
        hf_token: access token forwarded to the tokenizer and model loaders.

    Returns:
        A ``pipeline("text-generation", ...)`` built with ``batch_size=2``.

    Raises:
        RuntimeError: loading the model failed (the underlying error is printed).
    """
    print("🔍 Detecting system resources...")
    has_cuda = torch.cuda.is_available()
    free_vram_gb = get_available_gpu_memory_gb() if has_cuda else 0

    print(f"🧠 CUDA available: {has_cuda}")
    print(f"📊 Free GPU memory: {free_vram_gb:.2f} GB")

    # Everything on GPU 0 when CUDA is present, otherwise on the CPU.
    device_map = {"": 0} if has_cuda else "cpu"
    # ~14 GB is treated as the safe threshold for Mistral-7B in FP16.
    load_quantized = not (has_cuda and free_vram_gb >= 14)

    print("🔧 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True
    )

    # Arguments common to both loading modes.
    shared_kwargs = {
        "token": hf_token,
        "device_map": device_map,
        "trust_remote_code": True,
    }

    try:
        if load_quantized:
            print("⚙️ Using 8-bit quantized model (low VRAM or CPU fallback)...")
            variant_kwargs = {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
            success_message = "✅ Loaded 8-bit model."
        else:
            print("⚙️ Using full precision FP16 model...")
            variant_kwargs = {"torch_dtype": torch.float16}
            success_message = "✅ Loaded full precision model."
        model = AutoModelForCausalLM.from_pretrained(model_name, **shared_kwargs, **variant_kwargs)
        print(success_message)
    except Exception as e:
        print(f"❌ Error loading model: {str(e)}")
        raise RuntimeError("Model load failed.")

    print("🎯 Model on device:", next(model.parameters()).device)
    return pipeline("text-generation", model=model, tokenizer=tokenizer, batch_size=2)

# Module-level pipeline shared by the extraction and metadata code below;
# built once here with the default model name and the token resolved above.
llm_pipeline = load_mistral_pipeline_dynamic(hf_token=HF_TOKEN)

## Define Pydantic Schemas

In [None]:
# One education entry of a parsed resume. All four fields are declared
# without defaults, so each must be present for validation to pass.
class Education(BaseModel):
    degree: str
    field: str
    institution: str
    year: str

# One work-experience entry of a parsed resume. All fields are strings
# declared without defaults.
class Experience(BaseModel):
    job_title: str
    company: str
    duration: str
    description: str

# Target structure for a normalized resume. Its schema() output is embedded in
# the resume prompt and its parse_obj() is used to accept or reject model output.
# NOTE: documented with comments on purpose — a class docstring would be
# included in the generated schema and therefore change the prompt text.
class ResumeSchema(BaseModel):
    basics: dict  # free-form mapping; no inner structure is enforced here
    education: List[Education]
    experience: List[Experience]
    skills: List[str]
    certifications: List[str]
    projects: List[str]

# Target structure for a normalized job description. Its schema() output is
# embedded in the JD prompt and its parse_obj() validates the model output.
# NOTE: documented with comments on purpose — a class docstring would be
# included in the generated schema and therefore change the prompt text.
class JobDescriptionSchema(BaseModel):
    title: str
    summary: str
    required_experience_years: float
    preferred_degrees: List[str]
    required_skills: List[str]
    certifications: List[str]
    soft_skills: List[str]


##  Prompt Templates

In [None]:
# Prompt for resume extraction. {text} and {schema} are str.format placeholders
# filled in by extract_structured_json (raw text and the JSON schema dump).
RESUME_PROMPT_TEMPLATE = """Extract the structured resume JSON from the raw resume below:
--------------------
{text}
--------------------
The output should match this schema (no extra fields):
{schema}
Return a valid JSON object only.
"""

# Prompt for job-description extraction; same {text} / {schema} placeholders.
JD_PROMPT_TEMPLATE = """Extract structured job description JSON from the raw JD below:
--------------------
{text}
--------------------
The output should match this schema:
{schema}
Return a valid JSON object only.
"""


## Inference Function - JSON Extraction with Retry

In [None]:
def extract_structured_json(text, prompt_template, schema_dict, max_new_tokens=512, retries=2, generator=None):
    """Ask the LLM for a JSON object describing ``text`` and parse the reply.

    Args:
        text: raw resume / job-description text. Only the first 1500
            characters (after stripping) are sent; a non-string value such as
            ``None`` is treated as empty text.
        prompt_template: ``str.format`` template with ``{text}`` and
            ``{schema}`` placeholders.
        schema_dict: JSON-serialisable schema shown to the model.
        max_new_tokens: generation budget passed to the pipeline.
        retries: number of additional attempts after the first failure.
        generator: optional callable used instead of the module-level
            ``llm_pipeline``. It is called as
            ``generator(prompt, max_new_tokens=..., do_sample=False,
            return_full_text=False)`` and must return
            ``[{"generated_text": str}, ...]``.

    Returns:
        dict: the first JSON object found in the model output, or
        ``{"raw_output": "failed"}`` when every attempt fails.
    """
    llm = llm_pipeline if generator is None else generator

    schema_str = json.dumps(schema_dict, indent=2)
    source_text = text if isinstance(text, str) else ""
    prompt = prompt_template.format(text=source_text.strip()[:1500], schema=schema_str)
    decoder = json.JSONDecoder()

    for attempt in range(retries + 1):
        try:
            # return_full_text=False asks for the completion only. Otherwise
            # the prompt (which itself contains the schema's "{") is echoed
            # back and the brace search below would land inside the prompt.
            response = llm(
                prompt,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                return_full_text=False,
            )[0]["generated_text"]

            # Defensive: drop the prompt if the backend echoed it anyway.
            if response.startswith(prompt):
                response = response[len(prompt):]

            json_start = response.find("{")
            if json_start == -1:
                raise ValueError("no JSON object found in model output")

            # raw_decode parses one JSON value and ignores trailing text, so
            # commentary after the object no longer breaks parsing.
            parsed, _ = decoder.raw_decode(response[json_start:])
            return parsed
        except Exception as e:  # best-effort boundary around the model call
            print(f"⚠️ Attempt {attempt + 1} failed: {str(e)}")
            # NOTE(review): with do_sample=False a retry presumably produces
            # the same text, so retries mostly help with transient errors.
            if attempt < retries:
                time.sleep(1)

    return {"raw_output": "failed"}



##  Normalize in Batches with Validation

In [None]:
import uuid
from datetime import datetime

# Model id recorded in the run metadata. It duplicates the default model_name
# of load_mistral_pipeline_dynamic, so keep the two in sync.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

def validate_entry(entry, is_resume):
    """Check ``entry`` against the resume or job-description schema.

    Args:
        entry: parsed model output to validate.
        is_resume: True to validate against ``ResumeSchema``, False for
            ``JobDescriptionSchema``.

    Returns:
        bool: True when parsing succeeds, False on a ``ValidationError``.
    """
    schema_cls = ResumeSchema if is_resume else JobDescriptionSchema
    try:
        schema_cls.parse_obj(entry)
    except ValidationError:
        return False
    return True

def normalize_and_save(
    input_filename,
    output_filename_prefix,
    is_resume=True,
    output_dir=Path("json_outputs"),
    google_drive_sync=True,
    drive_subdir="AI-Resume-Agent"
):
    """Run LLM extraction over a JSON Lines file and save the results.

    Every record of ``output_dir / input_filename`` is sent through
    ``extract_structured_json`` and validated with ``validate_entry``. Three
    files are written to ``output_dir``, each suffixed with a timestamp and a
    short random batch id: the valid records, the invalid ones (input text
    plus raw output), and a run-metadata summary.

    Args:
        input_filename: JSON Lines file name inside ``output_dir``.
        output_filename_prefix: prefix for the three generated file names.
        is_resume: True to read ``Resume_str`` and use the resume
            prompt/schema; False to read ``description`` and use the JD ones.
        output_dir: directory holding the input and receiving the outputs.
        google_drive_sync: when True and running in Colab, move the output
            files to ``/content/drive/MyDrive/<drive_subdir>``.
        drive_subdir: destination folder below ``MyDrive``.
    """
    import shutil  # local import: this cell may run without the Phase 1 imports

    # === Setup Metadata ===
    batch_id = uuid.uuid4().hex[:6]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_dir.mkdir(parents=True, exist_ok=True)

    valid_file = f"{output_filename_prefix}_{timestamp}_{batch_id}.json"
    invalid_file = f"invalid_{output_filename_prefix}_{timestamp}_{batch_id}.json"
    meta_file = f"run_metadata_{output_filename_prefix}_{timestamp}_{batch_id}.json"

    input_path = output_dir / input_filename
    valid_output_path = output_dir / valid_file
    invalid_output_path = output_dir / invalid_file
    metadata_path = output_dir / meta_file

    # One JSON document per non-blank line.
    with open(input_path, "r", encoding="utf-8") as f:
        raw_data = [json.loads(line) for line in f if line.strip()]

    results, invalids = [], []
    prompt_template = RESUME_PROMPT_TEMPLATE if is_resume else JD_PROMPT_TEMPLATE
    schema_ref = ResumeSchema.schema() if is_resume else JobDescriptionSchema.schema()
    text_key = "Resume_str" if is_resume else "description"

    print(f"⏳ Processing {len(raw_data)} records...")
    for record in tqdm(raw_data):
        # The key may be present with a null value (missing cell in the
        # source CSV); treat that the same as an absent key.
        text = record.get(text_key) or ""
        parsed = extract_structured_json(
            text=text,
            prompt_template=prompt_template,
            schema_dict=schema_ref
        )
        if validate_entry(parsed, is_resume):
            results.append(parsed)
        else:
            invalids.append({"input": text, "output": parsed})

    # === Save Results ===
    with open(valid_output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    with open(invalid_output_path, "w", encoding="utf-8") as f:
        json.dump(invalids, f, indent=2)

    # === Metadata ===
    model = llm_pipeline.model
    # An 8-bit model still carries float16 parameters, so "no float16
    # parameter" is not a usable test. Check the loader's flags instead
    # (presumably set by transformers on quantized models — confirm for the
    # installed version) and fall back to looking for integer-typed weights.
    quantized = any(
        getattr(model, flag, False)
        for flag in ("is_quantized", "is_loaded_in_8bit", "is_loaded_in_4bit")
    ) or any(p.dtype in (torch.int8, torch.uint8) for p in model.parameters())

    meta = {
        "batch_id": batch_id,
        "timestamp": timestamp,
        "input_file": input_filename,
        "output_valid_file": valid_file,
        "output_invalid_file": invalid_file,
        "record_count": len(raw_data),
        "valid_count": len(results),
        "invalid_count": len(invalids),
        "model": MODEL_NAME,
        "quantized": bool(quantized),
        "device": str(next(model.parameters()).device)
    }

    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"✅ Saved {len(results)} valid → {valid_output_path}")
    print(f"⚠️ Saved {len(invalids)} invalid → {invalid_output_path}")
    print(f"📄 Metadata → {metadata_path}")

    # === Google Drive Sync (Colab only) ===
    if google_drive_sync and is_running_in_colab():
        drive_base = Path("/content/drive/MyDrive") / drive_subdir
        drive_base.mkdir(parents=True, exist_ok=True)

        for produced in [valid_output_path, invalid_output_path, metadata_path]:
            if produced.exists():
                dest = drive_base / produced.name
                # shutil.move falls back to copy-and-delete when a plain
                # rename is impossible, e.g. when the destination lives on a
                # different filesystem than the local output directory.
                shutil.move(str(produced), str(dest))
                print(f"📂 Synced to Google Drive: {dest}")


## Run Phase 2 End-to-End

In [None]:
def run_phase2_structured_normalization():
    """Normalize the Phase 1 exports: resumes first, then job descriptions."""
    # NOTE(review): the prefixes already end in ".json" and normalize_and_save
    # appends "_<timestamp>_<batch_id>.json", so output names contain ".json"
    # twice — confirm whether that is intended before changing it.
    jobs = (
        ("parsed_resumes.json", "normalized_resumes.json", True),
        ("parsed_jds.json", "normalized_jds.json", False),
    )
    for source_name, output_prefix, resume_flag in jobs:
        normalize_and_save(source_name, output_prefix, is_resume=resume_flag)


run_phase2_structured_normalization()
