# Global setup and package installation used in most phases

## Colab + GPU Detection Utilities

In [1]:
import subprocess

def is_running_in_colab():
    """Report whether the notebook is executing inside Google Colab.

    Detection works by attempting to import ``google.colab``.

    Returns:
        bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

def get_available_gpu_memory_gb():
    """Return the free memory of the first GPU listed by ``nvidia-smi``.

    The first output line of the ``memory.free`` query is parsed as an integer
    (treated as megabytes) and divided by 1024.

    Returns:
        float: The converted value, or 0.0 when the command cannot be run or
        its output cannot be parsed (any exception is treated as "no GPU").
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    try:
        report = subprocess.check_output(query, encoding="utf-8")
        # Only the first reported device is considered.
        first_entry = report.strip().split("\n")[0]
        return int(first_entry) / 1024
    except Exception:
        return 0.0


## Install dependencies

In [None]:
# Install third-party packages. The commands differ by environment:
# Colab uses `!pip` shell calls, a local kernel uses the `%pip` magic.
if is_running_in_colab():
    # Install the required packages
    # (bitsandbytes is installed together with the transformers stack here)
    !pip install kagglehub pandas
    !pip install -q transformers accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
    !pip install langid
else:
    %pip install kagglehub pandas
    %pip install -q transformers accelerate sentencepiece pydantic huggingface_hub xformers
    # Alternative torch installs kept for reference (pinned 2.2.2 / latest stable, both cu121):
    #%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
    #%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    # Active choice: pre-release (nightly) torch wheels from the cu128 index.
    %pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
    # bitsandbytes is installed/upgraded separately, after torch.
    %pip install -U bitsandbytes
    %pip install langid


## Log in to Hugging Face

In [2]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order:
#   1. the HUGGINGFACE_TOKEN environment variable,
#   2. the 'HF_TOKEN' Colab secret (only when running in Colab),
#   3. a hidden interactive prompt (only outside Colab).
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    if is_running_in_colab():
        # If running in Colab, use the Colab secrets
        try:
            from google.colab import userdata
            HF_TOKEN = userdata.get('HF_TOKEN')
            if not HF_TOKEN:
                # Not an ImportError, so this propagates and stops the cell.
                raise ValueError("⚠️ Hugging Face token not found in Colab secrets.")
            print("🔑 Hugging Face token found in Colab secrets.")
        except ImportError:
            print("⚠️ Unable to authenticate in Colab. Please set your Hugging Face token manually.")
    else:
        # Prompt for the token if it is not set in the environment.
        # getpass hides the typed value; input() would echo the secret into
        # the cell output, which is then saved with the notebook.
        from getpass import getpass
        HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

# HF_TOKEN may still be None here (Colab import failure above); it is passed
# through unchanged, exactly as before.
login(token=HF_TOKEN)


## Setup Kaggle Credentials

In [3]:
import shutil

def setup_kaggle_credentials():
    """Make sure ``~/.kaggle/kaggle.json`` exists, uploading it in Colab if needed.

    When the file is already present nothing is changed. Otherwise the Colab
    upload widget is used to obtain the file, which is then moved into place
    and restricted to owner read/write (mode 0o600).

    Raises:
        FileNotFoundError: If the credentials are missing and the Colab upload
            helper is unavailable (i.e. not running in Colab), or if the
            upload dialog returned no file.
    """
    kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if os.path.exists(kaggle_path):
        print(f"✅ Kaggle credentials already exist at {kaggle_path}")
        return

    try:
        from google.colab import files
    except ImportError as exc:
        # Outside Colab there is no upload widget; fail with an actionable
        # message instead of an opaque "No module named 'google.colab'".
        raise FileNotFoundError(
            f"❌ Kaggle credentials not found at {kaggle_path}. "
            "Outside Colab, place your kaggle.json there manually."
        ) from exc

    print("📂 Upload kaggle.json file...")
    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("❌ No file was uploaded; Kaggle credentials were not set up.")

    uploaded_names = list(uploaded.keys())
    # Prefer a file literally named kaggle.json; otherwise use the last upload,
    # which is the one that previously ended up at the destination path.
    source_name = "kaggle.json" if "kaggle.json" in uploaded_names else uploaded_names[-1]

    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    shutil.move(source_name, kaggle_path)
    os.chmod(kaggle_path, 0o600)
    print(f"✅ Kaggle credentials setup at {kaggle_path}")

# Run the credential check immediately; a later cell invokes the `kaggle` CLI,
# which presumably reads this file -- confirm against the Kaggle API docs.
setup_kaggle_credentials()

✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json


## Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; this cell is a no-op outside Colab.
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')

# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Load Resume and JD datasets
2. Minimal Parsing into JSON Structure
3. Save structured JSON for Phase 2

## Util Classes and methods

### Configuration

In [17]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Notebook-wide settings shared by the download, export and cleanup helpers."""

    # Folder that receives downloaded archives and their extracted contents.
    DATASET_DOWNLOAD_DIR = "datasets"

    # Folder that receives the generated JSON files.
    JSON_OUTPUT_DIR = "json_outputs_sample_data"

    # When True, the processing functions delete downloaded artifacts afterwards.
    AUTO_CLEANUP = False




### Downloader 

In [18]:

# ==============================
# DOWNLOADER
# ==============================
import zipfile
from pathlib import Path

class DatasetDownloader:
    """Downloads Kaggle datasets with the ``kaggle`` CLI and unpacks them locally."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download a Kaggle dataset archive and extract it, unless already present.

        The archive is stored in ``Config.DATASET_DOWNLOAD_DIR`` and extracted
        into a sub-folder named after the last path segment of ``dataset_path``.
        When that folder already contains at least one ``*.csv`` file
        (searched recursively) the download and extraction are skipped.

        Args:
            dataset_path: Kaggle dataset identifier, e.g. ``"owner/dataset-slug"``.

        Returns:
            A tuple ``(extract_folder_path, zip_filename)``.

        Raises:
            FileNotFoundError: If the ``kaggle`` executable cannot be started,
                or if the expected zip file is absent after the download attempt.
        """
        # Function-scope import keeps this cell runnable on its own.
        import subprocess

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) means dataset_path is passed verbatim and
        # can never be interpreted as shell syntax.
        command = ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, errors="replace")
        except FileNotFoundError as exc:
            raise FileNotFoundError(
                "❌ The 'kaggle' command-line tool could not be started; is it installed and on PATH?"
            ) from exc

        # Relay the CLI output so it still shows up in the notebook cell.
        if completed.stdout:
            print(completed.stdout.rstrip())
        if completed.stderr:
            print(completed.stderr.rstrip())

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [19]:

# ==============================
# LOADER
# ==============================
import pandas as pd


class DatasetLoader:
    """Reads CSV files out of an extracted dataset folder."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Load the first CSV under ``dataset_folder`` whose name matches.

        The folder tree is walked with ``os.walk`` and file names are compared
        case-insensitively against ``target_csv_name``.

        Raises:
            FileNotFoundError: If the folder does not exist or no file matches.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        wanted = target_csv_name.lower()
        for current_dir, _subdirs, filenames in os.walk(dataset_folder):
            match = next((name for name in filenames if name.lower() == wanted), None)
            if match is None:
                continue
            frame = pd.read_csv(os.path.join(current_dir, match))
            print(f"✅ Loaded CSV with shape {frame.shape}")
            return frame

        raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")

    @staticmethod
    def load_all_linkedin_jd_files(dataset_folder: str) -> dict:
        """Load the five CSVs used for job-description processing.

        Returns:
            dict: Table name (``postings``, ``skills``, ``industries``,
            ``job_skills``, ``job_industries``) mapped to its DataFrame.
        """
        print("📦 Loading all relevant CSVs for JD processing...")
        table_names = ("postings", "skills", "industries", "job_skills", "job_industries")
        return {
            table: DatasetLoader.load_csv(dataset_folder, f"{table}.csv")
            for table in table_names
        }



### Data Processor

In [20]:

# ==============================
# PROCESSOR
# ==============================
import os
import pandas as pd
from typing import List
from functools import lru_cache
import langid


class DatasetProcessor:
    """Column filtering, JSON export and the job-description enrichment pipeline.

    All helpers are static and return new DataFrames; none of them modifies
    the frame it is given.
    """

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the given order.

        Raises:
            ValueError: If any requested field is not a column of ``df``.
        """
        missing_fields = [field for field in allowed_fields if field not in df.columns]
        if missing_fields:
            raise ValueError(f"❌ Fields {missing_fields} not found in dataset!")

        filtered_df = df[allowed_fields]
        print(f"✅ Filtered columns: {list(filtered_df.columns)}")
        return filtered_df

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` to ``Config.JSON_OUTPUT_DIR`` as one JSON record per line.

        An existing file with the same name is deleted first (and reported).
        """
        os.makedirs(Config.JSON_OUTPUT_DIR, exist_ok=True)
        output_path = os.path.join(Config.JSON_OUTPUT_DIR, output_json_name)

        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"🗑️ Existing JSON '{output_path}' deleted.")

        df.to_json(output_path, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{output_path}'")

    @staticmethod
    def detect_languages_langid_cached(texts: List[str]) -> List[str]:
        """Classify each text with ``langid`` and return the language codes.

        Non-string entries yield ``"unknown"``. The cache is created per call,
        so it only avoids re-classifying repeated texts within one ``texts`` list.
        """
        @lru_cache(maxsize=10000)
        def detect_lang(text: str) -> str:
            return langid.classify(text)[0] if isinstance(text, str) else "unknown"
        return [detect_lang(text) for text in texts]

    @staticmethod
    def print_row_stats(initial: int, deduped: int, enriched: int, quality: int, final: int):
        """Print the row count remaining after each pipeline stage."""
        print("\n📊 Row Reduction Summary:")
        print(f"🔹 Initial rows in 'postings.csv':     {initial}")
        print(f"🔹 After deduplication:               {deduped}")
        print(f"🔹 After domain/industry enrichment: {enriched}")
        print(f"🔹 After quality filtering:          {quality}")
        print(f"🔹 After language filtering:         {final}")

    @staticmethod
    def _deduplicate_postings(postings: pd.DataFrame) -> pd.DataFrame:
        """Drop rows without a description, then rows with a repeated description."""
        return postings.dropna(subset=["description"]).drop_duplicates(subset=["description"]).copy()

    @staticmethod
    def _enrich_industry(postings: pd.DataFrame, job_industries: pd.DataFrame, industries: pd.DataFrame) -> pd.DataFrame:
        """Attach one industry per job and expose its name as column ``industry``.

        For each ``job_id`` the most frequent ``industry_id`` is chosen, then
        joined (left joins) to ``postings`` and to the ``industries`` lookup.
        """
        job_industry_map = (
            job_industries
            .groupby("job_id")["industry_id"]
            .agg(lambda x: x.value_counts().index[0])
            .reset_index()
        )
        enriched = postings.merge(job_industry_map, on="job_id", how="left")
        enriched = enriched.merge(industries, on="industry_id", how="left")
        return enriched.rename(columns={"industry_name": "industry"})

    @staticmethod
    def _enrich_skills_description(postings: pd.DataFrame, job_skills: pd.DataFrame, skills: pd.DataFrame) -> tuple[pd.DataFrame, int]:
        """Fill blank ``skills_desc`` values from the per-job skill names.

        Skill names are joined per ``job_id`` as a sorted, de-duplicated,
        comma-separated string. Only rows whose ``skills_desc`` is missing or
        whitespace-only are filled, and only when skill names exist for the job.
        The step is best-effort: if either table lacks ``skill_abr`` nothing is
        done, and on any error a message is printed and ``postings`` is
        returned unchanged.

        Returns:
            A tuple ``(postings, filled_count)``.
        """
        try:
            if "skill_abr" not in job_skills.columns or "skill_abr" not in skills.columns:
                return postings, 0

            named_job_skills = job_skills.merge(skills, on="skill_abr", how="left")
            skill_map = (
                named_job_skills
                .dropna(subset=["skill_name"])
                .groupby("job_id")["skill_name"]
                .agg(lambda x: ", ".join(sorted(set(x))))
                .reset_index()
                .rename(columns={"skill_name": "merged_skills_text"})
            )
            merged = postings.merge(skill_map, on="job_id", how="left")

            current = merged["skills_desc"]
            # Same blank test as before, applied per value instead of per row.
            is_blank = current.map(
                lambda value: pd.isna(value) or str(value).strip() == ""
            ).astype(bool)
            fillable = is_blank & merged["merged_skills_text"].notna()

            merged["skills_desc"] = current.where(~fillable, merged["merged_skills_text"])
            return merged.drop(columns=["merged_skills_text"]), int(fillable.sum())
        except Exception as e:
            # Deliberate best-effort: enrichment is optional, so report and continue.
            print(f"⚠️ Skill enrichment skipped due to error: {e}")
            return postings, 0

    @staticmethod
    def _filter_postings(postings: pd.DataFrame) -> pd.DataFrame:
        """Keep postings that pass the quality rules.

        Rules: description longer than 300 characters; when the column exists,
        lower-cased ``formatted_work_type`` is ``full-time`` or ``permanent``;
        non-blank ``skills_desc``; non-blank ``industry``.
        """
        kept = postings[postings["description"].str.len() > 300]
        if "formatted_work_type" in kept.columns:
            # assign() builds a new frame, so no write ever lands on a slice
            # of the caller's data (avoids SettingWithCopyWarning).
            kept = kept.assign(formatted_work_type=kept["formatted_work_type"].str.lower())
            kept = kept[kept["formatted_work_type"].isin(["full-time", "permanent"])]

        kept = kept[kept["skills_desc"].notna() & (kept["skills_desc"].str.strip() != "")]
        kept = kept[kept["industry"].notna() & (kept["industry"].str.strip() != "")]
        return kept

    @staticmethod
    def _filter_language(postings: pd.DataFrame) -> pd.DataFrame:
        """Keep only postings whose description is classified as English (``en``)."""
        print("🌍 Detecting languages using langid...")
        languages = DatasetProcessor.detect_languages_langid_cached(postings["description"].tolist())
        # The helper column lives on a temporary copy, not on the input frame.
        tagged = postings.assign(lang=languages)
        return tagged[tagged["lang"] == "en"].drop(columns=["lang"])

    @staticmethod
    def process_and_save_enriched_jds(jd_data: dict) -> pd.DataFrame:
        """Run deduplication, enrichment, quality and language filtering.

        Despite the name, nothing is written to disk here; the resulting
        DataFrame is returned with a fresh 0..n-1 index.

        Args:
            jd_data: Mapping with the keys ``postings``, ``job_industries``,
                ``industries``, ``job_skills`` and ``skills``.
        """
        initial_rows = len(jd_data["postings"])

        postings = DatasetProcessor._deduplicate_postings(jd_data["postings"])
        rows_after_dedup = len(postings)

        postings = DatasetProcessor._enrich_industry(postings, jd_data["job_industries"], jd_data["industries"])
        postings, filled_skills_desc_count = DatasetProcessor._enrich_skills_description(postings, jd_data["job_skills"], jd_data["skills"])
        print(f"🧠 Enriched 'skills_desc' from skills.csv for {filled_skills_desc_count} records.")
        rows_before_quality = len(postings)

        postings = DatasetProcessor._filter_postings(postings)
        rows_after_quality = len(postings)

        postings = DatasetProcessor._filter_language(postings)
        final_rows = len(postings)

        DatasetProcessor.print_row_stats(
            initial_rows,
            rows_after_dedup,
            rows_before_quality,
            rows_after_quality,
            final_rows
        )

        return postings.reset_index(drop=True)



### Cleanup

In [21]:
# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Deletes dataset artifacts left on disk by the ZIP-based download path."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Remove the extracted folder and the downloaded archive, if present.

        Args:
            extracted_folder_path: Folder to delete recursively.
            zip_filename: Archive name, resolved inside ``Config.DATASET_DOWNLOAD_DIR``.
        """
        archive_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        # (path, removal function, confirmation message) -- handled in order.
        targets = (
            (
                extracted_folder_path,
                shutil.rmtree,
                f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.",
            ),
            (
                archive_path,
                os.remove,
                f"🗑️ Zip file '{archive_path}' has been deleted successfully.",
            ),
        )
        for target_path, remove, confirmation in targets:
            if os.path.exists(target_path):
                remove(target_path)
                print(confirmation)

### Hybrid Data loader

In [22]:

# ==============================
# HYBRID LOADER
# ==============================
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Loads a dataset file via KaggleHub, falling back to download-and-extract."""

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Return ``file_name`` from the Kaggle dataset ``dataset_path`` as a DataFrame.

        KaggleHub is tried first when the module was importable. Any exception
        raised on that path is reported and the ZIP-based loader is used instead.
        """
        def _load_from_zip() -> pd.DataFrame:
            folder, _zip_name = DatasetDownloader.download_and_extract(dataset_path)
            return DatasetLoader.load_csv(folder, file_name)

        if not kagglehub:
            return _load_from_zip()

        try:
            print(f"📥 Trying KaggleHub for {dataset_path}...")
            frame = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
            print(f"✅ Loaded using KaggleHub: shape = {frame.shape}")
            return frame
        except Exception as error:
            print(f"⚠️ KaggleHub failed: {error}\nFalling back to ZIP-based loader.")

        return _load_from_zip()



### Filter high quality JDs

In [23]:
class JobDescriptionFilter:
    """Quality filters that narrow enriched job postings to well-formed ones.

    Every filter returns a new selection and leaves the input frame untouched;
    no helper columns are added to the caller's DataFrame.
    """

    @staticmethod
    def filter_required_fields(df: pd.DataFrame) -> pd.DataFrame:
        """Keep rows whose ``title``, ``industry`` and ``description`` are non-blank."""
        return df[
            df["title"].notna() & df["title"].str.strip().ne("") &
            df["industry"].notna() & df["industry"].str.strip().ne("") &
            df["description"].notna() & df["description"].str.strip().ne("")
        ]

    @staticmethod
    def filter_structured_description(df: pd.DataFrame) -> pd.DataFrame:
        """Keep rows whose description mentions responsibilities, requirements or qualifications.

        The match is a case-insensitive search for the stems
        ``responsibilit``, ``requirement`` or ``qualif``.
        """
        return df[df["description"].str.contains("responsibilit|requirement|qualif", case=False, na=False)]

    @staticmethod
    def filter_min_description_length(df: pd.DataFrame, min_words: int = 150) -> pd.DataFrame:
        """Keep rows whose description has at least ``min_words`` whitespace-separated words."""
        word_counts = df["description"].str.split().str.len()
        return df[word_counts >= min_words].copy()

    @staticmethod
    def filter_min_skills(df: pd.DataFrame, min_skills: int = 2) -> pd.DataFrame:
        """Keep rows whose ``skills_desc`` has at least ``min_skills`` comma-separated entries."""
        skill_counts = df["skills_desc"].str.split(",").str.len()
        return df[skill_counts >= min_skills].copy()

    @staticmethod
    def filter_top_industries(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
        """Keep rows belonging to the ``top_n`` most frequent industries."""
        top_industries = df["industry"].value_counts().nlargest(top_n).index
        return df[df["industry"].isin(top_industries)].copy()

    @staticmethod
    def deduplicate_by_title_and_description(df: pd.DataFrame) -> pd.DataFrame:
        """Drop rows repeating an earlier (title, description) pair, ignoring case."""
        # The key is kept in a standalone Series instead of a temporary column.
        dedupe_key = df["title"].str.lower() + "|" + df["description"].str.lower()
        return df[~dedupe_key.duplicated()].copy()

    @staticmethod
    def apply_all_filters(df: pd.DataFrame) -> pd.DataFrame:
        """Apply every filter in sequence and print the row count after each.

        Order: required fields, structure check, deduplication, at least 2
        skills, at least 300 words, top 30 industries.

        Returns:
            The surviving rows with a fresh 0..n-1 index.
        """
        original = len(df)

        df = JobDescriptionFilter.filter_required_fields(df)
        f1 = len(df)

        df = JobDescriptionFilter.filter_structured_description(df)
        f2 = len(df)

        df = JobDescriptionFilter.deduplicate_by_title_and_description(df)
        f3 = len(df)

        df = JobDescriptionFilter.filter_min_skills(df, min_skills=2)
        f4 = len(df)

        df = JobDescriptionFilter.filter_min_description_length(df, min_words=300)
        f5 = len(df)

        df = JobDescriptionFilter.filter_top_industries(df, top_n=30)
        f6 = len(df)

        print("\n📊 High-Quality JD Filter Reduction Summary:")
        print(f"🔹 Initial records:           {original}")
        print(f"🔹 After required fields:     {f1}")
        print(f"🔹 After structure check:     {f2}")
        print(f"🔹 After deduplication:       {f3}")
        print(f"🔹 After min skills (2):      {f4}")
        print(f"🔹 After min length (300):    {f5}")
        print(f"🔹 After top industries (30): {f6}")

        return df.reset_index(drop=True)


In [24]:
def sample_jds_top_industries(df: pd.DataFrame, top_n: int = 20, per_industry: int = 2) -> pd.DataFrame:
    """Draw a fixed-seed sample of postings from the most frequent industries.

    Args:
        df: Postings with an ``industry`` column.
        top_n: Number of most frequent industries to sample from.
        per_industry: Maximum rows drawn per industry (fewer if the industry
            has fewer rows).

    Returns:
        The sampled rows, grouped in industry sort order, with a fresh index
        and all original columns (including ``industry``).
    """
    top_industries = df["industry"].value_counts().nlargest(top_n).index
    df_top = df[df["industry"].isin(top_industries)]

    # Iterating the groups (instead of groupby.apply) keeps the grouping
    # column in the result on every pandas version and avoids the
    # deprecation warning for apply operating on grouping columns.
    picks = [
        group.sample(n=min(len(group), per_industry), random_state=42)
        for _, group in df_top.groupby("industry")
    ]
    # pd.concat rejects an empty list, so an empty input yields an empty frame.
    sample_df = pd.concat(picks) if picks else df_top.iloc[0:0]
    sample_df = sample_df.reset_index(drop=True)

    print(f"🧪 Sampled {len(sample_df)} JDs — {per_industry} per top-{top_n} industries.")
    return sample_df


### Load Resume and JD datasets

In [25]:
# ==============================
# Resume Dataset Loader (with caching)
# ==============================
def load_resume_dataset(dataset_path: str = "snehaanbhawal/resume-dataset", target_csv_name: str = "Resume.csv") -> pd.DataFrame:
    """Load the resume CSV once per ``(dataset_path, target_csv_name)`` pair.

    Results are memoised on the function object. The cache is keyed by the
    arguments, so asking for a different dataset or file triggers a fresh load
    instead of returning whatever was loaded first.

    Returns:
        The cached DataFrame. The same object is returned on every hit, so
        callers should not modify it in place.
    """
    cache = getattr(load_resume_dataset, "_cache", None)
    if not isinstance(cache, dict):
        cache = load_resume_dataset._cache = {}

    cache_key = (dataset_path, target_csv_name)
    if cache_key not in cache:
        print("📥 Loading resume dataset for the first time...")
        cache[cache_key] = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    else:
        print("✅ Using cached resume dataset.")

    return cache[cache_key]

# ==============================
# Job Description Dataset Loader (with caching)
# ==============================
def load_job_description_dataset(dataset_path: str = "arshkon/linkedin-job-postings") -> dict:
    """Load the five job-description CSVs once per ``dataset_path``.

    Results are memoised on the function object, keyed by ``dataset_path``,
    so a different dataset is loaded rather than served from a stale cache.

    Returns:
        dict: ``postings``, ``skills``, ``industries``, ``job_skills`` and
        ``job_industries`` mapped to their loaded tables. The same dict is
        returned on every cache hit.
    """
    cache = getattr(load_job_description_dataset, "_cache", None)
    if not isinstance(cache, dict) or any(not isinstance(entry, dict) for entry in cache.values()):
        # Also discards a cache left in the old single-dataset layout.
        cache = load_job_description_dataset._cache = {}

    if dataset_path not in cache:
        print("📥 Loading all job description CSVs from Kaggle dataset...")

        def hybrid_load(csv_name: str) -> pd.DataFrame:
            return HybridDatasetLoader.load_dataset(dataset_path, csv_name)

        cache[dataset_path] = {
            "postings": hybrid_load("postings.csv"),
            "skills": hybrid_load("skills.csv"),
            "industries": hybrid_load("industries.csv"),
            "job_skills": hybrid_load("job_skills.csv"),
            "job_industries": hybrid_load("job_industries.csv"),
        }
    else:
        print("✅ Using cached job description dataset files.")

    return cache[dataset_path]


### JD Dataset Processing Function

In [26]:
def process_and_save_enriched_jd_dataset(
    dataset_path: str,
    output_json_name: str,
    allowed_fields: List[str],
):
    """Build the full high-quality job-description set and save it as JSON.

    Steps: load the JD tables, enrich and filter them, apply the high-quality
    filters, keep only ``allowed_fields``, write the JSON file, and delete the
    downloaded artifacts when ``Config.AUTO_CLEANUP`` is set.
    """
    raw_tables = load_job_description_dataset(dataset_path)

    # Enrichment first, then the stricter high-quality filters.
    curated = JobDescriptionFilter.apply_all_filters(
        DatasetProcessor.process_and_save_enriched_jds(raw_tables)
    )

    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(curated, allowed_fields),
        output_json_name,
    )

    if Config.AUTO_CLEANUP:
        slug = dataset_path.split("/")[-1]
        Cleaner.cleanup_dataset_artifacts(
            os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
            f"{slug}.zip",
        )


In [27]:
def process_and_save_enriched_jd_dataset_sample(
    dataset_path: str,
    output_json_name: str,
    allowed_fields: List[str],
    top_n: int = 20,
    per_industry: int = 2,
):
    """Build a small per-industry sample of high-quality job descriptions and save it.

    Same pipeline as the full variant, with a sampling step after the column
    selection: up to ``per_industry`` postings from each of the ``top_n`` most
    frequent industries. Sampling reads the ``industry`` column, so it must be
    part of ``allowed_fields``.
    """
    raw_tables = load_job_description_dataset(dataset_path)

    # Enrichment first, then the stricter high-quality filters.
    curated = JobDescriptionFilter.apply_all_filters(
        DatasetProcessor.process_and_save_enriched_jds(raw_tables)
    )
    selected = DatasetProcessor.filter_fields(curated, allowed_fields)

    DatasetProcessor.save_to_json(
        sample_jds_top_industries(selected, top_n=top_n, per_industry=per_industry),
        output_json_name,
    )

    if Config.AUTO_CLEANUP:
        slug = dataset_path.split("/")[-1]
        Cleaner.cleanup_dataset_artifacts(
            os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
            f"{slug}.zip",
        )

### Resume Dataset Processing Function

In [28]:
# ==============================
# Resume Filtering (5 per category)
# ==============================
def filter_resumes_by_category(resume_df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Keep the first ``top_n`` resumes of every category.

    Rows without a ``Category`` are dropped first. The result lists the
    categories in sort order, keeps the original row order within each
    category, and has a fresh index.

    Raises:
        ValueError: If ``resume_df`` has no ``Category`` column.
    """
    if 'Category' not in resume_df.columns:
        raise ValueError("❌ Resume dataset does not contain 'Category' column.")

    categorised = resume_df.dropna(subset=['Category'])
    # Iterating the groups (instead of groupby.apply) keeps the 'Category'
    # column on every pandas version and avoids the deprecation warning for
    # apply operating on grouping columns.
    heads = [group.head(top_n) for _, group in categorised.groupby('Category')]
    # pd.concat rejects an empty list, so an empty input yields an empty frame.
    filtered_resumes = pd.concat(heads) if heads else categorised.iloc[0:0]
    filtered_resumes = filtered_resumes.reset_index(drop=True)

    print(f"✅ Filtered {len(filtered_resumes)} resumes (top {top_n} from each category).")
    return filtered_resumes


In [29]:

# ==============================
# MAIN FLOW
# ==============================

def process_dataset_resume(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Export every resume of suitable length to JSON.

    Resumes whose ``Resume_str`` is between 3000 and 8000 characters
    (inclusive) are kept, reduced to ``allowed_fields`` and saved. Downloaded
    artifacts are removed afterwards when ``Config.AUTO_CLEANUP`` is set.
    """
    resumes = load_resume_dataset(dataset_path, target_csv_name)

    # Length window: 3000–8000 characters.
    has_usable_length = resumes["Resume_str"].str.len().between(3000, 8000)
    selected = DatasetProcessor.filter_fields(resumes[has_usable_length], allowed_fields)
    DatasetProcessor.save_to_json(selected, output_json_name)

    if Config.AUTO_CLEANUP:
        slug = dataset_path.split("/")[-1]
        Cleaner.cleanup_dataset_artifacts(
            os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
            f"{slug}.zip",
        )


# ==============================
# Save Filtered Resumes
# ==============================
def process_and_save_sample_resumes(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Export a small sample (two resumes per category) to JSON.

    Resumes whose ``Resume_str`` is between 3000 and 8000 characters
    (inclusive) are considered; the first two of each category are kept,
    reduced to ``allowed_fields`` and saved. Downloaded artifacts are removed
    afterwards when ``Config.AUTO_CLEANUP`` is set.
    """
    resumes = load_resume_dataset(dataset_path, target_csv_name)

    # Length window: 3000–8000 characters.
    has_usable_length = resumes["Resume_str"].str.len().between(3000, 8000)
    per_category = filter_resumes_by_category(resumes[has_usable_length], top_n=2)

    selected = DatasetProcessor.filter_fields(per_category, allowed_fields)
    DatasetProcessor.save_to_json(selected, output_json_name)

    if Config.AUTO_CLEANUP:
        slug = dataset_path.split("/")[-1]
        Cleaner.cleanup_dataset_artifacts(
            os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
            f"{slug}.zip",
        )


## Log in and process the Resume and JD datasets

In [30]:
# Sample run: two resumes per category, written to parsed_resumes.json.
process_and_save_sample_resumes(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    output_json_name="parsed_resumes.json",
    allowed_fields=["ID", "Category", "Resume_str"],
)

📥 Loading resume dataset for the first time...
📥 Trying KaggleHub for snehaanbhawal/resume-dataset...
⚠️ KaggleHub failed: 404 Client Error.

Resource not found at URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset/versions/1
The server reported the following issues: Data not found
Please make sure you specified the correct resource identifiers.
Falling back to ZIP-based loader.
⚡ Dataset folder already exists at 'datasets\resume-dataset', skipping download and extraction.
🔍 Searching for 'Resume.csv' inside datasets\resume-dataset...
✅ Loaded CSV with shape (2484, 4)
✅ Filtered 48 resumes (top 2 from each category).
✅ Filtered columns: ['ID', 'Category', 'Resume_str']
🗑️ Existing JSON 'json_outputs_sample_data\parsed_resumes.json' deleted.
✅ Data saved to JSON at 'json_outputs_sample_data\parsed_resumes.json'


  .apply(lambda group: group.head(top_n))


In [None]:

# Process the full resume dataset.
# NOTE: writes parsed_resumes.json, the same file as the sample run above,
# so running this cell replaces the sampled output.
process_dataset_resume(
    dataset_path="snehaanbhawal/resume-dataset",
    target_csv_name="Resume.csv",
    output_json_name="parsed_resumes.json",
    allowed_fields=["ID", "Category", "Resume_str"],
)

In [None]:
# Full job-description run, written to parsed_jds.json.
process_and_save_enriched_jd_dataset(
    dataset_path="arshkon/linkedin-job-postings",
    output_json_name="parsed_jds.json",
    allowed_fields=[
        "job_id",
        "title",
        "industry",
        "company_name",
        "location",
        "skills_desc",
        "formatted_experience_level",
        "formatted_work_type",
        "description",
    ],
)


In [None]:
# Sampled job-description run: 2 postings from each of the 30 most frequent industries.
# NOTE: writes parsed_jds.json, the same file as the full run,
# so running this cell replaces that output.
process_and_save_enriched_jd_dataset_sample(
    dataset_path="arshkon/linkedin-job-postings",
    output_json_name="parsed_jds.json",
    allowed_fields=[
        "job_id",
        "title",
        "industry",
        "company_name",
        "location",
        "skills_desc",
        "formatted_experience_level",
        "formatted_work_type",
        "description",
    ],
    top_n=30,
    per_industry=2,
)
