In [None]:
%pip install pandas tqdm pdfplumber python-docx

In [None]:
%pip install kaggle kagglehub pandas-datasets

In [3]:
import os
import json
import pandas as pd

from tqdm import tqdm


In [4]:
#import os
import zipfile
#import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

def authenticate_kaggle():
    """Build a Kaggle API client, authenticate it, and hand it back.

    Credential lookup is delegated entirely to ``KaggleApi.authenticate()``
    (presumably a kaggle.json file or environment variables -- confirm against
    the Kaggle API documentation).

    Returns:
        KaggleApi: The authenticated client instance.
    """
    client = KaggleApi()
    client.authenticate()
    return client

def download_kaggle_dataset(api, dataset_slug: str, download_dir: str):
    """
    Download and extract a Kaggle dataset unless it has already been extracted.

    The archive is unpacked into a temporary ``<dataset name>.partial`` directory
    which is renamed to its final location only after extraction has finished.
    An interrupted or failed extraction therefore never leaves a half-populated
    directory that a later call would mistake for a completed download.

    Args:
        api (KaggleApi): Authenticated Kaggle API instance
        dataset_slug (str): The Kaggle dataset slug (e.g., 'snehaanbhawal/resume-dataset')
        download_dir (str): Directory where the dataset will be stored

    Returns:
        str: Path of the directory holding the extracted dataset
             (``<download_dir>/<last component of the slug>``)

    Raises:
        zipfile.BadZipFile: If the downloaded archive is not a valid zip file.
    """
    import shutil  # local import: only needed to clean up the staging directory

    os.makedirs(download_dir, exist_ok=True)

    dataset_name = dataset_slug.split("/")[-1]
    zip_path = os.path.join(download_dir, f"{dataset_name}.zip")
    extract_path = os.path.join(download_dir, dataset_name)
    staging_path = f"{extract_path}.partial"

    # The final directory only ever appears via the rename below, so its
    # existence means a previous extraction ran to completion.
    if os.path.exists(extract_path):
        print(f"✅ Dataset already extracted at '{extract_path}', skipping download.")
        return extract_path

    print(f"⬇️ Downloading dataset '{dataset_slug}'...")
    api.dataset_download_files(dataset_slug, path=download_dir, quiet=False)

    print(f"📦 Extracting dataset '{zip_path}'...")
    # Discard leftovers from an earlier interrupted run, then create the staging
    # directory up front so the rename always has a source to move.
    shutil.rmtree(staging_path, ignore_errors=True)
    os.makedirs(staging_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(staging_path)
    # Publish the result in a single step once every member has been written.
    os.rename(staging_path, extract_path)
    print(f"✅ Extraction completed at '{extract_path}'.")

    return extract_path

def find_csv_file(root_path: str, target_csv: str) -> str:
    """
    Walk a directory tree and return the path of the first file named ``target_csv``.

    Args:
        root_path (str): Directory the walk starts from
        target_csv (str): Exact file name to look for

    Returns:
        str: Path of the first match, built by joining the containing
             directory with ``target_csv``

    Raises:
        FileNotFoundError: If no file with that name exists under ``root_path``
    """
    for current_dir, _subdirs, file_names in os.walk(root_path):
        # Skip directories that do not contain the file we are after.
        if target_csv not in file_names:
            continue
        match = os.path.join(current_dir, target_csv)
        print(f"✅ Found CSV at: {match}")
        return match

    # The walk finished without a single hit.
    raise FileNotFoundError(f"❌ CSV file '{target_csv}' not found inside '{root_path}'.")

def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame:
    """
    Read a CSV file with pandas' default parsing options and report its shape.

    Args:
        csv_path (str): Location of the CSV file to read

    Returns:
        pd.DataFrame: The parsed contents of the file
    """
    print(f"📖 Loading CSV file '{csv_path}'...")
    frame = pd.read_csv(csv_path)
    print(f"✅ CSV loaded successfully. Shape: {frame.shape}")
    return frame

def download_and_load_kaggle_dataset(api, dataset_slug: str, 
                                     download_dir: str = "datasets", 
                                     csv_filename: str = None) -> pd.DataFrame:
    """
    Fetch one Kaggle dataset and return a single CSV from it as a DataFrame.

    Chains the three lower-level steps: download/extract the dataset, locate
    the requested CSV inside the extracted tree, and read it with pandas.

    Args:
        api (KaggleApi): Authenticated Kaggle API instance
        dataset_slug (str): The Kaggle dataset slug
        download_dir (str): Local directory to store the datasets
        csv_filename (str): Name of the CSV file inside the dataset to load;
            required despite having a default of None

    Returns:
        pd.DataFrame: Contents of the requested CSV file

    Raises:
        ValueError: If ``csv_filename`` is None
    """
    # Fail before any download work when the caller did not name a file.
    if csv_filename is None:
        raise ValueError("You must specify the CSV filename to load.")

    dataset_dir = download_kaggle_dataset(api, dataset_slug, download_dir)
    return load_csv_to_dataframe(find_csv_file(dataset_dir, csv_filename))

def download_and_load_multiple_datasets(dataset_info: list, download_dir: str = "datasets") -> dict:
    """
    Authenticate once, then download and load every dataset described in ``dataset_info``.

    Args:
        dataset_info (list): List of dicts. Each needs 'slug' (Kaggle dataset
            slug) and 'csv' (file name to load); an optional 'key' names the
            entry in the result and defaults to the last component of the slug
        download_dir (str): Directory to store datasets

    Returns:
        dict: Mapping from each entry's key to its loaded DataFrame
    """
    client = authenticate_kaggle()
    frames_by_key = {}

    for entry in dataset_info:
        slug = entry['slug']
        print(f"\n🚀 Processing dataset: {slug}")
        frame = download_and_load_kaggle_dataset(client, slug, download_dir, entry['csv'])
        # Fall back to the dataset name when no explicit key was supplied.
        result_key = entry.get('key', slug.split('/')[-1])
        frames_by_key[result_key] = frame

    print("\n✅ All datasets loaded successfully.")
    return frames_by_key


In [None]:
# Datasets to fetch. Each entry is consumed by download_and_load_multiple_datasets:
#   "slug" - Kaggle dataset identifier handed to the download step
#   "csv"  - name of the CSV file to locate inside the extracted dataset
#   "key"  - name under which the loaded DataFrame is stored in `datasets`
dataset_info = [
    {
        "slug": "snehaanbhawal/resume-dataset",
        "csv": "Resume.csv",
        "key": "resume_data"
    },
    {
        "slug": "arshkon/linkedin-job-postings",
        "csv": "postings.csv",
        "key": "job_postings"
    }
]

# Authenticates with Kaggle, downloads/extracts each dataset that is not already
# on disk, and returns a dict mapping each "key" above to its DataFrame.
datasets = download_and_load_multiple_datasets(dataset_info)


In [None]:
# Resume Data
resume_df = datasets["resume_data"]
print(resume_df.shape)
# Only the final expression of a cell is rendered automatically, so this
# mid-cell preview has to go through display() or it is silently discarded.
display(resume_df.head())

# Job Postings Data
job_postings_df = datasets["job_postings"]
print(job_postings_df.shape)
job_postings_df.head()


In [7]:
# Directories that the structured JSON output is written into (they are joined
# with the output file names when the structured records are saved).
OUTPUT_JSON_DIR_RESUME = 'structured_resumes'
OUTPUT_JSON_DIR_JOB_POSTINGS = 'structured_job_postings'


In [8]:
def structure_resume_from_csv(row):
    """Build the structured-resume skeleton for one row of the resume CSV.

    Only two values are taken from the row: 'Category' (stored under
    basics.category) and 'Resume_str' (stored as raw_text); each falls back to
    '' when the key is absent. Every other field is a placeholder: the
    remaining basics entries are None and the section lists are empty.

    Args:
        row: Mapping-like object exposing ``.get(key, default)`` (for example a
            pandas Series produced by ``iterrows()`` or a plain dict).

    Returns:
        dict: The structured resume record.
    """
    # Contact and summary details are not extracted yet, so they start as None.
    basics = dict.fromkeys(("name", "email", "phone", "location", "summary"))
    basics["category"] = row.get('Category', '')  # dataset label

    resume = {"basics": basics}
    # Each section gets its own fresh, empty list.
    for section in ("education", "experience", "skills", "certifications", "projects"):
        resume[section] = []
    resume["raw_text"] = row.get('Resume_str', '')  # full resume text
    return resume


In [None]:
def structure_jd_from_csv(row):
    """Build a structured job-description record from one row of the postings CSV.

    Every field is read with ``row.get(column, '')``, so a column that is absent
    from the row yields ''. A cell that is present but empty reaches this
    function as a float NaN; it is converted to None so the saved record
    contains JSON ``null`` instead of the non-standard ``NaN`` token that
    ``json.dump`` would otherwise write.

    NOTE(review): the column names used here (e.g. 'posted_date',
    'job_function') are not validated against the CSV -- confirm they exist,
    because a misspelled or missing column silently produces ''.

    Args:
        row: Mapping-like object exposing ``.get(key, default)`` (for example a
            pandas Series produced by ``iterrows()`` or a plain dict).

    Returns:
        dict: The structured job-description record.
    """
    def field(column):
        value = row.get(column, '')
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return None
        return value

    return {
        "basics": {
            "title": field('title'),
            "company": field('company_name'),
            "location": field('location')
        },
        "description": field('description'),
        "skills_description": field('skills_desc'),
        "job_posted_date": field('posted_date'),
        "job_id": field('job_id'),
        "seniority_level": field('formatted_experience_level'),
        "employment_type": field('formatted_work_type'),
        "job_function": field('job_function')
    }



In [None]:
# Convert every resume row into its structured record.
structured_resumes = []

for idx, row in tqdm(resume_df.iterrows(), total=len(resume_df)):
    resume_obj = structure_resume_from_csv(row)
    structured_resumes.append(resume_obj)

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh run fails with
# FileNotFoundError.
os.makedirs(OUTPUT_JSON_DIR_RESUME, exist_ok=True)

# Save parsed resumes
with open(os.path.join(OUTPUT_JSON_DIR_RESUME, "structured_resumes.json"), "w") as f:
    json.dump(structured_resumes, f, indent=2)


  0%|          | 0/2484 [00:00<?, ?it/s]

100%|██████████| 2484/2484 [00:00<00:00, 15680.68it/s]


In [None]:
# Convert every job-posting row into its structured record.
structured_jds = []

for idx, row in tqdm(job_postings_df.iterrows(), total=len(job_postings_df)):
    jd_obj = structure_jd_from_csv(row)
    structured_jds.append(jd_obj)

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh run fails with
# FileNotFoundError.
os.makedirs(OUTPUT_JSON_DIR_JOB_POSTINGS, exist_ok=True)

# Save parsed JDs
with open(os.path.join(OUTPUT_JSON_DIR_JOB_POSTINGS, "structured_jds.json"), "w") as f:
    json.dump(structured_jds, f, indent=2)


100%|██████████| 123849/123849 [00:09<00:00, 12997.69it/s]
