# Phase 1: First Steps Notebook — Data Ingestion + Minimal Parsing
1. Setup and Install Dependencies
2. Load the Resume and Job Description (JD) datasets
3. Minimal Parsing into JSON Structure
4. Save structured JSON for Phase 2

## Setup and Install Dependencies

In [10]:
%pip install kagglehub pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Utility Classes and Methods

### Configurations  

In [11]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
import os
import shutil
import zipfile
import pandas as pd
from pathlib import Path
from typing import List

class Config:
    """Shared settings plus the Kaggle credential bootstrap for this notebook."""

    # Directory (relative to the working directory) that receives downloaded
    # archives and their extracted contents.
    DATASET_DOWNLOAD_DIR = "datasets"
    # Directory that receives the JSON files written by the processing step.
    JSON_OUTPUT_DIR = "json_outputs"
    # When True, process_dataset deletes the downloaded archive and the
    # extracted folder once the JSON has been saved.
    AUTO_CLEANUP = True

    @staticmethod
    def setup_kaggle_credentials():
        """Ensure ``~/.kaggle/kaggle.json`` exists, asking for an upload if not.

        When the credentials file is already present nothing is changed.
        Otherwise the Colab upload helper is used to obtain the file, which
        is then moved into place and restricted to owner read/write.

        Raises:
            ImportError: the credentials file is missing and ``google.colab``
                cannot be imported (i.e. not running inside Colab).
            FileNotFoundError: the upload dialog returned no file.
        """
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if os.path.exists(kaggle_path):
            print(f"✅ Kaggle credentials already exist at {kaggle_path}")
            return

        try:
            from google.colab import files
        except ImportError:
            # Same exception as before, but tell the user how to recover.
            print(
                f"❌ Kaggle credentials not found and google.colab is unavailable; "
                f"copy your kaggle.json to {kaggle_path} manually."
            )
            raise

        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()
        if not uploaded:
            # Fail here with a clear message instead of later inside os.chmod.
            raise FileNotFoundError("❌ No file was uploaded; Kaggle credentials were not set up.")

        # Only one file can become kaggle.json: prefer an upload with exactly
        # that name, otherwise take the first one, rather than moving every
        # upload onto the same destination and letting the last one win.
        uploaded_names = list(uploaded)
        chosen = "kaggle.json" if "kaggle.json" in uploaded_names else uploaded_names[0]

        os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
        shutil.move(chosen, kaggle_path)
        os.chmod(kaggle_path, 0o600)  # owner read/write only
        print(f"✅ Kaggle credentials setup at {kaggle_path}")



### Downloader 

In [12]:

# ==============================
# DOWNLOADER
# ==============================
class DatasetDownloader:
    """Downloads a Kaggle dataset archive with the ``kaggle`` CLI and unpacks it."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` as a ZIP and extract it under the download dir.

        The download is skipped when the extraction folder already exists and
        contains at least one ``.csv`` file anywhere beneath it.

        Args:
            dataset_path: Kaggle dataset identifier; the part after the last
                ``/`` is used as the folder name and the archive base name.

        Returns:
            ``(extract_folder_path, zip_filename)`` where the folder path
            includes the download directory and the zip name does not.

        Raises:
            FileNotFoundError: the expected archive is not present after the
                download attempt.
        """
        # Local import keeps this block self-contained; the file's import
        # cell does not bring in subprocess.
        import subprocess

        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)
        dataset_slug = dataset_path.split("/")[-1]
        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob("*.csv")):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) replaces the notebook-only `!kaggle ...`
        # escape: it is plain Python and dataset_path is never parsed by a shell.
        command = [
            "kaggle", "datasets", "download",
            "-d", dataset_path,
            "-p", Config.DATASET_DOWNLOAD_DIR,
        ]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, errors="replace")
        except FileNotFoundError:
            # CLI missing: report it, then let the archive check below decide.
            print("❌ 'kaggle' command not found; install it with `pip install kaggle`.")
        else:
            # Echo the CLI output so it still shows up in the notebook cell.
            for stream_text in (completed.stdout, completed.stderr):
                if stream_text and stream_text.strip():
                    print(stream_text.strip())
            if completed.returncode != 0:
                print(f"⚠️ kaggle exited with status {completed.returncode}.")

        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"❌ Zip file '{zip_filename}' not found after download!")

        os.makedirs(extract_folder_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename



### Data Loader

In [13]:

# ==============================
# LOADER
# ==============================
class DatasetLoader:
    """Finds a CSV file inside an extracted dataset folder and loads it."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Load the first file under ``dataset_folder`` named ``target_csv_name``.

        The folder is walked recursively and file names are compared
        case-insensitively; the first match encountered is read with pandas.

        Raises:
            FileNotFoundError: the folder does not exist, or no file with a
                matching name is found beneath it.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        # Lazily yields matching paths in os.walk order; only the first is used.
        matches = (
            os.path.join(current_dir, entry)
            for current_dir, _subdirs, entries in os.walk(dataset_folder)
            for entry in entries
            if entry.lower() == target_csv_name.lower()
        )
        first_match = next(matches, None)
        if first_match is None:
            raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")

        frame = pd.read_csv(first_match)
        print(f"✅ Loaded CSV with shape {frame.shape}")
        return frame



### Data Processor

In [14]:

# ==============================
# PROCESSOR
# ==============================
class DatasetProcessor:
    """Column selection and JSON export helpers for loaded datasets."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the order given.

        Raises:
            ValueError: one or more requested fields are not columns of ``df``;
                the message lists them in the order they were requested.
        """
        absent = [name for name in allowed_fields if name not in df.columns]
        if not absent:
            selection = df[allowed_fields]
            print(f"✅ Filtered columns: {list(selection.columns)}")
            return selection

        raise ValueError(f"❌ Fields {absent} not found in dataset!")

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` to ``Config.JSON_OUTPUT_DIR/output_json_name``.

        The output directory is created if needed and any existing file at the
        destination is deleted first. Rows are written as one JSON record per
        line, with non-ASCII characters left unescaped.
        """
        target_dir = Config.JSON_OUTPUT_DIR
        os.makedirs(target_dir, exist_ok=True)
        destination = os.path.join(target_dir, output_json_name)

        # Remove a previous export explicitly so the deletion is reported.
        if os.path.exists(destination):
            os.remove(destination)
            print(f"🗑️ Existing JSON '{destination}' deleted.")

        df.to_json(destination, orient='records', lines=True, force_ascii=False)
        print(f"✅ Data saved to JSON at '{destination}'")



### Cleanup

In [15]:


# ==============================
# CLEANER
# ==============================
class Cleaner:
    """Deletes the on-disk leftovers of a downloaded dataset."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Remove the extracted folder and the downloaded archive if they exist.

        Args:
            extracted_folder_path: Path of the extraction folder, used as given.
            zip_filename: Archive file name; it is resolved relative to
                ``Config.DATASET_DOWNLOAD_DIR``.
        """
        folder_present = os.path.exists(extracted_folder_path)
        if folder_present:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        archive = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if not os.path.exists(archive):
            return

        os.remove(archive)
        print(f"🗑️ Zip file '{archive}' has been deleted successfully.")



### Hybrid Data Loader

In [16]:


# ==============================
# HYBRID LOADER
# ==============================
# kagglehub is an optional dependency: if it (or its KaggleDatasetAdapter)
# cannot be imported, `kagglehub` is set to None so that HybridDatasetLoader
# skips the KaggleHub attempt and goes straight to the ZIP-based download.
try:
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
except ImportError:
    kagglehub = None

class HybridDatasetLoader:
    """Loads a dataset file through KaggleHub, falling back to a ZIP download."""

    @staticmethod
    def _load_via_kagglehub(dataset_path: str, file_name: str):
        """Return the DataFrame loaded by KaggleHub, or None if any step raised."""
        try:
            print(f"📥 Trying KaggleHub for {dataset_path}...")
            frame = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)
            print(f"✅ Loaded using KaggleHub: shape = {frame.shape}")
        except Exception as err:
            # Any failure is reported and treated as "use the fallback".
            print(f"⚠️ KaggleHub failed: {err}\nFalling back to ZIP-based loader.")
            return None
        return frame

    @staticmethod
    def load_dataset(dataset_path: str, file_name: str) -> pd.DataFrame:
        """Load ``file_name`` from the Kaggle dataset ``dataset_path``.

        KaggleHub is tried first when the module is available. If it is
        unavailable or fails, the dataset is downloaded and extracted as a ZIP
        and the file is read from the extracted folder.
        """
        frame = None
        if kagglehub:
            frame = HybridDatasetLoader._load_via_kagglehub(dataset_path, file_name)
        if frame is not None:
            return frame

        extracted_folder, _ = DatasetDownloader.download_and_extract(dataset_path)
        return DatasetLoader.load_csv(extracted_folder, file_name)



### Main flow

In [17]:

# ==============================
# MAIN FLOW
# ==============================
def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Load one dataset file, keep the requested columns, and save them as JSON.

    Args:
        dataset_path: Kaggle dataset identifier passed to the hybrid loader.
        target_csv_name: Name of the CSV file to load from the dataset.
        allowed_fields: Columns to keep in the exported data.
        output_json_name: File name of the JSON written to the output directory.

    When ``Config.AUTO_CLEANUP`` is true, the download folder and archive that
    the ZIP-based loader would have created for this dataset are removed.
    """
    raw_frame = HybridDatasetLoader.load_dataset(dataset_path, target_csv_name)
    DatasetProcessor.save_to_json(
        DatasetProcessor.filter_fields(raw_frame, allowed_fields),
        output_json_name,
    )

    if not Config.AUTO_CLEANUP:
        return

    # Rebuild the artifact locations from the dataset identifier, since the
    # loader does not report where (or whether) it downloaded anything.
    slug = dataset_path.rsplit("/", 1)[-1]
    Cleaner.cleanup_dataset_artifacts(
        os.path.join(Config.DATASET_DOWNLOAD_DIR, slug),
        f"{slug}.zip",
    )

## Authenticate with Kaggle and Process the Resume and JD Datasets

In [18]:
# Make sure the Kaggle credentials file is in place before any download.
Config.setup_kaggle_credentials()

# One entry per dataset, processed in order:
# (Kaggle dataset path, CSV file to load, columns to keep, output JSON name).
for _path, _csv_name, _fields, _json_name in (
    # Resume dataset
    (
        "snehaanbhawal/resume-dataset",
        "Resume.csv",
        ["Category", "Resume_str"],
        "parsed_resumes.json",
    ),
    # Job postings dataset
    (
        "arshkon/linkedin-job-postings",
        "postings.csv",
        [
            "title",
            "company_name",
            "location",
            "description",
            "skills_desc",
            "job_id",
            "formatted_experience_level",
            "formatted_work_type",
        ],
        "parsed_jds.json",
    ),
):
    process_dataset(
        dataset_path=_path,
        target_csv_name=_csv_name,
        allowed_fields=_fields,
        output_json_name=_json_name,
    )

✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json
📥 Trying KaggleHub for snehaanbhawal/resume-dataset...
⚠️ KaggleHub failed: 404 Client Error.

Resource not found at URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset/versions/1
The server reported the following issues: Data not found
Please make sure you specified the correct resource identifiers.
Falling back to ZIP-based loader.
⬇️ Downloading dataset: snehaanbhawal/resume-dataset ...
Dataset URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
License(s): CC0-1.0
✅ Downloaded and extracted to 'datasets\resume-dataset'.
🔍 Searching for 'Resume.csv' inside datasets\resume-dataset...
✅ Loaded CSV with shape (2484, 4)
✅ Filtered columns: ['Category', 'Resume_str']
✅ Data saved to JSON at 'json_outputs\parsed_resumes.json'
🧹 Folder 'datasets\resume-dataset' has been deleted successfully.
🗑️ Zip file 'datasets\resume-dataset.zip' has been deleted successfully.
📥 Trying KaggleHub for arsh