In [20]:
import json
import os
import shutil
import subprocess
import zipfile
from pathlib import Path
from typing import List

import pandas as pd

# -------------------------------
# CONFIGURATION
# -------------------------------

class Config:
    """Shared settings for the dataset pipeline, plus Kaggle credential bootstrap."""

    # Directory that receives downloaded archives and their extracted folders
    DATASET_DOWNLOAD_DIR = "datasets"
    # Directory that receives the exported JSON files
    JSON_OUTPUT_DIR = "json_outputs"
    # When True, process_dataset deletes the extracted folder and zip after exporting
    AUTO_CLEANUP = False
    # Kaggle API token location
    KAGGLE_JSON_PATH = os.path.expanduser('~/.kaggle/kaggle.json')

    @staticmethod
    def setup_kaggle_credentials():
        """Make sure a Kaggle API token exists at ``Config.KAGGLE_JSON_PATH``.

        If the token file is already present nothing is changed. Otherwise the
        user is asked to upload it through ``google.colab.files`` and the
        uploaded file is moved into place and chmod-ed to ``0o600``.

        Raises:
            ImportError: the token is missing and ``google.colab`` cannot be imported.
            FileNotFoundError: the upload dialog returned no file.
        """
        if os.path.exists(Config.KAGGLE_JSON_PATH):
            print(f"✅ Kaggle credentials already exist at {Config.KAGGLE_JSON_PATH}")
            return

        # Imported lazily: this module is only needed when the token is missing.
        from google.colab import files
        print("📂 Upload kaggle.json file...")
        uploaded = files.upload()
        if not uploaded:
            # Fail here with a clear message instead of crashing later in os.chmod.
            raise FileNotFoundError("❌ No file was uploaded; kaggle.json is required to continue.")

        # Move exactly one file. Prefer the one actually named kaggle.json so an
        # unrelated upload cannot overwrite the token; otherwise fall back to the
        # last uploaded file, which is what ended up in place previously.
        uploaded_names = list(uploaded.keys())
        token_source = next(
            (name for name in uploaded_names if os.path.basename(name).lower() == "kaggle.json"),
            uploaded_names[-1],
        )

        os.makedirs(os.path.dirname(Config.KAGGLE_JSON_PATH), exist_ok=True)
        shutil.move(token_source, Config.KAGGLE_JSON_PATH)
        os.chmod(Config.KAGGLE_JSON_PATH, 0o600)
        print(f"✅ Kaggle credentials setup at {Config.KAGGLE_JSON_PATH}")


# -------------------------------
# DOWNLOADER
# -------------------------------

class DatasetDownloader:
    """Downloads a Kaggle dataset archive and unpacks it under Config.DATASET_DOWNLOAD_DIR."""

    @staticmethod
    def download_and_extract(dataset_path: str) -> tuple[str, str]:
        """Download ``dataset_path`` with the Kaggle CLI and extract the archive.

        Args:
            dataset_path: Kaggle dataset identifier; its last ``/``-separated
                segment is used as the folder name and zip base name.

        Returns:
            ``(extract_folder_path, zip_filename)``: the folder holding the
            extracted files and the archive's file name (not its full path).

        Raises:
            ValueError: ``dataset_path`` has no usable last segment.
            FileNotFoundError: the expected zip is missing after the download
                attempt, or the ``kaggle`` executable cannot be found.
        """
        os.makedirs(Config.DATASET_DOWNLOAD_DIR, exist_ok=True)

        # Strip trailing slashes so "owner/name/" does not produce an empty slug.
        dataset_slug = dataset_path.rstrip("/").split("/")[-1]
        if not dataset_slug:
            raise ValueError(f"❌ Cannot derive a dataset name from '{dataset_path}'!")

        extract_folder_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, dataset_slug)
        zip_filename = f"{dataset_slug}.zip"
        zip_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)

        # If already extracted, skip download and extraction
        if os.path.exists(extract_folder_path) and any(Path(extract_folder_path).rglob('*.csv')):
            print(f"⚡ Dataset folder already exists at '{extract_folder_path}', skipping download and extraction.")
            return extract_folder_path, zip_filename

        print(f"⬇️ Downloading dataset: {dataset_path} ...")
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact and works outside IPython.
        completed = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_path, "-p", Config.DATASET_DOWNLOAD_DIR],
            capture_output=True,
            text=True,
            errors="replace",
            check=False,
        )
        # Echo the CLI output, since the captured streams are otherwise not shown.
        for stream_text in (completed.stdout, completed.stderr):
            if stream_text and stream_text.strip():
                print(stream_text.strip())

        if not os.path.exists(zip_path):
            raise FileNotFoundError(
                f"❌ Zip file '{zip_filename}' not found after download! "
                f"(kaggle exit code {completed.returncode})"
            )

        os.makedirs(extract_folder_path, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder_path)

        print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
        return extract_folder_path, zip_filename

# -------------------------------
# LOADER
# -------------------------------

class DatasetLoader:
    """Finds a CSV by file name inside a dataset folder and reads it with pandas."""

    @staticmethod
    def load_csv(dataset_folder: str, target_csv_name: str) -> pd.DataFrame:
        """Read the first file under ``dataset_folder`` whose name matches ``target_csv_name``.

        The folder tree is traversed with ``os.walk`` and file names are compared
        case-insensitively; the first match encountered is loaded.

        Raises:
            FileNotFoundError: the folder does not exist, or no file with the
                requested name is found beneath it.
        """
        print(f"🔍 Searching for '{target_csv_name}' inside {dataset_folder}...")
        if not os.path.exists(dataset_folder):
            raise FileNotFoundError(f"❌ Dataset folder '{dataset_folder}' does not exist!")

        # Lazily yield full paths of matching files in os.walk order.
        matching_paths = (
            os.path.join(current_dir, entry)
            for current_dir, _subdirs, entries in os.walk(dataset_folder)
            for entry in entries
            if entry.lower() == target_csv_name.lower()
        )
        first_match = next(matching_paths, None)
        if first_match is None:
            raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found inside extracted dataset!")

        frame = pd.read_csv(first_match)
        print(f"✅ Loaded CSV with shape {frame.shape}")
        return frame



# -------------------------------
# PROCESSOR
# -------------------------------

class DatasetProcessor:
    """Column selection and JSON export helpers for loaded datasets."""

    @staticmethod
    def filter_fields(df: pd.DataFrame, allowed_fields: List[str]) -> pd.DataFrame:
        """Return ``df`` restricted to ``allowed_fields``, in the order given.

        Raises:
            ValueError: one or more requested fields are not columns of ``df``.
        """
        available = df.columns
        absent = []
        for name in allowed_fields:
            if name not in available:
                absent.append(name)
        if absent:
            raise ValueError(f"❌ Fields {absent} not found in dataset!")

        subset = df[allowed_fields]
        print(f"✅ Filtered columns: {list(subset.columns)}")
        return subset

    @staticmethod
    def save_to_json(df: pd.DataFrame, output_json_name: str):
        """Write ``df`` to ``Config.JSON_OUTPUT_DIR/output_json_name``.

        The file is written with ``orient='records', lines=True``, i.e. one
        JSON object per row per line. An existing file of the same name is
        removed first.
        """
        # The output directory is created on demand.
        os.makedirs(Config.JSON_OUTPUT_DIR, exist_ok=True)
        destination = os.path.join(Config.JSON_OUTPUT_DIR, output_json_name)

        # Drop any previous export so the new file starts from scratch.
        already_there = os.path.exists(destination)
        if already_there:
            print(f"⚠️ JSON file '{output_json_name}' already exists. Deleting and re-creating it.")
            os.remove(destination)

        print(f"📝 Saving DataFrame to JSON: {output_json_name}...")
        df.to_json(destination, orient='records', lines=True)
        print(f"✅ JSON file saved to '{destination}'.")

# -------------------------------
# CLEANER
# -------------------------------

class Cleaner:
    """Removes the on-disk artifacts left behind by a dataset download."""

    @staticmethod
    def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str):
        """Delete the extracted folder and the zip inside ``Config.DATASET_DOWNLOAD_DIR``.

        Each artifact is handled independently: a missing folder or zip is
        reported with a warning and skipped.
        """
        # Extracted folder first
        if not os.path.exists(extracted_folder_path):
            print(f"⚠️ Folder '{extracted_folder_path}' does not exist, skipping folder deletion.")
        else:
            shutil.rmtree(extracted_folder_path)
            print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

        # Then the downloaded archive, which sits directly in the download directory
        archive_path = os.path.join(Config.DATASET_DOWNLOAD_DIR, zip_filename)
        if not os.path.exists(archive_path):
            print(f"⚠️ Zip file '{archive_path}' does not exist, skipping zip deletion.")
        else:
            os.remove(archive_path)
            print(f"🗑️ Zip file '{archive_path}' has been deleted successfully.")

# -------------------------------
# MAIN FLOW
# -------------------------------

def process_dataset(dataset_path: str, target_csv_name: str, allowed_fields: List[str], output_json_name: str):
    """Run the full pipeline for one dataset: download, load, filter, export.

    Args:
        dataset_path: Kaggle dataset identifier passed to the downloader.
        target_csv_name: file name of the CSV to load from the extracted folder.
        allowed_fields: columns to keep in the export.
        output_json_name: file name of the JSON written to the output directory.

    When ``Config.AUTO_CLEANUP`` is true, the extracted folder and the zip are
    removed after the export.
    """
    dataset_dir, archive_name = DatasetDownloader.download_and_extract(dataset_path)

    # Load the requested CSV and keep only the allowed columns.
    selected = DatasetProcessor.filter_fields(
        DatasetLoader.load_csv(dataset_dir, target_csv_name),
        allowed_fields,
    )
    DatasetProcessor.save_to_json(selected, output_json_name)

    if not Config.AUTO_CLEANUP:
        return
    Cleaner.cleanup_dataset_artifacts(dataset_dir, archive_name)



In [21]:
# A Kaggle API token must be in place before any download is attempted
Config.setup_kaggle_credentials()

# One keyword set per export: the resume dataset first, then the job postings
for export_job in (
    dict(
        dataset_path="snehaanbhawal/resume-dataset",
        target_csv_name="Resume.csv",
        allowed_fields=["Category", "Resume_str"],
        output_json_name="resume_data.json",
    ),
    dict(
        dataset_path="arshkon/linkedin-job-postings",
        target_csv_name="postings.csv",
        allowed_fields=[
            "title",
            "company_name",
            "location",
            "description",
            "skills_desc",
            "job_id",
            "formatted_experience_level",
            "formatted_work_type",
        ],
        output_json_name="job_postings.json",
    ),
):
    process_dataset(**export_job)


✅ Kaggle credentials already exist at C:\Users\rubyj/.kaggle/kaggle.json
⚡ Dataset folder already exists at 'datasets\resume-dataset', skipping download and extraction.
🔍 Searching for 'Resume.csv' inside datasets\resume-dataset...
✅ Loaded CSV with shape (2484, 4)
✅ Filtered columns: ['Category', 'Resume_str']
⚠️ JSON file 'resume_data.json' already exists. Deleting and re-creating it.
📝 Saving DataFrame to JSON: resume_data.json...
✅ JSON file saved to 'json_outputs\resume_data.json'.
⚡ Dataset folder already exists at 'datasets\linkedin-job-postings', skipping download and extraction.
🔍 Searching for 'postings.csv' inside datasets\linkedin-job-postings...
✅ Loaded CSV with shape (123849, 31)
✅ Filtered columns: ['title', 'company_name', 'location', 'description', 'skills_desc', 'job_id', 'formatted_experience_level', 'formatted_work_type']
⚠️ JSON file 'job_postings.json' already exists. Deleting and re-creating it.
📝 Saving DataFrame to JSON: job_postings.json...
✅ JSON file saved 