In [10]:
# ==============================
# 🛠 CONFIGURATION
# ==============================
# When True, the extracted folder and the zip archive of each dataset are
# deleted right after its JSON output has been written.
AUTO_CLEANUP = True
# Relative directory into which Kaggle archives are downloaded and extracted.
DATASET_DOWNLOAD_DIR = "datasets"
# Relative directory into which the filtered JSON files are written.
JSON_OUTPUT_DIR = "json_outputs"

# ==============================
# 📦 Install and Import libraries
# ==============================
# Environment probe: if the google.colab package can be imported the cell is
# treated as running in Google Colab, otherwise as running on a local machine.
# IN_COLAB is read later by authenticate_kaggle() to pick the credential flow.
try:
    import google.colab
    IN_COLAB = True
    print("🧠 Detected environment: Google Colab")
except ImportError:
    IN_COLAB = False
    print("🧠 Detected environment: Local Machine")

import os
import pandas as pd
import zipfile
import json
import shutil

# ==============================
# 📥 Kaggle Authentication
# ==============================
def authenticate_kaggle():
    """
    Make Kaggle API credentials available for the detected environment.

    In Colab (IN_COLAB is True) the user is prompted to upload kaggle.json;
    its content is written to /root/.kaggle/kaggle.json and the file is
    restricted to owner read/write (mode 0o600).

    On a local machine nothing is written: the function only checks that
    ~/.kaggle/kaggle.json exists.

    Raises:
        FileNotFoundError: On a local machine, when ~/.kaggle/kaggle.json
            does not exist.
        KeyError: In Colab, when the upload result has no 'kaggle.json'
            entry (e.g. the file was uploaded under a different name).
    """
    if IN_COLAB:
        from google.colab import files
        print("📥 Please upload your kaggle.json file...")
        uploaded = files.upload()

        os.makedirs('/root/.kaggle', exist_ok=True)
        with open('/root/.kaggle/kaggle.json', 'wb') as f:
            f.write(uploaded['kaggle.json'])

        # The mode must be an octal literal: decimal 600 equals 0o1130, which
        # would remove the owner's read permission instead of granting rw-------.
        os.chmod('/root/.kaggle/kaggle.json', 0o600)
        print("✅ Kaggle API credentials set up successfully (Colab).")
    else:
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if not os.path.exists(kaggle_path):
            raise FileNotFoundError(f"❌ kaggle.json not found at {kaggle_path}. Please place your Kaggle API key there.")
        print("✅ Kaggle API credentials found (Local Machine).")

# ==============================
# ⚙️ Helper Functions
# ==============================
def download_and_extract_dataset(dataset_path: str, download_dir: str = DATASET_DOWNLOAD_DIR) -> tuple[str, str]:
    """
    Download and extract a Kaggle dataset.
    Extract into a subfolder named after the ZIP file.
    Returns (extracted_folder_path, zip_filename).
    """
    os.makedirs(download_dir, exist_ok=True)
    print(f"⬇️ Downloading dataset: {dataset_path} ...")
    !kaggle datasets download -d {dataset_path} -p {download_dir}
    
    zip_files = [f for f in os.listdir(download_dir) if f.endswith('.zip')]
    if not zip_files:
        raise FileNotFoundError("❌ No zip file found after download!")
    
    zip_filename = zip_files[0]
    zip_path = os.path.join(download_dir, zip_filename)
    
    extract_folder_name = zip_filename.replace(".zip", "")
    extract_folder_path = os.path.join(download_dir, extract_folder_name)
    os.makedirs(extract_folder_path, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder_path)
    
    print(f"✅ Downloaded and extracted to '{extract_folder_path}'.")
    
    return extract_folder_path, zip_filename


def find_csv_file(root_dir: str, target_csv_name: str) -> str:
    """
    Locate a CSV by file name anywhere beneath root_dir.

    File names are compared case-insensitively. The first match in os.walk
    order is printed and returned as a full path; FileNotFoundError is raised
    when no file matches.
    """
    for folder, _subfolders, filenames in os.walk(root_dir):
        # First entry of this folder whose name matches, or None.
        hit = next(
            (name for name in filenames if name.lower() == target_csv_name.lower()),
            None,
        )
        if hit is None:
            continue
        found_path = os.path.join(folder, hit)
        print(f"✅ Found CSV: {found_path}")
        return found_path
    raise FileNotFoundError(f"❌ CSV file '{target_csv_name}' not found in '{root_dir}'!")

def load_csv(dataset_dir: str, csv_filename: str) -> pd.DataFrame:
    """
    Read a CSV located somewhere under dataset_dir into a DataFrame.

    The file is located with find_csv_file (which raises FileNotFoundError if
    it is absent), parsed with pandas.read_csv using default options, and the
    resulting shape is printed before the DataFrame is returned.
    """
    frame = pd.read_csv(find_csv_file(dataset_dir, csv_filename))
    print(f"✅ Loaded CSV '{csv_filename}' with shape {frame.shape}.")
    return frame

def filter_fields(df: pd.DataFrame, allowed_fields: list) -> pd.DataFrame:
    """
    Project df onto the allowed columns that it actually contains.

    Columns are returned in the order given by allowed_fields. Names that are
    not present in df are skipped and reported in a printed warning rather
    than raising.
    """
    present = []
    absent = []
    for name in allowed_fields:
        bucket = present if name in df.columns else absent
        bucket.append(name)

    if absent:
        print(f"⚠️ Warning: Allowed fields missing from dataset: {absent}")

    subset = df[present]
    print(f"🧹 Retained fields: {subset.columns.tolist()}")
    return subset

def save_to_json(df: pd.DataFrame, output_filename: str, output_dir: str = JSON_OUTPUT_DIR):
    """
    Write df to output_dir/output_filename as line-delimited JSON records.

    The output directory is created when missing. A file already present at
    the destination is removed first (and reported) so each run starts from a
    fresh file. Non-ASCII characters are written as-is (force_ascii=False).
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, output_filename)

    stale_copy_present = os.path.exists(destination)
    if stale_copy_present:
        os.remove(destination)
        print(f"🗑️ Old JSON file '{output_filename}' deleted.")

    # orient="records" with lines=True emits one JSON object per row, per line.
    df.to_json(
        destination,
        orient="records",
        lines=True,
        force_ascii=False,
    )
    print(f"✅ Saved JSON to '{destination}'.")

def cleanup_dataset_artifacts(extracted_folder_path: str, zip_filename: str, download_dir: str = DATASET_DOWNLOAD_DIR):
    """
    Remove what a dataset download left on disk.

    Deletes the extracted folder (recursively) and then the zip archive at
    download_dir/zip_filename. Each step is independent: a missing target is
    reported with a warning and skipped.
    """
    # Step 1: extracted folder.
    if not os.path.exists(extracted_folder_path):
        print(f"⚠️ Folder '{extracted_folder_path}' does not exist, skipping folder deletion.")
    else:
        shutil.rmtree(extracted_folder_path)
        print(f"🧹 Folder '{extracted_folder_path}' has been deleted successfully.")

    # Step 2: downloaded archive.
    archive_path = os.path.join(download_dir, zip_filename)
    if not os.path.exists(archive_path):
        print(f"⚠️ Zip file '{archive_path}' does not exist, skipping zip deletion.")
    else:
        os.remove(archive_path)
        print(f"🗑️ Zip file '{archive_path}' has been deleted successfully.")

# ==============================
# 🔥 Kaggle Authentication
# ==============================
# Runs at cell execution time, before any download: sets up credentials in
# Colab, or verifies that they exist on a local machine (raising otherwise).
authenticate_kaggle()

# ==============================
# 📄 Process Resume Dataset
# ==============================
# Inputs for the resume dataset: Kaggle identifier, CSV to locate inside the
# extracted archive, columns to keep, and the output file name.
resume_dataset_path = "snehaanbhawal/resume-dataset"
resume_csv_name = "Resume.csv"
resume_allowed_fields = ["Category", "Resume_str"]
resume_json_name = "resume_data.json"

# Pipeline: download + extract -> load CSV -> keep allowed columns -> write JSON.
resume_extracted_folder, resume_zip_filename = download_and_extract_dataset(resume_dataset_path)
resume_df = load_csv(resume_extracted_folder, resume_csv_name)
resume_filtered_df = filter_fields(resume_df, resume_allowed_fields)
save_to_json(resume_filtered_df, resume_json_name)

# Cleanup only runs if every step above succeeded; an exception earlier in the
# cell leaves the downloaded files in place.
if AUTO_CLEANUP:
    cleanup_dataset_artifacts(resume_extracted_folder, resume_zip_filename)

# ==============================
# 📄 Process LinkedIn Job Postings Dataset
# ==============================
# Inputs for the LinkedIn job postings dataset: Kaggle identifier, CSV to
# locate inside the extracted archive, columns to keep, and the output file name.
job_dataset_path = "arshkon/linkedin-job-postings"
job_csv_name = "postings.csv"
# NOTE(review): "posted_date" does not appear in the "Retained fields" line of
# the recorded output for this cell, so it looks like postings.csv has no such
# column and filter_fields drops it — confirm the intended column name.
job_allowed_fields = ["title", "company_name", "location", "description", "skills_desc", "posted_date", "job_id" , "formatted_experience_level", "formatted_work_type", "job_function"]
job_json_name = "job_postings.json"

# Pipeline: download + extract -> load CSV -> keep allowed columns -> write JSON.
job_extracted_folder, job_zip_filename = download_and_extract_dataset(job_dataset_path)
job_df = load_csv(job_extracted_folder, job_csv_name)
job_filtered_df = filter_fields(job_df, job_allowed_fields)
save_to_json(job_filtered_df, job_json_name)

# Cleanup only runs if every step above succeeded; an exception earlier in the
# cell leaves the downloaded files in place.
if AUTO_CLEANUP:
    cleanup_dataset_artifacts(job_extracted_folder, job_zip_filename)


🧠 Detected environment: Local Machine
✅ Kaggle API credentials found (Local Machine).
⬇️ Downloading dataset: snehaanbhawal/resume-dataset ...


Dataset URL: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
License(s): CC0-1.0
✅ Downloaded and extracted to 'datasets\resume-dataset'.
✅ Found CSV: datasets\resume-dataset\Resume\Resume.csv
✅ Loaded CSV 'Resume.csv' with shape (2484, 4).
🧹 Retained fields: ['Category', 'Resume_str']
🗑️ Old JSON file 'resume_data.json' deleted.
✅ Saved JSON to 'json_outputs\resume_data.json'.
🧹 Folder 'datasets\resume-dataset' has been deleted successfully.
🗑️ Zip file 'datasets\resume-dataset.zip' has been deleted successfully.
⬇️ Downloading dataset: arshkon/linkedin-job-postings ...
Dataset URL: https://www.kaggle.com/datasets/arshkon/linkedin-job-postings
License(s): CC-BY-SA-4.0
✅ Downloaded and extracted to 'datasets\linkedin-job-postings'.
✅ Found CSV: datasets\linkedin-job-postings\postings.csv
✅ Loaded CSV 'postings.csv' with shape (123849, 31).
🧹 Retained fields: ['title', 'company_name', 'location', 'description', 'skills_desc', 'job_id', 'formatted_experience_level', 'formatt