In [17]:
import os
from datasets import load_dataset
from huggingface_hub import login, create_repo

# Load environment variables from a .env file (root at ../.env, fallback to local)
def load_env_file(path: str) -> None:
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are skipped.
    A leading shell-style ``export `` is ignored, and a single pair of matching
    surrounding quotes is removed from the value. Keys that already exist in
    ``os.environ`` are never overwritten, so real environment variables (and
    files loaded earlier) take precedence.

    Args:
        path: Location of the env file. A missing file is ignored silently.

    Returns:
        None. The function works purely by side effect on ``os.environ``.

    Loading is best-effort: if the file cannot be read or decoded, or a
    key/value cannot be stored, a ``RuntimeWarning`` is emitted instead of
    raising. Entries processed before the failure stay applied.
    """
    import warnings  # local import so the cell's import block stays untouched

    if not os.path.exists(path):
        return
    try:
        with open(path, "r", encoding="utf-8") as env_file:
            for raw_line in env_file:
                line = raw_line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                # Accept `export KEY=value`; otherwise the key would be "export KEY".
                if line.startswith("export "):
                    line = line[len("export "):].lstrip()
                key, value = line.split("=", 1)
                key = key.strip()
                value = value.strip()
                # Remove one matching pair of quotes only, so quotes that are
                # part of the value (e.g. a trailing apostrophe) survive.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                if key and key not in os.environ:
                    os.environ[key] = value
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError and values os.environ rejects.
        # Warn rather than fail so a broken optional .env never stops the notebook,
        # while still telling the user why the token may be missing.
        warnings.warn(
            f"Could not load env file {path!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )

# Populate os.environ from .env files: the project-root file is read before the
# one sitting next to the notebook.
for _env_path in ("../.env", ".env"):
    load_env_file(_env_path)

# Hub authentication: HF_TOKEN wins, HUGGINGFACE_HUB_TOKEN is the fallback.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise RuntimeError(
        "Missing Hugging Face token. Put HF_TOKEN in the repo root .env or set it in your environment to push to the Hub."
    )

# Target dataset repo; exist_ok=True makes this call safe to repeat.
repo_id = "raxITLabs/GrayZone"
create_repo(repo_id=repo_id, token=hf_token, repo_type="dataset", exist_ok=True)

# Step 1 — fetch both HarmBench configs (nothing is pushed in this cell)
print("Loading HarmBench datasets ...")
ds_standard = load_dataset("walledai/HarmBench", name="standard")
ds_contextual = load_dataset("walledai/HarmBench", name="contextual")

# Only the train split of each config is used; columns are left untouched.
standard_train = ds_standard["train"]
contextual_train = ds_contextual["train"]

# Summarise what was loaded so the later CSV steps can be sanity-checked.
print("Prepared splits (original schemas):")
print(f" - standard: {len(standard_train)} {standard_train.column_names}")
print(f" - contextual: {len(contextual_train)} {contextual_train.column_names}")

# Working directory for the CSV round trip performed by the following cells.
csv_dir = "./grayzone_csv"
os.makedirs(csv_dir, exist_ok=True)
print(f"CSV directory: {os.path.abspath(csv_dir)}")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading HarmBench datasets ...
Prepared splits (original schemas):
 - standard: 200 ['prompt', 'category']
 - contextual: 100 ['prompt', 'context', 'category']
CSV directory: /Users/adeshgairola/Documents/Test-temp-folder/openai-safety-bench/dataset/grayzone_csv


In [18]:
# Step 2 — Export splits to CSV for manual editing
# This writes two CSV files: standard.csv and contextual.csv under csv_dir

import pandas as pd

# Destination files inside csv_dir, one per split.
standard_csv_path, contextual_csv_path = (
    os.path.join(csv_dir, csv_name) for csv_name in ("standard.csv", "contextual.csv")
)

# Materialise each split as a DataFrame and write it without the index column.
pd.DataFrame(data=standard_train).to_csv(path_or_buf=standard_csv_path, index=False)
pd.DataFrame(data=contextual_train).to_csv(path_or_buf=contextual_csv_path, index=False)

# Report the absolute locations so the files are easy to find for manual editing.
print(
    "Wrote:",
    f" - {os.path.abspath(standard_csv_path)}",
    f" - {os.path.abspath(contextual_csv_path)}",
    sep="\n",
)


Wrote:
 - /Users/adeshgairola/Documents/Test-temp-folder/openai-safety-bench/dataset/grayzone_csv/standard.csv
 - /Users/adeshgairola/Documents/Test-temp-folder/openai-safety-bench/dataset/grayzone_csv/contextual.csv


In [19]:
# Step 3 — Re-import edited CSVs preserving each split's original schema
# Edit the CSVs in `csv_dir` manually, then run this cell to re-import.

import pandas as pd
from datasets import Dataset, DatasetDict

# Read edited CSVs
standard_csv_path = os.path.join(csv_dir, "standard.csv")
contextual_csv_path = os.path.join(csv_dir, "contextual.csv")

# Read every cell as text and treat only truly empty cells as missing.
# With pandas' defaults, a column whose edited values all look numeric would be
# re-typed, and literal text such as "NA", "null" or "N/A" would be silently
# converted to missing values — either would change the schema/content that this
# step is meant to preserve.
csv_read_options = dict(dtype=str, keep_default_na=False, na_values=[""])
standard_df = pd.read_csv(standard_csv_path, **csv_read_options)
contextual_df = pd.read_csv(contextual_csv_path, **csv_read_options)

# Do not add or reorder columns — keep as-is for each split
standard_ds = Dataset.from_pandas(standard_df, preserve_index=False)
contextual_ds = Dataset.from_pandas(contextual_df, preserve_index=False)

print("standard columns:", standard_ds.column_names)
print("contextual columns:", contextual_ds.column_names)

# Build two separate DatasetDicts (one per config) since schemas differ
combined_standard = DatasetDict({"train": standard_ds})
combined_contextual = DatasetDict({"train": contextual_ds})

print("Ready to push two configs: standard (train) and contextual (train)")

standard columns: ['prompt', 'category']
contextual columns: ['prompt', 'context', 'category']
Ready to push two configs: standard (train) and contextual (train)


In [21]:
# Step 4 — Push edited data to Hugging Face Hub as two configs
# `standard` and `contextual` are pushed separately to preserve their original
# schemas. Because create_pr=True is passed, every push opens a pull request on
# the dataset repo rather than committing directly, so the token must be allowed
# to write / open PRs on `repo_id` (a 403 here means it is not).

for config_name, config_splits in (
    ("standard", combined_standard),
    ("contextual", combined_contextual),
):
    print(f"Pushing '{config_name}' config ...")
    config_splits.push_to_hub(
        repo_id,
        config_name=config_name,
        token=hf_token,
        private=False,
        create_pr=True,
        commit_message=f"Upload edited GrayZone {config_name} (train) via PR",
    )

# The previous message implied PRs were conditional on missing write access;
# with create_pr=True a PR is always opened, so say that explicitly.
print(
    "Upload complete (one pull request opened per config; merge them on the Hub to publish):",
    f"https://huggingface.co/datasets/{repo_id}",
)


Pushing 'standard' config ...




Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1845.27ba/s]


HfHubHTTPError: (Request ID: Root=1-689aa399-298ae00070c4f0515a59fd11;fb42d120-acf2-4c7d-9b2e-ba741784f42f)

403 Forbidden: Authorization error..
Cannot access content at: https://huggingface.co/api/datasets/raxITLabs/GrayZone/preupload/main?create_pr=1.
Make sure your token has the correct permissions.