In [18]:
import os
import requests

from dotenv import load_dotenv

# Load variables from the .env file one directory up into the process
# environment; a later cell reads HUGGINGFACE_API_TOKEN from os.environ.
# override=True asks python-dotenv to let values from the file replace
# variables that are already set in the environment.
load_dotenv("../.env", override=True)

True

In [23]:
# Paper authors, one (first_name, last_name) pair per entry, in byline order.
authors = [
    ("Łukasz", "Augustyniak"),
    ("Jakub", "Binkowski"),
    ("Albert", "Sawczyn"),
    ("Michał", "Bernaczyk"),
    ("Krzysztof", "Kamiński"),
    ("Santosh", "Tirunagari"),
    ("David", "Windridge"),
    ("Mandeep K.", "Dhami"),
    ("Chérifa", "Boukacem-Zeghmouri"),
    ("Candice", "Fillaud"),
    ("Tomasz", "Kajdanowicz"),
]

paper_title = "JuDDGES: A Unified, Multilingual Dataset of Legal Judgments for Legal AI and Analytics"

# Each author is rendered as "<last name> <first initial>."; only the first
# character of the first name is kept, so "Mandeep K." becomes "M.".
author_names = ["{} {}.".format(surname, given[0]) for given, surname in authors]

# Comma-separated byline followed by the quoted title and a closing period.
author_str = ", ".join(author_names)
citation = author_str + ' "' + paper_title + '".'

print(citation)

Augustyniak Ł., Binkowski J., Sawczyn A., Bernaczyk M., Kamiński K., Tirunagari S., Windridge D., Dhami M., Boukacem-Zeghmouri C., Fillaud C., Kajdanowicz T. "JuDDGES: A Unified, Multilingual Dataset of Legal Judgments for Legal AI and Analytics".


In [28]:
# Organisation prefix shared by every dataset id listed below.
_HF_ORG = "JuDDGES"

# Dataset ids ("<organisation>/<name>") whose Croissant metadata is fetched by
# the download cell. Commented-out names are currently excluded.
public_hf_datasets = [
    f"{_HF_ORG}/{dataset_name}"
    for dataset_name in (
        # "pl-court-raw",
        "pl-nsa",
        "pl-swiss-franc-loans",
        "en-court-raw",
        "en-appealcourt",
        # "en-appealcourt-coded-instruct_v02",
    )
]

In [29]:
import json
import zipfile
from pathlib import Path

from tqdm import tqdm

# Seconds to wait on a single HTTP request. Without a timeout, requests waits
# indefinitely and one stalled connection would hang the whole cell.
REQUEST_TIMEOUT_S = 30

# Top-level keys that are filled in only when the downloaded metadata does not
# already define them. Insertion order is the order in which they are added.
CROISSANT_DEFAULTS = {
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "datePublished": "2025-04-01",
    "version": "1.0.0",
    "citeAs": (
        'Augustyniak Ł., Binkowski J., Sawczyn A., Bernaczyk M., Kamiński K., Tirunagari S., Windridge D., Dhami M., Boukacem-Zeghmouri C., Fillaud C., Kajdanowicz T. "JuDDGES: A Unified, Multilingual Dataset of Legal Judgments for Legal AI and Analytics".'
    ),
}

# The token is read from the environment (raises KeyError if it is missing).
headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACE_API_TOKEN']}"}

croissant_directory = Path("croissants_files")
croissant_directory.mkdir(exist_ok=True, parents=True)

# Download the Croissant metadata of every listed dataset, patch it, and save
# it as "<org>__<name>_croissant.json" inside croissant_directory.
for dataset in tqdm(public_hf_datasets):
    croissant_url = f"https://huggingface.co/api/datasets/{dataset}/croissant"
    response = requests.get(croissant_url, headers=headers, timeout=REQUEST_TIMEOUT_S)
    if response.status_code != 200:
        # Best effort: report the failure and carry on with the next dataset.
        print(f"Failed to download croissant for {dataset}: {response.status_code}")
        continue

    croissant_json = response.json()

    # Add the required keys to @context. The dict is mutated in place, so no
    # re-assignment is needed; when "@context" is absent or not a dict it is
    # left untouched instead of being written back as None.
    context = croissant_json.get("@context")
    if isinstance(context, dict):
        context["examples"] = {"@id": "cr:examples", "@type": "@json"}
        context["rai"] = "http://mlcommons.org/croissant/RAI/"

    # Existing values always win; defaults only fill the gaps.
    for key, default_value in CROISSANT_DEFAULTS.items():
        croissant_json.setdefault(key, default_value)

    filename = dataset.replace("/", "__") + "_croissant.json"
    with open(croissant_directory / filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters (e.g. the author names
        # in citeAs) readable instead of \u-escaped.
        json.dump(croissant_json, f, indent=2, ensure_ascii=False)
    print(f"Saved: {filename}")

# Bundle the saved files into "croissants_files.zip" next to the directory.
# Only regular files are archived, in sorted order, so the archive layout does
# not depend on filesystem iteration order.
zip_path = croissant_directory.with_suffix(".zip")
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
    for file in sorted(croissant_directory.iterdir()):
        if file.is_file():
            zipf.write(file, arcname=file.name)
print(f"Created zip archive: {zip_path}")

 25%|██▌       | 1/4 [00:00<00:01,  2.81it/s]

Saved: JuDDGES__pl-nsa_croissant.json


 50%|█████     | 2/4 [00:00<00:00,  3.56it/s]

Saved: JuDDGES__pl-swiss-franc-loans_croissant.json


 75%|███████▌  | 3/4 [00:00<00:00,  3.85it/s]

Saved: JuDDGES__en-court-raw_croissant.json


100%|██████████| 4/4 [00:01<00:00,  3.85it/s]

Saved: JuDDGES__en-appealcourt_croissant.json
Created zip archive: croissants_files.zip





In [None]:
import json
from datasets import load_dataset

# Load your dataset
# NOTE(review): `dataset` is not referenced anywhere else in this cell; the
# field list below is written by hand rather than derived from it. Consider
# dropping the load if no later cell needs it.
dataset = load_dataset("JuDDGES/pl-court-raw")

# Define field types mapping (column name -> data type name used for the
# "dataType" property of the corresponding Croissant field).
field_types = {
    "judgment_id": "Text",
    "docket_number": "Text",
    "judgment_date": "Date",
    "publication_date": "Date",
    "last_update": "Date",
    "court_id": "Text",
    "department_id": "Text",
    "judgment_type": "Text",
    "excerpt": "Text",
    "xml_content": "Text",
    "presiding_judge": "Text",
    "decision": "Text",
    "judges": "Text",
    "legal_bases": "Text",
    "publisher": "Text",
    "recorder": "Text",
    "reviser": "Text",
    "keywords": "Text",
    "num_pages": "Integer",
    "full_text": "Text",
    "volume_number": "Integer",
    "volume_type": "Text",
    "court_name": "Text",
    "department_name": "Text",
    "extracted_legal_bases": "Text",
    "references": "Text",
    "thesis": "Text",
    "country": "Text",
    "court_type": "Text",
    "source": "Text",
}

# One Field entry per column, in the order of field_types. Built up front so
# the metadata dict below is complete as soon as it is created.
fields = [
    {
        "@id": f"field/{feature_name}",
        "@type": "Field",
        "name": feature_name,
        "description": f"The {feature_name.replace('_', ' ')} of the court judgment",
        "dataType": data_type,
    }
    for feature_name, data_type in field_types.items()
]

# Create a more complete Croissant metadata file
croissant_metadata = {
    "@context": {
        "@language": "pl",
        "examples": {"@id": "cr:examples", "@type": "@json"},
        "rai": "http://mlcommons.org/croissant/RAI/",
        "@vocab": "https://schema.org/",
        "arrayShape": "cr:arrayShape",
        "citeAs": "cr:citeAs",
        "column": "cr:column",
        "conformsTo": "dct:conformsTo",
        "cr": "http://mlcommons.org/croissant/",
        "data": {"@id": "cr:data", "@type": "@json"},
        "dataBiases": "cr:dataBiases",
        "dataCollection": "cr:dataCollection",
        "dataType": {"@id": "cr:dataType", "@type": "@vocab"},
        "dct": "http://purl.org/dc/terms/",
        "extract": "cr:extract",
        "field": "cr:field",
        "fileProperty": "cr:fileProperty",
        "fileObject": "cr:fileObject",
        "fileSet": "cr:fileSet",
        "format": "cr:format",
        "includes": "cr:includes",
        "isArray": "cr:isArray",
        "isLiveDataset": "cr:isLiveDataset",
        "jsonPath": "cr:jsonPath",
        "key": "cr:key",
        "md5": "cr:md5",
        "parentField": "cr:parentField",
        "path": "cr:path",
        "personalSensitiveInformation": "cr:personalSensitiveInformation",
        "recordSet": "cr:recordSet",
        "references": "cr:references",
        "regex": "cr:regex",
        "repeated": "cr:repeated",
        "replace": "cr:replace",
        "sc": "https://schema.org/",
        "separator": "cr:separator",
        "source": "cr:source",
        "subField": "cr:subField",
        "transform": "cr:transform",
    },
    "@type": "sc:Dataset",
    # The @context above maps "conformsTo", but the property itself was never
    # set; the Croissant spec asks datasets to declare the version they follow.
    "conformsTo": "http://mlcommons.org/croissant/1.0",
    "@id": "https://huggingface.co/datasets/JuDDGES/pl-court-raw",
    "name": "JuDDGES_Polish_Court_Judgments_Raw_Dataset",
    "description": "A comprehensive collection of Polish court judgments in raw format, containing XML content and extracted metadata.",
    "url": "https://huggingface.co/datasets/JuDDGES/pl-court-raw",
    "version": "1.0.0",
    "license": "https://creativecommons.org/licenses/by/4.0/",
    # NOTE(review): the download cell of this notebook stores citeAs as a plain
    # string, while this is a nested object — confirm which form is intended.
    "citeAs": {
        "@type": "ScholarlyArticle",
        "author": "JuDDGES Project Team",
        "name": "JuDDGES: Polish Court Judgments Dataset",
        "url": "https://huggingface.co/datasets/JuDDGES/pl-court-raw",
    },
    "creator": {
        "@type": "Organization",
        "name": "JuDDGES Project",
        "url": "https://github.com/legal-ai/JuDDGES",
    },
    "recordSet": [
        {
            "@id": "default",
            "@type": "RecordSet",
            "name": "default",
            "description": "Polish court judgments collection",
            "field": fields,
        }
    ],
    "citation": "JuDDGES Project Team. JuDDGES: Polish Court Judgments Dataset. https://huggingface.co/datasets/JuDDGES/pl-court-raw",
    "datePublished": "2025-04-01",
}

# Save the metadata to a file. ensure_ascii=False matches the other Croissant
# writer in this notebook, so non-ASCII text is stored as-is rather than as
# \u escapes.
output_path = "pl-court-raw-croissant.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(croissant_metadata, f, indent=2, ensure_ascii=False)

print(f"Croissant metadata generated and saved to {output_path}")

Croissant metadata generated and saved to pl-court-raw-croissant.json
