In [None]:
# # | eval: false
import warnings
from datetime import datetime

import datasets
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
import transformers
from datasets import disable_caching, enable_caching, load_dataset
from IPython.display import Markdown, display

from juddges.schema import POLARS_SCHEMAS, SCHEMAS
from juddges.settings import ROOT_PATH

# Keep rendered cell outputs free of noise: silence Python warnings, library
# log messages below error level, and progress bars.
warnings.filterwarnings("ignore")
datasets.logging.set_verbosity_error()
datasets.utils.disable_progress_bars()
transformers.logging.set_verbosity_error()
transformers.utils.logging.disable_progress_bar()

# Use seaborn's "notebook" plotting context for every figure below.
sns.set_theme("notebook")

In [None]:
# | eval: false
# Lazily scan all parquet shards of the raw dataset with the project-defined
# polars schema; data is only read when a query is collected.
raw_data_glob = ROOT_PATH / "data/datasets/pl/pl-court-raw/data/*.parquet"
raw_ds = pl.scan_parquet(
    source=raw_data_glob,
    schema=POLARS_SCHEMAS["pl-court"],
    glob=True,
)

In [None]:
# | eval: false
# Build the table of fields shown in the dataset card: one row per field with
# its description (from SCHEMAS) and its polars dtype (from the lazy frame).
field_descriptions = SCHEMAS["pl-court"]
schema_df = pd.DataFrame(
    {
        "Field": list(field_descriptions.keys()),
        "Description": list(field_descriptions.values()),
    }
)
# `collect_schema()` is the supported way to resolve a LazyFrame's schema;
# the `.schema` property raises a PerformanceWarning (hidden here by the
# global warnings filter).
types_df = pd.DataFrame(
    [
        {"Field": col_name, "Type": str(col_type)}
        for col_name, col_type in raw_ds.collect_schema().items()
    ]
)
# The inner join keeps only fields that have both a description and a dtype;
# `validate` makes the merge raise if a field name repeats on either side.
ds_desc = schema_df.merge(types_df, on="Field", how="inner", validate="one_to_one")
ds_desc_md = ds_desc.to_markdown(index=False)

In [None]:
# | eval: false
# Dataset card template. `{fields_table}` is the only placeholder and is filled
# with `ds_desc_md` when the card is rendered at the end of this cell. Because
# `ds_desc_md` is built in a cell marked `eval: false`, this cell carries the
# same directive so it is skipped together with its dependency.
dataset_card = """
# Dataset Card for [JuDDGES/pl-court-raw](https://huggingface.co/datasets/JuDDGES/pl-court-raw)

## Table of Contents
- [Table of Contents](#table-of-contents)
- [Dataset Description](#dataset-description)
  - [Dataset Summary](#dataset-summary)
  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
  - [Languages](#languages)
- [Dataset Structure](#dataset-structure)
  - [Data Instances](#data-instances)
  - [Data Fields](#data-fields)
  - [Data Splits](#data-splits)
- [Dataset Creation](#dataset-creation)
  - [Curation Rationale](#curation-rationale)
  - [Source Data](#source-data)
  - [Annotations](#annotations)
  - [Personal and Sensitive Information](#personal-and-sensitive-information)
- [Considerations for Using the Data](#considerations-for-using-the-data)
  - [Social Impact of Dataset](#social-impact-of-dataset)
  - [Discussion of Biases](#discussion-of-biases)
  - [Other Known Limitations](#other-known-limitations)
- [Additional Information](#additional-information)
  - [Dataset Curators](#dataset-curators)
  - [Licensing Information](#licensing-information)
  - [Citation Information](#citation-information)
  - [Contributions](#contributions)
- [Statistics](#statistics)

## Dataset Description

* **Homepage: TBA**
* **Repository: https://github.com/pwr-ai/JuDDGES**
* **Paper:  TBA**
* **Point of Contact: lukasz.augustyniak@pwr.edu.pl; jakub.binkowski@pwr.edu.pl; albert.sawczyn@pwr.edu.pl**

### Dataset Summary

The dataset consists of Polish Court judgments available at https://orzeczenia.ms.gov.pl/, containing full content of the judgments along with metadata sourced from official API and extracted from the judgment contents. This dataset contains raw data. For instruction dataset see [`JuDDGES/pl-court-instruct`](https://huggingface.co/datasets/JuDDGES/pl-court-instruct). For graph dataset see [`JuDDGES/pl-court-graph`](https://huggingface.co/datasets/JuDDGES/pl-court-graph).

### Supported Tasks and Leaderboards

The dataset can be used for various tasks. However, it contains raw data acquired from official API, and we rather recommend using instruction dataset [`JuDDGES/pl-court-instruct`](https://huggingface.co/datasets/JuDDGES/pl-court-instruct) for straightforward usage. 

### Languages

pl-PL Polish 

## Dataset Structure

### Data Fields

{fields_table}

### Data Splits

This dataset is not split into subsets. The dataset has only `train` split.

## Dataset Creation

For details on the dataset creation, see the paper [TBA]() and the code repository [here](https://github.com/pwr-ai/JuDDGES).

### Curation Rationale

Created to enable cross-jurisdictional legal analytics.

### Source Data

#### Initial Data Collection and Normalization

1. Download judgments metadata.
1. Download judgments text (XML content of judgments).
1. Download additional details available for each judgment.
1. Map id of courts and departments to court name.
1. Extract raw text from XML content and details of judgments not available through API.
1. For further processing prepare local dataset dump in parquet file, version it with dvc and push to remote storage.

#### Who are the source language producers?

Produced by human legal professionals (judges, court clerks). Demographics was not analysed. Sourced from public court databases.

### Annotations

#### Annotation process

No annotation was performed by us. All features were provided via API (anonymization and publication of the data were performed by court employees).

#### Who are the annotators?

As above.

### Personal and Sensitive Information

Pseudoanonymized to comply with GDPR (art. 4 sec. 5 GDPR).

## Considerations for Using the Data

### Social Impact of Dataset

[More Information Needed]

### Discussion of Biases

[More Information Needed]

### Other Known Limitations

[More Information Needed]

## Additional Information

### Dataset Curators

[More Information Needed]

### Licensing Information

We license the actual packaging of these data under Attribution 4.0 International (CC BY 4.0) https://creativecommons.org/licenses/by/4.0/

### Citation Information

TBA
"""

# Render the card with the generated fields table substituted in.
display(Markdown(dataset_card.format(fields_table=ds_desc_md)))

# Statistics

### Dataset size

In [None]:
# | eval: false
# Count the rows through the lazy frame so that only the single count value
# is collected.
row_count_frame = raw_ds.select(pl.len()).collect()
n_rows = row_count_frame.item()
size_summary = f"Dataset size: **{n_rows:_} examples**"
display(Markdown(size_summary))

### Missing values

In [None]:
# | eval: false
# Per-field null statistics. The polars result is a single row with one column
# per field; transposing it yields one table row per field.
total_rows = raw_ds.select(pl.len()).collect().item()
null_count = raw_ds.null_count().collect().to_pandas().T
null_count = null_count.rename(columns={0: "Null count"})
null_count.index.name = "Field name"
# Share of rows in which the field is null, rounded to two decimals.
null_count["Null fraction"] = (null_count["Null count"] / total_rows).round(2)
display(Markdown(null_count.to_markdown()))

### Analysis of selected fields

In [None]:
# | eval: false
# Number of judgments per court (rows without a court name are skipped),
# ordered from the most to the least frequent court.
judgments_per_court_query = (
    raw_ds.drop_nulls(subset="court_name")
    .select("court_name")
    .group_by("court_name")
    .len()
    .sort("len", descending=True)
)
court_distribution = judgments_per_court_query.collect().to_pandas()

# Histogram (with KDE) of the per-court counts on a logarithmic x axis.
ax = sns.histplot(data=court_distribution, x="len", log_scale=True, kde=True)
ax.set_title("Distribution of judgments per court")
ax.set_xlabel("#Judgments in single court")
ax.set_ylabel("Count")
plt.show()

In [None]:
# | eval: false
# Yearly number of judgments, aggregated inside the lazy query so that only
# one row per year is collected instead of the whole date column.
current_year = datetime.now().year
judgments_per_year = (
    # The derived year keeps the column name "judgment_date".
    raw_ds.select(pl.col("judgment_date").dt.year())
    # Dropping null dates keeps the year column integer-typed in pandas;
    # with nulls it would be converted to float and shown as e.g. "2001.0".
    .drop_nulls()
    # Years after the current one cannot be valid judgment dates.
    .filter(pl.col("judgment_date") <= current_year)
    .group_by("judgment_date")
    .agg(pl.len().alias("count"))
    .sort("judgment_date")
    .collect()
    .to_pandas()
)

_, ax = plt.subplots(1, 1, figsize=(10, 5))
ax = sns.pointplot(data=judgments_per_year, x="judgment_date", y="count", linestyles="--", ax=ax)
ax.set(
    xlabel="Year",
    ylabel="Number of Judgments",
    title="Yearly Number of Judgments",
    yscale="log",
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# | eval: false
# Frequency of every judgment type, most common first. Nulls are replaced
# with the "<null>" placeholder before grouping so they appear as a category.
judgment_type_query = (
    raw_ds.fill_null(value="<null>")
    .select("judgment_type")
    .group_by("judgment_type")
    .len()
    .sort("len", descending=True)
)
types = judgment_type_query.collect().to_pandas()

# Horizontal bar chart with a logarithmic count axis.
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
sns.barplot(data=types, x="len", y="judgment_type", errorbar=None, ax=ax)
ax.set_xlabel("Count")
ax.set_ylabel("judgment_type")
ax.set_title("Judgment types cardinality")
ax.set_xscale("log")
plt.show()

In [None]:
# | eval: false
# Number of judges listed on each judgment. The rows are not sorted because
# a histogram does not depend on row order.
num_judges = (
    raw_ds.select(pl.col("judges").list.len().alias("num_judges"))
    .collect()
    .to_pandas()
)
# `discrete=True` gives every integer count its own unit-width bar. Equal-width
# bins sized by the number of unique values would merge neighbouring counts
# whenever the observed values have gaps.
ax = sns.histplot(data=num_judges, x="num_judges", discrete=True)
ax.set(
    xlabel="#Judges per judgment",
    ylabel="Count",
    yscale="log",
    title="#Judges per single judgment",
)
plt.show()

In [None]:
# | eval: false
# Number of legal bases cited by each judgment. The rows are not sorted
# because a histogram does not depend on row order.
num_lb = (
    raw_ds.select(pl.col("legal_bases").list.len().alias("num_lb"))
    .collect()
    .to_pandas()
)
# `discrete=True` gives every integer count its own unit-width bar. Equal-width
# bins sized by the number of unique values would merge neighbouring counts
# whenever the observed values have gaps.
ax = sns.histplot(data=num_lb, x="num_lb", discrete=True)
ax.set(
    xlabel="#Legal bases",
    ylabel="Count",
    yscale="log",
    title="#Legal bases per judgment",
)
plt.show()

In [None]:
# | eval: false
# Load only the id and full-text columns of the raw dataset with HF `datasets`.
# The directory is resolved from ROOT_PATH (as for the polars scan above), so
# the cell does not depend on the notebook's working directory.
# Caching is disabled only for the load and is re-enabled in `finally`, so a
# failed load cannot leave caching switched off for the rest of the session.
disable_caching()
try:
    raw_text_ds = load_dataset(
        "parquet",
        data_dir=str(ROOT_PATH / "data/datasets/pl/pl-court-raw/data"),
        columns=["judgment_id", "full_text"],
    )
finally:
    enable_caching()
# Keep only the judgments that actually have text.
raw_text_ds = raw_text_ds.filter(lambda x: x["full_text"] is not None)

In [None]:
# | eval: false
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")


# def tokenize(batch: dict[str, list]) -> list[int]:
#     tokenized = tokenizer(
#         batch["full_text"],
#         add_special_tokens=False,
#         return_attention_mask=False,
#         return_token_type_ids=False,
#         return_length=True,
#     )
#     return {"length": tokenized["length"]}


# raw_text_ds = raw_text_ds.map(
#     tokenize,
#     batched=True,
#     batch_size=16,
#     remove_columns=["full_text"],
#     num_proc=20,
# )

In [None]:
# | eval: false
# judgement_len = raw_text_ds["train"].to_pandas()

# ax = sns.histplot(data=judgement_len, x="length", bins=50)
# ax.set(
#     xlabel="#Tokens",
#     ylabel="Count",
#     title="#Tokens distribution in judgements (llama-3 tokenizer)",
#     yscale="log",
# )
# ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{int(x/1_000)}k"))
# plt.show()

In [None]:
# | eval: false
# per_type_tokens = (
#     raw_ds.fill_null(value="<null>")
#     .select(["judgment_id", "judgment_type"])
#     .collect()
#     .to_pandas()
#     .set_index("judgment_id")
#     .join(judgement_len.set_index("judgment_id"))
# )

# _, ax = plt.subplots(1, 1, figsize=(10, 10))
# ax = sns.boxenplot(data=per_type_tokens, y="judgment_type", x="length")
# ax.set(
#     xscale="log",
#     title="Judgement token count per type",
#     xlabel="#Tokens",
#     ylabel="Type",
# )
# plt.show()