In [1]:
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from dotenv import load_dotenv

from juddges.settings import ROOT_PATH

In [None]:
# Read settings from the project's .env file; override=True lets values from
# the file replace variables that are already set in the environment.
load_dotenv(ROOT_PATH / ".env", override=True)
dataset = load_dataset("JuDDGES/nsa")

# Keep a reproducible random 20% of the train split (fixed shuffle seed).
train_split = dataset["train"]
sample_size = int(len(train_split) * 0.2)
dataset["train"] = train_split.shuffle(seed=17).select(range(sample_size))

In [3]:
from datetime import datetime

# Name of the dataset column that holds the judgment date as a string.
date_column = "The day of the judgment"


def date_to_datetime(entry: dict) -> dict:
    """Parse the judgment date of one record into a ``datetime``.

    The value stored under ``date_column`` is read as a ``%Y-%m-%d`` string
    and replaced, in the same dict, by the parsed ``datetime``.

    Args:
        entry: A single record containing ``date_column``.

    Returns:
        The same ``entry`` object, with the date field converted.

    Raises:
        ValueError: If the stored string does not match ``%Y-%m-%d``.
    """
    raw_date = entry[date_column]
    entry[date_column] = datetime.strptime(raw_date, "%Y-%m-%d")
    return entry


# Convert the judgment-date strings of every split to datetime objects.
dataset = dataset.map(date_to_datetime, num_proc=15)

ds = dataset["train"]

# Subset with judgments dated 2020-01-01 or later; it is taken before the
# year/month columns are added, so it does not carry them.
ds_recent = ds.filter(lambda row: row[date_column] >= datetime(2020, 1, 1))

# Derive both calendar columns in a single pass over the data instead of one
# multi-process pass per column; the resulting columns ("year", then "month")
# are the same.
ds = ds.map(
    lambda row: {"year": row[date_column].year, "month": row[date_column].month},
    num_proc=15,
)

In [None]:
from matplotlib import pyplot as plt

# One histogram of judgment dates per subset: the full sample first, then the
# subset restricted to 2020 and later.
for judgment_dates, figure_title in (
    (ds[date_column], "Number of datapoints across time"),
    (ds_recent[date_column], "Number of datapoints across time since 2020"),
):
    sns.histplot(judgment_dates)
    plt.title(figure_title)
    plt.show()

In [None]:
# Move the sample to pandas and count the records of every (year, month) pair.
df = ds.to_pandas()
monthly_counts = df.groupby(["year", "month"]).size().reset_index(name="count")

# 2024 is drawn separately so that it can be highlighted.
is_2024 = monthly_counts["year"] == 2024

# All years except 2024, one hue per year.
g = sns.lineplot(data=monthly_counts[~is_2024], x="month", y="count", hue="year")

# 2024 on top, as a thick red line with its own legend entry.
sns.lineplot(
    data=monthly_counts[is_2024],
    x="month",
    y="count",
    color="red",
    linewidth=3,
    label="2024",
)

# Labels, title and one tick per calendar month.
g.set_xlabel("Month")
g.set_ylabel("Number of Entries")
g.set_title("Number of Entries per Month by Year")
g.set_xticks(range(1, 13))

plt.show()

In [None]:
import pandas as pd

columns_to_examine = ["Reasons for judgment"]

for column in columns_to_examine:
    # Boolean helper column marking the rows where `column` is missing.
    na_flag = f"{column}_is_na"
    df[na_flag] = df[column].isna()

    # Share of missing values per (year, month), for years after 2019 only.
    # The aggregated column is called "count" although it holds a mean.
    since_2020 = df.loc[df["year"] > 2019]
    monthly_counts = since_2020.groupby(["year", "month"])[na_flag].mean().reset_index(name="count")

    in_2024 = monthly_counts["year"] == 2024

    # Every year except 2024, one hue per year.
    ax = sns.lineplot(data=monthly_counts[~in_2024], x="month", y="count", hue="year")

    # 2024 highlighted as a thick red line.
    sns.lineplot(
        data=monthly_counts[in_2024],
        x="month",
        y="count",
        color="red",
        linewidth=3,
        label="2024",
    )

    ax.set_title(f"Ratio of {column} that is NA")
    plt.show()

    # Month-start timestamp for every row; neither plot in this cell uses it.
    df["YEAR_MONTH"] = pd.to_datetime(
        df["year"].astype(str) + "_" + df["month"].astype(str),
        format="%Y_%m",
    )

    # Same ratio over all years, aggregated per year by seaborn.
    sns.lineplot(data=df, x="year", y=na_flag)
    plt.title(f"Ratio of {column} that is NA")
    plt.show()

In [None]:
# Count judgments per (year, month, Finality value), then spread the Finality
# values into columns so both statuses can be compared month by month.
monthly_counts = df.groupby(["year", "month", "Finality"])["id"].count().reset_index(name="count")
monthly_counts = monthly_counts.pivot(index=["year", "month"], columns="Finality", values="count")

# After the pivot, a month in which one status never occurs holds NaN in that
# column. Treat it as a zero count so the ratio becomes 0 or 1 instead of NaN
# (which would leave gaps in the line).
non_final_counts = monthly_counts["orzeczenie nieprawomocne"].fillna(0)
final_counts = monthly_counts["orzeczenie prawomocne"].fillna(0)

# Share of non-final judgments among final + non-final ones.
monthly_counts["Finality ratio"] = non_final_counts / (non_final_counts + final_counts)

monthly_counts = monthly_counts.reset_index()

# Month-start timestamp used as the x axis. A real datetime (rather than a
# "2020_1"-style string) keeps the months in chronological order.
monthly_counts["YEAR_MONTH"] = pd.to_datetime(
    monthly_counts["year"].astype(str) + "_" + monthly_counts["month"].astype(str),
    format="%Y_%m",
)

fig, ax = plt.subplots(figsize=(13, 4))

# One point per month, matching the "Year-Month" axis label; plotting against
# "year" would average the monthly ratios within each year.
sns.lineplot(
    data=monthly_counts,
    x="YEAR_MONTH",
    y="Finality ratio",
    ax=ax,
)

ax.set_title("Ratio of Finality")
ax.set_xlabel("Year-Month")
ax.set_ylabel("Ratio of Finality")
plt.xticks(rotation=90, ha="right")

plt.show()

1. The finality ratio has been increasing over time, with a spike in 2024.
2. The ratio of judgments with missing reasons has been increasing since 2000, with a spike in the last month before the data was scraped.

What to do?
1. Reacquire datapoints with missing reasons.
2. Reacquire datapoints that are not final.
3. Do not reacquire datapoints that are older than a chosen threshold.


In [None]:
# Rows that either lack reasons or carry the non-final status.
incomplete_all = df["Reasons for judgment_is_na"] | df["Finality"].eq("orzeczenie nieprawomocne")

print("% of datapoints with missing reasons or not final: ")
print(f"{incomplete_all.mean():.2%}")

In [None]:
# Same measure as for the whole sample, restricted to rows with year >= 2024.
recent_df = df[df["year"] >= 2024]
incomplete_recent = recent_df["Reasons for judgment_is_na"] | recent_df["Finality"].eq(
    "orzeczenie nieprawomocne"
)

print("% of datapoints with missing reasons or not final in 2024: ")
print(f"{incomplete_recent.mean():.2%}")

## Experiment with newly acquired dataset for 01.01.2024 - 05.09.2024

### How many datapoints were in the old dataset?

In [None]:
# Newly acquired data, read from the local parquet directory.
new_dataset_dir = ROOT_PATH / "data/datasets/nsa/dataset/"
new_df = pd.read_parquet(new_dataset_dir)

# Reload the published dataset from scratch: the `dataset` object used so far
# had its train split reduced to a 20% sample and its dates parsed.
dataset = load_dataset("JuDDGES/nsa")

In [11]:
# IDs of every record in the train split of the old dataset.
olds_ids = set(dataset["train"]["id"])

# Mark each newly acquired record that already existed in the old dataset.
new_df["was_in_old_dataset"] = new_df["id"].isin(olds_ids)
new_df["date"] = pd.to_datetime(new_df["judgment_date"])

# Keep judgments dated before September. `.copy()` makes the filtered result
# an independent frame, so the columns added to `new_df` in later cells do not
# trigger pandas' SettingWithCopyWarning.
new_df = new_df[new_df["date"].dt.month < 9].copy()

In [None]:
# Fraction of newly acquired records whose id is absent from the old dataset.
share_not_in_old = (~new_df["was_in_old_dataset"]).mean()
print(f"% of datapoints that were not in the old dataset: {share_not_in_old:.2%}")

In [None]:
# Share of new records already present in the old dataset, by judgment date.
sns.lineplot(x="date", y="was_in_old_dataset", data=new_df)

In [14]:
# Calendar components of the judgment date, used for the monthly grouping.
judgment_date_parts = new_df["date"].dt
new_df["month"] = judgment_date_parts.month
new_df["year"] = judgment_date_parts.year

In [None]:
# Count new-dataset records per (year, month, membership flag), then put the
# two membership values (False / True) side by side for every month.
monthly_counts = (
    new_df.groupby(["year", "month", "was_in_old_dataset"])["id"].count().reset_index(name="count")
)
monthly_counts = monthly_counts.pivot(index=["month"], columns="was_in_old_dataset", values="count")

# A month in which all records are new (or none are) holds NaN in the other
# column after the pivot. Treat it as a zero count so the ratio is 1 (or 0)
# instead of NaN.
not_in_old_counts = monthly_counts[False].fillna(0)
in_old_counts = monthly_counts[True].fillna(0)

# Share of records that were NOT in the old dataset.
monthly_counts["ratio"] = not_in_old_counts / (not_in_old_counts + in_old_counts)

monthly_counts = monthly_counts.reset_index()

fig, ax = plt.subplots(figsize=(13, 4))
sns.lineplot(
    data=monthly_counts,
    x="month",
    y="ratio",
    ax=ax,
)

ax.set_title("Ratio of datapoints that were not in the old dataset")
ax.set_xlabel("Month")
ax.set_ylabel("Ratio of datapoints that were not in the old dataset")
plt.xticks(rotation=90, ha="right")

plt.show()

### How many OLD datapoints were in the NEW dataset?

In [None]:
date_column = "The day of the judgment"

# Fresh copy of the old dataset, reduced to records whose date string starts
# with "2024-" (the dates are still plain strings at this point).
dataset = load_dataset("JuDDGES/nsa").filter(
    lambda row: row[date_column].startswith("2024-"),
    num_proc=15,
)


def date_to_datetime(entry: dict) -> dict:
    """Parse the judgment date of one record into a ``datetime``.

    Identical to the helper of the same name defined earlier in this notebook.
    The string stored under ``date_column`` is read as ``%Y-%m-%d`` and
    replaced, in the same dict, by the parsed ``datetime``.

    Args:
        entry: A single record containing ``date_column``.

    Returns:
        The same ``entry`` object, with the date field converted.

    Raises:
        ValueError: If the stored string does not match ``%Y-%m-%d``.
    """
    raw_date = entry[date_column]
    entry[date_column] = datetime.strptime(raw_date, "%Y-%m-%d")
    return entry


# Convert the judgment-date strings to datetime objects.
dataset = dataset.map(date_to_datetime, num_proc=15)

# Derive "year" and "month" in a single pass over the train split instead of
# one multi-process pass per column; the resulting columns are the same.
ds = dataset["train"].map(
    lambda row: {"year": row[date_column].year, "month": row[date_column].month},
    num_proc=15,
)

# Keep months before September, the same cut-off applied to `new_df`.
ds = ds.filter(lambda row: row["month"] < 9, num_proc=15)

In [17]:
# Old-dataset slice as a DataFrame, with a flag telling whether each record's
# id also appears in the newly acquired data.
df = ds.to_pandas()
new_dataset_ids = set(new_df["id"])
df["is_in_new_dataset"] = df["id"].isin(new_dataset_ids)

In [None]:
# Fraction of old records whose id is absent from the new dataset.
share_not_in_new = (~df["is_in_new_dataset"]).mean()
print(f"% of datapoints that are not in the new dataset: {share_not_in_new:.2%}")

In [None]:
# Sorted judgment dates of the old slice, to inspect the covered date range.
old_judgment_dates = df[date_column]
old_judgment_dates.sort_values()

In [None]:
# Sorted judgment dates of the new data, to inspect the covered date range.
new_judgment_dates = new_df["date"]
new_judgment_dates.sort_values()

In [None]:
# Number of old records per (membership flag, month) combination.
membership_by_month = df.groupby(["is_in_new_dataset", "month"])["id"].count()
membership_by_month.reset_index(name="count")

### Do Reasons and finality change?

In [None]:
print("Year 2024")

# Share of old-slice rows without reasons.
missing_reasons_share = df["Reasons for judgment"].isna().mean()
print(f"% of datapoints with missing reasons: {missing_reasons_share:.2%}")

# Share of old-slice rows carrying the non-final status.
non_final_share = df["Finality"].eq("orzeczenie nieprawomocne").mean()
print(f"% of datapoints that are not final: {non_final_share:.2%}")

In [None]:
# Old records that lacked reasons and whose id also exists in the new data.
ids_in_new = set(new_df["id"])
lacked_reasons = df["Reasons for judgment"].isna() & df["id"].isin(ids_in_new)
ids = df.loc[lacked_reasons, "id"]

# Among those ids, the share whose reasons are still missing in the new data;
# its complement is reported as "changed".
still_missing_share = new_df.loc[new_df["id"].isin(ids), "reasons"].isna().mean()

print(f"% of datapoints with changed reasons: {1 - still_missing_share:.2%}")

In [None]:
# Two example records from the selected ids that now do have reasons.
reacquired = new_df[new_df["id"].isin(ids)]
reacquired[reacquired["reasons"].notna()].head(2)

In [None]:
# Side-by-side look at one document: reasons in the old vs. the new dataset.
reasons_example_id = "/doc/39457A5C4B"
display(df.loc[df["id"] == reasons_example_id, "Reasons for judgment"])
display(new_df.loc[new_df["id"] == reasons_example_id, "reasons"])

In [None]:
# Frequency of each Finality status in the old slice.
finality_status = df["Finality"]
finality_status.value_counts()

In [None]:
# Old records marked non-final whose id also exists in the new data.
was_non_final = df["Finality"].eq("orzeczenie nieprawomocne") & df["id"].isin(set(new_df["id"]))
ids = df.loc[was_non_final, "id"]

# The same documents as they appear in the new data.
df_ = new_df[new_df["id"].isin(ids)]

# Share of them whose status in the new data is no longer the non-final one.
changed_finality_share = df_["finality"].ne("orzeczenie nieprawomocne").mean()

print(f"% of datapoints with changed finality: {changed_finality_share:.2%}")

In [None]:
# Two example records whose finality differs from the non-final status.
finality_changed = df_["finality"].ne("orzeczenie nieprawomocne")
df_[finality_changed].head(2)

In [None]:
# Side-by-side look at one document: finality in the old vs. the new dataset.
finality_example_id = "/doc/F9FAF349CB"
display(df.loc[df["id"] == finality_example_id, "Finality"])
display(new_df.loc[new_df["id"] == finality_example_id, "finality"])