In [19]:
import json
from typing import Any

import pandas as pd
import seaborn as sns
from datasets import Dataset, DatasetDict, load_dataset
from langchain_core.utils.json import parse_json_markdown
from tqdm.auto import tqdm

# Notebook-wide plotting config: switch to seaborn's default theme.
sns.set_theme()

In [6]:
# Load the two JSON files from the local dataset directory as the
# "test" and "annotated" splits of a single DatasetDict.
ds = load_dataset(
    path="data/datasets/pl/swiss_franc_loans_source",
    data_files=dict(test="test.json", annotated="annotated.json"),
)
ds

Generating test split: 0 examples [00:00, ? examples/s]

Generating annotated split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['context', 'output'],
        num_rows: 1396
    })
    annotated: Dataset({
        features: ['context', 'output'],
        num_rows: 690
    })
})

# Analysis

In [25]:
def get_test_items_matching_annotated(dataset: DatasetDict) -> Dataset:
    """Collect, for every annotated item, the single test item with the same context.

    Args:
        dataset: Container providing the ``"test"`` and ``"annotated"`` splits.
            Items of both splits must expose a ``"context"`` field.
            NOTE(review): context values are used as dict keys, so they are
            assumed to be hashable (e.g. strings) - confirm against the data.

    Returns:
        A ``Dataset`` built from the matched test items, in the order of the
        ``"annotated"`` split.

    Raises:
        ValueError: If the context of an annotated item matches zero or more
            than one item of the test split.
    """
    # Index the test split once by context, so each annotated item costs a
    # dict lookup instead of a full rescan of the test split.
    test_items_by_context: dict[Any, list[dict[str, Any]]] = {}
    for test_item in dataset["test"]:
        test_items_by_context.setdefault(test_item["context"], []).append(test_item)

    def get_matching_test_item(annotated_item: dict[str, Any]) -> dict[str, Any]:
        """Return the unique test item sharing ``annotated_item``'s context."""
        found_items = test_items_by_context.get(annotated_item["context"], [])
        if len(found_items) != 1:
            raise ValueError(f"Found {len(found_items)} items")
        return found_items[0]

    # No extra equality assert is needed: items are matched by context, so the
    # returned item's context equals the annotated one by construction.
    test_items_matching_annotated = [
        get_matching_test_item(annotated_item)
        for annotated_item in tqdm(dataset["annotated"], "Matching test and annotated items")
    ]

    return Dataset.from_pandas(pd.DataFrame(test_items_matching_annotated))

In [26]:
# Run the matching on the loaded splits; the result is only displayed as the
# cell output and is not stored in a variable.
get_test_items_matching_annotated(ds)

Matching test and annotated items:   0%|          | 0/690 [00:00<?, ?it/s]

Dataset({
    features: ['context', 'output'],
    num_rows: 690
})