In [2]:
from datasets import load_from_disk, concatenate_datasets, Dataset
from multiprocessing import cpu_count
import pandas as pd

In [3]:
esnli_phenomena = load_from_disk("../../lit-data/datasets/esnli_phenomena").to_pandas()
esnli_evaluations = load_from_disk("../../lit-data/datasets/esnli_evaluations_42/esnli_evaluations_chunk_0")
for i in range(1, 5):
    esnli_evaluations_chunk = load_from_disk(f"../../lit-data/datasets/esnli_evaluations_42/esnli_evaluations_chunk_{i}")
    esnli_evaluations = concatenate_datasets([esnli_evaluations, esnli_evaluations_chunk])
esnli_evaluations = esnli_evaluations.to_pandas()

In [4]:
esnli_evaluations.info()
esnli_phenomena.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9842 entries, 0 to 9841
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   premise                  9842 non-null   object
 1   hypothesis               9842 non-null   object
 2   label                    9842 non-null   int64 
 3   explanation_1            9842 non-null   object
 4   explanation_2            9842 non-null   object
 5   explanation_3            9842 non-null   object
 6   sentence1_highlighted_1  9842 non-null   object
 7   sentence2_highlighted_1  9842 non-null   object
 8   sentence1_highlighted_2  9842 non-null   object
 9   sentence2_highlighted_2  9842 non-null   object
 10  sentence1_highlighted_3  9842 non-null   object
 11  sentence2_highlighted_3  9842 non-null   object
 12  ferret_explanations      9842 non-null   object
 13  evaluations              9842 non-null   object
dtypes: int64(1), object(13)
memory usage: 1.

In [5]:
def merge(record):
    candidate_partners = esnli_evaluations.filter(lambda r: r["premise"] == record["premise"] and r["hypothesis"] == record["hypothesis"])
    if candidate_partners.num_rows > 1:
        print(f"Warning: Duplicate premise hypothesis pair for record: {record}")
    if candidate_partners.num_rows == 0:
        raise RuntimeError(f"Missing premise hypothesis pair for record: {record}")
    return {**record, **candidate_partners[0]}

esnli_evaluations_phenomena_pd = pd.merge(esnli_evaluations, esnli_phenomena, on=["premise", "hypothesis"], suffixes=('', '_DROP')).filter(regex='^(?!.*_DROP)')
esnli_evaluations_phenomena = Dataset.from_pandas(esnli_evaluations_phenomena_pd)
print(esnli_evaluations_phenomena)

Dataset({
    features: ['premise', 'hypothesis', 'label', 'explanation_1', 'explanation_2', 'explanation_3', 'sentence1_highlighted_1', 'sentence2_highlighted_1', 'sentence1_highlighted_2', 'sentence2_highlighted_2', 'sentence1_highlighted_3', 'sentence2_highlighted_3', 'ferret_explanations', 'evaluations', 'premise_highlighted_1', 'hypothesis_highlighted_1', 'premise_highlighted_2', 'hypothesis_highlighted_2', 'premise_highlighted_3', 'hypothesis_highlighted_3', 'premise_marked_1', 'hypothesis_marked_1', 'premise_marked_2', 'hypothesis_marked_2', 'premise_marked_3', 'hypothesis_marked_3', 'premise_highlighted_words_1', 'hypothesis_highlighted_words_1', 'premise_highlighted_words_2', 'hypothesis_highlighted_words_2', 'premise_highlighted_words_3', 'hypothesis_highlighted_words_3', 'premise_highlighted_words_all', 'hypothesis_highlighted_words_all', 'synonym', 'antonym', 'hypernym', 'hyponym', 'co_hyponym', 'quantifiers', 'numericals', '__index_level_0__'],
    num_rows: 9846
})


In [6]:
esnli_evaluations_phenomena.save_to_disk("../../lit-data/datasets/esnli_evaluations_42_phenomena")

Saving the dataset (0/1 shards):   0%|          | 0/9846 [00:00<?, ? examples/s]