In [12]:
from datasets import load_dataset
import re
from nltk.corpus import wordnet
import nltk

In [13]:
nltk.download('wordnet')
esnli = load_dataset("../datasets/esnli.py")
esnli.data

[nltk_data] Downloading package wordnet to /Users/I518253/nltk_data...
Found cached dataset esnli (/Users/I518253/.cache/huggingface/datasets/esnli/plain_text/0.0.2/64fd5bee4cf6dcae59e2b804162412bbe9646aab00da31dac1cecc0ad4f798fd)


  0%|          | 0/3 [00:00<?, ?it/s]

{'train': MemoryMappedTable
 premise: string
 hypothesis: string
 label: int64
 explanation_1: string
 explanation_2: string
 explanation_3: string
 premise_highlighted_1: string
 hypothesis_highlighted_1: string
 premise_highlighted_2: string
 hypothesis_highlighted_2: string
 premise_highlighted_3: string
 hypothesis_highlighted_3: string
 premise_marked_1: string
 hypothesis_marked_1: string
 premise_marked_2: string
 hypothesis_marked_2: string
 premise_marked_3: string
 hypothesis_marked_3: string
 ----
 premise: [["A person on a horse jumps over a broken down airplane.","A person on a horse jumps over a broken down airplane.","A person on a horse jumps over a broken down airplane.","Children smiling and waving at camera","Children smiling and waving at camera",...,"A child with a yellow cup and milk all over his face.","A child with a yellow cup and milk all over his face.","Two barefoot men are playing on a green lawn outside a building with other people in the background.","Two

In [14]:
def transform_highlighted(record: dict) -> dict:
    highlighted_premise_all = set()
    highlighted_hypothesis_all = set()
    for i in range(1, 4):
        highlighted_current_premise = get_words_at_indices(record["premise"], parse_indices(record[f"premise_highlighted_{i}"]))
        highlighted_premise_all.update(highlighted_current_premise)
        record[f"premise_highlighted_{i}"] = ",".join(highlighted_current_premise)
        highlighted_current_hypothesis = get_words_at_indices(record["hypothesis"], parse_indices(record[f"hypothesis_highlighted_{i}"]))
        highlighted_hypothesis_all.update(highlighted_current_hypothesis)
        record[f"hypothesis_highlighted_{i}"] = ",".join(highlighted_current_hypothesis)
    record["highlighted_premise_all"] = ",".join(highlighted_premise_all)
    record["highlighted_hypothesis_all"] = ",".join(highlighted_hypothesis_all)
    return record

def parse_indices(indices: str) -> list[int]:
    if indices in [r"{}", ""]:
        return []
    return [int(i) for i in indices.split(",")]

def get_words_at_indices(string: str, indices: list[int]) -> str:
    split_string = string.split(" ")
    return filter(lambda word: word != "", map(lambda i: re.sub(r"[.,!?]", "", split_string[i]), indices))

splits = ["train", "test", "validation"]
for split in splits:
    esnli[split] = esnli[split].map(transform_highlighted, num_proc=8)

esnli.data

Loading cached processed dataset at /Users/I518253/.cache/huggingface/datasets/esnli/plain_text/0.0.2/64fd5bee4cf6dcae59e2b804162412bbe9646aab00da31dac1cecc0ad4f798fd/cache-91045558473ae8b9_*_of_00008.arrow
Loading cached processed dataset at /Users/I518253/.cache/huggingface/datasets/esnli/plain_text/0.0.2/64fd5bee4cf6dcae59e2b804162412bbe9646aab00da31dac1cecc0ad4f798fd/cache-63693622c32f55df_*_of_00008.arrow
Loading cached processed dataset at /Users/I518253/.cache/huggingface/datasets/esnli/plain_text/0.0.2/64fd5bee4cf6dcae59e2b804162412bbe9646aab00da31dac1cecc0ad4f798fd/cache-1da715ce42a35d5b_*_of_00008.arrow


{'train': ConcatenationTable
 premise: string
 hypothesis: string
 label: int64
 explanation_1: string
 explanation_2: string
 explanation_3: string
 premise_highlighted_1: string
 hypothesis_highlighted_1: string
 premise_highlighted_2: string
 hypothesis_highlighted_2: string
 premise_highlighted_3: string
 hypothesis_highlighted_3: string
 premise_marked_1: string
 hypothesis_marked_1: string
 premise_marked_2: string
 hypothesis_marked_2: string
 premise_marked_3: string
 hypothesis_marked_3: string
 highlighted_premise_all: string
 highlighted_hypothesis_all: string
 ----
 premise: [["A person on a horse jumps over a broken down airplane.","A person on a horse jumps over a broken down airplane.","A person on a horse jumps over a broken down airplane.","Children smiling and waving at camera","Children smiling and waving at camera",...,"A child with a yellow cup and milk all over his face.","A child with a yellow cup and milk all over his face.","Two barefoot men are playing on a gr

In [15]:
def get_antonyms(word: str) -> set[str]:
    antonyms = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            if lemma.antonyms():
                antonyms.append(lemma.antonyms()[0].name())
    return set(antonyms)

def important_words_contain_antonym(record):
    important_premise = set(record["highlighted_premise_all"].split(","))
    important_hypothesis = set(record["highlighted_hypothesis_all"].split(","))
    for word_premise in important_premise:
        antonyms = get_antonyms(word_premise)
        if len(antonyms.intersection(important_hypothesis)) > 0:
            return True
    return False

esnli_test_antonyms = esnli["test"].filter(important_words_contain_antonym, num_proc=8)

Filter (num_proc=8):   0%|          | 0/9824 [00:00<?, ? examples/s]

ConcatenationTable
premise: string
hypothesis: string
label: int64
explanation_1: string
explanation_2: string
explanation_3: string
premise_highlighted_1: string
hypothesis_highlighted_1: string
premise_highlighted_2: string
hypothesis_highlighted_2: string
premise_highlighted_3: string
hypothesis_highlighted_3: string
premise_marked_1: string
hypothesis_marked_1: string
premise_marked_2: string
hypothesis_marked_2: string
premise_marked_3: string
hypothesis_marked_3: string
highlighted_premise_all: string
highlighted_hypothesis_all: string
----
premise: [["This church choir sings to the masses as they sing joyous songs from the book at a church.","This church choir sings to the masses as they sing joyous songs from the book at a church.","This church choir sings to the masses as they sing joyous songs from the book at a church.","A woman with a green headscarf, blue shirt and a very big grin.","A woman with a green headscarf, blue shirt and a very big grin.",...,"Three children hold 

In [17]:
esnli_test_antonyms.to_csv("../datasets/esnli_antonyms.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

315616