In [1]:
from pathlib import Path

# Input JSON file; it is loaded with `Dataset.from_json`, and the generated
# `masked.json` is written next to it (into `DATASET.parent`).
DATASET = Path("../data/err-0.8/test.json")

# Checkpoint location passed to both the tokenizer and the token-classification pipeline.
MODEL = "./model/roberta-error-detection"

In [2]:
from datasets import Dataset
import math

# Load the JSON file at DATASET; the path is handed to `from_json` as a plain string.
dataset = Dataset.from_json(str(DATASET))

def parse_dataset(example):
    """Turn the token sequences of one example into space-separated strings.

    Args:
        example: Mapping with "sentence" and "error" entries, each an
            iterable of strings (they are passed straight to ``str.join``).

    Returns:
        A new dict with exactly the keys "sentence" and "error", whose values
        are the corresponding tokens joined by a single space.
    """
    return {field: " ".join(example[field]) for field in ("sentence", "error")}

# Upper bound on the number of rows handled per shard in the masking loop.
SHARD_SIZE = 1000

# Number of shards needed so that no shard holds more than SHARD_SIZE rows.
SHARDS = math.ceil(len(dataset) / SHARD_SIZE)

# Join each example's token lists into plain strings (one example per call,
# spread over 4 worker processes).
dataset = dataset.map(parse_dataset, batched=False, num_proc=4)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 10000 examples [00:00, 517215.08 examples/s]
Map (num_proc=4): 100%|██████████| 10000/10000 [00:00<00:00, 53610.44 examples/s]


In [3]:
from transformers import pipeline, AutoTokenizer

# Tokenizer and model are both loaded from the single MODEL constant defined in
# the configuration cell, so the two can never point at different checkpoints.
tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=512)

# aggregation_strategy="simple" makes the pipeline return grouped spans carrying
# "entity_group", "start" and "end", which the masking loop relies on.
# device=0 selects the first GPU.
token_classifier = pipeline(
    "token-classification",
    model=MODEL,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0,
)

2024-05-09 02:12:11.696517: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-09 02:12:11.723721: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from tqdm.auto import tqdm

# Placeholder inserted in place of each detected error span.
MASK_TOKEN = "[MASK]"
# Entity group the classifier assigns to spans that should be masked.
ERROR_LABEL = "LABEL_1"
# Masked sentences longer than this many characters are dropped.
MAX_MASKED_CHARS = 128


def mask_error_spans(errored_sentence, labels):
    """Build one masked variant of ``errored_sentence`` per detected error span.

    Args:
        errored_sentence: The sentence the classifier was run on.
        labels: Iterable of dicts with the keys "entity_group", "start" and
            "end"; "start"/"end" are used as slice indices into
            ``errored_sentence``.

    Returns:
        A list of strings, one per span whose "entity_group" equals
        ERROR_LABEL, in which the span is replaced by MASK_TOKEN. Variants
        longer than MAX_MASKED_CHARS characters (measured before the
        whole-word expansion below) are skipped.
    """
    masked_sentences: list[str] = []

    for label in labels:
        if label["entity_group"] != ERROR_LABEL:
            continue

        masked = errored_sentence[: label["start"]] + MASK_TOKEN + errored_sentence[label["end"] :]
        if len(masked) > MAX_MASKED_CHARS:
            continue

        # NOTE: a mask that lands inside a word is unusable, so every
        # space-delimited word containing the mask is replaced by the mask alone.
        words = [MASK_TOKEN if MASK_TOKEN in word else word for word in masked.split(" ")]
        masked_sentences.append(" ".join(words))

    return masked_sentences


# One record per input row: the clean sentence, the errored sentence and all
# masked variants of the errored sentence.
output: list[dict] = []

for shard_index in tqdm(range(SHARDS), desc="Creating masked sentences."):
    _dataset = dataset.shard(num_shards=SHARDS, index=shard_index)

    # Read each column once per shard instead of once per row.
    sentences = _dataset["sentence"]
    errors = _dataset["error"]

    for j, labels in enumerate(token_classifier(errors, batch_size=32)):
        output.append({
            "sentence": sentences[j],
            "error": errors[j],
            "masked": mask_error_spans(errors[j], labels),
        })

# Write all records next to the input file; kept as the cell's last expression
# so the value returned by `to_json` is still displayed.
Dataset.from_list(output).to_json(DATASET.parent / "masked.json")

Creating masked sentences.: 100%|██████████| 10/10 [01:19<00:00,  7.96s/it]
Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 251.06ba/s]


4863154