In [229]:
from pathlib import Path

# Dataset file loaded by this notebook; its grandparent directory is also
# where the masked output is written.
DATASET = Path("../data/err-0.5/test.json")

# Directory of the fine-tuned model, handed to both the tokenizer and the
# token-classification pipeline.
MODEL = "./model/roberta-error-detection"

In [230]:
import pandas as pd
import swifter

# The dataset is stored as JSON Lines: one record object per line.
df = pd.read_json(
    DATASET,
    orient="records",
    lines=True,
)

def parse_dataset(example: pd.Series):
    """Collapse the token sequences of one dataset row into plain strings.

    Args:
        example: A row exposing ``"sentence"`` and ``"error"`` entries, each
            an iterable of string tokens.

    Returns:
        A ``(sentence, error)`` tuple in which the tokens of each entry are
        joined with single spaces.
    """
    return tuple(" ".join(example[column]) for column in ("sentence", "error"))

# Apply parse_dataset to every row (swifter renders the progress bar), then
# unzip the resulting (sentence, error) pairs back into the two columns.
_joined_rows = df.swifter.progress_bar(desc='Joining sentences').apply(parse_dataset, axis=1)
df["sentence"], df["error"] = zip(*_joined_rows)

Joining sentences: 100%|██████████| 10000/10000 [00:00<00:00, 205144.58it/s]


### Create pipeline for token classification

To use our fine-tuned model, we need to provide the directory where it is saved. The `pipeline` helper from the Hugging Face `transformers` library makes it easy to feed input to the model.

In [231]:
from transformers import pipeline, AutoTokenizer

# MODEL is defined once in the configuration cell at the top of the notebook.
# It is intentionally not re-declared here: a second assignment would silently
# override any change made in the configuration cell.

# NOTE(review): model_max_length=124 is an unusual limit; confirm it matches
# the sequence length used when the model was fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=124)

# aggregation_strategy="simple" makes the pipeline return grouped spans; the
# masking step below reads their "entity_group", "start" and "end" fields.
# NOTE(review): device=0 pins inference to accelerator ordinal 0; a CPU-only
# machine needs a different value here.
token_classifier = pipeline(
    "token-classification",
    model=MODEL,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0,
)

### Detect erroneous words / text

The dataset is processed in chunks. For each sentence in the dataset, every token is classified as either `0` (correct) or `1` (incorrect).

In [232]:
from tqdm.auto import tqdm
from typing import Any
from util import optimal_split
import json

def mask_flagged_words(text: str, labels: list[dict[str, Any]]) -> str:
    """Replace every word touched by a ``LABEL_1`` span with ``[MASK]``.

    Args:
        text: Sentence to mask; it is split into words on single spaces.
        labels: Groups returned by the token-classification pipeline for
            ``text``. Each group is read for its ``"entity_group"``,
            ``"start"`` and ``"end"`` keys; ``start``/``end`` are used as
            character offsets into ``text``.

    Returns:
        The words re-joined with single spaces, with each flagged word
        replaced by ``[MASK]``.
    """
    words: list[str] = text.split(" ")

    # Map every character position of `text` to the index of the word it
    # belongs to, e.g. a 4-letter first word contributes [0, 0, 0, 0, None];
    # the trailing None stands for the space that follows the word.
    char_to_word: list[Any] = []
    for word_index, word in enumerate(words):
        char_to_word.extend([word_index] * len(word))
        char_to_word.append(None)

    for group in labels:
        # Only groups predicted as incorrect lead to masking.
        if group["entity_group"] != "LABEL_1":
            continue
        # Every word overlapping the flagged character span gets masked;
        # separator positions (None) do not belong to any word.
        flagged = set(char_to_word[group["start"]:group["end"]])
        flagged.discard(None)
        for word_index in flagged:
            words[word_index] = "[MASK]"

    return " ".join(words)


output: list[Any] = []

data_frames = optimal_split(df)

for _df in tqdm(data_frames, desc="Creating masked sentences.", total=len(data_frames)):
    # Pull each column out once per chunk instead of doing a positional row
    # lookup for every field of every row.
    sentences = _df["sentence"].to_list()
    errors = _df["error"].to_list()

    # The classifier yields one list of label groups per input sentence, in
    # input order, so results can be paired with their rows positionally.
    predictions = token_classifier(errors, batch_size=32)

    for sentence, error, labels in zip(sentences, errors, predictions):
        output.append({
            "sentence": sentence,
            "error": error,
            "masked": mask_flagged_words(error, labels)
        })

(DATASET.parent.parent / "masked.json").write_text(json.dumps(output, indent=2))

Creating masked sentences.: 100%|██████████| 10/10 [00:15<00:00,  1.50s/it]


4864273