In [8]:
from pathlib import Path
from tqdm import tqdm

# Enable the tqdm-backed `progress_apply` helpers on pandas objects.
tqdm.pandas()

# Input data set (JSON Lines).
# Alternative input: Path("../data/err-0.5/test.jsonl")
DATASET = Path("..") / "gpt-correction.jsonl"

# Directory in which the fine-tuned error-detection model is saved.
MODEL = "./model/roberta-error-detection"

In [9]:
import pandas as pd

# Load the data set: one JSON record per line.
df = pd.read_json(DATASET, orient="records", lines=True)

# Fail fast with a readable message: the masking step below reads both of
# these columns, and a wrong data set would otherwise only surface after the
# model has been loaded and inference has started.
assert {"sentence", "error"} <= set(df.columns), (
    f"{DATASET} must provide 'sentence' and 'error' columns, got {list(df.columns)}"
)

# Optional preprocessing, currently disabled. It joins the items of both
# columns with spaces — presumably needed for data sets that store sentences
# as token lists (TODO confirm against the alternative data set).
# def parse_dataset(example: pd.Series):
#     return " ".join(example["sentence"]), " ".join(example["error"])

# df["sentence"], df["error"] = zip(*df.progress_apply(parse_dataset, axis=1))

100%|██████████| 10000/10000 [00:00<00:00, 228057.90it/s]


### Create pipeline for token classification

To use our fine-tuned model, we need to provide the directory where it is saved. Using `pipeline` from the Hugging Face `transformers` library, we can easily pass input to the model.

In [10]:
import torch
from transformers import pipeline, AutoTokenizer

# MODEL (directory of the fine-tuned checkpoint) is defined once, in the
# configuration cell at the top of the notebook; it is not repeated here so
# the two values cannot drift apart.

# Maximum number of tokens the tokenizer accepts per input.
MODEL_MAX_LENGTH = 124

# Run on the first CUDA GPU when one is available, otherwise on the CPU (-1).
# A hard-coded `device=0` makes this cell fail on machines without a GPU.
DEVICE = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(MODEL, model_max_length=MODEL_MAX_LENGTH)

token_classifier = pipeline(
    "token-classification",
    model=MODEL,
    tokenizer=tokenizer,
    # Group neighbouring tokens that share a label into one entity group, so
    # each result carries `entity_group`, `start` and `end` for a text span.
    aggregation_strategy="simple",
    device=DEVICE,
)

In [11]:
# Smoke test: classify one noisy sentence and display only the first two
# aggregated groups.
sample_sentence = "K Schumann v 6díle GesamtbGeschreibung er Kakteen 1898 v tomto rodu vytvořil pro tyto rostliny podroId Notocactus"
token_classifier(sample_sentence)[:2]

[{'entity_group': 'LABEL_1',
  'score': 0.8531505,
  'word': ' K',
  'start': 0,
  'end': 1},
 {'entity_group': 'LABEL_0',
  'score': 0.99065715,
  'word': ' Schumann v',
  'start': 2,
  'end': 12}]

### Detect erroneous words / text

The dataset is processed in chunks. For each sentence in the dataset, every token is classified as either `0` (correct) or `1` (incorrect).

In [12]:
from tqdm.auto import tqdm
from typing import Any
from util import optimal_split
import json

# Label the classifier assigns to a group of incorrect tokens.
ERROR_LABEL = "LABEL_1"
# Placeholder written in place of every word flagged as incorrect.
MASK_TOKEN = "[MASK]"


def mask_flagged_words(text: str, labels: list[dict[str, Any]]) -> str:
    """Replace every word touched by an error-labelled span with ``[MASK]``.

    Args:
        text: Sentence whose words are separated by single spaces.
        labels: Aggregated pipeline results for ``text``; each item provides
            ``entity_group`` plus the character offsets ``start`` and ``end``.

    Returns:
        ``text`` with each word that overlaps an ``ERROR_LABEL`` span replaced
        by ``MASK_TOKEN``; all other words are left untouched.

    Raises:
        ValueError: If an error-labelled span has no character offsets.
    """
    words = text.split(" ")

    # Map every character position to the index of the word it belongs to,
    # e.g. a four-letter first word gives [0, 0, 0, 0]; the separating space
    # after each word is marked with None.
    char_to_word: list[Any] = []
    for word_index, word in enumerate(words):
        char_to_word.extend([word_index] * len(word))
        char_to_word.append(None)

    for label in labels:
        if label["entity_group"] != ERROR_LABEL:
            continue

        start, end = label["start"], label["end"]
        if start is None or end is None:
            # Without offsets the slice below would become [None:None], i.e.
            # the whole sentence, and every word would be masked silently.
            raise ValueError(
                "Classifier returned no character offsets; a fast tokenizer is required."
            )

        # Every word that overlaps the flagged character span gets masked.
        for word_index in set(char_to_word[start:end]) - {None}:
            words[word_index] = MASK_TOKEN

    return " ".join(words)


output: list[Any] = []

data_frames = optimal_split(df)

for _df in tqdm(data_frames, desc="Creating masked sentences.", total=len(data_frames)):
    # Pull both columns out once per chunk instead of doing a row lookup
    # (`_df.iloc[j]`) several times per sentence.
    sentences = _df["sentence"].to_list()
    errors = _df["error"].to_list()

    predictions = token_classifier(errors, batch_size=32)

    for sentence, error, labels in zip(sentences, errors, predictions):
        output.append({
            "sentence": sentence,
            "error": error,
            "masked": mask_flagged_words(error, labels),
        })


# NOTE(review): this writes ONE JSON array, although the file name ends in
# `.jsonl` and the input above is read as JSON Lines. The format is kept
# unchanged here because downstream readers are not visible from this
# notebook — confirm what they expect before switching to one object per line.
with open("../data/masked.jsonl", "w") as f:
    json.dump(output, f)

  return bound(*args, **kwds)


Creating masked sentences.:   0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
from pprint import pprint

# Inspect the first generated record (original, erroneous and masked text).
first_record = output[0]
pprint(first_record)

{'error': 'Hloubková sondáž byla ražena ze severu v\xa0délce 94\xa0km od '
          'Frutigenu z\xa0okresu Kandersteg v\xa0letech 1994 až 1996',
 'masked': 'Hloubková sondáž byla ražena ze severu v\xa0délce 94\xa0km od '
           'Frutigenu z\xa0okresu Kandersteg v\xa0letech 1994 až 1996',
 'sentence': 'Hloubková sondáž byla ražena ze severu v\xa0délce 94\xa0km od '
             'Frutigenu z\xa0okresu Kandersteg v\xa0letech 1994 až 1996'}
