In [41]:
from pathlib import Path

DATASET = Path("../data/err-0.8/masked.json")

MODEL = "ufal/robeczech-base"

In [42]:
from datasets import Dataset

dataset = Dataset.from_json(str(DATASET))

In [43]:
from transformers import pipeline

corrector = pipeline("fill-mask", model=MODEL, tokenizer=MODEL, device=0, top_k=1)

### Example usage

In [44]:
masked_sentences = dataset["masked"]

for predictions in corrector(masked_sentences[0]):
    print(predictions)

{'score': 0.9997588992118835, 'token': 2, 'token_str': '[SEP]', 'sequence': ''}


In [45]:
import math

SHARDS = math.ceil(len(dataset) / 1000)

In [46]:
from tqdm.auto import tqdm
from typing import Iterator
from transformers import Pipeline
from dataclasses import dataclass
import pandas as pd


@dataclass
class PredictionData:
    sentence: str
    error: str
    masked: list[str]
    predictions: list[dict[str, str]]


class ProcessPrediction:
    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline

    def process_prediction(self, data: list[PredictionData]) -> Iterator[str]:
        raise NotImplementedError("Implement this method")

def join_masked(row: list[str]) -> str | None:
    is_all_floats = all(isinstance(item, float) for item in row)
    if is_all_floats:
        return None

    masks: list[list[str]] = [
        masked.split() for masked in row
    ]

    sentence: list[str] = []

    for elements in zip(*masks):
        if "[MASK]" in elements:
            sentence.append("[MASK]")
        else:
            sentence.append(elements[0])
    
    return " ".join(sentence)

def create_dataset(pred: ProcessPrediction) -> Dataset:
    final_dataset = pd.DataFrame()

    for i in tqdm(range(SHARDS), desc="Filling masks for shards"):
        _dataset = dataset.shard(num_shards=SHARDS, index=i)

        # explode dataset so we can use it in the pipeline
        df = _dataset.to_pandas()

        df = df.explode("masked")
        df["replace"] = None
        df.reset_index(drop=True, inplace=True)

        not_nan = df[df["masked"].notnull()].index

        df_to_process = df.loc[not_nan]

        # apply pipeline
        predictions = corrector(df_to_process["masked"].to_list(), batch_size=32)

        # process predictions
        prediction_data: list[PredictionData] = [
            PredictionData(
                sentence=row.sentence,
                error=row.error,
                masked=row.masked,
                predictions=pred
            ) for row, pred in zip(df_to_process.itertuples(), predictions)
        ]

        for j, prediction in enumerate(pred.process_prediction(prediction_data)):
            df_to_process.loc[j, "replace"] = prediction

        df.loc[not_nan] = df_to_process
        
        # implode and merge
        df = df.groupby(["sentence", "error"]).agg(
            {
                "masked": list,
                "replace": list,
            }
        ).reset_index()

        # process masked sentences
        df["masked"] = df["masked"].apply(join_masked)

        final_dataset = pd.concat([final_dataset, df])

    return Dataset.from_pandas(final_dataset)

### Experiment 1
We will replace `[MASK]` with the suggestion that has biggest score.

In [47]:
class Experiment1(ProcessPrediction):
    def process_prediction(self, data: list[PredictionData]) -> Iterator[str]:
        for row in data:
            yield row.predictions[0]["token_str"].strip()

corrector = pipeline("fill-mask", model=MODEL, tokenizer=MODEL, device=0, top_k=1)
proc = Experiment1(corrector)

result = create_dataset(proc)
result.to_json(DATASET.parent / "result-experiment-1.json", orient="records", lines=True)

Filling masks for shards: 100%|██████████| 10/10 [00:31<00:00,  3.15s/it]
Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 250.03ba/s]


4397950

### Experiment 2

In [48]:
from nltk import edit_distance

class Experiment2(ProcessPrediction):
    def process_prediction(self, data: list[PredictionData]) -> Iterator[str]:
        for row in data:
            error = row.error.split(" ")
            masked = row.masked
            predictions = row.predictions
            index = masked.index("[MASK]")
            
            try:
                invalid_word = error[index]

                if invalid_word in [".", ",", "!", "?"]:
                    yield invalid_word

                best = min([(edit_distance(prediction["token_str"].strip(), invalid_word), prediction["token_str"].strip()) for prediction in predictions], key=lambda x: x[0])            

                yield best[1]
            except IndexError:
                yield row.predictions[0]["token_str"].strip()



corrector = pipeline("fill-mask", model=MODEL, tokenizer=MODEL, device=0, top_k=50)
proc = Experiment2(corrector)
result = create_dataset(proc)
result.to_json(DATASET.parent / "result-experiment-2.json", orient="records", lines=True)

Filling masks for shards: 100%|██████████| 10/10 [00:51<00:00,  5.20s/it]
Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 260.05ba/s]


4397556