It is finally time to train something.

In [1]:
import os
# Expose only the GPU with index 3 to CUDA. This is set before any CUDA-using
# library is imported so that the restriction is already in place when it
# initialises; the single visible device is what "cuda:0" refers to later on.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
from difflib import SequenceMatcher
import re
import json
from tqdm.notebook import tqdm
import random

In [3]:
# !pip install datasets transformers[torch]

We want to use all the data we have for the first attempt.

In [4]:
# JSONL files with the tn/itn pairs; all of them are loaded together below.
FILES = [
    "/home/jovyan/work/dataset/ficbook_pairs.jsonl",
    # The ficbook file should come first, as its index fields are str, not integers.
    # Otherwise the field types can be specified explicitly, like
    # `features=Features({'prompt': Value('string'), 'target': Value('string')})`
    "/home/jovyan/work/dataset/pikabu_pairs.jsonl",
    "/home/jovyan/work/dataset/librusec_pairs.jsonl"
]

In [5]:
from datasets import load_dataset, Features, Value, Dataset

# Read every file in FILES with the JSON loader; as the output below shows,
# all records end up in a single "train" split.
dataset = load_dataset('json', data_files=FILES)
# dataset = dataset['train']#.train_test_split(test_size=0.1)
# Display the dataset summary together with its first record.
dataset, dataset["train"][0]

(DatasetDict({
     train: Dataset({
         features: ['tn', 'itn', 'orig_index', 'text_index'],
         num_rows: 236074
     })
 }),
 {'tn': '\nОбед номер два прошел куда лучше.',
  'itn': 'Обед номер два прошел куда лучше.',
  'orig_index': 'https://ficbook.net/readfic/177065/644335',
  'text_index': 0})

# He obtayn

I decided to put the construction of the training examples alongside the training code itself because
* it is actually fast and
* I see the preprocessing as a part of the future model.

So, here is the code.
It finds the parts of the two lines that differ and constructs the "before" and "after" pair from them.
It also filters out identical pairs, since there is nothing to learn from them.

In [6]:
from typing import Optional


class Replace(dict):
    """One aligned fragment of a text pair, stored as a plain dict.

    Keys:
        type: tag of the fragment kind (the notebook uses "E" and "R").
        text_from: the fragment's text on the source side.
        text_to: the fragment's text on the other side; any falsy value
            (including the default ``None``) is stored as ``""``.

    Extra positional and keyword arguments are forwarded to ``dict``.
    """

    def __init__(
        self,
        type: str, text_from: str, text_to: Optional[str]=None,
        *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.update(type=type, text_from=text_from, text_to=text_to or "")

    @property
    def type(self):
        """Tag of the fragment kind."""
        return self["type"]

    @property
    def text_from(self):
        """Text of the fragment on the source side."""
        return self["text_from"]

    @property
    def text_to(self):
        """Text of the fragment on the other side ("" when absent)."""
        return self["text_to"]

    def extend(self, r):
        """Append the texts of ``r`` to this fragment in place.

        Raises:
            Exception: if ``r`` has a different type than this fragment.
        """
        if self.type == r.type:
            for key in ("text_from", "text_to"):
                self[key] += r[key]
            return
        raise Exception("Replace type mismatch")


class Replaces(list):
    """A list of Replace items in which neighbours of the same type are merged."""

    def add(self, r: Replace):
        """Add ``r`` to the end, folding it into the last item when their types match."""
        if not self or self[-1].type != r.type:
            # Nothing to merge with: store the item as a new entry.
            return super().append(r)
        self[-1].extend(r)

In [7]:
def tokenize(text):
    """Split text into word, number and punctuation tokens.

    A token is a run of Cyrillic letters, a run of digits, or a run of any
    other non-whitespace characters, in each case together with the whitespace
    that follows it. Joining the tokens therefore gives the text back, except
    for whitespace at the very start, which belongs to no token.

    "ё" and "Ё" are listed explicitly in the letter class: they lie outside
    the "а-я" / "А-Я" ranges, and without them a word such as "трёх" would be
    cut into three tokens ("тр", "ё", "х").

    Args:
        text: the string to split.

    Returns:
        A list of token strings in their original order.
    """
    return re.findall(r"[а-яА-ЯёЁ]+\s*|\d+\s*|[^а-яА-ЯёЁ\d\s]+\s*", text)


tokenize("ты, да я, да мы c тобой - вместе 2.")

['ты',
 ', ',
 'да ',
 'я',
 ', ',
 'да ',
 'мы ',
 'c ',
 'тобой ',
 '- ',
 'вместе ',
 '2',
 '.']

In [8]:
re_digits = re.compile(r"\d")


def diff(seq1, seq2):
    """Align two token sequences and describe them as merged fragments.

    The sequences are aligned with ``SequenceMatcher``; elements without any
    word character are passed to it as junk. Every opcode becomes a ``Replace``:

    * "E" - an equal stretch, a replacement that differs only in the whitespace
      around the tokens, or any change with no digit on either side (for the
      latter ``text_to`` is dropped and stored as "");
    * "R" - any other change, i.e. one with a digit on at least one side.

    Consecutive fragments of the same type are merged by ``Replaces.add``.

    Args:
        seq1: source sequence (a list of tokens or a string).
        seq2: sequence to compare against.

    Returns:
        A ``Replaces`` list covering the sequences from start to end.
    """
    def has_no_word_chars(token):
        return not re.search(r"\w", token.strip())

    matcher = SequenceMatcher(has_no_word_chars, a=seq1, b=seq2, autojunk=False)
    fragments = Replaces()
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        chunk_a, chunk_b = seq1[a_lo:a_hi], seq2[b_lo:b_hi]
        text_from, text_to = "".join(chunk_a), "".join(chunk_b)
        whitespace_only_change = (
            tag == "replace"
            and "".join(t.strip() for t in chunk_a) == "".join(t.strip() for t in chunk_b)
        )
        if tag == "equal" or whitespace_only_change:
            kind = "E"
        elif re.search(re_digits, text_from) or re.search(re_digits, text_to):
            kind = "R"
        else:
            # A change without digits is of no interest: keep the source text only.
            kind = "E"
            text_to = None
        fragments.add(Replace(kind, text_from, text_to))
    return fragments


diff("234678", "123536788")

[{'type': 'R', 'text_from': '', 'text_to': '1'},
 {'type': 'E', 'text_from': '23', 'text_to': '23'},
 {'type': 'R', 'text_from': '4', 'text_to': '53'},
 {'type': 'E', 'text_from': '678', 'text_to': '678'},
 {'type': 'R', 'text_from': '', 'text_to': '8'}]

In [9]:
# A single hand-picked pair to look at what diff() produces on real data.
elem = {
    "tn": "А пока я расскажу, что происходило с тобой с двадцать пятого мая.",
    "itn": "А пока я расскажу, что происходило с тобой с 25.05 .",
    "orig_index": "https://ficbook.net/readfic/17915/38547",
    "text_index": 14,
}
elem["replaces"] = diff(tokenize(elem["tn"]), tokenize(elem["itn"]))
elem

{'tn': 'А пока я расскажу, что происходило с тобой с двадцать пятого мая.',
 'itn': 'А пока я расскажу, что происходило с тобой с 25.05 .',
 'orig_index': 'https://ficbook.net/readfic/17915/38547',
 'text_index': 14,
 'replaces': [{'type': 'E',
   'text_from': 'А пока я расскажу, что происходило с тобой с ',
   'text_to': 'А пока я расскажу, что происходило с тобой с '},
  {'type': 'R', 'text_from': 'двадцать пятого мая.', 'text_to': '25.05 .'}]}

In [10]:
def build_example(replaces):
    """Turn the aligned fragments of one pair into a prompt/target example.

    The prompt starts with "<SC1>". "E" fragments contribute their
    ``text_from`` as is. Every other fragment contributes its ``text_to``
    (without trailing whitespace) in square brackets, followed by a numbered
    "<extra_id_N>" marker and as many spaces as there were trailing whitespace
    characters. The target lists, for each marker in order, the stripped
    ``text_from`` of that fragment.

    Args:
        replaces: iterable of fragments with ``type``, ``text_from`` and
            ``text_to`` attributes, as returned by ``diff``.

    Returns:
        A dict with the keys "prompt" and "target".
    """
    prompt_parts, target_parts = ["<SC1>"], []
    etid = 0
    for r in replaces:
        if r.type == "E":
            prompt_parts.append(r.text_from)
            continue
        bracketed = r.text_to.rstrip()
        # Trailing whitespace is moved after the marker so it stays outside the brackets.
        ws_number = len(r.text_to) - len(bracketed)
        prompt_parts.append(f"[{bracketed}]<extra_id_{etid}>{' ' * ws_number}")
        target_parts.append(f"<extra_id_{etid}> {r.text_from.strip()} ")
        etid += 1
    return {"prompt": "".join(prompt_parts), "target": "".join(target_parts)}


data_good = []
for elem in tqdm(dataset["train"]):
    replaces = diff(tokenize(elem["tn"]), tokenize(elem["itn"]))
    # Pairs without a single "R" fragment have nothing to learn from.
    if all(r.type == "E" for r in replaces):
        continue
    data_good.append(build_example(replaces))
# Number of kept examples and one random example to look at.
len(data_good), random.choice(data_good)

  0%|          | 0/236074 [00:00<?, ?it/s]

(176270,
 {'prompt': '<SC1>- Ладно, Гезилл, буду через [15]<extra_id_0> минут. -',
  'target': '<extra_id_0> пятнадцать '})

Here we made training examples of this kind

    <SC1>Временами я думаю, какое применение найти тем [14697]<extra_id_0> рублям, что лежат уже больше [33]<extra_id_1> лет?

and we want to predict a text like this

    <extra_id_0> четырнадцати тысячам шестистам девяноста семи <extra_id_1> тридцати трёх </s>

The spacing around punctuation is also still somewhat messy.

In [11]:
# Hold out 10% of the examples for evaluation. The seed is fixed so that the
# split is the same on every run; with a random split, a restarted run would
# be evaluated on examples that an earlier run had trained on.
SPLIT_SEED = 42
dataset = Dataset.from_list(data_good).train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'target'],
        num_rows: 158643
    })
    test: Dataset({
        features: ['prompt', 'target'],
        num_rows: 17627
    })
})

# He trayn

In [12]:
# Local checkpoint to fine-tune; the 1.7B variant is kept commented out as an alternative.
# MODEL_PATH = "/home/jovyan/wdc1/models/FRED-T5-1.7B"
MODEL_PATH = "/home/jovyan/wdc1/models/FRED-T5-large"

In [13]:
from transformers import GPT2Tokenizer, T5ForConditionalGeneration


# Load the tokenizer and the model from the same local checkpoint.
# "</s>" is set as the end-of-sequence token; it is the marker that closes the
# example target shown in the notes above.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH, eos_token='</s>')
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# !pip install datasets transformers[sentencepiece]

In [15]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer

# path = "./ruT5-base"
# model = T5ForConditionalGeneration.from_pretrained(path)
# tokenizer = T5Tokenizer.from_pretrained(path,)

In [16]:
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    The "prompt" texts are encoded as the model inputs and the "target" texts
    are passed as ``text_target``; truncation to 128 tokens is requested.

    Args:
        examples: a batch with the columns "prompt" and "target".

    Returns:
        The tokenizer output for the batch, unchanged.
    """
    return tokenizer(
        examples["prompt"],
        text_target=examples["target"],
        truncation=True,
        max_length=128,  # NB should affect memory consumption
    )


dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/158643 [00:00<?, ? examples/s]

Map:   0%|          | 0/17627 [00:00<?, ? examples/s]

In [17]:
# Drop the raw text columns now that the examples are tokenized.
dataset = dataset.remove_columns(["prompt", "target"])

Just in case, I get rid of examples that may have been truncated.

In [18]:
from collections import Counter

# Histogram of the tokenized prompt lengths in the training split.
c = Counter(len(row["input_ids"]) for row in dataset["train"])
# Number of examples below the 128-token cap, followed by the whole histogram.
sum(count for length, count in c.items() if length < 128), c

(145944,
 Counter({128: 12699,
          20: 3802,
          19: 3791,
          21: 3761,
          17: 3720,
          18: 3688,
          23: 3668,
          22: 3659,
          16: 3608,
          15: 3522,
          13: 3516,
          14: 3502,
          25: 3380,
          24: 3371,
          12: 3311,
          26: 3171,
          27: 3143,
          28: 3079,
          11: 3009,
          29: 2952,
          30: 2873,
          31: 2677,
          32: 2560,
          33: 2452,
          10: 2427,
          34: 2368,
          35: 2324,
          36: 2164,
          37: 1997,
          38: 1991,
          39: 1905,
          40: 1787,
          41: 1732,
          9: 1691,
          42: 1688,
          43: 1548,
          45: 1507,
          44: 1443,
          46: 1357,
          47: 1228,
          48: 1214,
          49: 1213,
          50: 1151,
          52: 1118,
          51: 1108,
          53: 1055,
          55: 960,
          54: 960,
          8: 949,
          57: 

In [19]:
# Keep only the examples whose prompt is longer than 10 and shorter than 126
# tokens; prompts at or next to the 128-token cap may have been truncated.
# `filter` is used instead of rebuilding each split as a Python list, so that
# every split stays a `datasets.Dataset` rather than a list of dicts held in memory.
dataset = dataset.filter(lambda example: 10 < len(example["input_ids"]) < 126)
# dataset
# dataset

In [22]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,  DataCollatorForSeq2Seq


# Training configuration: Adafactor with a constant learning rate of 1e-4,
# bf16, 5 epochs, evaluation every 1000 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/jovyan/work/models/1_fred-t5",
    optim="adafactor",
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_first_step=True,
    learning_rate=1e-4,
    lr_scheduler_type="constant",
    gradient_checkpointing=0,  # 0 is falsy, i.e. gradient checkpointing is off
    # 4 examples per device x 10 accumulation steps = 40 examples per optimizer step and device
    gradient_accumulation_steps=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=10,
    num_train_epochs=5,
    predict_with_generate=True,
    bf16=True,
    push_to_hub=False,
    # NOTE(review): presumably set because the splits were turned into plain
    # lists of dicts in the filtering step above - confirm it is still needed.
    remove_unused_columns=False
)
# The collator is given the tokenizer and the model and batches the
# variable-length examples for the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    # No metric function is passed, so only the losses are reported.
    # compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning with the trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Save the final weights and the tokenizer into the same directory.
# NOTE(review): safe_serialization=False presumably selects the classic PyTorch
# weight format instead of safetensors - confirm this is intended.
model.save_pretrained("/home/jovyan/work/models/1_fred-t5/final", safe_serialization=False)
tokenizer.save_pretrained("/home/jovyan/work/models/1_fred-t5/final")

# But most importantly he explayn

In [None]:
# `torch` is used below but is not imported in any of the earlier cells shown,
# so without this import the cell fails with a NameError on a fresh kernel.
import torch

# Alternative prompts to try; uncomment one of them to replace the active one.
# lm_text = '<SC1>я купил [iphone 12X]<extra_id_0> за [142 990 руб]<extra_id_1> без [3-x]<extra_id_2> часов полдень и т.д.'
# lm_text = '<SC1>я купил айфон за [14 970]<extra_id_0> рублей'
# lm_text = "<SC1>Временами я думаю, какое применение найти тем [14 697]<extra_id_0> рублям, что лежат уже больше [33]<extra_id_1> лет?"
lm_text = "<SC1>Было у отца [3]<extra_id_0> сына, но не было даже [2-3]<extra_id_1> пиджаков с блёстками за [142 990 руб]<extra_id_2>."
# lm_text = "<SC1>В школе у меня одни [5]<extra_id_0>."
# lm_text = '<SC1>Было у отца [3]<extra_id_0> сына. Старшему было [35]<extra_id_1>, среднему - не меньше [33]<extra_id_2>, а младший на [4]<extra_id_4> младше всех. Бывает.'

# Switch dropout off for inference: after training the model may still be in train mode.
model.eval()
# Put the input on the device the model lives on instead of hard-coding "cuda:0".
input_ids = torch.tensor([tokenizer.encode(lm_text)]).to(model.device)
outputs = model.generate(input_ids,eos_token_id=tokenizer.eos_token_id,early_stopping=True)
# The first generated id is skipped when decoding (presumably the decoder start token).
print(tokenizer.decode(outputs[0][1:]))

In [None]:
# Show the current GPU memory usage and utilisation.
!nvidia-smi