# Pretrain Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [45]:
import sys, os, datetime, copy
import json
import torch
import random
import glob
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, DatasetDict
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, Split, ByteLevel
from tokenizers.processors import TemplateProcessing
import tokenizers.pre_tokenizers
import tokenizers.processors
import tokenizers.decoders

In [2]:
import cdli
import languages

In [3]:
os.environ["WANDB_NOTEBOOK_NAME"] = "PretrainTranslator.ipynb"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk", "sux"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-base"

max_vocab_size = 50_000
model_max_length = 512
batch_size = 8 if os.path.basename(base_model_id).startswith("t5-base") else 128

num_train_sequences = 524_288 * 128
num_warmup_sequences = 10_000 * 128

warmup_learning_rate = 0.01

use_paragraphs = True
use_lines = True


In [4]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if use_paragraphs:
    flags += "-p"
if use_lines:
    flags += "-l"
model_id = f"{os.path.basename(base_model_id)}-pre{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-base-pre-p-l-akksux-en-20220726-154724'

In [5]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x7f02435ee3e0>)

In [6]:
!nvidia-smi

Tue Jul 26 15:47:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   49C    P8    30W / 350W |    168MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Load Training Data

In [7]:
avg_src_chars_per_token = 2.6712177445735397

In [8]:
def get_prefix(src_lang, tgt_lang):
    s = languages.all_languages[src_lang]
    t = languages.all_languages[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [9]:
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [10]:
len(publications), "publications"

(134712, 'publications')

In [11]:
def target_ok(target_text):
    if len(target_text) == 0:
        return False
    if len(set(target_text.replace(" ", ""))) < 2:
        return False
    return True
    

def test_target_ok(text):
    ok = target_ok(text)
    print(ok, repr(text))
    
test_target_ok("")
test_target_ok(" ")
test_target_ok("xx xxx x")
test_target_ok(".. . .. ")
test_target_ok("Hi")

False ''
False ' '
False 'xx xxx x'
False '.. . .. '
True 'Hi'


In [12]:
wmax_num_tokens = model_max_length - 192

def wrap_paragraph(paragraph, lines, src_lang, tgt_lang):
    ptag, pline_start_index, pline_end_index = paragraph
    wline_ranges = []
    wline_tok_len = 0.0
    
    def start_new_line(pline_index):
#         print("start", pline_index)
        wline_ranges.append((pline_index, pline_index + 1))
        
    def append_line(pline_index):
#         print("append", pline_index)
        r = wline_ranges[-1]
        if r[1] == pline_index:
            wline_ranges[-1] = (r[0], r[1] + 1)
        else:
            print(f"Missing line: got {pline_index}, expected {r[1]}: {wline_ranges}")

    for pline_index in range(pline_start_index, pline_end_index):
        pline_num_toks = len(lines[pline_index].text) / avg_src_chars_per_token + 1.0
        if len(wline_ranges) == 0 or (wline_tok_len + pline_num_toks > wmax_num_tokens):
            start_new_line(pline_index)
            wline_tok_len = 0.0
        else:
            append_line(pline_index)
        wline_tok_len += pline_num_toks
    return wline_ranges



In [13]:
dataset_index = json.load(open("../data/dataset_index.json", "rt"))
print(dataset_index.keys())

dict_keys(['akk', 'sux'])


In [14]:
print(len(dataset_index["akk"]["train"]), "akk train")
print(len(dataset_index["akk"]["test"]), "akk test")
print(len(dataset_index["sux"]["train"]), "sux train")
print(len(dataset_index["sux"]["test"]), "sux test")

870 akk train
108 akk test
3753 sux train
396 sux test


In [15]:
def get_pubs_sources():

    added_sources = set()

    def add_line_ranges(s, area, b, e):
        ls = " ".join([x.text for x in area.lines[b:e]])
        ls = (s, " ".join(ls.split(" ")))
        added_sources.add(ls)
        for t in target_langs:
            lt = " ".join([(x.languages[t] if t in x.languages else "") for x in area.lines[b:e]])
            lt = (t, " ".join(lt.split(" ")))
            added_sources.add(lt)

    for s in source_langs:
        for t in target_langs:
            print("Preparing", s, "to", t)            
            st_prefix = get_prefix(s, t)
            ts_prefix = get_prefix(t, s)
            for pub in tqdm([p for p in publications if p.language==s]):
                for area in pub.text_areas:
                    if not any(x for x in area.lines if t in x.languages):
                        continue
                    if use_paragraphs:
                        paragraphs = area.lines_to_paragraphs(s)
                        line_ranges = []                
                        for p in paragraphs:                    
                            wlines = wrap_paragraph(p, area.lines, s, t)
                            line_ranges.extend(wlines)
        #                 print("="*50, len(area.lines))
                        for b, e in line_ranges:
                            add_line_ranges(s, area, b, e)
                    if use_lines:
                        for i, _ in enumerate(area.lines):
                            add_line_ranges(s, area, i, i + 1)
#     random.shuffle(new_sourceandtargets)
#     return Dataset.from_dict({"source": [x[0] for x in new_sourceandtargets], "target": [x[1] for x in new_sourceandtargets]})
    return added_sources

all_sources = get_pubs_sources()
all_sources = list(all_sources)
len(all_sources)


Preparing sux to en


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99456/99456 [00:00<00:00, 253366.20it/s]


Preparing akk to en


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21820/21820 [00:00<00:00, 198020.47it/s]


146812

In [16]:
all_sources[1]

('en', '1 (workman,) porter: Lu-girizal,')

In [17]:
all_sources[:5]

[('en', 'You will soak the delicate part (of the cloth) in beer,'),
 ('en', '1 (workman,) porter: Lu-girizal,'),
 ('en', 'account of labor of worktroops;'),
 ('en', 'and Enlil'),
 ('akk', 'szum-ma _dumu-mesz_')]

## Train the Tokenizer

In [18]:
tokenizer_txt_path = os.path.abspath("tokenizer_training_data.txt")
with open(tokenizer_txt_path, "wb") as f:
    for lang, line in tqdm(all_sources):
        f.write(bytes(line, "utf8"))
        f.write(b'\n')
tokenizer_txt_path

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 146812/146812 [00:00<00:00, 1611653.59it/s]


'/home/fak/Projects/CuneiformTranslators/tools/tokenizer_training_data.txt'

In [19]:
!tail tokenizer_training_data.txt

Ur-Nanše, king of Lagaš, son of GuniDU, “son” of Gursar, had the temple of Nanše built. The shrine Girsu he built. The Great Oval he built. The E-PA he built. The temple of Gatumdu he built. The Edam he built. The temple of Ninmar he built. The Abzu of the Levee he built.
Ur-Nanše, king of Lagaš, son of GuniDU, “son” of Gursar, had the temple of Nanše built. (A statue of) Nanše he created. The A-Sanga (canal) he dug, (and) for Nanše into the Sanga he made water enter. (A statue of) Eš-ir he created. Ur-nimin, as the spouse of Nanše he chose by kid-omen. A-edin he built, Ningar he built, E-PA he built, the wall of Lagaš he built. (A statue of) Lugal-iri he created. He had boats of Dilmun from the mountains produce loads of timber.
ki-tusz ne-ha tusz-u3-da
mu# kara2-har#{ki#} ba#-hul#
If (there is) a mole is on . . .
that of Sîn-muballiț
{gesz}tukul-ga2 mu-bi sig-sze3 mu-un-gal2
4 barig of grain per (gin2) I sold.
ur-bar-ra-gin7
At that time there should be seven for him, there 

In [20]:
special_tokens=["<pad>", "</s>", "<unk>", "[...]"]
additional_special_tokens = [f"<extra_id_{i}>" for i in range(100)]
all_special_tokens = special_tokens + additional_special_tokens
all_special_tokens[:10]

['<pad>',
 '</s>',
 '<unk>',
 '[...]',
 '<extra_id_0>',
 '<extra_id_1>',
 '<extra_id_2>',
 '<extra_id_3>',
 '<extra_id_4>',
 '<extra_id_5>']

In [21]:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
trainer = BpeTrainer(vocab_size=max_vocab_size, special_tokens=all_special_tokens)

# print(tokenizer.pre_tokenizer)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False) # Split("\n", "removed")
tokenizer.post_processor = TemplateProcessing(
    single="$0 </s>",
    pair="$A </s> $B:1 </s>:1",
    special_tokens=[(x, i) for i, x in enumerate(all_special_tokens)],
)
# tokenizer.post_processor = tokenizers.processors. tokenizers.processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()
tokenizer.model_max_length=model_max_length
files = [tokenizer_txt_path]
tokenizer.train(files, trainer)








In [22]:
tokenizer.get_vocab_size()

38543

In [23]:
test_txt = all_sources[6][1]
test_tokens = tokenizer.encode(test_txt).ids
print(test_txt)
print(test_tokens)
print(tokenizer.decode(ids=test_tokens))

its work: 26 2/3 (volume-)sar;
[759, 801, 129, 4301, 390, 118, 122, 314, 3464, 2511, 686, 130, 1]
its work: 26 2/3 (volume-)sar;


In [24]:
# tokenizer.decode(tokenizer.encode("Hello, my name is Frank").ids)

In [25]:
ptokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, model_max_len=model_max_length)
ptokenizer.model_max_length = model_max_length
ptokenizer.pad_token = "<pad>"
ptokenizer.pad_token_id = tokenizer.encode("<pad>").ids[0]
ptokenizer.eos_token = "</s>"
ptokenizer.eos_token_id = tokenizer.encode("</s>").ids[0]
ptokenizer.unk_token = "<unk>"
ptokenizer.unk_token_id = tokenizer.encode("<unk>").ids[0]

In [26]:
ptokenizer.decode(ptokenizer.encode("Hello, my name is Frank"))

'Hello, my name is Frank</s>'

In [27]:
ptokenizer.model_max_length

512

In [28]:
tokenizer = ptokenizer

## Build the Train Dataset

In [58]:
all_sources_dataset = Dataset.from_dict({"source": [x[1] for x in all_sources[:1000]]})

In [59]:
all_sources_dataset[11]

{'source': 'ka da-mi-ma-ma-sze3 ma2 gid2-da'}

In [60]:
dataset = all_sources_dataset.train_test_split(test_size=0.1)

In [61]:
original_tests = dataset["test"]
original_tests

Dataset({
    features: ['source'],
    num_rows: 100
})

## Tokenize the Data

In [62]:
tokenizer.model_max_length

512

In [63]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [66]:
def corrupt_sources(sources):
    
    nsources = len(sources)
    print("-"*10, nsources)
    targets = []
    for i in range(nsources):
        print(len(sources[i]))
    return sources

In [67]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0

def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
#     print(examples)
    inputs = [example for example in examples["source"]]
    
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
#         labels = tokenizer(targets, max_length=model_max_length, truncation=True)
        labels = corrupt_sources(model_inputs["input_ids"])

    model_inputs["labels"] = labels
    
    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

---------- 900
190
9
4
17
19
6
11
6
15
2
10
7
25
10
393
6
20
15
11
79
5
12
13
11
14
12
21
13
9
4
9
5
19
18
4
9
17
28
15
13
42
11
41
21
5
14
33
18
15
12
10
13
16
8
3
9
6
8
5
6
13
18
33
17
13
8
29
10
31
2
37
14
14
17
6
8
8
45
10
5
8
35
6
104
7
19
7
5
458
12
2
22
16
9
13
20
7
10
401
18
39
10
5
21
7
14
15
26
7
12
12
5
50
6
5
3
208
8
5
6
12
5
12
12
16
11
13
12
15
24
305
3
6
11
7
6
33
11
7
22
23
8
10
30
5
5
27
21
6
165
34
18
19
5
8
7
6
6
7
15
11
17
5
3
3
19
363
49
13
6
84
26
17
22
15
15
7
76
3
10
6
11
21
88
11
2
30
9
10
27
10
12
7
23
4
5
42
8
6
25
12
28
9
6
8
7
19
11
13
9
13
13
18
4
25
33
7
6
20
4
18
35
8
14
14
4
9
11
13
61
10
3
16
8
8
12
9
31
10
7
5
26
20
11
28
5
8
9
9
10
18
8
21
26
9
11
185
23
9
3
15
241
17
18
4
4
16
25
18
7
3
9
10
111
9
22
18
125
19
17
6
35
17
19
4
10
8
5
11
40
14
10
22
8
7
23
18
6
7
33
13
8
15
7
50
4
6
18
7
9
16
9
14
68
3
9
9
19
5
14
10
13
22
14
13
9
10
10
4
8
6
4
15
6
35
5
11
13
11
12
9
8
4
16
7
4
20
18
17
5
5
29
10
17
21
16
8
10
14
16
32
13
17
9
4
11
6
8
11
9
2
20
7
6


  0%|          | 0/1 [00:00<?, ?ba/s]

---------- 100
9
8
4
8
13
4
9
12
13
30
7
31
5
30
13
5
10
10
35
5
5
12
30
16
23
7
26
17
15
12
5
14
14
5
14
8
13
6
8
6
6
11
16
21
10
6
7
13
21
12
23
170
12
9
10
6
413
5
11
11
23
9
13
24
6
5
128
29
13
12
10
11
8
17
22
4
13
13
12
4
4
16
29
15
65
13
6
7
133
6
41
29
4
17
4
27
13
6
51
15


DatasetDict({
    train: Dataset({
        features: ['source', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['source', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [68]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)

avg_src_chars_per_token = 2.70826004394177


In [69]:
tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(["source"])
tokenized_dataset["test"] = tokenized_dataset["test"].remove_columns(["source"])
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [70]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_dataset["train"]])
source_max_length

458

In [71]:
tokenized_dataset["train"][0]["labels"]

[165,
 703,
 165,
 298,
 169,
 194,
 573,
 116,
 277,
 116,
 361,
 116,
 282,
 316,
 354,
 121,
 165,
 272,
 301,
 297,
 301,
 507,
 106,
 3534,
 170,
 121,
 2672,
 316,
 703,
 165,
 1780,
 116,
 531,
 121,
 405,
 169,
 194,
 270,
 116,
 321,
 116,
 285,
 316,
 174,
 122,
 116,
 320,
 127,
 589,
 121,
 418,
 121,
 116,
 340,
 123,
 116,
 166,
 165,
 316,
 703,
 165,
 298,
 169,
 194,
 1440,
 116,
 335,
 116,
 309,
 116,
 387,
 121,
 116,
 308,
 115,
 316,
 174,
 122,
 116,
 320,
 127,
 165,
 316,
 703,
 165,
 272,
 116,
 309,
 116,
 403,
 116,
 452,
 116,
 646,
 316,
 174,
 122,
 116,
 320,
 127,
 165,
 316,
 994,
 122,
 165,
 298,
 169,
 194,
 1151,
 116,
 335,
 116,
 384,
 116,
 1068,
 316,
 346,
 116,
 174,
 165,
 316,
 628,
 537,
 116,
 170,
 122,
 293,
 123,
 328,
 111,
 186,
 112,
 637,
 111,
 322,
 438,
 542,
 165,
 316,
 335,
 567,
 116,
 524,
 116,
 295,
 116,
 315,
 116,
 270,
 460,
 116,
 170,
 434,
 982,
 121,
 429,
 116,
 274,
 369,
 116,
 552,
 121,
 298,
 169,
 194,
 643

In [72]:
sdfsdfsdf

NameError: name 'sdfsdfsdf' is not defined

## Load the Model

In [42]:
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, 
                                                   max_length=model_max_length)

In [52]:
model_config = copy.deepcopy(base_model.config)
model_config.vocab_size = tokenizer.vocab_size
model_config.max_length, model_config.vocab_size

(512, 38543)

In [53]:
model = AutoModelForSeq2SeqLM.from_config(model_config)

## Train

In [54]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# data_collator

In [57]:
num_train_sequences = len(tokenized_dataset["train"])
num_train_sequences

132130

In [55]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="steps",
    eval_steps=int(0.5 * num_train_sequences/batch_size),
    warmup_steps=warm,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



Using amp half precision backend


In [56]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 132130
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 495510
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mpraeclarum[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.1555,1.867853
2,1.7267,1.485119


Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-500
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-500/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-1000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-1000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-1000/tokenizer_config.json
Special tokens file

Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-6500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-7000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-7000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-7000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-5500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726

Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-11000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13000/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-11500] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-13500
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-135

Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-18500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-18500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-18500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-17000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-19000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-19000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-19000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-15472

Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-24500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-24500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-23000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-25000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-25000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-25000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-15472

Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-30500/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-30500/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-30500/special_tokens_map.json
Deleting older checkpoint [../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-29000] due to args.save_total_limit
Saving model checkpoint to ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-31000
Configuration saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-31000/config.json
Model weights saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-31000/pytorch_model.bin
tokenizer config file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-154724/checkpoint-31000/tokenizer_config.json
Special tokens file saved in ../results/t5-base-pre-p-l-akksux-en-20220726-15472

KeyboardInterrupt: 

## Sample

In [None]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [None]:
pipeline

In [None]:
pipeline("translate English to French: hello my name is Frank")

In [None]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

In [None]:
def translate(text):
    return pipeline(text)

translate(source_test)

In [None]:
tests = original_tests
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

## Save to Huggingface

In [None]:
model_path = os.path.abspath(f"/home/fak/nn/Data/generated/cuneiform/{model_id}-pretrained")
trainer.save_model(model_path)
model_path

In [None]:
tokenizer.save_pretrained(model_path)