Inspired by https://github.com/lingjzhu/CharsiuG2P. 

* Baseline (just a small byt5): 0.387 reduction (relative decrease in total edit distance, i.e. the metric computed at the end of this notebook)

In [1]:
import random

from transformers import T5ForConditionalGeneration, AutoTokenizer

In [2]:
# Start from the pretrained multilingual CharsiuG2P ByT5 checkpoint (tiny, 8 layers).
model = T5ForConditionalGeneration.from_pretrained('charsiu/g2p_multilingual_byT5_tiny_8_layers_100')
# The tokenizer is loaded separately from the stock ByT5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

In [3]:
# English sample words; each one gets the language-tag prefix used by the G2P checkpoint
words = 'Char siu is a Cantonese style of barbecued pork'.split()
words = [f'<eng-us>: {word}' for word in words]

out = tokenizer(words, padding=True, add_special_tokens=False, return_tensors='pt')

# We do not find beam search helpful. Greedy decoding is enough.
preds = model.generate(**out, num_beams=1)
phones = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
print(phones)



['ˈtʃɑɹ', 'ˈsiu', 'ˈɪs', 'ˈɑ', 'ˈkæntoʊˌniz', 'ˈstaɪɫ', 'ˈɔf', 'ˈbɑɹbɪkjud', 'ˈpɔɹk']


In [4]:
%%time


# Russian sample sentence (it contains a bare digit), split on whitespace and
# prefixed word by word with the language tag
words = [f'<rus>: {word}' for word in 'Господа, я не ел шесть дней целых 6 дней'.split()]

out = tokenizer(words, padding=True, add_special_tokens=False, return_tensors='pt')

# We do not find beam search helpful. Greedy decoding is enough.
preds = model.generate(**out, num_beams=1)
phones = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
print(phones)

['ɡəspɐdak', 'ja', 'nʲe', 'jeɫ', 'ʂɛsʲtʲ', 'dʲnʲej', 't͡sɛɫɨx', 'tsʰɪŋ˧˥', 'dʲnʲej']
Wall time: 237 ms


In [5]:
# Inspect how the tokenizer splits a Cyrillic word: the output below shows two
# byte-level tokens per letter, followed by the end-of-sequence token '</s>'.
tokenizer.convert_ids_to_tokens(tokenizer('привет').input_ids)

['Ð', '¿', 'Ñ', '\x80', 'Ð', '¸', 'Ð', '²', 'Ð', 'µ', 'Ñ', '\x82', '</s>']

# Loading the training data

## The original sentences

In [6]:
import pandas as pd

In [7]:
# Spell-checker sentence pairs (tab-separated). Per the displayed table, the
# `split` column assigns every row to train / dev / test.
df_orig = pd.read_csv('../data/spellchecker_dataset_split.tsv', sep='\t')
df_orig

Unnamed: 0,trash,clean,trash2,clean2,distance,normalized_distance,split,edit_max_cldiff,edit_max_lendiff
0,"Шунда ук әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда уҡ әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда ук әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда уҡ әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...",1,0.015385,train,1,0
1,Унан беҙ өсөбөҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ әсәбеҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ өсөбөҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ әсәбеҙ ҙә ултырғыстарға ултырабыҙ.,3,0.069767,test,1,0
2,"«Иҫән-Һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-Һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-һау ғына тороғоҙ инде», - тип бышылдай у...",1,0.014085,dev,1,0
3,"Минең генә бер кешем дә юҡ, тип шунда уҡ танау...","Минең генә бер кешем дә юҡ, - тип шунда уҡ тан...","Минең генә бер кешем дә юҡ, тип шунда уҡ танау...","Минең генә бер кешем дә юҡ, - тип шунда уҡ тан...",2,0.029412,train,0,0
4,"Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...",1,0.012500,train,1,0
...,...,...,...,...,...,...,...,...,...
23886,"Эҫтәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эстәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эҫтәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эстәрендә бүре үк оломаһа ла, эттәр шыңшый баш...",1,0.020000,dev,1,0
23887,Үткән йәйҙә яман томра көндө Кәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Ҡәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Кәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Ҡәҙерғол төбәгенд...,1,0.009524,train,1,0
23888,"Кайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Ҡайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Кайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Ҡайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...",1,0.020000,train,1,0
23889,Кыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Ҡыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Кыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Ҡыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,1,0.009174,train,1,0


In [8]:
# Keep the training split only
df_orig_train = df_orig.loc[df_orig['split'] == 'train']
print(df_orig_train.shape)

# Keep pairs whose edit_max_cldiff is at most 3
df_orig_train = df_orig_train.loc[df_orig_train['edit_max_cldiff'].le(3)]
print(df_orig_train.shape)
# Keep pairs whose edit_max_lendiff is at most 1; copy to detach from df_orig
df_orig_train = df_orig_train.loc[df_orig_train['edit_max_lendiff'].le(1)].copy()
print(df_orig_train.shape)

(14382, 9)
(14171, 9)
(14085, 9)


In [9]:
# Dev split, restricted with the same two edit-size limits as the training split
df_orig_dev = df_orig.loc[
    df_orig['split'].eq('dev')
    & df_orig['edit_max_cldiff'].le(3)
    & df_orig['edit_max_lendiff'].le(1)
]
print(df_orig_dev.shape)

(4611, 9)


In [10]:
# Token counts per training sentence: inputs (trash2) and targets (clean2)
old_lens = pd.Series(list(map(len, tokenizer(df_orig_train['trash2'].tolist())['input_ids'])))
new_lens = pd.Series(list(map(len, tokenizer(df_orig_train['clean2'].tolist())['input_ids'])))

## Artificial replacements (todo)

Clone https://github.com/nevmenandr/bashkir-corpus/ into a directory next to this repository.

In [12]:
import os
from tqdm.auto import tqdm, trange

In [13]:
# Read the clean Bashkir sentences, one per line, stripping surrounding whitespace.
# The encoding is passed explicitly: without it `open` falls back to the locale's
# preferred encoding, which is not UTF-8 on every platform (e.g. Windows).
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('../data/clean_bk_sents.txt', 'r', encoding='utf-8') as f:
    clean_sents = [line.strip() for line in f]

In [14]:
# Number of clean sentences loaded from the corpus file
print(len(clean_sents))

1605495


# Training

In [15]:
from tqdm.auto import tqdm, trange

In [16]:
# Independent copy of the filtered training pairs with a fresh 0..n-1 index;
# reset_index() keeps the previous index as the `index` column.
df_train = df_orig_train.copy().reset_index()

In [17]:
# Check the column layout after reset_index()
df_train.columns

Index(['index', 'trash', 'clean', 'trash2', 'clean2', 'distance',
       'normalized_distance', 'split', 'edit_max_cldiff', 'edit_max_lendiff'],
      dtype='object')

In [18]:
# Move the model to the GPU (requires CUDA); the trailing `;` hides the cell's output
model.cuda();

In [19]:
import numpy as np
import torch 
import gc

def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    # Collect first so that tensors freed by the GC can be returned by empty_cache()
    gc.collect()
    torch.cuda.empty_cache()

In [20]:
# Adam over all model parameters.
# NOTE: this optimizer is replaced by one with lr=1e-5 in a later cell, before the training loop.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [21]:
# Batch size and loss-reporting interval.
# NOTE: both are re-assigned in the hyperparameter cell that precedes the training loop.
bs = 1
report_steps = 200 

In [22]:
import textdistance

In [23]:
def fix(text, num_beams=1, max_length='auto', min_length='auto', **kwargs):
    """Run the seq2seq model on ``text`` and return the decoded output string.

    Args:
        text: input passed straight to the global ``tokenizer``.
        num_beams: beam width forwarded to ``model.generate``.
        max_length: upper bound on the generated length, or ``'auto'`` to derive
            it from the tokenized input length (``int(n * 1.02 + 4)``).
        min_length: lower bound on the generated length, or ``'auto'`` to derive
            it from the tokenized input length (``max(1, int(n * 0.98 - 4))``).
        **kwargs: any further keyword arguments for ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer(text, padding=True, return_tensors='pt').to(model.device)
    input_len = encoded.input_ids.shape[1]

    # 'auto' bounds tie the output length to the input length, which suits a
    # task where the output is expected to be about as long as the input.
    max_length = int(input_len * 1.02 + 4) if max_length == 'auto' else max_length
    min_length = max(1, int(input_len * 0.98 - 4)) if min_length == 'auto' else min_length

    generated = model.generate(
        **encoded,
        num_beams=num_beams,
        max_length=max_length,
        min_length=min_length,
        **kwargs,
    )
    # Only the first sequence of the batch is returned
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [25]:
# Fixed (seeded) 100-row sample of the dev split for quick evaluation;
# copied so that result columns can be added to it later.
dev_small = df_orig_dev.sample(100, random_state=1).copy()

# Now try adding synthetic noise to the clean training data

In [47]:
import noisers
from importlib import reload
# Re-import the local `noisers` module so edits made after the first import take effect
reload(noisers)

<module 'noisers' from 'C:\\Users\\david\\YandexDisk\\code\\NLP\\bashkort-spellchecker\\experiments\\noisers.py'>

In [48]:
from noisers import Noiser, add_simple_noise

In [30]:
# Load a Noiser from its JSON file.
# NOTE(review): presumably a previously fitted noise model; noisers.py is not shown here — confirm.
noiser = Noiser.load('noise_model_v1.json')

In [50]:
from collections import Counter

# Character frequencies over the whole clean corpus
chars_cnt = Counter()
for sent in tqdm(clean_sents):
    chars_cnt.update(sent)
# Distinct characters, in first-seen order
all_chars = list(chars_cnt)
len(chars_cnt)

  0%|          | 0/1605495 [00:00<?, ?it/s]

354

In [31]:
# Pick one random clean sentence (stdlib `random`) to preview the noiser on
text = random.choice(clean_sents)

In [32]:
# Show the original sentence, then the same sentence after noiser.add_noise with edit_rate=0.05
print(text)
print(noiser.add_noise(text, edit_rate=0.05))

Әҙип һүҙе милләттәрҙең йөрәген аса!
Әҙип һүҙе милләттәрҙвң йөрәген аса!


In [33]:
# Fresh Adam optimizer (any earlier optimizer state is dropped) with a lower learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Free Python garbage and cached GPU memory before the long training run
cleanup()

In [52]:
# Hyperparameters read by the training loop below
share_real = 0.1  # probability that an iteration draws a real (trash2, clean2) pair from df_train
share_noiser = 0.5  # probability that an iteration corrupts a clean sentence with `noiser`; the remainder uses add_simple_noise
p_keep = 0.5  # in synthetic iterations: probability of feeding the clean sentence unchanged
grad_steps = 8  # number of iterations between optimizer steps (gradient accumulation)
report_steps = 1000  # print the mean of the most recent losses every this many iterations
bs = 1  # sentences per iteration

In [35]:
# Per-iteration training losses; the training loop appends to this list
losses = []

In [54]:
# Fine-tune the model as a denoiser on a mix of real and synthetic noisy sentences.
# Requires the stdlib `random` module plus the globals defined in earlier cells
# (model, tokenizer, optimizer, df_train, clean_sents, noiser, all_chars, losses
# and the hyperparameters).
model.train()

for i in trange(1_000_000):
    # Choose where this training pair comes from: real corrections, the loaded
    # noiser, or simple character noise (the remaining probability mass).
    r = random.random()
    if r < share_real:
        batch = df_train.sample(bs)
        xx, yy = batch.trash2.tolist(), batch.clean2.tolist()
    elif r < share_real + share_noiser:
        yy = random.sample(clean_sents, bs)
        xx = [noiser.add_noise(text, edit_rate=0.05) if random.random() > p_keep else text for text in yy]
    else:
        yy = random.sample(clean_sents, bs)
        xx = [add_simple_noise(text, all_chars, edit_rate=0.05) if random.random() > p_keep else text for text in yy]

    try:
        x = tokenizer(xx, padding=True, return_tensors='pt').to(model.device)
        y = tokenizer(yy, padding=True, return_tensors='pt').to(model.device)

        # Padding positions in the labels are set to -100 so the loss ignores them;
        # the pad id is taken from the tokenizer instead of being hard-coded.
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100
        loss = model(
            input_ids=x.input_ids,
            attention_mask=x.attention_mask,
            labels=y.input_ids,
            decoder_attention_mask=y.attention_mask,
            return_dict=True
        ).loss
        loss.backward()
        # Step only once a full window of `grad_steps` backward passes has been
        # accumulated. `i % grad_steps == 0` would also step at i == 0, after a
        # single backward pass.
        if (i + 1) % grad_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        losses.append(loss.item())
    except RuntimeError as e:
        # Best-effort recovery (in practice CUDA out-of-memory on long sentences):
        # drop the failed graph and any partially accumulated gradients, free
        # cached memory, and carry on with the next sample.
        loss = None
        optimizer.zero_grad()
        cleanup()
        print('error', i, e)

    if i % report_steps == 0:
        print('step', i, 'loss', np.mean(losses[-report_steps:]))

  0%|          | 0/1000000 [00:00<?, ?it/s]

step 0 loss 1.5799298595190048
step 1000 loss 1.5785965538024902
step 2000 loss 1.5743405665159225
error 2143 CUDA out of memory. Tried to allocate 212.00 MiB (GPU 0; 4.00 GiB total capacity; 2.26 GiB already allocated; 168.20 MiB free; 2.48 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
error 2639 CUDA out of memory. Tried to allocate 2.40 GiB (GPU 0; 4.00 GiB total capacity; 210.20 MiB already allocated; 2.26 GiB free; 396.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 3000 loss 1.566542668223381
step 4000 loss 1.5563196305036544
step 5000 loss 1.545657940864563
step 6000 loss 1.5398989090919495
step 7000 loss 1.539822576880455
step 8000 loss 1.5328044068217277
step 9000 loss 1

step 84000 loss 0.5776040833257139
step 85000 loss 0.5747117669843137
step 86000 loss 0.556412426430732
step 87000 loss 0.560354130776599
step 88000 loss 0.5596464613322168
step 89000 loss 0.5371074318196625
error 89133 CUDA out of memory. Tried to allocate 700.00 MiB (GPU 0; 4.00 GiB total capacity; 2.24 GiB already allocated; 314.20 MiB free; 2.34 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 90000 loss 0.5644116847887635
step 91000 loss 0.518633094843477
error 91710 CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 4.00 GiB total capacity; 2.34 GiB already allocated; 132.20 MiB free; 2.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 92000 loss 0.5183962627705186
er

step 150000 loss 0.3205536270784214
step 151000 loss 0.34117992675397546
step 152000 loss 0.3316469591166824
step 153000 loss 0.32158420656900855
step 154000 loss 0.32227523491624743
step 155000 loss 0.3222550076730549
step 156000 loss 0.33178687118180095
error 156383 CUDA out of memory. Tried to allocate 460.00 MiB (GPU 0; 4.00 GiB total capacity; 1.76 GiB already allocated; 390.20 MiB free; 2.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 157000 loss 0.30821347533911464
step 158000 loss 0.31469858943996953
step 159000 loss 0.3110806119404733
step 160000 loss 0.3067249200092629
error 160447 CUDA out of memory. Tried to allocate 420.00 MiB (GPU 0; 4.00 GiB total capacity; 2.22 GiB already allocated; 392.20 MiB free; 2.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to 

step 229000 loss 0.22254937118571252
step 230000 loss 0.23649040507571772
step 231000 loss 0.21750179457152263
step 232000 loss 0.2253675498859957
step 233000 loss 0.23385746720153838
step 234000 loss 0.2282316687256098
error 234376 CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 4.00 GiB total capacity; 2.44 GiB already allocated; 60.20 MiB free; 2.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 235000 loss 0.21989145450945943
step 236000 loss 0.21367947154305875
step 237000 loss 0.22754633634164928
step 238000 loss 0.20981563916243612
step 239000 loss 0.21510113241011278
step 240000 loss 0.21446957120439036
step 241000 loss 0.2409097735118121
error 241212 CUDA out of memory. Tried to allocate 46.00 MiB (GPU 0; 4.00 GiB total capacity; 2.57 GiB already allocated; 8.20 MiB free; 2.64 GiB reserved in total by PyTorch) If r

step 307000 loss 0.1796159782060422
error 307636 CUDA out of memory. Tried to allocate 100.00 MiB (GPU 0; 4.00 GiB total capacity; 2.54 GiB already allocated; 40.20 MiB free; 2.61 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 308000 loss 0.18122519653500058
step 309000 loss 0.18926862568082287
step 310000 loss 0.18545510611962526
step 311000 loss 0.17857523324107752
error 311047 CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 4.00 GiB total capacity; 2.55 GiB already allocated; 24.20 MiB free; 2.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 312000 loss 0.18320936574600638
step 313000 loss 0.182819615974091
step 314000 loss 0.1869017479699105
step 315000 loss 0.1828

step 377000 loss 0.15958125275606289
step 378000 loss 0.15759516025753692
step 379000 loss 0.1612064256677404
step 380000 loss 0.14950356920063496
step 381000 loss 0.15988918294897303
step 382000 loss 0.16023260991787538
error 382939 CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 4.00 GiB total capacity; 2.51 GiB already allocated; 76.20 MiB free; 2.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 383000 loss 0.16300548178981988
step 384000 loss 0.15121915566315874
error 384372 CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 4.00 GiB total capacity; 2.57 GiB already allocated; 28.20 MiB free; 2.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 385000 loss 0.1

error 450394 CUDA out of memory. Tried to allocate 126.00 MiB (GPU 0; 4.00 GiB total capacity; 2.55 GiB already allocated; 20.20 MiB free; 2.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 451000 loss 0.13689366140938364
step 452000 loss 0.13486919765779748
error 452263 CUDA out of memory. Tried to allocate 320.00 MiB (GPU 0; 4.00 GiB total capacity; 2.16 GiB already allocated; 170.20 MiB free; 2.48 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 453000 loss 0.1330342805115506
step 454000 loss 0.1405926973107271
step 455000 loss 0.1304072535405867
step 456000 loss 0.14009099586727097
step 457000 loss 0.13604291272582486
step 458000 loss 0.1345101175091695
step 459000 loss 0.13

step 508000 loss 0.12955787990009413
step 509000 loss 0.13401356011605822
step 510000 loss 0.1309831263711676
step 511000 loss 0.12052694856165909
step 512000 loss 0.1225209368225187
error 512993 CUDA out of memory. Tried to allocate 236.00 MiB (GPU 0; 4.00 GiB total capacity; 2.17 GiB already allocated; 222.20 MiB free; 2.43 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 513000 loss 0.12551147224474699
step 514000 loss 0.12328751448937691
step 515000 loss 0.12693971003079788
error 515046 CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 4.00 GiB total capacity; 2.47 GiB already allocated; 20.20 MiB free; 2.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 516000 loss 0.1

step 558000 loss 0.11927361842989921
error 558690 CUDA out of memory. Tried to allocate 80.00 MiB (GPU 0; 4.00 GiB total capacity; 2.48 GiB already allocated; 44.20 MiB free; 2.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 559000 loss 0.1150800149468705
step 560000 loss 0.12144295880710707
step 561000 loss 0.11096268541831524
step 562000 loss 0.1243954921120312
step 563000 loss 0.1273931681246031
step 564000 loss 0.11630318406154401
step 565000 loss 0.11971781189972534
step 566000 loss 0.12106155532598495
step 567000 loss 0.11285549202375114
error 567007 CUDA out of memory. Tried to allocate 310.00 MiB (GPU 0; 4.00 GiB total capacity; 2.32 GiB already allocated; 280.20 MiB free; 2.37 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentati

step 640000 loss 0.10387712128972634
step 641000 loss 0.11719345789495855
error 641942 CUDA out of memory. Tried to allocate 306.00 MiB (GPU 0; 4.00 GiB total capacity; 2.58 GiB already allocated; 6.20 MiB free; 2.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 642000 loss 0.11218127903994173
step 643000 loss 0.1107912263199687
error 643828 CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 4.00 GiB total capacity; 2.58 GiB already allocated; 38.20 MiB free; 2.61 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 644000 loss 0.1065028185765259
step 645000 loss 0.10467859716597014
error 645398 CUDA out of memory. Tried to allocate 4.38 GiB (GPU 0; 4.00 GiB total capacity; 246

step 714000 loss 0.09866076239780523
step 715000 loss 0.10140288830338977
step 716000 loss 0.1028235488627106
error 716542 CUDA out of memory. Tried to allocate 140.00 MiB (GPU 0; 4.00 GiB total capacity; 2.50 GiB already allocated; 108.20 MiB free; 2.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 717000 loss 0.09861275741481222
error 717311 CUDA out of memory. Tried to allocate 78.00 MiB (GPU 0; 4.00 GiB total capacity; 1.92 GiB already allocated; 60.20 MiB free; 2.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 718000 loss 0.10642920645163395
step 719000 loss 0.09960286799632014
step 720000 loss 0.09978142785839736
error 720335 CUDA out of memory. Tried to allocate 108.0

step 797000 loss 0.09978106440638658
step 798000 loss 0.09857285513286479
error 798454 CUDA out of memory. Tried to allocate 232.00 MiB (GPU 0; 4.00 GiB total capacity; 2.45 GiB already allocated; 120.20 MiB free; 2.53 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 799000 loss 0.09432505177450366
step 800000 loss 0.09481737771281042
step 801000 loss 0.09354269078047946
step 802000 loss 0.09058417246723548
step 803000 loss 0.09838391212245914
step 804000 loss 0.09397814438841305
step 805000 loss 0.10082559155044146
step 806000 loss 0.09214344577700831
step 807000 loss 0.09286327820550651
step 808000 loss 0.0978928542973008
error 808036 CUDA out of memory. Tried to allocate 106.00 MiB (GPU 0; 4.00 GiB total capacity; 2.44 GiB already allocated; 98.20 MiB free; 2.55 GiB reserved in total by PyTorch) If reserved memory is >> allocated 

step 882000 loss 0.08650071184360422
error 882880 CUDA out of memory. Tried to allocate 44.00 MiB (GPU 0; 4.00 GiB total capacity; 2.54 GiB already allocated; 38.20 MiB free; 2.61 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 883000 loss 0.08501373364892788
step 884000 loss 0.09419221998809371
step 885000 loss 0.09134126584057231
error 885679 CUDA out of memory. Tried to allocate 44.00 MiB (GPU 0; 4.00 GiB total capacity; 2.55 GiB already allocated; 14.20 MiB free; 2.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 886000 loss 0.08886762151622679
step 887000 loss 0.0907416554717347
error 887342 CUDA out of memory. Tried to allocate 84.00 MiB (GPU 0; 4.00 GiB total capacity; 2

step 927000 loss 0.084660089220386
error 927377 CUDA out of memory. Tried to allocate 294.00 MiB (GPU 0; 4.00 GiB total capacity; 2.30 GiB already allocated; 284.20 MiB free; 2.37 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 928000 loss 0.08278056111210026
step 929000 loss 0.08946197853679769
step 930000 loss 0.08540668199257925
step 931000 loss 0.08112962641462218
step 932000 loss 0.09018070698645897
step 933000 loss 0.09070602148619947
step 934000 loss 0.08938274859683588
step 935000 loss 0.08740863130311481
step 936000 loss 0.08085799121065065
step 937000 loss 0.08702542889548931
error 937567 CUDA out of memory. Tried to allocate 102.00 MiB (GPU 0; 4.00 GiB total capacity; 2.44 GiB already allocated; 62.20 MiB free; 2.59 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb t

step 974000 loss 0.08481518008059356
error 974165 CUDA out of memory. Tried to allocate 136.00 MiB (GPU 0; 4.00 GiB total capacity; 2.43 GiB already allocated; 24.20 MiB free; 2.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
step 975000 loss 0.08928496128052939
step 976000 loss 0.08084241666365415
step 977000 loss 0.09004547361284494
step 978000 loss 0.08542263434198685
step 979000 loss 0.09093589480081574
step 980000 loss 0.09141773560782894
step 981000 loss 0.07908525465824641
step 982000 loss 0.08365372589917387
step 983000 loss 0.08568151694175322
step 984000 loss 0.08058231674996205
step 985000 loss 0.08335229357751087
error 985756 CUDA out of memory. Tried to allocate 44.00 MiB (GPU 0; 4.00 GiB total capacity; 2.56 GiB already allocated; 42.20 MiB free; 2.61 GiB reserved in total by PyTorch) If reserved memory is >> allocated m

In [55]:
# Switch the model to evaluation mode before generating; `;` hides the cell's output
model.eval();

In [56]:
# Correct every noisy dev sentence with a 2-beam search and store the model output
dev_small['fixed2'] = [fix(noisy, num_beams=2) for noisy in tqdm(dev_small['trash2'])]

  0%|          | 0/100 [00:00<?, ?it/s]

In [57]:
# change_amount: Levenshtein distance between the model input and the model output
dev_small['change_amount'] = dev_small.apply(lambda row: textdistance.levenshtein.distance(row.trash2, row.fixed2), axis=1)
# new_diff: Levenshtein distance between the reference text and the model output
dev_small['new_diff'] = dev_small.apply(lambda row: textdistance.levenshtein.distance(row.clean2, row.fixed2), axis=1)

# numeric_only=True: the frame also holds text columns, and pandas >= 2.0 raises
# a TypeError instead of silently skipping them when averaging.
dev_small.mean(numeric_only=True)

distance               1.2900
normalized_distance    0.0175
edit_max_cldiff        0.4600
edit_max_lendiff       0.0300
change_amount          1.0200
new_diff               1.6300
dtype: float64

In [58]:
# Guarded score: rows where the model changed fewer than 5 characters count their
# new distance to the reference; rows with 5 or more changes fall back to the
# `distance` column.
# NOTE(review): presumably this simulates rejecting large rewrites and keeping the input — confirm.
cnd = (
    dev_small['distance'] * dev_small['change_amount'].ge(5)
    + dev_small['new_diff'] * dev_small['change_amount'].lt(5)
)
print(cnd.sum())
# Relative reduction of the summed distance compared with the `distance` column
print(1 - cnd.sum() / dev_small['distance'].sum())

101
0.21705426356589153


In [59]:
# Output directory for the fine-tuned model and its tokenizer
path = '../models/t5-tiny-denoise-v2'

In [60]:
# Save the model and the tokenizer files into the same directory
model.save_pretrained(path)
tokenizer.save_pretrained(path)

('../models/t5-tiny-denoise-v2\\tokenizer_config.json',
 '../models/t5-tiny-denoise-v2\\special_tokens_map.json',
 '../models/t5-tiny-denoise-v2\\added_tokens.json')