The idea is the same as in notebooks 3 and 5 ("seq labeling"), but now we use a simple 1-gram (single-character) vocabulary. To compensate for this, we upsample the input characters and train with CTC loss.

# 1. Data

## 1.1. Load the parallel text

In [1]:
import pandas as pd
from tqdm.auto import tqdm, trange
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
import gc
import numpy as np
import random

def cleanup() -> None:
    """Run the Python garbage collector, then release PyTorch's cached CUDA memory."""
    # Collect first so that tensors which just became unreachable are freed
    # before the CUDA caching allocator is asked to give memory back.
    gc.collect()
    torch.cuda.empty_cache()

In [2]:
# Scratch check of torch.logsumexp: log(e^1 + e^2) ≈ 2.3133.
torch.logsumexp(torch.stack([torch.tensor(1), torch.tensor(2)]), 0)

tensor(2.3133)

In [3]:
# Load the tab-separated parallel dataset. As the displayed frame shows, it holds
# noisy ("trash") and clean sentence pairs, edit-distance statistics and a
# train/dev/test split label per row.
df_orig = pd.read_csv('../data/spellchecker_dataset_split.tsv', sep='\t')
df_orig

Unnamed: 0,trash,clean,trash2,clean2,distance,normalized_distance,split,edit_max_cldiff,edit_max_lendiff
0,"Шунда ук әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда уҡ әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда ук әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...","Шунда уҡ әсәйемдең тоҡсайын, төйөнсөктәрен күҙ...",1,0.015385,train,1,0
1,Унан беҙ өсөбөҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ әсәбеҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ өсөбөҙ ҙә ултырғыстарға ултырабыҙ.,Унан беҙ әсәбеҙ ҙә ултырғыстарға ултырабыҙ.,3,0.069767,test,1,0
2,"«Иҫән-Һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-Һау ғына тороғоҙ инде», - тип бышылдай у...","«Иҫән-һау ғына тороғоҙ инде», - тип бышылдай у...",1,0.014085,dev,1,0
3,"Минең генә бер кешем дә юҡ, тип шунда уҡ танау...","Минең генә бер кешем дә юҡ, - тип шунда уҡ тан...","Минең генә бер кешем дә юҡ, тип шунда уҡ танау...","Минең генә бер кешем дә юҡ, - тип шунда уҡ тан...",2,0.029412,train,0,0
4,"Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...","Ай йөрөгән, ти, йыл йөрөгән, ти, батыр, ете та...",1,0.012500,train,1,0
...,...,...,...,...,...,...,...,...,...
23886,"Эҫтәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эстәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эҫтәрендә бүре үк оломаһа ла, эттәр шыңшый баш...","Эстәрендә бүре үк оломаһа ла, эттәр шыңшый баш...",1,0.020000,dev,1,0
23887,Үткән йәйҙә яман томра көндө Кәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Ҡәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Кәҙерғол төбәгенд...,Үткән йәйҙә яман томра көндө Ҡәҙерғол төбәгенд...,1,0.009524,train,1,0
23888,"Кайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Ҡайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Кайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...","Ҡайтыр алдынан салбарҙы эҙләй башлаһа, таба ал...",1,0.020000,train,1,0
23889,Кыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Ҡыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Кыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,Ҡыш урталарында бер көн Әбдрәшит ат аҙбарынан ...,1,0.009174,train,1,0


In [4]:
# Build the training subset in three stages, printing the shape after each one
# to show how many rows every filter removes.
train_rows = df_orig[df_orig['split'] == 'train']
print(train_rows.shape)

# Keep only rows whose `edit_max_cldiff` statistic is at most 3 ...
train_rows_cldiff = train_rows[train_rows['edit_max_cldiff'] <= 3]
print(train_rows_cldiff.shape)
# ... and whose `edit_max_lendiff` statistic is at most 1. The copy detaches the
# result from `df_orig` so it can be modified later without pandas warnings.
df_orig_train = train_rows_cldiff[train_rows_cldiff['edit_max_lendiff'] <= 1].copy()
print(df_orig_train.shape)

(14382, 9)
(14171, 9)
(14085, 9)


In [5]:
# Dev subset, filtered with the same edit-statistic thresholds as the training data.
dev_mask = (
    (df_orig['split'] == 'dev')
    & (df_orig['edit_max_cldiff'] <= 3)
    & (df_orig['edit_max_lendiff'] <= 1)
)
df_orig_dev = df_orig[dev_mask]
print(df_orig_dev.shape)
# Fixed-seed sample of 100 rows used for quick evaluation during training.
dev_small = df_orig_dev.sample(n=100, random_state=1).copy()

(4611, 9)


## 1.2. Corrupt the clean sents

In [35]:
# Load the clean monolingual sentences, one per line, stripping surrounding whitespace.
# The encoding is passed explicitly: the text is Cyrillic/Bashkir, and the
# platform default encoding is not UTF-8 everywhere (notably on Windows).
with open('../data/clean_bk_sents.txt', 'r', encoding='utf-8') as f:
    cs2 = [line.strip() for line in f]
print(len(cs2))

1605495


In [9]:
# Gather every character that occurs in the clean corpus or in either side of
# the parallel training data.
seen_chars = set()
for corpus in (cs2, df_orig_train.trash, df_orig_train.clean):
    for sentence in corpus:
        seen_chars.update(sentence)
all_chars = ''.join(sorted(seen_chars))
print(all_chars)
print(len(all_chars))

# Close the inventory under case conversion, so that the upper- and lower-case
# counterparts of every observed character are included as well.
cased_chars = set(all_chars) | set(all_chars.upper()) | set(all_chars.lower())
all_chars = ''.join(sorted(cased_chars))
print(all_chars)
print(len(all_chars))

 !"#%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¢¦§ª«¬­®°²µ·»¿ÀÁÂÄÉÊÌÍÎÐÒÖ×ØÜÝÞàáâãäåçèéêëìíîïðñòóôõö÷øûüýÿāČčğıłŠšūŽžƏəɵʺ̶́ΒΠΧЁЃЄЅІЉЋЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёђѓєѕіїљњћќўџѲѳҐҒғҖҗҘҙқҠҡҢңҪҫҮүҰҺһӊӘәӧӨөاتخرسعكنو​‎‐‑‒–—―‘’“”„•…‰›⁠№Ⅰ→∂−≥⏰─●☎⚡✒✓✨﻿🌸🎭📝
349
 !"#%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¢¦§ª«¬­®°²µ·»¿ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÛÜÝÞàáâãäåçèéêëìíîïðñòóôõö÷øûüýþÿĀāČčĞğıŁłŠšŪūŸŽžƏƟəɵʺ̶́ΒΜΠΧβπχЁЂЃЄЅІЇЉЊЋЌЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёђѓєѕіїљњћќўџѲѳҐґҒғҖҗҘҙҚқҠҡҢңҪҫҮүҰұҺһӉӊӘәӦӧӨөاتخرسعكنو​‎‐‑‒–—―‘’“”„•…‰›⁠№Ⅰⅰ→∂−≥⏰─●☎⚡✒✓✨﻿🌸🎭📝
382


In [28]:
from noisers import add_simple_noise
from noisers import Noiser

In [39]:
# Load the noise model stored in noise_model_v1.json (Noiser comes from the local `noisers` module).
noiser = Noiser.load('noise_model_v1.json')

In [36]:
# Pick a random clean sentence to demonstrate the two noisers on.
text = random.choice(cs2)
text

'Тәңре уға тәғәйен иткән юлды һәр кем үҙе генә үтә һәм маңлайына яҙылған эш-ғәмәлдәрҙе фәҡәт үҙ ҡулы менән атҡара.'

In [38]:
# Corrupt the sample sentence with the simple noiser, which receives the full
# character inventory (presumably as the pool of characters to insert/substitute
# -- confirm in noisers.py).
add_simple_noise(text, all_chars)

'Тәңре уға тәғәйен иткән юлды һәр кем үҙе генә үт¦ә һәм маңлайына яҙылған%эш-ғәмәлдәрҙе фәҡәт☎ ћҙ ҡулы менән атҡара.'

In [41]:
# Corrupt the same sample sentence with the loaded noise model, for comparison.
noiser.add_noise(text)

'Тәңре уға тәғәйен иткән юлды һәр кем үҙе генә үтә һәм маң*лайына яҙьшған эш-ғәмәлдәрҙе фәҡәт үҙ ҡулы менән атҡарә.'

# 2. Building the model

In [12]:
# Special tokens come first, followed by one entry per character.
# NOTE(review): '▁' is the first entry; if the tokenizer assigns ids in list order
# it gets id 0, which is the default blank index of torch's ctc_loss -- confirm
# against char_tokenizer.py.
SPECIAL_TOKENS = ['▁', '[pad]', '[unk]', '[cls]', '[sep]', '[mask]', '[bos]', '[eos]']
VOCAB = [*SPECIAL_TOKENS, *all_chars]
print(len(VOCAB))

390


In [13]:
# Write the vocabulary to disk, one token per line.
# The encoding is passed explicitly: the vocabulary contains Cyrillic, Arabic and
# emoji characters that many platform-default encodings (e.g. on Windows) cannot
# represent, which would make the write fail or produce an unreadable file.
with open('char_vocab.txt', 'w', encoding='utf-8') as f:
    for t in VOCAB:
        print(t, file=f)

In [15]:
import char_tokenizer
from importlib import reload
reload(char_tokenizer)
from char_tokenizer import CharTokenizer

In [16]:
# Build the character-level tokenizer from char_vocab.txt. Its model_max_length
# is reused as max_position_embeddings in the model config.
tokenizer = CharTokenizer(vocab_file='char_vocab.txt', model_max_length=1024)

In [10]:
from transformers import BertConfig, BertForMaskedLM

In [17]:
# Configuration of a small BERT encoder over the character vocabulary:
# 4 layers, hidden size 256, 8 attention heads.
model_cfg = BertConfig(
    vocab_size=len(tokenizer),  # one output class per tokenizer entry
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=8,
    intermediate_size=512,
    max_position_embeddings=tokenizer.model_max_length,  # keep in sync with the tokenizer's length limit
    type_vocab_size=1,  # only a single token type (segment) is used
    pad_token_id=tokenizer.pad_token_id,
    position_embedding_type='relative_key_query',  # relative rather than absolute position embeddings
)

In [18]:
# Instantiate the model from the config alone; no pretrained checkpoint is loaded here.
model = BertForMaskedLM(model_cfg)

In [19]:
# Directory where the model and the tokenizer are saved.
MODEL_NAME = '../models/bert-char-ctc-bak-denoise'

In [20]:
# Save the freshly created model together with its tokenizer, so that both can
# later be reloaded from the same directory.
model.save_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)

('../models/bert-char-ctc-bak-denoise\\tokenizer_config.json',
 '../models/bert-char-ctc-bak-denoise\\special_tokens_map.json',
 '../models/bert-char-ctc-bak-denoise\\vocab.txt',
 '../models/bert-char-ctc-bak-denoise\\added_tokens.json')

# 3. Training loop

In [1]:
# Move the model to the GPU; the trailing ';' suppresses the cell output.
# NOTE(review): the recorded NameError shows this cell was last executed in a
# fresh kernel, before `model` had been defined -- re-run the cells above first.
model.cuda();

NameError: name 'model' is not defined

In [2]:
import textdistance

def fix_text(text, verbose=False, spaces=2):
    """Correct ``text`` with the character model, using greedy (argmax) decoding.

    Args:
        text: Input passed to the module-level ``tokenizer``. Only the first
            sequence of the resulting batch is decoded.
        verbose: Accepted for interface compatibility; currently unused.
        spaces: Forwarded to the tokenizer's ``spaces`` argument (presumably the
            number of filler tokens added per character to upsample the input
            for CTC -- confirm in ``char_tokenizer``).

    Returns:
        The string that ``tokenizer.decode`` builds from the most likely token
        id at every position, with special tokens skipped.
    """
    with torch.inference_mode():
        batch = tokenizer(
            text,
            return_tensors='pt',
            spaces=spaces,
            padding=True,
            truncation=True,
            # The model is configured with a single token type, so segment ids
            # carry no information. Omitting them keeps this function consistent
            # with the `fix_text` used in the "Reproduce the evaluation" section.
            return_token_type_ids=False,
        ).to(model.device)
        # argmax is unaffected by log_softmax, so the raw logits are sufficient.
        best_ids = model(**batch).logits[0].argmax(-1)
    return tokenizer.decode(best_ids, skip_special_tokens=True)

In [79]:
def eval_model(spaces=2):
    """Score the current model on ``dev_small``.

    Side effect: (re)writes the columns ``fixed``, ``change_amount`` and
    ``new_diff`` of the module-level ``dev_small`` frame.

    Args:
        spaces: Forwarded to ``fix_text``.

    Returns:
        ``1 - sum(new_diff) / sum(distance)``: the share of the original total
        edit distance that the model removed (negative if it made things worse).
    """
    levenshtein = textdistance.levenshtein.distance
    dev_small['fixed'] = [fix_text(noisy, spaces=spaces) for noisy in dev_small.trash2]
    # How much the model changed its input.
    dev_small['change_amount'] = [
        levenshtein(noisy, fixed) for noisy, fixed in zip(dev_small.trash2, dev_small.fixed)
    ]
    # How far the model output still is from the reference.
    dev_small['new_diff'] = [
        levenshtein(clean, fixed) for clean, fixed in zip(dev_small.clean2, dev_small.fixed)
    ]
    remaining_errors = dev_small.new_diff.sum()
    original_errors = dev_small.distance.sum()
    return 1 - remaining_errors / original_errors

In [75]:
# Evaluate the current model on dev_small with the default `spaces` setting.
eval_model()

0.015503875968992276

In [45]:
from torch.optim import AdamW
# Optimise only the parameters that require gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=1e-4, weight_decay=1e-2)
cleanup()

In [46]:
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
# Warm the learning rate up over the first 1000 steps, then keep it constant.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)

In [47]:
# Running average of the training loss (shown in the progress bar) and the full
# loss history; len(losses) is also the step at which the training loop resumes.
ewm_loss = 0
losses = []

In [63]:
# Number of sentence pairs per training batch.
batch_size = 3

# Probability of drawing a batch from the real parallel data.
share_real = 0.1
# Probability of drawing a batch of clean text corrupted by the loaded noise model;
# the remaining batches use the simple noiser.
share_noiser = 0.4
# Probability that a synthetic example is left uncorrupted (input == target).
p_keep = 0.2

report_steps = 1000  # how often (in steps) the results are printed
cleanup_steps = 100  # how often (in batches) memory is freed

gradient_steps = 1  # how often (in batches) the parameters are updated
# Maximum number of steps averaged in the running training loss.
window = 1000

Wow, this is the speed I like! 5 iterations per second, with 64 samples per batch, on my laptop GPU!

In [76]:
# Drop references left over from a previous (possibly interrupted) run of this
# cell so that their GPU memory can be reclaimed before training starts.
loss, logits, batch, batch_labels = None, None, None, None
cleanup()
model.train()

# Start the step counter at the number of losses already recorded, so that
# re-running this cell resumes training instead of restarting it.
tq = trange(len(losses), 300_000)
for i in tq:
    # Pick the data source for this batch.
    r = random.random()
    if r < share_real:
        # Real noisy/clean pairs from the parallel training set.
        batch = df_orig_train.sample(batch_size)
        xx, yy = batch.trash2.tolist(), batch.clean2.tolist()
    elif r < share_real + share_noiser:
        # Clean sentences corrupted by the loaded noise model; each sentence is
        # left untouched with probability p_keep.
        yy = random.sample(cs2, batch_size)
        xx = [noiser.add_noise(text, edit_rate=0.05) if random.random() > p_keep else text for text in yy]
    else:
        # Clean sentences corrupted by the simple noiser, again keeping a
        # sentence untouched with probability p_keep.
        yy = random.sample(cs2, batch_size)
        xx = [add_simple_noise(text, all_chars, edit_rate=0.05) if random.random() > p_keep else text for text in yy]

    # Vary the tokenizer's `spaces` setting for the inputs at random; the labels
    # are always tokenized with spaces=0 and without special tokens.
    random_spaces = random.choices([0, 1, 2], weights=[0.1, 0.7, 0.2])[0]
    batch = tokenizer(xx, return_tensors='pt', spaces=random_spaces, padding=True, truncation=True).to(model.device)
    batch_labels = tokenizer(yy, return_tensors='pt', spaces=0, padding=True, truncation=True, add_special_tokens=False).to(model.device)

    try:
        logits = torch.log_softmax(model(**batch).logits, dim=-1)
        loss = torch.nn.functional.ctc_loss(
            logits.transpose(1, 0),  # ctc_loss expects (time, batch, classes)
            batch_labels.input_ids,
            batch.attention_mask.sum(1),  # input lengths, padding excluded
            batch_labels.attention_mask.sum(1),  # target lengths, padding excluded
            reduction='mean',
            zero_infinity=True,  # infeasible alignments yield zero loss instead of inf
        )
        # Scale the loss so that gradients accumulated over `gradient_steps`
        # batches average, rather than sum, the per-batch gradients.
        (loss / gradient_steps).backward()

        # Update after the last batch of each accumulation group. Using (i + 1)
        # avoids an update after a single batch at i == 0 when gradient_steps > 1.
        if (i + 1) % gradient_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
    except RuntimeError as e:
        # Best-effort handling: report the offending batch sizes, discard the
        # batch together with any partially accumulated gradients, and go on.
        print(
            'error', i,
            'sizes:', batch['input_ids'].shape, max(len(_) for _ in xx),
            '/', batch_labels['input_ids'].shape, max(len(_) for _ in yy),
            e
        )
        # To debug instead of skipping the batch, re-raise here: raise e
        loss, logits, batch, batch_labels = None, None, None, None
        optimizer.zero_grad(set_to_none=True)
        cleanup()
        continue

    # Read the scalar loss once and reuse it below.
    loss_value = loss.item()
    # Running mean over at most `window` steps. The "+ 1" counts the current
    # step, so that the first recorded loss is not discarded by the second one.
    w = 1 / min(len(losses) + 1, window)
    ewm_loss = ewm_loss * (1-w) + loss_value * w
    losses.append(loss_value)
    tq.set_description(f'{ewm_loss:3.3f}')

    if len(losses) % report_steps == 0:
        # Evaluate in eval mode, then switch back to training mode.
        model.eval()
        print('step', len(losses), 'loss', np.mean(losses[-report_steps:]), 'error decrease', eval_model())
        model.train()
        if i > 0:
            print('SAVING')
            model.save_pretrained(MODEL_NAME)
            tokenizer.save_pretrained(MODEL_NAME)
    if i % cleanup_steps == 0:
        cleanup()

  0%|          | 0/297826 [00:00<?, ?it/s]

step 3000 loss 0.3351442826408893 error decrease -0.007751937984496138
SAVING
step 4000 loss 0.36188816670142115 error decrease 0.023255813953488413
SAVING
step 5000 loss 0.30822691893391313 error decrease 0.03100775193798455
SAVING
step 6000 loss 0.3383087878935039 error decrease 0.007751937984496138
SAVING
step 7000 loss 0.3273058985415846 error decrease 0.015503875968992276
SAVING
step 8000 loss 0.3242420162037015 error decrease 0.007751937984496138
SAVING
step 9000 loss 0.31116471908800303 error decrease 0.0
SAVING
step 10000 loss 0.33094022853299976 error decrease 0.023255813953488413
SAVING
step 11000 loss 0.3342750302515924 error decrease -0.09302325581395343
SAVING
step 12000 loss 0.2979228977262974 error decrease 0.046511627906976716
SAVING
step 13000 loss 0.29272264916449786 error decrease 0.054263565891472854
SAVING
step 14000 loss 0.26522703042626383 error decrease 0.07751937984496127
SAVING
step 15000 loss 0.2960522537855431 error decrease 0.015503875968992276
SAVING
step 

step 110000 loss 0.146723149424186 error decrease 0.3565891472868217
SAVING
step 111000 loss 0.15154756888188423 error decrease 0.3798449612403101
SAVING
step 112000 loss 0.160171855897177 error decrease 0.39534883720930236
SAVING
step 113000 loss 0.14404648092249409 error decrease 0.4031007751937985
SAVING
step 114000 loss 0.1458556364630349 error decrease 0.37209302325581395
SAVING
step 115000 loss 0.14236843575211242 error decrease 0.3875968992248062
SAVING
step 116000 loss 0.14362568270601334 error decrease 0.33333333333333337
SAVING
step 117000 loss 0.14816099842824043 error decrease 0.39534883720930236
SAVING
step 118000 loss 0.13235429579624905 error decrease 0.3875968992248062
SAVING
step 119000 loss 0.151375465081539 error decrease 0.2945736434108527
SAVING
step 120000 loss 0.14319267445942388 error decrease 0.31007751937984496
SAVING
step 121000 loss 0.13956063600396737 error decrease 0.39534883720930236
SAVING
step 122000 loss 0.14835485884686933 error decrease 0.33333333333

KeyboardInterrupt: 

In [83]:
# Drop references to the last batch's tensors and to the gradients, then free
# cached GPU memory.
# NOTE(review): the recorded error shows the CUDA context was already broken by a
# device-side assert when this cell ran; a kernel restart is needed in that case.
loss, logits, batch, batch_labels = None, None, None, None
optimizer.zero_grad(set_to_none=True)
cleanup()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [82]:
# Try the model on the sample sentence chosen earlier.
fix_text(text)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [77]:
# Switch the model to evaluation mode before the manual evaluation below;
# the trailing ';' suppresses the cell output.
model.eval();

In [69]:
# Run the model over the noisy side of the small dev sample, with a progress bar.
dev_small['fixed'] = [fix_text(text) for text in tqdm(dev_small.trash2)]

  0%|          | 0/100 [00:00<?, ?it/s]

In [71]:
# change_amount: edit distance between the model's input and its output.
# new_diff: edit distance between the reference text and the model's output.
dev_small['change_amount'] = dev_small.apply(lambda row: textdistance.levenshtein.distance(row.trash2, row.fixed), axis=1)
dev_small['new_diff'] = dev_small.apply(lambda row: textdistance.levenshtein.distance(row.clean2, row.fixed), axis=1)

In [72]:
# Column means of the numeric statistics. `numeric_only=True` is required because
# dev_small also holds text columns: pandas 2.x raises a TypeError for them
# instead of silently skipping them as older versions did.
dev_small.mean(numeric_only=True)

distance               1.2900
normalized_distance    0.0175
edit_max_cldiff        0.4600
edit_max_lendiff       0.0300
change_amount          0.0900
new_diff               1.2700
dtype: float64

In [73]:
# Share of the original total edit distance removed by the model
# (1 = every error fixed, 0 = no net change, negative = made worse).
1 - dev_small.new_diff.sum() / dev_small.distance.sum()

0.015503875968992276

In [81]:
# Compare the evaluation score across values of the tokenizer's `spaces` argument.
for s in range(3):
    print(eval_model(spaces=s))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [5]:
# Scratch cell: tuple concatenation check, not used by the rest of the notebook.
(1, 2) + (3, 5, 6)

(1, 2, 3, 5, 6)

# Reproduce the evaluation

In [10]:
from transformers import AutoModelForMaskedLM
from char_tokenizer import CharTokenizer

In [11]:
# Reload the saved model and tokenizer from disk and move the model to the GPU.
MODEL_NAME = '../models/bert-char-ctc-bak-denoise'
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).cuda()
tokenizer = CharTokenizer.from_pretrained(MODEL_NAME)

In [12]:
import torch

def fix_text(text, verbose=False, spaces=2):
    """Correct ``text`` with the reloaded model, using greedy (argmax) decoding.

    Args:
        text: Input passed to the module-level ``tokenizer``. Only the first
            sequence of the resulting batch is decoded.
        verbose: Accepted but not used by this function.
        spaces: Forwarded to the tokenizer's ``spaces`` argument.

    Returns:
        The string that ``tokenizer.decode`` builds from the most likely token
        id at every position, with special tokens skipped.
    """
    encode_kwargs = dict(
        return_tensors='pt',
        spaces=spaces,
        padding=True,
        truncation=True,
        return_token_type_ids=False,
    )
    with torch.inference_mode():
        encoded = tokenizer(text, **encode_kwargs).to(model.device)
        log_probs = torch.log_softmax(model(**encoded).logits, dim=-1)
    first_sequence_ids = log_probs[0].argmax(-1)
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

In [15]:
import textdistance

In [16]:
def eval_model(spaces=1):
    """Score the reloaded model on ``dev_small``.

    Side effect: (re)writes the columns ``fixed``, ``change_amount`` and
    ``new_diff`` of the module-level ``dev_small`` frame.

    Args:
        spaces: Forwarded to ``fix_text``.

    Returns:
        ``1 - sum(new_diff) / sum(distance)``: the share of the original total
        edit distance that the model removed (negative if it made things worse).
    """
    edit_distance = textdistance.levenshtein.distance

    outputs = []
    for noisy in dev_small.trash2:
        outputs.append(fix_text(noisy, spaces=spaces))
    dev_small['fixed'] = outputs

    # Distance from the model's input, and from the reference, to its output.
    dev_small['change_amount'] = [
        edit_distance(source, output) for source, output in zip(dev_small.trash2, dev_small.fixed)
    ]
    dev_small['new_diff'] = [
        edit_distance(target, output) for target, output in zip(dev_small.clean2, dev_small.fixed)
    ]

    errors_after = dev_small.new_diff.sum()
    errors_before = dev_small.distance.sum()
    return 1 - errors_after / errors_before

In [18]:
# Compare the evaluation score of the reloaded model across values of the
# tokenizer's `spaces` argument (0 to 3).
for s in range(4):
    print(eval_model(spaces=s))

-0.9147286821705427
0.4108527131782945
0.4108527131782945
0.26356589147286824
