In [1]:
with open("klej_ar/train.tsv", "r") as f:
    raw_train = f.readlines()

In [2]:
with open("klej_ar/dev.tsv", "r") as f:
    raw_dev = f.readlines()

In [3]:
def prepare_data(raw_data):
    corpus = []
    labels = []
    for doc in raw_data:
        record = doc.strip().split("\t")
        if len(record) != 2:
            continue
        text, target = record
        label = int(float(target))
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [4]:
train_corpus, train_labels = prepare_data(raw_train[1:])

In [5]:
test_corpus, test_labels = prepare_data(raw_dev[1:])

# herBERT

**KLEJ: Comprehensive Benchmark for Polish Language Understanding**   
Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik

https://www.aclweb.org/anthology/2020.acl-main.111.pdf

https://huggingface.co/transformers/master/model_doc/herbert.html

https://huggingface.co/transformers/master/model_doc/roberta.html?highlight=robertamodel#transformers.RobertaModel

- RobertaModel
- RobertaForCausalLM
- RobertaForMaskedLM
- RobertaForSequenceClassification
- RobertaForMultipleChoice
- RobertaForTokenClassification
- RobertaForQuestionAnswering

In [6]:
simplification = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
train_labels = [simplification[label] for label in train_labels]
test_labels = [simplification[label] for label in test_labels]

In [7]:
train_data = list(zip(train_corpus, train_labels))
test_data = list(zip(test_corpus, test_labels))

In [8]:
import random
import torch
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import RobertaForSequenceClassification, HerbertTokenizer

torch.manual_seed(42)
random.seed(42)

In [9]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [11]:
train_corpus[0]

'Jako do ceny dobra. Przyssawka mogłaby być lepsza. Po 2 miesiącach użytkowania musiałem nóżkę z przyssawką rozkręcić i przyssawkę podkleić bo guma zaczęła pękać od strony mocowania do uchwytu (uchwyt zaczął się po prostu trząść bo zrobił się luz).  Mechanizm mocowania telefonu póki co (3 miesiące użytkowania) działa bez zarzutu. '

In [12]:
tokens = tokenizer.tokenize(train_corpus[0])

In [13]:
tokens

['Jako</w>',
 'do</w>',
 'ceny</w>',
 'dobra</w>',
 '.</w>',
 'Przy',
 'ssa',
 'wka</w>',
 'mogłaby</w>',
 'być</w>',
 'lepsza</w>',
 '.</w>',
 'Po</w>',
 '2</w>',
 'miesiącach</w>',
 'użytkowania</w>',
 'musiałem</w>',
 'nó',
 'żkę</w>',
 'z</w>',
 'przy',
 'ssa',
 'wką</w>',
 'roz',
 'kręcić</w>',
 'i</w>',
 'przy',
 'ssa',
 'wkę</w>',
 'pod',
 'kle',
 'ić</w>',
 'bo</w>',
 'gu',
 'ma</w>',
 'zaczęła</w>',
 'pę',
 'kać</w>',
 'od</w>',
 'strony</w>',
 'moc',
 'owania</w>',
 'do</w>',
 'uchwy',
 'tu</w>',
 '(</w>',
 'uchwyt</w>',
 'zaczął</w>',
 'się</w>',
 'po</w>',
 'prostu</w>',
 'trzą',
 'ść</w>',
 'bo</w>',
 'zrobił</w>',
 'się</w>',
 'lu',
 'z</w>',
 ')</w>',
 '.</w>',
 'Mechanizm</w>',
 'moc',
 'owania</w>',
 'telefonu</w>',
 'póki</w>',
 'co</w>',
 '(</w>',
 '3</w>',
 'miesiące</w>',
 'użytkowania</w>',
 ')</w>',
 'działa</w>',
 'bez</w>',
 'zarzutu</w>',
 '.</w>']

In [14]:
len(tokens)

75

In [15]:
encoded_input = tokenizer.encode(train_corpus[0], return_tensors='pt')

In [16]:
encoded_input

tensor([[    0,  1215,    21,  1405,  1942,    15,   862,  6198,  4623,  7238,
            70, 13802,    15,    95,    89,  4773,  4227, 24467, 16589, 41452,
            19,   198,  6198, 19773,   190, 29726,    17,   198,  6198, 11273,
           213,  5063,  5510,    94,  1389,    48,  3713,  2775,  6091,    34,
           263,  4071,   311,    21, 13404,   115,    31, 46921,  2110,    20,
            41,   852, 39825,  2490,    94,  5751,    20,   681,    19,    29,
            15, 30490,  4071,   311,  8897, 12024,    45,    31,   114,  3528,
          4227,    29,  1469,   140, 18411,    15,     1]])

In [17]:
model = RobertaForSequenceClassification.from_pretrained("allegro/herbert-klej-cased-v1", 
                                                         num_labels=3, hidden_dropout_prob=0.5, 
                                                         attention_probs_dropout_prob=0.5)

Some weights of the model checkpoint at allegro/herbert-klej-cased-v1 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream

In [18]:
train_corpus[0]

'Jako do ceny dobra. Przyssawka mogłaby być lepsza. Po 2 miesiącach użytkowania musiałem nóżkę z przyssawką rozkręcić i przyssawkę podkleić bo guma zaczęła pękać od strony mocowania do uchwytu (uchwyt zaczął się po prostu trząść bo zrobił się luz).  Mechanizm mocowania telefonu póki co (3 miesiące użytkowania) działa bez zarzutu. '

In [19]:
encoded_input

tensor([[    0,  1215,    21,  1405,  1942,    15,   862,  6198,  4623,  7238,
            70, 13802,    15,    95,    89,  4773,  4227, 24467, 16589, 41452,
            19,   198,  6198, 19773,   190, 29726,    17,   198,  6198, 11273,
           213,  5063,  5510,    94,  1389,    48,  3713,  2775,  6091,    34,
           263,  4071,   311,    21, 13404,   115,    31, 46921,  2110,    20,
            41,   852, 39825,  2490,    94,  5751,    20,   681,    19,    29,
            15, 30490,  4071,   311,  8897, 12024,    45,    31,   114,  3528,
          4227,    29,  1469,   140, 18411,    15,     1]])

In [20]:
outputs = model(encoded_input)

In [21]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0141, -0.1494, -0.0072]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [22]:
PAD_TOKEN_ID = tokenizer.pad_token_id

def documents_to_batch(docs, max_len):
    tokenized = tokenizer(docs)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    for i, (inp, att) in enumerate(zip(input_ids, attention_mask)):
        inp_len = len(inp)
        inp = inp[:max_len] + [PAD_TOKEN_ID] * (max_len - inp_len)
        att = att[:max_len] + [PAD_TOKEN_ID] * (max_len - inp_len)
        input_ids[i], attention_mask[i] = inp, att
    X = torch.LongTensor(input_ids).to(DEVICE)
    ATT = torch.BoolTensor(attention_mask).to(DEVICE)
    return X, ATT

In [23]:
def train_on_batch(model, optimizer, X, ATT, Y):
    model.train()
    optimizer.zero_grad()
    output = model(input_ids=X, attention_mask=ATT, labels=Y)
    loss = output["loss"]
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    return loss.item()

In [24]:
def predict_on_batch(model, X, ATT, Y):
    model.eval()
    output = model(input_ids=X, attention_mask=ATT, labels=Y)
    decision = output["logits"].topk(1).indices.squeeze()
    loss = output["loss"].item()
    equal = decision == Y
    correct = sum(equal).item()
    return correct, decision, loss

In [25]:
model = model.to(DEVICE)
learning_rate = 0.000005
epochs = 20
batch_size = 10
max_len = 120
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
num_train_batches = len(train_data) // batch_size + int(bool(len(train_data) % batch_size))
num_test_batches = len(test_data) // batch_size + int(bool(len(test_data) % batch_size))

best_acc = 0

In [34]:
for epoch in range(epochs):
    random.shuffle(train_data)
    total_loss = 0
    for n in tqdm(range(num_train_batches)):
        datapoints = train_data[n * batch_size:(n + 1) * batch_size]
        documents, labels = list(zip(*datapoints))
        Y = torch.LongTensor(labels).to(DEVICE)
        X, ATT = documents_to_batch(documents, max_len)
        loss = train_on_batch(model, optimizer, X, ATT, Y)
        total_loss += loss
    print(total_loss)
    with torch.no_grad():
        total = 0
        correct = 0
        dev_loss = 0
        for n in tqdm(range(num_test_batches)):
            datapoints = test_data[n * batch_size:(n + 1) * batch_size]
            documents, labels = list(zip(*datapoints))
            Y = torch.LongTensor(labels).to(DEVICE)
            X, ATT = documents_to_batch(documents, max_len)
            result, _, loss = predict_on_batch(model, X, ATT, Y)
            dev_loss += loss
            total += batch_size
            correct += result
        acc = correct/total * 100
        print(f"acc: {acc}")
        print(f"loss: {dev_loss}")
        if acc > best_acc:
            best_acc = acc
            torch.save(model, "herbert_simple.model")

100%|██████████| 918/918 [04:11<00:00,  3.65it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.80it/s]

909.4574948847294


100%|██████████| 98/98 [00:07<00:00, 12.45it/s]


acc: 52.346938775510196
loss: 96.20249527692795


100%|██████████| 918/918 [04:09<00:00,  3.68it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.14it/s]

868.0260069668293


100%|██████████| 98/98 [00:07<00:00, 12.72it/s]


acc: 53.06122448979592
loss: 91.10123351216316


100%|██████████| 918/918 [04:08<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.12it/s]

733.7944690734148


100%|██████████| 98/98 [00:07<00:00, 12.69it/s]


acc: 61.63265306122449
loss: 82.15524110198021


100%|██████████| 918/918 [04:08<00:00,  3.69it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.81it/s]

667.2652001082897


100%|██████████| 98/98 [00:07<00:00, 12.71it/s]


acc: 72.75510204081633
loss: 68.91659331321716


100%|██████████| 918/918 [04:08<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.85it/s]

631.4610816985369


100%|██████████| 98/98 [00:07<00:00, 12.70it/s]


acc: 74.48979591836735
loss: 63.68057103455067


100%|██████████| 918/918 [04:07<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.81it/s]

608.53753798455


100%|██████████| 98/98 [00:07<00:00, 12.71it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 73.57142857142858
loss: 66.6466724127531


100%|██████████| 918/918 [04:07<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.10it/s]

590.8029479011893


100%|██████████| 98/98 [00:07<00:00, 12.75it/s]


acc: 74.6938775510204
loss: 60.831061244010925


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.51it/s]

567.4897343032062


100%|██████████| 98/98 [00:07<00:00, 12.72it/s]


acc: 75.81632653061224
loss: 59.05846221745014


100%|██████████| 918/918 [04:08<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.61it/s]

553.9323493950069


100%|██████████| 98/98 [00:07<00:00, 12.69it/s]


acc: 76.22448979591837
loss: 60.4273477345705


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.11it/s]

550.6100513897836


100%|██████████| 98/98 [00:07<00:00, 12.72it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 75.91836734693878
loss: 60.061823442578316


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.11it/s]

532.4648185186088


100%|██████████| 98/98 [00:07<00:00, 12.65it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 76.0204081632653
loss: 57.377476282417774


100%|██████████| 918/918 [04:08<00:00,  3.70it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.04it/s]

519.4986428506672


100%|██████████| 98/98 [00:07<00:00, 12.77it/s]


acc: 76.32653061224491
loss: 57.90367554873228


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.90it/s]

511.5248202867806


100%|██████████| 98/98 [00:07<00:00, 12.73it/s]


acc: 76.83673469387755
loss: 59.145870700478554


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.96it/s]

498.7831436507404


100%|██████████| 98/98 [00:07<00:00, 12.75it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 76.42857142857142
loss: 59.260906919837


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.66it/s]

495.19837080687284


100%|██████████| 98/98 [00:07<00:00, 12.77it/s]


acc: 77.14285714285715
loss: 58.71985176578164


100%|██████████| 918/918 [04:06<00:00,  3.72it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.19it/s]

484.7216311097145


100%|██████████| 98/98 [00:07<00:00, 12.79it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 76.42857142857142
loss: 61.95589941740036


100%|██████████| 918/918 [04:06<00:00,  3.72it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.93it/s]

473.6354301087558


100%|██████████| 98/98 [00:07<00:00, 12.76it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 76.63265306122449
loss: 61.554667234420776


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 12.78it/s]

473.58630425296724


100%|██████████| 98/98 [00:07<00:00, 12.73it/s]


acc: 77.75510204081633
loss: 58.84965969622135


100%|██████████| 918/918 [04:07<00:00,  3.71it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.05it/s]

458.8079054579139


100%|██████████| 98/98 [00:07<00:00, 12.71it/s]
  0%|          | 0/918 [00:00<?, ?it/s]

acc: 77.65306122448979
loss: 60.41280238702893


100%|██████████| 918/918 [04:07<00:00,  3.72it/s]
  2%|▏         | 2/98 [00:00<00:07, 13.22it/s]

457.5871388223022


100%|██████████| 98/98 [00:07<00:00, 12.81it/s]

acc: 77.75510204081633
loss: 61.04016913101077





In [32]:
model = torch.load("herbert_simple.model", map_location=DEVICE)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50560, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.5, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [33]:
preds = []
for n in tqdm(range(num_test_batches)):
    datapoints = test_data[n * batch_size:(n + 1) * batch_size]
    documents, labels = list(zip(*datapoints))
    Y = torch.LongTensor(labels).to(DEVICE)
    X, ATT = documents_to_batch(documents, max_len)
    _, pred, _ = predict_on_batch(model, X, ATT, Y)
    preds.append(pred)

100%|██████████| 98/98 [02:30<00:00,  1.54s/it]


In [34]:
preds[:2]

[tensor([2, 0, 0, 2, 0, 0, 2, 2, 2, 0]),
 tensor([2, 2, 0, 2, 1, 0, 0, 2, 2, 0])]

In [35]:
preds = [p for t in preds for p in t.tolist()]
documents, labels = list(zip(*test_data))

In [39]:
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       327
           1       0.38      0.13      0.20       137
           2       0.82      0.92      0.87       513

    accuracy                           0.78       977
   macro avg       0.66      0.63      0.62       977
weighted avg       0.74      0.78      0.75       977

