In [1]:
with open("klej_ar/train.tsv", "r") as f:
    raw_train = f.readlines()

In [2]:
with open("klej_ar/dev.tsv", "r") as f:
    raw_dev = f.readlines()

In [3]:
def prepare_data(raw_data):
    corpus = []
    labels = []
    for doc in raw_data:
        record = doc.strip().split("\t")
        if len(record) != 2:
            continue
        text, target = record
        label = int(float(target))
        corpus.append(text)
        labels.append(label)
    return corpus, labels

In [4]:
train_corpus, train_labels = prepare_data(raw_train[1:])

In [5]:
test_corpus, test_labels = prepare_data(raw_dev[1:])

# herBERT

**KLEJ: Comprehensive Benchmark for Polish Language Understanding**   
Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik

https://www.aclweb.org/anthology/2020.acl-main.111.pdf

https://huggingface.co/transformers/master/model_doc/herbert.html

In [6]:
simplification = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
train_labels = [simplification[label] for label in train_labels]
test_labels = [simplification[label] for label in test_labels]

In [7]:
train_data = list(zip(train_corpus, train_labels))
test_data = list(zip(test_corpus, test_labels))

In [8]:
import random
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import RobertaModel, HerbertTokenizer

torch.manual_seed(42)
random.seed(42)

In [9]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [11]:
max_input_length = tokenizer.max_model_input_sizes["allegro/herbert-base-cased"]
print(max_input_length)

514


In [12]:
PAD_TOKEN_ID = tokenizer.pad_token_id

def documents_to_batch(docs, max_len):
    tokenized = tokenizer(docs)
    input_ids = tokenized["input_ids"]
    for i, inp in enumerate(input_ids):
        inp_len = len(inp)
        inp = inp[:max_len] + [PAD_TOKEN_ID] * (max_len - inp_len)
        input_ids[i] = inp
    X = torch.LongTensor(input_ids).to(DEVICE)
    return X

In [13]:
documents_to_batch(train_corpus[:2], 50)

tensor([[    0,  1215,    21,  1405,  1942,    15,   862,  6198,  4623,  7238,
            70, 13802,    15,    95,    89,  4773,  4227, 24467, 16589, 41452,
            19,   198,  6198, 19773,   190, 29726,    17,   198,  6198, 11273,
           213,  5063,  5510,    94,  1389,    48,  3713,  2775,  6091,    34,
           263,  4071,   311,    21, 13404,   115,    31, 46921,  2110,    20],
        [    0,    81,  6940, 11273, 24499,  1867, 10001,   304,    30,    41,
         19062,  3446,  3476,    20,    14,    26,   344,  1576,   429,    20,
          6132, 18768,    17,    16,    74,   581,   756,  1331, 19648, 20377,
          4111,  2236,  1639,   982,    17,   987,    25,    17,    55, 24499,
          1867,  2176,   130,    75,   126,   120,  5228,    15,   203,    44]])

In [14]:
def train_on_batch(model, criterion, optimizer, X, Y):
    model.train()
    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, Y)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    return loss.item()

In [15]:
def predict_on_batch(model, X, Y):
    model.eval()
    output = model(X)
    decision = output.topk(1).indices.squeeze()
    equal = decision == Y
    correct = sum(equal).item()
    return correct, decision

In [16]:
herbert = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

In [17]:
herbert.config.to_dict()['hidden_size']

768

In [18]:
class HerBERTGRUSentiment(nn.Module):
    def __init__(self, herbert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()
        
        self.herbert = herbert
        embedding_dim = herbert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, text):
        with torch.no_grad():
            embedded = self.herbert(text)[0]
        
        _, hidden = self.rnn(embedded)
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        
        output = self.softmax(self.dense(hidden))
        
        return output

In [19]:
HIDDEN_DIM = 256
OUTPUT_DIM = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = HerBERTGRUSentiment(herbert,
                            HIDDEN_DIM,
                            OUTPUT_DIM,
                            N_LAYERS,
                            BIDIRECTIONAL,
                            DROPOUT)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [21]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 127,632,387 trainable parameters


In [22]:
for name, param in model.named_parameters():                
    if name.startswith('herbert'):
        param.requires_grad = False

In [23]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,760,195 trainable parameters


In [24]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
dense.weight
dense.bias


In [25]:
criterion = torch.nn.NLLLoss()
learning_rate = 0.005
epochs = 20
batch_size = 20
max_len = 120
optimizer = torch.optim.Adam(model.parameters())
model = model.to(DEVICE)

In [26]:
num_train_batches = len(train_data) // batch_size + int(bool(len(train_data) % batch_size))
num_test_batches = len(test_data) // batch_size + int(bool(len(test_data) % batch_size))

best_acc = 0

In [35]:
for epoch in range(epochs):
    random.shuffle(train_data)
    total_loss = 0
    for n in tqdm(range(num_train_batches)):
        datapoints = train_data[n * batch_size:(n + 1) * batch_size]
        docs, labels = zip(*datapoints)
        Y = torch.tensor(labels).to(DEVICE)
        X = documents_to_batch(docs, max_len)
        loss = train_on_batch(model, criterion, optimizer, X, Y)
        total_loss += loss
    print(total_loss)
    
    with torch.no_grad():
        total = 0
        correct = 0
        for n in tqdm(range(num_test_batches)):
            datapoints = test_data[n * batch_size:(n + 1) * batch_size]
            docs, labels = zip(*datapoints)
            Y = torch.tensor(labels).to(DEVICE)
            X = documents_to_batch(docs, max_len)
            result, _ = predict_on_batch(model, X, Y)
            total += batch_size
            correct += result
        acc = correct/total * 100
        print(f"acc: {acc}")
        if acc > best_acc:
            best_acc = acc
            torch.save(model, "herbert_gru_simple.model")

100%|██████████| 459/459 [01:53<00:00,  4.05it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.12it/s]

295.9212861061096


100%|██████████| 49/49 [00:09<00:00,  5.00it/s]


acc: 77.04081632653062


100%|██████████| 459/459 [01:55<00:00,  3.99it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.07it/s]

231.74037800729275


100%|██████████| 49/49 [00:09<00:00,  4.91it/s]


acc: 77.75510204081633


100%|██████████| 459/459 [01:56<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

196.9043279364705


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 74.48979591836735


100%|██████████| 459/459 [01:56<00:00,  3.95it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

168.8777245208621


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]


acc: 78.26530612244898


100%|██████████| 459/459 [01:55<00:00,  3.96it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.06it/s]

139.64224070124328


100%|██████████| 49/49 [00:09<00:00,  4.91it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.3469387755102


100%|██████████| 459/459 [01:55<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

107.80150984181091


100%|██████████| 49/49 [00:10<00:00,  4.89it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.75510204081633


100%|██████████| 459/459 [01:56<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

83.7062169611454


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.55102040816327


100%|██████████| 459/459 [01:56<00:00,  3.94it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

70.2943873598706


100%|██████████| 49/49 [00:10<00:00,  4.89it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.75510204081633


100%|██████████| 459/459 [01:56<00:00,  3.95it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

55.81154702240019


100%|██████████| 49/49 [00:10<00:00,  4.89it/s]


acc: 78.36734693877551


100%|██████████| 459/459 [01:55<00:00,  3.96it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.11it/s]

53.347433627495775


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 76.32653061224491


100%|██████████| 459/459 [01:56<00:00,  3.95it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.02it/s]

42.49467245025153


100%|██████████| 49/49 [00:09<00:00,  4.92it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 78.36734693877551


100%|██████████| 459/459 [01:56<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

43.74333077614574


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.24489795918367


100%|██████████| 459/459 [01:56<00:00,  3.95it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

38.13253153504047


100%|██████████| 49/49 [00:09<00:00,  4.90it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 76.73469387755102


100%|██████████| 459/459 [01:55<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

37.41596916825074


100%|██████████| 49/49 [00:09<00:00,  4.94it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 76.42857142857142


100%|██████████| 459/459 [01:55<00:00,  3.96it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

33.45110570444376


100%|██████████| 49/49 [00:09<00:00,  4.95it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.24489795918367


100%|██████████| 459/459 [01:55<00:00,  3.97it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

42.20944005657657


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.75510204081633


100%|██████████| 459/459 [01:55<00:00,  3.97it/s]
  2%|▏         | 1/49 [00:00<00:09,  5.04it/s]

34.22136291235438


100%|██████████| 49/49 [00:09<00:00,  4.93it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 78.26530612244898


100%|██████████| 459/459 [01:55<00:00,  3.97it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

37.756234720558496


100%|██████████| 49/49 [00:09<00:00,  4.91it/s]


acc: 78.77551020408163


100%|██████████| 459/459 [01:56<00:00,  3.94it/s]
  2%|▏         | 1/49 [00:00<00:09,  4.95it/s]

35.69088855811788


100%|██████████| 49/49 [00:09<00:00,  4.90it/s]
  0%|          | 0/459 [00:00<?, ?it/s]

acc: 77.75510204081633


100%|██████████| 459/459 [01:55<00:00,  3.97it/s]
  0%|          | 0/49 [00:00<?, ?it/s]

38.33239246890298


100%|██████████| 49/49 [00:09<00:00,  4.94it/s]

acc: 77.9591836734694





In [27]:
model = torch.load("herbert_gru_simple.model", map_location=DEVICE)
model.eval()

HerBERTGRUSentiment(
  (herbert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50560, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [29]:
preds = []
for n in tqdm(range(num_test_batches)):
    datapoints = test_data[n * batch_size:(n + 1) * batch_size]
    docs, labels = list(zip(*datapoints))
    Y = torch.tensor(labels).to(DEVICE)
    X = documents_to_batch(docs, max_len)
    _, pred = predict_on_batch(model, X, Y)
    preds.append(pred)

100%|██████████| 49/49 [01:51<00:00,  2.28s/it]


In [30]:
preds[:2]

[tensor([2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 2, 1, 0, 0, 2, 2, 0]),
 tensor([2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 1, 2, 2, 2])]

In [31]:
preds = [p for t in preds for p in t.tolist()]
documents, labels = list(zip(*test_data))

In [32]:
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.73      0.90      0.81       327
           1       0.39      0.20      0.26       137
           2       0.89      0.88      0.89       513

    accuracy                           0.79       977
   macro avg       0.67      0.66      0.65       977
weighted avg       0.77      0.79      0.77       977

