In [70]:
import torch
import torch.nn as nn
import numpy as np
import wandb

from typing import List
from IPython.display import clear_output

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence

from nerus import load_nerus
import nerus

import os
import json

from transformers import BertModel, BertConfig, BertTokenizer

In [7]:
docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
doc = next(docs)

In [8]:
sent = doc.sents[0]
sent.morph.tokens[0]

MorphToken(
    text='Вице-премьер',
    pos='NOUN',
    feats={'Animacy': 'Anim',
     'Case': 'Nom',
     'Gender': 'Masc',
     'Number': 'Sing'}
)

In [9]:
sent.morph

MorphMarkup(
    tokens=[MorphToken(
         text='Вице-премьер',
         pos='NOUN',
         feats={'Animacy': 'Anim',
          'Case': 'Nom',
          'Gender': 'Masc',
          'Number': 'Sing'}
     ),
     MorphToken(
         text='по',
         pos='ADP',
         feats={}
     ),
     MorphToken(
         text='социальным',
         pos='ADJ',
         feats={'Case': 'Dat', 'Degree': 'Pos', 'Number': 'Plur'}
     ),
     MorphToken(
         text='вопросам',
         pos='NOUN',
         feats={'Animacy': 'Inan',
          'Case': 'Dat',
          'Gender': 'Masc',
          'Number': 'Plur'}
     ),
     MorphToken(
         text='Татьяна',
         pos='PROPN',
         feats={'Animacy': 'Anim',
          'Case': 'Nom',
          'Gender': 'Fem',
          'Number': 'Sing'}
     ),
     MorphToken(
         text='Голикова',
         pos='PROPN',
         feats={'Animacy': 'Anim',
          'Case': 'Nom',
          'Gender': 'Fem',
          'Number': 'Sing'}
     ),
   

In [10]:
len(doc.sents)

6

In [11]:
sent.text

'Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сообщает РИА Новости.'

In [12]:
sent.tokens[0].text

'Вице-премьер'

In [16]:
class Vocab:
    def __init__(self, capacity=10000, save_path='./vocab'):
        self.capacity = capacity
        self.save_path = save_path

        self.max_len = 0
        self.max_chars = 0

        self.i2c = ['<SEP>', '<BEGIN>', '<END>', '<UNK>']
        self.c2i = {'<SEP>': 0, '<BEGIN>': 1, '<END>': 2, '<UNK>': 3}

        self.i2w = ['<SEP>', '<UNK>']
        self.w2i = {'<SEP>': 0, '<UNK>': 1}

        self.i2t = ['<SEP>', '<UNK>']
        self.t2i = {'<SEP>': 0, '<UNK>': 1}

    def load(self, max_len=200, max_chars=1000):
        with open(os.path.join(self.save_path, 'words.txt'), 'r') as v:
            self.w2i = json.load(v)
            self.i2w = [_ for _ in self.w2i]
            for k in self.w2i:
                self.i2w[self.w2i[k]] = k
        with open(os.path.join(self.save_path, 'chars.txt'), 'r') as v:
            self.c2i = json.load(v)
            self.i2c = [_ for _ in self.c2i]
            for k in self.c2i:
                self.i2c[self.c2i[k]] = k
        with open(os.path.join(self.save_path, 'labels.txt'), 'r') as v:
            self.t2i = json.load(v)
            self.i2t = [_ for _ in self.t2i]
            for k in self.t2i:
                self.i2t[self.t2i[k]] = k

        self.max_len = max_len
        self.max_chars = max_chars

    def save(self):
        with open(os.path.join(self.save_path, 'words.txt'), 'w') as v:
            json.dump(self.w2i, v)
        with open(os.path.join(self.save_path, 'chars.txt'), 'w') as v:
            json.dump(self.c2i, v)
        with open(os.path.join(self.save_path, 'labels.txt'), 'w') as v:
            json.dump(self.t2i, v)

    def read(self, docs):
        for _ in range(self.capacity):
            clear_output()
            print('Reading: ')
            print(_, '/', self.capacity)
            doc = next(docs)
            for sent in doc.sents:
                self.max_len = max(self.max_len, len(sent.tokens))
                chars_len = 0
                for token in sent.morph.tokens:
                    if token.pos not in self.t2i:
                        self.i2t.append(token.pos)
                        self.t2i[token.pos] = len(self.i2t) - 1
                    if token.text not in self.w2i:
                        self.i2w.append(token.text)
                        self.w2i[token.text] = len(self.i2w) - 1
                    for char in token.text:
                        chars_len += 1
                        if char not in self.c2i:
                            self.i2c.append(char)
                            self.c2i[char] = len(self.i2c) - 1
                self.max_chars = max(self.max_chars, chars_len + 10)

class POSDataset(Dataset):
    def __init__(self, vocab, capacity=2000):
        super().__init__()
        self.tokens = []
        self.chars = []
        self.targets = []
        self.capacity = capacity

        self.vocab=vocab

    def __getitem__(self, i):
        return torch.LongTensor(self.chars[i]), torch.LongTensor(self.tokens[i]), torch.LongTensor(self.targets[i])

    def __len__(self):
        return len(self.chars)

    def collate_fn(self, batch):
        tokens, words, pos_tags = list(zip(*batch))
        tokens = pad_sequence(tokens, batch_first=True)
        words = pad_sequence(words, batch_first=True)
        pos_tags = pad_sequence(pos_tags, batch_first=True)
        return tokens, words, pos_tags

    def read(self, docs):
        while len(self.chars) < self.capacity:
            clear_output()
            print('Reading: ')
            print(len(self.chars), '/', self.capacity)
            doc = next(docs)
            for sent in doc.sents:
                self.tokens.append([])
                self.chars.append([])
                self.targets.append([])
                for token in sent.morph.tokens:
                    if token.pos not in self.vocab.t2i:
                        self.targets[-1].append(self.vocab.t2i['<UNK>'])
                    else:
                        self.targets[-1].append(self.vocab.t2i[token.pos])
                    if token.text not in self.vocab.w2i:
                        self.tokens[-1].append(self.vocab.w2i['<UNK>'])
                    else:
                        self.tokens[-1].append(self.vocab.w2i[token.text])
                    self.chars[-1].append(self.vocab.c2i['<BEGIN>'])
                    for char in token.text:
                        if char not in self.vocab.c2i:
                            self.chars[-1].append(self.vocab.c2i['<UNK>'])
                        else:
                            self.chars[-1].append(self.vocab.c2i[char])
                self.chars[-1].append(self.vocab.c2i['<BEGIN>'])
            

In [21]:
BATCH_SIZE=128

docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
vocab = Vocab(capacity=150000)
vocab.read(docs)
vocab.save()

docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
train_pos_dataset = POSDataset(vocab, capacity=100000)
train_pos_dataset.read(docs)

val_pos_dataset = POSDataset(vocab, capacity=20000)
val_pos_dataset.read(docs)

test_pos_dataset = POSDataset(vocab, capacity=20000)
test_pos_dataset.read(docs)

train_loader = DataLoader(train_pos_dataset, batch_size=128, collate_fn=train_pos_dataset.collate_fn)
val_loader = DataLoader(val_pos_dataset, batch_size=128, collate_fn=val_pos_dataset.collate_fn)
test_loader = DataLoader(test_pos_dataset, batch_size=128, collate_fn=test_pos_dataset.collate_fn)

Reading: 
19992 / 20000


In [22]:
for batch in train_loader:
    print(batch[0].shape, batch[1].shape, batch[2].shape)
    break

torch.Size([128, 290]) torch.Size([128, 39]) torch.Size([128, 39])


# Flair Embeddings

In [23]:
class FlairEmbeddings(nn.Module):
    def __init__(self, n_tokens=len(vocab.c2i), max_words=vocab.max_len, embedding_dim=128, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, n_tokens)

        self.max_words = max_words
        self.n_tokens = n_tokens

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        
    def forward(self, x, hidden=None):
        mask = (x != 0).to(torch.long)

        lengths = mask.sum(dim=1).to('cpu')
        total_length = x.shape[1]
        x = self.embedding(x)

        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        output, hidden = self.lstm(x, hidden)
        output, _ = pad_packed_sequence(output, batch_first=True)

        out = self.dropout(output)
        out = self.fc(out)
        return output, (out, hidden)

    def predict(self, x, hidden=None):
        out, _ = self.forward(x.to(dtype=torch.long), hidden)
        emb = torch.zeros((x.shape[0], self.max_words, self.hidden_dim*2)).to(device=x.device)
        one_ids = (x == 1).nonzero()
        j = 0
        emb[one_ids[0, 0], 0, self.hidden_dim:] = out[one_ids[0, 0], one_ids[0, 1], self.hidden_dim:]
        for i in range(1, one_ids.shape[0]):
            if one_ids[i, 0] != one_ids[i - 1, 0]:
                emb[one_ids[i, 0] - 1, j, self.hidden_dim:] = 0
                j = 0
                emb[one_ids[i, 0], j, self.hidden_dim:] = out[one_ids[i, 0], one_ids[i, 1], self.hidden_dim:]
            else:
                emb[one_ids[i, 0], j, :self.hidden_dim] = out[one_ids[i, 0], one_ids[i, 1], :self.hidden_dim]
                j += 1
                emb[one_ids[i, 0], j, self.hidden_dim:] = out[one_ids[i, 0], one_ids[i, 1], self.hidden_dim:]

        return emb

In [27]:
class FlairTrainer:
    def __init__(self, model: nn.Module, train_loader, val_loader, lr=2e-5, betas=(0.9, 0.999),
                 project="flair_embeddings", save_every=None, save_path='./'):
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model
        self.save_path = save_path
        self.save_every = save_every
        self.name = project
        wandb.init(project=project)

    def train_epoch(self, cuda=True, clip=5):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        self.model.train()
        total_loss = 0
        for batch_idx, (tokens, _, __) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            if cuda:
                tokens = tokens.cuda()
            x = tokens[:, :-1]
            y = tokens[:, 1:]
            output = self.model(x.to(dtype=torch.long))[1][0]
            loss = self.criterion(output.view(-1, output.shape[-1]).to(dtype=torch.float),
                                  y.reshape(-1).to(dtype=torch.long))

            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()

            total_loss += loss.item()

            print('\rTrain loss: %4f, Batch: %d of %d' % (
                total_loss / (batch_idx + 1), batch_idx + 1, len(self.train_loader)), end='')
        print()
        loss = total_loss / len(self.train_loader)
        return loss

    def test_epoch(self, cuda=True):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        with torch.no_grad():
            self.model.eval()
            total_loss = 0
            for batch_idx, (tokens, _, __) in enumerate(self.val_loader):
                if cuda:
                    tokens = tokens.cuda()
                x = tokens[:, :-1]
                y = tokens[:, 1:]
                output = self.model(x.to(dtype=torch.long))[1][0]
                loss = self.criterion(output.view(-1, output.shape[-1]).to(dtype=torch.float),
                                      y.reshape(-1).to(dtype=torch.long))

                total_loss += loss.item()

                print('\rVal loss: %4f, Batch: %d of %d' % (
                    total_loss / (batch_idx + 1), batch_idx + 1, len(self.val_loader)), end='')
            print()
            loss = total_loss / len(self.val_loader)
            return loss

    @staticmethod
    def log(epoch, train_loss, test_loss):
        wandb.log({
            'train': {
                'loss': train_loss,
            },
            'val': {
                'loss': test_loss,
            },
            'epoch': epoch
        })

    def checkpoint(self, epoch):
        torch.save(self.model.state_dict(), os.path.join(self.save_path, self.name + str(epoch) + '.ckpt'))

    def fit(self, max_epochs: int = 20, cuda=True, clip=5, log=False):
        for epoch in range(max_epochs):
            if epoch and self.save_every and epoch % self.save_every == 0:
                self.checkpoint(epoch)
            print('\rEpoch: %d' % epoch)
            train_loss = self.train_epoch(cuda=cuda, clip=clip)
            test_loss = self.test_epoch(cuda=cuda)
            if log:
                self.log(epoch, train_loss, test_loss)

In [30]:
flair_model = FlairEmbeddings()
flair_trainer = FlairTrainer(flair_model, train_loader, val_loader, lr=3e-4, save_every=5)
flair_trainer.fit(log=True)

  "num_layers={}".format(dropout, num_layers))


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Epoch: 0
Train loss: 4.896546, Batch: 782 of 782
Val loss: 4.508872, Batch: 157 of 157
Epoch: 1
Train loss: 4.302160, Batch: 782 of 782
Val loss: 4.198440, Batch: 157 of 157
Epoch: 2
Train loss: 3.999873, Batch: 782 of 782
Val loss: 3.895781, Batch: 157 of 157
Epoch: 3
Train loss: 3.700885, Batch: 782 of 782
Val loss: 3.593791, Batch: 157 of 157
Epoch: 4
Train loss: 3.402564, Batch: 782 of 782
Val loss: 3.292335, Batch: 157 of 157
Epoch: 5
Train loss: 3.105017, Batch: 782 of 782
Val loss: 2.991888, Batch: 157 of 157
Epoch: 6
Train loss: 2.808870, Batch: 782 of 782
Val loss: 2.693204, Batch: 157 of 157
Epoch: 7
Train loss: 2.515046, Batch: 782 of 782
Val loss: 2.397543, Batch: 157 of 157
Epoch: 8
Train loss: 2.224989, Batch: 782 of 782
Val loss: 2.106562, Batch: 157 of 157
Epoch: 9
Train loss: 1.940727, Batch: 782 of 782
Val loss: 1.822737, Batch: 157 of 157
Epoch: 10
Train loss: 1.665172, Batch: 782 of 782
Val loss: 1.549513, Batch: 157 of 157
Epoch: 11
Train loss: 1.402245, Batch: 782

In [32]:
flair_model.cpu()

FlairEmbeddings(
  (embedding): Embedding(1528, 128)
  (lstm): LSTM(128, 256, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=1528, bias=True)
)

# POS tagging model

In [55]:
class PosTagger(nn.Module):
    def __init__(self, output_dim: int, flair, hidden_dim=300,
                 feedforward_dim=100, dropout_rate=0.1,
                 freeze_emb=True, num_embeddings=None, classic_emb_dim=128):
        super(PosTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = flair.hidden_dim * 2
        self.embedder = flair.cpu()
        self.classic_embeddings = None
        if num_embeddings:
            self.classic_embeddings = nn.Embedding(num_embeddings, classic_emb_dim)
            self.embedding_dim += classic_emb_dim
        if freeze_emb:
            self.embedder = self.embedder.eval()
            for param in self.embedder.parameters():
                param.requires_grad = False
        self.encoder = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=2, bidirectional=True,
                               batch_first=True)
        self.feedforward = nn.Linear(2 * hidden_dim, feedforward_dim)
        self.out = nn.Linear(feedforward_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.LongTensor, words: torch.LongTensor):
        mask = (words != 0).to(torch.long)
        lengths = mask.sum(dim=1).to('cpu')

        if self.classic_embeddings:
            flair = self.embedder.predict(x)
            classic = self.classic_embeddings(words)
            classic_target = torch.zeros(classic.shape[0], flair.shape[1], classic.shape[-1]).to(classic.device)
            classic_target[:, :classic.shape[1], :] = classic
            x = torch.cat([flair, classic_target], dim=-1)
        else:
            x = self.embedder.predict(x)
        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.encoder(x)
        x, _ = pad_packed_sequence(x, batch_first=True)

        x = self.feedforward(x)
        x = torch.relu(x)
        x = self.dropout(x)
        return self.out(x)

In [58]:
class POSTrainer:
    def __init__(self, model: nn.Module, train_loader, val_loader, lr=3e-3, betas=(0.9, 0.999),
                 project="flair_pos_tagger", save_every=None, save_path='./', name=None):
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model
        self.save_path = save_path
        self.save_every = save_every
        self.name = project
        wandb.init(project=project, name=name)

    def train_epoch(self, cuda=True, clip=5):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        self.model.train()
        total_loss = 0
        total = 0
        correct = 0
        for batch_idx, (tokens, words, pos_tags) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            if cuda:
                tokens = tokens.cuda()
                pos_tags = pos_tags.cuda()
                words = words.cuda()

            output = self.model(tokens.to(dtype=torch.long), words)
            loss = self.criterion(output.view(-1, output.shape[-1]),
                                  pos_tags.view(-1).to(dtype=torch.long))

            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()

            total_loss += loss.item()

            mask = (words != 0).to(torch.long)
            pred = torch.argmax(output, dim=-1)
            correct += ((pred == pos_tags)*mask).sum().item()
            total += mask.sum().item()
            print('\rTrain loss: %4f, Train accuracy: %4f, Batch: %d of %d' % (
                total_loss / (batch_idx + 1), correct / total, batch_idx + 1, len(self.train_loader)
            ), end='')
        print()
        loss, accuracy = total_loss / len(self.train_loader), correct / total
        return loss, accuracy

    def test_epoch(self, cuda=True):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        with torch.no_grad():
            self.model.eval()
            total_loss = 0
            total = 0
            correct = 0
            for batch_idx, (tokens, words, pos_tags) in enumerate(self.val_loader):
                if cuda:
                    tokens = tokens.cuda()
                    pos_tags = pos_tags.cuda()
                    words = words.cuda()

                output = self.model(tokens.to(dtype=torch.long), words)
                loss = self.criterion(output.view(-1, output.shape[-1]),
                                      pos_tags.view(-1).to(dtype=torch.long))
                total_loss += loss.item()

                mask = (words != 0).to(torch.long)
                pred = torch.argmax(output, dim=-1)
                correct += ((pred == pos_tags) * mask).sum().item()
                total += mask.sum().item()

                print('\rVal loss: %4f, Val accuracy: %4f, Batch: %d of %d' % (
                    total_loss / (batch_idx + 1), correct / total, batch_idx + 1, len(self.val_loader)
                ), end='')
            print()
            loss, accuracy = total_loss / len(self.val_loader), correct / total
            return loss, accuracy

    @staticmethod
    def log(epoch, train_loss, train_accuracy, test_loss, test_accuracy):
        wandb.log({
            'train': {
                'loss': train_loss,
                'acc': train_accuracy
            },
            'val': {
                'loss': test_loss,
                'acc': test_accuracy
            },
            'epoch': epoch
        })

    def checkpoint(self, epoch):
        torch.save(self.model.state_dict(), os.path.join(self.save_path, self.name + str(epoch) + '.ckpt'))

    def fit(self, max_epochs: int = 20, cuda=True, clip=5, log=False):
        for epoch in range(max_epochs):
            if epoch and self.save_every and epoch % self.save_every == 0:
                self.checkpoint(epoch)
            print('\rEpoch: %d' % epoch)
            train_loss, train_accuracy = self.train_epoch(cuda=cuda, clip=clip)
            test_loss, test_accuracy = self.test_epoch(cuda=cuda)
            if log:
                self.log(epoch, train_loss, train_accuracy, test_loss, test_accuracy)
        if self.save_every:
            self.checkpoint(max_epochs)

# Experiments

In [65]:
pos_model = PosTagger(len(vocab.t2i), flair_model)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_only, lr=3e-3, 10 epochs, hidden_dim=300')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, hidden_dim=512)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, lr=2e-5, name='flair_only, lr=2e-5, 10 epochs, hidden_dim=512')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, hidden_dim=128)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_only, lr=3e-3, 10 epochs, hidden_dim=128')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁█

0,1
epoch,1


Epoch: 0
Train loss: 0.277874, Train accuracy: 0.844580, Batch: 782 of 782
Val loss: 0.076120, Val accuracy: 0.924874, Batch: 157 of 157
Epoch: 1
Train loss: 0.064065, Train accuracy: 0.940464, Batch: 782 of 782
Val loss: 0.050280, Val accuracy: 0.950281, Batch: 157 of 157
Epoch: 2
Train loss: 0.046206, Train accuracy: 0.956739, Batch: 782 of 782
Val loss: 0.038078, Val accuracy: 0.962438, Batch: 157 of 157
Epoch: 3
Train loss: 0.036685, Train accuracy: 0.965414, Batch: 782 of 782
Val loss: 0.033927, Val accuracy: 0.966341, Batch: 157 of 157
Epoch: 4
Train loss: 0.030763, Train accuracy: 0.970841, Batch: 782 of 782
Val loss: 0.031339, Val accuracy: 0.969413, Batch: 157 of 157
Epoch: 5
Train loss: 0.026975, Train accuracy: 0.974111, Batch: 782 of 782
Val loss: 0.029424, Val accuracy: 0.971394, Batch: 157 of 157
Epoch: 6
Train loss: 0.023406, Train accuracy: 0.977262, Batch: 782 of 782
Val loss: 0.027996, Val accuracy: 0.972687, Batch: 157 of 157
Epoch: 7
Train loss: 0.020978, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 2.435118, Train accuracy: 0.475064, Batch: 782 of 782
Val loss: 2.195481, Val accuracy: 0.667277, Batch: 157 of 157
Epoch: 1
Train loss: 2.064369, Train accuracy: 0.724348, Batch: 782 of 782
Val loss: 1.982083, Val accuracy: 0.781865, Batch: 157 of 157
Epoch: 2
Train loss: 1.880241, Train accuracy: 0.800369, Batch: 782 of 782
Val loss: 1.810056, Val accuracy: 0.831031, Batch: 157 of 157
Epoch: 3
Train loss: 1.705973, Train accuracy: 0.838164, Batch: 782 of 782
Val loss: 1.631747, Val accuracy: 0.856731, Batch: 157 of 157
Epoch: 4
Train loss: 1.522045, Train accuracy: 0.859826, Batch: 782 of 782
Val loss: 1.439692, Val accuracy: 0.873506, Batch: 157 of 157
Epoch: 5
Train loss: 1.326913, Train accuracy: 0.875048, Batch: 782 of 782
Val loss: 1.237817, Val accuracy: 0.885902, Batch: 157 of 157
Epoch: 6
Train loss: 1.126294, Train accuracy: 0.886273, Batch: 782 of 782
Val loss: 1.034073, Val accuracy: 0.894908, Batch: 157 of 157
Epoch: 7
Train loss: 0.928654, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.267049, Train accuracy: 0.844632, Batch: 782 of 782
Val loss: 0.072028, Val accuracy: 0.929799, Batch: 157 of 157
Epoch: 1
Train loss: 0.067407, Train accuracy: 0.937131, Batch: 782 of 782
Val loss: 0.051451, Val accuracy: 0.949466, Batch: 157 of 157
Epoch: 2
Train loss: 0.049784, Train accuracy: 0.953405, Batch: 782 of 782
Val loss: 0.042557, Val accuracy: 0.958275, Batch: 157 of 157
Epoch: 3
Train loss: 0.041222, Train accuracy: 0.961124, Batch: 782 of 782
Val loss: 0.037761, Val accuracy: 0.962976, Batch: 157 of 157
Epoch: 4
Train loss: 0.035386, Train accuracy: 0.966538, Batch: 782 of 782
Val loss: 0.034903, Val accuracy: 0.965862, Batch: 157 of 157
Epoch: 5
Train loss: 0.030707, Train accuracy: 0.970720, Batch: 782 of 782
Val loss: 0.031339, Val accuracy: 0.969333, Batch: 157 of 157
Epoch: 6
Train loss: 0.027906, Train accuracy: 0.973209, Batch: 782 of 782
Val loss: 0.029790, Val accuracy: 0.970651, Batch: 157 of 157
Epoch: 7
Train loss: 0.025667, Train accu

In [66]:
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i))
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_and_classic_embeddings lr=3e-3, 10 epochs, hidden_dim=300')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i), hidden_dim=512)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, lr=2e-5, name='flair_and_classic_embeddings lr=2e-5, 10 epochs, hidden_dim=512')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i), hidden_dim=128)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_and_classic_embeddings lr=3e-3, 10 epochs, hidden_dim=300')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.209309, Train accuracy: 0.903603, Batch: 782 of 782
Val loss: 0.034328, Val accuracy: 0.968017, Batch: 157 of 157
Epoch: 1
Train loss: 0.023957, Train accuracy: 0.978655, Batch: 782 of 782
Val loss: 0.025686, Val accuracy: 0.976640, Batch: 157 of 157
Epoch: 2
Train loss: 0.013836, Train accuracy: 0.987581, Batch: 782 of 782
Val loss: 0.025926, Val accuracy: 0.977829, Batch: 157 of 157
Epoch: 3
Train loss: 0.009688, Train accuracy: 0.991108, Batch: 782 of 782
Val loss: 0.027298, Val accuracy: 0.977560, Batch: 157 of 157
Epoch: 4
Train loss: 0.007510, Train accuracy: 0.992960, Batch: 782 of 782
Val loss: 0.028459, Val accuracy: 0.978499, Batch: 157 of 157
Epoch: 5
Train loss: 0.006216, Train accuracy: 0.994118, Batch: 782 of 782
Val loss: 0.030365, Val accuracy: 0.978759, Batch: 157 of 157
Epoch: 6
Train loss: 0.005475, Train accuracy: 0.994724, Batch: 782 of 782
Val loss: 0.031749, Val accuracy: 0.978532, Batch: 157 of 157
Epoch: 7
Train loss: 0.004647, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 2.410107, Train accuracy: 0.536319, Batch: 782 of 782
Val loss: 2.159173, Val accuracy: 0.735777, Batch: 157 of 157
Epoch: 1
Train loss: 2.038514, Train accuracy: 0.780370, Batch: 782 of 782
Val loss: 1.966802, Val accuracy: 0.832773, Batch: 157 of 157
Epoch: 2
Train loss: 1.865614, Train accuracy: 0.851137, Batch: 782 of 782
Val loss: 1.806317, Val accuracy: 0.877960, Batch: 157 of 157
Epoch: 3
Train loss: 1.702158, Train accuracy: 0.886614, Batch: 782 of 782
Val loss: 1.638913, Val accuracy: 0.902024, Batch: 157 of 157
Epoch: 4
Train loss: 1.529262, Train accuracy: 0.907433, Batch: 782 of 782
Val loss: 1.458826, Val accuracy: 0.917763, Batch: 157 of 157
Epoch: 5
Train loss: 1.345619, Train accuracy: 0.921433, Batch: 782 of 782
Val loss: 1.268764, Val accuracy: 0.928228, Batch: 157 of 157
Epoch: 6
Train loss: 1.155103, Train accuracy: 0.931508, Batch: 782 of 782
Val loss: 1.074334, Val accuracy: 0.935730, Batch: 157 of 157
Epoch: 7
Train loss: 0.964152, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.201540, Train accuracy: 0.904869, Batch: 782 of 782
Val loss: 0.033950, Val accuracy: 0.968433, Batch: 157 of 157
Epoch: 1
Train loss: 0.024841, Train accuracy: 0.978014, Batch: 782 of 782
Val loss: 0.026312, Val accuracy: 0.975729, Batch: 157 of 157
Epoch: 2
Train loss: 0.014495, Train accuracy: 0.987091, Batch: 782 of 782
Val loss: 0.025666, Val accuracy: 0.977840, Batch: 157 of 157
Epoch: 3
Train loss: 0.010260, Train accuracy: 0.990669, Batch: 782 of 782
Val loss: 0.031603, Val accuracy: 0.975072, Batch: 157 of 157
Epoch: 4
Train loss: 0.008095, Train accuracy: 0.992448, Batch: 782 of 782
Val loss: 0.028982, Val accuracy: 0.978815, Batch: 157 of 157
Epoch: 5
Train loss: 0.006385, Train accuracy: 0.993950, Batch: 782 of 782
Val loss: 0.029737, Val accuracy: 0.979327, Batch: 157 of 157
Epoch: 6
Train loss: 0.005542, Train accuracy: 0.994636, Batch: 782 of 782
Val loss: 0.030726, Val accuracy: 0.978737, Batch: 157 of 157
Epoch: 7
Train loss: 0.004683, Train accu

In [67]:
pos_model = PosTagger(len(vocab.t2i), flair_model, freeze_emb=False)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, name='flair_no_freeze')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.273362, Train accuracy: 0.838490, Batch: 782 of 782
Val loss: 0.071590, Val accuracy: 0.930375, Batch: 157 of 157
Epoch: 1
Train loss: 0.064916, Train accuracy: 0.939655, Batch: 782 of 782
Val loss: 0.049390, Val accuracy: 0.951760, Batch: 157 of 157
Epoch: 2
Train loss: 0.046180, Train accuracy: 0.956824, Batch: 782 of 782
Val loss: 0.038343, Val accuracy: 0.962377, Batch: 157 of 157
Epoch: 3
Train loss: 0.036761, Train accuracy: 0.965310, Batch: 782 of 782
Val loss: 0.035645, Val accuracy: 0.964931, Batch: 157 of 157
Epoch: 4
Train loss: 0.030778, Train accuracy: 0.970733, Batch: 782 of 782
Val loss: 0.031877, Val accuracy: 0.968823, Batch: 157 of 157
Epoch: 5
Train loss: 0.026705, Train accuracy: 0.974288, Batch: 782 of 782
Val loss: 0.031471, Val accuracy: 0.969474, Batch: 157 of 157
Epoch: 6
Train loss: 0.023524, Train accuracy: 0.977200, Batch: 782 of 782
Val loss: 0.028808, Val accuracy: 0.971926, Batch: 157 of 157
Epoch: 7
Train loss: 0.021117, Train accu