In [4]:
import torch
import torch.nn as nn
import numpy as np
import wandb

from typing import List
from IPython.display import clear_output

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence

from nerus import load_nerus
import nerus

import os
import json

from transformers import BertModel, BertConfig, BertTokenizer

In [6]:
docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
doc = next(docs)

In [7]:
sent = doc.sents[0]
sent.morph.tokens[0]

MorphToken(
    text='Вице-премьер',
    pos='NOUN',
    feats={'Animacy': 'Anim',
     'Case': 'Nom',
     'Gender': 'Masc',
     'Number': 'Sing'}
)

In [None]:
sent.morph

In [9]:
len(doc.sents)

6

In [10]:
sent.text

'Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сообщает РИА Новости.'

In [11]:
sent.tokens[0].text

'Вице-премьер'

In [12]:
class Vocab:
    def __init__(self, capacity=10000, save_path='./vocab'):
        self.capacity = capacity
        self.save_path = save_path

        self.max_len = 0
        self.max_chars = 0

        self.i2c = ['<SEP>', '<BEGIN>', '<END>', '<UNK>']
        self.c2i = {'<SEP>': 0, '<BEGIN>': 1, '<END>': 2, '<UNK>': 3}

        self.i2w = ['<SEP>', '<UNK>']
        self.w2i = {'<SEP>': 0, '<UNK>': 1}

        self.i2t = ['<SEP>', '<UNK>']
        self.t2i = {'<SEP>': 0, '<UNK>': 1}

    def load(self, max_len=200, max_chars=1000):
        with open(os.path.join(self.save_path, 'words.txt'), 'r') as v:
            self.w2i = json.load(v)
            self.i2w = [_ for _ in self.w2i]
            for k in self.w2i:
                self.i2w[self.w2i[k]] = k
        with open(os.path.join(self.save_path, 'chars.txt'), 'r') as v:
            self.c2i = json.load(v)
            self.i2c = [_ for _ in self.c2i]
            for k in self.c2i:
                self.i2c[self.c2i[k]] = k
        with open(os.path.join(self.save_path, 'labels.txt'), 'r') as v:
            self.t2i = json.load(v)
            self.i2t = [_ for _ in self.t2i]
            for k in self.t2i:
                self.i2t[self.t2i[k]] = k

        self.max_len = max_len
        self.max_chars = max_chars

    def save(self):
        with open(os.path.join(self.save_path, 'words.txt'), 'w') as v:
            json.dump(self.w2i, v)
        with open(os.path.join(self.save_path, 'chars.txt'), 'w') as v:
            json.dump(self.c2i, v)
        with open(os.path.join(self.save_path, 'labels.txt'), 'w') as v:
            json.dump(self.t2i, v)

    def read(self, docs):
        for _ in range(self.capacity):
            clear_output()
            print('Reading: ')
            print(_, '/', self.capacity)
            doc = next(docs)
            for sent in doc.sents:
                self.max_len = max(self.max_len, len(sent.tokens))
                chars_len = 0
                for token in sent.morph.tokens:
                    if token.pos not in self.t2i:
                        self.i2t.append(token.pos)
                        self.t2i[token.pos] = len(self.i2t) - 1
                    if token.text not in self.w2i:
                        self.i2w.append(token.text)
                        self.w2i[token.text] = len(self.i2w) - 1
                    for char in token.text:
                        chars_len += 1
                        if char not in self.c2i:
                            self.i2c.append(char)
                            self.c2i[char] = len(self.i2c) - 1
                self.max_chars = max(self.max_chars, chars_len + 10)

class POSDataset(Dataset):
    def __init__(self, vocab, capacity=2000):
        super().__init__()
        self.tokens = []
        self.chars = []
        self.targets = []
        self.capacity = capacity

        self.vocab=vocab

    def __getitem__(self, i):
        return torch.LongTensor(self.chars[i]), torch.LongTensor(self.tokens[i]), torch.LongTensor(self.targets[i])

    def __len__(self):
        return len(self.chars)

    def collate_fn(self, batch):
        tokens, words, pos_tags = list(zip(*batch))
        tokens = pad_sequence(tokens, batch_first=True)
        words = pad_sequence(words, batch_first=True)
        pos_tags = pad_sequence(pos_tags, batch_first=True)
        return tokens, words, pos_tags

    def read(self, docs):
        while len(self.chars) < self.capacity:
            clear_output()
            print('Reading: ')
            print(len(self.chars), '/', self.capacity)
            doc = next(docs)
            for sent in doc.sents:
                self.tokens.append([])
                self.chars.append([])
                self.targets.append([])
                for token in sent.morph.tokens:
                    if token.pos not in self.vocab.t2i:
                        self.targets[-1].append(self.vocab.t2i['<UNK>'])
                    else:
                        self.targets[-1].append(self.vocab.t2i[token.pos])
                    if token.text not in self.vocab.w2i:
                        self.tokens[-1].append(self.vocab.w2i['<UNK>'])
                    else:
                        self.tokens[-1].append(self.vocab.w2i[token.text])
                    self.chars[-1].append(self.vocab.c2i['<BEGIN>'])
                    for char in token.text:
                        if char not in self.vocab.c2i:
                            self.chars[-1].append(self.vocab.c2i['<UNK>'])
                        else:
                            self.chars[-1].append(self.vocab.c2i[char])
                self.chars[-1].append(self.vocab.c2i['<BEGIN>'])
            

In [14]:
BATCH_SIZE=128

docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
vocab = Vocab(capacity=150000)
vocab.read(docs)
vocab.save()

docs = load_nerus('/content/drive/MyDrive/nerus_lenta.conllu.gz')
train_pos_dataset = POSDataset(vocab, capacity=100000)
train_pos_dataset.read(docs)

val_pos_dataset = POSDataset(vocab, capacity=20000)
val_pos_dataset.read(docs)

test_pos_dataset = POSDataset(vocab, capacity=20000)
test_pos_dataset.read(docs)

train_loader = DataLoader(train_pos_dataset, batch_size=128, collate_fn=train_pos_dataset.collate_fn)
val_loader = DataLoader(val_pos_dataset, batch_size=128, collate_fn=val_pos_dataset.collate_fn)
test_loader = DataLoader(test_pos_dataset, batch_size=128, collate_fn=test_pos_dataset.collate_fn)

Reading: 
19992 / 20000


In [15]:
for batch in train_loader:
    print(batch[0].shape, batch[1].shape, batch[2].shape)
    break

torch.Size([128, 290]) torch.Size([128, 39]) torch.Size([128, 39])


# Flair Embeddings

In [29]:
class FlairEmbeddings(nn.Module):
    def __init__(self, n_tokens=len(vocab.c2i), max_words=vocab.max_len, embedding_dim=128, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(n_tokens, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, n_tokens)

        self.max_words = max_words
        self.n_tokens = n_tokens

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        
    def forward(self, x, hidden=None):
        mask = (x != 0).to(torch.long)

        lengths = mask.sum(dim=1).to('cpu')
        total_length = x.shape[1]
        x = self.embedding(x)

        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        output, hidden = self.lstm(x, hidden)
        output, _ = pad_packed_sequence(output, batch_first=True)
        output_f = output[:, :, :self.hidden_dim]
        output_b = output[:, :, self.hidden_dim:]

        out_f = self.fc(self.dropout(output_f))
        out_b = self.fc(self.dropout(output_b))
        return output, (out_f[:, :-1], out_b[:, 1:], hidden)

    def predict(self, x, hidden=None):
        out, _ = self.forward(x.to(dtype=torch.long), hidden)
        emb = torch.zeros((x.shape[0], self.max_words, self.hidden_dim*2)).to(device=x.device)
        one_ids = (x == 1).nonzero()
        j = 0
        emb[one_ids[0, 0], 0, self.hidden_dim:] = out[one_ids[0, 0], one_ids[0, 1], self.hidden_dim:]
        for i in range(1, one_ids.shape[0]):
            if one_ids[i, 0] != one_ids[i - 1, 0]:
                emb[one_ids[i, 0] - 1, j, self.hidden_dim:] = 0
                j = 0
                emb[one_ids[i, 0], j, self.hidden_dim:] = out[one_ids[i, 0], one_ids[i, 1], self.hidden_dim:]
            else:
                emb[one_ids[i, 0], j, :self.hidden_dim] = out[one_ids[i, 0], one_ids[i, 1], :self.hidden_dim]
                j += 1
                emb[one_ids[i, 0], j, self.hidden_dim:] = out[one_ids[i, 0], one_ids[i, 1], self.hidden_dim:]

        return emb

In [30]:
class FlairTrainer:
    def __init__(self, model: nn.Module, train_loader, val_loader, lr=2e-5, betas=(0.9, 0.999),
                 project="flair_embeddings", save_every=None, save_path='./'):
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model
        self.save_path = save_path
        self.save_every = save_every
        self.name = project
        wandb.init(project=project)

    def train_epoch(self, cuda=True, clip=5):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        self.model.train()
        total_loss = 0
        for batch_idx, (tokens, _, __) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            if cuda:
                tokens = tokens.cuda()
            x = tokens
            y_f = tokens[:, 1:]
            y_b = tokens[:, :-1]
            output_f, output_b, _ = self.model(x.to(dtype=torch.long))[1]
            loss_forward = self.criterion(output_f.reshape(-1, output_f.shape[-1]).to(dtype=torch.float),
                                  y_f.reshape(-1).to(dtype=torch.long))
            loss_backward = self.criterion(output_b.reshape(-1, output_b.shape[-1]).to(dtype=torch.float),
                                  y_b.reshape(-1).to(dtype=torch.long))
            loss = loss_forward + loss_backward

            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()

            total_loss += loss.item()

            print('\rTrain loss: %4f, Batch: %d of %d' % (
                total_loss / (batch_idx + 1), batch_idx + 1, len(self.train_loader)), end='')
        print()
        loss = total_loss / len(self.train_loader)
        return loss

    def test_epoch(self, cuda=True):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        with torch.no_grad():
            self.model.eval()
            total_loss = 0
            for batch_idx, (tokens, _, __) in enumerate(self.val_loader):
                if cuda:
                    tokens = tokens.cuda()
                x = tokens
                y_f = tokens[:, 1:]
                y_b = tokens[:, :-1]
                output_f, output_b, _ = self.model(x.to(dtype=torch.long))[1]
                loss_forward = self.criterion(output_f.reshape(-1, output_f.shape[-1]).to(dtype=torch.float),
                                    y_f.reshape(-1).to(dtype=torch.long))
                loss_backward = self.criterion(output_b.reshape(-1, output_b.shape[-1]).to(dtype=torch.float),
                                    y_b.reshape(-1).to(dtype=torch.long))
                loss = loss_forward + loss_backward

                total_loss += loss.item()

                print('\rVal loss: %4f, Batch: %d of %d' % (
                    total_loss / (batch_idx + 1), batch_idx + 1, len(self.val_loader)), end='')
            print()
            loss = total_loss / len(self.val_loader)
            return loss

    @staticmethod
    def log(epoch, train_loss, test_loss):
        wandb.log({
            'train': {
                'loss': train_loss,
            },
            'val': {
                'loss': test_loss,
            },
            'epoch': epoch
        })

    def checkpoint(self, epoch):
        torch.save(self.model.state_dict(), os.path.join(self.save_path, self.name + str(epoch) + '.ckpt'))

    def fit(self, max_epochs: int = 20, cuda=True, clip=5, log=False):
        for epoch in range(max_epochs):
            if epoch and self.save_every and epoch % self.save_every == 0:
                self.checkpoint(epoch)
            print('\rEpoch: %d' % epoch)
            train_loss = self.train_epoch(cuda=cuda, clip=clip)
            test_loss = self.test_epoch(cuda=cuda)
            if log:
                self.log(epoch, train_loss, test_loss)

In [31]:
flair_model = FlairEmbeddings()
flair_trainer = FlairTrainer(flair_model, train_loader, val_loader, lr=3e-4, save_every=5)
flair_trainer.fit(log=True)

  "num_layers={}".format(dropout, num_layers))


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Epoch: 0
Train loss: 11.212957, Batch: 782 of 782
Val loss: 10.617119, Batch: 157 of 157
Epoch: 1
Train loss: 10.200093, Batch: 782 of 782
Val loss: 9.846869, Batch: 157 of 157
Epoch: 2
Train loss: 9.475763, Batch: 782 of 782
Val loss: 9.128949, Batch: 157 of 157
Epoch: 3
Train loss: 8.785096, Batch: 782 of 782
Val loss: 8.438249, Batch: 157 of 157
Epoch: 4
Train loss: 8.116903, Batch: 782 of 782
Val loss: 7.770190, Batch: 157 of 157
Epoch: 5
Train loss: 7.467449, Batch: 782 of 782
Val loss: 7.120563, Batch: 157 of 157
Epoch: 6
Train loss: 6.833857, Batch: 782 of 782
Val loss: 6.487044, Batch: 157 of 157
Epoch: 7
Train loss: 6.215229, Batch: 782 of 782
Val loss: 5.868756, Batch: 157 of 157
Epoch: 8
Train loss: 5.611103, Batch: 782 of 782
Val loss: 5.265769, Batch: 157 of 157
Epoch: 9
Train loss: 5.023635, Batch: 782 of 782
Val loss: 4.682292, Batch: 157 of 157
Epoch: 10
Train loss: 4.457732, Batch: 782 of 782
Val loss: 4.123684, Batch: 157 of 157
Epoch: 11
Train loss: 3.920497, Batch: 

In [32]:
flair_model.cpu()

FlairEmbeddings(
  (embedding): Embedding(1528, 128)
  (lstm): LSTM(128, 256, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1528, bias=True)
)

# POS tagging model

In [33]:
class PosTagger(nn.Module):
    def __init__(self, output_dim: int, flair, hidden_dim=300,
                 feedforward_dim=100, dropout_rate=0.1,
                 freeze_emb=True, num_embeddings=None, classic_emb_dim=128):
        super(PosTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = flair.hidden_dim * 2
        self.embedder = flair.cpu()
        self.classic_embeddings = None
        if num_embeddings:
            self.classic_embeddings = nn.Embedding(num_embeddings, classic_emb_dim)
            self.embedding_dim += classic_emb_dim
        if freeze_emb:
            self.embedder = self.embedder.eval()
            for param in self.embedder.parameters():
                param.requires_grad = False
        self.encoder = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=2, bidirectional=True,
                               batch_first=True)
        self.feedforward = nn.Linear(2 * hidden_dim, feedforward_dim)
        self.out = nn.Linear(feedforward_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.LongTensor, words: torch.LongTensor):
        mask = (words != 0).to(torch.long)
        lengths = mask.sum(dim=1).to('cpu')

        if self.classic_embeddings:
            flair = self.embedder.predict(x)
            classic = self.classic_embeddings(words)
            classic_target = torch.zeros(classic.shape[0], flair.shape[1], classic.shape[-1]).to(classic.device)
            classic_target[:, :classic.shape[1], :] = classic
            x = torch.cat([flair, classic_target], dim=-1)
        else:
            x = self.embedder.predict(x)
        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.encoder(x)
        x, _ = pad_packed_sequence(x, batch_first=True)

        x = self.feedforward(x)
        x = torch.relu(x)
        x = self.dropout(x)
        return self.out(x)

In [34]:
class POSTrainer:
    def __init__(self, model: nn.Module, train_loader, val_loader, lr=3e-3, betas=(0.9, 0.999),
                 project="bilstm_flair_pos_tagger", save_every=None, save_path='./', name=None):
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model
        self.save_path = save_path
        self.save_every = save_every
        self.name = project
        wandb.init(project=project, name=name)

    def train_epoch(self, cuda=True, clip=5):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        self.model.train()
        total_loss = 0
        total = 0
        correct = 0
        for batch_idx, (tokens, words, pos_tags) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            if cuda:
                tokens = tokens.cuda()
                pos_tags = pos_tags.cuda()
                words = words.cuda()

            output = self.model(tokens.to(dtype=torch.long), words)
            loss = self.criterion(output.view(-1, output.shape[-1]),
                                  pos_tags.view(-1).to(dtype=torch.long))

            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()

            total_loss += loss.item()

            mask = (words != 0).to(torch.long)
            pred = torch.argmax(output, dim=-1)
            correct += ((pred == pos_tags)*mask).sum().item()
            total += mask.sum().item()
            print('\rTrain loss: %4f, Train accuracy: %4f, Batch: %d of %d' % (
                total_loss / (batch_idx + 1), correct / total, batch_idx + 1, len(self.train_loader)
            ), end='')
        print()
        loss, accuracy = total_loss / len(self.train_loader), correct / total
        return loss, accuracy

    def test_epoch(self, cuda=True):
        if cuda:
            self.model.cuda()
        else:
            self.model.cpu()
        with torch.no_grad():
            self.model.eval()
            total_loss = 0
            total = 0
            correct = 0
            for batch_idx, (tokens, words, pos_tags) in enumerate(self.val_loader):
                if cuda:
                    tokens = tokens.cuda()
                    pos_tags = pos_tags.cuda()
                    words = words.cuda()

                output = self.model(tokens.to(dtype=torch.long), words)
                loss = self.criterion(output.view(-1, output.shape[-1]),
                                      pos_tags.view(-1).to(dtype=torch.long))
                total_loss += loss.item()

                mask = (words != 0).to(torch.long)
                pred = torch.argmax(output, dim=-1)
                correct += ((pred == pos_tags) * mask).sum().item()
                total += mask.sum().item()

                print('\rVal loss: %4f, Val accuracy: %4f, Batch: %d of %d' % (
                    total_loss / (batch_idx + 1), correct / total, batch_idx + 1, len(self.val_loader)
                ), end='')
            print()
            loss, accuracy = total_loss / len(self.val_loader), correct / total
            return loss, accuracy

    @staticmethod
    def log(epoch, train_loss, train_accuracy, test_loss, test_accuracy):
        wandb.log({
            'train': {
                'loss': train_loss,
                'acc': train_accuracy
            },
            'val': {
                'loss': test_loss,
                'acc': test_accuracy
            },
            'epoch': epoch
        })

    def checkpoint(self, epoch):
        torch.save(self.model.state_dict(), os.path.join(self.save_path, self.name + str(epoch) + '.ckpt'))

    def fit(self, max_epochs: int = 20, cuda=True, clip=5, log=False):
        for epoch in range(max_epochs):
            if epoch and self.save_every and epoch % self.save_every == 0:
                self.checkpoint(epoch)
            print('\rEpoch: %d' % epoch)
            train_loss, train_accuracy = self.train_epoch(cuda=cuda, clip=clip)
            test_loss, test_accuracy = self.test_epoch(cuda=cuda)
            if log:
                self.log(epoch, train_loss, train_accuracy, test_loss, test_accuracy)
        if self.save_every:
            self.checkpoint(max_epochs)

# Experiments

In [35]:
pos_model = PosTagger(len(vocab.t2i), flair_model)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_only, lr=3e-3, 10 epochs, hidden_dim=300')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, hidden_dim=512)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, lr=2e-5, name='flair_only, lr=2e-5, 10 epochs, hidden_dim=512')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, hidden_dim=128)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_only, lr=3e-3, 10 epochs, hidden_dim=128')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██

0,1
epoch,19


Epoch: 0
Train loss: 0.202669, Train accuracy: 0.921532, Batch: 782 of 782
Val loss: 0.028329, Val accuracy: 0.973441, Batch: 157 of 157
Epoch: 1
Train loss: 0.025699, Train accuracy: 0.976504, Batch: 782 of 782
Val loss: 0.023917, Val accuracy: 0.976610, Batch: 157 of 157
Epoch: 2
Train loss: 0.019278, Train accuracy: 0.981648, Batch: 782 of 782
Val loss: 0.020838, Val accuracy: 0.979491, Batch: 157 of 157
Epoch: 3
Train loss: 0.016043, Train accuracy: 0.984685, Batch: 782 of 782
Val loss: 0.018044, Val accuracy: 0.982757, Batch: 157 of 157
Epoch: 4
Train loss: 0.013665, Train accuracy: 0.986581, Batch: 782 of 782
Val loss: 0.018172, Val accuracy: 0.982973, Batch: 157 of 157
Epoch: 5
Train loss: 0.012047, Train accuracy: 0.987999, Batch: 782 of 782
Val loss: 0.018287, Val accuracy: 0.983142, Batch: 157 of 157
Epoch: 6
Train loss: 0.010641, Train accuracy: 0.989309, Batch: 782 of 782
Val loss: 0.018209, Val accuracy: 0.983524, Batch: 157 of 157
Epoch: 7
Train loss: 0.009424, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 2.510424, Train accuracy: 0.506234, Batch: 782 of 782
Val loss: 2.193423, Val accuracy: 0.778300, Batch: 157 of 157
Epoch: 1
Train loss: 2.047103, Train accuracy: 0.836996, Batch: 782 of 782
Val loss: 1.973111, Val accuracy: 0.886318, Batch: 157 of 157
Epoch: 2
Train loss: 1.877221, Train accuracy: 0.899774, Batch: 782 of 782
Val loss: 1.828544, Val accuracy: 0.919702, Batch: 157 of 157
Epoch: 3
Train loss: 1.730589, Train accuracy: 0.923799, Batch: 782 of 782
Val loss: 1.677913, Val accuracy: 0.934810, Batch: 157 of 157
Epoch: 4
Train loss: 1.574405, Train accuracy: 0.936388, Batch: 782 of 782
Val loss: 1.513434, Val accuracy: 0.942982, Batch: 157 of 157
Epoch: 5
Train loss: 1.405664, Train accuracy: 0.943802, Batch: 782 of 782
Val loss: 1.336398, Val accuracy: 0.948774, Batch: 157 of 157
Epoch: 6
Train loss: 1.227057, Train accuracy: 0.948979, Batch: 782 of 782
Val loss: 1.151626, Val accuracy: 0.952992, Batch: 157 of 157
Epoch: 7
Train loss: 1.044432, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.189833, Train accuracy: 0.923025, Batch: 782 of 782
Val loss: 0.028505, Val accuracy: 0.973355, Batch: 157 of 157
Epoch: 1
Train loss: 0.026780, Train accuracy: 0.975648, Batch: 782 of 782
Val loss: 0.022170, Val accuracy: 0.978563, Batch: 157 of 157
Epoch: 2
Train loss: 0.021063, Train accuracy: 0.980384, Batch: 782 of 782
Val loss: 0.019297, Val accuracy: 0.981402, Batch: 157 of 157
Epoch: 3
Train loss: 0.017695, Train accuracy: 0.983201, Batch: 782 of 782
Val loss: 0.017867, Val accuracy: 0.982673, Batch: 157 of 157
Epoch: 4
Train loss: 0.015600, Train accuracy: 0.984936, Batch: 782 of 782
Val loss: 0.017132, Val accuracy: 0.983208, Batch: 157 of 157
Epoch: 5
Train loss: 0.013804, Train accuracy: 0.986536, Batch: 782 of 782
Val loss: 0.018543, Val accuracy: 0.982518, Batch: 157 of 157
Epoch: 6
Train loss: 0.012712, Train accuracy: 0.987578, Batch: 782 of 782
Val loss: 0.018067, Val accuracy: 0.983169, Batch: 157 of 157
Epoch: 7
Train loss: 0.011356, Train accu

In [36]:
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i))
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_and_classic_embeddings lr=3e-3, 10 epochs, hidden_dim=300')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i), hidden_dim=512)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, lr=2e-5, name='flair_and_classic_embeddings lr=2e-5, 10 epochs, hidden_dim=512')
pos_trainer.fit(max_epochs=10, log=True)
pos_model = PosTagger(len(vocab.t2i), flair_model, num_embeddings=len(vocab.w2i), hidden_dim=128)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, save_every=1, name='flair_and_classic_embeddings lr=3e-3, 10 epochs, hidden_dim=128')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.165846, Train accuracy: 0.940848, Batch: 782 of 782
Val loss: 0.019760, Val accuracy: 0.981618, Batch: 157 of 157
Epoch: 1
Train loss: 0.014775, Train accuracy: 0.986807, Batch: 782 of 782
Val loss: 0.017083, Val accuracy: 0.984485, Batch: 157 of 157
Epoch: 2
Train loss: 0.009030, Train accuracy: 0.991779, Batch: 782 of 782
Val loss: 0.017575, Val accuracy: 0.984751, Batch: 157 of 157
Epoch: 3
Train loss: 0.006387, Train accuracy: 0.993994, Batch: 782 of 782
Val loss: 0.018444, Val accuracy: 0.985216, Batch: 157 of 157
Epoch: 4
Train loss: 0.004829, Train accuracy: 0.995385, Batch: 782 of 782
Val loss: 0.019148, Val accuracy: 0.986053, Batch: 157 of 157
Epoch: 5
Train loss: 0.004035, Train accuracy: 0.996077, Batch: 782 of 782
Val loss: 0.021359, Val accuracy: 0.985729, Batch: 157 of 157
Epoch: 6
Train loss: 0.003687, Train accuracy: 0.996452, Batch: 782 of 782
Val loss: 0.021634, Val accuracy: 0.985901, Batch: 157 of 157
Epoch: 7
Train loss: 0.003063, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 2.406533, Train accuracy: 0.567416, Batch: 782 of 782
Val loss: 2.098112, Val accuracy: 0.834773, Batch: 157 of 157
Epoch: 1
Train loss: 1.966703, Train accuracy: 0.880836, Batch: 782 of 782
Val loss: 1.907484, Val accuracy: 0.917212, Batch: 157 of 157
Epoch: 2
Train loss: 1.811791, Train accuracy: 0.926138, Batch: 782 of 782
Val loss: 1.765902, Val accuracy: 0.939824, Batch: 157 of 157
Epoch: 3
Train loss: 1.664245, Train accuracy: 0.942661, Batch: 782 of 782
Val loss: 1.609147, Val accuracy: 0.950120, Batch: 157 of 157
Epoch: 4
Train loss: 1.499573, Train accuracy: 0.951694, Batch: 782 of 782
Val loss: 1.433519, Val accuracy: 0.956319, Batch: 157 of 157
Epoch: 5
Train loss: 1.318977, Train accuracy: 0.957391, Batch: 782 of 782
Val loss: 1.244082, Val accuracy: 0.960472, Batch: 157 of 157
Epoch: 6
Train loss: 1.128768, Train accuracy: 0.961619, Batch: 782 of 782
Val loss: 1.048278, Val accuracy: 0.963746, Batch: 157 of 157
Epoch: 7
Train loss: 0.936985, Train accu

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.179208, Train accuracy: 0.939190, Batch: 782 of 782
Val loss: 0.020427, Val accuracy: 0.981460, Batch: 157 of 157
Epoch: 1
Train loss: 0.015233, Train accuracy: 0.986514, Batch: 782 of 782
Val loss: 0.017156, Val accuracy: 0.984759, Batch: 157 of 157
Epoch: 2
Train loss: 0.009212, Train accuracy: 0.991784, Batch: 782 of 782
Val loss: 0.019911, Val accuracy: 0.982862, Batch: 157 of 157
Epoch: 3
Train loss: 0.006443, Train accuracy: 0.994037, Batch: 782 of 782
Val loss: 0.017979, Val accuracy: 0.985308, Batch: 157 of 157
Epoch: 4
Train loss: 0.004737, Train accuracy: 0.995594, Batch: 782 of 782
Val loss: 0.021263, Val accuracy: 0.985191, Batch: 157 of 157
Epoch: 5
Train loss: 0.003875, Train accuracy: 0.996280, Batch: 782 of 782
Val loss: 0.021463, Val accuracy: 0.985599, Batch: 157 of 157
Epoch: 6
Train loss: 0.003270, Train accuracy: 0.996778, Batch: 782 of 782
Val loss: 0.023474, Val accuracy: 0.985394, Batch: 157 of 157
Epoch: 7
Train loss: 0.003051, Train accu

In [37]:
pos_model = PosTagger(len(vocab.t2i), flair_model, freeze_emb=False)
pos_trainer = POSTrainer(pos_model, train_loader, val_loader, name='flair_no_freeze')
pos_trainer.fit(max_epochs=10, log=True)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9


Epoch: 0
Train loss: 0.202150, Train accuracy: 0.916930, Batch: 782 of 782
Val loss: 0.029822, Val accuracy: 0.971663, Batch: 157 of 157
Epoch: 1
Train loss: 0.026337, Train accuracy: 0.976128, Batch: 782 of 782
Val loss: 0.022662, Val accuracy: 0.977820, Batch: 157 of 157
Epoch: 2
Train loss: 0.020287, Train accuracy: 0.981146, Batch: 782 of 782
Val loss: 0.018717, Val accuracy: 0.981845, Batch: 157 of 157
Epoch: 3
Train loss: 0.016786, Train accuracy: 0.984024, Batch: 782 of 782
Val loss: 0.017782, Val accuracy: 0.982806, Batch: 157 of 157
Epoch: 4
Train loss: 0.014250, Train accuracy: 0.986164, Batch: 782 of 782
Val loss: 0.017682, Val accuracy: 0.982887, Batch: 157 of 157
Epoch: 5
Train loss: 0.012745, Train accuracy: 0.987371, Batch: 782 of 782
Val loss: 0.018477, Val accuracy: 0.983045, Batch: 157 of 157
Epoch: 6
Train loss: 0.011078, Train accuracy: 0.988796, Batch: 782 of 782
Val loss: 0.018467, Val accuracy: 0.983039, Batch: 157 of 157
Epoch: 7
Train loss: 0.009838, Train accu