# Table of Contents <a class="anchor" id="top"></a>

* [1. Imports](#imports)
* [2. Utils](#utils)
    * [2.1. GeLu](#gelu)
    * [2.2. Layer Norm](#layer_norm)
    * [2.3. Feed Forward](#feedforward)
* [3. Attention](#attention)
    * [3.1. Single Attention](#single_attention)
    * [3.2. Multi-head Attention](#multihead_attention)
* [4. Encoder Layer](#encoder)
    * [4.1. Sublayer Connection](#sublayer)
    * [4.2. Position-wise Feedforward](#position_feedforward)
    * [4.3. Transformer Block](#transformer_block)
* [5. Embeddings](#embeddings)
    * [5.1. Token Embedding](#token_embedding)
    * [5.2. Position Embedding](#position_embedding)
    * [5.3. Segment Embedding](#segment_embedding)
    * [5.3. BERT Embedding](#bert_embedding)
* [6. Model](#model)
    * [6.1. BERT](#bert)
    * [6.2. Masked Language Model](#masked_lm)
    * [6.3. Next Sentence Prediction](#nsp)
    * [6.4. BERT Language Model](#bert_lm)

* [7. Dataset](#dataset)
    * [7.1. Vocabulary Building](#vocab)
    * [7.2. Dataset Loading](#dataset_load)
* [8. Trainer](#trainer)
    * [8.1. Optimizer Scheduler](#optim_schedule)
    * [8.2. Pre-train](#pretrain)
* [9. Fine-tuning](#finetuning)

# 1. Imports <a class="anchor" id="imports"></a> 
-> *[Top](#top)*

In [1]:
import torch
import torch.nn as nn

# 2. Utils <a class="anchor" id="utils"></a>

## 2.1. GeLu <a class="anchor" id="gelu"></a>

In [1]:
import torch.nn as nn
import torch
import math


class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

## 2.2. Layer Norm <a class="anchor" id="layer_norm"></a>

In [2]:
import torch.nn as nn
import torch

class LayerNorm(nn.Module):
    """ 
    Construct a layer norm 
    """
    
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
        
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

## 2.3. Feed Forward <a class="anchor" id="feedforward"></a>

# 3. Attention <a class="anchor" id="attention"></a>
-> *[Top](#top)*

## 3.1. Single attention <a class="anchor" id="single_attention"></a>

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch

import math

class Attention(nn.Module):
    """ 
    Computes 'Scaled Dot Product Attention' 
    """
    
    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        p_attn = F.softmax(scores, dim=-1)
        
        if dropout is not None:
            p_attn = dropout(p_attn)
            
        return torch.matmul(p_attn, value), p_attn

## 3.2. Multi-head attention <a class="anchor" id="multihead_attention"></a>

In [4]:
import torch.nn as nn
# from .single import Attention

class MultiHeadedAttention(nn.Module):
    """
    Multi-headed Attention - Takes in `model size` and `number of heads`
    """
    
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        
        # Assuming always d_k = d_v"
        self.d_k = d_model // h
        self.h = h
        
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_layer = nn.Linear(d_model, d_model)
        self.attention = Attention()
        
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        # 1- Do all projections in batch from d_model => h x d_k
        query, key, value = [
            l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linear_layers, (query, key, value))
        ]
        # 2- Apply attention on all of the projected vectors in batch (`h` attention head)
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
        # 3- "Concat" all attention heads using a view and apply a final linear
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        
        return self.output_layer(x)

# 4. Encoder Layer <a class="anchor" id="encoder"></a>
-> *[Top](#top)*

## 4.1. Sublayer Connection <a class="anchor" id="sublayer"></a>

In [5]:
import torch.nn as nn
# from .layer_nor import LayerNorm

class SublayerConnection(nn.Module):
    """
    Applies a residual network followed by a layer norm
    """
    
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

## 4.2. Position-wise Feedforward <a class="anchor" id="position_feedforward"></a>

In [6]:
import torch.nn as nn
# from .gelu import GELU

class PositionwiseFeedForward(nn.Module):
    """
    Implements a feed-forward network layer
    """
    
    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        
        self.activation = GELU()
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        return self.w_2(
            self.dropout(
                self.activation(self.w_1(x))
            )
        )

## 4.3. Transformer Block <a class="anchor" id="transformer_block"></a>

In [7]:
import torch.nn as nn
# from .utils import SublayerConnection, PositionwiseFeedForward
# from .attention import MultiHeadedAttention

class Transformer(nn.Module):
    """
    Bidirectional Encoder = Transformer(self-attention)
    
    Transformer = Multiheaded_Attention + FeedForward with Sublayer conneciton
    """
    
    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: size of hidden layer of transformer (or `d_model`)
        :param attn_heads: number of attention heads in multi-headed attention layer
        :param feed_forward_hidden: size of feed forward hidden layer (usually 4*hidden)
        :param dropout: dropout rate
        """
        super().__init__()
        
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

# 5. Embeddings <a class="anchor" id="embeddings"></a>
-> *[Top](#top)*

## 5.1. Token Embedding <a class="anchor" id="token_embedding"></a>

In [8]:
import torch.nn as nn

class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, embed_size=512):
        super().__init__(vocab_size, embed_size, padding_idx=0)

## 5.2. Position Embedding <a class="anchor" id="position_embedding"></a>

In [9]:
import torch.nn as nn
import torch

import math


class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        
        # compute the positional encoding
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False
        
        position = torch.arange(0, max_len).float().unsqueeze(1)
        division = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        
        pe[:, 0::2] = torch.sin(position * division)
        pe[:, 1::2] = torch.cos(position * division)
        
        pe = pe.unsqueeze(0)
        # Register buffer to add persistent state without counting it as a parameter
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        return self.pe[:, :x.size(1)]

## 5.3. Segment Embedding <a class="anchor" id="segment_embedding"></a>

In [10]:
import torch.nn as nn

class SegmentEmbedding(nn.Embedding):
    def __init__(self, embed_size=512):
        super().__init__(3, embed_size, padding_idx=0) # vocab size = 3

## 5.4. BERT Embedding <a class="anchor" id="bert_embedding"></a>

In [11]:
import torch.nn as nn
# from .token import TokenEmbedding
# from .position import PositionalEmbedding
# from .segment import SegmentEmbedding

class BERTEmbedding(nn.Module):
    """
    BERT embedding is consisted of three different embeddings:
        - Token Embedding
        - Positional Embedding
        - Segment Embedding
    The final embedding is the summation of all these three embeddings
    """
    
    def __init__(self, vocab_size, embed_size, dropout):
        """
        :param vocab_size	: size of vocabulary
        :param embed_size	: size of token embedding
        :param dropout		: dropout rate
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size
        
    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)

# 6. Model <a class="anchor" id="model"></a>
-> *[Top](#top)*

## 6.1. BERT <a class="anchor" id="bert"></a>

In [12]:
import torch.nn as nn

# from .transformer import Transformer
# from .embedding import BERTEmbedding

class BERT(nn.Module):
    """
    BERT model: Bidirectional Encoder Representations from Transformers
    """
    
    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: size of vocabulary
        :param hidden: size of BERT hidden layers
        :param n_layers: number of Transformer blocks (layers)
        :param attn_heads: number of attention heads in multi-headed attention
        :param dropout: dropout rate
        """
        super().__init__()
        
        self.vocab_size = vocab_size
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads
        self.feed_forward_hidden = hidden * 4
        
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden, dropout=dropout)
        self.transformers = nn.ModuleList([
            Transformer(
                hidden=hidden, 
                attn_heads=attn_heads, 
                feed_forward_hidden=self.feed_forward_hidden, 
                dropout=dropout
            ) for _ in range(n_layers)
        ])
        
    def forward(self, x, segment_labels):
        # Creating attention mask for padding tokens
        mask = (x>0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        
        x = self.embedding(x, segment_labels)
        
        for transformer in self.transformers:
            x = transformer.forward(x, mask=mask)
        
        return x

## 6.2. Masked Language Model <a class="anchor" id="masked_lm"></a>

In [13]:
import torch.nn as nn

class MaskedLanguageModel(nn.Module):
    """
    n-class classification module (n-class = vocab_size)
    """
    def __init__(self, hidden, vocab_size):
        """
        :param hidden : size of hidden layers in Transformer block
        :param vocab_size : size of vocabulary (= n_class)
        """
        super().__init__()
        
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        return self.softmax(self.linear(x))

## 6.3. Next Sentence Prediction <a class="anchor" id="nsp"></a>

In [14]:
import torch.nn as nn

class NextSentencePrediction(nn.Module):
    """
    2-class classification model: `is_next`, `is_not_next`
    """
    def __init__(self, hidden):
        """
        :param hidden: BERT hidden layer size
        """
        super().__init__()
        
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        return self.softmax(self.linear(x[:, 0]))


## 6.4. BERT Language Model <a class="anchor" id="bert_lm"></a>

In [15]:
import torch.nn as nn

class BERTLM(nn.Module):
    """
    BERT Language Model (with NSP + MLM)
    """
    
    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert : BERT model to be trained
        :param vocab_size : vocabulary size
        """
        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.hidden)
        self.masked_lm = MaskedLanguageModel(hidden=self.bert.hidden, vocab_size=vocab_size)
        
    def forward(self, x, segment_labels):
        x = self.bert(x, segment_labels)
        return self.next_sentence(x), self.masked_lm(x)

# 7. Dataset <a class="anchor" id="dataset"></a>
-> *[Top](#top)*

## 7.1. Vocabulary Building <a class="anchor" id="vocab"></a>

In [16]:
import pickle
from tqdm.auto import tqdm
from collections import Counter
from tokenizers import BertWordPieceTokenizer
# from multiprocessing import Pool
from pathos.multiprocessing import ProcessPool as Pool


class TorchVocab(object):
    """Defines a vocabulary object that will be used to numericalize a field.
    Attributes:
        freqs: A collections.Counter object holding the frequencies of tokens
            in the data used to build the Vocab.
        stoi: A collections.defaultdict instance mapping token strings to
            numerical identifiers.
        itos: A list of token strings indexed by their numerical identifiers.
    """

    def __init__(self, counter, max_size=None, min_freq=1, specials=['<pad>', '<oov>'],
                 vectors=None, unk_init=None, vectors_cache=None):
        """Create a Vocab object from a collections.Counter.
        Arguments:
            counter: collections.Counter object holding the frequencies of
                each value found in the data.
            max_size: The maximum size of the vocabulary, or None for no
                maximum. Default: None.
            min_freq: The minimum frequency needed to include a token in the
                vocabulary. Values less than 1 will be set to 1. Default: 1.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary in addition to an <unk>
                token. Default: ['<pad>']
            vectors: One of either the available pretrained vectors
                or custom pretrained vectors (see Vocab.load_vectors);
                or a list of aforementioned vectors
            unk_init (callback): by default, initialize out-of-vocabulary word vectors
                to zero vectors; can be any function that takes in a Tensor and
                returns a Tensor of the same size. Default: torch.Tensor.zero_
            vectors_cache: directory for cached vectors. Default: '.vector_cache'
        """
        self.freqs = counter
        counter = counter.copy()
        min_freq = max(min_freq, 1)

        self.itos = list(specials)
        # frequencies of special tokens are not counted when building vocabulary
        # in frequency order
        for tok in specials:
            del counter[tok]

        max_size = None if max_size is None else max_size + len(self.itos)

        # sort by frequency, then alphabetically
        words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
        words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)

        for word, freq in words_and_frequencies:
            if freq < min_freq or len(self.itos) == max_size:
                break
            self.itos.append(word)

        # stoi is simply a reverse dict for itos
        self.stoi = {tok: i for i, tok in enumerate(self.itos)}

        self.vectors = None
        if vectors is not None:
            self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache)
        else:
            assert unk_init is None and vectors_cache is None

    def __eq__(self, other):
        if self.freqs != other.freqs:
            return False
        if self.stoi != other.stoi:
            return False
        if self.itos != other.itos:
            return False
        if self.vectors != other.vectors:
            return False
        return True

    def __len__(self):
        return len(self.itos)

    def vocab_rerank(self):
        self.stoi = {word: i for i, word in enumerate(self.itos)}

    def extend(self, v, sort=False):
        words = sorted(v.itos) if sort else v.itos
        for w in words:
            if w not in self.stoi:
                self.itos.append(w)
                self.stoi[w] = len(self.itos) - 1


class Vocab(TorchVocab):
    def __init__(
        self, counter, max_size=None, min_freq=1, 
        pad_token="[PAD]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        mask_token="[MASK]",
    ):
        self.pad_index = 0
        self.unk_index = 1
        self.sep_index = 2
        self.cls_index = 3
        self.mask_index = 4
        
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.sep_token = sep_token
        self.cls_token = cls_token
        self.mask_token = mask_token
        
        super().__init__(counter, specials=[pad_token, unk_token, sep_token, cls_token, mask_token],
                         max_size=max_size, min_freq=min_freq)

    def to_seq(self, sentece, seq_len, with_sep=False, with_cls=False) -> list:
        pass

    def from_seq(self, seq, join=False, with_pad=False):
        pass

    @staticmethod
    def load_vocab(vocab_path: str) -> 'Vocab':
        with open(vocab_path, "rb") as f:
            return pickle.load(f)

    def save_vocab(self, vocab_path):
        with open(vocab_path, "wb") as f:
            pickle.dump(self, f)


# Building Vocab with text files
class WordVocab(Vocab):
    def __init__(
        self, texts, tok_train_path, max_size=None, min_freq=1,
        pad_token="[PAD]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        mask_token="[MASK]",
    ):
        """
        :param texts : List of sentences for building the vocabulary
        :param tok_train_path : Path to training texts for training the tokenizer
        :param pad_token : Special token to be used for padding
        :param unk_token : Special token to be used for unknown words
        :param sep_token : Special token to be used as seperator
        :param cls_token : Special token to be used as CLS
        :param mask_token : Special token to be used for attention masking
        """
        
        # Creating Tokenizer for building the vocabulary
        self.tokenizer = BertWordPieceTokenizer()
        print("Training the tokenizer to build the vocabulary...")
        self.tokenizer.train(tok_train_path)
        print("Training tokenizer finished successfully!\n")
        
        # Building the vocabulary
        print("Building Vocab...")
        counter = Counter()
        for line in tqdm(texts, total=len(texts)):
            if isinstance(line, list):
                words = line
            else:
                words = self.tokenizer.encode(line).tokens

            for word in words:
                counter[word] += 1
                
        super().__init__(
            counter, max_size=max_size, min_freq=min_freq, 
            pad_token=pad_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token
        )

    def to_seq(self, sentence, seq_len=None, with_sep=False, with_cls=False, with_len=False):
        if isinstance(sentence, str):
            sentence = self.tokenizer.encode(sentence).tokens

        seq = [self.stoi.get(word, self.unk_index) for word in sentence]

        if with_sep:
            seq += [self.sep_index]  # this would be index 1
        if with_cls:
            seq = [self.cls_index] + seq

        origin_seq_len = len(seq)

        if seq_len is None:
            pass
        elif len(seq) <= seq_len:
            seq += [self.pad_index for _ in range(seq_len - len(seq))]
        else:
            seq = seq[:seq_len]

        return (seq, origin_seq_len) if with_len else seq

    def from_seq(self, seq, join=False, with_pad=False):
        words = [self.itos[idx]
                 if idx < len(self.itos)
                 else "<%d>" % idx
                 for idx in seq
                 if with_pad or idx != self.pad_index]

        return (" ".join(words)).replace(" ##", "") if join else words

    @staticmethod
    def load_vocab(vocab_path: str) -> 'WordVocab':
        with open(vocab_path, "rb") as f:
            return pickle.load(f)
    
#     def parallel_processing(self, texts, ncores=None, message=None):
#         # from pathos.multiprocessing import ProcessPool as Pool
#         p = Pool(ncores)
#         return tqdm(
#             p.imap(self.count_each_line, texts),
#             total=len(texts), 
#             desc=message
#         )
    
#     def count_each_line(self, line):
#         if isinstance(line, list):
#             words = line
#         else:
#             words = self.tokenizer.encode(line).tokens
            
#         return words

## 7.2. Dataset <a class="anchor" id="dataset_load"></a>

In [17]:
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import torch
import random


class BERTDataset(Dataset):
    def __init__(self, corpus_path, vocab, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
        self.vocab = vocab
        self.seq_len = seq_len

        self.on_memory = on_memory
        self.corpus_lines = corpus_lines
        self.corpus_path = corpus_path
        self.encoding = encoding

        with open(corpus_path, "r", encoding=encoding) as f:
            if self.corpus_lines is None and not on_memory:
                for _ in tqdm(f, desc="Loading Dataset", total=corpus_lines):
                    self.corpus_lines += 1

            if on_memory:
                self.lines = [line[:-1].split("\t")
                              for line in tqdm(f, desc="Loading Dataset", total=corpus_lines)]
                self.corpus_lines = len(self.lines)

        if not on_memory:
            self.file = open(corpus_path, "r", encoding=encoding)
            self.random_file = open(corpus_path, "r", encoding=encoding)

            for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)):
                self.random_file.__next__()

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        t1, t2, is_next_label = self.random_sent(item)
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # [CLS] tag = SOS tag, [SEP] tag = EOS tag
        t1 = [self.vocab.cls_index] + t1_random + [self.vocab.sep_index]
        t2 = t2_random + [self.vocab.sep_index]

        t1_label = [self.vocab.pad_index] + t1_label + [self.vocab.pad_index]
        t2_label = t2_label + [self.vocab.pad_index]

        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]

        padding = [self.vocab.pad_index for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def random_word(self, sentence):
        tokens = sentence.split()
        output_label = []

        for i, token in enumerate(tokens):
            prob = random.random()
            if prob < 0.15:
                prob /= 0.15

                # 80% randomly change token to mask token
                if prob < 0.8:
                    tokens[i] = self.vocab.mask_index

                # 10% randomly change token to random token
                elif prob < 0.9:
                    tokens[i] = random.randrange(len(self.vocab))

                # 10% randomly change token to current token
                else:
                    tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)

                output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index))

            else:
                tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)
                output_label.append(0)

        return tokens, output_label

    def random_sent(self, index):
        t1, t2 = self.get_corpus_line(index)

        # output_text, label(isNotNext:0, isNext:1)
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        if self.on_memory:
            if self.lines[item][1]==0 or item==0:
                return self.lines[item][0], self.lines[item+1][0]
            else:
                return self.lines[item-1][0], self.lines[item][0]
        else:
            line_1 = self.file.__next__()
            line_2 = self.file.__next__()
            if line_1 is None:
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)
                line_1 = self.file.__next__()
                line_2 = self.file.__next__()
            elif line_2 is None:
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)
                line_1 = self.file.__next__()
                line_2 = self.file.__next__()
            line_1 = line_1[:-1].split("\t")
            line_2 = line_2[:-1].split("\t")
            if line_2[1]==1:
                return line_1[0], line_2[0]
            
            line_3 = self.file.__next__()
            if line_3 is None:
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)
                line_1 = self.file.__next__()
                line_2 = self.file.__next__()
                line_1 = line_1[:-1].split("\t")
                line_2 = line_2[:-1].split("\t")
                return line_1[0], line_2[0]
            return line_2[0], line_3[:-1].split("\t")[0]

    def get_random_line(self):
        if self.on_memory:
            rand_idx = random.randrange(len(self.lines))
            return self.lines[rand_idx][0]

        line = self.file.__next__()
        if line is None:
            self.file.close()
            self.file = open(self.corpus_path, "r", encoding=self.encoding)
            for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)):
                self.random_file.__next__()
            line = self.random_file.__next__()
        return line[:-1].split("\t")[0]

In [18]:
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import torch
import random


class BERTDatasetOnMemory(Dataset):
    
    def __init__(self, corpus_path, vocab, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True, device='cuda'):
        self.vocab = vocab
        self.seq_len = seq_len

        self.on_memory = on_memory
        self.corpus_lines = corpus_lines
        self.corpus_path = corpus_path
        self.encoding = encoding

        with open(corpus_path, "r", encoding=encoding) as f:
            self.lines = [
                line[:-1].split("\t")
                for line in tqdm(f, desc="Loading Dataset", total=corpus_lines)
            ]
            self.corpus_len = len(self.lines)
        self.sentences, self.isnext = self._preprocess_lines(self.lines)
        self.device = 'cpu'
        if torch.cuda.is_available() and device=='cuda':
            self.device = 'cuda'
            self.sentences = [t.cuda() for t in tqdm(self.sentences, desc="Moving sentences to GPU...")]
            self.isnext = self.isnext.cuda()
    
    
    def __len__(self):
        return self.corpus_len
    
    
    def to(self, device):
        self.sentences = [x.to(device) for x in tqdm(self.sentences, desc=f"Moving sentences to {device.upper()}...")]
        self.isnext = self.isnext.to(device)
        self.device = device
    
    
    def _preprocess_lines(self, lines):
        train_sents = []
        train_isnext = []
        for line in tqdm(lines):
            tokens = line[0].split()
            tokens_id = [self.vocab.stoi.get(tok, self.vocab.unk_index) for tok in tokens]
            train_sents.append(torch.tensor(tokens_id).long())
            train_isnext.append(int(line[1]))
        train_isnext = torch.tensor(train_isnext).long()
        
        return train_sents, train_isnext
    
    
    def __getitem__(self, item):
        t1, t2, is_next_label = self.random_sent(item)
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # [CLS] tag = SOS tag, [SEP] tag = EOS tag
        with torch.no_grad():
            t1 = torch.cat([torch.tensor([self.vocab.cls_index]).to(self.device), t1_random, torch.tensor([self.vocab.sep_index]).to(self.device)])
            t2 =  torch.cat([t2_random, torch.tensor([self.vocab.sep_index]).to(self.device)])

            t1_label = torch.cat([torch.tensor([self.vocab.pad_index]).to(self.device), t1_label, torch.tensor([self.vocab.pad_index]).to(self.device)])
            t2_label = torch.cat([t2_label, torch.tensor([self.vocab.pad_index]).to(self.device)])

            segment_label = torch.tensor(([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]).to(self.device)
            bert_input = torch.cat([t1, t2])[:self.seq_len]
            bert_label = torch.cat([t1_label, t2_label])[:self.seq_len]

            padding = torch.tensor([self.vocab.pad_index for _ in range(self.seq_len - len(bert_input))]).to(self.device)
            bert_input = torch.cat([bert_input, padding]).detach()
            bert_label = torch.cat([bert_label, padding]).detach()
            segment_label = torch.cat([segment_label, padding]).detach()

        return {
            "bert_input": bert_input.long(),
            "bert_label": bert_label,
            "segment_label": segment_label,
            "is_next": is_next_label
        }
#         return {key: torch.tensor(value) for key, value in output.items()}

    
    def random_word(self, tokens):
        tokens = tokens.detach().clone()
        output_label = []
        with torch.no_grad():
            for i, token in enumerate(tokens):
                prob = random.random()
                if prob < 0.15:
                    prob /= 0.15

                    # 80% randomly change token to mask token
                    if prob < 0.8:
                        tokens[i] = self.vocab.mask_index

                    # 10% randomly change token to random token
                    elif prob < 0.9:
                        tokens[i] = int(random.randrange(len(self.vocab)))

                    # 10% randomly change token to current token
                    else:
                        pass

                    output_label.append(token)

                else:
                    # tokens[i] = token
                    output_label.append(0)

        return tokens, torch.tensor(output_label).to(self.device)

    
    def random_sent(self, index):
        t1, t2 = self.get_corpus_line(index)

        # output_text, label(isNotNext:0, isNext:1)
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

        
    def get_corpus_line(self, item):
        if self.isnext[item]==0 or item==0:
            return self.sentences[item], self.sentences[item+1]
        else:
            return self.sentences[item-1], self.sentences[item]


    def get_random_line(self):
        rand_idx = random.randrange(len(self.lines))
        return self.sentences[rand_idx]

# 8. Tokenization
-> *[Top](#top)*

## 8.1. Wordpiece Tokenizer

In [19]:
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, unk_token, max_input_chars_per_word=100):
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

## 8.2. BERT Tokenizer

In [20]:
class BertTokenizer():
    def __init__(self, vocab):
        self.vocab = vocab
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=self.vocab.unk_token)
    
    
    def _tokenize(self, text):
        return self.wordpiece_tokenizer.tokenize(text)
    
    
    def convert_tokens_to_ids(self, tokens):
        return [self.vocab.stoi.get(token, self.vocab.unk_index) for token in tokens]
    
    def convert_ids_to_tokens(self, token_ids):
        tokens = [self.vocab.itos[id] for id in token_ids]
        return " ".join(tokens)
    
    def _batch_encoding(self, ):
        pass # TO-DO
    
    def _pad(self, encoding):
        pass

# 8. Trainer <a class="anchor" id="trainer"></a>
-> *[Top](#top)*

## 8.1. Optimizer Scheduler <a class="anchor" id="optim_schedule"></a>

In [21]:
 
'''A wrapper class for optimizer '''
import numpy as np


class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

## 8.2. Pre-train <a class="anchor" id="pretrain"></a>

In [22]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import json
from typing import Union
# from ..model import BERTLM, BERT
# from .optim_schedule import ScheduledOptim

from tqdm.auto import tqdm


class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.
        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction
    please check the details on README.md with simple example.
    """

    def __init__(self, bert: BERT, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader = None,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, 
                 checkpoint_path: Union[str, None] = None, bert_model_path: Union[str, None] = None):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: traning with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # Either initializing BERT or loading from the last checkpoint
        # This BERT model will be saved every epoch
        self.bert = bert
        if bert_model_path is not None:
            print("Loading from checkpoint...")
            self.load_bert(path=bert_model_path, type="entire")
            
        # Initialize the BERT Language Model or load it from checkpoint_path, with BERT model
        self.model = BERTLM(bert, vocab_size)
        if checkpoint_path is not None:
            self.load_from_checkpoint(path=checkpoint_path)
        # Sending the model to the appropriate device
        self.model.to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every peoch
        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        if train:
            str_code = "train"
            self.model.train()
        else:
            str_code = "test"
            self.model.eval()

        # Setting the tqdm progress bar
        data_iter = tqdm(enumerate(data_loader),
                         desc="EP_%s:%d" % (str_code, epoch),
                         total=len(data_loader),
                         bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        
        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }
            if i % self.log_freq == 0:
                logging.info(json.dumps(post_fix))
            
            if i % (2*self.log_freq) == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)
        final_metrics = {
            "epoch": epoch,
            "train": train,
            "avg_loss": avg_loss / len(data_iter),
            "total_acc": total_correct * 100.0 / total_element
        }
        with open("./models/logging/bert_02_final.txt", "a", encoding="utf-8") as file:
            file.write(json.dumps(final_metrics) + "\n")

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path
        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".{}.ep{:02d}.pt" #% epoch
        # Saving Entire BERT model
        torch.save(self.bert.cpu(), output_path.format("bert.model", epoch))
        # Saving only BERT state_dict
        torch.save(self.bert.cpu().state_dict(), output_path.format("bert.statedict", epoch))
        # Saving BERT LM state_dict
        torch.save(self.model.cpu().state_dict(), output_path.format("bertlm.statedict", epoch))
        
        self.bert.to(self.device)
        self.model.to(self.device)
        
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
    
    def load_bert(self, path, type="entire"):
        if type=="entire":
            # Loading entire model
            self.bert = torch.load(path)
        elif type=="state_dict":
            self.bert.load_state_dict(torch.load(path))
        else:
            raise TypeError('Parameter `type` can either be "entire" or "state_dict"')
            
    def load_from_checkpoint(self, path):
        self.model.load_state_dict(torch.load(path))

# 9. Pre-training <a class="anchor" id="finetuning"></a>

## 9.1. Building and Training Tokenizer and Vocabulary

In [23]:
import pandas as pd

train_df = pd.read_csv(
    "./wikitext-2/wikitext2_bert_train.csv", 
    sep="\t", 
    header=None, 
    names=['text', 'is_next']
)
print("{:,}".format(train_df.shape[0]))
train_df.head(2)

16,186


Unnamed: 0,text,is_next
0,Senjō no Valkyria 3 : [UNK] Chronicles ( Japan...,0
1,"The game began development in 2010 , carrying ...",1


In [24]:
# # Write for Tokenizer Training
# tokenizer_train_path = "./wikitext-2/wikitext2_bert_for_tokenizer.txt"
# with open(tokenizer_train_path, "w", encoding="utf-8")  as file:
#     for line in train_df.text.tolist():
#         file.write(line +"\n")

In [25]:
# vocabulary = WordVocab(texts=train_df.text, tok_train_path=tokenizer_train_path)

In [26]:
vocabulary.from_seq(vocabulary.to_seq(train_df['text'][0], seq_len=155), join=True)

NameError: name 'vocabulary' is not defined

In [None]:
# vocabulary.save_vocab("./wikitext-2/vocabulary.pkl")

In [None]:
def tokenize_for_bert(sents):
    print("Starting Batch Tokenization...")
    tokenized_sents = vocabulary.tokenizer.encode_batch(sents)
    return [
        " ".join(x.tokens) 
        for x in tqdm(tokenized_sents, total=len(tokenized_sents), desc="Merging tokens...")
    ]

### Save Tokenized Training Set BERT

In [None]:
import time

start_time = time.perf_counter()
train_df['text'] = pd.Series(tokenize_for_bert(train_df.text))
print("Time elapsed {:.2f}".format(time.perf_counter()-start_time))

In [27]:
# train_df.to_csv("./wikitext-2/wikitext2_bert_train_tokenized.csv", sep="\t", index=False, header=False, encoding='utf-8')

#### Save Tokenized Test set for BERT

In [40]:
trainer.optim_schedule.init_lr

0.03608439182435161

In [35]:
trainer.optim.param_groups[0]['lr']

0.00016136301107651203

In [47]:
import pandas as pd

valid_df = pd.read_csv(
    "./wikitext-2/wikitext2_bert_valid.csv", 
    sep="\t", 
    header=None, 
    names=['text', 'is_next']
)
print("{:,}".format(valid_df.shape[0]))
valid_df.head(2)

1,726


Unnamed: 0,text,is_next
0,"Homarus gammarus , known as the European lobst...",0
1,"Homarus gammarus is a large [UNK] , with a bod...",1


In [48]:
import time

start_time = time.perf_counter()
valid_df['text'] = pd.Series(tokenize_for_bert(valid_df.text))
print("Time elapsed {:.2f}".format(time.perf_counter()-start_time))

Starting Batch Tokenization...


HBox(children=(FloatProgress(value=0.0, description='Merging tokens...', max=1726.0, style=ProgressStyle(descr…


Time elapsed 0.07


In [49]:
# valid_df.to_csv("./wikitext-2/wikitext2_bert_valid_tokenized.csv", 
#                 sep="\t", index=False, header=False, encoding='utf-8')

## 9.2. Pre-training BERT Language Model

In [24]:
import logging

logging.basicConfig(
    filename='./models/logging/bert_02_logs.jsonl', 
    filemode='a', 
    encoding='utf-8', 
    level=logging.INFO,
    format='{"name": "%(name)s", "levelname": "%(levelname)s", "message":%(message)s}'
)

In [26]:
# import json

# with open("./models/logging/bert_01_logs.jsonl", "r") as file:
#     json_list = [json.loads(line) for line in file]

In [27]:
# for line in json_list:
#     logging.info(json.dumps(line))

In [25]:
vocab_path = "./wikitext-2/vocabulary.pkl"
train_ds_path = "./wikitext-2/wikitext2_bert_train_tokenized.csv"
valid_ds_path = "./wikitext-2/wikitext2_bert_valid_tokenized.csv"
seq_len = 384
corpus_lines = int(train_df.shape[0])
on_memory = True
batch_size = 16
num_workers = 22
attn_heads = 12
layers = 8
hidden = 768
lr = 5e-5
adam_beta1, adam_beta2 = 0.9, 0.999
adam_weight_decay = 0.01
with_cuda = True
cuda_devices = 1
log_freq = 200
epochs = 100
output_path = "./models/bert_02_small"
test_dataset = None

vocab = WordVocab.load_vocab(vocab_path)
train_dataset = BERTDataset(train_ds_path, vocab, seq_len=seq_len,
                            corpus_lines=corpus_lines, on_memory=on_memory)
# print("Loading Test Dataset", args.test_dataset)
test_dataset = BERTDataset(valid_ds_path, vocab, seq_len=seq_len, on_memory=on_memory) \
    if valid_ds_path is not None else None


HBox(children=(FloatProgress(value=0.0, description='Loading Dataset', max=16186.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Loading Dataset', layout=Layout(width='…




In [26]:
print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=22)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=22) \
    if test_dataset is not None else None

print("Building BERT model")
bert = BERT(len(vocab), hidden=hidden, n_layers=layers, attn_heads=attn_heads)

print("Creating BERT Trainer")
trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                      lr=lr, betas=(adam_beta1, adam_beta2), weight_decay=adam_weight_decay,
                      with_cuda=with_cuda, cuda_devices=cuda_devices, log_freq=log_freq, 
#                       checkpoint_path="./models/bert_01.bertlm.statedict.ep02.pt"
                     )

Creating Dataloader
Building BERT model
Creating BERT Trainer
Total Parameters: 94793678


In [27]:
epoch_start = 0
print("Training Start")
for epoch in range(epoch_start, epochs + epoch_start):
    trainer.train(epoch)
    trainer.save(epoch, output_path)

    if test_data_loader is not None:
        trainer.test(epoch)

Training Start


HBox(children=(FloatProgress(value=0.0, description='EP_train:0', max=1012.0, style=ProgressStyle(description_…

{'epoch': 0, 'iter': 0, 'avg_loss': 12.726310729980469, 'avg_acc': 68.75, 'loss': 12.726310729980469}
{'epoch': 0, 'iter': 400, 'avg_loss': 9.461090229395916, 'avg_acc': 49.3142144638404, 'loss': 7.554988384246826}
{'epoch': 0, 'iter': 800, 'avg_loss': 8.468101362759404, 'avg_acc': 49.53183520599251, 'loss': 7.329710483551025}

EP0_train, avg_loss= 8.195631435737308 total_acc= 49.57370567156802
EP:0 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:0', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 0, 'iter': 0, 'avg_loss': 7.475540637969971, 'avg_acc': 37.5, 'loss': 7.475540637969971}

EP0_test, avg_loss= 7.322490140243813 total_acc= 51.33256083429896


HBox(children=(FloatProgress(value=0.0, description='EP_train:1', max=1012.0, style=ProgressStyle(description_…

{'epoch': 1, 'iter': 0, 'avg_loss': 6.9870991706848145, 'avg_acc': 43.75, 'loss': 6.9870991706848145}
{'epoch': 1, 'iter': 400, 'avg_loss': 7.066003132342103, 'avg_acc': 50.88840399002493, 'loss': 7.271169662475586}
{'epoch': 1, 'iter': 800, 'avg_loss': 7.037893726286965, 'avg_acc': 50.749063670411985, 'loss': 6.891916275024414}

EP1_train, avg_loss= 7.021967653229303 total_acc= 50.772272334115904
EP:1 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:1', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 1, 'iter': 0, 'avg_loss': 7.128962516784668, 'avg_acc': 62.5, 'loss': 7.128962516784668}

EP1_test, avg_loss= 7.249704639116923 total_acc= 49.826187717265356


HBox(children=(FloatProgress(value=0.0, description='EP_train:2', max=1012.0, style=ProgressStyle(description_…

{'epoch': 2, 'iter': 0, 'avg_loss': 6.996368408203125, 'avg_acc': 62.5, 'loss': 6.996368408203125}
{'epoch': 2, 'iter': 400, 'avg_loss': 6.941265638926975, 'avg_acc': 49.00249376558604, 'loss': 6.579565525054932}
{'epoch': 2, 'iter': 800, 'avg_loss': 6.918057811394166, 'avg_acc': 49.61766541822721, 'loss': 6.605149745941162}

EP2_train, avg_loss= 6.912844444452067 total_acc= 49.59841838625973
EP:2 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:2', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 2, 'iter': 0, 'avg_loss': 7.267695903778076, 'avg_acc': 68.75, 'loss': 7.267695903778076}

EP2_test, avg_loss= 7.090606265597874 total_acc= 50.115874855156434


HBox(children=(FloatProgress(value=0.0, description='EP_train:3', max=1012.0, style=ProgressStyle(description_…

{'epoch': 3, 'iter': 0, 'avg_loss': 7.030896186828613, 'avg_acc': 50.0, 'loss': 7.030896186828613}
{'epoch': 3, 'iter': 400, 'avg_loss': 6.8442567078549965, 'avg_acc': 49.20511221945137, 'loss': 6.7324604988098145}
{'epoch': 3, 'iter': 800, 'avg_loss': 6.829695631353447, 'avg_acc': 49.38358302122347, 'loss': 6.476406574249268}

EP3_train, avg_loss= 6.826710196351817 total_acc= 49.20301495119239
EP:3 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:3', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 3, 'iter': 0, 'avg_loss': 6.966486930847168, 'avg_acc': 37.5, 'loss': 6.966486930847168}

EP3_test, avg_loss= 6.957973855513114 total_acc= 52.6071842410197


HBox(children=(FloatProgress(value=0.0, description='EP_train:4', max=1012.0, style=ProgressStyle(description_…

{'epoch': 4, 'iter': 0, 'avg_loss': 6.971465110778809, 'avg_acc': 43.75, 'loss': 6.971465110778809}
{'epoch': 4, 'iter': 400, 'avg_loss': 6.8125913589078, 'avg_acc': 49.76620947630923, 'loss': 6.471914768218994}
{'epoch': 4, 'iter': 800, 'avg_loss': 6.814100888189156, 'avg_acc': 49.89076154806492, 'loss': 6.547294616699219}

EP4_train, avg_loss= 6.818777200732778 total_acc= 49.54899295687631
EP:4 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:4', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 4, 'iter': 0, 'avg_loss': 7.017176151275635, 'avg_acc': 50.0, 'loss': 7.017176151275635}

EP4_test, avg_loss= 6.862703685407285 total_acc= 47.740440324449594


HBox(children=(FloatProgress(value=0.0, description='EP_train:5', max=1012.0, style=ProgressStyle(description_…

{'epoch': 5, 'iter': 0, 'avg_loss': 6.804322719573975, 'avg_acc': 37.5, 'loss': 6.804322719573975}
{'epoch': 5, 'iter': 400, 'avg_loss': 6.818057552537419, 'avg_acc': 50.592269326683294, 'loss': 6.810786724090576}
{'epoch': 5, 'iter': 800, 'avg_loss': 6.816928582542696, 'avg_acc': 50.37453183520599, 'loss': 6.6378021240234375}

EP5_train, avg_loss= 6.817512456607441 total_acc= 50.46336340046954
EP:5 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:5', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 5, 'iter': 0, 'avg_loss': 6.873398780822754, 'avg_acc': 31.25, 'loss': 6.873398780822754}

EP5_test, avg_loss= 6.78973592210699 total_acc= 49.94206257242178


HBox(children=(FloatProgress(value=0.0, description='EP_train:6', max=1012.0, style=ProgressStyle(description_…

{'epoch': 6, 'iter': 0, 'avg_loss': 7.037576675415039, 'avg_acc': 31.25, 'loss': 7.037576675415039}
{'epoch': 6, 'iter': 400, 'avg_loss': 6.8287684875830745, 'avg_acc': 50.46758104738155, 'loss': 6.65717077255249}
{'epoch': 6, 'iter': 800, 'avg_loss': 6.830229825294866, 'avg_acc': 49.664481897627965, 'loss': 6.83324670791626}

EP6_train, avg_loss= 6.834485354630844 total_acc= 49.81465463981218
EP:6 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:6', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 6, 'iter': 0, 'avg_loss': 6.587947368621826, 'avg_acc': 62.5, 'loss': 6.587947368621826}

EP6_test, avg_loss= 6.664273875731009 total_acc= 50.0


HBox(children=(FloatProgress(value=0.0, description='EP_train:7', max=1012.0, style=ProgressStyle(description_…

{'epoch': 7, 'iter': 0, 'avg_loss': 6.9102654457092285, 'avg_acc': 56.25, 'loss': 6.9102654457092285}
{'epoch': 7, 'iter': 400, 'avg_loss': 6.863005636933439, 'avg_acc': 50.670199501246884, 'loss': 6.619730472564697}
{'epoch': 7, 'iter': 800, 'avg_loss': 6.86993715617243, 'avg_acc': 50.530586766541816, 'loss': 6.759713172912598}

EP7_train, avg_loss= 6.875325883801276 total_acc= 50.370690720375634
EP:7 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:7', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 7, 'iter': 0, 'avg_loss': 6.536777973175049, 'avg_acc': 56.25, 'loss': 6.536777973175049}

EP7_test, avg_loss= 6.643185615539551 total_acc= 48.43568945538818


HBox(children=(FloatProgress(value=0.0, description='EP_train:8', max=1012.0, style=ProgressStyle(description_…

{'epoch': 8, 'iter': 0, 'avg_loss': 6.864974498748779, 'avg_acc': 50.0, 'loss': 6.864974498748779}
{'epoch': 8, 'iter': 400, 'avg_loss': 6.886917651740095, 'avg_acc': 49.37655860349127, 'loss': 6.765318393707275}
{'epoch': 8, 'iter': 800, 'avg_loss': 6.89675923739182, 'avg_acc': 49.56304619225968, 'loss': 6.883667469024658}

EP8_train, avg_loss= 6.899116648515694 total_acc= 49.7961201037934
EP:8 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:8', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 8, 'iter': 0, 'avg_loss': 6.619323253631592, 'avg_acc': 50.0, 'loss': 6.619323253631592}

EP8_test, avg_loss= 6.720549481886405 total_acc= 49.76825028968714


HBox(children=(FloatProgress(value=0.0, description='EP_train:9', max=1012.0, style=ProgressStyle(description_…

{'epoch': 9, 'iter': 0, 'avg_loss': 7.0588274002075195, 'avg_acc': 62.5, 'loss': 7.0588274002075195}
{'epoch': 9, 'iter': 400, 'avg_loss': 6.9114387564528315, 'avg_acc': 49.37655860349127, 'loss': 6.919217586517334}
{'epoch': 9, 'iter': 800, 'avg_loss': 6.92008132613107, 'avg_acc': 49.92977528089887, 'loss': 6.873764514923096}

EP9_train, avg_loss= 6.925940754856517 total_acc= 50.006178178672926
EP:9 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:9', max=108.0, style=ProgressStyle(description_wi…

{'epoch': 9, 'iter': 0, 'avg_loss': 6.743729591369629, 'avg_acc': 56.25, 'loss': 6.743729591369629}

EP9_test, avg_loss= 6.754164567700139 total_acc= 49.826187717265356


HBox(children=(FloatProgress(value=0.0, description='EP_train:10', max=1012.0, style=ProgressStyle(description…

{'epoch': 10, 'iter': 0, 'avg_loss': 6.961677551269531, 'avg_acc': 81.25, 'loss': 6.961677551269531}
{'epoch': 10, 'iter': 400, 'avg_loss': 6.951360955797229, 'avg_acc': 50.29613466334164, 'loss': 6.933630466461182}
{'epoch': 10, 'iter': 800, 'avg_loss': 6.961704770872804, 'avg_acc': 49.953183520599254, 'loss': 6.908087730407715}

EP10_train, avg_loss= 6.961410544606537 total_acc= 50.14827628815026
EP:10 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:10', max=108.0, style=ProgressStyle(description_w…

{'epoch': 10, 'iter': 0, 'avg_loss': 6.80170202255249, 'avg_acc': 56.25, 'loss': 6.80170202255249}

EP10_test, avg_loss= 6.786875345088817 total_acc= 49.2468134414832


HBox(children=(FloatProgress(value=0.0, description='EP_train:11', max=1012.0, style=ProgressStyle(description…

{'epoch': 11, 'iter': 0, 'avg_loss': 6.930217266082764, 'avg_acc': 31.25, 'loss': 6.930217266082764}
{'epoch': 11, 'iter': 400, 'avg_loss': 6.965336998204638, 'avg_acc': 50.202618453865334, 'loss': 6.820492267608643}
{'epoch': 11, 'iter': 800, 'avg_loss': 6.966699540689494, 'avg_acc': 50.156054931335824, 'loss': 6.648755073547363}

EP11_train, avg_loss= 6.966905842185492 total_acc= 50.38922525639442
EP:11 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:11', max=108.0, style=ProgressStyle(description_w…

{'epoch': 11, 'iter': 0, 'avg_loss': 6.835878372192383, 'avg_acc': 43.75, 'loss': 6.835878372192383}

EP11_test, avg_loss= 6.903483615981208 total_acc= 51.1008111239861


HBox(children=(FloatProgress(value=0.0, description='EP_train:12', max=1012.0, style=ProgressStyle(description…

{'epoch': 12, 'iter': 0, 'avg_loss': 6.972602844238281, 'avg_acc': 31.25, 'loss': 6.972602844238281}
{'epoch': 12, 'iter': 400, 'avg_loss': 6.973631367719085, 'avg_acc': 51.15336658354115, 'loss': 6.887393951416016}
{'epoch': 12, 'iter': 800, 'avg_loss': 6.976515156797107, 'avg_acc': 50.452559300873915, 'loss': 6.895035266876221}

EP12_train, avg_loss= 6.98050882034151 total_acc= 50.25330532559001
EP:12 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:12', max=108.0, style=ProgressStyle(description_w…

{'epoch': 12, 'iter': 0, 'avg_loss': 6.803015232086182, 'avg_acc': 43.75, 'loss': 6.803015232086182}

EP12_test, avg_loss= 6.909099088774787 total_acc= 50.7531865585168


HBox(children=(FloatProgress(value=0.0, description='EP_train:13', max=1012.0, style=ProgressStyle(description…

{'epoch': 13, 'iter': 0, 'avg_loss': 7.015468597412109, 'avg_acc': 75.0, 'loss': 7.015468597412109}
{'epoch': 13, 'iter': 400, 'avg_loss': 6.983031288346745, 'avg_acc': 50.07793017456359, 'loss': 6.838181972503662}
{'epoch': 13, 'iter': 800, 'avg_loss': 6.987356431177642, 'avg_acc': 49.92977528089887, 'loss': 6.840852737426758}

EP13_train, avg_loss= 6.98765522170915 total_acc= 49.82701099715804
EP:13 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:13', max=108.0, style=ProgressStyle(description_w…

{'epoch': 13, 'iter': 0, 'avg_loss': 6.906279563903809, 'avg_acc': 56.25, 'loss': 6.906279563903809}

EP13_test, avg_loss= 6.878957593882525 total_acc= 49.304750869061415


HBox(children=(FloatProgress(value=0.0, description='EP_train:14', max=1012.0, style=ProgressStyle(description…

{'epoch': 14, 'iter': 0, 'avg_loss': 6.993631362915039, 'avg_acc': 50.0, 'loss': 6.993631362915039}
{'epoch': 14, 'iter': 400, 'avg_loss': 6.974552749101063, 'avg_acc': 50.23379052369077, 'loss': 7.028587818145752}
{'epoch': 14, 'iter': 800, 'avg_loss': 6.980695825093397, 'avg_acc': 49.9063670411985, 'loss': 6.806819438934326}

EP14_train, avg_loss= 6.981312216506174 total_acc= 50.06178178672927
EP:14 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:14', max=108.0, style=ProgressStyle(description_w…

{'epoch': 14, 'iter': 0, 'avg_loss': 7.003608226776123, 'avg_acc': 62.5, 'loss': 7.003608226776123}

EP14_test, avg_loss= 6.964668896463182 total_acc= 51.04287369640788


HBox(children=(FloatProgress(value=0.0, description='EP_train:15', max=1012.0, style=ProgressStyle(description…

{'epoch': 15, 'iter': 0, 'avg_loss': 7.029394626617432, 'avg_acc': 56.25, 'loss': 7.029394626617432}
{'epoch': 15, 'iter': 400, 'avg_loss': 6.9855707327921195, 'avg_acc': 48.70635910224439, 'loss': 6.793706893920898}
{'epoch': 15, 'iter': 800, 'avg_loss': 6.984862353768985, 'avg_acc': 49.13389513108614, 'loss': 6.799369812011719}

EP15_train, avg_loss= 6.982995202890026 total_acc= 49.450142098109474
EP:15 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:15', max=108.0, style=ProgressStyle(description_w…

{'epoch': 15, 'iter': 0, 'avg_loss': 6.798563003540039, 'avg_acc': 56.25, 'loss': 6.798563003540039}

EP15_test, avg_loss= 6.934547110840127 total_acc= 49.42062572421784


HBox(children=(FloatProgress(value=0.0, description='EP_train:16', max=1012.0, style=ProgressStyle(description…

{'epoch': 16, 'iter': 0, 'avg_loss': 7.081331729888916, 'avg_acc': 50.0, 'loss': 7.081331729888916}
{'epoch': 16, 'iter': 400, 'avg_loss': 6.973139608292805, 'avg_acc': 49.220698254364095, 'loss': 6.797079086303711}
{'epoch': 16, 'iter': 800, 'avg_loss': 6.976266743092054, 'avg_acc': 49.664481897627965, 'loss': 6.764641284942627}

EP16_train, avg_loss= 6.976950179446828 total_acc= 49.944396391943656
EP:16 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:16', max=108.0, style=ProgressStyle(description_w…

{'epoch': 16, 'iter': 0, 'avg_loss': 7.116652011871338, 'avg_acc': 50.0, 'loss': 7.116652011871338}

EP16_test, avg_loss= 6.964293625619677 total_acc= 49.594438006952494


HBox(children=(FloatProgress(value=0.0, description='EP_train:17', max=1012.0, style=ProgressStyle(description…

{'epoch': 17, 'iter': 0, 'avg_loss': 7.008421897888184, 'avg_acc': 50.0, 'loss': 7.008421897888184}
{'epoch': 17, 'iter': 400, 'avg_loss': 6.9819355700675985, 'avg_acc': 50.60785536159601, 'loss': 7.084016799926758}
{'epoch': 17, 'iter': 800, 'avg_loss': 6.9783491350143, 'avg_acc': 49.953183520599254, 'loss': 6.905210971832275}

EP17_train, avg_loss= 6.981885932650962 total_acc= 49.80847646113926
EP:17 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:17', max=108.0, style=ProgressStyle(description_w…

{'epoch': 17, 'iter': 0, 'avg_loss': 6.767923831939697, 'avg_acc': 56.25, 'loss': 6.767923831939697}

EP17_test, avg_loss= 6.914689050780402 total_acc= 48.8991888760139


HBox(children=(FloatProgress(value=0.0, description='EP_train:18', max=1012.0, style=ProgressStyle(description…

{'epoch': 18, 'iter': 0, 'avg_loss': 7.027040481567383, 'avg_acc': 25.0, 'loss': 7.027040481567383}
{'epoch': 18, 'iter': 400, 'avg_loss': 6.9795541537372845, 'avg_acc': 50.483167082294266, 'loss': 6.848891735076904}
{'epoch': 18, 'iter': 800, 'avg_loss': 6.978407734193457, 'avg_acc': 49.9063670411985, 'loss': 6.923051834106445}

EP18_train, avg_loss= 6.979468770649122 total_acc= 50.1359199308044
EP:18 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:18', max=108.0, style=ProgressStyle(description_w…

{'epoch': 18, 'iter': 0, 'avg_loss': 7.02012300491333, 'avg_acc': 62.5, 'loss': 7.02012300491333}

EP18_test, avg_loss= 7.052835649914211 total_acc= 50.695249130938585


HBox(children=(FloatProgress(value=0.0, description='EP_train:19', max=1012.0, style=ProgressStyle(description…

{'epoch': 19, 'iter': 0, 'avg_loss': 7.082269191741943, 'avg_acc': 31.25, 'loss': 7.082269191741943}
{'epoch': 19, 'iter': 400, 'avg_loss': 6.977348849660441, 'avg_acc': 49.76620947630923, 'loss': 7.131877899169922}
{'epoch': 19, 'iter': 800, 'avg_loss': 6.974695294388522, 'avg_acc': 49.2665418227216, 'loss': 6.760385036468506}

EP19_train, avg_loss= 6.977591747822969 total_acc= 49.51192388483875
EP:19 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:19', max=108.0, style=ProgressStyle(description_w…

{'epoch': 19, 'iter': 0, 'avg_loss': 6.8403425216674805, 'avg_acc': 50.0, 'loss': 6.8403425216674805}

EP19_test, avg_loss= 6.994916081428528 total_acc= 48.95712630359212


HBox(children=(FloatProgress(value=0.0, description='EP_train:20', max=1012.0, style=ProgressStyle(description…

{'epoch': 20, 'iter': 0, 'avg_loss': 6.964524269104004, 'avg_acc': 56.25, 'loss': 6.964524269104004}
{'epoch': 20, 'iter': 400, 'avg_loss': 6.98338708021397, 'avg_acc': 50.280548628428924, 'loss': 6.980965614318848}
{'epoch': 20, 'iter': 800, 'avg_loss': 6.9771843301818315, 'avg_acc': 49.52403245942571, 'loss': 6.751867771148682}

EP20_train, avg_loss= 6.97889167894959 total_acc= 49.83318917583097
EP:20 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:20', max=108.0, style=ProgressStyle(description_w…

{'epoch': 20, 'iter': 0, 'avg_loss': 6.798193454742432, 'avg_acc': 37.5, 'loss': 6.798193454742432}

EP20_test, avg_loss= 6.961878794210929 total_acc= 50.28968713789108


HBox(children=(FloatProgress(value=0.0, description='EP_train:21', max=1012.0, style=ProgressStyle(description…

{'epoch': 21, 'iter': 0, 'avg_loss': 6.955350399017334, 'avg_acc': 50.0, 'loss': 6.955350399017334}
{'epoch': 21, 'iter': 400, 'avg_loss': 6.968962387551096, 'avg_acc': 49.61034912718205, 'loss': 6.989436149597168}
{'epoch': 21, 'iter': 800, 'avg_loss': 6.971221241016364, 'avg_acc': 49.82833957553059, 'loss': 6.778234004974365}

EP21_train, avg_loss= 6.9760942850188306 total_acc= 50.055603608056344
EP:21 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:21', max=108.0, style=ProgressStyle(description_w…

{'epoch': 21, 'iter': 0, 'avg_loss': 6.893987655639648, 'avg_acc': 43.75, 'loss': 6.893987655639648}

EP21_test, avg_loss= 6.964837635004962 total_acc= 48.493626882966396


HBox(children=(FloatProgress(value=0.0, description='EP_train:22', max=1012.0, style=ProgressStyle(description…

{'epoch': 22, 'iter': 0, 'avg_loss': 7.006428241729736, 'avg_acc': 56.25, 'loss': 7.006428241729736}
{'epoch': 22, 'iter': 400, 'avg_loss': 6.984012960495794, 'avg_acc': 49.594763092269325, 'loss': 6.831071376800537}
{'epoch': 22, 'iter': 800, 'avg_loss': 6.976864215288865, 'avg_acc': 49.726903870162296, 'loss': 6.816226005554199}

EP22_train, avg_loss= 6.980115137552556 total_acc= 49.85172371184974
EP:22 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:22', max=108.0, style=ProgressStyle(description_w…

{'epoch': 22, 'iter': 0, 'avg_loss': 6.98036527633667, 'avg_acc': 43.75, 'loss': 6.98036527633667}

EP22_test, avg_loss= 7.003145690317507 total_acc= 51.44843568945539


HBox(children=(FloatProgress(value=0.0, description='EP_train:23', max=1012.0, style=ProgressStyle(description…

{'epoch': 23, 'iter': 0, 'avg_loss': 7.032945156097412, 'avg_acc': 31.25, 'loss': 7.032945156097412}
{'epoch': 23, 'iter': 400, 'avg_loss': 6.979116059300906, 'avg_acc': 50.18703241895261, 'loss': 7.265974998474121}
{'epoch': 23, 'iter': 800, 'avg_loss': 6.979059526536348, 'avg_acc': 50.24188514357054, 'loss': 6.864888668060303}

EP23_train, avg_loss= 6.9775732213800605 total_acc= 49.83318917583097
EP:23 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:23', max=108.0, style=ProgressStyle(description_w…

{'epoch': 23, 'iter': 0, 'avg_loss': 7.230944633483887, 'avg_acc': 18.75, 'loss': 7.230944633483887}

EP23_test, avg_loss= 7.017015435077526 total_acc= 50.347624565469296


HBox(children=(FloatProgress(value=0.0, description='EP_train:24', max=1012.0, style=ProgressStyle(description…

{'epoch': 24, 'iter': 0, 'avg_loss': 7.031558990478516, 'avg_acc': 31.25, 'loss': 7.031558990478516}
{'epoch': 24, 'iter': 400, 'avg_loss': 6.988874026367492, 'avg_acc': 49.6571072319202, 'loss': 6.990179061889648}
{'epoch': 24, 'iter': 800, 'avg_loss': 6.979450431209378, 'avg_acc': 49.414794007490634, 'loss': 7.203375816345215}

EP24_train, avg_loss= 6.978325998359047 total_acc= 49.604596564932656
EP:24 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:24', max=108.0, style=ProgressStyle(description_w…

{'epoch': 24, 'iter': 0, 'avg_loss': 6.994500637054443, 'avg_acc': 50.0, 'loss': 6.994500637054443}

EP24_test, avg_loss= 6.996597947897734 total_acc= 50.05793742757822


HBox(children=(FloatProgress(value=0.0, description='EP_train:25', max=1012.0, style=ProgressStyle(description…

{'epoch': 25, 'iter': 0, 'avg_loss': 6.720725059509277, 'avg_acc': 62.5, 'loss': 6.720725059509277}
{'epoch': 25, 'iter': 400, 'avg_loss': 6.984578938852819, 'avg_acc': 49.906483790523694, 'loss': 6.9905314445495605}
{'epoch': 25, 'iter': 800, 'avg_loss': 6.977122255627731, 'avg_acc': 50.156054931335824, 'loss': 6.74666690826416}

EP25_train, avg_loss= 6.977926732994351 total_acc= 50.29037439762758
EP:25 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:25', max=108.0, style=ProgressStyle(description_w…

{'epoch': 25, 'iter': 0, 'avg_loss': 7.13480281829834, 'avg_acc': 31.25, 'loss': 7.13480281829834}

EP25_test, avg_loss= 7.048307043534738 total_acc= 50.57937427578216


HBox(children=(FloatProgress(value=0.0, description='EP_train:26', max=1012.0, style=ProgressStyle(description…

{'epoch': 26, 'iter': 0, 'avg_loss': 6.994550704956055, 'avg_acc': 62.5, 'loss': 6.994550704956055}
{'epoch': 26, 'iter': 400, 'avg_loss': 6.9815287815959675, 'avg_acc': 49.781795511221944, 'loss': 6.900057792663574}
{'epoch': 26, 'iter': 800, 'avg_loss': 6.971933288669467, 'avg_acc': 49.43039950062422, 'loss': 6.8859734535217285}

EP26_train, avg_loss= 6.971016735427464 total_acc= 49.70962560237242
EP:26 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:26', max=108.0, style=ProgressStyle(description_w…

{'epoch': 26, 'iter': 0, 'avg_loss': 7.154868125915527, 'avg_acc': 37.5, 'loss': 7.154868125915527}

EP26_test, avg_loss= 7.010361053325512 total_acc= 50.405561993047506


HBox(children=(FloatProgress(value=0.0, description='EP_train:27', max=1012.0, style=ProgressStyle(description…

{'epoch': 27, 'iter': 0, 'avg_loss': 7.01690149307251, 'avg_acc': 62.5, 'loss': 7.01690149307251}
{'epoch': 27, 'iter': 400, 'avg_loss': 6.97132545694746, 'avg_acc': 50.950748129675816, 'loss': 6.8730854988098145}
{'epoch': 27, 'iter': 800, 'avg_loss': 6.969629462143306, 'avg_acc': 50.50717852684144, 'loss': 6.899282455444336}

EP27_train, avg_loss= 6.969050401755473 total_acc= 50.72902508340541
EP:27 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:27', max=108.0, style=ProgressStyle(description_w…

{'epoch': 27, 'iter': 0, 'avg_loss': 7.085572719573975, 'avg_acc': 31.25, 'loss': 7.085572719573975}

EP27_test, avg_loss= 6.9962721621548685 total_acc= 49.53650057937428


HBox(children=(FloatProgress(value=0.0, description='EP_train:28', max=1012.0, style=ProgressStyle(description…

{'epoch': 28, 'iter': 0, 'avg_loss': 7.128693580627441, 'avg_acc': 62.5, 'loss': 7.128693580627441}
{'epoch': 28, 'iter': 400, 'avg_loss': 6.970427924558112, 'avg_acc': 49.641521197007485, 'loss': 6.683968544006348}
{'epoch': 28, 'iter': 800, 'avg_loss': 6.974428990658154, 'avg_acc': 49.71129837702871, 'loss': 6.890273094177246}

EP28_train, avg_loss= 6.97911060845899 total_acc= 49.981465463981216
EP:28 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:28', max=108.0, style=ProgressStyle(description_w…

{'epoch': 28, 'iter': 0, 'avg_loss': 7.051245212554932, 'avg_acc': 56.25, 'loss': 7.051245212554932}

EP28_test, avg_loss= 6.977347921442102 total_acc= 49.71031286210892


HBox(children=(FloatProgress(value=0.0, description='EP_train:29', max=1012.0, style=ProgressStyle(description…

{'epoch': 29, 'iter': 0, 'avg_loss': 7.0379180908203125, 'avg_acc': 50.0, 'loss': 7.0379180908203125}
{'epoch': 29, 'iter': 400, 'avg_loss': 6.965748430190241, 'avg_acc': 51.106608478803, 'loss': 7.111639499664307}
{'epoch': 29, 'iter': 800, 'avg_loss': 6.968642589007127, 'avg_acc': 50.48377028714107, 'loss': 6.7342424392700195}

EP29_train, avg_loss= 6.969449883393148 total_acc= 50.759915976770046
EP:29 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:29', max=108.0, style=ProgressStyle(description_w…

{'epoch': 29, 'iter': 0, 'avg_loss': 7.0892863273620605, 'avg_acc': 68.75, 'loss': 7.0892863273620605}

EP29_test, avg_loss= 6.943283014827305 total_acc= 49.94206257242178


HBox(children=(FloatProgress(value=0.0, description='EP_train:30', max=1012.0, style=ProgressStyle(description…

{'epoch': 30, 'iter': 0, 'avg_loss': 7.023162364959717, 'avg_acc': 50.0, 'loss': 7.023162364959717}
{'epoch': 30, 'iter': 400, 'avg_loss': 6.966954671236643, 'avg_acc': 49.73503740648379, 'loss': 6.906540870666504}
{'epoch': 30, 'iter': 800, 'avg_loss': 6.9663079198677735, 'avg_acc': 49.39918851435706, 'loss': 6.690585136413574}

EP30_train, avg_loss= 6.96869202800419 total_acc= 49.59841838625973
EP:30 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:30', max=108.0, style=ProgressStyle(description_w…

{'epoch': 30, 'iter': 0, 'avg_loss': 7.359606742858887, 'avg_acc': 75.0, 'loss': 7.359606742858887}

EP30_test, avg_loss= 7.007091950487207 total_acc= 48.261877172653534


HBox(children=(FloatProgress(value=0.0, description='EP_train:31', max=1012.0, style=ProgressStyle(description…

{'epoch': 31, 'iter': 0, 'avg_loss': 7.176765441894531, 'avg_acc': 37.5, 'loss': 7.176765441894531}
{'epoch': 31, 'iter': 400, 'avg_loss': 6.960443939056777, 'avg_acc': 49.08042394014963, 'loss': 6.815739154815674}
{'epoch': 31, 'iter': 800, 'avg_loss': 6.968026714229703, 'avg_acc': 49.812734082397, 'loss': 6.7573676109313965}

EP31_train, avg_loss= 6.969021072029596 total_acc= 49.95057457061658
EP:31 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:31', max=108.0, style=ProgressStyle(description_w…

{'epoch': 31, 'iter': 0, 'avg_loss': 7.07455587387085, 'avg_acc': 62.5, 'loss': 7.07455587387085}

EP31_test, avg_loss= 6.98941441377004 total_acc= 50.81112398609502


HBox(children=(FloatProgress(value=0.0, description='EP_train:32', max=1012.0, style=ProgressStyle(description…

{'epoch': 32, 'iter': 0, 'avg_loss': 7.0042266845703125, 'avg_acc': 50.0, 'loss': 7.0042266845703125}
{'epoch': 32, 'iter': 400, 'avg_loss': 6.967139534224893, 'avg_acc': 50.14027431421446, 'loss': 6.715808391571045}
{'epoch': 32, 'iter': 800, 'avg_loss': 6.966694219877359, 'avg_acc': 50.10923845193508, 'loss': 6.852078914642334}

EP32_train, avg_loss= 6.967543139759259 total_acc= 49.81465463981218
EP:32 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:32', max=108.0, style=ProgressStyle(description_w…

{'epoch': 32, 'iter': 0, 'avg_loss': 6.860957622528076, 'avg_acc': 43.75, 'loss': 6.860957622528076}

EP32_test, avg_loss= 7.0030082199308605 total_acc= 51.15874855156431


HBox(children=(FloatProgress(value=0.0, description='EP_train:33', max=1012.0, style=ProgressStyle(description…

{'epoch': 33, 'iter': 0, 'avg_loss': 6.934237957000732, 'avg_acc': 43.75, 'loss': 6.934237957000732}
{'epoch': 33, 'iter': 400, 'avg_loss': 6.956319829175002, 'avg_acc': 49.95324189526185, 'loss': 6.824589252471924}
{'epoch': 33, 'iter': 800, 'avg_loss': 6.961374984103047, 'avg_acc': 50.148252184769035, 'loss': 6.775905132293701}

EP33_train, avg_loss= 6.964490994634365 total_acc= 49.97528728530829
EP:33 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:33', max=108.0, style=ProgressStyle(description_w…

{'epoch': 33, 'iter': 0, 'avg_loss': 7.210438251495361, 'avg_acc': 62.5, 'loss': 7.210438251495361}

EP33_test, avg_loss= 7.072836129753678 total_acc= 50.05793742757822


HBox(children=(FloatProgress(value=0.0, description='EP_train:34', max=1012.0, style=ProgressStyle(description…

{'epoch': 34, 'iter': 0, 'avg_loss': 6.955696105957031, 'avg_acc': 56.25, 'loss': 6.955696105957031}
{'epoch': 34, 'iter': 400, 'avg_loss': 6.963213594774356, 'avg_acc': 51.12219451371571, 'loss': 6.977696895599365}
{'epoch': 34, 'iter': 800, 'avg_loss': 6.96332806624128, 'avg_acc': 50.273096129837704, 'loss': 6.8380303382873535}

EP34_train, avg_loss= 6.960564059231121 total_acc= 50.661065118003215
EP:34 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:34', max=108.0, style=ProgressStyle(description_w…

{'epoch': 34, 'iter': 0, 'avg_loss': 6.819902420043945, 'avg_acc': 43.75, 'loss': 6.819902420043945}

EP34_test, avg_loss= 7.0418769518534345 total_acc= 50.05793742757822


HBox(children=(FloatProgress(value=0.0, description='EP_train:35', max=1012.0, style=ProgressStyle(description…

{'epoch': 35, 'iter': 0, 'avg_loss': 7.086144924163818, 'avg_acc': 37.5, 'loss': 7.086144924163818}
{'epoch': 35, 'iter': 400, 'avg_loss': 6.9655280874257075, 'avg_acc': 50.46758104738155, 'loss': 6.869312286376953}
{'epoch': 35, 'iter': 800, 'avg_loss': 6.969393696826645, 'avg_acc': 50.13264669163545, 'loss': 6.994606971740723}

EP35_train, avg_loss= 6.966789033573136 total_acc= 49.85172371184974
EP:35 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:35', max=108.0, style=ProgressStyle(description_w…

{'epoch': 35, 'iter': 0, 'avg_loss': 7.348604679107666, 'avg_acc': 50.0, 'loss': 7.348604679107666}

EP35_test, avg_loss= 7.0306576313795865 total_acc= 49.76825028968714


HBox(children=(FloatProgress(value=0.0, description='EP_train:36', max=1012.0, style=ProgressStyle(description…

{'epoch': 36, 'iter': 0, 'avg_loss': 7.0608229637146, 'avg_acc': 87.5, 'loss': 7.0608229637146}
{'epoch': 36, 'iter': 400, 'avg_loss': 6.9730816505793625, 'avg_acc': 50.857231920199496, 'loss': 7.193516731262207}
{'epoch': 36, 'iter': 800, 'avg_loss': 6.967951806148191, 'avg_acc': 50.77247191011236, 'loss': 6.7061638832092285}

EP36_train, avg_loss= 6.9690405997363 total_acc= 50.53132336587174
EP:36 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:36', max=108.0, style=ProgressStyle(description_w…

{'epoch': 36, 'iter': 0, 'avg_loss': 6.9594035148620605, 'avg_acc': 68.75, 'loss': 6.9594035148620605}

EP36_test, avg_loss= 7.04901608272835 total_acc= 50.28968713789108


HBox(children=(FloatProgress(value=0.0, description='EP_train:37', max=1012.0, style=ProgressStyle(description…

{'epoch': 37, 'iter': 0, 'avg_loss': 7.027158260345459, 'avg_acc': 50.0, 'loss': 7.027158260345459}
{'epoch': 37, 'iter': 400, 'avg_loss': 6.962616718320775, 'avg_acc': 49.76620947630923, 'loss': 6.65673828125}
{'epoch': 37, 'iter': 800, 'avg_loss': 6.966544611474846, 'avg_acc': 49.68008739076154, 'loss': 6.704526424407959}

EP37_train, avg_loss= 6.96762216562339 total_acc= 49.96293092796244
EP:37 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:37', max=108.0, style=ProgressStyle(description_w…

{'epoch': 37, 'iter': 0, 'avg_loss': 7.143213748931885, 'avg_acc': 31.25, 'loss': 7.143213748931885}

EP37_test, avg_loss= 7.048581551622461 total_acc= 50.05793742757822


HBox(children=(FloatProgress(value=0.0, description='EP_train:38', max=1012.0, style=ProgressStyle(description…

{'epoch': 38, 'iter': 0, 'avg_loss': 7.000029563903809, 'avg_acc': 31.25, 'loss': 7.000029563903809}
{'epoch': 38, 'iter': 400, 'avg_loss': 6.9586111756037, 'avg_acc': 50.670199501246884, 'loss': 6.8773884773254395}
{'epoch': 38, 'iter': 800, 'avg_loss': 6.962103363875295, 'avg_acc': 50.47596754057429, 'loss': 6.838382720947266}

EP38_train, avg_loss= 6.966530566158974 total_acc= 50.07413814407513
EP:38 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:38', max=108.0, style=ProgressStyle(description_w…

{'epoch': 38, 'iter': 0, 'avg_loss': 6.855334281921387, 'avg_acc': 50.0, 'loss': 6.855334281921387}

EP38_test, avg_loss= 6.998677337611163 total_acc= 47.21900347624565


HBox(children=(FloatProgress(value=0.0, description='EP_train:39', max=1012.0, style=ProgressStyle(description…

{'epoch': 39, 'iter': 0, 'avg_loss': 7.311832427978516, 'avg_acc': 43.75, 'loss': 7.311832427978516}
{'epoch': 39, 'iter': 400, 'avg_loss': 6.962133044911145, 'avg_acc': 51.106608478803, 'loss': 7.025884628295898}
{'epoch': 39, 'iter': 800, 'avg_loss': 6.964373590347918, 'avg_acc': 50.83489388264669, 'loss': 6.790438652038574}

EP39_train, avg_loss= 6.9651682475809995 total_acc= 50.66724329667614
EP:39 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:39', max=108.0, style=ProgressStyle(description_w…

{'epoch': 39, 'iter': 0, 'avg_loss': 6.99275541305542, 'avg_acc': 50.0, 'loss': 6.99275541305542}

EP39_test, avg_loss= 7.031220656854135 total_acc= 48.95712630359212


HBox(children=(FloatProgress(value=0.0, description='EP_train:40', max=1012.0, style=ProgressStyle(description…

{'epoch': 40, 'iter': 0, 'avg_loss': 7.037034034729004, 'avg_acc': 56.25, 'loss': 7.037034034729004}
{'epoch': 40, 'iter': 400, 'avg_loss': 6.954881467129524, 'avg_acc': 49.8285536159601, 'loss': 7.023557662963867}
{'epoch': 40, 'iter': 800, 'avg_loss': 6.9596838522493165, 'avg_acc': 49.51622971285893, 'loss': 6.935403347015381}

EP40_train, avg_loss= 6.959512190856481 total_acc= 49.7961201037934
EP:40 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:40', max=108.0, style=ProgressStyle(description_w…

{'epoch': 40, 'iter': 0, 'avg_loss': 6.748645782470703, 'avg_acc': 68.75, 'loss': 6.748645782470703}

EP40_test, avg_loss= 6.993540463624178 total_acc= 49.015063731170336


HBox(children=(FloatProgress(value=0.0, description='EP_train:41', max=1012.0, style=ProgressStyle(description…

{'epoch': 41, 'iter': 0, 'avg_loss': 7.10067081451416, 'avg_acc': 56.25, 'loss': 7.10067081451416}
{'epoch': 41, 'iter': 400, 'avg_loss': 6.951927433584694, 'avg_acc': 51.07543640897756, 'loss': 7.134896755218506}
{'epoch': 41, 'iter': 800, 'avg_loss': 6.952903955319103, 'avg_acc': 50.530586766541816, 'loss': 7.024403095245361}

EP41_train, avg_loss= 6.957018188337092 total_acc= 50.50043247250711
EP:41 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:41', max=108.0, style=ProgressStyle(description_w…

{'epoch': 41, 'iter': 0, 'avg_loss': 7.616061687469482, 'avg_acc': 56.25, 'loss': 7.616061687469482}

EP41_test, avg_loss= 7.048428407421818 total_acc= 49.76825028968714


HBox(children=(FloatProgress(value=0.0, description='EP_train:42', max=1012.0, style=ProgressStyle(description…

{'epoch': 42, 'iter': 0, 'avg_loss': 7.004776477813721, 'avg_acc': 43.75, 'loss': 7.004776477813721}
{'epoch': 42, 'iter': 400, 'avg_loss': 6.948425587870534, 'avg_acc': 49.329800498753116, 'loss': 6.966439247131348}
{'epoch': 42, 'iter': 800, 'avg_loss': 6.953744641255201, 'avg_acc': 50.28089887640449, 'loss': 6.920934677124023}

EP42_train, avg_loss= 6.958611568443389 total_acc= 50.15445446682318
EP:42 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:42', max=108.0, style=ProgressStyle(description_w…

{'epoch': 42, 'iter': 0, 'avg_loss': 7.201807975769043, 'avg_acc': 56.25, 'loss': 7.201807975769043}

EP42_test, avg_loss= 7.012172628332068 total_acc= 49.18887601390498


HBox(children=(FloatProgress(value=0.0, description='EP_train:43', max=1012.0, style=ProgressStyle(description…

{'epoch': 43, 'iter': 0, 'avg_loss': 7.173555850982666, 'avg_acc': 62.5, 'loss': 7.173555850982666}
{'epoch': 43, 'iter': 400, 'avg_loss': 6.9538459742158425, 'avg_acc': 50.34289276807981, 'loss': 6.826233386993408}
{'epoch': 43, 'iter': 800, 'avg_loss': 6.962406175711033, 'avg_acc': 49.74250936329588, 'loss': 6.495599269866943}

EP43_train, avg_loss= 6.963141374437234 total_acc= 50.07413814407513
EP:43 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:43', max=108.0, style=ProgressStyle(description_w…

{'epoch': 43, 'iter': 0, 'avg_loss': 7.395040512084961, 'avg_acc': 62.5, 'loss': 7.395040512084961}

EP43_test, avg_loss= 7.042791693298905 total_acc= 52.95480880648899


HBox(children=(FloatProgress(value=0.0, description='EP_train:44', max=1012.0, style=ProgressStyle(description…

{'epoch': 44, 'iter': 0, 'avg_loss': 6.964870452880859, 'avg_acc': 75.0, 'loss': 6.964870452880859}
{'epoch': 44, 'iter': 400, 'avg_loss': 6.962821414643095, 'avg_acc': 49.4856608478803, 'loss': 7.011837959289551}
{'epoch': 44, 'iter': 800, 'avg_loss': 6.956543962904874, 'avg_acc': 49.118289637952564, 'loss': 6.721404075622559}

EP44_train, avg_loss= 6.957541454922069 total_acc= 49.06091684171506
EP:44 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:44', max=108.0, style=ProgressStyle(description_w…

{'epoch': 44, 'iter': 0, 'avg_loss': 6.837031841278076, 'avg_acc': 43.75, 'loss': 6.837031841278076}

EP44_test, avg_loss= 7.013807822156836 total_acc= 50.28968713789108


HBox(children=(FloatProgress(value=0.0, description='EP_train:45', max=1012.0, style=ProgressStyle(description…

{'epoch': 45, 'iter': 0, 'avg_loss': 7.013490200042725, 'avg_acc': 56.25, 'loss': 7.013490200042725}
{'epoch': 45, 'iter': 400, 'avg_loss': 6.960825923672341, 'avg_acc': 50.65461346633416, 'loss': 6.878565788269043}
{'epoch': 45, 'iter': 800, 'avg_loss': 6.959553123264575, 'avg_acc': 50.90511860174781, 'loss': 6.881985664367676}

EP45_train, avg_loss= 6.9601312593037905 total_acc= 50.71666872605956
EP:45 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:45', max=108.0, style=ProgressStyle(description_w…

{'epoch': 45, 'iter': 0, 'avg_loss': 6.792926788330078, 'avg_acc': 37.5, 'loss': 6.792926788330078}

EP45_test, avg_loss= 7.073917344764427 total_acc= 49.53650057937428


HBox(children=(FloatProgress(value=0.0, description='EP_train:46', max=1012.0, style=ProgressStyle(description…

{'epoch': 46, 'iter': 0, 'avg_loss': 7.117523193359375, 'avg_acc': 43.75, 'loss': 7.117523193359375}
{'epoch': 46, 'iter': 400, 'avg_loss': 6.959465308676931, 'avg_acc': 49.220698254364095, 'loss': 6.811488628387451}
{'epoch': 46, 'iter': 800, 'avg_loss': 6.960369824470206, 'avg_acc': 49.87515605493134, 'loss': 6.785155773162842}

EP46_train, avg_loss= 6.96185373977239 total_acc= 50.29037439762758
EP:46 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:46', max=108.0, style=ProgressStyle(description_w…

{'epoch': 46, 'iter': 0, 'avg_loss': 7.497143745422363, 'avg_acc': 43.75, 'loss': 7.497143745422363}

EP46_test, avg_loss= 7.02750661638048 total_acc= 50.405561993047506


HBox(children=(FloatProgress(value=0.0, description='EP_train:47', max=1012.0, style=ProgressStyle(description…

{'epoch': 47, 'iter': 0, 'avg_loss': 7.241518497467041, 'avg_acc': 56.25, 'loss': 7.241518497467041}
{'epoch': 47, 'iter': 400, 'avg_loss': 6.9618441386710375, 'avg_acc': 49.298628428927685, 'loss': 6.950791358947754}
{'epoch': 47, 'iter': 800, 'avg_loss': 6.95797583345468, 'avg_acc': 49.86735330836454, 'loss': 6.904091835021973}

EP47_train, avg_loss= 6.961702972532732 total_acc= 49.95675274928951
EP:47 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:47', max=108.0, style=ProgressStyle(description_w…

{'epoch': 47, 'iter': 0, 'avg_loss': 7.258745193481445, 'avg_acc': 43.75, 'loss': 7.258745193481445}

EP47_test, avg_loss= 7.033828130474797 total_acc= 51.04287369640788


HBox(children=(FloatProgress(value=0.0, description='EP_train:48', max=1012.0, style=ProgressStyle(description…

{'epoch': 48, 'iter': 0, 'avg_loss': 7.073668479919434, 'avg_acc': 43.75, 'loss': 7.073668479919434}
{'epoch': 48, 'iter': 400, 'avg_loss': 6.947320022487879, 'avg_acc': 50.45199501246883, 'loss': 6.977117538452148}
{'epoch': 48, 'iter': 800, 'avg_loss': 6.954043479447954, 'avg_acc': 50.218476903870155, 'loss': 6.887017250061035}

EP48_train, avg_loss= 6.956282810260185 total_acc= 49.993821821327074
EP:48 Model Saved on: ./models/bert_02_small.{}.ep{:02d}.pt


HBox(children=(FloatProgress(value=0.0, description='EP_test:48', max=108.0, style=ProgressStyle(description_w…

{'epoch': 48, 'iter': 0, 'avg_loss': 6.997890472412109, 'avg_acc': 43.75, 'loss': 6.997890472412109}

EP48_test, avg_loss= 7.0191180485266225 total_acc= 49.594438006952494


HBox(children=(FloatProgress(value=0.0, description='EP_train:49', max=1012.0, style=ProgressStyle(description…

{'epoch': 49, 'iter': 0, 'avg_loss': 7.003330230712891, 'avg_acc': 50.0, 'loss': 7.003330230712891}
{'epoch': 49, 'iter': 400, 'avg_loss': 6.947207901543216, 'avg_acc': 50.29613466334164, 'loss': 6.854007720947266}



KeyboardInterrupt: 

In [71]:
# trainer.save(2, output_path)

EP:2 Model Saved on: ./models/bert_01.{0}.ep{1:2d}.pt


'./models/bert_01.{0}.ep{1:2d}.pt'

# 10. BERT for Named-Entity Recognition

In [38]:
print("Building BERT model")
bert = BERT(len(vocab), hidden=hidden, n_layers=layers, attn_heads=attn_heads)

Building BERT model


In [39]:
bert.load_state_dict(torch.load("./models/bert_01.bert.statedict.ep02.pt"))

<All keys matched successfully>

In [None]:
trainer.model.eval()

In [115]:
inputs = train_dataset[3]
inputs = {key: value.to('cuda') for key, value in inputs.items()}

In [116]:
outputs = trainer.model(inputs['bert_input'].unsqueeze(0), inputs['segment_label'].unsqueeze(0))

In [117]:
for sent in " ".join([vocab.itos[t] for t in inputs['bert_input'].cpu().tolist() if t!=0]).split("[SEP]"):
    print(sent)
    print("\n\n")

[CLS] [UNK] [MASK] with positive sales in [UNK] , and was praised by both [UNK] and western critics . [UNK] release , it received downloadable content , [MASK] with an expanded edition in [UNK] of that year . [UNK] was [MASK] adapted ##gan manga and an original video animation series . [UNK] [MASK] low sales of [UNK] [UNK] [UNK] , [UNK] [UNK] [UNK] was not localized , but a fan translation compatible with the game [UNK] expanded edition was released in [MASK] . [UNK] would return to the franchise with the [MASK] of [UNK] : [UNK] [UNK] for the [UNK] 4 . 



 [UNK] [UNK] travelled to the [UNK] [UNK] in [UNK] 2009 , to commence work on their seventh studio album , [UNK] 7 . [MASK] [MASK] a contract [MASK] [UNK] [UNK] [UNK] [UNK] label , [UNK] [UNK] , [MASK] in collaborations with [MASK] [UNK] profile producers . [UNK] late [UNK] 2009 , the [MASK] [MASK] [MASK] [MASK] were working with [UNK] [UNK] , known by his stage name [MASK] , on two songs . [UNK] [UNK] a [UNK] [UNK] was lice and prod

In [118]:
inputs["is_next"]

tensor(0, device='cuda:0')

In [119]:
outputs[0]

tensor([[-18.5915,   0.0000]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)