# Setup

Feel free to change the settings in this section.

In [2]:
# -- Base -- #
import os
import random
import joblib
import logging
import time
import re
import io
from datetime import datetime
from tqdm import tqdm
import ipdb
from copy import deepcopy
from dataclasses import dataclass
import sys, getopt
import json
from pathlib import Path
import yaml
import shutil
import csv
from typing import (
    List,
    Dict
)

# -- Tokenizer -- #
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers

from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)

from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql
import tensorboard

# -- Tensorflow -- #
import tensorflow as tf
from tensorflow.keras.metrics import Mean
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import (
    Softmax,
    Dense,
    AdditiveAttention,
    MultiHeadAttention,
    Layer
)

from tensorflow.keras.layers import (
    LayerNormalization,
    Dropout,
    Embedding
)

from tensorflow.keras import (
    Sequential,
    Model
)
from tensorflow.train import Checkpoint, CheckpointManager
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC

# -- Dash -- #
import dash
import dash_table
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html

from dash.dependencies import (
    Input,
    Output,
    State
)

import jax.numpy as jnp

import plotly.io as pio
import plotly.express as px
from dash import no_update
from flask_caching import Cache



In [3]:
def convert_sqlite_to_csv(inputFolder, ext, tableName):
    """Dump one table from every SQLite file in a folder into a single CSV.

    Rows from all matching databases are written, in ``os.listdir`` order,
    to ``<inputFolder>/output.csv``. The file is overwritten on every call
    and no header row is written.

    inputFolder - Folder where sqlite files are located.
    ext - Extension of your sqlite file (eg. db, sqlite, sqlite3 etc.)
    tableName - table name from which you want to select the data. It is
        interpolated into the SQL text (identifiers cannot be bound as
        query parameters), so it must come from a trusted source.
    """
    suffix = '.' + ext
    # The context manager guarantees the CSV is flushed and closed even if
    # one of the databases cannot be read.
    with open(os.path.join(inputFolder, 'output.csv'), 'w', newline='') as csv_file:
        csvWriter = csv.writer(csv_file)
        for file1 in os.listdir(inputFolder):
            if not file1.endswith(suffix):
                continue
            conn = sql.connect(os.path.join(inputFolder, file1))
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM " + tableName)
                # Stream rows straight from the cursor instead of loading
                # the whole table into memory first.
                csvWriter.writerows(cursor)
            finally:
                conn.close()

Extensions

In [4]:
# tensorflow is already imported as `tf` in the setup cell; this cell only
# reports how many GPUs TensorFlow can see.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [5]:
tf.__version__

'2.4.1'

## Environment Variables


---



In [6]:
# Root folder for data, results and assets: /home/<USER>. Raises KeyError
# when the USER environment variable is not set.
SOURCE = '/home/' + os.environ['USER']
# Value of the `container_name` column used to select `container_dataset`.
CONTAINER = 'core.soaesb'

## Logging

In [7]:
# Send INFO-and-above records to stdout so they show up in the cell output,
# formatted as "<time> <level> | <message>".
logging.basicConfig(format='%(asctime)s %(levelname)s | %(message)s',
                    level=logging.INFO,
                    stream=sys.stdout)
# Module-level logger shared by the functions and layers defined below.
logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [8]:
def database_builder(path: str) -> pd.DataFrame:
    """
    Reads the ``logs`` table of every ``*.db`` file directly inside a folder
    and concatenates the results.
    :param path: str
        folder containing the SQLite files (sub-folders are not searched);
        a trailing separator is optional
    :return pd.DataFrame
        all rows of all databases; each file keeps its own row index, so
        index labels may repeat
    :raises ValueError
        if the folder is missing or contains no readable ``.db`` file
    """
    logger.info('Building DataFrame ...')
    # The default keeps a missing folder from leaking StopIteration.
    (_, _, files) = next(os.walk(path), (path, [], []))
    sql_query = 'SELECT * FROM logs'
    data = []
    for f in files:
        # endswith() rather than `in`, so side files such as "x.db-journal"
        # are not opened as databases.
        if not f.endswith('.db'):
            continue
        conn = create_connection(os.path.join(path, f))
        if conn is None:
            # create_connection has already logged why it failed.
            continue
        try:
            data.append(pd.read_sql_query(sql_query, conn))
        finally:
            conn.close()
    if not data:
        raise ValueError('No readable .db files found in ' + str(path))
    logger.info('...complete!')
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Opens a SQLite connection and logs the outcome.
    :param path: str
        path to database object
    :return sql.Connection
        the open connection; if sqlite raises ``sql.Error`` the error is
        logged as a warning and ``None`` is returned instead
    """
    try:
        connection = sql.connect(path)
    except sql.Error as err:
        logger.warning(err)
        return None
    logger.info('Connected to database ' + path)
    return connection

## Define Dataset Main

In [9]:
# Load every log database under SOURCE/data into one frame, then keep only
# the rows whose container_name equals CONTAINER.
dataset = database_builder(SOURCE + '/data/')
container_dataset = dataset[dataset['container_name'] == CONTAINER]

2021-05-27 16:51:06,186 INFO | Building DataFrame ...
2021-05-27 16:51:06,188 INFO | Connected to database /home/jovyan/data/tanner_logs.db
2021-05-27 16:51:06,994 INFO | Connected to database /home/jovyan/data/elastic_logs.db
2021-05-27 16:51:07,909 INFO | ...complete!


# W2V Pipeline

## Pipeline Objects

### Configuration

In [10]:
def set_attributes(self, config: dict):
    """
    Copies one section of a parsed configuration onto an object.

    The section is looked up under the object's class name; every key in it
    becomes an attribute of ``self``. When the section is missing or is not
    a mapping, a warning is logged and ``self`` is left untouched.
    :param self: object
        the object to configure
    :param config: dict
        the whole parsed configuration, keyed by class name
    """
    section_name = self.__class__.__name__
    try:
        section = config[section_name]
    except Exception as e:
        logger.warning(e)
        logger.warning('No configuration found for ' + section_name)
        # Stop here: falling through would copy every top-level section of
        # the file onto the object as attributes.
        return

    if not isinstance(section, dict):
        # e.g. a section header with no entries parses to None.
        logger.warning('No configuration found for ' + section_name)
        return

    for attr, value in section.items():
        setattr(self, attr, value)


@dataclass
class PreprocessingGlobalConfig:
    """Settings shared by every layer of the preprocessing pipeline."""
    # Width of the embedding vectors (read by the skip-gram and Word2Vec layers).
    embed_size: int = 512
    # Only referenced from commented-out tokenizer code in this notebook.
    max_vocab_size: int = 2000
    # Shuffle buffer size used when the Word2Vec training dataset is built.
    buffer_size: int = 10000
    # When True, W2V_Pipeline trains the phrase layer regardless of that
    # layer's own `training` flag.
    global_training: bool = True
    # Folder, appended to SOURCE, where models and artefacts are read and written.
    path: str = '/results/'

    def load(self, config):
        """Overwrite the defaults with the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class PhraseCaptureLayerConfig:
    """Settings for PhraseCaptureLayer."""
    # Passed to gensim's Phrases as `min_count` when a fresh model is created.
    min_count: int = 5
    # Passed to gensim's Phrases as `threshold` when a fresh model is created.
    threshold: float = 7
    # True: load the phrase model from disk with joblib instead of creating one.
    load_model: bool = True
    # True: persist the phrase model on every call of the layer.
    save_model: bool = False
    # Used by W2V_Pipeline as the layer's training flag when global training is off.
    training: bool = True
    # File name of the persisted phrase model inside the results folder.
    model_name: str = 'phrase_model.joblib'

    def load(self, config):
        """Overwrite the defaults with the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class TextClusteringLayerConfig:
    """Settings for TextClusteringLayer."""
    # True: load the template miner from disk with joblib instead of creating one.
    load_model: bool = True
    # True: persist the template miner after a training call.
    save_model: bool = False
    # NOTE(review): the layer takes `training` as a call() argument and does
    # not read this field itself; presumably the pipeline forwards it.
    training: bool = True
    # File name of the persisted template miner inside the results folder.
    model_name: str = 'template_miner.joblib'

    def load(self, config):
        """Overwrite the defaults with the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class NegativeSkipgramLayerConfig:
    """Settings for NegativeSkipgramLayer."""
    # Context window passed to the Keras skip-gram generator.
    window_size: int = 2
    # Intended number of negative samples per positive pair.
    # NOTE(review): the layers in this notebook look `num_neg_sampling` up
    # on the global config object; confirm which one is authoritative.
    num_neg_sampling: int = 10
    # True: load vocab/targets/contexts/labels from joblib files instead of computing them.
    load_model: bool = True
    # True: persist the freshly computed vocab/targets/contexts/labels.
    save_model: bool = False
    # NOTE(review): not read by the layer in this notebook.
    training: bool = True

    def load(self, config):
        """Overwrite the defaults with the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class W2VLayerConfig:
    """Settings for Word2VecEmbeddingLayer."""
    # Number of epochs passed to Keras `fit`.
    epochs: int = 25
    # Batch size of the training dataset.
    batch_size: int = 2048
    # True: load a saved Keras model instead of building a new one.
    load_model: bool = True
    # True: save the model plus vectors.tsv / metadata.tsv after the call.
    save_model: bool = False
    # NOTE(review): the layer takes `training` as a call() argument and does
    # not read this field itself; presumably the pipeline forwards it.
    training: bool = True
    # Name of the saved model inside the results folder (a string, not a flag).
    model_name: str = 'word2vec'

    def load(self, config):
        """Overwrite the defaults with the section of ``config`` named after this class."""
        set_attributes(self, config)


class PreprocessingPipelineConfig:
    """Bundle of all per-layer configuration objects of the pipeline.

    Every attribute starts with its dataclass defaults; ``load`` overrides
    them from a YAML file whose top-level keys are the config class names.
    """

    def __init__(self):
        self.PreprocessingGlobalConfig = PreprocessingGlobalConfig()
        self.PhraseCaptureLayerConfig = PhraseCaptureLayerConfig()
        self.TextClusteringLayerConfig = TextClusteringLayerConfig()
        # Historical (misspelled) attribute name, kept for existing callers.
        self.NegativeSkipgrameLayerConfig = NegativeSkipgramLayerConfig()
        # Correctly spelled alias pointing at the same object.
        self.NegativeSkipgramLayerConfig = self.NegativeSkipgrameLayerConfig
        self.W2VLayerConfig = W2VLayerConfig()

    def load(self, path):
        """Override the defaults from the YAML file at ``path``.

        A missing file, or a file without any sections, is logged as a
        warning and leaves the defaults in place.

        :param path: location of the YAML configuration file
        :return: None
        """
        try:
            with open(path) as f:
                # NOTE(review): FullLoader can build more than plain data;
                # if the file may come from an untrusted source, switch to
                # yaml.safe_load.
                preprocessing_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        if not isinstance(preprocessing_config, dict):
            # An empty file parses to None; there is nothing to apply.
            logger.warning('No configuration sections found in ' + str(path))
            return None

        sections = (
            self.PreprocessingGlobalConfig,
            self.PhraseCaptureLayerConfig,
            self.TextClusteringLayerConfig,
            self.NegativeSkipgrameLayerConfig,
            self.W2VLayerConfig,
        )
        for section in sections:
            section.load(preprocessing_config)

### Tokenizer

In [12]:
class PrimeTokenizer:
    """WordPiece sub-word tokenizer for log lines.

    Wraps a ``tokenizers.Tokenizer`` configured with NFD + lowercase +
    accent stripping, whitespace pre-tokenisation, ``[CLS]``/``[SEP]``
    post-processing and fixed-length padding/truncation.
    """

    # The post-processor below hard-codes [CLS] as id 1 and [SEP] as id 2,
    # i.e. it assumes ids follow the order of this list.
    SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    VOCAB_SIZE = 153411
    # Every encoded sequence is padded / truncated to this many tokens.
    MAX_LENGTH = 200

    def __init__(self):
        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        self.prime_tokenizer.pre_tokenizer = Whitespace()

        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", 1),
                ("[SEP]", 2),
            ],
        )

        self.trainer = self._build_trainer()

        self.prime_tokenizer.decoder = decoders.WordPiece()
        # NOTE(review): padding relies on the library's default pad id;
        # confirm it matches the id of "[PAD]" in the trained vocabulary.
        self.prime_tokenizer.enable_padding(length=self.MAX_LENGTH)
        self.prime_tokenizer.enable_truncation(self.MAX_LENGTH)

    @classmethod
    def _build_trainer(cls):
        """Create the WordPiece trainer used by ``train``."""
        return WordPieceTrainer(
            vocab_size=cls.VOCAB_SIZE, special_tokens=list(cls.SPECIAL_TOKENS)
        )

    def __getstate__(self):
        """Pickle everything except the trainer.

        The trainer object cannot be pickled (dumping this class raised
        "cannot pickle 'tokenizers.trainers.WordPieceTrainer' object"); it
        holds no learned state and is rebuilt on load.
        """
        state = self.__dict__.copy()
        state.pop('trainer', None)
        return state

    def __setstate__(self, state):
        """Restore the pickled attributes and recreate the trainer."""
        # Assumes the wrapped Tokenizer itself is picklable - TODO confirm
        # for the installed tokenizers version.
        self.__dict__.update(state)
        self.trainer = self._build_trainer()

    def text_to_sequence(self, input_) -> List["tokenizers.Encoding"]:
        """Encode a single string, or a list of strings as one batch.

        :return: a list of encodings for list input, otherwise one encoding
        """
        if isinstance(input_, list):
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_) -> List[str]:
        """Decode token ids back to text.

        A list of id sequences is decoded as a batch and yields a list of
        strings; a single (flat) id sequence yields one string.
        """
        is_batch = (isinstance(input_, list)
                    and len(input_) > 0
                    and isinstance(input_[0], (list, tuple)))
        if is_batch:
            return self.prime_tokenizer.decode_batch(input_)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        """Train the tokenizer on an iterable of strings and save it as JSON.

        The trained tokenizer is written to
        ``SOURCE + "/results/prime_tokenizer.json"``.
        """
        # tqdm only reports progress; the strings pass through unchanged.
        progress = tqdm(iterable=iter(data))
        self.prime_tokenizer.train_from_iterator(iter(progress), self.trainer)
        self.prime_tokenizer.save(SOURCE + "/results/prime_tokenizer.json")

    def get_tokenizer(self) -> "Tokenizer":
        """Return the wrapped ``tokenizers.Tokenizer``."""
        return self.prime_tokenizer

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary reported by the wrapped tokenizer."""
        return self.prime_tokenizer.get_vocab()

    def get_vocab_size(self) -> int:
        """Return the vocabulary size reported by the wrapped tokenizer."""
        return self.prime_tokenizer.get_vocab_size()

### Generic Save Model

In [13]:
def save_model(model, path):
    """Persist ``model`` at ``path`` with joblib.

    A regular file already present at ``path`` is deleted first; anything
    else found there (for example a directory) is left alone before the
    dump is attempted.
    """
    stale_file_present = os.path.isfile(path)
    if stale_file_present:
        os.remove(path)

    joblib.dump(model, path)

### Standardize Logs

In [14]:
def standardize_logs(logs: pd.DataFrame) -> pd.DataFrame:
    """Strip timestamps and collapse whitespace runs in the ``log`` column.

    Each ``YYYY-MM-DD[T ]HH:MM:SS`` timestamp (together with its trailing
    ``.mmm`` / ``,mmm`` fraction or whitespace character) and each run of
    two or more whitespace characters is replaced by a single space.

    :param logs: frame with a string column named ``log``
    :return: a new frame; the frame passed in is left unmodified, so
        re-running a cell never sees half-processed input
    """
    cleaned = logs['log'].replace(
        to_replace=r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))|(?:\s{2,})',
        value=' ',
        regex=True)

    # assign() builds a new frame instead of writing into the caller's one
    # (which also avoids SettingWithCopyWarning on sliced frames).
    return logs.assign(log=cleaned)

### PhraseCaptureLayer

In [15]:
class PhraseCaptureLayer(tf.keras.layers.Layer):
    """Joins frequently co-occurring tokens into phrases using a gensim
    ``Phrases`` model.

    Depending on the configuration, the model is either loaded from
    ``SOURCE + path + model_name`` with joblib or created fresh.
    """

    def __init__(self,
                 config: PhraseCaptureLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(PhraseCaptureLayer, self).__init__()
        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model:
            self.phrase_model = joblib.load(SOURCE +
                                            self.path +
                                            self.model_name)
        else:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)

    def call(self, corpus, training):
        """Apply (and optionally update) the phrase model.

        :param corpus: frame-like object whose ``'log'`` entry iterates
            over log strings
        :param training: when truthy the model's vocabulary is extended
            with this corpus; otherwise the model is frozen before use
        :return: list of strings, one per log, tokens re-joined with
            single spaces
        """
        split_corpus = [log.split(' ') for log in corpus['log']]

        if training:
            self.phrase_model.add_vocab(split_corpus)
        else:
            # After the first inference call the stored model is already
            # the frozen object, which may not expose freeze() any more;
            # only freeze when the method is available.
            freeze = getattr(self.phrase_model, 'freeze', None)
            if freeze is not None:
                self.phrase_model = freeze()

        if self.save_model:
            save_model(self.phrase_model, SOURCE + self.path + self.model_name)

        corpus_with_phrases = self.phrase_model[split_corpus]
        return [' '.join(tokenized_log) for tokenized_log in corpus_with_phrases]

### TextClusteringLayer

In [16]:
class TextClusteringLayer(tf.keras.layers.Layer):
    """Replaces each log line by the template of its drain3 cluster.

    Depending on the configuration, the template miner is either loaded
    from ``SOURCE + path + model_name`` with joblib or created fresh.
    """

    def __init__(self,
                 config: TextClusteringLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(TextClusteringLayer, self).__init__()
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model:
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    def _template_for(self, log):
        """Return the template for one log line, learning it if unknown."""
        cluster = self.template_miner.match(log)
        if cluster is not None:
            return cluster.get_template()
        # No existing cluster matches: let the miner learn the line.
        result = self.template_miner.add_log_message(log)
        # add_log_message is expected to report the template in a result
        # dict; a cluster-like return value is accepted as well.
        if isinstance(result, dict):
            return result['template_mined']
        return result.get_template()

    def call(self, corpus, training):
        """Map every log line to its whitespace-normalised template.

        :param corpus: iterable of log strings; it is not modified
        :param training: when truthy, all lines are first fed to the miner
            (and the miner is saved if configured) before templates are
            looked up, so the returned templates reflect the whole corpus
        :return: list of template strings, one per log line
        """
        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

        templates = [self._template_for(log) for log in corpus]
        return [re.sub(pattern=r' +',
                       repl=' ',
                       string=template) for template in templates]

### NegativeSkipgramLayer

In [17]:
@dataclass
class NSLBundle:
    """Output of NegativeSkipgramLayer, consumed by Word2VecEmbeddingLayer."""
    # Mapping from token id to token text (id 0 is '<pad>').
    vocab: dict
    # Target token id of every positive skip-gram pair.
    targets: list
    # Per pair: the true context id followed by the sampled negative ids.
    contexts: list
    # Per pair: a 1 for the true context followed by 0s for the negatives.
    labels: list


class NegativeSkipgramLayer(tf.keras.layers.Layer):
    """Builds (target, context, label) training triples for Word2Vec using
    skip-grams with negative sampling.

    The triples are either loaded from joblib files in the results folder
    or computed from the corpus, depending on the configuration.
    """

    def __init__(self,
                 config: NegativeSkipgramLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(NegativeSkipgramLayer, self).__init__()
        self.vocab_size = 0
        self.vectorized_logs, self.corpus = [], []
        self.targets, self.contexts, self.labels = [], [], []
        self.vocab = {}
        self.embedding_dim = global_config.embed_size
        self.window_size = config.window_size
        self.load_data = config.load_model
        self.save_data = config.save_model
        # A value set on the global config still wins, but the global
        # dataclass does not declare this field, so fall back to the
        # layer's own setting instead of raising AttributeError.
        self.num_neg_sampling = getattr(global_config,
                                        'num_neg_sampling',
                                        config.num_neg_sampling)
        self.path = global_config.path

    def collect_vocabulary(self):
        """Vectorise ``self.corpus`` and build the id -> token vocabulary."""
        self.vocab[0] = '<pad>'

        # NOTE(review): `log_tokenizer` is not defined in any active cell
        # of this notebook (only in commented-out code); this method needs
        # a Keras-style tokenizer with that name in scope - confirm.
        # TODO: Need to add text to sequence methods (instead of tokenize)
        self.vectorized_logs = log_tokenizer.texts_to_sequences(self.corpus)

        # TODO: Need to add word vocabulary dictionary options
        self.vocab.update({v: k for k, v in log_tokenizer.word_index.items()})
        self.vocab_size = len(self.vocab.keys())

    def find_word_context(self):
        """Append one (target, context, label) triple per positive skip-gram.

        Each context holds the true context id followed by
        ``num_neg_sampling`` sampled negatives; the label marks the true
        one with 1 and the negatives with 0.
        """
        vocab_len = len(self.vocab)

        # Build the sampling table for vocab_size tokens.
        sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_len)

        for sequence in tqdm(self.vectorized_logs, position=0, leave=True):

            positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
                sequence,
                vocabulary_size=vocab_len,
                sampling_table=sampling_table,
                window_size=self.window_size,
                negative_samples=0)

            for target_word, context_word in positive_skip_grams:
                context_class = tf.expand_dims(
                    tf.constant([context_word], dtype='int64'), 1)

                negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                    true_classes=context_class,
                    num_true=1,
                    num_sampled=self.num_neg_sampling,
                    unique=True,
                    range_max=vocab_len,
                    seed=42,
                    name="negative_sampling")

                negative_sampling_candidates = tf.expand_dims(
                    negative_sampling_candidates, 1)

                context = tf.concat([context_class, negative_sampling_candidates], 0)
                label = tf.constant([1] + [0] * self.num_neg_sampling, dtype='int64')

                self.targets.append(target_word)
                self.contexts.append(context)
                self.labels.append(label)

    def call(self, corpus, training):
        """Return the training triples as an ``NSLBundle``.

        :param corpus: list of log strings (ignored when cached data is
            loaded)
        :param training: not used by this layer
        :return: NSLBundle(vocab, targets, contexts, labels)
        """
        if self.load_data:
            try:
                self.vocab = joblib.load(SOURCE + self.path + 'vocab.joblib')
                self.targets = joblib.load(SOURCE + self.path + 'targets.joblib')
                self.contexts = joblib.load(SOURCE + self.path + 'contexts.joblib')
                self.labels = joblib.load(SOURCE + self.path + 'labels.joblib')
            except Exception as e:
                # Best effort, as before: continue with whatever is already
                # held, but report the problem through the notebook logger.
                logger.warning(e)
        else:
            # Start from a clean slate so repeated calls do not keep
            # appending to the results of earlier ones.
            self.vocab = {}
            self.targets, self.contexts, self.labels = [], [], []
            self.corpus = corpus
            self.collect_vocabulary()
            self.find_word_context()

            if self.save_data:
                save_model(self.vocab,
                           SOURCE + self.path + 'vocab.joblib')
                save_model(self.targets,
                           SOURCE + self.path + 'targets.joblib')
                save_model(self.contexts,
                           SOURCE + self.path + 'contexts.joblib')
                save_model(self.labels,
                           SOURCE + self.path + 'labels.joblib')

        return NSLBundle(self.vocab, self.targets, self.contexts, self.labels)

### Word2VecEmbeddingLayer

In [18]:
class Word2VecEmbeddingLayer(tf.keras.layers.Layer):
    """Trains (or loads) the Word2Vec model and exposes its embeddings.

    ``call`` returns a dict mapping each vocabulary token to its embedding
    vector and can also export the model and projector TSV files.
    """

    def __init__(self,
                 config: W2VLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(Word2VecEmbeddingLayer, self).__init__()
        self.embeddings = {}
        self.embedding_dim = global_config.embed_size
        self.buffer_size = global_config.buffer_size
        # The global dataclass does not declare this field; a value set on
        # it still wins, otherwise use the skip-gram layer's class default.
        # NOTE(review): confirm where this setting should live.
        self.num_neg_sampling = getattr(global_config,
                                        'num_neg_sampling',
                                        NegativeSkipgramLayerConfig.num_neg_sampling)
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.batch_size = config.batch_size
        self.epochs = config.epochs
        self.Optimizer = tf.keras.optimizers.Adam()
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model:
            self.Word2Vec = load_model(SOURCE + self.path + self.model_name)
        else:
            self.Word2Vec = None

    def call(self, in_bundle, training):
        """Fit the model if requested and collect the embeddings.

        :param in_bundle: NSLBundle with vocab, targets, contexts, labels
        :param training: when truthy the model is fitted on the bundle
        :return: dict mapping token text to its embedding vector
        """
        vocab = in_bundle.vocab

        if self.Word2Vec is None:
            self.Word2Vec = Word2Vec(len(vocab.keys()), self.embedding_dim, self.num_neg_sampling)
            self.Word2Vec.compile(
                optimizer=self.Optimizer,
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

        if training:
            # The dataset is only needed for fitting, so it is only built here.
            dataset = tf.data.Dataset.from_tensor_slices(
                ((in_bundle.targets, in_bundle.contexts), in_bundle.labels))
            dataset = dataset.shuffle(self.buffer_size).batch(self.batch_size, drop_remainder=True)
            dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
            self.Word2Vec.fit(dataset, epochs=self.epochs)

        weights = self.Word2Vec.get_layer('w2v_embedding').get_weights()[0]

        # vocab maps token id -> token text; the id is the row in `weights`.
        for token_id, token in vocab.items():
            self.embeddings[token] = weights[token_id]

        if self.save_model:
            model_path = SOURCE + self.path + self.model_name
            if os.path.isdir(model_path):
                shutil.rmtree(model_path)
            elif os.path.exists(model_path):
                os.remove(model_path)
            self.Word2Vec.save(model_path)

            vectors_path = SOURCE + self.path + 'vectors.tsv'
            metadata_path = SOURCE + self.path + 'metadata.tsv'
            # Context managers close both files even if a write fails.
            with io.open(vectors_path, 'w', encoding='utf-8') as out_v, \
                    io.open(metadata_path, 'w', encoding='utf-8') as out_m:
                # Pair each token with the row of its own id rather than
                # with its position in the dict.
                for token_id, token in vocab.items():
                    if token_id == 0:
                        continue  # skip 0, it's padding.
                    vec = weights[token_id]
                    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
                    out_m.write(token + "\n")

        self.Word2Vec.summary()
        return self.embeddings

### Word2VecModel

In [19]:
class Word2Vec(tf.keras.models.Model):
    """Skip-gram model: scores a target token against its context tokens.

    The score is the dot product between the target embedding and each
    context embedding, flattened per sample.
    """

    def __init__(self, vocab_size, embedding_dim, num_neg_sampling):
        super().__init__()
        embedding_cls = tf.keras.layers.Embedding

        # One token per sample on the target side.
        self.target_embedding = embedding_cls(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="w2v_embedding")

        # The true context plus the negative samples on the context side.
        self.context_embedding = embedding_cls(
            vocab_size,
            embedding_dim,
            input_length=num_neg_sampling + 1)

        self.dots = tf.keras.layers.Dot(axes=(3, 2))
        self.flatten = tf.keras.layers.Flatten()

    def call(self, pair):
        """Return the flattened target/context dot products for ``pair``."""
        target_ids, context_ids = pair
        target_vectors = self.target_embedding(target_ids)
        context_vectors = self.context_embedding(context_ids)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

### W2V_Pipeline

In [20]:
class W2V_Pipeline(tf.keras.Model):
    """Preprocessing pipeline: standardise logs, capture phrases, then
    train the sub-word tokenizer on the result."""

    def __init__(self, config: PreprocessingPipelineConfig):
        super(W2V_Pipeline, self).__init__()

        self.PCL = PhraseCaptureLayer(config.PhraseCaptureLayerConfig,
                                      config.PreprocessingGlobalConfig)

        self.global_train = config.PreprocessingGlobalConfig.global_training
        # Global training overrides the layer's own training flag.
        self.PCL_train = True if self.global_train else config.PhraseCaptureLayerConfig.training  # noqa

    def call(self, x, tokenizer: PrimeTokenizer):
        """Run the pipeline on ``x`` and train ``tokenizer`` on its output.

        :param x: frame of logs with a ``log`` column
        :param tokenizer: tokenizer to train; ``train`` also saves it as JSON
        """
        x = standardize_logs(x)
        x = self.PCL(x, self.PCL_train)
        tokenizer.train(x)
        tokenizer_path = SOURCE + '/results/tokenizer.joblib'
        try:
            joblib.dump(tokenizer, tokenizer_path)
        except TypeError as e:
            # The wrapper can hold members that cannot be pickled. train()
            # has already written the tokenizer as JSON, so report the
            # problem instead of aborting the whole run at the last step.
            logger.warning('Could not pickle tokenizer to ' + tokenizer_path + ': ' + str(e))
        logger.info('W2V pipeline finished')

In [21]:
container_dataset.head(25)

Unnamed: 0,timestamp,container_name,log,label
1271,2021-01-21T17:19:21.350Z,core.soaesb,"2021-01-21T17:19:12,170 | ERROR | Thread-244 ...",nitf-messaging-bundle-stopped
1486,2021-01-21T17:19:21.350Z,core.soaesb,"2021-01-21T17:19:12,170 | ERROR | Thread-244 ...",nitf-messaging-bundle-stopped
4897,2021-01-21T17:19:21.350Z,core.soaesb,"2021-01-21T17:19:11,906 | INFO | 1]-nio2-thre...",nitf-messaging-bundle-stopped
5465,2021-01-21T17:19:21.350Z,core.soaesb,"2021-01-21T17:19:11,906 | INFO | 1]-nio2-thre...",nitf-messaging-bundle-stopped
6712,2021-01-21T17:46:39.665Z,core.soaesb,"2021-01-21T17:46:31,488 | INFO | cxf-StsThrea...",newscene-bundle-stopped
6713,2021-01-21T17:46:39.665Z,core.soaesb,"2021-01-21T17:46:31,499 | INFO | cxf-StsThrea...",newscene-bundle-stopped
6734,2021-01-21T17:46:39.665Z,core.soaesb,"2021-01-21T17:46:31,646 | INFO | cxf-StsThrea...",newscene-bundle-stopped
6766,2021-01-21T17:34:19.606Z,core.soaesb,"2021-01-21T17:34:13,771 | ERROR | Thread-281 ...",newscene-bundle-stopped
6963,2021-01-21T17:46:39.665Z,core.soaesb,"2021-01-21T17:46:31,499 | INFO | cxf-StsThrea...",newscene-bundle-stopped
7502,2021-01-21T17:34:19.606Z,core.soaesb,"2021-01-21T17:34:15,394 | INFO | 1]-nio2-thre...",newscene-bundle-stopped


## W2V Pipeline Main

In [22]:
# ** Preprocessing **
'''
standardize_logs
'''

# ** Model **
# 1.
# LogTokenEmbedder
'''
Seq = [PCL
       TCL
       NSL
       GT1: W2V] -> {embedding_matrix, vocab}
'''
######

# 2.
# Transformer Stuff
'''
{log, embedding_matrix, vocab} ->
GT2: Transformer -> prediction
'''
# LOG_DIR = SOURCE + 'logs'
# metadata = os.path.join(LOG_DIR, 'metadata.tsv')
# config = projector.ProjectorConfig()

# Load the per-layer settings. load() logs a warning and keeps the
# dataclass defaults when the YAML file is missing.
config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = PreprocessingPipelineConfig()
preprocessing_config.load(config_path)

# --- OLD SUBWORD TOKENIZER ---
# log_tokenizer = Tokenizer(src_path=SOURCE + '/assets/notebooks/demofile2.txt',
#                           model_path=SOURCE + '/assets/notebooks/sentencepiece_model',
#                           num_words=preprocessing_config.PreprocessingGlobalConfig.max_vocab_size)
# w2vp = W2V_Pipeline(preprocessing_config)
# embed_weights = w2vp(container_dataset)

# --- SUBWORD TOKENIZER --
# Runs log standardisation and phrase capture on the full `dataset` (not
# only `container_dataset`), then trains the tokenizer on the result.
prime_tokenizer = PrimeTokenizer()
w2vp = W2V_Pipeline(preprocessing_config)
w2vp(dataset, prime_tokenizer)
# prime_tokenizer.train(dataset['log'])

2021-05-27 16:52:47,241 INFO | collecting all words and their counts
2021-05-27 16:52:47,241 INFO | PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-05-27 16:52:47,524 INFO | PROGRESS: at sentence #10000, processed 201154 words and 14141 word types
2021-05-27 16:52:47,825 INFO | PROGRESS: at sentence #20000, processed 416699 words and 22358 word types
2021-05-27 16:52:48,101 INFO | PROGRESS: at sentence #30000, processed 620573 words and 29247 word types
2021-05-27 16:52:48,368 INFO | PROGRESS: at sentence #40000, processed 817972 words and 35127 word types
2021-05-27 16:52:48,636 INFO | PROGRESS: at sentence #50000, processed 1016814 words and 40970 word types
2021-05-27 16:52:48,912 INFO | PROGRESS: at sentence #60000, processed 1214843 words and 46444 word types
2021-05-27 16:52:49,207 INFO | PROGRESS: at sentence #70000, processed 1412743 words and 51187 word types
2021-05-27 16:52:49,483 INFO | PROGRESS: at sentence #80000, processed 1614224 words and 55846 word t

717348it [00:14, 50980.01it/s]


TypeError: cannot pickle 'tokenizers.trainers.WordPieceTrainer' object

In [23]:
# Spot-check the trained tokenizer on one randomly chosen log line. The
# index is drawn from the actual number of rows, so it can neither run past
# the end of a small dataset nor ignore the tail of a large one.
value = random.randrange(len(dataset))
print(prime_tokenizer.text_to_sequence(dataset['log'].iloc[value]).tokens)

['[CLS]', 'info', '(', 'qtp1752461090', '-', '16', ')', '[', 'c', ':', 'catalog', 's', ':', 'shard1', 'r', ':', 'core_node3', 'x', ':', 'catalog_shard1_replica_n1', ']', 'o', '.', 'a', '.', 's', '.', 'u', '.', 'p', '.', 'logupdateprocessorfactory', '[', 'catalog_shard1_replica_n1', ']', 'webapp', '=/', 'solr', 'path', '=/', 'update', 'params', '={', 'wt', '=', 'javabin', '&', 'version', '=', '2', '}{', 'add', '=[', '799', '##1df', '##2b6', '##81', '##a4', '##753', '##969', '##ed2', '##fa0e', '##f916', '##ec', '(', '16895271934', '##6866', '##5856', ')]}', '0', '12', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

## W2V Dash 

### Supporting Functions

In [24]:
def tree_parser(node, inner_list, outer_list, root_node, depth):
    """Recursively collect root-to-leaf token paths from a prefix tree.

    WARNING: destructive - every leaf that is reached is removed from its
    parent's ``key_to_child_node`` dict, so the tree is consumed.

    :param node: node currently being expanded
    :param inner_list: path accumulated so far (tokens are appended in place)
    :param outer_list: list that receives every finished path
    :param root_node: root of the tree; the walk restarts here after a leaf
    :param depth: used only by the final row filter
    :return: the filtered rows, but only from a frame whose loop observes an
        empty root; otherwise None.
        NOTE(review): results of the recursive calls are discarded, so the
        outermost call may return None even though ``outer_list`` has been
        filled - callers should probably read ``outer_list``; confirm.
    """
    d = node.key_to_child_node  # token -> child node mapping
    # Iterate over a snapshot of the keys: entries are popped below.
    for token in list(d.keys()):
        # Once the root has no children left, every path has been
        # collected: filter the rows and stop.
        if len(root_node.key_to_child_node.keys()) == 0:
            ret_list = []
            for row in outer_list:
                # presumably row[1] is the token-count level of the tree,
                # i.e. the expected path length - TODO confirm
                proper_len = int(row[1])
                if len(row) == proper_len+1 or len(row) + 1 == depth:
                    ret_list.append(row)
            return ret_list
        inner_list.append(token)
        child = d[token]
        if child.key_to_child_node:
            # Inner node: keep descending along the same path.
            tree_parser(child, inner_list, outer_list, root_node, depth)
        else:
            # Leaf: drop it from the tree, record the path and start a
            # fresh path from the root.
            d.pop(token)
            outer_list.append(inner_list)
            inner_list = ['root']
            tree_parser(root_node, inner_list, outer_list, root_node, depth)

In [49]:
def tree_to_list_parser(node):
    """Return every root-to-leaf path of a prefix tree, without recursion.

    Nodes must expose their children as a ``key_to_child_node`` mapping.
    Each path is a list of keys that starts with the label ``"root"``. The
    tree itself is not modified.
    """
    leaf_paths = []
    path = []
    expanded = set()
    stack = [("root", node)]
    while stack:
        # Peek at the entry on top of the stack.
        label, current = stack[-1]
        children = current.key_to_child_node

        # Step onto the node unless it is already the tail of the path.
        if not path or path[-1] != label:
            path.append(label)

        first_visit = current not in expanded
        if first_visit:
            expanded.add(current)

        if first_visit and len(children.keys()) > 0:
            # Inner node seen for the first time: schedule its children.
            stack.extend((key, child) for key, child in children.items())
            continue

        # Leaf, or inner node whose subtree is finished: leave it.
        stack.pop()
        if len(children.keys()) <= 0:
            leaf_paths.append(deepcopy(path))
        path.pop()
    return leaf_paths

In [50]:
def appendSpherical_np(xyz):
    """Return ``xyz`` with spherical coordinates appended as extra columns.

    For an (n, 3) input the result is (n, 6): x, y, z, radius, elevation
    angle measured from the Z-axis down, and azimuth in the XY-plane.
    """
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    planar_sq = x**2 + y**2
    augmented = np.hstack((xyz, np.zeros(xyz.shape)))
    augmented[:, 3] = np.sqrt(planar_sq + z**2)
    # Elevation from the Z-axis down; measured from the XY-plane up it
    # would be arctan2(z, sqrt(planar_sq)) instead.
    augmented[:, 4] = np.arctan2(np.sqrt(planar_sq), z)
    augmented[:, 5] = np.arctan2(y, x)
    return augmented

In [51]:
def get_spherical_coords(xyz):
    """Convert Cartesian points to spherical coordinates.

    Args:
        xyz: 2-D array whose first three columns are x, y and z.

    Returns:
        A float array with the same shape as ``xyz`` whose columns 0, 1 and 2
        hold the radius, the angle measured from the Z-axis, and the azimuth
        in the XY-plane. Any further columns are left at zero.
    """
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    planar_sq = x**2 + y**2

    spherical = np.zeros(shape=xyz.shape)
    spherical[:, 0] = np.sqrt(planar_sq + z**2)
    spherical[:, 1] = np.arctan2(np.sqrt(planar_sq), z)
    spherical[:, 2] = np.arctan2(y, x)
    return spherical

The output of the W2V pipeline is a matrix of size [vocab size x embedding size] 

### Environmental Variables

In [None]:
# -- W2V Dash Environmental Variables -- #

W2V_NEIGHBORS = 20           # n_neighbors used for the NearestNeighbors model
RECURSION_LIMIT = 1_000_000  # value handed to sys.setrecursionlimit
N_PROJ_DIM = 3               # n_components of every dimensionality reduction
DASH_SEED = 0                # random_state shared by the dashboard models

### Generate Projection Data

In [None]:
# -- Generate Data for Word Embeddings Projector -- #

# shape = vocab size x embedding dim size
n_tokens = len(embed_weights)
weights = np.ndarray(shape=(n_tokens, w2v_config["embed_size"]))

# -- Populate Matrix for PCA -- #
# One row per vocabulary entry, in the iteration order of embed_weights.
for idx, weight in enumerate(embed_weights.values()):
    weights[idx, :] = weight

# -- Dimensionality Reduction -- #
# All three reducers are fitted up front so the dashboard can switch between
# them; PCA provides the initial projection.
pca = PCA(random_state=DASH_SEED, n_components=N_PROJ_DIM).fit(weights)
ica = FastICA(random_state=DASH_SEED, n_components=N_PROJ_DIM).fit(weights)
srp = SparseRandomProjection(random_state=DASH_SEED, n_components=N_PROJ_DIM).fit(weights)
reduced_embeddings = pca.transform(weights)

# -- Calculate Nearest Neighbors -- #
model = NearestNeighbors(algorithm='auto', n_neighbors=W2V_NEIGHBORS)
trained_embeddings = model.fit(reduced_embeddings)

# reduced_embeddings is (vocab size x N_PROJ_DIM). Prepend the vocabulary as a
# first column so every projected vector can be traced back to its token.
# Stacking strings with floats yields a string array, hence the numeric
# conversion of the coordinate columns afterwards.
scatter_plot_3d_cols = ['token', 'x1', 'x2', 'x3']
embedding_vocab_arr = np.array(list(embed_weights.keys()))[:, np.newaxis]
named_reduced_embeddings = np.hstack((embedding_vocab_arr, reduced_embeddings))
scatter_plot_3d_df = pd.DataFrame(
    named_reduced_embeddings,
    columns=scatter_plot_3d_cols)
scatter_plot_3d_df = scatter_plot_3d_df.assign(
    x1=pd.to_numeric(scatter_plot_3d_df['x1']),
    x2=pd.to_numeric(scatter_plot_3d_df['x2']),
    x3=pd.to_numeric(scatter_plot_3d_df['x3']))

We build the treemap with the `tree_to_list_parser` function. It walks the
drain3.TemplateMiner.drain.Node structure of our **TextClusteringLayer** (TCL)
with an explicit stack and returns one root-to-leaf path per entry. These paths
are loaded into a pandas dataframe, which the plotly treemap accepts. A column
appended to the tail of the dataframe counts the number of stars
(wildcard masks, `<*>`) present in each row; it defines the colors shown.

### Generate Treemap Data

In [None]:
# Python's default recursion limit is 1000, which is too small for our needs
# (presumably the recursive deepcopy of a deep template tree -- confirm).
sys.setrecursionlimit(RECURSION_LIMIT)

# The root node is the master node of the tree and will be our return point.
# Work on a copy so the miner's own tree is never touched.
root_node = deepcopy(w2vp.TCL.template_miner.drain.root_node)
parsed_tree = tree_to_list_parser(root_node)
parsed_tree_df = pd.DataFrame(data=parsed_tree)

# The returned dataframe has generic columns so we will provide custom labels
n_cols = len(parsed_tree_df.columns)
col_name_list = ['level' + str(idx) for idx in range(n_cols)]
parsed_tree_df.columns = col_name_list

# Without a color column our treemap would just be plain. We color each row by
# the number of drain wildcard masks ('<*>') it contains and store that count
# in a new column named 'sum'.
#
# regex=False is essential: as a regular expression '<*>' means "zero or more
# '<' followed by '>'" and would match every cell that merely contains '>'.
# na=False makes the padding cells of shorter paths count as "no match".
parsed_tree_df['sum'] = parsed_tree_df.apply(
    lambda row: row.str.contains('<*>', regex=False, na=False),
    axis=1).sum(axis=1)

### Dash Variables

In [None]:
# Dark theme for every plotly figure created from here on.
pio.templates.default = "plotly_dark"

# Stylesheet handed to the Dash app.
external_stylesheets_url = 'https://drive.google.com/uc?export=view&id=19OXGQ5iJIjRZD4VEZ-xiVChDmj0-SlSF'  # noqa
external_stylesheets = [external_stylesheets_url]

# Configuration passed to the cache's init_app: memoized results are stored
# on disk below SOURCE.
CACHE_CONFIG = {
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': SOURCE + '/results/dash_cache',
}

### Colors

In [None]:
# Named RGB colors used by the dashboard styles.
color_d = {
    'blue': 'rgb(66, 133, 244)',
    'red': 'rgb(219, 68, 55)',
    'yellow': 'rgb(244, 180, 0)',
    'orange': 'rgb(255, 165, 0)',
    'green': 'rgb(15, 157, 88)',
    'mint': 'rgb(3, 218, 198)',
    'dark mint': 'rgb(1, 135, 134)',
    'dark purple': 'rgb(55, 0, 179)',
    'purple': 'rgb(98, 0, 238)',
}

### Dash Formatting

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #

# Line formatting
# Outline styles shared by the marker definitions below.
scatter_plot_3d_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_selected_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_nonselected_line = {'width': 2, 'color': color_d['dark mint']}
scatter_plot_3d_darker_line = {'width': 2, 'color': color_d['dark purple']}


# Marker formatting
# Default marker for all points.
scatter_plot_3d_marker = {
    'size': 5,
    'line': scatter_plot_3d_line,
    'color': color_d['mint'],
}

# Points that belong to the current neighbor selection.
scatter_plot_3d_selected_marker = {
    'size': 5,
    'color': color_d['mint'],
    'line': scatter_plot_3d_selected_line,
}

# Points outside the selection are faded out.
scatter_plot_3d_nonselected_marker = {
    'size': 5,
    'color': color_d['mint'],
    'opacity': 0.15,
    'line': scatter_plot_3d_nonselected_line,
}

# No fixed color: used when plotly colors the points by cluster label.
scatter_plot_3d_marker_no_color = {
    'size': 5,
    'line': scatter_plot_3d_darker_line,
}

# Larger, half-transparent marker for cluster centers.
scatter_plot_3d_marker_cluster_center = {
    'size': 10,
    'color': color_d['orange'],
    'opacity': 0.5,
    'line': scatter_plot_3d_darker_line,
}

# Point picked in the neighbors table.
scatter_plot_3d_selected_table_marker = {
    'size': 5,
    'color': color_d['yellow'],
    'line': scatter_plot_3d_darker_line,
}


# Style
scatter_plot_3d_style = {'height': '100%', 'width': '100%'}


# ========= #
#  Treemap  #
# ========= #

# Style
treemap_style = {'height': '100%', 'width': '100%'}


# ============ #
#  Data Table  #
# ============ #

# Style
data_table_cell_style = {
    'textAlign': 'left',
    'overflow': 'hidden',
    'textOverflow': 'ellipsis',
    'maxWidth': 0,
    'backgroundColor': 'rgb(20, 20, 20)',
    'color': 'white',
}

data_table_header_style = {'backgroundColor': color_d['purple']}


# ======== #
#  Labels  #
# ======== #

# Style (one dict per label so each can be tuned independently)
clustering_alg_drop_down_label_style = {'color': 'white'}

coordinate_space_drop_down_label_style = {'color': 'white'}

dim_reduction_drop_down_label_style = {'color': 'white'}

### Dash Configuration

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #
# Graph `config` for the 3-D scatter plot.
scatter_plot_3d_config = {'responsive': True}


# ========= #
#  Treemap  #
# ========= #
# Graph `config` for the treemap.
treemap_config = {'responsive': True}

### Dash Dropdown Options

In [None]:
# Each dropdown is described by (label, value) pairs and expanded into the
# list of {'label': ..., 'value': ...} dicts that dcc.Dropdown expects.
clustering_alg_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('KNN', 'KNN'),
        ('GMM', 'GMM'),
        ('Bayesian GMM', 'BGMM'),
        ('Affinity Prop.', 'AP'),
        ('KMEANS', 'KM'),
        ('SVM', 'SVM'),
    )
]

coordinate_space_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('Cartesian', 'CT'),
        ('Spherical', 'SP'),
    )
]

dim_reduction_drop_down_options = [
    {'label': label, 'value': value}
    for label, value in (
        ('PCA', 'PCA'),
        ('ICA', 'ICA'),
        ('LDA', 'LDA'),
        ('Sparse RP', 'SRP'),
        ('Gaussian RP', 'GRP'),
    )
]

### Dash Main

In [None]:
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Cache backing the memoized helper functions below.
cache = Cache()
cache.init_app(app.server, config=CACHE_CONFIG)


# =============== #
#  Cluster Table  #
# =============== #
# One row per vocabulary token; the neighbor lookup indexes into this frame.
table = pd.DataFrame(data=list(embed_weights.keys()), columns=['token'])

# ============= #
#  Scatterplot  #
# ============= #
scatter_plot_3d_fig = px.scatter_3d(
    scatter_plot_3d_df, x='x1', y='x2', z='x3', hover_name='token')

scatter_plot_2d_fig = px.scatter(
    scatter_plot_3d_df, x='x1', y='x2', hover_name='token')

# Both figures share the default marker and a constant uirevision.
scatter_plot_3d_fig.update_traces(marker=scatter_plot_3d_marker)
scatter_plot_3d_fig.layout.uirevision = 1

scatter_plot_2d_fig.update_traces(marker=scatter_plot_3d_marker)
scatter_plot_2d_fig.layout.uirevision = 1


# ========= #
#  Treemap  #
# ========= #
# Hierarchy levels come from the level columns; color encodes the mask count.
treemap_fig = px.treemap(parsed_tree_df, path=col_name_list, color='sum')


# ============ #
#  App Layout  #
# ============ #
# The page is assembled from four panels plus a hidden store.

# -- Option dropdowns -- #
_options_panel = html.Div(
    [
        # -- Clustering Technique Dropdown -- #
        html.Label(
            "Clustering Algorithm (TODO)",
            style=clustering_alg_drop_down_label_style),
        dcc.Dropdown(
            id='cluster-dropdown',
            options=clustering_alg_drop_down_options,
            value='KNN'),

        # -- Coordinate Space Dropdown -- #
        html.Label(
            "Coordinate Space",
            style=coordinate_space_drop_down_label_style),
        dcc.Dropdown(
            id='coord-dropdown',
            options=coordinate_space_drop_down_options,
            value='CT'),

        # -- Dimensionality Reduction Technique Dropdown -- #
        html.Label(
            "Dimensionality Reduction (TODO)",
            style=dim_reduction_drop_down_label_style),
        dcc.Dropdown(
            id='dr-dropdown',
            options=dim_reduction_drop_down_options,
            value='PCA'),
    ],
    className='options-graph-container')

# -- 3d Scatter Plot -- #
# The slider value is forwarded to the clustering store as `damp_value`.
_scatter_panel = html.Div(
    [
        dcc.Graph(
            id='3d_scat',
            figure=scatter_plot_3d_fig,
            config=scatter_plot_3d_config,
            style=scatter_plot_3d_style),
        dcc.Slider(
            id='my-slider',
            min=0.5,
            max=0.9,
            step=0.05,
            value=0.5),
    ],
    className='main-graph-container',
    id='graph_div')

# -- Tree Map -- #
_treemap_panel = html.Div(
    dcc.Graph(
        id='3d_tree',
        figure=treemap_fig,
        config=treemap_config,
        style=treemap_style),
    className='secondary-graph-container',
    id='tree_div')

# -- Neighbors Datatable -- #
# Starts empty; rows are filled in by the point-selection callback.
_neighbors_panel = html.Div(
    children=[
        dash_table.DataTable(
            id='table',
            columns=[{"name": i, "id": i} for i in table.columns],
            data=pd.DataFrame().to_dict('records'),
            style_cell=data_table_cell_style,
            style_header=data_table_header_style,
        ),
    ],
    className='related-graph',
    id='data_table')

app.layout = html.Div(
    [
        _options_panel,
        _scatter_panel,
        _treemap_panel,
        _neighbors_panel,
        # signal value to trigger callbacks
        dcc.Store(id='signal'),
    ],
    id='report-container')


# ============= #
#  Memoization  #
# ============= #

# Table of Contents:
# -----------------------------
# 1. Projection DataFrame
# 2. Coordinates
# 3. Dimensionality Reductions
# 4. Clustering Algorithms
# -----------------------------

# -- 1. Projection DataFrame -- #
@cache.memoize()
def dataframe_store(embeddings):
    """Build the scatter-plot dataframe from a named embedding array.

    Args:
        embeddings: Array with one column per entry of
            ``scatter_plot_3d_cols`` (token followed by three coordinates).

    Returns:
        A DataFrame with those column names whose ``x1``/``x2``/``x3``
        columns are converted to numeric dtype.
    """
    frame = pd.DataFrame(embeddings, columns=scatter_plot_3d_cols)
    for axis_col in ('x1', 'x2', 'x3'):
        frame[axis_col] = pd.to_numeric(frame[axis_col])
    return frame


# -- 2. Coordinates -- #
@cache.memoize()
def coordinate_space_store(value, embeddings):
    """Express ``embeddings`` in the requested coordinate space, with tokens.

    Args:
        value: ``'SP'`` for spherical or ``"CT"`` for Cartesian coordinates.
        embeddings: Reduced embedding array (one row per token).

    Returns:
        ``embedding_vocab_arr`` stacked horizontally with the (possibly
        converted) coordinates, or ``no_update`` for any other ``value``.
    """
    if value == 'SP':
        coordinates = get_spherical_coords(embeddings)
    elif value == "CT":
        coordinates = embeddings
    else:
        return no_update

    # Token column first, coordinates after it.
    return np.hstack((embedding_vocab_arr, coordinates))


# -- 3. Dimensionality Reduction -- #
@cache.memoize()
def dimension_reduct_store(value):
    """Project ``weights`` with the pre-fitted reducer selected by ``value``.

    Args:
        value: ``"PCA"``, ``"ICA"`` or ``"SRP"``.

    Returns:
        The transformed embedding matrix, or ``no_update`` when ``value``
        names no available reducer.
    """
    if value == "PCA":
        return pca.transform(weights)
    if value == "ICA":
        return ica.transform(weights)
    if value == "SRP":
        return srp.transform(weights)
    return no_update


# -- 4. Clustering Algorithms -- #
@cache.memoize()
def clustering_algo_store(value, damp_value):
    """Create the (unfitted) clustering model selected in the dropdown.

    Args:
        value: Dropdown value: ``"KNN"``, ``"AP"``, ``"KM"``, ``"GMM"`` or
            ``"SVM"``.
        damp_value: Damping factor; only used for affinity propagation.

    Returns:
        A new, unfitted estimator.

    Raises:
        ValueError: If ``value`` names an algorithm that is not implemented
            (e.g. the ``'BGMM'`` dropdown entry). Previously this fell through
            to ``return model`` and failed with an UnboundLocalError.
    """
    # calculate new clustering algorithm
    if value == "KNN":
        model = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
    elif value == "AP":
        model = AffinityPropagation(damping=damp_value, random_state=DASH_SEED)
    elif value == "KM":
        model = KMeans(n_clusters=4)
    elif value == "GMM":
        model = GaussianMixture(n_components=4)
    elif value == "SVM":
        # NOTE(review): SVC is a supervised classifier; confirm that the
        # caller can supply labels when fitting it.
        model = SVC(kernel='poly', degree=3, probability=True, random_state=DASH_SEED)
    else:
        raise ValueError(
            "Unsupported clustering algorithm: {!r}".format(value))
    return model


# =========== #
#  Callbacks  #
# =========== #

# -- Calculate Projection Data -- #
@app.callback(Output('signal', 'data'),
              Input('dr-dropdown', 'value'),
              Input('cluster-dropdown', 'value'),
              Input('coord-dropdown', 'value'),
              Input('my-slider', 'value'))
def compute_coordinate_space(dr_val, cluster_val, coord_val, damp_value):
    """Bundle the current control values into the ``signal`` store.

    Returns:
        ``(dr_val, cluster_val, coord_val, damp_value)`` in exactly this
        order; the point-selection callback reads them by position.
    """
    signal_payload = (dr_val, cluster_val, coord_val, damp_value)
    return signal_payload


# -- Point Selection Mechanics -- #
def _styled_scatter_3d(frame, marker, **px_kwargs):
    """Build a 3-D scatter of ``frame`` (columns x1/x2/x3) with ``marker`` applied."""
    fig = px.scatter_3d(frame, x='x1', y='x2', z='x3', **px_kwargs)
    fig.update_traces(marker=marker)
    return fig


@app.callback(Output("table", "data"),
              Output("3d_scat", "figure"),
              Input('3d_scat', 'clickData'),
              Input("signal", "data"),
              Input("table", "selected_rows"))
def select_point(clickData, value, rows):
    """Update the neighbors table and the 3-D scatter plot.

    Two triggers are handled:

    * a click on a point (KNN only): the nearest neighbors of the clicked
      point are listed in the table and highlighted in the figure;
    * a change of the ``signal`` store: the figure is redrawn for the new
      reduction / coordinate space / clustering settings.

    Args:
        clickData: Click payload of the scatter plot (may be None).
        value: ``(dr_val, cluster_val, coord_val, damp_value)`` as written by
            ``compute_coordinate_space``.
        rows: Selected row indices of the neighbors table (may be None).

    Returns:
        ``(table_records, figure)``; either element is ``no_update`` when it
        does not change.
    """
    ctx = dash.callback_context
    ids = [c['prop_id'] for c in ctx.triggered]

    # Nothing to draw until the signal store has been populated.
    if not value:
        return no_update, no_update
    dr_val, cluster_val, coord_val, damp_value = value

    # The stores answer with `no_update` for options that are not implemented
    # yet; only real arrays can be processed further.
    embeddings = dimension_reduct_store(dr_val)
    if not isinstance(embeddings, np.ndarray):
        return no_update, no_update
    named_embeddings = coordinate_space_store(coord_val, embeddings)
    if not isinstance(named_embeddings, np.ndarray):
        return no_update, no_update
    df = dataframe_store(named_embeddings)

    # Numeric coordinates in the *displayed* coordinate space. Models are
    # fitted and queried on these so that both always see the same features.
    features = named_embeddings[:, 1:4].astype(float)

    if '3d_scat.clickData' in ids:
        # Neighbor lookup is only defined for the nearest-neighbors model,
        # and only when the click actually carries a point.
        if cluster_val != "KNN" or not clickData or not clickData.get('points'):
            return no_update, no_update

        p = clickData['points'][0]
        query_arr = np.array([p['x'], p['y'], p['z']]).reshape(1, -1)

        knn_model = clustering_algo_store(cluster_val, damp_value).fit(features)
        _, neighbors = knn_model.kneighbors(X=query_arr)
        neighbors_list = neighbors.tolist()[0]
        update = table.iloc[neighbors_list]

        selected_df = df[df.index.isin(neighbors_list)]
        nonselected_df = df.drop(index=neighbors_list)

        ff = _styled_scatter_3d(
            selected_df, scatter_plot_3d_selected_marker, hover_name='token')

        if rows:
            # `selected_rows` holds row indices, not tokens, so translate
            # them before matching against the token column.
            # NOTE(review): the indices are resolved against the neighbor
            # table built for this click -- confirm this is the intended table.
            picked_tokens = [update['token'].iloc[i]
                             for i in rows if 0 <= i < len(update)]
            table_point = selected_df[selected_df['token'].isin(picked_tokens)]
            if not table_point.empty:
                highlight = _styled_scatter_3d(
                    table_point,
                    scatter_plot_3d_selected_table_marker,
                    text='token')
                ff.add_trace(highlight.data[0])

        ff2 = _styled_scatter_3d(
            nonselected_df, scatter_plot_3d_nonselected_marker, hover_name='token')
        ff.add_trace(ff2.data[0])
        ff['layout']['uirevision'] = 1

        return update.to_dict('records'), ff

    if 'signal.data' in ids:
        if cluster_val != "KNN":
            clustering_model = clustering_algo_store(cluster_val, damp_value).fit(features)
            # Predict on the features the model was fitted on (previously the
            # Cartesian embeddings were used even in spherical mode).
            y_pred = clustering_model.predict(features)

            df.insert(0, "Label", y_pred, True)
            ff = _styled_scatter_3d(
                df, scatter_plot_3d_marker_no_color,
                color='Label', hover_name='token')

            # Mixture models expose no `cluster_centers_`.
            if "GMM" not in cluster_val:
                centers = pd.DataFrame(
                    data=clustering_model.cluster_centers_,
                    columns=["x1", "x2", "x3"])
                ff2 = _styled_scatter_3d(
                    centers, scatter_plot_3d_marker_cluster_center)
                ff.add_trace(ff2.data[0])
        else:
            ff = _styled_scatter_3d(df, scatter_plot_3d_marker, hover_name='token')

        ff['layout']['uirevision'] = 1

        return no_update, ff

    return no_update, no_update


# Start the Dash server in JupyterLab mode. host='0.0.0.0' listens on every
# network interface, so the dashboard is reachable from other machines too.
app.run_server(host='0.0.0.0', mode='jupyterlab')

# Transformer Pipeline

In [25]:
@dataclass
class TransformerGlobalConfig:
    """Settings shared by the Transformer model and its sub-layers.

    The values below are defaults; ``load`` applies a loaded configuration
    on top of them.
    """
    # Model width: embedding size and EncoderBlock d_model.
    d_model: int = 512
    # NOTE(review): the Transformer model reads `max_seq_len`, not
    # `max_seq_length` -- confirm which name the YAML file provides.
    max_seq_length: int = 200
    global_training: bool = True
    storage_path: str = '/results/'

    def load(self, config):
        """Apply ``config`` to this instance through ``set_attributes``.

        NOTE(review): ``set_attributes`` is defined outside this section;
        presumably it copies matching entries of ``config`` onto ``self`` --
        confirm.
        """
        set_attributes(self, config)


@dataclass
class BERTLayerConfig:
    """Default hyper-parameters of the BERT layer (a stack of encoder blocks)."""
    # Passed to every EncoderBlock as `num_heads`.
    num_attention_heads: int = 8
    # Number of EncoderBlock instances created by the layer.
    num_encoder_layers: int = 12
    # Width of the first Dense layer of each feed-forward network.
    dff: int = 2048
    max_seq_len: int = 2048
    # Passed to every EncoderBlock as `rate`.
    dropout_rate: float = 0.1
    load_model: bool = False
    save_model: bool = True
    training: bool = True

    def load(self, config):
        """Apply ``config`` to this instance through ``set_attributes``.

        NOTE(review): ``set_attributes`` is defined outside this section;
        presumably it copies matching entries of ``config`` onto ``self`` --
        confirm.
        """
        set_attributes(self, config)


@dataclass
class HitAnomalyLayerConfig:
    """Default hyper-parameters of the HitAnomaly layer (a stack of encoder blocks)."""
    # Passed to every EncoderBlock as `num_heads`.
    num_attention_heads: int = 12
    # Number of EncoderBlock instances created by the layer.
    num_encoder_layers: int = 3
    # Width of the first Dense layer of each feed-forward network.
    dff: int = 2048
    max_seq_len: int = 2048
    # Passed to every EncoderBlock as `rate`.
    dropout_rate: float = 0.1
    load_model: bool = False
    save_model: bool = True
    training: bool = True

    def load(self, config):
        """Apply ``config`` to this instance through ``set_attributes``.

        NOTE(review): ``set_attributes`` is defined outside this section;
        presumably it copies matching entries of ``config`` onto ``self`` --
        confirm.
        """
        set_attributes(self, config)


class TransformerConfig:
    """Container for the three configuration sections of the transformer.

    Attributes:
        _global: settings shared by all layers.
        BERT: settings of the BERT layer.
        HitAnomaly: settings of the HitAnomaly layer.
    """

    def __init__(self):
        self._global = TransformerGlobalConfig()
        self.BERT = BERTLayerConfig()
        self.HitAnomaly = HitAnomalyLayerConfig()

    def load(self, path):
        """Read the YAML file at ``path`` and hand it to every section.

        A missing file is logged as a warning and leaves all defaults
        untouched. Always returns None.
        """
        try:
            # NOTE(review): yaml.FullLoader is more permissive than the safe
            # loader; prefer yaml.safe_load if this file can ever come from an
            # untrusted source.
            with open(path) as stream:
                parsed_config = yaml.load(stream, Loader=yaml.FullLoader)
        except FileNotFoundError as missing:
            logger.warning(missing)
            return None
        else:
            # Every section receives the same parsed document.
            for section in (self._global, self.BERT, self.HitAnomaly):
                section.load(parsed_config)
        
def set_attributes_from_object(self, *args):
    """Copy every instance attribute of each object in ``args`` onto ``self``.

    Objects are processed in order, so an attribute present on several
    objects ends up with the value of the last one.

    Failures are logged as warnings and never raised. Each attribute is
    copied independently: one attribute that cannot be set no longer
    prevents all following attributes (and objects) from being copied.

    Args:
        self: Target object that receives the attributes.
        *args: Source objects whose instance attributes are copied.
    """
    for obj in args:
        try:
            # Snapshot the items so copying onto `self` cannot disturb the
            # iteration when source and target are the same object.
            attributes = list(vars(obj).items())
        except Exception as e:
            logger.warning(e)
            continue

        for attr_key, attr in attributes:
            try:
                setattr(self, attr_key, attr)
            except Exception as e:
                logger.warning(e)

## Metric Objects

### Loss Function

In [26]:
def loss_function(real, pred):
    """Mean loss over all positions whose target is not 0.

    Args:
        real: Target labels; positions equal to 0 are excluded.
        pred: Model predictions passed to the global ``loss_object``.

    Returns:
        Sum of the masked losses divided by the number of kept positions.

    NOTE(review): the masking is only element-wise if ``loss_object``
    returns one loss per position -- confirm its reduction setting.
    """
    raw_loss = loss_object(real, pred)

    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding), dtype=raw_loss.dtype)

    masked_loss = raw_loss * keep
    return tf.reduce_sum(masked_loss) / tf.reduce_sum(keep)

### Accuracy Function

In [27]:
def accuracy_function(real, pred):
    """Accuracy over all positions whose target is not 0.

    Args:
        real: Target labels; positions equal to 0 are excluded.
        pred: Class scores; the predicted class is the argmax along axis 1.

    Returns:
        Number of correct, non-excluded predictions divided by the number of
        non-excluded positions (float32).
    """
    real = tf.convert_to_tensor(real)

    # tf.argmax yields int64 while tf.equal requires identical dtypes; the
    # labels in this notebook are float32, so align the prediction dtype.
    predicted = tf.cast(tf.argmax(pred, axis=1), real.dtype)
    accuracies = tf.equal(real, predicted)

    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)

    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

### Custom Learning Rate Schedule

In [28]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    The rate is ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``:
    it grows linearly for ``warmup_steps`` steps and then decays with the
    inverse square root of the step.
    """

    def __init__(self, d_model: int, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        # Stored as a float32 tensor so it can be used directly in rsqrt.
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass the step as an integer tensor, which rsqrt
        # does not accept; work in float32 throughout.
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

## Pipeline Objects

### PositionalEncodingLayer

In [29]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sinusoidal position information to its input.

    A table of shape ``(1, max_steps, max_dims)`` is precomputed once: even
    feature indices hold sines, odd indices hold cosines.
    """

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        # max_dims must be even: round an odd value up.
        max_dims += max_dims % 2
        positions, pair_index = np.meshgrid(
            np.arange(max_steps), np.arange(max_dims // 2))
        angles = (positions / 10000 ** (2 * pair_index / max_dims)).T

        table = np.empty((1, max_steps, max_dims))
        table[0, :, 0::2] = np.sin(angles)
        table[0, :, 1::2] = np.cos(angles)
        self.positional_embedding = tf.constant(table.astype(self.dtype))

    def call(self, inputs):
        """Return ``inputs`` plus the table cropped to the last two input dims."""
        input_shape = tf.shape(inputs)
        n_steps, n_dims = input_shape[-2], input_shape[-1]
        return inputs + self.positional_embedding[:, :n_steps, :n_dims]

### EncoderBlock

In [30]:
class EncoderBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Model width; also the output width of the feed-forward net.
        num_heads: Number of attention heads (``key_dim = d_model // num_heads``).
        dff: Width of the hidden feed-forward layer.
        rate: Dropout rate, used for the attention layer and both dropouts.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dff: int,
        rate=0.1):
        super(EncoderBlock, self).__init__()

        # `rate` now also drives the attention dropout (it used to be a
        # hard-coded 0.1, which equals the default).
        self.multi_headed_attention = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            dropout=rate,
            attention_axes=(1,))

        # NOTE(review): the output layer also applies ReLU; confirm that this
        # is intended.
        self.feed_forward_network = Sequential([
            Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
            Dense(d_model, activation='relu')  # (batch_size, seq_len, d_model)
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, mask=None, training=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor, used as both query and value of the attention.
            mask: Optional attention mask forwarded to the attention layer
                (it was silently ignored before). ``None`` disables masking.
            training: Dropout switch. When left unspecified (``None``) the
                two dropout layers stay active, exactly as before; pass
                ``training=False`` for deterministic inference.

        Returns:
            ``(output, attention_weights)`` as tensors.
        """
        apply_dropout = True if training is None else training

        # (1) - Attention Score
        attn_output, attn_weights = self.multi_headed_attention(
            x,
            x,
            attention_mask=mask,
            return_attention_scores=True)  # (batch_size, input_seq_len, d_model)

        # (2) - Add & Normalize
        attn_output = self.dropout1(attn_output, training=apply_dropout)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        # (3) - Feed Forward NN
        feed_forward_output = self.feed_forward_network(out1)  # (batch_size, input_seq_len, d_model)

        # (4) - Add & Normalize
        feed_forward_output = self.dropout2(feed_forward_output, training=apply_dropout)
        out2 = self.layernorm2(out1 + feed_forward_output)  # (batch_size, input_seq_len, d_model)

        return tf.convert_to_tensor(out2), tf.convert_to_tensor(attn_weights)

### BERT

In [31]:
class BERTLayer(Layer):
    """Stack of encoder blocks used for the log sequence embedding.

    All attributes of ``global_config`` and ``config`` are copied onto the
    layer; ``d_model``, ``num_attention_heads``, ``dff``, ``dropout_rate``
    and ``num_encoder_layers`` are then used to build the blocks.
    """

    def __init__(
        self,
        global_config: TransformerGlobalConfig,
        config: BERTLayerConfig):
        super(BERTLayer, self).__init__()

        set_attributes_from_object(
            self,
            global_config,
            config)

        self.bert_layer_blocks = [EncoderBlock(
            self.d_model,
            self.num_attention_heads,
            self.dff,
            rate=self.dropout_rate) for _ in range(self.num_encoder_layers)]

    def call(self, input_: tf.tuple, **kwargs):
        """Pass the input through all encoder blocks in sequence.

        Args:
            input_: Sequence whose first element is the encoder input; no
                padding mask is applied at the moment.

        Returns:
            ``(enc_output, attention)`` of the last block. With zero blocks
            the input is returned unchanged and ``attention`` is None.
        """
        enc_output = input_[0]
        attention = None
        encoding_padding_mask = None

        # BERT for Log Sequence Embedding: each block consumes the output of
        # the previous one (the blocks used to all receive the raw input, so
        # only the last block had any effect).
        for block in self.bert_layer_blocks:
            enc_output, attention = block(enc_output, encoding_padding_mask)
        return enc_output, attention

### HitAnomaly

In [32]:
class HitAnomalyLayer(Layer):
    """Encoder stack that reduces a batch of log encodings to one representation.

    All blocks but the last encode the logs; their mean-pooled output is fed
    through the last block and mean-pooled again.

    Attributes:
        hidden_layer_output: Outputs of every block from the most recent call.
    """

    def __init__(
        self,
        vocab_size: int,
        global_config: TransformerGlobalConfig,
        config: HitAnomalyLayerConfig):
        super(HitAnomalyLayer, self).__init__()

        self.vocab_size = vocab_size
        set_attributes_from_object(
            self,
            global_config,
            config)

        self.encoding_blocks = [EncoderBlock(
            self.d_model,
            self.num_attention_heads,
            self.dff,
            rate=self.dropout_rate
        ) for _ in range(self.num_encoder_layers)]

        self.hidden_layer_output = []

#     @tf.function(jit_compile=True)
    def call(self, input_: tf.tuple, **kwargs):
        """Compute the sequence representation.

        Args:
            input_: Sequence whose first element is the encoder input; no
                padding mask is applied at the moment.

        Returns:
            ``(seq_representation, att)`` where ``att`` are the attention
            weights of the last block.
        """
        encoding_padding_mask = None

        # Keep only the outputs of this call; the list used to grow with
        # every batch and retained all earlier tensors.
        self.hidden_layer_output = []

        # Encoder Block Hidden Layers for Log Encoder
        # (batch_size, inp_seq_len, d_model), (batch_size, class, inp_seq_len, inp_seq_len)
        # Each block consumes the previous block's output. Starting from the
        # input also keeps `enc_output` defined when there is a single block.
        enc_output = input_[0]
        for layer_idx in range(self.num_encoder_layers - 1):
            enc_output, att = self.encoding_blocks[layer_idx](enc_output, encoding_padding_mask)
            self.hidden_layer_output.append(enc_output)

        final_output = tf.reduce_mean(enc_output, axis=1)
        final_output = tf.expand_dims(final_output, axis=0)

        # Last Encoding Block for Log Sequence Representation
        out, att = self.encoding_blocks[self.num_encoder_layers - 1](final_output, encoding_padding_mask)
        self.hidden_layer_output.append(out)

        # Final Pooling Layer
        seq_representation = tf.reduce_mean(out, axis=1)

        return seq_representation, att

### Transformer

In [33]:
class Transformer(Model):
    """Embedding + positional encoding + BERT layer over tokenized logs.

    The HitAnomaly layer is constructed but not yet used in ``call``.
    """

    def __init__(
        self,
        tokenizer: PrimeTokenizer,
        config: TransformerConfig):
        super(Transformer, self).__init__()

        self.vocab_size = tokenizer.get_vocab_size()
        set_attributes_from_object(
            self,
            config._global)

        # The global config declares `max_seq_length`, while this model reads
        # `max_seq_len`. Accept either name so the model can be built whether
        # or not the loaded configuration supplies `max_seq_len`.
        if not hasattr(self, 'max_seq_len'):
            self.max_seq_len = self.max_seq_length

        self.embedding = Embedding(
            self.vocab_size,
            self.d_model,
            input_length=self.max_seq_len)

        self.pos_encoding = PositionalEncoding(
            self.max_seq_len,
            self.d_model)

        self.bert_layer = BERTLayer(
            config._global,
            config.BERT)

        self.hitanomaly_layer = HitAnomalyLayer(
            self.vocab_size,
            config._global,
            config.HitAnomaly)

        #self.dropout = Dropout(rate)

#     @tf.function(jit_compile=True)
    def call(self, input_tuple: tf.tuple, **kwargs):
        """Embed a batch of token ids and encode it with the BERT layer.

        Args:
            input_tuple: Sequence whose first element is the batch of token
                ids; the second element (padding mask) is currently ignored.

        Returns:
            ``(enc_output, attention)`` as returned by the BERT layer.
        """
        log_batch = input_tuple[0]
        encoding_padding_mask = None # input_tuple[1]

        embedding_tensor = self.embedding(log_batch) # (batch_size, input_seq_len, d_model)
        embedding_tensor = self.pos_encoding(embedding_tensor)
        #embedding_tensor = self.dropout(embedding_tensor, training=TRAINING)

        # BERT for Log Sequence Embedding
        # The layer expects a sequence and reads element 0. The former
        # `tf.tuple(embedding_tensor, encoding_padding_mask)` passed the mask
        # as `control_inputs` and, in eager mode, returned the bare tensor, so
        # the layer's `input_[0]` picked the first sample instead of the batch.
        bert_arg = [embedding_tensor, encoding_padding_mask]
        enc_output, attention = self.bert_layer(bert_arg)

        # Encoder Block Hidden Layers for Log Sequence Representation
#         seq_representation, att = self.hitanomaly_layer([enc_output, encoding_padding_mask])

        return enc_output, attention

## Transformer Main

### Batch Processing

In [34]:
def process_all_batches(n_iter, log_labels, batch_size):
    """Tokenize the global ``dataset`` batch by batch.

    Args:
        n_iter: Number of full batches; one extra batch (index ``n_iter``)
            is processed to cover the remaining rows.
        log_labels: Mapping from raw label to encoded label.
        batch_size: Number of rows per batch.

    Returns:
        List of ``(log_batch, labels)`` tuples, one per batch index in
        ``range(n_iter + 1)``.
    """
    return [
        tuple(process_batch(dataset, batch_idx, log_labels, batch_size))
        for batch_idx in range(n_iter + 1)
    ]

def process_batch(dataset: pd.DataFrame,
                  idx: int,
                  labels: dict,
                  batch_size: int) -> tuple:
    """Tokenize one batch of logs and encode its labels.

    Args:
        dataset: Frame with a ``'log'`` and a ``'label'`` column.
        idx: Zero-based batch index.
        labels: Mapping from raw label to encoded label.
        batch_size: Number of rows per batch.

    Returns:
        ``(token_ids, encoded_labels)``, both float32 tensors, for rows
        ``idx * batch_size`` up to (excluding) ``(idx + 1) * batch_size``.
    """
    window = slice(idx * batch_size, (idx + 1) * batch_size)
    batched_data = dataset.iloc[window]

    encodings = prime_tokenizer.text_to_sequence(batched_data['log'].to_list())
    token_ids = [encoding.ids for encoding in encodings]
    encoded_labels = [labels[raw_label] for raw_label in batched_data['label']]

    return (tf.convert_to_tensor(token_ids, dtype=tf.float32),
            tf.convert_to_tensor(encoded_labels, dtype=tf.float32))

### Main (Initialization)

In [35]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

# -- Transformer Model -- #
transformer_config_path = SOURCE + '/assets/notebooks/TransformerConfig.yaml'
transformer_config = TransformerConfig()
transformer_config.load(transformer_config_path)
optimus_prime = Transformer(prime_tokenizer, transformer_config)

t_config = transformer_config._global

# -- Pipeline Info -- #
# NOTE(review): the log count comes from `container_dataset` while labels and
# batches are built from `dataset` -- confirm both describe the same rows.
n_logs = len(container_dataset.index)
n_iter = n_logs // t_config.batch_size
remainder = n_logs % t_config.batch_size
attns = []

# -- Labels -- #
label_unique = dataset['label'].unique()
lbp = LabelEncoder().fit(label_unique)
binary_labels = lbp.transform(label_unique)

# Raw label -> encoded label
log_labels = dict(zip(label_unique, binary_labels))

# -- Data Batches -- #
batched_dataset = process_all_batches(n_iter, log_labels, t_config.batch_size)

# -- Model Metrics -- #
learning_rate = CustomSchedule(t_config.d_model)
epoch_loss = Mean(name='train_loss')
epoch_accuracy = Mean(name='train_accuracy')
# reduction='none' keeps one loss value per element, which the masked
# averaging in loss_function relies on. With the default reduction the loss
# is already a scalar and the padding mask has no effect.
# NOTE(review): from_logits=True expects raw scores, but `s1` below ends in a
# Softmax layer -- confirm which one is intended.
loss_object = SparseCategoricalCrossentropy(from_logits=True, reduction='none')
optimizer = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# -- Classification Step Layers -- #
add_att_layer = AdditiveAttention()
softmax = Softmax()
s1 = Sequential([
    Dense(t_config.batch_size, activation=t_config.activation),
    Dense(4, activation=t_config.activation),
    Softmax()
])

# -- Checkpoints -- #
# Every other path in this notebook is built as SOURCE + '/...', so SOURCE has
# no trailing separator and SOURCE + "checkpoints/" pointed to a sibling
# directory named "<SOURCE>checkpoints". Join the path properly.
# Checkpoints written to the old location will not be found here.
checkpoint_path = os.path.join(SOURCE, "checkpoints")
checkpoint = Checkpoint(step=tf.Variable(1), transformer=optimus_prime, optimizer=optimizer)
checkpoint_manager = CheckpointManager(checkpoint, checkpoint_path, max_to_keep=5)

# tf.debugging.set_log_device_placement(True)
writer = tf.summary.create_file_writer(SOURCE + t_config.logdir)

1 Physical GPUs, 1 Logical GPU


### TrainStep

In [36]:
# Input signature intended for a tf.function-compiled train_step:
# a 2-D int32 batch of token ids and a 1-D int8 vector of labels.
# NOTE(review): process_batch currently emits float32 tensors for both --
# align the dtypes before enabling the decorator.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None,), dtype=tf.int8),
]

# @tf.function(input_signature=train_step_signature)#, experimental_compile=True)
def train_step(log_batch: tf.Tensor, 
               labels: tf.Tensor):
    
    transformer_input = tf.tuple([
        log_batch,  # <tf.Tensor: shape=(batch_size, max_seq_len), dtype=float32>
        None  # <tf.Tensor: shape=(batch_size, num_classes), dtype=float32>
    ])
    
    with tf.GradientTape() as tape:
        Rs, acc = optimus_prime(transformer_input)
#         a_s = add_att_layer([Rs, Rs])
#         y = softmax(a_s * Rs)
#         print(a_s.shape)
        # y = Rs
#         loss = tf.py_function(loss_function, [labels, y], tf.float32)
#         pred = s1(y)
#         labels = tf.cast(labels, tf.int8)
    # Optimize the model
#     grads = tape.gradient(loss, optimus_prime.trainable_variables)
#     optimizer.apply_gradients(zip(grads, optimus_prime.trainable_variables))

#     acc = accuracy_function(labels, pred)

    # Tracking Progress
#     epoch_loss.update_state(loss)  # Adding Batch Loss
#     epoch_accuracy.update_state(acc)

    return Rs, acc

### Main (Training)

In [37]:
attentions = []

# One pass over `batched_dataset` per epoch. `train_step` currently performs only the
# forward pass, so the loss / accuracy shown in the progress bar stay at their reset value.
for epoch in range(t_config.epoch):

    start = time.time()

    # Start every epoch with empty running metrics.
    for metric in (epoch_loss, epoch_accuracy):
        metric.reset_states()

    dataset_iter = iter(batched_dataset)

    t = tqdm(
        range(n_iter),
        desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%}".format(0, 0, 0),
        position=0,
        leave=True,
    )
    for _ in t:
        # Each batch is indexable: element 0 is the model input, element 1 the labels.
        batch = next(dataset_iter)
        log_batch, labels = batch[0], batch[1]

        # Disabled profiling hooks (wrap the call below when needed):
        #   tf.summary.trace_on()
        #   tf.profiler.experimental.start(SOURCE + t_config.logdir)
        #   with writer.as_default(): ...
        #       tf.summary.trace_export(name="training_trace", step=0,
        #                               profiler_outdir=SOURCE + t_config.logdir)
        #   tf.profiler.experimental.stop()
        #   tf.summary.trace_off()
        Rs, acc = train_step(log_batch, labels)
        # attentions.append((Rs, acc))

        # Disabled checkpointing:
        #   checkpoint.step.assign_add(1)
        #   if int(checkpoint.step) % 10 == 0:
        #       save_path = checkpoint_manager.save()

        # Show the running epoch metrics in the progress bar.
        progress_text = "Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%} ".format(
            epoch,
            epoch_loss.result(),
            epoch_accuracy.result(),
        )
        t.set_description(desc=progress_text)
        t.refresh()

Epoch: 000, Loss: 0.000, Accuracy: 0.000%:   0%|          | 0/574 [00:00<?, ?it/s]

2021-05-27 16:54:54,811 INFO | INITIAL
2021-05-27 16:54:54,811 INFO | (50, 200)
2021-05-27 16:54:55,018 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:55,018 INFO | (50, 200, 512)
2021-05-27 16:54:55,020 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:55,020 INFO | (50, 200, 512)
2021-05-27 16:54:55,021 INFO | BERT LAYER
2021-05-27 16:54:55,022 INFO | (200, 512)
2021-05-27 16:54:55,022 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,022 INFO | (200, 512)
2021-05-27 16:54:55,023 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,024 INFO | (200, 512)
2021-05-27 16:54:55,476 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,477 INFO | (200, 512)
2021-05-27 16:54:55,477 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,478 INFO | (200, 512)
2021-05-27 16:54:55,506 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,506 INFO | (200, 512)
2021-05-27 16:54:55,506 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,507 INFO | (200, 512)
2021-05-27 16:54:55,533 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,534 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   0%|          | 1/574 [00:00<09:26,  1.01it/s]

2021-05-27 16:54:55,800 INFO | INITIAL
2021-05-27 16:54:55,800 INFO | (50, 200)
2021-05-27 16:54:55,808 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:55,809 INFO | (50, 200, 512)
2021-05-27 16:54:55,810 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:55,811 INFO | (50, 200, 512)
2021-05-27 16:54:55,812 INFO | BERT LAYER
2021-05-27 16:54:55,813 INFO | (200, 512)
2021-05-27 16:54:55,813 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,813 INFO | (200, 512)
2021-05-27 16:54:55,814 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,814 INFO | (200, 512)
2021-05-27 16:54:55,820 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,821 INFO | (200, 512)
2021-05-27 16:54:55,821 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,822 INFO | (200, 512)
2021-05-27 16:54:55,827 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,828 INFO | (200, 512)
2021-05-27 16:54:55,828 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,829 INFO | (200, 512)
2021-05-27 16:54:55,835 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,836 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   0%|          | 2/574 [00:01<04:30,  2.12it/s]

2021-05-27 16:54:55,911 INFO | INITIAL
2021-05-27 16:54:55,912 INFO | (50, 200)
2021-05-27 16:54:55,917 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:55,917 INFO | (50, 200, 512)
2021-05-27 16:54:55,919 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:55,919 INFO | (50, 200, 512)
2021-05-27 16:54:55,920 INFO | BERT LAYER
2021-05-27 16:54:55,920 INFO | (200, 512)
2021-05-27 16:54:55,921 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,921 INFO | (200, 512)
2021-05-27 16:54:55,921 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,922 INFO | (200, 512)
2021-05-27 16:54:55,929 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,930 INFO | (200, 512)
2021-05-27 16:54:55,930 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,931 INFO | (200, 512)
2021-05-27 16:54:55,937 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,937 INFO | (200, 512)
2021-05-27 16:54:55,938 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:55,938 INFO | (200, 512)
2021-05-27 16:54:55,943 INFO | BERT LAYER LOOP
2021-05-27 16:54:55,944 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|          | 3/574 [00:01<02:54,  3.28it/s]

2021-05-27 16:54:56,016 INFO | INITIAL
2021-05-27 16:54:56,016 INFO | (50, 200)
2021-05-27 16:54:56,034 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,035 INFO | (50, 200, 512)
2021-05-27 16:54:56,037 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,038 INFO | (50, 200, 512)
2021-05-27 16:54:56,039 INFO | BERT LAYER
2021-05-27 16:54:56,039 INFO | (200, 512)
2021-05-27 16:54:56,040 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,041 INFO | (200, 512)
2021-05-27 16:54:56,041 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,042 INFO | (200, 512)
2021-05-27 16:54:56,049 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,049 INFO | (200, 512)
2021-05-27 16:54:56,050 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,051 INFO | (200, 512)
2021-05-27 16:54:56,060 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,061 INFO | (200, 512)
2021-05-27 16:54:56,062 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,063 INFO | (200, 512)
2021-05-27 16:54:56,069 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,069 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|          | 4/574 [00:01<02:12,  4.30it/s]

2021-05-27 16:54:56,138 INFO | INITIAL
2021-05-27 16:54:56,138 INFO | (50, 200)
2021-05-27 16:54:56,146 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,147 INFO | (50, 200, 512)
2021-05-27 16:54:56,148 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,149 INFO | (50, 200, 512)
2021-05-27 16:54:56,149 INFO | BERT LAYER
2021-05-27 16:54:56,150 INFO | (200, 512)
2021-05-27 16:54:56,150 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,150 INFO | (200, 512)
2021-05-27 16:54:56,151 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,151 INFO | (200, 512)
2021-05-27 16:54:56,157 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,157 INFO | (200, 512)
2021-05-27 16:54:56,157 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,158 INFO | (200, 512)
2021-05-27 16:54:56,164 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,164 INFO | (200, 512)
2021-05-27 16:54:56,165 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,165 INFO | (200, 512)
2021-05-27 16:54:56,170 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,171 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|          | 5/574 [00:01<01:46,  5.36it/s]

2021-05-27 16:54:56,243 INFO | INITIAL
2021-05-27 16:54:56,243 INFO | (50, 200)
2021-05-27 16:54:56,249 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,250 INFO | (50, 200, 512)
2021-05-27 16:54:56,252 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,252 INFO | (50, 200, 512)
2021-05-27 16:54:56,253 INFO | BERT LAYER
2021-05-27 16:54:56,254 INFO | (200, 512)
2021-05-27 16:54:56,254 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,254 INFO | (200, 512)
2021-05-27 16:54:56,255 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,256 INFO | (200, 512)
2021-05-27 16:54:56,263 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,263 INFO | (200, 512)
2021-05-27 16:54:56,264 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,264 INFO | (200, 512)
2021-05-27 16:54:56,270 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,271 INFO | (200, 512)
2021-05-27 16:54:56,271 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,272 INFO | (200, 512)
2021-05-27 16:54:56,278 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,279 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|          | 6/574 [00:01<01:31,  6.21it/s]

2021-05-27 16:54:56,355 INFO | INITIAL
2021-05-27 16:54:56,355 INFO | (50, 200)
2021-05-27 16:54:56,361 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,361 INFO | (50, 200, 512)
2021-05-27 16:54:56,363 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,363 INFO | (50, 200, 512)
2021-05-27 16:54:56,364 INFO | BERT LAYER
2021-05-27 16:54:56,365 INFO | (200, 512)
2021-05-27 16:54:56,365 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,366 INFO | (200, 512)
2021-05-27 16:54:56,366 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,367 INFO | (200, 512)
2021-05-27 16:54:56,372 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,373 INFO | (200, 512)
2021-05-27 16:54:56,373 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,374 INFO | (200, 512)
2021-05-27 16:54:56,381 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,383 INFO | (200, 512)
2021-05-27 16:54:56,384 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,384 INFO | (200, 512)
2021-05-27 16:54:56,390 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,390 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|          | 7/574 [00:01<01:22,  6.88it/s]

2021-05-27 16:54:56,468 INFO | INITIAL
2021-05-27 16:54:56,468 INFO | (50, 200)
2021-05-27 16:54:56,473 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,474 INFO | (50, 200, 512)
2021-05-27 16:54:56,476 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,476 INFO | (50, 200, 512)
2021-05-27 16:54:56,477 INFO | BERT LAYER
2021-05-27 16:54:56,478 INFO | (200, 512)
2021-05-27 16:54:56,479 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,479 INFO | (200, 512)
2021-05-27 16:54:56,480 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,480 INFO | (200, 512)
2021-05-27 16:54:56,486 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,487 INFO | (200, 512)
2021-05-27 16:54:56,487 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,488 INFO | (200, 512)
2021-05-27 16:54:56,493 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,494 INFO | (200, 512)
2021-05-27 16:54:56,495 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,495 INFO | (200, 512)
2021-05-27 16:54:56,500 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,500 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   1%|▏         | 8/574 [00:01<01:15,  7.50it/s]

2021-05-27 16:54:56,575 INFO | INITIAL
2021-05-27 16:54:56,575 INFO | (50, 200)
2021-05-27 16:54:56,582 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,582 INFO | (50, 200, 512)
2021-05-27 16:54:56,584 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,585 INFO | (50, 200, 512)
2021-05-27 16:54:56,586 INFO | BERT LAYER
2021-05-27 16:54:56,586 INFO | (200, 512)
2021-05-27 16:54:56,586 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,587 INFO | (200, 512)
2021-05-27 16:54:56,587 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,588 INFO | (200, 512)
2021-05-27 16:54:56,594 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,595 INFO | (200, 512)
2021-05-27 16:54:56,595 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,596 INFO | (200, 512)
2021-05-27 16:54:56,601 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,602 INFO | (200, 512)
2021-05-27 16:54:56,602 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,603 INFO | (200, 512)
2021-05-27 16:54:56,609 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,610 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 9/574 [00:01<01:10,  8.02it/s]

2021-05-27 16:54:56,681 INFO | INITIAL
2021-05-27 16:54:56,681 INFO | (50, 200)
2021-05-27 16:54:56,686 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,686 INFO | (50, 200, 512)
2021-05-27 16:54:56,688 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,688 INFO | (50, 200, 512)
2021-05-27 16:54:56,689 INFO | BERT LAYER
2021-05-27 16:54:56,690 INFO | (200, 512)
2021-05-27 16:54:56,690 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,690 INFO | (200, 512)
2021-05-27 16:54:56,691 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,692 INFO | (200, 512)
2021-05-27 16:54:56,697 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,697 INFO | (200, 512)
2021-05-27 16:54:56,697 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,698 INFO | (200, 512)
2021-05-27 16:54:56,704 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,704 INFO | (200, 512)
2021-05-27 16:54:56,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,706 INFO | (200, 512)
2021-05-27 16:54:56,714 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,715 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 10/574 [00:01<01:07,  8.41it/s]

2021-05-27 16:54:56,787 INFO | INITIAL
2021-05-27 16:54:56,787 INFO | (50, 200)
2021-05-27 16:54:56,794 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,794 INFO | (50, 200, 512)
2021-05-27 16:54:56,796 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,796 INFO | (50, 200, 512)
2021-05-27 16:54:56,797 INFO | BERT LAYER
2021-05-27 16:54:56,798 INFO | (200, 512)
2021-05-27 16:54:56,798 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,798 INFO | (200, 512)
2021-05-27 16:54:56,799 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,799 INFO | (200, 512)
2021-05-27 16:54:56,805 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,806 INFO | (200, 512)
2021-05-27 16:54:56,806 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,806 INFO | (200, 512)
2021-05-27 16:54:56,813 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,813 INFO | (200, 512)
2021-05-27 16:54:56,814 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,814 INFO | (200, 512)
2021-05-27 16:54:56,820 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,820 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 11/574 [00:02<01:04,  8.73it/s]

2021-05-27 16:54:56,891 INFO | INITIAL
2021-05-27 16:54:56,892 INFO | (50, 200)
2021-05-27 16:54:56,898 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:56,898 INFO | (50, 200, 512)
2021-05-27 16:54:56,899 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:56,900 INFO | (50, 200, 512)
2021-05-27 16:54:56,901 INFO | BERT LAYER
2021-05-27 16:54:56,901 INFO | (200, 512)
2021-05-27 16:54:56,901 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,902 INFO | (200, 512)
2021-05-27 16:54:56,902 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,902 INFO | (200, 512)
2021-05-27 16:54:56,909 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,910 INFO | (200, 512)
2021-05-27 16:54:56,910 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,912 INFO | (200, 512)
2021-05-27 16:54:56,918 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,919 INFO | (200, 512)
2021-05-27 16:54:56,919 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:56,919 INFO | (200, 512)
2021-05-27 16:54:56,925 INFO | BERT LAYER LOOP
2021-05-27 16:54:56,925 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 12/574 [00:02<01:02,  9.01it/s]

2021-05-27 16:54:56,994 INFO | INITIAL
2021-05-27 16:54:56,994 INFO | (50, 200)
2021-05-27 16:54:57,000 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,001 INFO | (50, 200, 512)
2021-05-27 16:54:57,002 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,002 INFO | (50, 200, 512)
2021-05-27 16:54:57,003 INFO | BERT LAYER
2021-05-27 16:54:57,004 INFO | (200, 512)
2021-05-27 16:54:57,004 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,010 INFO | (200, 512)
2021-05-27 16:54:57,013 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,014 INFO | (200, 512)
2021-05-27 16:54:57,021 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,022 INFO | (200, 512)
2021-05-27 16:54:57,023 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,023 INFO | (200, 512)
2021-05-27 16:54:57,029 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,029 INFO | (200, 512)
2021-05-27 16:54:57,030 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,030 INFO | (200, 512)
2021-05-27 16:54:57,035 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,036 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 13/574 [00:02<01:02,  8.95it/s]

2021-05-27 16:54:57,108 INFO | INITIAL
2021-05-27 16:54:57,109 INFO | (50, 200)
2021-05-27 16:54:57,115 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,116 INFO | (50, 200, 512)
2021-05-27 16:54:57,118 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,119 INFO | (50, 200, 512)
2021-05-27 16:54:57,120 INFO | BERT LAYER
2021-05-27 16:54:57,120 INFO | (200, 512)
2021-05-27 16:54:57,121 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,121 INFO | (200, 512)
2021-05-27 16:54:57,121 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,122 INFO | (200, 512)
2021-05-27 16:54:57,130 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,130 INFO | (200, 512)
2021-05-27 16:54:57,131 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,132 INFO | (200, 512)
2021-05-27 16:54:57,137 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,138 INFO | (200, 512)
2021-05-27 16:54:57,139 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,141 INFO | (200, 512)
2021-05-27 16:54:57,147 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,147 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 14/574 [00:02<01:03,  8.86it/s]

2021-05-27 16:54:57,223 INFO | INITIAL
2021-05-27 16:54:57,223 INFO | (50, 200)
2021-05-27 16:54:57,231 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,232 INFO | (50, 200, 512)
2021-05-27 16:54:57,233 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,233 INFO | (50, 200, 512)
2021-05-27 16:54:57,234 INFO | BERT LAYER
2021-05-27 16:54:57,235 INFO | (200, 512)
2021-05-27 16:54:57,235 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,236 INFO | (200, 512)
2021-05-27 16:54:57,236 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,237 INFO | (200, 512)
2021-05-27 16:54:57,243 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,244 INFO | (200, 512)
2021-05-27 16:54:57,246 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,246 INFO | (200, 512)
2021-05-27 16:54:57,253 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,253 INFO | (200, 512)
2021-05-27 16:54:57,254 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,254 INFO | (200, 512)
2021-05-27 16:54:57,260 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,260 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 15/574 [00:02<01:02,  8.98it/s]

2021-05-27 16:54:57,332 INFO | INITIAL
2021-05-27 16:54:57,332 INFO | (50, 200)
2021-05-27 16:54:57,337 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,338 INFO | (50, 200, 512)
2021-05-27 16:54:57,339 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,340 INFO | (50, 200, 512)
2021-05-27 16:54:57,341 INFO | BERT LAYER
2021-05-27 16:54:57,349 INFO | (200, 512)
2021-05-27 16:54:57,350 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,350 INFO | (200, 512)
2021-05-27 16:54:57,351 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,351 INFO | (200, 512)
2021-05-27 16:54:57,356 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,357 INFO | (200, 512)
2021-05-27 16:54:57,357 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,358 INFO | (200, 512)
2021-05-27 16:54:57,364 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,364 INFO | (200, 512)
2021-05-27 16:54:57,365 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,365 INFO | (200, 512)
2021-05-27 16:54:57,370 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,371 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 16/574 [00:02<01:01,  9.02it/s]

2021-05-27 16:54:57,440 INFO | INITIAL
2021-05-27 16:54:57,441 INFO | (50, 200)
2021-05-27 16:54:57,448 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,449 INFO | (50, 200, 512)
2021-05-27 16:54:57,451 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,451 INFO | (50, 200, 512)
2021-05-27 16:54:57,452 INFO | BERT LAYER
2021-05-27 16:54:57,453 INFO | (200, 512)
2021-05-27 16:54:57,453 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,453 INFO | (200, 512)
2021-05-27 16:54:57,454 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,455 INFO | (200, 512)
2021-05-27 16:54:57,460 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,460 INFO | (200, 512)
2021-05-27 16:54:57,461 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,461 INFO | (200, 512)
2021-05-27 16:54:57,467 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,467 INFO | (200, 512)
2021-05-27 16:54:57,467 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,468 INFO | (200, 512)
2021-05-27 16:54:57,473 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,474 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 17/574 [00:02<01:01,  9.12it/s]

2021-05-27 16:54:57,547 INFO | INITIAL
2021-05-27 16:54:57,548 INFO | (50, 200)
2021-05-27 16:54:57,553 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,553 INFO | (50, 200, 512)
2021-05-27 16:54:57,554 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,555 INFO | (50, 200, 512)
2021-05-27 16:54:57,555 INFO | BERT LAYER
2021-05-27 16:54:57,556 INFO | (200, 512)
2021-05-27 16:54:57,556 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,556 INFO | (200, 512)
2021-05-27 16:54:57,557 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,557 INFO | (200, 512)
2021-05-27 16:54:57,564 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,564 INFO | (200, 512)
2021-05-27 16:54:57,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,565 INFO | (200, 512)
2021-05-27 16:54:57,570 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,570 INFO | (200, 512)
2021-05-27 16:54:57,571 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,571 INFO | (200, 512)
2021-05-27 16:54:57,578 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,579 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 18/574 [00:02<01:00,  9.17it/s]

2021-05-27 16:54:57,655 INFO | INITIAL
2021-05-27 16:54:57,656 INFO | (50, 200)
2021-05-27 16:54:57,662 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,663 INFO | (50, 200, 512)
2021-05-27 16:54:57,664 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,665 INFO | (50, 200, 512)
2021-05-27 16:54:57,666 INFO | BERT LAYER
2021-05-27 16:54:57,667 INFO | (200, 512)
2021-05-27 16:54:57,667 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,668 INFO | (200, 512)
2021-05-27 16:54:57,669 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,669 INFO | (200, 512)
2021-05-27 16:54:57,676 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,677 INFO | (200, 512)
2021-05-27 16:54:57,678 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,678 INFO | (200, 512)
2021-05-27 16:54:57,686 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,687 INFO | (200, 512)
2021-05-27 16:54:57,688 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,688 INFO | (200, 512)
2021-05-27 16:54:57,695 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,696 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 19/574 [00:02<01:01,  9.01it/s]

2021-05-27 16:54:57,770 INFO | INITIAL
2021-05-27 16:54:57,771 INFO | (50, 200)
2021-05-27 16:54:57,780 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,781 INFO | (50, 200, 512)
2021-05-27 16:54:57,782 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,784 INFO | (50, 200, 512)
2021-05-27 16:54:57,785 INFO | BERT LAYER
2021-05-27 16:54:57,786 INFO | (200, 512)
2021-05-27 16:54:57,786 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,787 INFO | (200, 512)
2021-05-27 16:54:57,787 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,788 INFO | (200, 512)
2021-05-27 16:54:57,794 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,794 INFO | (200, 512)
2021-05-27 16:54:57,795 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,795 INFO | (200, 512)
2021-05-27 16:54:57,801 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,801 INFO | (200, 512)
2021-05-27 16:54:57,802 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,802 INFO | (200, 512)
2021-05-27 16:54:57,808 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,809 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 20/574 [00:03<01:01,  9.01it/s]

2021-05-27 16:54:57,882 INFO | INITIAL
2021-05-27 16:54:57,883 INFO | (50, 200)
2021-05-27 16:54:57,888 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,888 INFO | (50, 200, 512)
2021-05-27 16:54:57,889 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,890 INFO | (50, 200, 512)
2021-05-27 16:54:57,890 INFO | BERT LAYER
2021-05-27 16:54:57,891 INFO | (200, 512)
2021-05-27 16:54:57,891 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,892 INFO | (200, 512)
2021-05-27 16:54:57,893 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,893 INFO | (200, 512)
2021-05-27 16:54:57,899 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,900 INFO | (200, 512)
2021-05-27 16:54:57,900 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,900 INFO | (200, 512)
2021-05-27 16:54:57,906 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,906 INFO | (200, 512)
2021-05-27 16:54:57,907 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,907 INFO | (200, 512)
2021-05-27 16:54:57,915 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,916 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   4%|▎         | 21/574 [00:03<01:00,  9.20it/s]

2021-05-27 16:54:57,985 INFO | INITIAL
2021-05-27 16:54:57,985 INFO | (50, 200)
2021-05-27 16:54:57,991 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:57,992 INFO | (50, 200, 512)
2021-05-27 16:54:57,993 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:57,993 INFO | (50, 200, 512)
2021-05-27 16:54:57,994 INFO | BERT LAYER
2021-05-27 16:54:57,995 INFO | (200, 512)
2021-05-27 16:54:57,995 INFO | BERT LAYER LOOP
2021-05-27 16:54:57,995 INFO | (200, 512)
2021-05-27 16:54:57,996 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:57,996 INFO | (200, 512)
2021-05-27 16:54:58,003 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,003 INFO | (200, 512)
2021-05-27 16:54:58,004 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,004 INFO | (200, 512)
2021-05-27 16:54:58,009 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,010 INFO | (200, 512)
2021-05-27 16:54:58,010 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,013 INFO | (200, 512)
2021-05-27 16:54:58,018 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,018 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 22/574 [00:03<00:59,  9.28it/s]

2021-05-27 16:54:58,090 INFO | INITIAL
2021-05-27 16:54:58,091 INFO | (50, 200)
2021-05-27 16:54:58,098 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,099 INFO | (50, 200, 512)
2021-05-27 16:54:58,100 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,100 INFO | (50, 200, 512)
2021-05-27 16:54:58,101 INFO | BERT LAYER
2021-05-27 16:54:58,101 INFO | (200, 512)
2021-05-27 16:54:58,101 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,102 INFO | (200, 512)
2021-05-27 16:54:58,102 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,103 INFO | (200, 512)
2021-05-27 16:54:58,109 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,109 INFO | (200, 512)
2021-05-27 16:54:58,110 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,111 INFO | (200, 512)
2021-05-27 16:54:58,118 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,118 INFO | (200, 512)
2021-05-27 16:54:58,119 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,119 INFO | (200, 512)
2021-05-27 16:54:58,124 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,125 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 23/574 [00:03<00:58,  9.40it/s]

2021-05-27 16:54:58,194 INFO | INITIAL
2021-05-27 16:54:58,194 INFO | (50, 200)
2021-05-27 16:54:58,200 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,201 INFO | (50, 200, 512)
2021-05-27 16:54:58,202 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,202 INFO | (50, 200, 512)
2021-05-27 16:54:58,203 INFO | BERT LAYER
2021-05-27 16:54:58,204 INFO | (200, 512)
2021-05-27 16:54:58,204 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,205 INFO | (200, 512)
2021-05-27 16:54:58,206 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,206 INFO | (200, 512)
2021-05-27 16:54:58,211 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,212 INFO | (200, 512)
2021-05-27 16:54:58,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,212 INFO | (200, 512)
2021-05-27 16:54:58,218 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,218 INFO | (200, 512)
2021-05-27 16:54:58,218 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,219 INFO | (200, 512)
2021-05-27 16:54:58,224 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,224 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 24/574 [00:03<00:57,  9.52it/s]

2021-05-27 16:54:58,295 INFO | INITIAL
2021-05-27 16:54:58,296 INFO | (50, 200)
2021-05-27 16:54:58,301 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,301 INFO | (50, 200, 512)
2021-05-27 16:54:58,302 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,303 INFO | (50, 200, 512)
2021-05-27 16:54:58,304 INFO | BERT LAYER
2021-05-27 16:54:58,304 INFO | (200, 512)
2021-05-27 16:54:58,304 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,305 INFO | (200, 512)
2021-05-27 16:54:58,305 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,306 INFO | (200, 512)
2021-05-27 16:54:58,312 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,313 INFO | (200, 512)
2021-05-27 16:54:58,313 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,314 INFO | (200, 512)
2021-05-27 16:54:58,320 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,321 INFO | (200, 512)
2021-05-27 16:54:58,321 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,322 INFO | (200, 512)
2021-05-27 16:54:58,327 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,327 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 25/574 [00:03<00:57,  9.57it/s]

2021-05-27 16:54:58,398 INFO | INITIAL
2021-05-27 16:54:58,399 INFO | (50, 200)
2021-05-27 16:54:58,405 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,405 INFO | (50, 200, 512)
2021-05-27 16:54:58,406 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,407 INFO | (50, 200, 512)
2021-05-27 16:54:58,407 INFO | BERT LAYER
2021-05-27 16:54:58,408 INFO | (200, 512)
2021-05-27 16:54:58,408 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,409 INFO | (200, 512)
2021-05-27 16:54:58,410 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,410 INFO | (200, 512)
2021-05-27 16:54:58,416 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,417 INFO | (200, 512)
2021-05-27 16:54:58,417 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,417 INFO | (200, 512)
2021-05-27 16:54:58,423 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,423 INFO | (200, 512)
2021-05-27 16:54:58,424 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,425 INFO | (200, 512)
2021-05-27 16:54:58,430 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,431 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 26/574 [00:03<00:57,  9.56it/s]

2021-05-27 16:54:58,503 INFO | INITIAL
2021-05-27 16:54:58,504 INFO | (50, 200)
2021-05-27 16:54:58,509 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,509 INFO | (50, 200, 512)
2021-05-27 16:54:58,511 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,511 INFO | (50, 200, 512)
2021-05-27 16:54:58,512 INFO | BERT LAYER
2021-05-27 16:54:58,513 INFO | (200, 512)
2021-05-27 16:54:58,513 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,513 INFO | (200, 512)
2021-05-27 16:54:58,514 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,514 INFO | (200, 512)
2021-05-27 16:54:58,520 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,521 INFO | (200, 512)
2021-05-27 16:54:58,521 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,522 INFO | (200, 512)
2021-05-27 16:54:58,528 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,528 INFO | (200, 512)
2021-05-27 16:54:58,529 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,530 INFO | (200, 512)
2021-05-27 16:54:58,537 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,537 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 27/574 [00:03<00:57,  9.55it/s]

2021-05-27 16:54:58,609 INFO | INITIAL
2021-05-27 16:54:58,610 INFO | (50, 200)
2021-05-27 16:54:58,616 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,616 INFO | (50, 200, 512)
2021-05-27 16:54:58,618 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,618 INFO | (50, 200, 512)
2021-05-27 16:54:58,619 INFO | BERT LAYER
2021-05-27 16:54:58,620 INFO | (200, 512)
2021-05-27 16:54:58,620 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,621 INFO | (200, 512)
2021-05-27 16:54:58,621 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,622 INFO | (200, 512)
2021-05-27 16:54:58,629 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,630 INFO | (200, 512)
2021-05-27 16:54:58,630 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,631 INFO | (200, 512)
2021-05-27 16:54:58,637 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,637 INFO | (200, 512)
2021-05-27 16:54:58,638 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,638 INFO | (200, 512)
2021-05-27 16:54:58,644 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,644 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 28/574 [00:03<00:57,  9.56it/s]

2021-05-27 16:54:58,713 INFO | INITIAL
2021-05-27 16:54:58,714 INFO | (50, 200)
2021-05-27 16:54:58,721 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,722 INFO | (50, 200, 512)
2021-05-27 16:54:58,723 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,723 INFO | (50, 200, 512)
2021-05-27 16:54:58,724 INFO | BERT LAYER
2021-05-27 16:54:58,725 INFO | (200, 512)
2021-05-27 16:54:58,725 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,726 INFO | (200, 512)
2021-05-27 16:54:58,726 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,727 INFO | (200, 512)
2021-05-27 16:54:58,733 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,733 INFO | (200, 512)
2021-05-27 16:54:58,734 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,734 INFO | (200, 512)
2021-05-27 16:54:58,741 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,743 INFO | (200, 512)
2021-05-27 16:54:58,744 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,744 INFO | (200, 512)
2021-05-27 16:54:58,750 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,750 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 29/574 [00:04<00:56,  9.57it/s]

2021-05-27 16:54:58,817 INFO | INITIAL
2021-05-27 16:54:58,817 INFO | (50, 200)
2021-05-27 16:54:58,822 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,823 INFO | (50, 200, 512)
2021-05-27 16:54:58,824 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,824 INFO | (50, 200, 512)
2021-05-27 16:54:58,825 INFO | BERT LAYER
2021-05-27 16:54:58,826 INFO | (200, 512)
2021-05-27 16:54:58,826 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,827 INFO | (200, 512)
2021-05-27 16:54:58,827 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,828 INFO | (200, 512)
2021-05-27 16:54:58,833 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,834 INFO | (200, 512)
2021-05-27 16:54:58,834 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,834 INFO | (200, 512)
2021-05-27 16:54:58,839 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,840 INFO | (200, 512)
2021-05-27 16:54:58,840 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,841 INFO | (200, 512)
2021-05-27 16:54:58,850 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,851 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 30/574 [00:04<00:56,  9.59it/s]

2021-05-27 16:54:58,921 INFO | INITIAL
2021-05-27 16:54:58,922 INFO | (50, 200)
2021-05-27 16:54:58,929 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:58,930 INFO | (50, 200, 512)
2021-05-27 16:54:58,931 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:58,932 INFO | (50, 200, 512)
2021-05-27 16:54:58,932 INFO | BERT LAYER
2021-05-27 16:54:58,933 INFO | (200, 512)
2021-05-27 16:54:58,933 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,934 INFO | (200, 512)
2021-05-27 16:54:58,934 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,934 INFO | (200, 512)
2021-05-27 16:54:58,940 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,941 INFO | (200, 512)
2021-05-27 16:54:58,942 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,942 INFO | (200, 512)
2021-05-27 16:54:58,948 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,949 INFO | (200, 512)
2021-05-27 16:54:58,949 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:58,949 INFO | (200, 512)
2021-05-27 16:54:58,955 INFO | BERT LAYER LOOP
2021-05-27 16:54:58,955 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 31/574 [00:04<00:56,  9.58it/s]

2021-05-27 16:54:59,026 INFO | INITIAL
2021-05-27 16:54:59,026 INFO | (50, 200)
2021-05-27 16:54:59,032 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,033 INFO | (50, 200, 512)
2021-05-27 16:54:59,034 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,035 INFO | (50, 200, 512)
2021-05-27 16:54:59,035 INFO | BERT LAYER
2021-05-27 16:54:59,036 INFO | (200, 512)
2021-05-27 16:54:59,036 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,036 INFO | (200, 512)
2021-05-27 16:54:59,037 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,037 INFO | (200, 512)
2021-05-27 16:54:59,043 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,045 INFO | (200, 512)
2021-05-27 16:54:59,046 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,046 INFO | (200, 512)
2021-05-27 16:54:59,052 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,053 INFO | (200, 512)
2021-05-27 16:54:59,053 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,054 INFO | (200, 512)
2021-05-27 16:54:59,060 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,060 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 32/574 [00:04<00:56,  9.56it/s]

2021-05-27 16:54:59,131 INFO | INITIAL
2021-05-27 16:54:59,131 INFO | (50, 200)
2021-05-27 16:54:59,136 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,137 INFO | (50, 200, 512)
2021-05-27 16:54:59,138 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,138 INFO | (50, 200, 512)
2021-05-27 16:54:59,139 INFO | BERT LAYER
2021-05-27 16:54:59,139 INFO | (200, 512)
2021-05-27 16:54:59,140 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,140 INFO | (200, 512)
2021-05-27 16:54:59,140 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,141 INFO | (200, 512)
2021-05-27 16:54:59,147 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,149 INFO | (200, 512)
2021-05-27 16:54:59,149 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,150 INFO | (200, 512)
2021-05-27 16:54:59,156 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,157 INFO | (200, 512)
2021-05-27 16:54:59,158 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,158 INFO | (200, 512)
2021-05-27 16:54:59,164 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,164 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 33/574 [00:04<00:56,  9.62it/s]

2021-05-27 16:54:59,233 INFO | INITIAL
2021-05-27 16:54:59,234 INFO | (50, 200)
2021-05-27 16:54:59,239 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,240 INFO | (50, 200, 512)
2021-05-27 16:54:59,242 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,242 INFO | (50, 200, 512)
2021-05-27 16:54:59,243 INFO | BERT LAYER
2021-05-27 16:54:59,244 INFO | (200, 512)
2021-05-27 16:54:59,244 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,245 INFO | (200, 512)
2021-05-27 16:54:59,246 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,246 INFO | (200, 512)
2021-05-27 16:54:59,252 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,253 INFO | (200, 512)
2021-05-27 16:54:59,253 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,253 INFO | (200, 512)
2021-05-27 16:54:59,258 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,259 INFO | (200, 512)
2021-05-27 16:54:59,259 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,260 INFO | (200, 512)
2021-05-27 16:54:59,265 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,266 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 34/574 [00:04<00:55,  9.71it/s]

2021-05-27 16:54:59,334 INFO | INITIAL
2021-05-27 16:54:59,334 INFO | (50, 200)
2021-05-27 16:54:59,340 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,340 INFO | (50, 200, 512)
2021-05-27 16:54:59,343 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,344 INFO | (50, 200, 512)
2021-05-27 16:54:59,345 INFO | BERT LAYER
2021-05-27 16:54:59,345 INFO | (200, 512)
2021-05-27 16:54:59,346 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,346 INFO | (200, 512)
2021-05-27 16:54:59,348 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,348 INFO | (200, 512)
2021-05-27 16:54:59,354 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,355 INFO | (200, 512)
2021-05-27 16:54:59,355 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,356 INFO | (200, 512)
2021-05-27 16:54:59,362 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,362 INFO | (200, 512)
2021-05-27 16:54:59,363 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,363 INFO | (200, 512)
2021-05-27 16:54:59,369 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,370 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 35/574 [00:04<00:55,  9.64it/s]

2021-05-27 16:54:59,439 INFO | INITIAL
2021-05-27 16:54:59,440 INFO | (50, 200)
2021-05-27 16:54:59,446 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,446 INFO | (50, 200, 512)
2021-05-27 16:54:59,448 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,448 INFO | (50, 200, 512)
2021-05-27 16:54:59,449 INFO | BERT LAYER
2021-05-27 16:54:59,449 INFO | (200, 512)
2021-05-27 16:54:59,450 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,450 INFO | (200, 512)
2021-05-27 16:54:59,450 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,451 INFO | (200, 512)
2021-05-27 16:54:59,456 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,457 INFO | (200, 512)
2021-05-27 16:54:59,457 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,457 INFO | (200, 512)
2021-05-27 16:54:59,465 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,465 INFO | (200, 512)
2021-05-27 16:54:59,466 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,466 INFO | (200, 512)
2021-05-27 16:54:59,471 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,472 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 35/574 [00:04<00:55,  9.64it/s]

2021-05-27 16:54:59,539 INFO | INITIAL
2021-05-27 16:54:59,539 INFO | (50, 200)
2021-05-27 16:54:59,547 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,547 INFO | (50, 200, 512)
2021-05-27 16:54:59,548 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,549 INFO | (50, 200, 512)
2021-05-27 16:54:59,549 INFO | BERT LAYER
2021-05-27 16:54:59,550 INFO | (200, 512)
2021-05-27 16:54:59,550 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,551 INFO | (200, 512)
2021-05-27 16:54:59,551 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,551 INFO | (200, 512)
2021-05-27 16:54:59,556 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,557 INFO | (200, 512)
2021-05-27 16:54:59,557 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,557 INFO | (200, 512)
2021-05-27 16:54:59,563 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,563 INFO | (200, 512)
2021-05-27 16:54:59,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,564 INFO | (200, 512)
2021-05-27 16:54:59,570 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,570 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▋         | 37/574 [00:04<00:54,  9.84it/s]

2021-05-27 16:54:59,638 INFO | INITIAL
2021-05-27 16:54:59,638 INFO | (50, 200)
2021-05-27 16:54:59,644 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,644 INFO | (50, 200, 512)
2021-05-27 16:54:59,646 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,646 INFO | (50, 200, 512)
2021-05-27 16:54:59,647 INFO | BERT LAYER
2021-05-27 16:54:59,648 INFO | (200, 512)
2021-05-27 16:54:59,648 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,649 INFO | (200, 512)
2021-05-27 16:54:59,649 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,649 INFO | (200, 512)
2021-05-27 16:54:59,655 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,655 INFO | (200, 512)
2021-05-27 16:54:59,656 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,656 INFO | (200, 512)
2021-05-27 16:54:59,662 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,662 INFO | (200, 512)
2021-05-27 16:54:59,663 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,663 INFO | (200, 512)
2021-05-27 16:54:59,670 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,671 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   6%|▋         | 37/574 [00:04<00:54,  9.84it/s]

2021-05-27 16:54:59,737 INFO | INITIAL
2021-05-27 16:54:59,738 INFO | (50, 200)
2021-05-27 16:54:59,744 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,744 INFO | (50, 200, 512)
2021-05-27 16:54:59,746 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,747 INFO | (50, 200, 512)
2021-05-27 16:54:59,747 INFO | BERT LAYER
2021-05-27 16:54:59,748 INFO | (200, 512)
2021-05-27 16:54:59,748 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,749 INFO | (200, 512)
2021-05-27 16:54:59,749 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,750 INFO | (200, 512)
2021-05-27 16:54:59,755 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,755 INFO | (200, 512)
2021-05-27 16:54:59,756 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,756 INFO | (200, 512)
2021-05-27 16:54:59,762 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,762 INFO | (200, 512)
2021-05-27 16:54:59,763 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,763 INFO | (200, 512)
2021-05-27 16:54:59,768 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,769 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 39/574 [00:05<00:54,  9.87it/s]

2021-05-27 16:54:59,840 INFO | INITIAL
2021-05-27 16:54:59,840 INFO | (50, 200)
2021-05-27 16:54:59,847 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,847 INFO | (50, 200, 512)
2021-05-27 16:54:59,849 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,850 INFO | (50, 200, 512)
2021-05-27 16:54:59,851 INFO | BERT LAYER
2021-05-27 16:54:59,852 INFO | (200, 512)
2021-05-27 16:54:59,852 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,853 INFO | (200, 512)
2021-05-27 16:54:59,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,854 INFO | (200, 512)
2021-05-27 16:54:59,861 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,862 INFO | (200, 512)
2021-05-27 16:54:59,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,862 INFO | (200, 512)
2021-05-27 16:54:59,868 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,868 INFO | (200, 512)
2021-05-27 16:54:59,869 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,869 INFO | (200, 512)
2021-05-27 16:54:59,875 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,876 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 40/574 [00:05<00:54,  9.74it/s]

2021-05-27 16:54:59,947 INFO | INITIAL
2021-05-27 16:54:59,947 INFO | (50, 200)
2021-05-27 16:54:59,952 INFO | POST EMBEDDING LAYER
2021-05-27 16:54:59,953 INFO | (50, 200, 512)
2021-05-27 16:54:59,954 INFO | POST POSITIONAL ENCODING
2021-05-27 16:54:59,954 INFO | (50, 200, 512)
2021-05-27 16:54:59,955 INFO | BERT LAYER
2021-05-27 16:54:59,955 INFO | (200, 512)
2021-05-27 16:54:59,956 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,956 INFO | (200, 512)
2021-05-27 16:54:59,956 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,957 INFO | (200, 512)
2021-05-27 16:54:59,963 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,963 INFO | (200, 512)
2021-05-27 16:54:59,964 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,964 INFO | (200, 512)
2021-05-27 16:54:59,969 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,970 INFO | (200, 512)
2021-05-27 16:54:59,970 INFO | MULTIHEADED ATTENTION
2021-05-27 16:54:59,970 INFO | (200, 512)
2021-05-27 16:54:59,975 INFO | BERT LAYER LOOP
2021-05-27 16:54:59,976 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 40/574 [00:05<00:54,  9.74it/s]

2021-05-27 16:55:00,046 INFO | INITIAL
2021-05-27 16:55:00,047 INFO | (50, 200)
2021-05-27 16:55:00,054 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,054 INFO | (50, 200, 512)
2021-05-27 16:55:00,056 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,056 INFO | (50, 200, 512)
2021-05-27 16:55:00,057 INFO | BERT LAYER
2021-05-27 16:55:00,057 INFO | (200, 512)
2021-05-27 16:55:00,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,058 INFO | (200, 512)
2021-05-27 16:55:00,059 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,060 INFO | (200, 512)
2021-05-27 16:55:00,066 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,066 INFO | (200, 512)
2021-05-27 16:55:00,067 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,067 INFO | (200, 512)
2021-05-27 16:55:00,073 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,073 INFO | (200, 512)
2021-05-27 16:55:00,074 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,075 INFO | (200, 512)
2021-05-27 16:55:00,082 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,082 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 42/574 [00:05<00:54,  9.72it/s]

2021-05-27 16:55:00,153 INFO | INITIAL
2021-05-27 16:55:00,154 INFO | (50, 200)
2021-05-27 16:55:00,160 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,160 INFO | (50, 200, 512)
2021-05-27 16:55:00,162 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,162 INFO | (50, 200, 512)
2021-05-27 16:55:00,163 INFO | BERT LAYER
2021-05-27 16:55:00,163 INFO | (200, 512)
2021-05-27 16:55:00,164 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,164 INFO | (200, 512)
2021-05-27 16:55:00,165 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,165 INFO | (200, 512)
2021-05-27 16:55:00,170 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,171 INFO | (200, 512)
2021-05-27 16:55:00,171 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,171 INFO | (200, 512)
2021-05-27 16:55:00,176 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,177 INFO | (200, 512)
2021-05-27 16:55:00,177 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,178 INFO | (200, 512)
2021-05-27 16:55:00,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,186 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 43/574 [00:05<00:55,  9.52it/s]

2021-05-27 16:55:00,266 INFO | INITIAL
2021-05-27 16:55:00,267 INFO | (50, 200)
2021-05-27 16:55:00,271 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,272 INFO | (50, 200, 512)
2021-05-27 16:55:00,273 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,274 INFO | (50, 200, 512)
2021-05-27 16:55:00,275 INFO | BERT LAYER
2021-05-27 16:55:00,275 INFO | (200, 512)
2021-05-27 16:55:00,276 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,276 INFO | (200, 512)
2021-05-27 16:55:00,277 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,277 INFO | (200, 512)
2021-05-27 16:55:00,284 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,285 INFO | (200, 512)
2021-05-27 16:55:00,285 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,286 INFO | (200, 512)
2021-05-27 16:55:00,293 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,294 INFO | (200, 512)
2021-05-27 16:55:00,294 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,295 INFO | (200, 512)
2021-05-27 16:55:00,301 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,302 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 44/574 [00:05<00:56,  9.45it/s]

2021-05-27 16:55:00,375 INFO | INITIAL
2021-05-27 16:55:00,376 INFO | (50, 200)
2021-05-27 16:55:00,383 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,384 INFO | (50, 200, 512)
2021-05-27 16:55:00,385 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,386 INFO | (50, 200, 512)
2021-05-27 16:55:00,387 INFO | BERT LAYER
2021-05-27 16:55:00,387 INFO | (200, 512)
2021-05-27 16:55:00,388 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,388 INFO | (200, 512)
2021-05-27 16:55:00,389 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,389 INFO | (200, 512)
2021-05-27 16:55:00,396 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,397 INFO | (200, 512)
2021-05-27 16:55:00,397 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,397 INFO | (200, 512)
2021-05-27 16:55:00,403 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,403 INFO | (200, 512)
2021-05-27 16:55:00,403 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,404 INFO | (200, 512)
2021-05-27 16:55:00,410 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,410 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 45/574 [00:05<00:56,  9.32it/s]

2021-05-27 16:55:00,486 INFO | INITIAL
2021-05-27 16:55:00,487 INFO | (50, 200)
2021-05-27 16:55:00,492 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,493 INFO | (50, 200, 512)
2021-05-27 16:55:00,495 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,495 INFO | (50, 200, 512)
2021-05-27 16:55:00,497 INFO | BERT LAYER
2021-05-27 16:55:00,497 INFO | (200, 512)
2021-05-27 16:55:00,498 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,498 INFO | (200, 512)
2021-05-27 16:55:00,498 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,499 INFO | (200, 512)
2021-05-27 16:55:00,505 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,505 INFO | (200, 512)
2021-05-27 16:55:00,506 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,506 INFO | (200, 512)
2021-05-27 16:55:00,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,514 INFO | (200, 512)
2021-05-27 16:55:00,515 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,515 INFO | (200, 512)
2021-05-27 16:55:00,522 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,522 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 46/574 [00:05<00:57,  9.15it/s]

2021-05-27 16:55:00,601 INFO | INITIAL
2021-05-27 16:55:00,602 INFO | (50, 200)
2021-05-27 16:55:00,607 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,608 INFO | (50, 200, 512)
2021-05-27 16:55:00,610 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,611 INFO | (50, 200, 512)
2021-05-27 16:55:00,612 INFO | BERT LAYER
2021-05-27 16:55:00,612 INFO | (200, 512)
2021-05-27 16:55:00,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,613 INFO | (200, 512)
2021-05-27 16:55:00,614 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,614 INFO | (200, 512)
2021-05-27 16:55:00,620 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,620 INFO | (200, 512)
2021-05-27 16:55:00,621 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,621 INFO | (200, 512)
2021-05-27 16:55:00,627 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,628 INFO | (200, 512)
2021-05-27 16:55:00,628 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,629 INFO | (200, 512)
2021-05-27 16:55:00,634 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,635 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 47/574 [00:05<00:56,  9.32it/s]

2021-05-27 16:55:00,703 INFO | INITIAL
2021-05-27 16:55:00,703 INFO | (50, 200)
2021-05-27 16:55:00,708 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,709 INFO | (50, 200, 512)
2021-05-27 16:55:00,710 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,711 INFO | (50, 200, 512)
2021-05-27 16:55:00,712 INFO | BERT LAYER
2021-05-27 16:55:00,712 INFO | (200, 512)
2021-05-27 16:55:00,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,713 INFO | (200, 512)
2021-05-27 16:55:00,713 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,714 INFO | (200, 512)
2021-05-27 16:55:00,720 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,720 INFO | (200, 512)
2021-05-27 16:55:00,721 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,721 INFO | (200, 512)
2021-05-27 16:55:00,727 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,727 INFO | (200, 512)
2021-05-27 16:55:00,728 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,728 INFO | (200, 512)
2021-05-27 16:55:00,733 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,734 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 48/574 [00:05<00:56,  9.37it/s]

2021-05-27 16:55:00,808 INFO | INITIAL
2021-05-27 16:55:00,809 INFO | (50, 200)
2021-05-27 16:55:00,815 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,815 INFO | (50, 200, 512)
2021-05-27 16:55:00,817 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,817 INFO | (50, 200, 512)
2021-05-27 16:55:00,818 INFO | BERT LAYER
2021-05-27 16:55:00,818 INFO | (200, 512)
2021-05-27 16:55:00,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,819 INFO | (200, 512)
2021-05-27 16:55:00,821 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,821 INFO | (200, 512)
2021-05-27 16:55:00,827 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,828 INFO | (200, 512)
2021-05-27 16:55:00,829 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,829 INFO | (200, 512)
2021-05-27 16:55:00,837 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,840 INFO | (200, 512)
2021-05-27 16:55:00,842 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,843 INFO | (200, 512)
2021-05-27 16:55:00,852 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,853 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▊         | 49/574 [00:06<00:57,  9.09it/s]

2021-05-27 16:55:00,927 INFO | INITIAL
2021-05-27 16:55:00,928 INFO | (50, 200)
2021-05-27 16:55:00,933 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:00,934 INFO | (50, 200, 512)
2021-05-27 16:55:00,935 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:00,935 INFO | (50, 200, 512)
2021-05-27 16:55:00,936 INFO | BERT LAYER
2021-05-27 16:55:00,936 INFO | (200, 512)
2021-05-27 16:55:00,936 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,937 INFO | (200, 512)
2021-05-27 16:55:00,937 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,937 INFO | (200, 512)
2021-05-27 16:55:00,943 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,945 INFO | (200, 512)
2021-05-27 16:55:00,946 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,947 INFO | (200, 512)
2021-05-27 16:55:00,952 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,953 INFO | (200, 512)
2021-05-27 16:55:00,953 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:00,953 INFO | (200, 512)
2021-05-27 16:55:00,959 INFO | BERT LAYER LOOP
2021-05-27 16:55:00,960 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▊         | 50/574 [00:06<00:56,  9.20it/s]

2021-05-27 16:55:01,032 INFO | INITIAL
2021-05-27 16:55:01,033 INFO | (50, 200)
2021-05-27 16:55:01,038 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,038 INFO | (50, 200, 512)
2021-05-27 16:55:01,040 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,040 INFO | (50, 200, 512)
2021-05-27 16:55:01,041 INFO | BERT LAYER
2021-05-27 16:55:01,042 INFO | (200, 512)
2021-05-27 16:55:01,042 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,042 INFO | (200, 512)
2021-05-27 16:55:01,043 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,043 INFO | (200, 512)
2021-05-27 16:55:01,050 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,051 INFO | (200, 512)
2021-05-27 16:55:01,051 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,051 INFO | (200, 512)
2021-05-27 16:55:01,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,058 INFO | (200, 512)
2021-05-27 16:55:01,059 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,059 INFO | (200, 512)
2021-05-27 16:55:01,065 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,065 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 51/574 [00:06<00:57,  9.16it/s]

2021-05-27 16:55:01,142 INFO | INITIAL
2021-05-27 16:55:01,143 INFO | (50, 200)
2021-05-27 16:55:01,148 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,148 INFO | (50, 200, 512)
2021-05-27 16:55:01,150 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,150 INFO | (50, 200, 512)
2021-05-27 16:55:01,151 INFO | BERT LAYER
2021-05-27 16:55:01,152 INFO | (200, 512)
2021-05-27 16:55:01,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,152 INFO | (200, 512)
2021-05-27 16:55:01,153 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,153 INFO | (200, 512)
2021-05-27 16:55:01,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,159 INFO | (200, 512)
2021-05-27 16:55:01,159 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,159 INFO | (200, 512)
2021-05-27 16:55:01,167 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,167 INFO | (200, 512)
2021-05-27 16:55:01,168 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,168 INFO | (200, 512)
2021-05-27 16:55:01,173 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,174 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 51/574 [00:06<00:57,  9.16it/s]

2021-05-27 16:55:01,241 INFO | INITIAL
2021-05-27 16:55:01,242 INFO | (50, 200)
2021-05-27 16:55:01,249 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,249 INFO | (50, 200, 512)
2021-05-27 16:55:01,251 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,251 INFO | (50, 200, 512)
2021-05-27 16:55:01,252 INFO | BERT LAYER
2021-05-27 16:55:01,253 INFO | (200, 512)
2021-05-27 16:55:01,253 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,254 INFO | (200, 512)
2021-05-27 16:55:01,254 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,255 INFO | (200, 512)
2021-05-27 16:55:01,262 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,263 INFO | (200, 512)
2021-05-27 16:55:01,263 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,263 INFO | (200, 512)
2021-05-27 16:55:01,269 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,270 INFO | (200, 512)
2021-05-27 16:55:01,270 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,271 INFO | (200, 512)
2021-05-27 16:55:01,278 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,278 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 53/574 [00:06<00:55,  9.32it/s]

2021-05-27 16:55:01,352 INFO | INITIAL
2021-05-27 16:55:01,353 INFO | (50, 200)
2021-05-27 16:55:01,358 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,359 INFO | (50, 200, 512)
2021-05-27 16:55:01,360 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,361 INFO | (50, 200, 512)
2021-05-27 16:55:01,363 INFO | BERT LAYER
2021-05-27 16:55:01,363 INFO | (200, 512)
2021-05-27 16:55:01,364 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,364 INFO | (200, 512)
2021-05-27 16:55:01,364 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,365 INFO | (200, 512)
2021-05-27 16:55:01,370 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,371 INFO | (200, 512)
2021-05-27 16:55:01,371 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,372 INFO | (200, 512)
2021-05-27 16:55:01,377 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,378 INFO | (200, 512)
2021-05-27 16:55:01,379 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,379 INFO | (200, 512)
2021-05-27 16:55:01,385 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,386 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 54/574 [00:06<00:55,  9.29it/s]

2021-05-27 16:55:01,462 INFO | INITIAL
2021-05-27 16:55:01,462 INFO | (50, 200)
2021-05-27 16:55:01,467 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,468 INFO | (50, 200, 512)
2021-05-27 16:55:01,469 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,470 INFO | (50, 200, 512)
2021-05-27 16:55:01,471 INFO | BERT LAYER
2021-05-27 16:55:01,471 INFO | (200, 512)
2021-05-27 16:55:01,471 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,472 INFO | (200, 512)
2021-05-27 16:55:01,472 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,472 INFO | (200, 512)
2021-05-27 16:55:01,479 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,479 INFO | (200, 512)
2021-05-27 16:55:01,480 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,481 INFO | (200, 512)
2021-05-27 16:55:01,487 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,487 INFO | (200, 512)
2021-05-27 16:55:01,487 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,488 INFO | (200, 512)
2021-05-27 16:55:01,493 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,493 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 55/574 [00:06<00:55,  9.36it/s]

2021-05-27 16:55:01,565 INFO | INITIAL
2021-05-27 16:55:01,566 INFO | (50, 200)
2021-05-27 16:55:01,571 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,571 INFO | (50, 200, 512)
2021-05-27 16:55:01,572 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,573 INFO | (50, 200, 512)
2021-05-27 16:55:01,574 INFO | BERT LAYER
2021-05-27 16:55:01,574 INFO | (200, 512)
2021-05-27 16:55:01,575 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,575 INFO | (200, 512)
2021-05-27 16:55:01,576 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,576 INFO | (200, 512)
2021-05-27 16:55:01,582 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,583 INFO | (200, 512)
2021-05-27 16:55:01,583 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,584 INFO | (200, 512)
2021-05-27 16:55:01,591 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,592 INFO | (200, 512)
2021-05-27 16:55:01,593 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,594 INFO | (200, 512)
2021-05-27 16:55:01,600 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,600 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 56/574 [00:06<00:55,  9.41it/s]

2021-05-27 16:55:01,671 INFO | INITIAL
2021-05-27 16:55:01,671 INFO | (50, 200)
2021-05-27 16:55:01,678 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,678 INFO | (50, 200, 512)
2021-05-27 16:55:01,680 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,680 INFO | (50, 200, 512)
2021-05-27 16:55:01,681 INFO | BERT LAYER
2021-05-27 16:55:01,681 INFO | (200, 512)
2021-05-27 16:55:01,682 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,682 INFO | (200, 512)
2021-05-27 16:55:01,682 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,683 INFO | (200, 512)
2021-05-27 16:55:01,689 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,689 INFO | (200, 512)
2021-05-27 16:55:01,690 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,690 INFO | (200, 512)
2021-05-27 16:55:01,696 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,696 INFO | (200, 512)
2021-05-27 16:55:01,697 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,697 INFO | (200, 512)
2021-05-27 16:55:01,702 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,703 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 56/574 [00:06<00:55,  9.41it/s]

2021-05-27 16:55:01,769 INFO | INITIAL
2021-05-27 16:55:01,769 INFO | (50, 200)
2021-05-27 16:55:01,774 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,775 INFO | (50, 200, 512)
2021-05-27 16:55:01,776 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,777 INFO | (50, 200, 512)
2021-05-27 16:55:01,778 INFO | BERT LAYER
2021-05-27 16:55:01,778 INFO | (200, 512)
2021-05-27 16:55:01,779 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,779 INFO | (200, 512)
2021-05-27 16:55:01,779 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,780 INFO | (200, 512)
2021-05-27 16:55:01,787 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,787 INFO | (200, 512)
2021-05-27 16:55:01,788 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,788 INFO | (200, 512)
2021-05-27 16:55:01,795 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,795 INFO | (200, 512)
2021-05-27 16:55:01,795 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,796 INFO | (200, 512)
2021-05-27 16:55:01,802 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,802 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 58/574 [00:07<00:53,  9.63it/s]

2021-05-27 16:55:01,872 INFO | INITIAL
2021-05-27 16:55:01,872 INFO | (50, 200)
2021-05-27 16:55:01,880 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,880 INFO | (50, 200, 512)
2021-05-27 16:55:01,882 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,882 INFO | (50, 200, 512)
2021-05-27 16:55:01,883 INFO | BERT LAYER
2021-05-27 16:55:01,883 INFO | (200, 512)
2021-05-27 16:55:01,884 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,884 INFO | (200, 512)
2021-05-27 16:55:01,884 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,885 INFO | (200, 512)
2021-05-27 16:55:01,890 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,890 INFO | (200, 512)
2021-05-27 16:55:01,891 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,891 INFO | (200, 512)
2021-05-27 16:55:01,898 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,898 INFO | (200, 512)
2021-05-27 16:55:01,898 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,899 INFO | (200, 512)
2021-05-27 16:55:01,905 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,905 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 59/574 [00:07<00:53,  9.68it/s]

2021-05-27 16:55:01,973 INFO | INITIAL
2021-05-27 16:55:01,974 INFO | (50, 200)
2021-05-27 16:55:01,982 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:01,982 INFO | (50, 200, 512)
2021-05-27 16:55:01,984 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:01,984 INFO | (50, 200, 512)
2021-05-27 16:55:01,985 INFO | BERT LAYER
2021-05-27 16:55:01,986 INFO | (200, 512)
2021-05-27 16:55:01,986 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,986 INFO | (200, 512)
2021-05-27 16:55:01,987 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,987 INFO | (200, 512)
2021-05-27 16:55:01,993 INFO | BERT LAYER LOOP
2021-05-27 16:55:01,994 INFO | (200, 512)
2021-05-27 16:55:01,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:01,995 INFO | (200, 512)
2021-05-27 16:55:02,000 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,001 INFO | (200, 512)
2021-05-27 16:55:02,002 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,002 INFO | (200, 512)
2021-05-27 16:55:02,007 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,008 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 60/574 [00:07<00:52,  9.70it/s]

2021-05-27 16:55:02,076 INFO | INITIAL
2021-05-27 16:55:02,076 INFO | (50, 200)
2021-05-27 16:55:02,082 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,083 INFO | (50, 200, 512)
2021-05-27 16:55:02,086 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,086 INFO | (50, 200, 512)
2021-05-27 16:55:02,087 INFO | BERT LAYER
2021-05-27 16:55:02,087 INFO | (200, 512)
2021-05-27 16:55:02,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,088 INFO | (200, 512)
2021-05-27 16:55:02,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,089 INFO | (200, 512)
2021-05-27 16:55:02,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,096 INFO | (200, 512)
2021-05-27 16:55:02,096 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,097 INFO | (200, 512)
2021-05-27 16:55:02,102 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,102 INFO | (200, 512)
2021-05-27 16:55:02,103 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,103 INFO | (200, 512)
2021-05-27 16:55:02,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,109 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 61/574 [00:07<00:53,  9.57it/s]

2021-05-27 16:55:02,185 INFO | INITIAL
2021-05-27 16:55:02,185 INFO | (50, 200)
2021-05-27 16:55:02,191 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,191 INFO | (50, 200, 512)
2021-05-27 16:55:02,193 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,193 INFO | (50, 200, 512)
2021-05-27 16:55:02,194 INFO | BERT LAYER
2021-05-27 16:55:02,195 INFO | (200, 512)
2021-05-27 16:55:02,195 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,195 INFO | (200, 512)
2021-05-27 16:55:02,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,197 INFO | (200, 512)
2021-05-27 16:55:02,203 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,204 INFO | (200, 512)
2021-05-27 16:55:02,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,205 INFO | (200, 512)
2021-05-27 16:55:02,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,211 INFO | (200, 512)
2021-05-27 16:55:02,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,212 INFO | (200, 512)
2021-05-27 16:55:02,218 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,219 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 62/574 [00:07<00:53,  9.52it/s]

2021-05-27 16:55:02,291 INFO | INITIAL
2021-05-27 16:55:02,292 INFO | (50, 200)
2021-05-27 16:55:02,299 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,300 INFO | (50, 200, 512)
2021-05-27 16:55:02,301 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,301 INFO | (50, 200, 512)
2021-05-27 16:55:02,302 INFO | BERT LAYER
2021-05-27 16:55:02,303 INFO | (200, 512)
2021-05-27 16:55:02,303 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,304 INFO | (200, 512)
2021-05-27 16:55:02,304 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,304 INFO | (200, 512)
2021-05-27 16:55:02,310 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,311 INFO | (200, 512)
2021-05-27 16:55:02,311 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,312 INFO | (200, 512)
2021-05-27 16:55:02,319 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,319 INFO | (200, 512)
2021-05-27 16:55:02,320 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,320 INFO | (200, 512)
2021-05-27 16:55:02,327 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,328 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 63/574 [00:07<00:54,  9.44it/s]

2021-05-27 16:55:02,399 INFO | INITIAL
2021-05-27 16:55:02,399 INFO | (50, 200)
2021-05-27 16:55:02,404 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,405 INFO | (50, 200, 512)
2021-05-27 16:55:02,406 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,406 INFO | (50, 200, 512)
2021-05-27 16:55:02,407 INFO | BERT LAYER
2021-05-27 16:55:02,407 INFO | (200, 512)
2021-05-27 16:55:02,408 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,408 INFO | (200, 512)
2021-05-27 16:55:02,409 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,410 INFO | (200, 512)
2021-05-27 16:55:02,416 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,416 INFO | (200, 512)
2021-05-27 16:55:02,417 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,417 INFO | (200, 512)
2021-05-27 16:55:02,423 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,423 INFO | (200, 512)
2021-05-27 16:55:02,423 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,424 INFO | (200, 512)
2021-05-27 16:55:02,430 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,430 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 64/574 [00:07<00:53,  9.57it/s]

2021-05-27 16:55:02,500 INFO | INITIAL
2021-05-27 16:55:02,501 INFO | (50, 200)
2021-05-27 16:55:02,506 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,506 INFO | (50, 200, 512)
2021-05-27 16:55:02,508 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,508 INFO | (50, 200, 512)
2021-05-27 16:55:02,509 INFO | BERT LAYER
2021-05-27 16:55:02,510 INFO | (200, 512)
2021-05-27 16:55:02,510 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,511 INFO | (200, 512)
2021-05-27 16:55:02,511 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,511 INFO | (200, 512)
2021-05-27 16:55:02,519 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,520 INFO | (200, 512)
2021-05-27 16:55:02,520 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,520 INFO | (200, 512)
2021-05-27 16:55:02,526 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,526 INFO | (200, 512)
2021-05-27 16:55:02,527 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,527 INFO | (200, 512)
2021-05-27 16:55:02,532 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,533 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█▏        | 65/574 [00:07<00:52,  9.62it/s]

2021-05-27 16:55:02,603 INFO | INITIAL
2021-05-27 16:55:02,603 INFO | (50, 200)
2021-05-27 16:55:02,609 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,609 INFO | (50, 200, 512)
2021-05-27 16:55:02,610 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,611 INFO | (50, 200, 512)
2021-05-27 16:55:02,612 INFO | BERT LAYER
2021-05-27 16:55:02,612 INFO | (200, 512)
2021-05-27 16:55:02,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,613 INFO | (200, 512)
2021-05-27 16:55:02,614 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,615 INFO | (200, 512)
2021-05-27 16:55:02,619 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,620 INFO | (200, 512)
2021-05-27 16:55:02,620 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,621 INFO | (200, 512)
2021-05-27 16:55:02,626 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,627 INFO | (200, 512)
2021-05-27 16:55:02,627 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,628 INFO | (200, 512)
2021-05-27 16:55:02,635 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,635 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  11%|█▏        | 66/574 [00:07<00:52,  9.67it/s]

2021-05-27 16:55:02,705 INFO | INITIAL
2021-05-27 16:55:02,705 INFO | (50, 200)
2021-05-27 16:55:02,711 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,712 INFO | (50, 200, 512)
2021-05-27 16:55:02,713 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,713 INFO | (50, 200, 512)
2021-05-27 16:55:02,714 INFO | BERT LAYER
2021-05-27 16:55:02,715 INFO | (200, 512)
2021-05-27 16:55:02,715 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,715 INFO | (200, 512)
2021-05-27 16:55:02,716 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,716 INFO | (200, 512)
2021-05-27 16:55:02,722 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,723 INFO | (200, 512)
2021-05-27 16:55:02,723 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,724 INFO | (200, 512)
2021-05-27 16:55:02,730 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,731 INFO | (200, 512)
2021-05-27 16:55:02,731 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,732 INFO | (200, 512)
2021-05-27 16:55:02,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,737 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 67/574 [00:07<00:52,  9.60it/s]

2021-05-27 16:55:02,811 INFO | INITIAL
2021-05-27 16:55:02,811 INFO | (50, 200)
2021-05-27 16:55:02,816 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,817 INFO | (50, 200, 512)
2021-05-27 16:55:02,818 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,819 INFO | (50, 200, 512)
2021-05-27 16:55:02,819 INFO | BERT LAYER
2021-05-27 16:55:02,819 INFO | (200, 512)
2021-05-27 16:55:02,820 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,820 INFO | (200, 512)
2021-05-27 16:55:02,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,821 INFO | (200, 512)
2021-05-27 16:55:02,826 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,827 INFO | (200, 512)
2021-05-27 16:55:02,827 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,827 INFO | (200, 512)
2021-05-27 16:55:02,832 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,833 INFO | (200, 512)
2021-05-27 16:55:02,833 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,834 INFO | (200, 512)
2021-05-27 16:55:02,839 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,839 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 68/574 [00:08<00:52,  9.64it/s]

2021-05-27 16:55:02,914 INFO | INITIAL
2021-05-27 16:55:02,914 INFO | (50, 200)
2021-05-27 16:55:02,920 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:02,921 INFO | (50, 200, 512)
2021-05-27 16:55:02,922 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:02,922 INFO | (50, 200, 512)
2021-05-27 16:55:02,923 INFO | BERT LAYER
2021-05-27 16:55:02,924 INFO | (200, 512)
2021-05-27 16:55:02,924 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,925 INFO | (200, 512)
2021-05-27 16:55:02,925 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,926 INFO | (200, 512)
2021-05-27 16:55:02,932 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,932 INFO | (200, 512)
2021-05-27 16:55:02,932 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,933 INFO | (200, 512)
2021-05-27 16:55:02,938 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,939 INFO | (200, 512)
2021-05-27 16:55:02,939 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:02,940 INFO | (200, 512)
2021-05-27 16:55:02,946 INFO | BERT LAYER LOOP
2021-05-27 16:55:02,947 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 69/574 [00:08<00:53,  9.51it/s]

2021-05-27 16:55:03,022 INFO | INITIAL
2021-05-27 16:55:03,022 INFO | (50, 200)
2021-05-27 16:55:03,028 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,028 INFO | (50, 200, 512)
2021-05-27 16:55:03,030 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,030 INFO | (50, 200, 512)
2021-05-27 16:55:03,031 INFO | BERT LAYER
2021-05-27 16:55:03,032 INFO | (200, 512)
2021-05-27 16:55:03,032 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,032 INFO | (200, 512)
2021-05-27 16:55:03,033 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,033 INFO | (200, 512)
2021-05-27 16:55:03,040 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,041 INFO | (200, 512)
2021-05-27 16:55:03,041 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,042 INFO | (200, 512)
2021-05-27 16:55:03,048 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,049 INFO | (200, 512)
2021-05-27 16:55:03,050 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,050 INFO | (200, 512)
2021-05-27 16:55:03,057 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,057 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 70/574 [00:08<00:53,  9.38it/s]

2021-05-27 16:55:03,132 INFO | INITIAL
2021-05-27 16:55:03,133 INFO | (50, 200)
2021-05-27 16:55:03,138 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,139 INFO | (50, 200, 512)
2021-05-27 16:55:03,140 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,141 INFO | (50, 200, 512)
2021-05-27 16:55:03,142 INFO | BERT LAYER
2021-05-27 16:55:03,143 INFO | (200, 512)
2021-05-27 16:55:03,145 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,147 INFO | (200, 512)
2021-05-27 16:55:03,148 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,148 INFO | (200, 512)
2021-05-27 16:55:03,154 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,154 INFO | (200, 512)
2021-05-27 16:55:03,154 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,155 INFO | (200, 512)
2021-05-27 16:55:03,161 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,162 INFO | (200, 512)
2021-05-27 16:55:03,163 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,163 INFO | (200, 512)
2021-05-27 16:55:03,169 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,169 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 71/574 [00:08<00:54,  9.16it/s]

2021-05-27 16:55:03,247 INFO | INITIAL
2021-05-27 16:55:03,248 INFO | (50, 200)
2021-05-27 16:55:03,254 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,255 INFO | (50, 200, 512)
2021-05-27 16:55:03,257 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,257 INFO | (50, 200, 512)
2021-05-27 16:55:03,258 INFO | BERT LAYER
2021-05-27 16:55:03,259 INFO | (200, 512)
2021-05-27 16:55:03,259 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,260 INFO | (200, 512)
2021-05-27 16:55:03,261 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,261 INFO | (200, 512)
2021-05-27 16:55:03,267 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,267 INFO | (200, 512)
2021-05-27 16:55:03,268 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,268 INFO | (200, 512)
2021-05-27 16:55:03,273 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,274 INFO | (200, 512)
2021-05-27 16:55:03,275 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,275 INFO | (200, 512)
2021-05-27 16:55:03,282 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,282 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 72/574 [00:08<00:54,  9.16it/s]

2021-05-27 16:55:03,356 INFO | INITIAL
2021-05-27 16:55:03,357 INFO | (50, 200)
2021-05-27 16:55:03,365 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,366 INFO | (50, 200, 512)
2021-05-27 16:55:03,367 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,368 INFO | (50, 200, 512)
2021-05-27 16:55:03,369 INFO | BERT LAYER
2021-05-27 16:55:03,369 INFO | (200, 512)
2021-05-27 16:55:03,370 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,370 INFO | (200, 512)
2021-05-27 16:55:03,371 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,372 INFO | (200, 512)
2021-05-27 16:55:03,378 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,379 INFO | (200, 512)
2021-05-27 16:55:03,379 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,380 INFO | (200, 512)
2021-05-27 16:55:03,388 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,388 INFO | (200, 512)
2021-05-27 16:55:03,389 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,389 INFO | (200, 512)
2021-05-27 16:55:03,396 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,396 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 73/574 [00:08<00:55,  9.05it/s]

2021-05-27 16:55:03,470 INFO | INITIAL
2021-05-27 16:55:03,470 INFO | (50, 200)
2021-05-27 16:55:03,475 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,476 INFO | (50, 200, 512)
2021-05-27 16:55:03,477 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,478 INFO | (50, 200, 512)
2021-05-27 16:55:03,479 INFO | BERT LAYER
2021-05-27 16:55:03,479 INFO | (200, 512)
2021-05-27 16:55:03,480 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,480 INFO | (200, 512)
2021-05-27 16:55:03,480 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,481 INFO | (200, 512)
2021-05-27 16:55:03,487 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,488 INFO | (200, 512)
2021-05-27 16:55:03,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,489 INFO | (200, 512)
2021-05-27 16:55:03,497 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,497 INFO | (200, 512)
2021-05-27 16:55:03,498 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,498 INFO | (200, 512)
2021-05-27 16:55:03,504 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,505 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 74/574 [00:08<00:55,  9.08it/s]

2021-05-27 16:55:03,580 INFO | INITIAL
2021-05-27 16:55:03,580 INFO | (50, 200)
2021-05-27 16:55:03,587 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,587 INFO | (50, 200, 512)
2021-05-27 16:55:03,589 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,590 INFO | (50, 200, 512)
2021-05-27 16:55:03,591 INFO | BERT LAYER
2021-05-27 16:55:03,591 INFO | (200, 512)
2021-05-27 16:55:03,592 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,592 INFO | (200, 512)
2021-05-27 16:55:03,593 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,594 INFO | (200, 512)
2021-05-27 16:55:03,599 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,599 INFO | (200, 512)
2021-05-27 16:55:03,600 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,600 INFO | (200, 512)
2021-05-27 16:55:03,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,605 INFO | (200, 512)
2021-05-27 16:55:03,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,606 INFO | (200, 512)
2021-05-27 16:55:03,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,613 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 75/574 [00:08<00:54,  9.20it/s]

2021-05-27 16:55:03,685 INFO | INITIAL
2021-05-27 16:55:03,685 INFO | (50, 200)
2021-05-27 16:55:03,690 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,691 INFO | (50, 200, 512)
2021-05-27 16:55:03,692 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,693 INFO | (50, 200, 512)
2021-05-27 16:55:03,693 INFO | BERT LAYER
2021-05-27 16:55:03,694 INFO | (200, 512)
2021-05-27 16:55:03,694 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,695 INFO | (200, 512)
2021-05-27 16:55:03,695 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,696 INFO | (200, 512)
2021-05-27 16:55:03,702 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,703 INFO | (200, 512)
2021-05-27 16:55:03,703 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,704 INFO | (200, 512)
2021-05-27 16:55:03,710 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,710 INFO | (200, 512)
2021-05-27 16:55:03,711 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,711 INFO | (200, 512)
2021-05-27 16:55:03,716 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,717 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 76/574 [00:08<00:53,  9.28it/s]

2021-05-27 16:55:03,790 INFO | INITIAL
2021-05-27 16:55:03,791 INFO | (50, 200)
2021-05-27 16:55:03,798 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,799 INFO | (50, 200, 512)
2021-05-27 16:55:03,800 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,801 INFO | (50, 200, 512)
2021-05-27 16:55:03,802 INFO | BERT LAYER
2021-05-27 16:55:03,802 INFO | (200, 512)
2021-05-27 16:55:03,803 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,804 INFO | (200, 512)
2021-05-27 16:55:03,804 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,805 INFO | (200, 512)
2021-05-27 16:55:03,812 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,812 INFO | (200, 512)
2021-05-27 16:55:03,812 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,813 INFO | (200, 512)
2021-05-27 16:55:03,819 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,820 INFO | (200, 512)
2021-05-27 16:55:03,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,820 INFO | (200, 512)
2021-05-27 16:55:03,826 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,826 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 77/574 [00:09<00:53,  9.27it/s]

2021-05-27 16:55:03,898 INFO | INITIAL
2021-05-27 16:55:03,899 INFO | (50, 200)
2021-05-27 16:55:03,903 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:03,904 INFO | (50, 200, 512)
2021-05-27 16:55:03,905 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:03,905 INFO | (50, 200, 512)
2021-05-27 16:55:03,906 INFO | BERT LAYER
2021-05-27 16:55:03,906 INFO | (200, 512)
2021-05-27 16:55:03,906 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,907 INFO | (200, 512)
2021-05-27 16:55:03,908 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,908 INFO | (200, 512)
2021-05-27 16:55:03,914 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,915 INFO | (200, 512)
2021-05-27 16:55:03,915 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,915 INFO | (200, 512)
2021-05-27 16:55:03,923 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,924 INFO | (200, 512)
2021-05-27 16:55:03,925 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:03,925 INFO | (200, 512)
2021-05-27 16:55:03,932 INFO | BERT LAYER LOOP
2021-05-27 16:55:03,932 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▎        | 78/574 [00:09<00:52,  9.40it/s]

2021-05-27 16:55:04,001 INFO | INITIAL
2021-05-27 16:55:04,002 INFO | (50, 200)
2021-05-27 16:55:04,008 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,009 INFO | (50, 200, 512)
2021-05-27 16:55:04,010 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,010 INFO | (50, 200, 512)
2021-05-27 16:55:04,011 INFO | BERT LAYER
2021-05-27 16:55:04,012 INFO | (200, 512)
2021-05-27 16:55:04,012 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,012 INFO | (200, 512)
2021-05-27 16:55:04,013 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,013 INFO | (200, 512)
2021-05-27 16:55:04,019 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,020 INFO | (200, 512)
2021-05-27 16:55:04,020 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,021 INFO | (200, 512)
2021-05-27 16:55:04,027 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,027 INFO | (200, 512)
2021-05-27 16:55:04,028 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,028 INFO | (200, 512)
2021-05-27 16:55:04,034 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,035 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 79/574 [00:09<00:52,  9.43it/s]

2021-05-27 16:55:04,106 INFO | INITIAL
2021-05-27 16:55:04,107 INFO | (50, 200)
2021-05-27 16:55:04,115 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,116 INFO | (50, 200, 512)
2021-05-27 16:55:04,117 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,118 INFO | (50, 200, 512)
2021-05-27 16:55:04,118 INFO | BERT LAYER
2021-05-27 16:55:04,119 INFO | (200, 512)
2021-05-27 16:55:04,120 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,120 INFO | (200, 512)
2021-05-27 16:55:04,121 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,121 INFO | (200, 512)
2021-05-27 16:55:04,128 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,128 INFO | (200, 512)
2021-05-27 16:55:04,129 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,129 INFO | (200, 512)
2021-05-27 16:55:04,135 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,136 INFO | (200, 512)
2021-05-27 16:55:04,136 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,137 INFO | (200, 512)
2021-05-27 16:55:04,143 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,143 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 80/574 [00:09<00:52,  9.36it/s]

2021-05-27 16:55:04,216 INFO | INITIAL
2021-05-27 16:55:04,216 INFO | (50, 200)
2021-05-27 16:55:04,221 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,222 INFO | (50, 200, 512)
2021-05-27 16:55:04,223 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,224 INFO | (50, 200, 512)
2021-05-27 16:55:04,225 INFO | BERT LAYER
2021-05-27 16:55:04,226 INFO | (200, 512)
2021-05-27 16:55:04,226 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,227 INFO | (200, 512)
2021-05-27 16:55:04,227 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,227 INFO | (200, 512)
2021-05-27 16:55:04,233 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,233 INFO | (200, 512)
2021-05-27 16:55:04,234 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,234 INFO | (200, 512)
2021-05-27 16:55:04,239 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,239 INFO | (200, 512)
2021-05-27 16:55:04,240 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,240 INFO | (200, 512)
2021-05-27 16:55:04,246 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,246 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 80/574 [00:09<00:52,  9.36it/s]

2021-05-27 16:55:04,314 INFO | INITIAL
2021-05-27 16:55:04,314 INFO | (50, 200)
2021-05-27 16:55:04,320 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,320 INFO | (50, 200, 512)
2021-05-27 16:55:04,322 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,323 INFO | (50, 200, 512)
2021-05-27 16:55:04,324 INFO | BERT LAYER
2021-05-27 16:55:04,324 INFO | (200, 512)
2021-05-27 16:55:04,325 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,325 INFO | (200, 512)
2021-05-27 16:55:04,326 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,326 INFO | (200, 512)
2021-05-27 16:55:04,332 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,332 INFO | (200, 512)
2021-05-27 16:55:04,333 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,333 INFO | (200, 512)
2021-05-27 16:55:04,338 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,339 INFO | (200, 512)
2021-05-27 16:55:04,339 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,339 INFO | (200, 512)
2021-05-27 16:55:04,345 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,346 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 82/574 [00:09<00:51,  9.59it/s]

2021-05-27 16:55:04,418 INFO | INITIAL
2021-05-27 16:55:04,418 INFO | (50, 200)
2021-05-27 16:55:04,423 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,424 INFO | (50, 200, 512)
2021-05-27 16:55:04,425 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,426 INFO | (50, 200, 512)
2021-05-27 16:55:04,427 INFO | BERT LAYER
2021-05-27 16:55:04,427 INFO | (200, 512)
2021-05-27 16:55:04,427 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,428 INFO | (200, 512)
2021-05-27 16:55:04,428 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,428 INFO | (200, 512)
2021-05-27 16:55:04,434 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,435 INFO | (200, 512)
2021-05-27 16:55:04,435 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,435 INFO | (200, 512)
2021-05-27 16:55:04,441 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,442 INFO | (200, 512)
2021-05-27 16:55:04,442 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,442 INFO | (200, 512)
2021-05-27 16:55:04,448 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,449 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 83/574 [00:09<00:51,  9.63it/s]

2021-05-27 16:55:04,521 INFO | INITIAL
2021-05-27 16:55:04,522 INFO | (50, 200)
2021-05-27 16:55:04,527 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,528 INFO | (50, 200, 512)
2021-05-27 16:55:04,529 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,530 INFO | (50, 200, 512)
2021-05-27 16:55:04,530 INFO | BERT LAYER
2021-05-27 16:55:04,531 INFO | (200, 512)
2021-05-27 16:55:04,531 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,532 INFO | (200, 512)
2021-05-27 16:55:04,532 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,532 INFO | (200, 512)
2021-05-27 16:55:04,538 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,539 INFO | (200, 512)
2021-05-27 16:55:04,539 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,540 INFO | (200, 512)
2021-05-27 16:55:04,547 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,547 INFO | (200, 512)
2021-05-27 16:55:04,548 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,548 INFO | (200, 512)
2021-05-27 16:55:04,554 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,554 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 84/574 [00:09<00:50,  9.61it/s]

2021-05-27 16:55:04,625 INFO | INITIAL
2021-05-27 16:55:04,626 INFO | (50, 200)
2021-05-27 16:55:04,633 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,633 INFO | (50, 200, 512)
2021-05-27 16:55:04,634 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,635 INFO | (50, 200, 512)
2021-05-27 16:55:04,635 INFO | BERT LAYER
2021-05-27 16:55:04,636 INFO | (200, 512)
2021-05-27 16:55:04,636 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,636 INFO | (200, 512)
2021-05-27 16:55:04,637 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,637 INFO | (200, 512)
2021-05-27 16:55:04,644 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,644 INFO | (200, 512)
2021-05-27 16:55:04,645 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,645 INFO | (200, 512)
2021-05-27 16:55:04,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,652 INFO | (200, 512)
2021-05-27 16:55:04,653 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,653 INFO | (200, 512)
2021-05-27 16:55:04,660 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,661 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 85/574 [00:09<00:51,  9.46it/s]

2021-05-27 16:55:04,735 INFO | INITIAL
2021-05-27 16:55:04,736 INFO | (50, 200)
2021-05-27 16:55:04,741 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,742 INFO | (50, 200, 512)
2021-05-27 16:55:04,743 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,744 INFO | (50, 200, 512)
2021-05-27 16:55:04,744 INFO | BERT LAYER
2021-05-27 16:55:04,745 INFO | (200, 512)
2021-05-27 16:55:04,745 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,746 INFO | (200, 512)
2021-05-27 16:55:04,746 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,747 INFO | (200, 512)
2021-05-27 16:55:04,755 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,755 INFO | (200, 512)
2021-05-27 16:55:04,755 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,756 INFO | (200, 512)
2021-05-27 16:55:04,763 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,763 INFO | (200, 512)
2021-05-27 16:55:04,764 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,764 INFO | (200, 512)
2021-05-27 16:55:04,770 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,770 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 86/574 [00:10<00:51,  9.52it/s]

2021-05-27 16:55:04,839 INFO | INITIAL
2021-05-27 16:55:04,839 INFO | (50, 200)
2021-05-27 16:55:04,846 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,846 INFO | (50, 200, 512)
2021-05-27 16:55:04,848 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,849 INFO | (50, 200, 512)
2021-05-27 16:55:04,850 INFO | BERT LAYER
2021-05-27 16:55:04,851 INFO | (200, 512)
2021-05-27 16:55:04,851 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,852 INFO | (200, 512)
2021-05-27 16:55:04,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,854 INFO | (200, 512)
2021-05-27 16:55:04,861 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,862 INFO | (200, 512)
2021-05-27 16:55:04,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,863 INFO | (200, 512)
2021-05-27 16:55:04,868 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,869 INFO | (200, 512)
2021-05-27 16:55:04,869 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,870 INFO | (200, 512)
2021-05-27 16:55:04,876 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,876 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  15%|█▌        | 87/574 [00:10<00:52,  9.34it/s]

2021-05-27 16:55:04,950 INFO | INITIAL
2021-05-27 16:55:04,951 INFO | (50, 200)
2021-05-27 16:55:04,955 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:04,956 INFO | (50, 200, 512)
2021-05-27 16:55:04,957 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:04,957 INFO | (50, 200, 512)
2021-05-27 16:55:04,959 INFO | BERT LAYER
2021-05-27 16:55:04,959 INFO | (200, 512)
2021-05-27 16:55:04,959 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,960 INFO | (200, 512)
2021-05-27 16:55:04,960 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,961 INFO | (200, 512)
2021-05-27 16:55:04,966 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,967 INFO | (200, 512)
2021-05-27 16:55:04,967 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,967 INFO | (200, 512)
2021-05-27 16:55:04,973 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,973 INFO | (200, 512)
2021-05-27 16:55:04,973 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:04,974 INFO | (200, 512)
2021-05-27 16:55:04,980 INFO | BERT LAYER LOOP
2021-05-27 16:55:04,981 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  15%|█▌        | 88/574 [00:10<00:51,  9.51it/s]

2021-05-27 16:55:05,051 INFO | INITIAL
2021-05-27 16:55:05,052 INFO | (50, 200)
2021-05-27 16:55:05,058 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,059 INFO | (50, 200, 512)
2021-05-27 16:55:05,060 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,061 INFO | (50, 200, 512)
2021-05-27 16:55:05,061 INFO | BERT LAYER
2021-05-27 16:55:05,062 INFO | (200, 512)
2021-05-27 16:55:05,062 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,062 INFO | (200, 512)
2021-05-27 16:55:05,063 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,063 INFO | (200, 512)
2021-05-27 16:55:05,068 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,069 INFO | (200, 512)
2021-05-27 16:55:05,069 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,070 INFO | (200, 512)
2021-05-27 16:55:05,075 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,077 INFO | (200, 512)
2021-05-27 16:55:05,078 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,078 INFO | (200, 512)
2021-05-27 16:55:05,085 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,085 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 89/574 [00:10<00:50,  9.59it/s]

2021-05-27 16:55:05,153 INFO | INITIAL
2021-05-27 16:55:05,154 INFO | (50, 200)
2021-05-27 16:55:05,159 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,159 INFO | (50, 200, 512)
2021-05-27 16:55:05,160 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,161 INFO | (50, 200, 512)
2021-05-27 16:55:05,162 INFO | BERT LAYER
2021-05-27 16:55:05,162 INFO | (200, 512)
2021-05-27 16:55:05,162 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,163 INFO | (200, 512)
2021-05-27 16:55:05,163 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,163 INFO | (200, 512)
2021-05-27 16:55:05,169 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,169 INFO | (200, 512)
2021-05-27 16:55:05,169 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,170 INFO | (200, 512)
2021-05-27 16:55:05,175 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,175 INFO | (200, 512)
2021-05-27 16:55:05,176 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,176 INFO | (200, 512)
2021-05-27 16:55:05,183 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,183 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 90/574 [00:10<00:50,  9.65it/s]

2021-05-27 16:55:05,256 INFO | INITIAL
2021-05-27 16:55:05,256 INFO | (50, 200)
2021-05-27 16:55:05,263 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,269 INFO | (50, 200, 512)
2021-05-27 16:55:05,272 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,275 INFO | (50, 200, 512)
2021-05-27 16:55:05,277 INFO | BERT LAYER
2021-05-27 16:55:05,278 INFO | (200, 512)
2021-05-27 16:55:05,278 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,278 INFO | (200, 512)
2021-05-27 16:55:05,279 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,279 INFO | (200, 512)
2021-05-27 16:55:05,285 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,286 INFO | (200, 512)
2021-05-27 16:55:05,286 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,287 INFO | (200, 512)
2021-05-27 16:55:05,292 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,292 INFO | (200, 512)
2021-05-27 16:55:05,293 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,293 INFO | (200, 512)
2021-05-27 16:55:05,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,300 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 91/574 [00:10<00:51,  9.37it/s]

2021-05-27 16:55:05,369 INFO | INITIAL
2021-05-27 16:55:05,370 INFO | (50, 200)
2021-05-27 16:55:05,375 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,375 INFO | (50, 200, 512)
2021-05-27 16:55:05,376 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,377 INFO | (50, 200, 512)
2021-05-27 16:55:05,377 INFO | BERT LAYER
2021-05-27 16:55:05,378 INFO | (200, 512)
2021-05-27 16:55:05,378 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,379 INFO | (200, 512)
2021-05-27 16:55:05,379 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,379 INFO | (200, 512)
2021-05-27 16:55:05,385 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,386 INFO | (200, 512)
2021-05-27 16:55:05,387 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,387 INFO | (200, 512)
2021-05-27 16:55:05,393 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,394 INFO | (200, 512)
2021-05-27 16:55:05,394 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,394 INFO | (200, 512)
2021-05-27 16:55:05,400 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,400 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 92/574 [00:10<00:50,  9.46it/s]

2021-05-27 16:55:05,473 INFO | INITIAL
2021-05-27 16:55:05,473 INFO | (50, 200)
2021-05-27 16:55:05,482 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,482 INFO | (50, 200, 512)
2021-05-27 16:55:05,483 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,484 INFO | (50, 200, 512)
2021-05-27 16:55:05,485 INFO | BERT LAYER
2021-05-27 16:55:05,485 INFO | (200, 512)
2021-05-27 16:55:05,485 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,486 INFO | (200, 512)
2021-05-27 16:55:05,486 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,486 INFO | (200, 512)
2021-05-27 16:55:05,493 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,493 INFO | (200, 512)
2021-05-27 16:55:05,494 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,494 INFO | (200, 512)
2021-05-27 16:55:05,499 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,499 INFO | (200, 512)
2021-05-27 16:55:05,500 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,500 INFO | (200, 512)
2021-05-27 16:55:05,505 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,505 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 93/574 [00:10<00:51,  9.39it/s]

2021-05-27 16:55:05,581 INFO | INITIAL
2021-05-27 16:55:05,582 INFO | (50, 200)
2021-05-27 16:55:05,588 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,588 INFO | (50, 200, 512)
2021-05-27 16:55:05,590 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,591 INFO | (50, 200, 512)
2021-05-27 16:55:05,592 INFO | BERT LAYER
2021-05-27 16:55:05,592 INFO | (200, 512)
2021-05-27 16:55:05,593 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,593 INFO | (200, 512)
2021-05-27 16:55:05,594 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,594 INFO | (200, 512)
2021-05-27 16:55:05,600 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,601 INFO | (200, 512)
2021-05-27 16:55:05,601 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,602 INFO | (200, 512)
2021-05-27 16:55:05,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,608 INFO | (200, 512)
2021-05-27 16:55:05,609 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,610 INFO | (200, 512)
2021-05-27 16:55:05,617 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,619 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  16%|█▋        | 94/574 [00:10<00:53,  9.01it/s]

2021-05-27 16:55:05,702 INFO | INITIAL
2021-05-27 16:55:05,703 INFO | (50, 200)
2021-05-27 16:55:05,708 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,709 INFO | (50, 200, 512)
2021-05-27 16:55:05,710 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,711 INFO | (50, 200, 512)
2021-05-27 16:55:05,712 INFO | BERT LAYER
2021-05-27 16:55:05,712 INFO | (200, 512)
2021-05-27 16:55:05,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,713 INFO | (200, 512)
2021-05-27 16:55:05,713 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,714 INFO | (200, 512)
2021-05-27 16:55:05,721 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,722 INFO | (200, 512)
2021-05-27 16:55:05,722 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,723 INFO | (200, 512)
2021-05-27 16:55:05,729 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,730 INFO | (200, 512)
2021-05-27 16:55:05,730 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,731 INFO | (200, 512)
2021-05-27 16:55:05,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,736 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 95/574 [00:11<00:54,  8.82it/s]

2021-05-27 16:55:05,822 INFO | INITIAL
2021-05-27 16:55:05,822 INFO | (50, 200)
2021-05-27 16:55:05,827 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,828 INFO | (50, 200, 512)
2021-05-27 16:55:05,829 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,830 INFO | (50, 200, 512)
2021-05-27 16:55:05,831 INFO | BERT LAYER
2021-05-27 16:55:05,831 INFO | (200, 512)
2021-05-27 16:55:05,831 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,832 INFO | (200, 512)
2021-05-27 16:55:05,832 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,832 INFO | (200, 512)
2021-05-27 16:55:05,838 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,839 INFO | (200, 512)
2021-05-27 16:55:05,839 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,839 INFO | (200, 512)
2021-05-27 16:55:05,847 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,847 INFO | (200, 512)
2021-05-27 16:55:05,848 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,848 INFO | (200, 512)
2021-05-27 16:55:05,856 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,856 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 96/574 [00:11<00:53,  8.91it/s]

2021-05-27 16:55:05,931 INFO | INITIAL
2021-05-27 16:55:05,932 INFO | (50, 200)
2021-05-27 16:55:05,939 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:05,939 INFO | (50, 200, 512)
2021-05-27 16:55:05,941 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:05,941 INFO | (50, 200, 512)
2021-05-27 16:55:05,943 INFO | BERT LAYER
2021-05-27 16:55:05,944 INFO | (200, 512)
2021-05-27 16:55:05,944 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,945 INFO | (200, 512)
2021-05-27 16:55:05,945 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,946 INFO | (200, 512)
2021-05-27 16:55:05,955 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,955 INFO | (200, 512)
2021-05-27 16:55:05,956 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,957 INFO | (200, 512)
2021-05-27 16:55:05,964 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,965 INFO | (200, 512)
2021-05-27 16:55:05,966 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:05,966 INFO | (200, 512)
2021-05-27 16:55:05,973 INFO | BERT LAYER LOOP
2021-05-27 16:55:05,973 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 97/574 [00:11<00:55,  8.56it/s]

2021-05-27 16:55:06,060 INFO | INITIAL
2021-05-27 16:55:06,060 INFO | (50, 200)
2021-05-27 16:55:06,067 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,068 INFO | (50, 200, 512)
2021-05-27 16:55:06,069 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,070 INFO | (50, 200, 512)
2021-05-27 16:55:06,071 INFO | BERT LAYER
2021-05-27 16:55:06,071 INFO | (200, 512)
2021-05-27 16:55:06,071 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,072 INFO | (200, 512)
2021-05-27 16:55:06,072 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,072 INFO | (200, 512)
2021-05-27 16:55:06,080 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,080 INFO | (200, 512)
2021-05-27 16:55:06,081 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,081 INFO | (200, 512)
2021-05-27 16:55:06,087 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,088 INFO | (200, 512)
2021-05-27 16:55:06,088 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,089 INFO | (200, 512)
2021-05-27 16:55:06,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,096 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 98/574 [00:11<00:54,  8.72it/s]

2021-05-27 16:55:06,168 INFO | INITIAL
2021-05-27 16:55:06,169 INFO | (50, 200)
2021-05-27 16:55:06,174 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,174 INFO | (50, 200, 512)
2021-05-27 16:55:06,176 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,176 INFO | (50, 200, 512)
2021-05-27 16:55:06,177 INFO | BERT LAYER
2021-05-27 16:55:06,178 INFO | (200, 512)
2021-05-27 16:55:06,179 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,179 INFO | (200, 512)
2021-05-27 16:55:06,179 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,180 INFO | (200, 512)
2021-05-27 16:55:06,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,186 INFO | (200, 512)
2021-05-27 16:55:06,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,186 INFO | (200, 512)
2021-05-27 16:55:06,192 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,193 INFO | (200, 512)
2021-05-27 16:55:06,194 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,194 INFO | (200, 512)
2021-05-27 16:55:06,200 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,200 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 99/574 [00:11<00:53,  8.85it/s]

2021-05-27 16:55:06,277 INFO | INITIAL
2021-05-27 16:55:06,278 INFO | (50, 200)
2021-05-27 16:55:06,284 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,285 INFO | (50, 200, 512)
2021-05-27 16:55:06,286 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,287 INFO | (50, 200, 512)
2021-05-27 16:55:06,288 INFO | BERT LAYER
2021-05-27 16:55:06,288 INFO | (200, 512)
2021-05-27 16:55:06,289 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,290 INFO | (200, 512)
2021-05-27 16:55:06,290 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,292 INFO | (200, 512)
2021-05-27 16:55:06,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,299 INFO | (200, 512)
2021-05-27 16:55:06,300 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,300 INFO | (200, 512)
2021-05-27 16:55:06,305 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,305 INFO | (200, 512)
2021-05-27 16:55:06,306 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,306 INFO | (200, 512)
2021-05-27 16:55:06,311 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,312 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 100/574 [00:11<00:53,  8.92it/s]

2021-05-27 16:55:06,388 INFO | INITIAL
2021-05-27 16:55:06,388 INFO | (50, 200)
2021-05-27 16:55:06,395 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,396 INFO | (50, 200, 512)
2021-05-27 16:55:06,397 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,397 INFO | (50, 200, 512)
2021-05-27 16:55:06,398 INFO | BERT LAYER
2021-05-27 16:55:06,398 INFO | (200, 512)
2021-05-27 16:55:06,399 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,400 INFO | (200, 512)
2021-05-27 16:55:06,400 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,400 INFO | (200, 512)
2021-05-27 16:55:06,407 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,408 INFO | (200, 512)
2021-05-27 16:55:06,408 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,409 INFO | (200, 512)
2021-05-27 16:55:06,417 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,417 INFO | (200, 512)
2021-05-27 16:55:06,418 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,419 INFO | (200, 512)
2021-05-27 16:55:06,424 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,425 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 101/574 [00:11<00:52,  8.96it/s]

2021-05-27 16:55:06,498 INFO | INITIAL
2021-05-27 16:55:06,498 INFO | (50, 200)
2021-05-27 16:55:06,502 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,503 INFO | (50, 200, 512)
2021-05-27 16:55:06,504 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,505 INFO | (50, 200, 512)
2021-05-27 16:55:06,505 INFO | BERT LAYER
2021-05-27 16:55:06,506 INFO | (200, 512)
2021-05-27 16:55:06,506 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,507 INFO | (200, 512)
2021-05-27 16:55:06,507 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,508 INFO | (200, 512)
2021-05-27 16:55:06,514 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,515 INFO | (200, 512)
2021-05-27 16:55:06,515 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,516 INFO | (200, 512)
2021-05-27 16:55:06,521 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,522 INFO | (200, 512)
2021-05-27 16:55:06,522 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,523 INFO | (200, 512)
2021-05-27 16:55:06,529 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,529 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 102/574 [00:11<00:52,  9.03it/s]

2021-05-27 16:55:06,606 INFO | INITIAL
2021-05-27 16:55:06,607 INFO | (50, 200)
2021-05-27 16:55:06,618 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,618 INFO | (50, 200, 512)
2021-05-27 16:55:06,620 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,620 INFO | (50, 200, 512)
2021-05-27 16:55:06,622 INFO | BERT LAYER
2021-05-27 16:55:06,625 INFO | (200, 512)
2021-05-27 16:55:06,626 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,626 INFO | (200, 512)
2021-05-27 16:55:06,627 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,628 INFO | (200, 512)
2021-05-27 16:55:06,635 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,636 INFO | (200, 512)
2021-05-27 16:55:06,636 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,637 INFO | (200, 512)
2021-05-27 16:55:06,644 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,645 INFO | (200, 512)
2021-05-27 16:55:06,645 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,646 INFO | (200, 512)
2021-05-27 16:55:06,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,653 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 103/574 [00:11<00:53,  8.81it/s]

2021-05-27 16:55:06,727 INFO | INITIAL
2021-05-27 16:55:06,727 INFO | (50, 200)
2021-05-27 16:55:06,735 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,736 INFO | (50, 200, 512)
2021-05-27 16:55:06,738 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,738 INFO | (50, 200, 512)
2021-05-27 16:55:06,739 INFO | BERT LAYER
2021-05-27 16:55:06,739 INFO | (200, 512)
2021-05-27 16:55:06,740 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,740 INFO | (200, 512)
2021-05-27 16:55:06,741 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,741 INFO | (200, 512)
2021-05-27 16:55:06,749 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,750 INFO | (200, 512)
2021-05-27 16:55:06,751 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,751 INFO | (200, 512)
2021-05-27 16:55:06,758 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,758 INFO | (200, 512)
2021-05-27 16:55:06,759 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,759 INFO | (200, 512)
2021-05-27 16:55:06,766 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,767 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 104/574 [00:12<00:52,  8.91it/s]

2021-05-27 16:55:06,835 INFO | INITIAL
2021-05-27 16:55:06,836 INFO | (50, 200)
2021-05-27 16:55:06,840 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,841 INFO | (50, 200, 512)
2021-05-27 16:55:06,842 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,843 INFO | (50, 200, 512)
2021-05-27 16:55:06,844 INFO | BERT LAYER
2021-05-27 16:55:06,844 INFO | (200, 512)
2021-05-27 16:55:06,844 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,845 INFO | (200, 512)
2021-05-27 16:55:06,845 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,845 INFO | (200, 512)
2021-05-27 16:55:06,851 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,851 INFO | (200, 512)
2021-05-27 16:55:06,852 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,852 INFO | (200, 512)
2021-05-27 16:55:06,859 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,860 INFO | (200, 512)
2021-05-27 16:55:06,861 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,861 INFO | (200, 512)
2021-05-27 16:55:06,868 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,868 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 105/574 [00:12<00:51,  9.07it/s]

2021-05-27 16:55:06,942 INFO | INITIAL
2021-05-27 16:55:06,943 INFO | (50, 200)
2021-05-27 16:55:06,948 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:06,949 INFO | (50, 200, 512)
2021-05-27 16:55:06,950 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:06,951 INFO | (50, 200, 512)
2021-05-27 16:55:06,952 INFO | BERT LAYER
2021-05-27 16:55:06,952 INFO | (200, 512)
2021-05-27 16:55:06,953 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,953 INFO | (200, 512)
2021-05-27 16:55:06,953 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,954 INFO | (200, 512)
2021-05-27 16:55:06,960 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,961 INFO | (200, 512)
2021-05-27 16:55:06,962 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,962 INFO | (200, 512)
2021-05-27 16:55:06,968 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,969 INFO | (200, 512)
2021-05-27 16:55:06,970 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:06,970 INFO | (200, 512)
2021-05-27 16:55:06,977 INFO | BERT LAYER LOOP
2021-05-27 16:55:06,978 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 106/574 [00:12<00:51,  9.10it/s]

2021-05-27 16:55:07,050 INFO | INITIAL
2021-05-27 16:55:07,051 INFO | (50, 200)
2021-05-27 16:55:07,056 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,057 INFO | (50, 200, 512)
2021-05-27 16:55:07,058 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,059 INFO | (50, 200, 512)
2021-05-27 16:55:07,060 INFO | BERT LAYER
2021-05-27 16:55:07,062 INFO | (200, 512)
2021-05-27 16:55:07,063 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,063 INFO | (200, 512)
2021-05-27 16:55:07,064 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,064 INFO | (200, 512)
2021-05-27 16:55:07,071 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,072 INFO | (200, 512)
2021-05-27 16:55:07,073 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,073 INFO | (200, 512)
2021-05-27 16:55:07,082 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,083 INFO | (200, 512)
2021-05-27 16:55:07,083 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,084 INFO | (200, 512)
2021-05-27 16:55:07,089 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,089 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  19%|█▊        | 107/574 [00:12<00:51,  9.05it/s]

2021-05-27 16:55:07,162 INFO | INITIAL
2021-05-27 16:55:07,163 INFO | (50, 200)
2021-05-27 16:55:07,169 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,170 INFO | (50, 200, 512)
2021-05-27 16:55:07,171 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,172 INFO | (50, 200, 512)
2021-05-27 16:55:07,173 INFO | BERT LAYER
2021-05-27 16:55:07,173 INFO | (200, 512)
2021-05-27 16:55:07,174 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,175 INFO | (200, 512)
2021-05-27 16:55:07,175 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,176 INFO | (200, 512)
2021-05-27 16:55:07,181 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,181 INFO | (200, 512)
2021-05-27 16:55:07,181 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,182 INFO | (200, 512)
2021-05-27 16:55:07,189 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,190 INFO | (200, 512)
2021-05-27 16:55:07,190 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,190 INFO | (200, 512)
2021-05-27 16:55:07,197 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,197 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 108/574 [00:12<00:51,  8.97it/s]

2021-05-27 16:55:07,276 INFO | INITIAL
2021-05-27 16:55:07,277 INFO | (50, 200)
2021-05-27 16:55:07,284 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,285 INFO | (50, 200, 512)
2021-05-27 16:55:07,286 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,287 INFO | (50, 200, 512)
2021-05-27 16:55:07,288 INFO | BERT LAYER
2021-05-27 16:55:07,288 INFO | (200, 512)
2021-05-27 16:55:07,288 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,289 INFO | (200, 512)
2021-05-27 16:55:07,289 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,289 INFO | (200, 512)
2021-05-27 16:55:07,295 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,295 INFO | (200, 512)
2021-05-27 16:55:07,296 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,296 INFO | (200, 512)
2021-05-27 16:55:07,302 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,302 INFO | (200, 512)
2021-05-27 16:55:07,303 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,303 INFO | (200, 512)
2021-05-27 16:55:07,308 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,309 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 109/574 [00:12<00:50,  9.14it/s]

2021-05-27 16:55:07,380 INFO | INITIAL
2021-05-27 16:55:07,381 INFO | (50, 200)
2021-05-27 16:55:07,386 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,387 INFO | (50, 200, 512)
2021-05-27 16:55:07,388 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,388 INFO | (50, 200, 512)
2021-05-27 16:55:07,389 INFO | BERT LAYER
2021-05-27 16:55:07,390 INFO | (200, 512)
2021-05-27 16:55:07,390 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,390 INFO | (200, 512)
2021-05-27 16:55:07,391 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,391 INFO | (200, 512)
2021-05-27 16:55:07,397 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,398 INFO | (200, 512)
2021-05-27 16:55:07,399 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,399 INFO | (200, 512)
2021-05-27 16:55:07,405 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,405 INFO | (200, 512)
2021-05-27 16:55:07,406 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,406 INFO | (200, 512)
2021-05-27 16:55:07,412 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,412 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 110/574 [00:12<00:49,  9.30it/s]

2021-05-27 16:55:07,484 INFO | INITIAL
2021-05-27 16:55:07,484 INFO | (50, 200)
2021-05-27 16:55:07,491 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,491 INFO | (50, 200, 512)
2021-05-27 16:55:07,493 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,493 INFO | (50, 200, 512)
2021-05-27 16:55:07,495 INFO | BERT LAYER
2021-05-27 16:55:07,495 INFO | (200, 512)
2021-05-27 16:55:07,496 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,496 INFO | (200, 512)
2021-05-27 16:55:07,497 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,497 INFO | (200, 512)
2021-05-27 16:55:07,503 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,504 INFO | (200, 512)
2021-05-27 16:55:07,505 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,505 INFO | (200, 512)
2021-05-27 16:55:07,512 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,513 INFO | (200, 512)
2021-05-27 16:55:07,513 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,514 INFO | (200, 512)
2021-05-27 16:55:07,519 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,520 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 111/574 [00:12<00:50,  9.13it/s]

2021-05-27 16:55:07,598 INFO | INITIAL
2021-05-27 16:55:07,599 INFO | (50, 200)
2021-05-27 16:55:07,606 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,606 INFO | (50, 200, 512)
2021-05-27 16:55:07,607 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,608 INFO | (50, 200, 512)
2021-05-27 16:55:07,609 INFO | BERT LAYER
2021-05-27 16:55:07,610 INFO | (200, 512)
2021-05-27 16:55:07,610 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,611 INFO | (200, 512)
2021-05-27 16:55:07,611 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,612 INFO | (200, 512)
2021-05-27 16:55:07,617 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,618 INFO | (200, 512)
2021-05-27 16:55:07,618 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,619 INFO | (200, 512)
2021-05-27 16:55:07,627 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,628 INFO | (200, 512)
2021-05-27 16:55:07,628 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,629 INFO | (200, 512)
2021-05-27 16:55:07,636 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,636 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 112/574 [00:12<00:51,  8.98it/s]

2021-05-27 16:55:07,713 INFO | INITIAL
2021-05-27 16:55:07,714 INFO | (50, 200)
2021-05-27 16:55:07,720 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,720 INFO | (50, 200, 512)
2021-05-27 16:55:07,722 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,722 INFO | (50, 200, 512)
2021-05-27 16:55:07,723 INFO | BERT LAYER
2021-05-27 16:55:07,724 INFO | (200, 512)
2021-05-27 16:55:07,725 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,726 INFO | (200, 512)
2021-05-27 16:55:07,726 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,727 INFO | (200, 512)
2021-05-27 16:55:07,735 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,736 INFO | (200, 512)
2021-05-27 16:55:07,736 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,737 INFO | (200, 512)
2021-05-27 16:55:07,743 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,743 INFO | (200, 512)
2021-05-27 16:55:07,744 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,744 INFO | (200, 512)
2021-05-27 16:55:07,750 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,751 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 113/574 [00:13<00:51,  8.97it/s]

2021-05-27 16:55:07,825 INFO | INITIAL
2021-05-27 16:55:07,826 INFO | (50, 200)
2021-05-27 16:55:07,832 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,833 INFO | (50, 200, 512)
2021-05-27 16:55:07,834 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,834 INFO | (50, 200, 512)
2021-05-27 16:55:07,835 INFO | BERT LAYER
2021-05-27 16:55:07,835 INFO | (200, 512)
2021-05-27 16:55:07,835 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,836 INFO | (200, 512)
2021-05-27 16:55:07,836 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,836 INFO | (200, 512)
2021-05-27 16:55:07,843 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,844 INFO | (200, 512)
2021-05-27 16:55:07,844 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,845 INFO | (200, 512)
2021-05-27 16:55:07,850 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,851 INFO | (200, 512)
2021-05-27 16:55:07,851 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,852 INFO | (200, 512)
2021-05-27 16:55:07,857 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,857 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 114/574 [00:13<00:51,  9.00it/s]

2021-05-27 16:55:07,935 INFO | INITIAL
2021-05-27 16:55:07,936 INFO | (50, 200)
2021-05-27 16:55:07,942 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:07,942 INFO | (50, 200, 512)
2021-05-27 16:55:07,944 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:07,944 INFO | (50, 200, 512)
2021-05-27 16:55:07,945 INFO | BERT LAYER
2021-05-27 16:55:07,946 INFO | (200, 512)
2021-05-27 16:55:07,946 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,946 INFO | (200, 512)
2021-05-27 16:55:07,947 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,947 INFO | (200, 512)
2021-05-27 16:55:07,953 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,954 INFO | (200, 512)
2021-05-27 16:55:07,954 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,955 INFO | (200, 512)
2021-05-27 16:55:07,962 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,963 INFO | (200, 512)
2021-05-27 16:55:07,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:07,964 INFO | (200, 512)
2021-05-27 16:55:07,970 INFO | BERT LAYER LOOP
2021-05-27 16:55:07,970 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 115/574 [00:13<00:51,  8.99it/s]

2021-05-27 16:55:08,047 INFO | INITIAL
2021-05-27 16:55:08,047 INFO | (50, 200)
2021-05-27 16:55:08,052 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,053 INFO | (50, 200, 512)
2021-05-27 16:55:08,054 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,054 INFO | (50, 200, 512)
2021-05-27 16:55:08,055 INFO | BERT LAYER
2021-05-27 16:55:08,056 INFO | (200, 512)
2021-05-27 16:55:08,056 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,057 INFO | (200, 512)
2021-05-27 16:55:08,057 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,057 INFO | (200, 512)
2021-05-27 16:55:08,065 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,065 INFO | (200, 512)
2021-05-27 16:55:08,066 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,066 INFO | (200, 512)
2021-05-27 16:55:08,072 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,073 INFO | (200, 512)
2021-05-27 16:55:08,073 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,074 INFO | (200, 512)
2021-05-27 16:55:08,080 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,080 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 116/574 [00:13<00:50,  9.08it/s]

2021-05-27 16:55:08,155 INFO | INITIAL
2021-05-27 16:55:08,155 INFO | (50, 200)
2021-05-27 16:55:08,161 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,162 INFO | (50, 200, 512)
2021-05-27 16:55:08,163 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,164 INFO | (50, 200, 512)
2021-05-27 16:55:08,165 INFO | BERT LAYER
2021-05-27 16:55:08,165 INFO | (200, 512)
2021-05-27 16:55:08,166 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,167 INFO | (200, 512)
2021-05-27 16:55:08,167 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,168 INFO | (200, 512)
2021-05-27 16:55:08,173 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,173 INFO | (200, 512)
2021-05-27 16:55:08,174 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,174 INFO | (200, 512)
2021-05-27 16:55:08,182 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,182 INFO | (200, 512)
2021-05-27 16:55:08,183 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,183 INFO | (200, 512)
2021-05-27 16:55:08,188 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,189 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 117/574 [00:13<00:49,  9.19it/s]

2021-05-27 16:55:08,260 INFO | INITIAL
2021-05-27 16:55:08,261 INFO | (50, 200)
2021-05-27 16:55:08,268 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,269 INFO | (50, 200, 512)
2021-05-27 16:55:08,271 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,271 INFO | (50, 200, 512)
2021-05-27 16:55:08,272 INFO | BERT LAYER
2021-05-27 16:55:08,273 INFO | (200, 512)
2021-05-27 16:55:08,273 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,273 INFO | (200, 512)
2021-05-27 16:55:08,274 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,275 INFO | (200, 512)
2021-05-27 16:55:08,281 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,282 INFO | (200, 512)
2021-05-27 16:55:08,283 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,283 INFO | (200, 512)
2021-05-27 16:55:08,288 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,289 INFO | (200, 512)
2021-05-27 16:55:08,289 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,289 INFO | (200, 512)
2021-05-27 16:55:08,296 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,296 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 118/574 [00:13<00:49,  9.22it/s]

2021-05-27 16:55:08,368 INFO | INITIAL
2021-05-27 16:55:08,368 INFO | (50, 200)
2021-05-27 16:55:08,373 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,374 INFO | (50, 200, 512)
2021-05-27 16:55:08,376 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,376 INFO | (50, 200, 512)
2021-05-27 16:55:08,377 INFO | BERT LAYER
2021-05-27 16:55:08,378 INFO | (200, 512)
2021-05-27 16:55:08,378 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,378 INFO | (200, 512)
2021-05-27 16:55:08,379 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,379 INFO | (200, 512)
2021-05-27 16:55:08,385 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,385 INFO | (200, 512)
2021-05-27 16:55:08,386 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,386 INFO | (200, 512)
2021-05-27 16:55:08,392 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,392 INFO | (200, 512)
2021-05-27 16:55:08,393 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,393 INFO | (200, 512)
2021-05-27 16:55:08,401 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,407 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 119/574 [00:13<00:49,  9.15it/s]

2021-05-27 16:55:08,479 INFO | INITIAL
2021-05-27 16:55:08,479 INFO | (50, 200)
2021-05-27 16:55:08,485 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,486 INFO | (50, 200, 512)
2021-05-27 16:55:08,487 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,488 INFO | (50, 200, 512)
2021-05-27 16:55:08,489 INFO | BERT LAYER
2021-05-27 16:55:08,489 INFO | (200, 512)
2021-05-27 16:55:08,490 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,490 INFO | (200, 512)
2021-05-27 16:55:08,492 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,492 INFO | (200, 512)
2021-05-27 16:55:08,499 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,499 INFO | (200, 512)
2021-05-27 16:55:08,500 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,500 INFO | (200, 512)
2021-05-27 16:55:08,506 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,506 INFO | (200, 512)
2021-05-27 16:55:08,507 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,507 INFO | (200, 512)
2021-05-27 16:55:08,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,514 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 120/574 [00:13<00:49,  9.21it/s]

2021-05-27 16:55:08,586 INFO | INITIAL
2021-05-27 16:55:08,587 INFO | (50, 200)
2021-05-27 16:55:08,593 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,594 INFO | (50, 200, 512)
2021-05-27 16:55:08,595 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,596 INFO | (50, 200, 512)
2021-05-27 16:55:08,596 INFO | BERT LAYER
2021-05-27 16:55:08,597 INFO | (200, 512)
2021-05-27 16:55:08,597 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,597 INFO | (200, 512)
2021-05-27 16:55:08,598 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,598 INFO | (200, 512)
2021-05-27 16:55:08,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,605 INFO | (200, 512)
2021-05-27 16:55:08,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,606 INFO | (200, 512)
2021-05-27 16:55:08,611 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,612 INFO | (200, 512)
2021-05-27 16:55:08,613 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,613 INFO | (200, 512)
2021-05-27 16:55:08,618 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,619 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 121/574 [00:13<00:48,  9.42it/s]

2021-05-27 16:55:08,687 INFO | INITIAL
2021-05-27 16:55:08,687 INFO | (50, 200)
2021-05-27 16:55:08,692 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,692 INFO | (50, 200, 512)
2021-05-27 16:55:08,694 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,694 INFO | (50, 200, 512)
2021-05-27 16:55:08,695 INFO | BERT LAYER
2021-05-27 16:55:08,695 INFO | (200, 512)
2021-05-27 16:55:08,696 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,696 INFO | (200, 512)
2021-05-27 16:55:08,697 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,697 INFO | (200, 512)
2021-05-27 16:55:08,704 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,704 INFO | (200, 512)
2021-05-27 16:55:08,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,705 INFO | (200, 512)
2021-05-27 16:55:08,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,712 INFO | (200, 512)
2021-05-27 16:55:08,713 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,713 INFO | (200, 512)
2021-05-27 16:55:08,719 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,719 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██▏       | 122/574 [00:13<00:49,  9.12it/s]

2021-05-27 16:55:08,805 INFO | INITIAL
2021-05-27 16:55:08,805 INFO | (50, 200)
2021-05-27 16:55:08,813 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,813 INFO | (50, 200, 512)
2021-05-27 16:55:08,814 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,814 INFO | (50, 200, 512)
2021-05-27 16:55:08,815 INFO | BERT LAYER
2021-05-27 16:55:08,816 INFO | (200, 512)
2021-05-27 16:55:08,816 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,816 INFO | (200, 512)
2021-05-27 16:55:08,817 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,817 INFO | (200, 512)
2021-05-27 16:55:08,823 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,823 INFO | (200, 512)
2021-05-27 16:55:08,823 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,824 INFO | (200, 512)
2021-05-27 16:55:08,831 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,831 INFO | (200, 512)
2021-05-27 16:55:08,832 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,832 INFO | (200, 512)
2021-05-27 16:55:08,838 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,839 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  21%|██▏       | 123/574 [00:14<00:49,  9.07it/s]

2021-05-27 16:55:08,916 INFO | INITIAL
2021-05-27 16:55:08,917 INFO | (50, 200)
2021-05-27 16:55:08,922 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:08,923 INFO | (50, 200, 512)
2021-05-27 16:55:08,924 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:08,925 INFO | (50, 200, 512)
2021-05-27 16:55:08,926 INFO | BERT LAYER
2021-05-27 16:55:08,926 INFO | (200, 512)
2021-05-27 16:55:08,926 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,927 INFO | (200, 512)
2021-05-27 16:55:08,927 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,928 INFO | (200, 512)
2021-05-27 16:55:08,934 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,935 INFO | (200, 512)
2021-05-27 16:55:08,935 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,936 INFO | (200, 512)
2021-05-27 16:55:08,943 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,943 INFO | (200, 512)
2021-05-27 16:55:08,944 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:08,944 INFO | (200, 512)
2021-05-27 16:55:08,950 INFO | BERT LAYER LOOP
2021-05-27 16:55:08,950 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 124/574 [00:14<00:48,  9.21it/s]

2021-05-27 16:55:09,021 INFO | INITIAL
2021-05-27 16:55:09,022 INFO | (50, 200)
2021-05-27 16:55:09,027 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,027 INFO | (50, 200, 512)
2021-05-27 16:55:09,028 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,029 INFO | (50, 200, 512)
2021-05-27 16:55:09,030 INFO | BERT LAYER
2021-05-27 16:55:09,030 INFO | (200, 512)
2021-05-27 16:55:09,031 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,031 INFO | (200, 512)
2021-05-27 16:55:09,031 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,032 INFO | (200, 512)
2021-05-27 16:55:09,036 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,037 INFO | (200, 512)
2021-05-27 16:55:09,038 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,038 INFO | (200, 512)
2021-05-27 16:55:09,043 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,043 INFO | (200, 512)
2021-05-27 16:55:09,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,044 INFO | (200, 512)
2021-05-27 16:55:09,049 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,049 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 125/574 [00:14<00:48,  9.28it/s]

2021-05-27 16:55:09,126 INFO | INITIAL
2021-05-27 16:55:09,127 INFO | (50, 200)
2021-05-27 16:55:09,132 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,133 INFO | (50, 200, 512)
2021-05-27 16:55:09,134 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,135 INFO | (50, 200, 512)
2021-05-27 16:55:09,136 INFO | BERT LAYER
2021-05-27 16:55:09,136 INFO | (200, 512)
2021-05-27 16:55:09,137 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,137 INFO | (200, 512)
2021-05-27 16:55:09,138 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,138 INFO | (200, 512)
2021-05-27 16:55:09,144 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,145 INFO | (200, 512)
2021-05-27 16:55:09,145 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,146 INFO | (200, 512)
2021-05-27 16:55:09,151 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,152 INFO | (200, 512)
2021-05-27 16:55:09,152 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,152 INFO | (200, 512)
2021-05-27 16:55:09,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,159 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 126/574 [00:14<00:47,  9.46it/s]

2021-05-27 16:55:09,228 INFO | INITIAL
2021-05-27 16:55:09,228 INFO | (50, 200)
2021-05-27 16:55:09,233 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,234 INFO | (50, 200, 512)
2021-05-27 16:55:09,235 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,236 INFO | (50, 200, 512)
2021-05-27 16:55:09,237 INFO | BERT LAYER
2021-05-27 16:55:09,237 INFO | (200, 512)
2021-05-27 16:55:09,238 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,238 INFO | (200, 512)
2021-05-27 16:55:09,238 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,239 INFO | (200, 512)
2021-05-27 16:55:09,244 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,245 INFO | (200, 512)
2021-05-27 16:55:09,245 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,246 INFO | (200, 512)
2021-05-27 16:55:09,251 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,251 INFO | (200, 512)
2021-05-27 16:55:09,252 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,252 INFO | (200, 512)
2021-05-27 16:55:09,257 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,258 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 127/574 [00:14<00:47,  9.49it/s]

2021-05-27 16:55:09,332 INFO | INITIAL
2021-05-27 16:55:09,333 INFO | (50, 200)
2021-05-27 16:55:09,340 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,341 INFO | (50, 200, 512)
2021-05-27 16:55:09,342 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,343 INFO | (50, 200, 512)
2021-05-27 16:55:09,344 INFO | BERT LAYER
2021-05-27 16:55:09,344 INFO | (200, 512)
2021-05-27 16:55:09,345 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,345 INFO | (200, 512)
2021-05-27 16:55:09,346 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,347 INFO | (200, 512)
2021-05-27 16:55:09,352 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,352 INFO | (200, 512)
2021-05-27 16:55:09,353 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,353 INFO | (200, 512)
2021-05-27 16:55:09,359 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,360 INFO | (200, 512)
2021-05-27 16:55:09,360 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,361 INFO | (200, 512)
2021-05-27 16:55:09,366 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,367 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 128/574 [00:14<00:47,  9.37it/s]

2021-05-27 16:55:09,442 INFO | INITIAL
2021-05-27 16:55:09,442 INFO | (50, 200)
2021-05-27 16:55:09,448 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,448 INFO | (50, 200, 512)
2021-05-27 16:55:09,450 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,451 INFO | (50, 200, 512)
2021-05-27 16:55:09,452 INFO | BERT LAYER
2021-05-27 16:55:09,452 INFO | (200, 512)
2021-05-27 16:55:09,453 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,453 INFO | (200, 512)
2021-05-27 16:55:09,453 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,454 INFO | (200, 512)
2021-05-27 16:55:09,460 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,460 INFO | (200, 512)
2021-05-27 16:55:09,461 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,461 INFO | (200, 512)
2021-05-27 16:55:09,469 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,470 INFO | (200, 512)
2021-05-27 16:55:09,470 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,471 INFO | (200, 512)
2021-05-27 16:55:09,478 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,478 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 129/574 [00:14<00:47,  9.28it/s]

2021-05-27 16:55:09,552 INFO | INITIAL
2021-05-27 16:55:09,553 INFO | (50, 200)
2021-05-27 16:55:09,558 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,558 INFO | (50, 200, 512)
2021-05-27 16:55:09,560 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,561 INFO | (50, 200, 512)
2021-05-27 16:55:09,562 INFO | BERT LAYER
2021-05-27 16:55:09,562 INFO | (200, 512)
2021-05-27 16:55:09,563 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,563 INFO | (200, 512)
2021-05-27 16:55:09,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,565 INFO | (200, 512)
2021-05-27 16:55:09,570 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,570 INFO | (200, 512)
2021-05-27 16:55:09,571 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,571 INFO | (200, 512)
2021-05-27 16:55:09,577 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,577 INFO | (200, 512)
2021-05-27 16:55:09,578 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,578 INFO | (200, 512)
2021-05-27 16:55:09,585 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,585 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 130/574 [00:14<00:47,  9.29it/s]

2021-05-27 16:55:09,660 INFO | INITIAL
2021-05-27 16:55:09,660 INFO | (50, 200)
2021-05-27 16:55:09,666 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,666 INFO | (50, 200, 512)
2021-05-27 16:55:09,668 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,669 INFO | (50, 200, 512)
2021-05-27 16:55:09,670 INFO | BERT LAYER
2021-05-27 16:55:09,671 INFO | (200, 512)
2021-05-27 16:55:09,672 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,672 INFO | (200, 512)
2021-05-27 16:55:09,673 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,673 INFO | (200, 512)
2021-05-27 16:55:09,680 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,681 INFO | (200, 512)
2021-05-27 16:55:09,681 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,682 INFO | (200, 512)
2021-05-27 16:55:09,688 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,689 INFO | (200, 512)
2021-05-27 16:55:09,689 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,690 INFO | (200, 512)
2021-05-27 16:55:09,697 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,698 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 131/574 [00:14<00:48,  9.21it/s]

2021-05-27 16:55:09,770 INFO | INITIAL
2021-05-27 16:55:09,771 INFO | (50, 200)
2021-05-27 16:55:09,776 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,776 INFO | (50, 200, 512)
2021-05-27 16:55:09,778 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,778 INFO | (50, 200, 512)
2021-05-27 16:55:09,779 INFO | BERT LAYER
2021-05-27 16:55:09,779 INFO | (200, 512)
2021-05-27 16:55:09,780 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,780 INFO | (200, 512)
2021-05-27 16:55:09,780 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,781 INFO | (200, 512)
2021-05-27 16:55:09,786 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,787 INFO | (200, 512)
2021-05-27 16:55:09,788 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,788 INFO | (200, 512)
2021-05-27 16:55:09,793 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,794 INFO | (200, 512)
2021-05-27 16:55:09,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,795 INFO | (200, 512)
2021-05-27 16:55:09,800 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,801 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 132/574 [00:15<00:47,  9.33it/s]

2021-05-27 16:55:09,875 INFO | INITIAL
2021-05-27 16:55:09,879 INFO | (50, 200)
2021-05-27 16:55:09,885 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,885 INFO | (50, 200, 512)
2021-05-27 16:55:09,886 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,887 INFO | (50, 200, 512)
2021-05-27 16:55:09,888 INFO | BERT LAYER
2021-05-27 16:55:09,888 INFO | (200, 512)
2021-05-27 16:55:09,888 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,889 INFO | (200, 512)
2021-05-27 16:55:09,889 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,889 INFO | (200, 512)
2021-05-27 16:55:09,895 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,896 INFO | (200, 512)
2021-05-27 16:55:09,896 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,896 INFO | (200, 512)
2021-05-27 16:55:09,903 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,903 INFO | (200, 512)
2021-05-27 16:55:09,903 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,904 INFO | (200, 512)
2021-05-27 16:55:09,909 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,909 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 133/574 [00:15<00:47,  9.31it/s]

2021-05-27 16:55:09,983 INFO | INITIAL
2021-05-27 16:55:09,983 INFO | (50, 200)
2021-05-27 16:55:09,988 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:09,988 INFO | (50, 200, 512)
2021-05-27 16:55:09,990 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:09,990 INFO | (50, 200, 512)
2021-05-27 16:55:09,991 INFO | BERT LAYER
2021-05-27 16:55:09,992 INFO | (200, 512)
2021-05-27 16:55:09,992 INFO | BERT LAYER LOOP
2021-05-27 16:55:09,992 INFO | (200, 512)
2021-05-27 16:55:09,993 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:09,993 INFO | (200, 512)
2021-05-27 16:55:09,999 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,000 INFO | (200, 512)
2021-05-27 16:55:10,001 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,001 INFO | (200, 512)
2021-05-27 16:55:10,007 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,008 INFO | (200, 512)
2021-05-27 16:55:10,009 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,009 INFO | (200, 512)
2021-05-27 16:55:10,014 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,015 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 134/574 [00:15<00:46,  9.42it/s]

2021-05-27 16:55:10,085 INFO | INITIAL
2021-05-27 16:55:10,086 INFO | (50, 200)
2021-05-27 16:55:10,092 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,092 INFO | (50, 200, 512)
2021-05-27 16:55:10,093 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,094 INFO | (50, 200, 512)
2021-05-27 16:55:10,094 INFO | BERT LAYER
2021-05-27 16:55:10,094 INFO | (200, 512)
2021-05-27 16:55:10,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,095 INFO | (200, 512)
2021-05-27 16:55:10,095 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,096 INFO | (200, 512)
2021-05-27 16:55:10,102 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,103 INFO | (200, 512)
2021-05-27 16:55:10,103 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,103 INFO | (200, 512)
2021-05-27 16:55:10,109 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,109 INFO | (200, 512)
2021-05-27 16:55:10,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,110 INFO | (200, 512)
2021-05-27 16:55:10,116 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,116 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▎       | 135/574 [00:15<00:45,  9.57it/s]

2021-05-27 16:55:10,186 INFO | INITIAL
2021-05-27 16:55:10,187 INFO | (50, 200)
2021-05-27 16:55:10,193 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,193 INFO | (50, 200, 512)
2021-05-27 16:55:10,195 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,195 INFO | (50, 200, 512)
2021-05-27 16:55:10,196 INFO | BERT LAYER
2021-05-27 16:55:10,196 INFO | (200, 512)
2021-05-27 16:55:10,196 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,197 INFO | (200, 512)
2021-05-27 16:55:10,197 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,198 INFO | (200, 512)
2021-05-27 16:55:10,204 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,205 INFO | (200, 512)
2021-05-27 16:55:10,205 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,206 INFO | (200, 512)
2021-05-27 16:55:10,212 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,213 INFO | (200, 512)
2021-05-27 16:55:10,213 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,214 INFO | (200, 512)
2021-05-27 16:55:10,220 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,220 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▎       | 136/574 [00:15<00:46,  9.51it/s]

2021-05-27 16:55:10,293 INFO | INITIAL
2021-05-27 16:55:10,293 INFO | (50, 200)
2021-05-27 16:55:10,300 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,301 INFO | (50, 200, 512)
2021-05-27 16:55:10,302 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,303 INFO | (50, 200, 512)
2021-05-27 16:55:10,304 INFO | BERT LAYER
2021-05-27 16:55:10,304 INFO | (200, 512)
2021-05-27 16:55:10,304 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,304 INFO | (200, 512)
2021-05-27 16:55:10,305 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,306 INFO | (200, 512)
2021-05-27 16:55:10,310 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,311 INFO | (200, 512)
2021-05-27 16:55:10,311 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,311 INFO | (200, 512)
2021-05-27 16:55:10,316 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,316 INFO | (200, 512)
2021-05-27 16:55:10,317 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,317 INFO | (200, 512)
2021-05-27 16:55:10,322 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,322 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▎       | 136/574 [00:15<00:46,  9.51it/s]

2021-05-27 16:55:10,391 INFO | INITIAL
2021-05-27 16:55:10,391 INFO | (50, 200)
2021-05-27 16:55:10,398 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,398 INFO | (50, 200, 512)
2021-05-27 16:55:10,400 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,400 INFO | (50, 200, 512)
2021-05-27 16:55:10,401 INFO | BERT LAYER
2021-05-27 16:55:10,402 INFO | (200, 512)
2021-05-27 16:55:10,402 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,403 INFO | (200, 512)
2021-05-27 16:55:10,403 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,404 INFO | (200, 512)
2021-05-27 16:55:10,410 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,410 INFO | (200, 512)
2021-05-27 16:55:10,411 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,412 INFO | (200, 512)
2021-05-27 16:55:10,418 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,419 INFO | (200, 512)
2021-05-27 16:55:10,419 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,420 INFO | (200, 512)
2021-05-27 16:55:10,425 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,426 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 138/574 [00:15<00:44,  9.74it/s]

2021-05-27 16:55:10,492 INFO | INITIAL
2021-05-27 16:55:10,493 INFO | (50, 200)
2021-05-27 16:55:10,498 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,499 INFO | (50, 200, 512)
2021-05-27 16:55:10,500 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,500 INFO | (50, 200, 512)
2021-05-27 16:55:10,501 INFO | BERT LAYER
2021-05-27 16:55:10,501 INFO | (200, 512)
2021-05-27 16:55:10,502 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,502 INFO | (200, 512)
2021-05-27 16:55:10,502 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,503 INFO | (200, 512)
2021-05-27 16:55:10,509 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,509 INFO | (200, 512)
2021-05-27 16:55:10,510 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,510 INFO | (200, 512)
2021-05-27 16:55:10,516 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,516 INFO | (200, 512)
2021-05-27 16:55:10,517 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,517 INFO | (200, 512)
2021-05-27 16:55:10,523 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,524 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 139/574 [00:15<00:44,  9.78it/s]

2021-05-27 16:55:10,593 INFO | INITIAL
2021-05-27 16:55:10,593 INFO | (50, 200)
2021-05-27 16:55:10,598 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,599 INFO | (50, 200, 512)
2021-05-27 16:55:10,600 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,601 INFO | (50, 200, 512)
2021-05-27 16:55:10,602 INFO | BERT LAYER
2021-05-27 16:55:10,602 INFO | (200, 512)
2021-05-27 16:55:10,603 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,603 INFO | (200, 512)
2021-05-27 16:55:10,604 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,605 INFO | (200, 512)
2021-05-27 16:55:10,612 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,612 INFO | (200, 512)
2021-05-27 16:55:10,613 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,613 INFO | (200, 512)
2021-05-27 16:55:10,620 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,620 INFO | (200, 512)
2021-05-27 16:55:10,621 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,621 INFO | (200, 512)
2021-05-27 16:55:10,628 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,628 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 139/574 [00:15<00:44,  9.78it/s]

2021-05-27 16:55:10,693 INFO | INITIAL
2021-05-27 16:55:10,693 INFO | (50, 200)
2021-05-27 16:55:10,699 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,700 INFO | (50, 200, 512)
2021-05-27 16:55:10,701 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,702 INFO | (50, 200, 512)
2021-05-27 16:55:10,702 INFO | BERT LAYER
2021-05-27 16:55:10,703 INFO | (200, 512)
2021-05-27 16:55:10,703 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,703 INFO | (200, 512)
2021-05-27 16:55:10,704 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,704 INFO | (200, 512)
2021-05-27 16:55:10,710 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,711 INFO | (200, 512)
2021-05-27 16:55:10,711 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,711 INFO | (200, 512)
2021-05-27 16:55:10,717 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,717 INFO | (200, 512)
2021-05-27 16:55:10,718 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,718 INFO | (200, 512)
2021-05-27 16:55:10,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,724 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▍       | 141/574 [00:15<00:44,  9.70it/s]

2021-05-27 16:55:10,802 INFO | INITIAL
2021-05-27 16:55:10,803 INFO | (50, 200)
2021-05-27 16:55:10,815 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,816 INFO | (50, 200, 512)
2021-05-27 16:55:10,817 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,817 INFO | (50, 200, 512)
2021-05-27 16:55:10,818 INFO | BERT LAYER
2021-05-27 16:55:10,819 INFO | (200, 512)
2021-05-27 16:55:10,819 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,820 INFO | (200, 512)
2021-05-27 16:55:10,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,820 INFO | (200, 512)
2021-05-27 16:55:10,826 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,827 INFO | (200, 512)
2021-05-27 16:55:10,827 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,827 INFO | (200, 512)
2021-05-27 16:55:10,833 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,834 INFO | (200, 512)
2021-05-27 16:55:10,834 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,835 INFO | (200, 512)
2021-05-27 16:55:10,840 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,840 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▍       | 142/574 [00:16<00:45,  9.50it/s]

2021-05-27 16:55:10,914 INFO | INITIAL
2021-05-27 16:55:10,914 INFO | (50, 200)
2021-05-27 16:55:10,919 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:10,920 INFO | (50, 200, 512)
2021-05-27 16:55:10,921 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:10,922 INFO | (50, 200, 512)
2021-05-27 16:55:10,922 INFO | BERT LAYER
2021-05-27 16:55:10,923 INFO | (200, 512)
2021-05-27 16:55:10,923 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,924 INFO | (200, 512)
2021-05-27 16:55:10,924 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,925 INFO | (200, 512)
2021-05-27 16:55:10,931 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,932 INFO | (200, 512)
2021-05-27 16:55:10,932 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,933 INFO | (200, 512)
2021-05-27 16:55:10,941 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,942 INFO | (200, 512)
2021-05-27 16:55:10,942 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:10,943 INFO | (200, 512)
2021-05-27 16:55:10,950 INFO | BERT LAYER LOOP
2021-05-27 16:55:10,951 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▍       | 143/574 [00:16<00:46,  9.36it/s]

2021-05-27 16:55:11,026 INFO | INITIAL
2021-05-27 16:55:11,027 INFO | (50, 200)
2021-05-27 16:55:11,034 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,034 INFO | (50, 200, 512)
2021-05-27 16:55:11,036 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,036 INFO | (50, 200, 512)
2021-05-27 16:55:11,037 INFO | BERT LAYER
2021-05-27 16:55:11,037 INFO | (200, 512)
2021-05-27 16:55:11,037 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,038 INFO | (200, 512)
2021-05-27 16:55:11,038 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,038 INFO | (200, 512)
2021-05-27 16:55:11,045 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,046 INFO | (200, 512)
2021-05-27 16:55:11,046 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,046 INFO | (200, 512)
2021-05-27 16:55:11,053 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,053 INFO | (200, 512)
2021-05-27 16:55:11,054 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,054 INFO | (200, 512)
2021-05-27 16:55:11,061 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,061 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 144/574 [00:16<00:45,  9.43it/s]

2021-05-27 16:55:11,130 INFO | INITIAL
2021-05-27 16:55:11,131 INFO | (50, 200)
2021-05-27 16:55:11,136 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,137 INFO | (50, 200, 512)
2021-05-27 16:55:11,139 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,139 INFO | (50, 200, 512)
2021-05-27 16:55:11,140 INFO | BERT LAYER
2021-05-27 16:55:11,142 INFO | (200, 512)
2021-05-27 16:55:11,142 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,143 INFO | (200, 512)
2021-05-27 16:55:11,143 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,144 INFO | (200, 512)
2021-05-27 16:55:11,150 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,151 INFO | (200, 512)
2021-05-27 16:55:11,151 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,152 INFO | (200, 512)
2021-05-27 16:55:11,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,159 INFO | (200, 512)
2021-05-27 16:55:11,160 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,160 INFO | (200, 512)
2021-05-27 16:55:11,167 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,167 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 145/574 [00:16<00:45,  9.39it/s]

2021-05-27 16:55:11,237 INFO | INITIAL
2021-05-27 16:55:11,238 INFO | (50, 200)
2021-05-27 16:55:11,243 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,244 INFO | (50, 200, 512)
2021-05-27 16:55:11,245 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,246 INFO | (50, 200, 512)
2021-05-27 16:55:11,246 INFO | BERT LAYER
2021-05-27 16:55:11,247 INFO | (200, 512)
2021-05-27 16:55:11,247 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,248 INFO | (200, 512)
2021-05-27 16:55:11,248 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,248 INFO | (200, 512)
2021-05-27 16:55:11,253 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,254 INFO | (200, 512)
2021-05-27 16:55:11,254 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,255 INFO | (200, 512)
2021-05-27 16:55:11,261 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,261 INFO | (200, 512)
2021-05-27 16:55:11,262 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,262 INFO | (200, 512)
2021-05-27 16:55:11,268 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,270 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 146/574 [00:16<00:46,  9.26it/s]

2021-05-27 16:55:11,350 INFO | INITIAL
2021-05-27 16:55:11,351 INFO | (50, 200)
2021-05-27 16:55:11,357 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,358 INFO | (50, 200, 512)
2021-05-27 16:55:11,360 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,360 INFO | (50, 200, 512)
2021-05-27 16:55:11,361 INFO | BERT LAYER
2021-05-27 16:55:11,363 INFO | (200, 512)
2021-05-27 16:55:11,364 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,365 INFO | (200, 512)
2021-05-27 16:55:11,365 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,366 INFO | (200, 512)
2021-05-27 16:55:11,371 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,372 INFO | (200, 512)
2021-05-27 16:55:11,372 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,373 INFO | (200, 512)
2021-05-27 16:55:11,379 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,379 INFO | (200, 512)
2021-05-27 16:55:11,380 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,380 INFO | (200, 512)
2021-05-27 16:55:11,386 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,387 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 147/574 [00:16<00:46,  9.23it/s]

2021-05-27 16:55:11,459 INFO | INITIAL
2021-05-27 16:55:11,461 INFO | (50, 200)
2021-05-27 16:55:11,467 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,468 INFO | (50, 200, 512)
2021-05-27 16:55:11,470 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,470 INFO | (50, 200, 512)
2021-05-27 16:55:11,472 INFO | BERT LAYER
2021-05-27 16:55:11,473 INFO | (200, 512)
2021-05-27 16:55:11,473 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,474 INFO | (200, 512)
2021-05-27 16:55:11,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,475 INFO | (200, 512)
2021-05-27 16:55:11,482 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,482 INFO | (200, 512)
2021-05-27 16:55:11,483 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,483 INFO | (200, 512)
2021-05-27 16:55:11,489 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,489 INFO | (200, 512)
2021-05-27 16:55:11,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,490 INFO | (200, 512)
2021-05-27 16:55:11,496 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,497 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 148/574 [00:16<00:46,  9.09it/s]

2021-05-27 16:55:11,573 INFO | INITIAL
2021-05-27 16:55:11,573 INFO | (50, 200)
2021-05-27 16:55:11,583 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,584 INFO | (50, 200, 512)
2021-05-27 16:55:11,585 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,585 INFO | (50, 200, 512)
2021-05-27 16:55:11,586 INFO | BERT LAYER
2021-05-27 16:55:11,587 INFO | (200, 512)
2021-05-27 16:55:11,587 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,587 INFO | (200, 512)
2021-05-27 16:55:11,588 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,588 INFO | (200, 512)
2021-05-27 16:55:11,593 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,594 INFO | (200, 512)
2021-05-27 16:55:11,594 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,595 INFO | (200, 512)
2021-05-27 16:55:11,601 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,602 INFO | (200, 512)
2021-05-27 16:55:11,602 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,603 INFO | (200, 512)
2021-05-27 16:55:11,610 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,611 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 149/574 [00:16<00:47,  9.01it/s]

2021-05-27 16:55:11,686 INFO | INITIAL
2021-05-27 16:55:11,686 INFO | (50, 200)
2021-05-27 16:55:11,695 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,696 INFO | (50, 200, 512)
2021-05-27 16:55:11,697 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,697 INFO | (50, 200, 512)
2021-05-27 16:55:11,698 INFO | BERT LAYER
2021-05-27 16:55:11,698 INFO | (200, 512)
2021-05-27 16:55:11,699 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,699 INFO | (200, 512)
2021-05-27 16:55:11,699 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,700 INFO | (200, 512)
2021-05-27 16:55:11,706 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,707 INFO | (200, 512)
2021-05-27 16:55:11,707 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,707 INFO | (200, 512)
2021-05-27 16:55:11,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,713 INFO | (200, 512)
2021-05-27 16:55:11,714 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,714 INFO | (200, 512)
2021-05-27 16:55:11,720 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,720 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 150/574 [00:16<00:46,  9.10it/s]

2021-05-27 16:55:11,793 INFO | INITIAL
2021-05-27 16:55:11,794 INFO | (50, 200)
2021-05-27 16:55:11,798 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,799 INFO | (50, 200, 512)
2021-05-27 16:55:11,800 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,801 INFO | (50, 200, 512)
2021-05-27 16:55:11,801 INFO | BERT LAYER
2021-05-27 16:55:11,802 INFO | (200, 512)
2021-05-27 16:55:11,802 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,803 INFO | (200, 512)
2021-05-27 16:55:11,803 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,803 INFO | (200, 512)
2021-05-27 16:55:11,809 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,810 INFO | (200, 512)
2021-05-27 16:55:11,811 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,812 INFO | (200, 512)
2021-05-27 16:55:11,819 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,819 INFO | (200, 512)
2021-05-27 16:55:11,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,820 INFO | (200, 512)
2021-05-27 16:55:11,826 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,827 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▋       | 151/574 [00:17<00:45,  9.24it/s]

2021-05-27 16:55:11,897 INFO | INITIAL
2021-05-27 16:55:11,898 INFO | (50, 200)
2021-05-27 16:55:11,904 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:11,904 INFO | (50, 200, 512)
2021-05-27 16:55:11,906 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:11,906 INFO | (50, 200, 512)
2021-05-27 16:55:11,907 INFO | BERT LAYER
2021-05-27 16:55:11,908 INFO | (200, 512)
2021-05-27 16:55:11,908 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,909 INFO | (200, 512)
2021-05-27 16:55:11,910 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,911 INFO | (200, 512)
2021-05-27 16:55:11,918 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,918 INFO | (200, 512)
2021-05-27 16:55:11,919 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,919 INFO | (200, 512)
2021-05-27 16:55:11,926 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,928 INFO | (200, 512)
2021-05-27 16:55:11,928 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:11,929 INFO | (200, 512)
2021-05-27 16:55:11,934 INFO | BERT LAYER LOOP
2021-05-27 16:55:11,934 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  26%|██▋       | 152/574 [00:17<00:46,  9.08it/s]

2021-05-27 16:55:12,013 INFO | INITIAL
2021-05-27 16:55:12,014 INFO | (50, 200)
2021-05-27 16:55:12,021 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,021 INFO | (50, 200, 512)
2021-05-27 16:55:12,023 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,023 INFO | (50, 200, 512)
2021-05-27 16:55:12,024 INFO | BERT LAYER
2021-05-27 16:55:12,024 INFO | (200, 512)
2021-05-27 16:55:12,025 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,025 INFO | (200, 512)
2021-05-27 16:55:12,025 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,026 INFO | (200, 512)
2021-05-27 16:55:12,035 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,036 INFO | (200, 512)
2021-05-27 16:55:12,036 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,036 INFO | (200, 512)
2021-05-27 16:55:12,042 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,043 INFO | (200, 512)
2021-05-27 16:55:12,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,044 INFO | (200, 512)
2021-05-27 16:55:12,050 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,051 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 153/574 [00:17<00:47,  8.83it/s]

2021-05-27 16:55:12,133 INFO | INITIAL
2021-05-27 16:55:12,133 INFO | (50, 200)
2021-05-27 16:55:12,139 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,139 INFO | (50, 200, 512)
2021-05-27 16:55:12,141 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,141 INFO | (50, 200, 512)
2021-05-27 16:55:12,142 INFO | BERT LAYER
2021-05-27 16:55:12,142 INFO | (200, 512)
2021-05-27 16:55:12,142 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,143 INFO | (200, 512)
2021-05-27 16:55:12,144 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,145 INFO | (200, 512)
2021-05-27 16:55:12,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,152 INFO | (200, 512)
2021-05-27 16:55:12,153 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,153 INFO | (200, 512)
2021-05-27 16:55:12,159 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,160 INFO | (200, 512)
2021-05-27 16:55:12,161 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,161 INFO | (200, 512)
2021-05-27 16:55:12,168 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,168 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 154/574 [00:17<00:47,  8.80it/s]

2021-05-27 16:55:12,248 INFO | INITIAL
2021-05-27 16:55:12,248 INFO | (50, 200)
2021-05-27 16:55:12,253 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,253 INFO | (50, 200, 512)
2021-05-27 16:55:12,255 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,255 INFO | (50, 200, 512)
2021-05-27 16:55:12,256 INFO | BERT LAYER
2021-05-27 16:55:12,256 INFO | (200, 512)
2021-05-27 16:55:12,257 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,257 INFO | (200, 512)
2021-05-27 16:55:12,258 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,259 INFO | (200, 512)
2021-05-27 16:55:12,264 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,264 INFO | (200, 512)
2021-05-27 16:55:12,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,265 INFO | (200, 512)
2021-05-27 16:55:12,272 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,272 INFO | (200, 512)
2021-05-27 16:55:12,273 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,273 INFO | (200, 512)
2021-05-27 16:55:12,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,281 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 155/574 [00:17<00:46,  9.00it/s]

2021-05-27 16:55:12,352 INFO | INITIAL
2021-05-27 16:55:12,353 INFO | (50, 200)
2021-05-27 16:55:12,358 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,358 INFO | (50, 200, 512)
2021-05-27 16:55:12,359 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,360 INFO | (50, 200, 512)
2021-05-27 16:55:12,361 INFO | BERT LAYER
2021-05-27 16:55:12,361 INFO | (200, 512)
2021-05-27 16:55:12,361 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,362 INFO | (200, 512)
2021-05-27 16:55:12,363 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,363 INFO | (200, 512)
2021-05-27 16:55:12,368 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,369 INFO | (200, 512)
2021-05-27 16:55:12,369 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,369 INFO | (200, 512)
2021-05-27 16:55:12,375 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,375 INFO | (200, 512)
2021-05-27 16:55:12,376 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,376 INFO | (200, 512)
2021-05-27 16:55:12,382 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,382 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 155/574 [00:17<00:46,  9.00it/s]

2021-05-27 16:55:12,449 INFO | INITIAL
2021-05-27 16:55:12,449 INFO | (50, 200)
2021-05-27 16:55:12,454 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,454 INFO | (50, 200, 512)
2021-05-27 16:55:12,456 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,456 INFO | (50, 200, 512)
2021-05-27 16:55:12,457 INFO | BERT LAYER
2021-05-27 16:55:12,457 INFO | (200, 512)
2021-05-27 16:55:12,458 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,458 INFO | (200, 512)
2021-05-27 16:55:12,459 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,459 INFO | (200, 512)
2021-05-27 16:55:12,466 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,467 INFO | (200, 512)
2021-05-27 16:55:12,467 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,467 INFO | (200, 512)
2021-05-27 16:55:12,473 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,474 INFO | (200, 512)
2021-05-27 16:55:12,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,475 INFO | (200, 512)
2021-05-27 16:55:12,482 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,483 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 157/574 [00:17<00:44,  9.40it/s]

2021-05-27 16:55:12,554 INFO | INITIAL
2021-05-27 16:55:12,555 INFO | (50, 200)
2021-05-27 16:55:12,560 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,561 INFO | (50, 200, 512)
2021-05-27 16:55:12,562 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,563 INFO | (50, 200, 512)
2021-05-27 16:55:12,564 INFO | BERT LAYER
2021-05-27 16:55:12,564 INFO | (200, 512)
2021-05-27 16:55:12,564 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,565 INFO | (200, 512)
2021-05-27 16:55:12,565 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,565 INFO | (200, 512)
2021-05-27 16:55:12,572 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,572 INFO | (200, 512)
2021-05-27 16:55:12,573 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,573 INFO | (200, 512)
2021-05-27 16:55:12,580 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,580 INFO | (200, 512)
2021-05-27 16:55:12,581 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,581 INFO | (200, 512)
2021-05-27 16:55:12,588 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,588 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 158/574 [00:17<00:44,  9.36it/s]

2021-05-27 16:55:12,663 INFO | INITIAL
2021-05-27 16:55:12,663 INFO | (50, 200)
2021-05-27 16:55:12,669 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,669 INFO | (50, 200, 512)
2021-05-27 16:55:12,671 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,671 INFO | (50, 200, 512)
2021-05-27 16:55:12,672 INFO | BERT LAYER
2021-05-27 16:55:12,673 INFO | (200, 512)
2021-05-27 16:55:12,673 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,674 INFO | (200, 512)
2021-05-27 16:55:12,674 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,675 INFO | (200, 512)
2021-05-27 16:55:12,683 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,683 INFO | (200, 512)
2021-05-27 16:55:12,684 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,684 INFO | (200, 512)
2021-05-27 16:55:12,689 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,690 INFO | (200, 512)
2021-05-27 16:55:12,690 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,691 INFO | (200, 512)
2021-05-27 16:55:12,698 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,698 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 159/574 [00:17<00:44,  9.28it/s]

2021-05-27 16:55:12,773 INFO | INITIAL
2021-05-27 16:55:12,775 INFO | (50, 200)
2021-05-27 16:55:12,782 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,782 INFO | (50, 200, 512)
2021-05-27 16:55:12,783 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,784 INFO | (50, 200, 512)
2021-05-27 16:55:12,785 INFO | BERT LAYER
2021-05-27 16:55:12,785 INFO | (200, 512)
2021-05-27 16:55:12,785 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,786 INFO | (200, 512)
2021-05-27 16:55:12,786 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,787 INFO | (200, 512)
2021-05-27 16:55:12,792 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,792 INFO | (200, 512)
2021-05-27 16:55:12,793 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,793 INFO | (200, 512)
2021-05-27 16:55:12,798 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,799 INFO | (200, 512)
2021-05-27 16:55:12,799 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,800 INFO | (200, 512)
2021-05-27 16:55:12,804 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,805 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 159/574 [00:18<00:44,  9.28it/s]

2021-05-27 16:55:12,869 INFO | INITIAL
2021-05-27 16:55:12,870 INFO | (50, 200)
2021-05-27 16:55:12,875 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,875 INFO | (50, 200, 512)
2021-05-27 16:55:12,877 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,878 INFO | (50, 200, 512)
2021-05-27 16:55:12,879 INFO | BERT LAYER
2021-05-27 16:55:12,879 INFO | (200, 512)
2021-05-27 16:55:12,880 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,880 INFO | (200, 512)
2021-05-27 16:55:12,881 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,881 INFO | (200, 512)
2021-05-27 16:55:12,887 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,887 INFO | (200, 512)
2021-05-27 16:55:12,888 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,888 INFO | (200, 512)
2021-05-27 16:55:12,893 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,894 INFO | (200, 512)
2021-05-27 16:55:12,894 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,895 INFO | (200, 512)
2021-05-27 16:55:12,900 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,900 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 161/574 [00:18<00:42,  9.73it/s]

2021-05-27 16:55:12,965 INFO | INITIAL
2021-05-27 16:55:12,965 INFO | (50, 200)
2021-05-27 16:55:12,970 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:12,971 INFO | (50, 200, 512)
2021-05-27 16:55:12,972 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:12,972 INFO | (50, 200, 512)
2021-05-27 16:55:12,973 INFO | BERT LAYER
2021-05-27 16:55:12,974 INFO | (200, 512)
2021-05-27 16:55:12,974 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,975 INFO | (200, 512)
2021-05-27 16:55:12,975 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,976 INFO | (200, 512)
2021-05-27 16:55:12,983 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,984 INFO | (200, 512)
2021-05-27 16:55:12,984 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,984 INFO | (200, 512)
2021-05-27 16:55:12,990 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,990 INFO | (200, 512)
2021-05-27 16:55:12,991 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:12,991 INFO | (200, 512)
2021-05-27 16:55:12,996 INFO | BERT LAYER LOOP
2021-05-27 16:55:12,997 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 162/574 [00:18<00:42,  9.73it/s]

2021-05-27 16:55:13,068 INFO | INITIAL
2021-05-27 16:55:13,068 INFO | (50, 200)
2021-05-27 16:55:13,074 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,075 INFO | (50, 200, 512)
2021-05-27 16:55:13,077 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,077 INFO | (50, 200, 512)
2021-05-27 16:55:13,078 INFO | BERT LAYER
2021-05-27 16:55:13,079 INFO | (200, 512)
2021-05-27 16:55:13,079 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,079 INFO | (200, 512)
2021-05-27 16:55:13,080 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,080 INFO | (200, 512)
2021-05-27 16:55:13,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,089 INFO | (200, 512)
2021-05-27 16:55:13,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,090 INFO | (200, 512)
2021-05-27 16:55:13,097 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,097 INFO | (200, 512)
2021-05-27 16:55:13,098 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,098 INFO | (200, 512)
2021-05-27 16:55:13,106 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,106 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 163/574 [00:18<00:43,  9.42it/s]

2021-05-27 16:55:13,184 INFO | INITIAL
2021-05-27 16:55:13,185 INFO | (50, 200)
2021-05-27 16:55:13,189 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,190 INFO | (50, 200, 512)
2021-05-27 16:55:13,191 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,192 INFO | (50, 200, 512)
2021-05-27 16:55:13,193 INFO | BERT LAYER
2021-05-27 16:55:13,193 INFO | (200, 512)
2021-05-27 16:55:13,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,194 INFO | (200, 512)
2021-05-27 16:55:13,195 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,195 INFO | (200, 512)
2021-05-27 16:55:13,202 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,203 INFO | (200, 512)
2021-05-27 16:55:13,203 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,204 INFO | (200, 512)
2021-05-27 16:55:13,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,211 INFO | (200, 512)
2021-05-27 16:55:13,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,213 INFO | (200, 512)
2021-05-27 16:55:13,219 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,219 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▊       | 164/574 [00:18<00:43,  9.42it/s]

2021-05-27 16:55:13,290 INFO | INITIAL
2021-05-27 16:55:13,290 INFO | (50, 200)
2021-05-27 16:55:13,298 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,298 INFO | (50, 200, 512)
2021-05-27 16:55:13,299 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,300 INFO | (50, 200, 512)
2021-05-27 16:55:13,300 INFO | BERT LAYER
2021-05-27 16:55:13,301 INFO | (200, 512)
2021-05-27 16:55:13,301 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,301 INFO | (200, 512)
2021-05-27 16:55:13,302 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,302 INFO | (200, 512)
2021-05-27 16:55:13,308 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,308 INFO | (200, 512)
2021-05-27 16:55:13,308 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,309 INFO | (200, 512)
2021-05-27 16:55:13,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,316 INFO | (200, 512)
2021-05-27 16:55:13,316 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,317 INFO | (200, 512)
2021-05-27 16:55:13,323 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,323 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▊       | 165/574 [00:18<00:45,  9.01it/s]

2021-05-27 16:55:13,414 INFO | INITIAL
2021-05-27 16:55:13,415 INFO | (50, 200)
2021-05-27 16:55:13,420 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,420 INFO | (50, 200, 512)
2021-05-27 16:55:13,421 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,422 INFO | (50, 200, 512)
2021-05-27 16:55:13,423 INFO | BERT LAYER
2021-05-27 16:55:13,423 INFO | (200, 512)
2021-05-27 16:55:13,423 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,424 INFO | (200, 512)
2021-05-27 16:55:13,424 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,425 INFO | (200, 512)
2021-05-27 16:55:13,431 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,431 INFO | (200, 512)
2021-05-27 16:55:13,432 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,432 INFO | (200, 512)
2021-05-27 16:55:13,439 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,440 INFO | (200, 512)
2021-05-27 16:55:13,440 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,441 INFO | (200, 512)
2021-05-27 16:55:13,447 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,447 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 166/574 [00:18<00:44,  9.14it/s]

2021-05-27 16:55:13,520 INFO | INITIAL
2021-05-27 16:55:13,521 INFO | (50, 200)
2021-05-27 16:55:13,526 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,527 INFO | (50, 200, 512)
2021-05-27 16:55:13,528 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,528 INFO | (50, 200, 512)
2021-05-27 16:55:13,529 INFO | BERT LAYER
2021-05-27 16:55:13,529 INFO | (200, 512)
2021-05-27 16:55:13,530 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,530 INFO | (200, 512)
2021-05-27 16:55:13,531 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,531 INFO | (200, 512)
2021-05-27 16:55:13,536 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,537 INFO | (200, 512)
2021-05-27 16:55:13,537 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,537 INFO | (200, 512)
2021-05-27 16:55:13,543 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,544 INFO | (200, 512)
2021-05-27 16:55:13,544 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,545 INFO | (200, 512)
2021-05-27 16:55:13,551 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,552 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 167/574 [00:18<00:44,  9.13it/s]

2021-05-27 16:55:13,629 INFO | INITIAL
2021-05-27 16:55:13,630 INFO | (50, 200)
2021-05-27 16:55:13,636 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,636 INFO | (50, 200, 512)
2021-05-27 16:55:13,638 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,638 INFO | (50, 200, 512)
2021-05-27 16:55:13,639 INFO | BERT LAYER
2021-05-27 16:55:13,640 INFO | (200, 512)
2021-05-27 16:55:13,640 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,641 INFO | (200, 512)
2021-05-27 16:55:13,642 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,642 INFO | (200, 512)
2021-05-27 16:55:13,648 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,649 INFO | (200, 512)
2021-05-27 16:55:13,649 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,650 INFO | (200, 512)
2021-05-27 16:55:13,656 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,656 INFO | (200, 512)
2021-05-27 16:55:13,656 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,657 INFO | (200, 512)
2021-05-27 16:55:13,663 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,663 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 168/574 [00:18<00:43,  9.24it/s]

2021-05-27 16:55:13,734 INFO | INITIAL
2021-05-27 16:55:13,735 INFO | (50, 200)
2021-05-27 16:55:13,740 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,741 INFO | (50, 200, 512)
2021-05-27 16:55:13,742 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,743 INFO | (50, 200, 512)
2021-05-27 16:55:13,744 INFO | BERT LAYER
2021-05-27 16:55:13,745 INFO | (200, 512)
2021-05-27 16:55:13,746 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,747 INFO | (200, 512)
2021-05-27 16:55:13,748 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,748 INFO | (200, 512)
2021-05-27 16:55:13,754 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,755 INFO | (200, 512)
2021-05-27 16:55:13,755 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,756 INFO | (200, 512)
2021-05-27 16:55:13,762 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,763 INFO | (200, 512)
2021-05-27 16:55:13,764 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,764 INFO | (200, 512)
2021-05-27 16:55:13,770 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,771 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 169/574 [00:19<00:43,  9.21it/s]

2021-05-27 16:55:13,844 INFO | INITIAL
2021-05-27 16:55:13,845 INFO | (50, 200)
2021-05-27 16:55:13,850 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,850 INFO | (50, 200, 512)
2021-05-27 16:55:13,852 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,852 INFO | (50, 200, 512)
2021-05-27 16:55:13,854 INFO | BERT LAYER
2021-05-27 16:55:13,854 INFO | (200, 512)
2021-05-27 16:55:13,854 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,855 INFO | (200, 512)
2021-05-27 16:55:13,855 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,856 INFO | (200, 512)
2021-05-27 16:55:13,864 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,865 INFO | (200, 512)
2021-05-27 16:55:13,866 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,866 INFO | (200, 512)
2021-05-27 16:55:13,871 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,872 INFO | (200, 512)
2021-05-27 16:55:13,872 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,872 INFO | (200, 512)
2021-05-27 16:55:13,879 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,880 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|██▉       | 170/574 [00:19<00:43,  9.23it/s]

2021-05-27 16:55:13,951 INFO | INITIAL
2021-05-27 16:55:13,952 INFO | (50, 200)
2021-05-27 16:55:13,960 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:13,961 INFO | (50, 200, 512)
2021-05-27 16:55:13,962 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:13,963 INFO | (50, 200, 512)
2021-05-27 16:55:13,964 INFO | BERT LAYER
2021-05-27 16:55:13,965 INFO | (200, 512)
2021-05-27 16:55:13,965 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,965 INFO | (200, 512)
2021-05-27 16:55:13,966 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,966 INFO | (200, 512)
2021-05-27 16:55:13,972 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,973 INFO | (200, 512)
2021-05-27 16:55:13,974 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,974 INFO | (200, 512)
2021-05-27 16:55:13,982 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,983 INFO | (200, 512)
2021-05-27 16:55:13,983 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:13,983 INFO | (200, 512)
2021-05-27 16:55:13,989 INFO | BERT LAYER LOOP
2021-05-27 16:55:13,990 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|██▉       | 171/574 [00:19<00:44,  9.12it/s]

2021-05-27 16:55:14,064 INFO | INITIAL
2021-05-27 16:55:14,065 INFO | (50, 200)
2021-05-27 16:55:14,070 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,070 INFO | (50, 200, 512)
2021-05-27 16:55:14,072 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,072 INFO | (50, 200, 512)
2021-05-27 16:55:14,073 INFO | BERT LAYER
2021-05-27 16:55:14,073 INFO | (200, 512)
2021-05-27 16:55:14,074 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,074 INFO | (200, 512)
2021-05-27 16:55:14,075 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,076 INFO | (200, 512)
2021-05-27 16:55:14,084 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,085 INFO | (200, 512)
2021-05-27 16:55:14,085 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,085 INFO | (200, 512)
2021-05-27 16:55:14,092 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,092 INFO | (200, 512)
2021-05-27 16:55:14,093 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,094 INFO | (200, 512)
2021-05-27 16:55:14,100 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,101 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|██▉       | 172/574 [00:19<00:44,  9.09it/s]

2021-05-27 16:55:14,175 INFO | INITIAL
2021-05-27 16:55:14,176 INFO | (50, 200)
2021-05-27 16:55:14,181 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,182 INFO | (50, 200, 512)
2021-05-27 16:55:14,183 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,184 INFO | (50, 200, 512)
2021-05-27 16:55:14,185 INFO | BERT LAYER
2021-05-27 16:55:14,185 INFO | (200, 512)
2021-05-27 16:55:14,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,186 INFO | (200, 512)
2021-05-27 16:55:14,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,186 INFO | (200, 512)
2021-05-27 16:55:14,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,193 INFO | (200, 512)
2021-05-27 16:55:14,194 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,194 INFO | (200, 512)
2021-05-27 16:55:14,200 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,201 INFO | (200, 512)
2021-05-27 16:55:14,202 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,202 INFO | (200, 512)
2021-05-27 16:55:14,207 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,208 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|███       | 173/574 [00:19<00:43,  9.22it/s]

2021-05-27 16:55:14,280 INFO | INITIAL
2021-05-27 16:55:14,280 INFO | (50, 200)
2021-05-27 16:55:14,287 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,288 INFO | (50, 200, 512)
2021-05-27 16:55:14,289 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,289 INFO | (50, 200, 512)
2021-05-27 16:55:14,290 INFO | BERT LAYER
2021-05-27 16:55:14,291 INFO | (200, 512)
2021-05-27 16:55:14,292 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,292 INFO | (200, 512)
2021-05-27 16:55:14,293 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,293 INFO | (200, 512)
2021-05-27 16:55:14,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,300 INFO | (200, 512)
2021-05-27 16:55:14,301 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,301 INFO | (200, 512)
2021-05-27 16:55:14,307 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,308 INFO | (200, 512)
2021-05-27 16:55:14,308 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,309 INFO | (200, 512)
2021-05-27 16:55:14,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,315 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|███       | 174/574 [00:19<00:42,  9.32it/s]

2021-05-27 16:55:14,384 INFO | INITIAL
2021-05-27 16:55:14,385 INFO | (50, 200)
2021-05-27 16:55:14,390 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,390 INFO | (50, 200, 512)
2021-05-27 16:55:14,391 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,392 INFO | (50, 200, 512)
2021-05-27 16:55:14,393 INFO | BERT LAYER
2021-05-27 16:55:14,393 INFO | (200, 512)
2021-05-27 16:55:14,393 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,394 INFO | (200, 512)
2021-05-27 16:55:14,394 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,394 INFO | (200, 512)
2021-05-27 16:55:14,402 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,403 INFO | (200, 512)
2021-05-27 16:55:14,403 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,403 INFO | (200, 512)
2021-05-27 16:55:14,409 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,409 INFO | (200, 512)
2021-05-27 16:55:14,410 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,411 INFO | (200, 512)
2021-05-27 16:55:14,418 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,419 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  30%|███       | 175/574 [00:19<00:42,  9.33it/s]

2021-05-27 16:55:14,491 INFO | INITIAL
2021-05-27 16:55:14,493 INFO | (50, 200)
2021-05-27 16:55:14,500 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,500 INFO | (50, 200, 512)
2021-05-27 16:55:14,502 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,502 INFO | (50, 200, 512)
2021-05-27 16:55:14,503 INFO | BERT LAYER
2021-05-27 16:55:14,504 INFO | (200, 512)
2021-05-27 16:55:14,504 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,504 INFO | (200, 512)
2021-05-27 16:55:14,505 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,505 INFO | (200, 512)
2021-05-27 16:55:14,511 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,512 INFO | (200, 512)
2021-05-27 16:55:14,513 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,514 INFO | (200, 512)
2021-05-27 16:55:14,521 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,521 INFO | (200, 512)
2021-05-27 16:55:14,522 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,522 INFO | (200, 512)
2021-05-27 16:55:14,528 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,529 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  31%|███       | 176/574 [00:19<00:42,  9.33it/s]

2021-05-27 16:55:14,598 INFO | INITIAL
2021-05-27 16:55:14,599 INFO | (50, 200)
2021-05-27 16:55:14,604 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,604 INFO | (50, 200, 512)
2021-05-27 16:55:14,605 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,606 INFO | (50, 200, 512)
2021-05-27 16:55:14,606 INFO | BERT LAYER
2021-05-27 16:55:14,607 INFO | (200, 512)
2021-05-27 16:55:14,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,607 INFO | (200, 512)
2021-05-27 16:55:14,608 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,609 INFO | (200, 512)
2021-05-27 16:55:14,614 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,615 INFO | (200, 512)
2021-05-27 16:55:14,615 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,616 INFO | (200, 512)
2021-05-27 16:55:14,623 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,624 INFO | (200, 512)
2021-05-27 16:55:14,625 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,626 INFO | (200, 512)
2021-05-27 16:55:14,633 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,633 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  31%|███       | 177/574 [00:19<00:42,  9.38it/s]

2021-05-27 16:55:14,704 INFO | INITIAL
2021-05-27 16:55:14,704 INFO | (50, 200)
2021-05-27 16:55:14,710 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,710 INFO | (50, 200, 512)
2021-05-27 16:55:14,712 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,712 INFO | (50, 200, 512)
2021-05-27 16:55:14,713 INFO | BERT LAYER
2021-05-27 16:55:14,713 INFO | (200, 512)
2021-05-27 16:55:14,713 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,714 INFO | (200, 512)
2021-05-27 16:55:14,714 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,715 INFO | (200, 512)
2021-05-27 16:55:14,721 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,722 INFO | (200, 512)
2021-05-27 16:55:14,722 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,722 INFO | (200, 512)
2021-05-27 16:55:14,730 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,730 INFO | (200, 512)
2021-05-27 16:55:14,731 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,731 INFO | (200, 512)
2021-05-27 16:55:14,738 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,738 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  31%|███       | 178/574 [00:20<00:44,  8.89it/s]

2021-05-27 16:55:14,830 INFO | INITIAL
2021-05-27 16:55:14,831 INFO | (50, 200)
2021-05-27 16:55:14,836 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,837 INFO | (50, 200, 512)
2021-05-27 16:55:14,838 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,839 INFO | (50, 200, 512)
2021-05-27 16:55:14,841 INFO | BERT LAYER
2021-05-27 16:55:14,841 INFO | (200, 512)
2021-05-27 16:55:14,842 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,842 INFO | (200, 512)
2021-05-27 16:55:14,842 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,843 INFO | (200, 512)
2021-05-27 16:55:14,848 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,849 INFO | (200, 512)
2021-05-27 16:55:14,849 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,849 INFO | (200, 512)
2021-05-27 16:55:14,854 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,855 INFO | (200, 512)
2021-05-27 16:55:14,855 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,855 INFO | (200, 512)
2021-05-27 16:55:14,861 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,861 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  31%|███       | 179/574 [00:20<00:43,  9.17it/s]

2021-05-27 16:55:14,931 INFO | INITIAL
2021-05-27 16:55:14,932 INFO | (50, 200)
2021-05-27 16:55:14,938 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:14,938 INFO | (50, 200, 512)
2021-05-27 16:55:14,939 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:14,940 INFO | (50, 200, 512)
2021-05-27 16:55:14,940 INFO | BERT LAYER
2021-05-27 16:55:14,941 INFO | (200, 512)
2021-05-27 16:55:14,941 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,942 INFO | (200, 512)
2021-05-27 16:55:14,943 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,943 INFO | (200, 512)
2021-05-27 16:55:14,950 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,951 INFO | (200, 512)
2021-05-27 16:55:14,951 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,952 INFO | (200, 512)
2021-05-27 16:55:14,959 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,960 INFO | (200, 512)
2021-05-27 16:55:14,960 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:14,961 INFO | (200, 512)
2021-05-27 16:55:14,967 INFO | BERT LAYER LOOP
2021-05-27 16:55:14,968 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  31%|███▏      | 180/574 [00:20<00:42,  9.23it/s]

2021-05-27 16:55:15,038 INFO | INITIAL
2021-05-27 16:55:15,038 INFO | (50, 200)
2021-05-27 16:55:15,044 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,044 INFO | (50, 200, 512)
2021-05-27 16:55:15,046 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,046 INFO | (50, 200, 512)
2021-05-27 16:55:15,047 INFO | BERT LAYER
2021-05-27 16:55:15,048 INFO | (200, 512)
2021-05-27 16:55:15,048 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,049 INFO | (200, 512)
2021-05-27 16:55:15,049 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,050 INFO | (200, 512)
2021-05-27 16:55:15,056 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,057 INFO | (200, 512)
2021-05-27 16:55:15,057 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,058 INFO | (200, 512)
2021-05-27 16:55:15,063 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,064 INFO | (200, 512)
2021-05-27 16:55:15,064 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,065 INFO | (200, 512)
2021-05-27 16:55:15,070 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,070 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 181/574 [00:20<00:41,  9.42it/s]

2021-05-27 16:55:15,138 INFO | INITIAL
2021-05-27 16:55:15,139 INFO | (50, 200)
2021-05-27 16:55:15,146 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,146 INFO | (50, 200, 512)
2021-05-27 16:55:15,148 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,148 INFO | (50, 200, 512)
2021-05-27 16:55:15,149 INFO | BERT LAYER
2021-05-27 16:55:15,149 INFO | (200, 512)
2021-05-27 16:55:15,149 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,150 INFO | (200, 512)
2021-05-27 16:55:15,151 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,151 INFO | (200, 512)
2021-05-27 16:55:15,159 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,160 INFO | (200, 512)
2021-05-27 16:55:15,160 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,161 INFO | (200, 512)
2021-05-27 16:55:15,168 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,169 INFO | (200, 512)
2021-05-27 16:55:15,169 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,170 INFO | (200, 512)
2021-05-27 16:55:15,176 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,176 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 182/574 [00:20<00:41,  9.36it/s]

2021-05-27 16:55:15,248 INFO | INITIAL
2021-05-27 16:55:15,248 INFO | (50, 200)
2021-05-27 16:55:15,253 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,253 INFO | (50, 200, 512)
2021-05-27 16:55:15,254 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,255 INFO | (50, 200, 512)
2021-05-27 16:55:15,256 INFO | BERT LAYER
2021-05-27 16:55:15,256 INFO | (200, 512)
2021-05-27 16:55:15,257 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,257 INFO | (200, 512)
2021-05-27 16:55:15,258 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,259 INFO | (200, 512)
2021-05-27 16:55:15,264 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,265 INFO | (200, 512)
2021-05-27 16:55:15,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,265 INFO | (200, 512)
2021-05-27 16:55:15,271 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,272 INFO | (200, 512)
2021-05-27 16:55:15,272 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,272 INFO | (200, 512)
2021-05-27 16:55:15,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,280 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 183/574 [00:20<00:41,  9.51it/s]

2021-05-27 16:55:15,349 INFO | INITIAL
2021-05-27 16:55:15,350 INFO | (50, 200)
2021-05-27 16:55:15,355 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,356 INFO | (50, 200, 512)
2021-05-27 16:55:15,357 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,358 INFO | (50, 200, 512)
2021-05-27 16:55:15,359 INFO | BERT LAYER
2021-05-27 16:55:15,359 INFO | (200, 512)
2021-05-27 16:55:15,359 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,360 INFO | (200, 512)
2021-05-27 16:55:15,361 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,361 INFO | (200, 512)
2021-05-27 16:55:15,366 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,367 INFO | (200, 512)
2021-05-27 16:55:15,367 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,368 INFO | (200, 512)
2021-05-27 16:55:15,373 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,375 INFO | (200, 512)
2021-05-27 16:55:15,375 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,376 INFO | (200, 512)
2021-05-27 16:55:15,383 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,383 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 184/574 [00:20<00:40,  9.58it/s]

2021-05-27 16:55:15,451 INFO | INITIAL
2021-05-27 16:55:15,452 INFO | (50, 200)
2021-05-27 16:55:15,456 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,457 INFO | (50, 200, 512)
2021-05-27 16:55:15,458 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,459 INFO | (50, 200, 512)
2021-05-27 16:55:15,459 INFO | BERT LAYER
2021-05-27 16:55:15,460 INFO | (200, 512)
2021-05-27 16:55:15,460 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,460 INFO | (200, 512)
2021-05-27 16:55:15,461 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,461 INFO | (200, 512)
2021-05-27 16:55:15,468 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,468 INFO | (200, 512)
2021-05-27 16:55:15,469 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,469 INFO | (200, 512)
2021-05-27 16:55:15,476 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,477 INFO | (200, 512)
2021-05-27 16:55:15,477 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,477 INFO | (200, 512)
2021-05-27 16:55:15,485 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,486 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 185/574 [00:20<00:40,  9.49it/s]

2021-05-27 16:55:15,559 INFO | INITIAL
2021-05-27 16:55:15,560 INFO | (50, 200)
2021-05-27 16:55:15,565 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,565 INFO | (50, 200, 512)
2021-05-27 16:55:15,567 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,567 INFO | (50, 200, 512)
2021-05-27 16:55:15,568 INFO | BERT LAYER
2021-05-27 16:55:15,569 INFO | (200, 512)
2021-05-27 16:55:15,569 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,569 INFO | (200, 512)
2021-05-27 16:55:15,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,570 INFO | (200, 512)
2021-05-27 16:55:15,576 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,576 INFO | (200, 512)
2021-05-27 16:55:15,577 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,577 INFO | (200, 512)
2021-05-27 16:55:15,585 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,587 INFO | (200, 512)
2021-05-27 16:55:15,588 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,588 INFO | (200, 512)
2021-05-27 16:55:15,595 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,596 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  32%|███▏      | 186/574 [00:20<00:40,  9.47it/s]

2021-05-27 16:55:15,665 INFO | INITIAL
2021-05-27 16:55:15,665 INFO | (50, 200)
2021-05-27 16:55:15,671 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,671 INFO | (50, 200, 512)
2021-05-27 16:55:15,672 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,673 INFO | (50, 200, 512)
2021-05-27 16:55:15,674 INFO | BERT LAYER
2021-05-27 16:55:15,674 INFO | (200, 512)
2021-05-27 16:55:15,674 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,675 INFO | (200, 512)
2021-05-27 16:55:15,676 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,676 INFO | (200, 512)
2021-05-27 16:55:15,683 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,683 INFO | (200, 512)
2021-05-27 16:55:15,684 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,684 INFO | (200, 512)
2021-05-27 16:55:15,690 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,690 INFO | (200, 512)
2021-05-27 16:55:15,691 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,692 INFO | (200, 512)
2021-05-27 16:55:15,697 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,698 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 187/574 [00:20<00:40,  9.62it/s]

2021-05-27 16:55:15,765 INFO | INITIAL
2021-05-27 16:55:15,766 INFO | (50, 200)
2021-05-27 16:55:15,773 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,773 INFO | (50, 200, 512)
2021-05-27 16:55:15,774 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,775 INFO | (50, 200, 512)
2021-05-27 16:55:15,777 INFO | BERT LAYER
2021-05-27 16:55:15,777 INFO | (200, 512)
2021-05-27 16:55:15,777 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,778 INFO | (200, 512)
2021-05-27 16:55:15,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,778 INFO | (200, 512)
2021-05-27 16:55:15,784 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,784 INFO | (200, 512)
2021-05-27 16:55:15,784 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,785 INFO | (200, 512)
2021-05-27 16:55:15,790 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,790 INFO | (200, 512)
2021-05-27 16:55:15,791 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,792 INFO | (200, 512)
2021-05-27 16:55:15,797 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,797 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 188/574 [00:21<00:40,  9.59it/s]

2021-05-27 16:55:15,870 INFO | INITIAL
2021-05-27 16:55:15,870 INFO | (50, 200)
2021-05-27 16:55:15,875 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,876 INFO | (50, 200, 512)
2021-05-27 16:55:15,877 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,878 INFO | (50, 200, 512)
2021-05-27 16:55:15,879 INFO | BERT LAYER
2021-05-27 16:55:15,879 INFO | (200, 512)
2021-05-27 16:55:15,879 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,880 INFO | (200, 512)
2021-05-27 16:55:15,881 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,881 INFO | (200, 512)
2021-05-27 16:55:15,887 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,887 INFO | (200, 512)
2021-05-27 16:55:15,888 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,888 INFO | (200, 512)
2021-05-27 16:55:15,894 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,894 INFO | (200, 512)
2021-05-27 16:55:15,894 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,895 INFO | (200, 512)
2021-05-27 16:55:15,900 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,900 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 188/574 [00:21<00:40,  9.59it/s]

2021-05-27 16:55:15,967 INFO | INITIAL
2021-05-27 16:55:15,968 INFO | (50, 200)
2021-05-27 16:55:15,973 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:15,973 INFO | (50, 200, 512)
2021-05-27 16:55:15,974 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:15,975 INFO | (50, 200, 512)
2021-05-27 16:55:15,976 INFO | BERT LAYER
2021-05-27 16:55:15,976 INFO | (200, 512)
2021-05-27 16:55:15,977 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,977 INFO | (200, 512)
2021-05-27 16:55:15,978 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,978 INFO | (200, 512)
2021-05-27 16:55:15,985 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,986 INFO | (200, 512)
2021-05-27 16:55:15,986 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,987 INFO | (200, 512)
2021-05-27 16:55:15,993 INFO | BERT LAYER LOOP
2021-05-27 16:55:15,993 INFO | (200, 512)
2021-05-27 16:55:15,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:15,994 INFO | (200, 512)
2021-05-27 16:55:16,001 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,001 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 190/574 [00:21<00:39,  9.76it/s]

2021-05-27 16:55:16,071 INFO | INITIAL
2021-05-27 16:55:16,071 INFO | (50, 200)
2021-05-27 16:55:16,076 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,077 INFO | (50, 200, 512)
2021-05-27 16:55:16,078 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,079 INFO | (50, 200, 512)
2021-05-27 16:55:16,080 INFO | BERT LAYER
2021-05-27 16:55:16,080 INFO | (200, 512)
2021-05-27 16:55:16,081 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,082 INFO | (200, 512)
2021-05-27 16:55:16,082 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,083 INFO | (200, 512)
2021-05-27 16:55:16,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,088 INFO | (200, 512)
2021-05-27 16:55:16,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,089 INFO | (200, 512)
2021-05-27 16:55:16,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,095 INFO | (200, 512)
2021-05-27 16:55:16,095 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,096 INFO | (200, 512)
2021-05-27 16:55:16,102 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,102 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 191/574 [00:21<00:39,  9.73it/s]

2021-05-27 16:55:16,175 INFO | INITIAL
2021-05-27 16:55:16,176 INFO | (50, 200)
2021-05-27 16:55:16,182 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,182 INFO | (50, 200, 512)
2021-05-27 16:55:16,184 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,184 INFO | (50, 200, 512)
2021-05-27 16:55:16,185 INFO | BERT LAYER
2021-05-27 16:55:16,185 INFO | (200, 512)
2021-05-27 16:55:16,186 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,186 INFO | (200, 512)
2021-05-27 16:55:16,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,187 INFO | (200, 512)
2021-05-27 16:55:16,194 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,195 INFO | (200, 512)
2021-05-27 16:55:16,195 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,196 INFO | (200, 512)
2021-05-27 16:55:16,201 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,201 INFO | (200, 512)
2021-05-27 16:55:16,202 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,202 INFO | (200, 512)
2021-05-27 16:55:16,208 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,208 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  33%|███▎      | 192/574 [00:21<00:40,  9.47it/s]

2021-05-27 16:55:16,288 INFO | INITIAL
2021-05-27 16:55:16,288 INFO | (50, 200)
2021-05-27 16:55:16,295 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,296 INFO | (50, 200, 512)
2021-05-27 16:55:16,297 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,298 INFO | (50, 200, 512)
2021-05-27 16:55:16,299 INFO | BERT LAYER
2021-05-27 16:55:16,300 INFO | (200, 512)
2021-05-27 16:55:16,300 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,300 INFO | (200, 512)
2021-05-27 16:55:16,301 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,302 INFO | (200, 512)
2021-05-27 16:55:16,307 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,309 INFO | (200, 512)
2021-05-27 16:55:16,309 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,310 INFO | (200, 512)
2021-05-27 16:55:16,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,316 INFO | (200, 512)
2021-05-27 16:55:16,316 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,317 INFO | (200, 512)
2021-05-27 16:55:16,323 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,323 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▎      | 193/574 [00:21<00:40,  9.35it/s]

2021-05-27 16:55:16,398 INFO | INITIAL
2021-05-27 16:55:16,399 INFO | (50, 200)
2021-05-27 16:55:16,404 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,404 INFO | (50, 200, 512)
2021-05-27 16:55:16,405 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,405 INFO | (50, 200, 512)
2021-05-27 16:55:16,406 INFO | BERT LAYER
2021-05-27 16:55:16,406 INFO | (200, 512)
2021-05-27 16:55:16,407 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,407 INFO | (200, 512)
2021-05-27 16:55:16,408 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,409 INFO | (200, 512)
2021-05-27 16:55:16,416 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,417 INFO | (200, 512)
2021-05-27 16:55:16,418 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,418 INFO | (200, 512)
2021-05-27 16:55:16,425 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,426 INFO | (200, 512)
2021-05-27 16:55:16,426 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,427 INFO | (200, 512)
2021-05-27 16:55:16,433 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,433 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▍      | 194/574 [00:21<00:40,  9.40it/s]

2021-05-27 16:55:16,504 INFO | INITIAL
2021-05-27 16:55:16,504 INFO | (50, 200)
2021-05-27 16:55:16,510 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,510 INFO | (50, 200, 512)
2021-05-27 16:55:16,512 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,513 INFO | (50, 200, 512)
2021-05-27 16:55:16,514 INFO | BERT LAYER
2021-05-27 16:55:16,514 INFO | (200, 512)
2021-05-27 16:55:16,515 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,516 INFO | (200, 512)
2021-05-27 16:55:16,517 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,517 INFO | (200, 512)
2021-05-27 16:55:16,522 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,523 INFO | (200, 512)
2021-05-27 16:55:16,523 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,524 INFO | (200, 512)
2021-05-27 16:55:16,531 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,531 INFO | (200, 512)
2021-05-27 16:55:16,532 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,532 INFO | (200, 512)
2021-05-27 16:55:16,537 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,537 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▍      | 195/574 [00:21<00:40,  9.40it/s]

2021-05-27 16:55:16,610 INFO | INITIAL
2021-05-27 16:55:16,611 INFO | (50, 200)
2021-05-27 16:55:16,617 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,618 INFO | (50, 200, 512)
2021-05-27 16:55:16,620 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,621 INFO | (50, 200, 512)
2021-05-27 16:55:16,622 INFO | BERT LAYER
2021-05-27 16:55:16,622 INFO | (200, 512)
2021-05-27 16:55:16,622 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,622 INFO | (200, 512)
2021-05-27 16:55:16,623 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,623 INFO | (200, 512)
2021-05-27 16:55:16,631 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,631 INFO | (200, 512)
2021-05-27 16:55:16,632 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,632 INFO | (200, 512)
2021-05-27 16:55:16,639 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,639 INFO | (200, 512)
2021-05-27 16:55:16,640 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,640 INFO | (200, 512)
2021-05-27 16:55:16,648 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,648 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▍      | 196/574 [00:21<00:39,  9.45it/s]

2021-05-27 16:55:16,715 INFO | INITIAL
2021-05-27 16:55:16,715 INFO | (50, 200)
2021-05-27 16:55:16,720 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,721 INFO | (50, 200, 512)
2021-05-27 16:55:16,722 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,722 INFO | (50, 200, 512)
2021-05-27 16:55:16,723 INFO | BERT LAYER
2021-05-27 16:55:16,723 INFO | (200, 512)
2021-05-27 16:55:16,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,725 INFO | (200, 512)
2021-05-27 16:55:16,725 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,726 INFO | (200, 512)
2021-05-27 16:55:16,731 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,732 INFO | (200, 512)
2021-05-27 16:55:16,733 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,733 INFO | (200, 512)
2021-05-27 16:55:16,738 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,738 INFO | (200, 512)
2021-05-27 16:55:16,739 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,739 INFO | (200, 512)
2021-05-27 16:55:16,746 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,747 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▍      | 197/574 [00:22<00:39,  9.44it/s]

2021-05-27 16:55:16,821 INFO | INITIAL
2021-05-27 16:55:16,821 INFO | (50, 200)
2021-05-27 16:55:16,828 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,829 INFO | (50, 200, 512)
2021-05-27 16:55:16,830 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,831 INFO | (50, 200, 512)
2021-05-27 16:55:16,832 INFO | BERT LAYER
2021-05-27 16:55:16,832 INFO | (200, 512)
2021-05-27 16:55:16,833 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,834 INFO | (200, 512)
2021-05-27 16:55:16,834 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,835 INFO | (200, 512)
2021-05-27 16:55:16,841 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,842 INFO | (200, 512)
2021-05-27 16:55:16,842 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,843 INFO | (200, 512)
2021-05-27 16:55:16,849 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,849 INFO | (200, 512)
2021-05-27 16:55:16,850 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,850 INFO | (200, 512)
2021-05-27 16:55:16,855 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,855 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  34%|███▍      | 198/574 [00:22<00:40,  9.36it/s]

2021-05-27 16:55:16,929 INFO | INITIAL
2021-05-27 16:55:16,930 INFO | (50, 200)
2021-05-27 16:55:16,935 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:16,935 INFO | (50, 200, 512)
2021-05-27 16:55:16,937 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:16,937 INFO | (50, 200, 512)
2021-05-27 16:55:16,938 INFO | BERT LAYER
2021-05-27 16:55:16,938 INFO | (200, 512)
2021-05-27 16:55:16,938 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,939 INFO | (200, 512)
2021-05-27 16:55:16,939 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,939 INFO | (200, 512)
2021-05-27 16:55:16,946 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,947 INFO | (200, 512)
2021-05-27 16:55:16,947 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,948 INFO | (200, 512)
2021-05-27 16:55:16,953 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,954 INFO | (200, 512)
2021-05-27 16:55:16,954 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:16,955 INFO | (200, 512)
2021-05-27 16:55:16,961 INFO | BERT LAYER LOOP
2021-05-27 16:55:16,962 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  35%|███▍      | 199/574 [00:22<00:39,  9.51it/s]

2021-05-27 16:55:17,030 INFO | INITIAL
2021-05-27 16:55:17,031 INFO | (50, 200)
2021-05-27 16:55:17,036 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,037 INFO | (50, 200, 512)
2021-05-27 16:55:17,038 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,038 INFO | (50, 200, 512)
2021-05-27 16:55:17,039 INFO | BERT LAYER
2021-05-27 16:55:17,040 INFO | (200, 512)
2021-05-27 16:55:17,041 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,042 INFO | (200, 512)
2021-05-27 16:55:17,042 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,043 INFO | (200, 512)
2021-05-27 16:55:17,049 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,050 INFO | (200, 512)
2021-05-27 16:55:17,050 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,051 INFO | (200, 512)
2021-05-27 16:55:17,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,058 INFO | (200, 512)
2021-05-27 16:55:17,059 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,060 INFO | (200, 512)
2021-05-27 16:55:17,067 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,067 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  35%|███▍      | 200/574 [00:22<00:39,  9.51it/s]

2021-05-27 16:55:17,136 INFO | INITIAL
2021-05-27 16:55:17,136 INFO | (50, 200)
2021-05-27 16:55:17,141 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,141 INFO | (50, 200, 512)
2021-05-27 16:55:17,143 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,143 INFO | (50, 200, 512)
2021-05-27 16:55:17,144 INFO | BERT LAYER
2021-05-27 16:55:17,145 INFO | (200, 512)
2021-05-27 16:55:17,145 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,146 INFO | (200, 512)
2021-05-27 16:55:17,146 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,147 INFO | (200, 512)
2021-05-27 16:55:17,153 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,153 INFO | (200, 512)
2021-05-27 16:55:17,154 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,154 INFO | (200, 512)
2021-05-27 16:55:17,160 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,160 INFO | (200, 512)
2021-05-27 16:55:17,161 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,161 INFO | (200, 512)
2021-05-27 16:55:17,166 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,166 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  35%|███▍      | 200/574 [00:22<00:39,  9.51it/s]

2021-05-27 16:55:17,231 INFO | INITIAL
2021-05-27 16:55:17,231 INFO | (50, 200)
2021-05-27 16:55:17,236 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,237 INFO | (50, 200, 512)
2021-05-27 16:55:17,238 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,239 INFO | (50, 200, 512)
2021-05-27 16:55:17,239 INFO | BERT LAYER
2021-05-27 16:55:17,240 INFO | (200, 512)
2021-05-27 16:55:17,240 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,240 INFO | (200, 512)
2021-05-27 16:55:17,241 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,242 INFO | (200, 512)
2021-05-27 16:55:17,248 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,249 INFO | (200, 512)
2021-05-27 16:55:17,249 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,250 INFO | (200, 512)
2021-05-27 16:55:17,255 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,255 INFO | (200, 512)
2021-05-27 16:55:17,256 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,256 INFO | (200, 512)
2021-05-27 16:55:17,263 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,264 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  35%|███▌      | 202/574 [00:22<00:38,  9.75it/s]

2021-05-27 16:55:17,335 INFO | INITIAL
2021-05-27 16:55:17,335 INFO | (50, 200)
2021-05-27 16:55:17,341 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,342 INFO | (50, 200, 512)
2021-05-27 16:55:17,343 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,344 INFO | (50, 200, 512)
2021-05-27 16:55:17,345 INFO | BERT LAYER
2021-05-27 16:55:17,345 INFO | (200, 512)
2021-05-27 16:55:17,345 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,346 INFO | (200, 512)
2021-05-27 16:55:17,346 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,346 INFO | (200, 512)
2021-05-27 16:55:17,353 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,353 INFO | (200, 512)
2021-05-27 16:55:17,354 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,355 INFO | (200, 512)
2021-05-27 16:55:17,362 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,362 INFO | (200, 512)
2021-05-27 16:55:17,363 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,363 INFO | (200, 512)
2021-05-27 16:55:17,369 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,370 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  35%|███▌      | 203/574 [00:22<00:38,  9.69it/s]

2021-05-27 16:55:17,440 INFO | INITIAL
2021-05-27 16:55:17,441 INFO | (50, 200)
2021-05-27 16:55:17,449 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,450 INFO | (50, 200, 512)
2021-05-27 16:55:17,451 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,451 INFO | (50, 200, 512)
2021-05-27 16:55:17,452 INFO | BERT LAYER
2021-05-27 16:55:17,452 INFO | (200, 512)
2021-05-27 16:55:17,452 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,453 INFO | (200, 512)
2021-05-27 16:55:17,453 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,453 INFO | (200, 512)
2021-05-27 16:55:17,459 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,459 INFO | (200, 512)
2021-05-27 16:55:17,460 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,460 INFO | (200, 512)
2021-05-27 16:55:17,466 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,466 INFO | (200, 512)
2021-05-27 16:55:17,466 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,467 INFO | (200, 512)
2021-05-27 16:55:17,472 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,472 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▌      | 204/574 [00:22<00:38,  9.72it/s]

2021-05-27 16:55:17,542 INFO | INITIAL
2021-05-27 16:55:17,543 INFO | (50, 200)
2021-05-27 16:55:17,549 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,550 INFO | (50, 200, 512)
2021-05-27 16:55:17,551 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,552 INFO | (50, 200, 512)
2021-05-27 16:55:17,553 INFO | BERT LAYER
2021-05-27 16:55:17,554 INFO | (200, 512)
2021-05-27 16:55:17,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,555 INFO | (200, 512)
2021-05-27 16:55:17,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,557 INFO | (200, 512)
2021-05-27 16:55:17,563 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,563 INFO | (200, 512)
2021-05-27 16:55:17,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,565 INFO | (200, 512)
2021-05-27 16:55:17,570 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,570 INFO | (200, 512)
2021-05-27 16:55:17,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,571 INFO | (200, 512)
2021-05-27 16:55:17,576 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,577 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▌      | 205/574 [00:22<00:38,  9.62it/s]

2021-05-27 16:55:17,649 INFO | INITIAL
2021-05-27 16:55:17,649 INFO | (50, 200)
2021-05-27 16:55:17,655 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,655 INFO | (50, 200, 512)
2021-05-27 16:55:17,656 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,656 INFO | (50, 200, 512)
2021-05-27 16:55:17,657 INFO | BERT LAYER
2021-05-27 16:55:17,658 INFO | (200, 512)
2021-05-27 16:55:17,658 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,658 INFO | (200, 512)
2021-05-27 16:55:17,659 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,659 INFO | (200, 512)
2021-05-27 16:55:17,666 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,666 INFO | (200, 512)
2021-05-27 16:55:17,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,667 INFO | (200, 512)
2021-05-27 16:55:17,673 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,673 INFO | (200, 512)
2021-05-27 16:55:17,674 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,674 INFO | (200, 512)
2021-05-27 16:55:17,681 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,682 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▌      | 206/574 [00:22<00:38,  9.52it/s]

2021-05-27 16:55:17,757 INFO | INITIAL
2021-05-27 16:55:17,758 INFO | (50, 200)
2021-05-27 16:55:17,766 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,766 INFO | (50, 200, 512)
2021-05-27 16:55:17,768 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,768 INFO | (50, 200, 512)
2021-05-27 16:55:17,769 INFO | BERT LAYER
2021-05-27 16:55:17,769 INFO | (200, 512)
2021-05-27 16:55:17,770 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,770 INFO | (200, 512)
2021-05-27 16:55:17,771 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,771 INFO | (200, 512)
2021-05-27 16:55:17,778 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,779 INFO | (200, 512)
2021-05-27 16:55:17,779 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,780 INFO | (200, 512)
2021-05-27 16:55:17,786 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,787 INFO | (200, 512)
2021-05-27 16:55:17,787 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,787 INFO | (200, 512)
2021-05-27 16:55:17,794 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,794 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▌      | 207/574 [00:23<00:39,  9.31it/s]

2021-05-27 16:55:17,871 INFO | INITIAL
2021-05-27 16:55:17,871 INFO | (50, 200)
2021-05-27 16:55:17,876 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,877 INFO | (50, 200, 512)
2021-05-27 16:55:17,878 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,879 INFO | (50, 200, 512)
2021-05-27 16:55:17,880 INFO | BERT LAYER
2021-05-27 16:55:17,880 INFO | (200, 512)
2021-05-27 16:55:17,881 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,881 INFO | (200, 512)
2021-05-27 16:55:17,882 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,882 INFO | (200, 512)
2021-05-27 16:55:17,889 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,889 INFO | (200, 512)
2021-05-27 16:55:17,890 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,890 INFO | (200, 512)
2021-05-27 16:55:17,896 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,897 INFO | (200, 512)
2021-05-27 16:55:17,897 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,897 INFO | (200, 512)
2021-05-27 16:55:17,903 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,903 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▌      | 208/574 [00:23<00:39,  9.34it/s]

2021-05-27 16:55:17,977 INFO | INITIAL
2021-05-27 16:55:17,977 INFO | (50, 200)
2021-05-27 16:55:17,983 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:17,983 INFO | (50, 200, 512)
2021-05-27 16:55:17,984 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:17,985 INFO | (50, 200, 512)
2021-05-27 16:55:17,985 INFO | BERT LAYER
2021-05-27 16:55:17,986 INFO | (200, 512)
2021-05-27 16:55:17,986 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,986 INFO | (200, 512)
2021-05-27 16:55:17,987 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,987 INFO | (200, 512)
2021-05-27 16:55:17,994 INFO | BERT LAYER LOOP
2021-05-27 16:55:17,994 INFO | (200, 512)
2021-05-27 16:55:17,995 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:17,995 INFO | (200, 512)
2021-05-27 16:55:18,000 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,001 INFO | (200, 512)
2021-05-27 16:55:18,001 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,002 INFO | (200, 512)
2021-05-27 16:55:18,007 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,008 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  36%|███▋      | 209/574 [00:23<00:38,  9.43it/s]

2021-05-27 16:55:18,080 INFO | INITIAL
2021-05-27 16:55:18,081 INFO | (50, 200)
2021-05-27 16:55:18,086 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,087 INFO | (50, 200, 512)
2021-05-27 16:55:18,088 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,089 INFO | (50, 200, 512)
2021-05-27 16:55:18,090 INFO | BERT LAYER
2021-05-27 16:55:18,091 INFO | (200, 512)
2021-05-27 16:55:18,091 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,092 INFO | (200, 512)
2021-05-27 16:55:18,093 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,093 INFO | (200, 512)
2021-05-27 16:55:18,099 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,100 INFO | (200, 512)
2021-05-27 16:55:18,100 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,101 INFO | (200, 512)
2021-05-27 16:55:18,107 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,108 INFO | (200, 512)
2021-05-27 16:55:18,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,109 INFO | (200, 512)
2021-05-27 16:55:18,114 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,115 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 210/574 [00:23<00:38,  9.45it/s]

2021-05-27 16:55:18,185 INFO | INITIAL
2021-05-27 16:55:18,185 INFO | (50, 200)
2021-05-27 16:55:18,191 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,192 INFO | (50, 200, 512)
2021-05-27 16:55:18,193 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,194 INFO | (50, 200, 512)
2021-05-27 16:55:18,195 INFO | BERT LAYER
2021-05-27 16:55:18,195 INFO | (200, 512)
2021-05-27 16:55:18,196 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,196 INFO | (200, 512)
2021-05-27 16:55:18,197 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,197 INFO | (200, 512)
2021-05-27 16:55:18,203 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,204 INFO | (200, 512)
2021-05-27 16:55:18,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,205 INFO | (200, 512)
2021-05-27 16:55:18,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,211 INFO | (200, 512)
2021-05-27 16:55:18,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,212 INFO | (200, 512)
2021-05-27 16:55:18,218 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,219 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 211/574 [00:23<00:38,  9.36it/s]

2021-05-27 16:55:18,296 INFO | INITIAL
2021-05-27 16:55:18,297 INFO | (50, 200)
2021-05-27 16:55:18,305 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,305 INFO | (50, 200, 512)
2021-05-27 16:55:18,307 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,307 INFO | (50, 200, 512)
2021-05-27 16:55:18,308 INFO | BERT LAYER
2021-05-27 16:55:18,309 INFO | (200, 512)
2021-05-27 16:55:18,309 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,310 INFO | (200, 512)
2021-05-27 16:55:18,310 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,311 INFO | (200, 512)
2021-05-27 16:55:18,317 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,317 INFO | (200, 512)
2021-05-27 16:55:18,318 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,318 INFO | (200, 512)
2021-05-27 16:55:18,324 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,325 INFO | (200, 512)
2021-05-27 16:55:18,326 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,327 INFO | (200, 512)
2021-05-27 16:55:18,333 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,333 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 212/574 [00:23<00:39,  9.07it/s]

2021-05-27 16:55:18,413 INFO | INITIAL
2021-05-27 16:55:18,414 INFO | (50, 200)
2021-05-27 16:55:18,420 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,420 INFO | (50, 200, 512)
2021-05-27 16:55:18,422 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,423 INFO | (50, 200, 512)
2021-05-27 16:55:18,423 INFO | BERT LAYER
2021-05-27 16:55:18,424 INFO | (200, 512)
2021-05-27 16:55:18,424 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,425 INFO | (200, 512)
2021-05-27 16:55:18,425 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,426 INFO | (200, 512)
2021-05-27 16:55:18,432 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,432 INFO | (200, 512)
2021-05-27 16:55:18,433 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,433 INFO | (200, 512)
2021-05-27 16:55:18,438 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,439 INFO | (200, 512)
2021-05-27 16:55:18,439 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,440 INFO | (200, 512)
2021-05-27 16:55:18,446 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,447 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 213/574 [00:23<00:39,  9.12it/s]

2021-05-27 16:55:18,521 INFO | INITIAL
2021-05-27 16:55:18,521 INFO | (50, 200)
2021-05-27 16:55:18,527 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,528 INFO | (50, 200, 512)
2021-05-27 16:55:18,529 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,530 INFO | (50, 200, 512)
2021-05-27 16:55:18,530 INFO | BERT LAYER
2021-05-27 16:55:18,531 INFO | (200, 512)
2021-05-27 16:55:18,531 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,531 INFO | (200, 512)
2021-05-27 16:55:18,532 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,532 INFO | (200, 512)
2021-05-27 16:55:18,538 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,538 INFO | (200, 512)
2021-05-27 16:55:18,538 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,539 INFO | (200, 512)
2021-05-27 16:55:18,544 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,545 INFO | (200, 512)
2021-05-27 16:55:18,545 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,546 INFO | (200, 512)
2021-05-27 16:55:18,551 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,552 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 214/574 [00:23<00:39,  9.19it/s]

2021-05-27 16:55:18,628 INFO | INITIAL
2021-05-27 16:55:18,629 INFO | (50, 200)
2021-05-27 16:55:18,634 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,634 INFO | (50, 200, 512)
2021-05-27 16:55:18,636 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,636 INFO | (50, 200, 512)
2021-05-27 16:55:18,637 INFO | BERT LAYER
2021-05-27 16:55:18,637 INFO | (200, 512)
2021-05-27 16:55:18,638 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,638 INFO | (200, 512)
2021-05-27 16:55:18,639 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,639 INFO | (200, 512)
2021-05-27 16:55:18,646 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,646 INFO | (200, 512)
2021-05-27 16:55:18,650 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,651 INFO | (200, 512)
2021-05-27 16:55:18,658 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,658 INFO | (200, 512)
2021-05-27 16:55:18,659 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,659 INFO | (200, 512)
2021-05-27 16:55:18,665 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,665 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  37%|███▋      | 215/574 [00:23<00:39,  9.16it/s]

2021-05-27 16:55:18,737 INFO | INITIAL
2021-05-27 16:55:18,738 INFO | (50, 200)
2021-05-27 16:55:18,744 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,744 INFO | (50, 200, 512)
2021-05-27 16:55:18,746 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,746 INFO | (50, 200, 512)
2021-05-27 16:55:18,747 INFO | BERT LAYER
2021-05-27 16:55:18,748 INFO | (200, 512)
2021-05-27 16:55:18,748 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,748 INFO | (200, 512)
2021-05-27 16:55:18,749 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,749 INFO | (200, 512)
2021-05-27 16:55:18,756 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,756 INFO | (200, 512)
2021-05-27 16:55:18,757 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,758 INFO | (200, 512)
2021-05-27 16:55:18,764 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,764 INFO | (200, 512)
2021-05-27 16:55:18,765 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,766 INFO | (200, 512)
2021-05-27 16:55:18,771 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,771 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  38%|███▊      | 216/574 [00:24<00:38,  9.23it/s]

2021-05-27 16:55:18,844 INFO | INITIAL
2021-05-27 16:55:18,845 INFO | (50, 200)
2021-05-27 16:55:18,850 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,851 INFO | (50, 200, 512)
2021-05-27 16:55:18,852 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,852 INFO | (50, 200, 512)
2021-05-27 16:55:18,853 INFO | BERT LAYER
2021-05-27 16:55:18,853 INFO | (200, 512)
2021-05-27 16:55:18,854 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,854 INFO | (200, 512)
2021-05-27 16:55:18,854 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,855 INFO | (200, 512)
2021-05-27 16:55:18,861 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,861 INFO | (200, 512)
2021-05-27 16:55:18,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,862 INFO | (200, 512)
2021-05-27 16:55:18,867 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,868 INFO | (200, 512)
2021-05-27 16:55:18,869 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,869 INFO | (200, 512)
2021-05-27 16:55:18,874 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,875 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  38%|███▊      | 217/574 [00:24<00:38,  9.33it/s]

2021-05-27 16:55:18,949 INFO | INITIAL
2021-05-27 16:55:18,949 INFO | (50, 200)
2021-05-27 16:55:18,956 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:18,956 INFO | (50, 200, 512)
2021-05-27 16:55:18,958 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:18,959 INFO | (50, 200, 512)
2021-05-27 16:55:18,960 INFO | BERT LAYER
2021-05-27 16:55:18,961 INFO | (200, 512)
2021-05-27 16:55:18,961 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,961 INFO | (200, 512)
2021-05-27 16:55:18,962 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,962 INFO | (200, 512)
2021-05-27 16:55:18,968 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,968 INFO | (200, 512)
2021-05-27 16:55:18,968 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,969 INFO | (200, 512)
2021-05-27 16:55:18,974 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,974 INFO | (200, 512)
2021-05-27 16:55:18,974 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:18,975 INFO | (200, 512)
2021-05-27 16:55:18,980 INFO | BERT LAYER LOOP
2021-05-27 16:55:18,981 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  38%|███▊      | 218/574 [00:24<00:37,  9.43it/s]

2021-05-27 16:55:19,052 INFO | INITIAL
2021-05-27 16:55:19,054 INFO | (50, 200)
2021-05-27 16:55:19,060 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,060 INFO | (50, 200, 512)
2021-05-27 16:55:19,062 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,062 INFO | (50, 200, 512)
2021-05-27 16:55:19,063 INFO | BERT LAYER
2021-05-27 16:55:19,063 INFO | (200, 512)
2021-05-27 16:55:19,064 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,064 INFO | (200, 512)
2021-05-27 16:55:19,065 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,065 INFO | (200, 512)
2021-05-27 16:55:19,070 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,071 INFO | (200, 512)
2021-05-27 16:55:19,071 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,072 INFO | (200, 512)
2021-05-27 16:55:19,077 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,078 INFO | (200, 512)
2021-05-27 16:55:19,078 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,078 INFO | (200, 512)
2021-05-27 16:55:19,085 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,085 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  38%|███▊      | 219/574 [00:24<00:37,  9.42it/s]

2021-05-27 16:55:19,159 INFO | INITIAL
2021-05-27 16:55:19,160 INFO | (50, 200)
2021-05-27 16:55:19,167 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,167 INFO | (50, 200, 512)
2021-05-27 16:55:19,168 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,169 INFO | (50, 200, 512)
2021-05-27 16:55:19,170 INFO | BERT LAYER
2021-05-27 16:55:19,170 INFO | (200, 512)
2021-05-27 16:55:19,171 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,171 INFO | (200, 512)
2021-05-27 16:55:19,171 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,172 INFO | (200, 512)
2021-05-27 16:55:19,179 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,180 INFO | (200, 512)
2021-05-27 16:55:19,180 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,180 INFO | (200, 512)
2021-05-27 16:55:19,187 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,187 INFO | (200, 512)
2021-05-27 16:55:19,188 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,188 INFO | (200, 512)
2021-05-27 16:55:19,195 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,196 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  38%|███▊      | 220/574 [00:24<00:37,  9.46it/s]

2021-05-27 16:55:19,263 INFO | INITIAL
2021-05-27 16:55:19,264 INFO | (50, 200)
2021-05-27 16:55:19,269 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,270 INFO | (50, 200, 512)
2021-05-27 16:55:19,271 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,271 INFO | (50, 200, 512)
2021-05-27 16:55:19,272 INFO | BERT LAYER
2021-05-27 16:55:19,273 INFO | (200, 512)
2021-05-27 16:55:19,273 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,273 INFO | (200, 512)
2021-05-27 16:55:19,274 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,274 INFO | (200, 512)
2021-05-27 16:55:19,281 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,282 INFO | (200, 512)
2021-05-27 16:55:19,282 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,282 INFO | (200, 512)
2021-05-27 16:55:19,290 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,290 INFO | (200, 512)
2021-05-27 16:55:19,291 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,291 INFO | (200, 512)
2021-05-27 16:55:19,298 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,298 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▊      | 221/574 [00:24<00:37,  9.44it/s]

2021-05-27 16:55:19,370 INFO | INITIAL
2021-05-27 16:55:19,370 INFO | (50, 200)
2021-05-27 16:55:19,376 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,377 INFO | (50, 200, 512)
2021-05-27 16:55:19,378 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,378 INFO | (50, 200, 512)
2021-05-27 16:55:19,379 INFO | BERT LAYER
2021-05-27 16:55:19,379 INFO | (200, 512)
2021-05-27 16:55:19,380 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,380 INFO | (200, 512)
2021-05-27 16:55:19,381 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,381 INFO | (200, 512)
2021-05-27 16:55:19,386 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,387 INFO | (200, 512)
2021-05-27 16:55:19,387 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,387 INFO | (200, 512)
2021-05-27 16:55:19,392 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,393 INFO | (200, 512)
2021-05-27 16:55:19,393 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,393 INFO | (200, 512)
2021-05-27 16:55:19,398 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,399 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▊      | 221/574 [00:24<00:37,  9.44it/s]

2021-05-27 16:55:19,469 INFO | INITIAL
2021-05-27 16:55:19,469 INFO | (50, 200)
2021-05-27 16:55:19,475 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,476 INFO | (50, 200, 512)
2021-05-27 16:55:19,477 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,478 INFO | (50, 200, 512)
2021-05-27 16:55:19,479 INFO | BERT LAYER
2021-05-27 16:55:19,479 INFO | (200, 512)
2021-05-27 16:55:19,480 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,480 INFO | (200, 512)
2021-05-27 16:55:19,481 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,481 INFO | (200, 512)
2021-05-27 16:55:19,487 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,487 INFO | (200, 512)
2021-05-27 16:55:19,488 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,488 INFO | (200, 512)
2021-05-27 16:55:19,495 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,496 INFO | (200, 512)
2021-05-27 16:55:19,496 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,497 INFO | (200, 512)
2021-05-27 16:55:19,502 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,503 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▉      | 223/574 [00:24<00:36,  9.51it/s]

2021-05-27 16:55:19,578 INFO | INITIAL
2021-05-27 16:55:19,578 INFO | (50, 200)
2021-05-27 16:55:19,584 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,584 INFO | (50, 200, 512)
2021-05-27 16:55:19,587 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,587 INFO | (50, 200, 512)
2021-05-27 16:55:19,588 INFO | BERT LAYER
2021-05-27 16:55:19,589 INFO | (200, 512)
2021-05-27 16:55:19,589 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,589 INFO | (200, 512)
2021-05-27 16:55:19,590 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,590 INFO | (200, 512)
2021-05-27 16:55:19,597 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,598 INFO | (200, 512)
2021-05-27 16:55:19,598 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,598 INFO | (200, 512)
2021-05-27 16:55:19,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,605 INFO | (200, 512)
2021-05-27 16:55:19,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,606 INFO | (200, 512)
2021-05-27 16:55:19,614 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,614 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▉      | 224/574 [00:24<00:37,  9.40it/s]

2021-05-27 16:55:19,689 INFO | INITIAL
2021-05-27 16:55:19,689 INFO | (50, 200)
2021-05-27 16:55:19,697 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,697 INFO | (50, 200, 512)
2021-05-27 16:55:19,698 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,699 INFO | (50, 200, 512)
2021-05-27 16:55:19,700 INFO | BERT LAYER
2021-05-27 16:55:19,700 INFO | (200, 512)
2021-05-27 16:55:19,700 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,701 INFO | (200, 512)
2021-05-27 16:55:19,701 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,701 INFO | (200, 512)
2021-05-27 16:55:19,707 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,707 INFO | (200, 512)
2021-05-27 16:55:19,708 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,708 INFO | (200, 512)
2021-05-27 16:55:19,716 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,717 INFO | (200, 512)
2021-05-27 16:55:19,718 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,718 INFO | (200, 512)
2021-05-27 16:55:19,725 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,726 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▉      | 225/574 [00:24<00:37,  9.32it/s]

2021-05-27 16:55:19,798 INFO | INITIAL
2021-05-27 16:55:19,799 INFO | (50, 200)
2021-05-27 16:55:19,803 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,804 INFO | (50, 200, 512)
2021-05-27 16:55:19,805 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,806 INFO | (50, 200, 512)
2021-05-27 16:55:19,806 INFO | BERT LAYER
2021-05-27 16:55:19,807 INFO | (200, 512)
2021-05-27 16:55:19,807 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,808 INFO | (200, 512)
2021-05-27 16:55:19,809 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,809 INFO | (200, 512)
2021-05-27 16:55:19,814 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,815 INFO | (200, 512)
2021-05-27 16:55:19,815 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,816 INFO | (200, 512)
2021-05-27 16:55:19,822 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,824 INFO | (200, 512)
2021-05-27 16:55:19,825 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,826 INFO | (200, 512)
2021-05-27 16:55:19,832 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,832 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  39%|███▉      | 226/574 [00:25<00:37,  9.37it/s]

2021-05-27 16:55:19,903 INFO | INITIAL
2021-05-27 16:55:19,904 INFO | (50, 200)
2021-05-27 16:55:19,909 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:19,910 INFO | (50, 200, 512)
2021-05-27 16:55:19,911 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:19,912 INFO | (50, 200, 512)
2021-05-27 16:55:19,912 INFO | BERT LAYER
2021-05-27 16:55:19,913 INFO | (200, 512)
2021-05-27 16:55:19,913 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,913 INFO | (200, 512)
2021-05-27 16:55:19,914 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,915 INFO | (200, 512)
2021-05-27 16:55:19,920 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,920 INFO | (200, 512)
2021-05-27 16:55:19,921 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,921 INFO | (200, 512)
2021-05-27 16:55:19,928 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,929 INFO | (200, 512)
2021-05-27 16:55:19,929 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:19,930 INFO | (200, 512)
2021-05-27 16:55:19,935 INFO | BERT LAYER LOOP
2021-05-27 16:55:19,935 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|███▉      | 227/574 [00:25<00:36,  9.38it/s]

2021-05-27 16:55:20,010 INFO | INITIAL
2021-05-27 16:55:20,010 INFO | (50, 200)
2021-05-27 16:55:20,015 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,016 INFO | (50, 200, 512)
2021-05-27 16:55:20,017 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,017 INFO | (50, 200, 512)
2021-05-27 16:55:20,018 INFO | BERT LAYER
2021-05-27 16:55:20,019 INFO | (200, 512)
2021-05-27 16:55:20,019 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,019 INFO | (200, 512)
2021-05-27 16:55:20,020 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,021 INFO | (200, 512)
2021-05-27 16:55:20,029 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,029 INFO | (200, 512)
2021-05-27 16:55:20,031 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,032 INFO | (200, 512)
2021-05-27 16:55:20,038 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,039 INFO | (200, 512)
2021-05-27 16:55:20,039 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,040 INFO | (200, 512)
2021-05-27 16:55:20,047 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,048 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|███▉      | 228/574 [00:25<00:36,  9.40it/s]

2021-05-27 16:55:20,115 INFO | INITIAL
2021-05-27 16:55:20,116 INFO | (50, 200)
2021-05-27 16:55:20,122 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,122 INFO | (50, 200, 512)
2021-05-27 16:55:20,124 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,125 INFO | (50, 200, 512)
2021-05-27 16:55:20,126 INFO | BERT LAYER
2021-05-27 16:55:20,127 INFO | (200, 512)
2021-05-27 16:55:20,127 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,127 INFO | (200, 512)
2021-05-27 16:55:20,128 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,128 INFO | (200, 512)
2021-05-27 16:55:20,134 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,134 INFO | (200, 512)
2021-05-27 16:55:20,135 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,135 INFO | (200, 512)
2021-05-27 16:55:20,140 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,140 INFO | (200, 512)
2021-05-27 16:55:20,141 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,141 INFO | (200, 512)
2021-05-27 16:55:20,147 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,148 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|███▉      | 229/574 [00:25<00:36,  9.50it/s]

2021-05-27 16:55:20,219 INFO | INITIAL
2021-05-27 16:55:20,220 INFO | (50, 200)
2021-05-27 16:55:20,228 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,229 INFO | (50, 200, 512)
2021-05-27 16:55:20,230 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,231 INFO | (50, 200, 512)
2021-05-27 16:55:20,232 INFO | BERT LAYER
2021-05-27 16:55:20,232 INFO | (200, 512)
2021-05-27 16:55:20,232 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,233 INFO | (200, 512)
2021-05-27 16:55:20,233 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,233 INFO | (200, 512)
2021-05-27 16:55:20,238 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,239 INFO | (200, 512)
2021-05-27 16:55:20,239 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,240 INFO | (200, 512)
2021-05-27 16:55:20,245 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,246 INFO | (200, 512)
2021-05-27 16:55:20,246 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,246 INFO | (200, 512)
2021-05-27 16:55:20,252 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,252 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|████      | 230/574 [00:25<00:36,  9.45it/s]

2021-05-27 16:55:20,326 INFO | INITIAL
2021-05-27 16:55:20,327 INFO | (50, 200)
2021-05-27 16:55:20,333 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,333 INFO | (50, 200, 512)
2021-05-27 16:55:20,334 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,335 INFO | (50, 200, 512)
2021-05-27 16:55:20,336 INFO | BERT LAYER
2021-05-27 16:55:20,336 INFO | (200, 512)
2021-05-27 16:55:20,336 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,337 INFO | (200, 512)
2021-05-27 16:55:20,338 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,338 INFO | (200, 512)
2021-05-27 16:55:20,344 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,344 INFO | (200, 512)
2021-05-27 16:55:20,345 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,345 INFO | (200, 512)
2021-05-27 16:55:20,351 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,351 INFO | (200, 512)
2021-05-27 16:55:20,352 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,352 INFO | (200, 512)
2021-05-27 16:55:20,360 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,360 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|████      | 231/574 [00:25<00:36,  9.42it/s]

2021-05-27 16:55:20,432 INFO | INITIAL
2021-05-27 16:55:20,433 INFO | (50, 200)
2021-05-27 16:55:20,437 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,438 INFO | (50, 200, 512)
2021-05-27 16:55:20,439 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,439 INFO | (50, 200, 512)
2021-05-27 16:55:20,440 INFO | BERT LAYER
2021-05-27 16:55:20,441 INFO | (200, 512)
2021-05-27 16:55:20,441 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,442 INFO | (200, 512)
2021-05-27 16:55:20,444 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,444 INFO | (200, 512)
2021-05-27 16:55:20,451 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,451 INFO | (200, 512)
2021-05-27 16:55:20,452 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,453 INFO | (200, 512)
2021-05-27 16:55:20,461 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,461 INFO | (200, 512)
2021-05-27 16:55:20,462 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,463 INFO | (200, 512)
2021-05-27 16:55:20,469 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,469 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  40%|████      | 232/574 [00:25<00:37,  9.23it/s]

2021-05-27 16:55:20,546 INFO | INITIAL
2021-05-27 16:55:20,546 INFO | (50, 200)
2021-05-27 16:55:20,551 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,552 INFO | (50, 200, 512)
2021-05-27 16:55:20,553 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,559 INFO | (50, 200, 512)
2021-05-27 16:55:20,560 INFO | BERT LAYER
2021-05-27 16:55:20,561 INFO | (200, 512)
2021-05-27 16:55:20,561 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,562 INFO | (200, 512)
2021-05-27 16:55:20,563 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,563 INFO | (200, 512)
2021-05-27 16:55:20,569 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,569 INFO | (200, 512)
2021-05-27 16:55:20,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,570 INFO | (200, 512)
2021-05-27 16:55:20,576 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,577 INFO | (200, 512)
2021-05-27 16:55:20,577 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,578 INFO | (200, 512)
2021-05-27 16:55:20,584 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,585 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████      | 233/574 [00:25<00:37,  9.02it/s]

2021-05-27 16:55:20,662 INFO | INITIAL
2021-05-27 16:55:20,663 INFO | (50, 200)
2021-05-27 16:55:20,668 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,668 INFO | (50, 200, 512)
2021-05-27 16:55:20,669 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,670 INFO | (50, 200, 512)
2021-05-27 16:55:20,671 INFO | BERT LAYER
2021-05-27 16:55:20,671 INFO | (200, 512)
2021-05-27 16:55:20,672 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,672 INFO | (200, 512)
2021-05-27 16:55:20,673 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,673 INFO | (200, 512)
2021-05-27 16:55:20,680 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,680 INFO | (200, 512)
2021-05-27 16:55:20,681 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,681 INFO | (200, 512)
2021-05-27 16:55:20,686 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,687 INFO | (200, 512)
2021-05-27 16:55:20,688 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,688 INFO | (200, 512)
2021-05-27 16:55:20,694 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,695 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████      | 234/574 [00:25<00:36,  9.25it/s]

2021-05-27 16:55:20,764 INFO | INITIAL
2021-05-27 16:55:20,765 INFO | (50, 200)
2021-05-27 16:55:20,771 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,771 INFO | (50, 200, 512)
2021-05-27 16:55:20,773 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,773 INFO | (50, 200, 512)
2021-05-27 16:55:20,774 INFO | BERT LAYER
2021-05-27 16:55:20,775 INFO | (200, 512)
2021-05-27 16:55:20,776 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,776 INFO | (200, 512)
2021-05-27 16:55:20,776 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,777 INFO | (200, 512)
2021-05-27 16:55:20,783 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,783 INFO | (200, 512)
2021-05-27 16:55:20,783 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,784 INFO | (200, 512)
2021-05-27 16:55:20,790 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,790 INFO | (200, 512)
2021-05-27 16:55:20,790 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,791 INFO | (200, 512)
2021-05-27 16:55:20,797 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,798 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████      | 235/574 [00:26<00:36,  9.33it/s]

2021-05-27 16:55:20,869 INFO | INITIAL
2021-05-27 16:55:20,870 INFO | (50, 200)
2021-05-27 16:55:20,876 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,876 INFO | (50, 200, 512)
2021-05-27 16:55:20,877 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,878 INFO | (50, 200, 512)
2021-05-27 16:55:20,879 INFO | BERT LAYER
2021-05-27 16:55:20,879 INFO | (200, 512)
2021-05-27 16:55:20,880 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,880 INFO | (200, 512)
2021-05-27 16:55:20,880 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,881 INFO | (200, 512)
2021-05-27 16:55:20,886 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,886 INFO | (200, 512)
2021-05-27 16:55:20,887 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,887 INFO | (200, 512)
2021-05-27 16:55:20,894 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,894 INFO | (200, 512)
2021-05-27 16:55:20,895 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,895 INFO | (200, 512)
2021-05-27 16:55:20,901 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,901 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████      | 236/574 [00:26<00:35,  9.45it/s]

2021-05-27 16:55:20,972 INFO | INITIAL
2021-05-27 16:55:20,972 INFO | (50, 200)
2021-05-27 16:55:20,980 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:20,980 INFO | (50, 200, 512)
2021-05-27 16:55:20,982 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:20,983 INFO | (50, 200, 512)
2021-05-27 16:55:20,984 INFO | BERT LAYER
2021-05-27 16:55:20,984 INFO | (200, 512)
2021-05-27 16:55:20,985 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,985 INFO | (200, 512)
2021-05-27 16:55:20,985 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,986 INFO | (200, 512)
2021-05-27 16:55:20,993 INFO | BERT LAYER LOOP
2021-05-27 16:55:20,993 INFO | (200, 512)
2021-05-27 16:55:20,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:20,994 INFO | (200, 512)
2021-05-27 16:55:20,999 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,000 INFO | (200, 512)
2021-05-27 16:55:21,000 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,001 INFO | (200, 512)
2021-05-27 16:55:21,005 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,005 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████▏     | 237/574 [00:26<00:35,  9.54it/s]

2021-05-27 16:55:21,074 INFO | INITIAL
2021-05-27 16:55:21,075 INFO | (50, 200)
2021-05-27 16:55:21,081 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,081 INFO | (50, 200, 512)
2021-05-27 16:55:21,083 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,083 INFO | (50, 200, 512)
2021-05-27 16:55:21,084 INFO | BERT LAYER
2021-05-27 16:55:21,084 INFO | (200, 512)
2021-05-27 16:55:21,085 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,085 INFO | (200, 512)
2021-05-27 16:55:21,086 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,086 INFO | (200, 512)
2021-05-27 16:55:21,093 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,094 INFO | (200, 512)
2021-05-27 16:55:21,094 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,094 INFO | (200, 512)
2021-05-27 16:55:21,101 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,101 INFO | (200, 512)
2021-05-27 16:55:21,102 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,102 INFO | (200, 512)
2021-05-27 16:55:21,109 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,109 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  41%|████▏     | 238/574 [00:26<00:35,  9.50it/s]

2021-05-27 16:55:21,181 INFO | INITIAL
2021-05-27 16:55:21,182 INFO | (50, 200)
2021-05-27 16:55:21,189 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,189 INFO | (50, 200, 512)
2021-05-27 16:55:21,190 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,191 INFO | (50, 200, 512)
2021-05-27 16:55:21,191 INFO | BERT LAYER
2021-05-27 16:55:21,192 INFO | (200, 512)
2021-05-27 16:55:21,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,193 INFO | (200, 512)
2021-05-27 16:55:21,194 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,194 INFO | (200, 512)
2021-05-27 16:55:21,199 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,199 INFO | (200, 512)
2021-05-27 16:55:21,200 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,200 INFO | (200, 512)
2021-05-27 16:55:21,205 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,206 INFO | (200, 512)
2021-05-27 16:55:21,206 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,207 INFO | (200, 512)
2021-05-27 16:55:21,213 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,214 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  42%|████▏     | 239/574 [00:26<00:34,  9.58it/s]

2021-05-27 16:55:21,283 INFO | INITIAL
2021-05-27 16:55:21,284 INFO | (50, 200)
2021-05-27 16:55:21,289 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,290 INFO | (50, 200, 512)
2021-05-27 16:55:21,292 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,292 INFO | (50, 200, 512)
2021-05-27 16:55:21,294 INFO | BERT LAYER
2021-05-27 16:55:21,294 INFO | (200, 512)
2021-05-27 16:55:21,295 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,295 INFO | (200, 512)
2021-05-27 16:55:21,296 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,297 INFO | (200, 512)
2021-05-27 16:55:21,304 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,304 INFO | (200, 512)
2021-05-27 16:55:21,305 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,305 INFO | (200, 512)
2021-05-27 16:55:21,312 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,312 INFO | (200, 512)
2021-05-27 16:55:21,313 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,313 INFO | (200, 512)
2021-05-27 16:55:21,320 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,320 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  42%|████▏     | 240/574 [00:26<00:35,  9.42it/s]

2021-05-27 16:55:21,393 INFO | INITIAL
2021-05-27 16:55:21,394 INFO | (50, 200)
2021-05-27 16:55:21,399 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,399 INFO | (50, 200, 512)
2021-05-27 16:55:21,400 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,401 INFO | (50, 200, 512)
2021-05-27 16:55:21,402 INFO | BERT LAYER
2021-05-27 16:55:21,402 INFO | (200, 512)
2021-05-27 16:55:21,402 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,402 INFO | (200, 512)
2021-05-27 16:55:21,403 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,403 INFO | (200, 512)
2021-05-27 16:55:21,409 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,409 INFO | (200, 512)
2021-05-27 16:55:21,410 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,410 INFO | (200, 512)
2021-05-27 16:55:21,415 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,416 INFO | (200, 512)
2021-05-27 16:55:21,416 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,417 INFO | (200, 512)
2021-05-27 16:55:21,423 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,424 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  42%|████▏     | 241/574 [00:26<00:34,  9.55it/s]

2021-05-27 16:55:21,494 INFO | INITIAL
2021-05-27 16:55:21,495 INFO | (50, 200)
2021-05-27 16:55:21,502 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,503 INFO | (50, 200, 512)
2021-05-27 16:55:21,504 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,505 INFO | (50, 200, 512)
2021-05-27 16:55:21,505 INFO | BERT LAYER
2021-05-27 16:55:21,506 INFO | (200, 512)
2021-05-27 16:55:21,506 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,506 INFO | (200, 512)
2021-05-27 16:55:21,507 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,507 INFO | (200, 512)
2021-05-27 16:55:21,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,514 INFO | (200, 512)
2021-05-27 16:55:21,514 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,515 INFO | (200, 512)
2021-05-27 16:55:21,519 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,520 INFO | (200, 512)
2021-05-27 16:55:21,520 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,521 INFO | (200, 512)
2021-05-27 16:55:21,526 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,527 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  42%|████▏     | 241/574 [00:26<00:34,  9.55it/s]

2021-05-27 16:55:21,592 INFO | INITIAL
2021-05-27 16:55:21,593 INFO | (50, 200)
2021-05-27 16:55:21,598 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,599 INFO | (50, 200, 512)
2021-05-27 16:55:21,600 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,600 INFO | (50, 200, 512)
2021-05-27 16:55:21,601 INFO | BERT LAYER
2021-05-27 16:55:21,601 INFO | (200, 512)
2021-05-27 16:55:21,602 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,602 INFO | (200, 512)
2021-05-27 16:55:21,602 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,602 INFO | (200, 512)
2021-05-27 16:55:21,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,608 INFO | (200, 512)
2021-05-27 16:55:21,608 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,609 INFO | (200, 512)
2021-05-27 16:55:21,616 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,616 INFO | (200, 512)
2021-05-27 16:55:21,617 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,617 INFO | (200, 512)
2021-05-27 16:55:21,625 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,626 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  42%|████▏     | 243/574 [00:26<00:34,  9.69it/s]

2021-05-27 16:55:21,697 INFO | INITIAL
2021-05-27 16:55:21,698 INFO | (50, 200)
2021-05-27 16:55:21,702 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,703 INFO | (50, 200, 512)
2021-05-27 16:55:21,704 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,705 INFO | (50, 200, 512)
2021-05-27 16:55:21,706 INFO | BERT LAYER
2021-05-27 16:55:21,706 INFO | (200, 512)
2021-05-27 16:55:21,706 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,707 INFO | (200, 512)
2021-05-27 16:55:21,708 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,708 INFO | (200, 512)
2021-05-27 16:55:21,716 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,717 INFO | (200, 512)
2021-05-27 16:55:21,717 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,718 INFO | (200, 512)
2021-05-27 16:55:21,723 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,724 INFO | (200, 512)
2021-05-27 16:55:21,724 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,725 INFO | (200, 512)
2021-05-27 16:55:21,730 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,731 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 244/574 [00:26<00:33,  9.71it/s]

2021-05-27 16:55:21,800 INFO | INITIAL
2021-05-27 16:55:21,800 INFO | (50, 200)
2021-05-27 16:55:21,805 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,805 INFO | (50, 200, 512)
2021-05-27 16:55:21,807 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,807 INFO | (50, 200, 512)
2021-05-27 16:55:21,808 INFO | BERT LAYER
2021-05-27 16:55:21,808 INFO | (200, 512)
2021-05-27 16:55:21,809 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,809 INFO | (200, 512)
2021-05-27 16:55:21,810 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,810 INFO | (200, 512)
2021-05-27 16:55:21,815 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,816 INFO | (200, 512)
2021-05-27 16:55:21,816 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,816 INFO | (200, 512)
2021-05-27 16:55:21,822 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,823 INFO | (200, 512)
2021-05-27 16:55:21,823 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,825 INFO | (200, 512)
2021-05-27 16:55:21,832 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,832 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 245/574 [00:27<00:34,  9.46it/s]

2021-05-27 16:55:21,913 INFO | INITIAL
2021-05-27 16:55:21,914 INFO | (50, 200)
2021-05-27 16:55:21,918 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:21,919 INFO | (50, 200, 512)
2021-05-27 16:55:21,920 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:21,920 INFO | (50, 200, 512)
2021-05-27 16:55:21,921 INFO | BERT LAYER
2021-05-27 16:55:21,921 INFO | (200, 512)
2021-05-27 16:55:21,922 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,922 INFO | (200, 512)
2021-05-27 16:55:21,922 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,922 INFO | (200, 512)
2021-05-27 16:55:21,929 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,929 INFO | (200, 512)
2021-05-27 16:55:21,930 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,930 INFO | (200, 512)
2021-05-27 16:55:21,937 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,938 INFO | (200, 512)
2021-05-27 16:55:21,938 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:21,938 INFO | (200, 512)
2021-05-27 16:55:21,945 INFO | BERT LAYER LOOP
2021-05-27 16:55:21,945 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 246/574 [00:27<00:34,  9.53it/s]

2021-05-27 16:55:22,016 INFO | INITIAL
2021-05-27 16:55:22,016 INFO | (50, 200)
2021-05-27 16:55:22,022 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,022 INFO | (50, 200, 512)
2021-05-27 16:55:22,024 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,024 INFO | (50, 200, 512)
2021-05-27 16:55:22,026 INFO | BERT LAYER
2021-05-27 16:55:22,026 INFO | (200, 512)
2021-05-27 16:55:22,027 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,027 INFO | (200, 512)
2021-05-27 16:55:22,028 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,028 INFO | (200, 512)
2021-05-27 16:55:22,034 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,034 INFO | (200, 512)
2021-05-27 16:55:22,035 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,035 INFO | (200, 512)
2021-05-27 16:55:22,042 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,043 INFO | (200, 512)
2021-05-27 16:55:22,043 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,043 INFO | (200, 512)
2021-05-27 16:55:22,049 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,050 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 247/574 [00:27<00:34,  9.55it/s]

2021-05-27 16:55:22,120 INFO | INITIAL
2021-05-27 16:55:22,120 INFO | (50, 200)
2021-05-27 16:55:22,126 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,126 INFO | (50, 200, 512)
2021-05-27 16:55:22,127 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,128 INFO | (50, 200, 512)
2021-05-27 16:55:22,129 INFO | BERT LAYER
2021-05-27 16:55:22,129 INFO | (200, 512)
2021-05-27 16:55:22,129 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,130 INFO | (200, 512)
2021-05-27 16:55:22,130 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,130 INFO | (200, 512)
2021-05-27 16:55:22,137 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,138 INFO | (200, 512)
2021-05-27 16:55:22,138 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,139 INFO | (200, 512)
2021-05-27 16:55:22,145 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,145 INFO | (200, 512)
2021-05-27 16:55:22,146 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,146 INFO | (200, 512)
2021-05-27 16:55:22,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,152 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 248/574 [00:27<00:34,  9.55it/s]

2021-05-27 16:55:22,225 INFO | INITIAL
2021-05-27 16:55:22,225 INFO | (50, 200)
2021-05-27 16:55:22,232 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,232 INFO | (50, 200, 512)
2021-05-27 16:55:22,233 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,234 INFO | (50, 200, 512)
2021-05-27 16:55:22,235 INFO | BERT LAYER
2021-05-27 16:55:22,235 INFO | (200, 512)
2021-05-27 16:55:22,235 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,236 INFO | (200, 512)
2021-05-27 16:55:22,236 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,237 INFO | (200, 512)
2021-05-27 16:55:22,243 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,244 INFO | (200, 512)
2021-05-27 16:55:22,244 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,245 INFO | (200, 512)
2021-05-27 16:55:22,253 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,253 INFO | (200, 512)
2021-05-27 16:55:22,254 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,254 INFO | (200, 512)
2021-05-27 16:55:22,260 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,261 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 249/574 [00:27<00:33,  9.56it/s]

2021-05-27 16:55:22,329 INFO | INITIAL
2021-05-27 16:55:22,329 INFO | (50, 200)
2021-05-27 16:55:22,334 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,334 INFO | (50, 200, 512)
2021-05-27 16:55:22,335 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,335 INFO | (50, 200, 512)
2021-05-27 16:55:22,336 INFO | BERT LAYER
2021-05-27 16:55:22,336 INFO | (200, 512)
2021-05-27 16:55:22,337 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,337 INFO | (200, 512)
2021-05-27 16:55:22,338 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,338 INFO | (200, 512)
2021-05-27 16:55:22,344 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,344 INFO | (200, 512)
2021-05-27 16:55:22,345 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,345 INFO | (200, 512)
2021-05-27 16:55:22,353 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,353 INFO | (200, 512)
2021-05-27 16:55:22,354 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,354 INFO | (200, 512)
2021-05-27 16:55:22,360 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,361 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  43%|████▎     | 249/574 [00:27<00:33,  9.56it/s]

2021-05-27 16:55:22,427 INFO | INITIAL
2021-05-27 16:55:22,428 INFO | (50, 200)
2021-05-27 16:55:22,434 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,434 INFO | (50, 200, 512)
2021-05-27 16:55:22,436 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,436 INFO | (50, 200, 512)
2021-05-27 16:55:22,437 INFO | BERT LAYER
2021-05-27 16:55:22,437 INFO | (200, 512)
2021-05-27 16:55:22,438 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,438 INFO | (200, 512)
2021-05-27 16:55:22,439 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,439 INFO | (200, 512)
2021-05-27 16:55:22,445 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,446 INFO | (200, 512)
2021-05-27 16:55:22,446 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,447 INFO | (200, 512)
2021-05-27 16:55:22,452 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,452 INFO | (200, 512)
2021-05-27 16:55:22,453 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,453 INFO | (200, 512)
2021-05-27 16:55:22,460 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,461 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  44%|████▎     | 251/574 [00:27<00:33,  9.66it/s]

2021-05-27 16:55:22,533 INFO | INITIAL
2021-05-27 16:55:22,534 INFO | (50, 200)
2021-05-27 16:55:22,538 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,539 INFO | (50, 200, 512)
2021-05-27 16:55:22,540 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,540 INFO | (50, 200, 512)
2021-05-27 16:55:22,541 INFO | BERT LAYER
2021-05-27 16:55:22,541 INFO | (200, 512)
2021-05-27 16:55:22,542 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,542 INFO | (200, 512)
2021-05-27 16:55:22,543 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,543 INFO | (200, 512)
2021-05-27 16:55:22,548 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,549 INFO | (200, 512)
2021-05-27 16:55:22,549 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,549 INFO | (200, 512)
2021-05-27 16:55:22,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,555 INFO | (200, 512)
2021-05-27 16:55:22,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,556 INFO | (200, 512)
2021-05-27 16:55:22,563 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,563 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  44%|████▍     | 252/574 [00:27<00:33,  9.67it/s]

2021-05-27 16:55:22,636 INFO | INITIAL
2021-05-27 16:55:22,637 INFO | (50, 200)
2021-05-27 16:55:22,642 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,643 INFO | (50, 200, 512)
2021-05-27 16:55:22,644 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,645 INFO | (50, 200, 512)
2021-05-27 16:55:22,646 INFO | BERT LAYER
2021-05-27 16:55:22,646 INFO | (200, 512)
2021-05-27 16:55:22,647 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,648 INFO | (200, 512)
2021-05-27 16:55:22,648 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,648 INFO | (200, 512)
2021-05-27 16:55:22,655 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,656 INFO | (200, 512)
2021-05-27 16:55:22,657 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,657 INFO | (200, 512)
2021-05-27 16:55:22,664 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,664 INFO | (200, 512)
2021-05-27 16:55:22,665 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,665 INFO | (200, 512)
2021-05-27 16:55:22,670 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,670 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  44%|████▍     | 253/574 [00:27<00:33,  9.58it/s]

2021-05-27 16:55:22,744 INFO | INITIAL
2021-05-27 16:55:22,744 INFO | (50, 200)
2021-05-27 16:55:22,749 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,749 INFO | (50, 200, 512)
2021-05-27 16:55:22,751 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,751 INFO | (50, 200, 512)
2021-05-27 16:55:22,752 INFO | BERT LAYER
2021-05-27 16:55:22,752 INFO | (200, 512)
2021-05-27 16:55:22,752 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,753 INFO | (200, 512)
2021-05-27 16:55:22,753 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,753 INFO | (200, 512)
2021-05-27 16:55:22,760 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,760 INFO | (200, 512)
2021-05-27 16:55:22,761 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,762 INFO | (200, 512)
2021-05-27 16:55:22,768 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,769 INFO | (200, 512)
2021-05-27 16:55:22,769 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,770 INFO | (200, 512)
2021-05-27 16:55:22,775 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,776 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  44%|████▍     | 253/574 [00:28<00:33,  9.58it/s]

2021-05-27 16:55:22,842 INFO | INITIAL
2021-05-27 16:55:22,842 INFO | (50, 200)
2021-05-27 16:55:22,849 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,849 INFO | (50, 200, 512)
2021-05-27 16:55:22,851 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,851 INFO | (50, 200, 512)
2021-05-27 16:55:22,852 INFO | BERT LAYER
2021-05-27 16:55:22,852 INFO | (200, 512)
2021-05-27 16:55:22,853 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,853 INFO | (200, 512)
2021-05-27 16:55:22,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,854 INFO | (200, 512)
2021-05-27 16:55:22,860 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,861 INFO | (200, 512)
2021-05-27 16:55:22,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,862 INFO | (200, 512)
2021-05-27 16:55:22,867 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,868 INFO | (200, 512)
2021-05-27 16:55:22,868 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,869 INFO | (200, 512)
2021-05-27 16:55:22,874 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,874 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  44%|████▍     | 255/574 [00:28<00:32,  9.68it/s]

2021-05-27 16:55:22,947 INFO | INITIAL
2021-05-27 16:55:22,948 INFO | (50, 200)
2021-05-27 16:55:22,953 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:22,954 INFO | (50, 200, 512)
2021-05-27 16:55:22,955 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:22,956 INFO | (50, 200, 512)
2021-05-27 16:55:22,957 INFO | BERT LAYER
2021-05-27 16:55:22,958 INFO | (200, 512)
2021-05-27 16:55:22,958 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,959 INFO | (200, 512)
2021-05-27 16:55:22,960 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,960 INFO | (200, 512)
2021-05-27 16:55:22,967 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,967 INFO | (200, 512)
2021-05-27 16:55:22,968 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,968 INFO | (200, 512)
2021-05-27 16:55:22,973 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,974 INFO | (200, 512)
2021-05-27 16:55:22,974 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:22,975 INFO | (200, 512)
2021-05-27 16:55:22,981 INFO | BERT LAYER LOOP
2021-05-27 16:55:22,981 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▍     | 256/574 [00:28<00:32,  9.65it/s]

2021-05-27 16:55:23,052 INFO | INITIAL
2021-05-27 16:55:23,052 INFO | (50, 200)
2021-05-27 16:55:23,059 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,060 INFO | (50, 200, 512)
2021-05-27 16:55:23,061 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,062 INFO | (50, 200, 512)
2021-05-27 16:55:23,063 INFO | BERT LAYER
2021-05-27 16:55:23,063 INFO | (200, 512)
2021-05-27 16:55:23,064 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,064 INFO | (200, 512)
2021-05-27 16:55:23,065 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,065 INFO | (200, 512)
2021-05-27 16:55:23,073 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,073 INFO | (200, 512)
2021-05-27 16:55:23,073 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,074 INFO | (200, 512)
2021-05-27 16:55:23,081 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,082 INFO | (200, 512)
2021-05-27 16:55:23,082 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,083 INFO | (200, 512)
2021-05-27 16:55:23,089 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,090 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▍     | 257/574 [00:28<00:33,  9.53it/s]

2021-05-27 16:55:23,161 INFO | INITIAL
2021-05-27 16:55:23,162 INFO | (50, 200)
2021-05-27 16:55:23,172 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,173 INFO | (50, 200, 512)
2021-05-27 16:55:23,174 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,175 INFO | (50, 200, 512)
2021-05-27 16:55:23,176 INFO | BERT LAYER
2021-05-27 16:55:23,177 INFO | (200, 512)
2021-05-27 16:55:23,177 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,178 INFO | (200, 512)
2021-05-27 16:55:23,178 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,179 INFO | (200, 512)
2021-05-27 16:55:23,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,185 INFO | (200, 512)
2021-05-27 16:55:23,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,186 INFO | (200, 512)
2021-05-27 16:55:23,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,193 INFO | (200, 512)
2021-05-27 16:55:23,194 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,194 INFO | (200, 512)
2021-05-27 16:55:23,199 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,200 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▍     | 258/574 [00:28<00:33,  9.49it/s]

2021-05-27 16:55:23,267 INFO | INITIAL
2021-05-27 16:55:23,268 INFO | (50, 200)
2021-05-27 16:55:23,273 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,273 INFO | (50, 200, 512)
2021-05-27 16:55:23,275 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,275 INFO | (50, 200, 512)
2021-05-27 16:55:23,276 INFO | BERT LAYER
2021-05-27 16:55:23,277 INFO | (200, 512)
2021-05-27 16:55:23,277 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,278 INFO | (200, 512)
2021-05-27 16:55:23,278 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,279 INFO | (200, 512)
2021-05-27 16:55:23,285 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,285 INFO | (200, 512)
2021-05-27 16:55:23,286 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,286 INFO | (200, 512)
2021-05-27 16:55:23,292 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,293 INFO | (200, 512)
2021-05-27 16:55:23,294 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,295 INFO | (200, 512)
2021-05-27 16:55:23,301 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,301 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▌     | 259/574 [00:28<00:33,  9.50it/s]

2021-05-27 16:55:23,372 INFO | INITIAL
2021-05-27 16:55:23,373 INFO | (50, 200)
2021-05-27 16:55:23,379 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,380 INFO | (50, 200, 512)
2021-05-27 16:55:23,381 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,382 INFO | (50, 200, 512)
2021-05-27 16:55:23,383 INFO | BERT LAYER
2021-05-27 16:55:23,383 INFO | (200, 512)
2021-05-27 16:55:23,384 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,385 INFO | (200, 512)
2021-05-27 16:55:23,385 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,386 INFO | (200, 512)
2021-05-27 16:55:23,392 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,393 INFO | (200, 512)
2021-05-27 16:55:23,393 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,393 INFO | (200, 512)
2021-05-27 16:55:23,400 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,400 INFO | (200, 512)
2021-05-27 16:55:23,401 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,401 INFO | (200, 512)
2021-05-27 16:55:23,407 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,408 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▌     | 260/574 [00:28<00:33,  9.40it/s]

2021-05-27 16:55:23,481 INFO | INITIAL
2021-05-27 16:55:23,482 INFO | (50, 200)
2021-05-27 16:55:23,487 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,488 INFO | (50, 200, 512)
2021-05-27 16:55:23,490 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,490 INFO | (50, 200, 512)
2021-05-27 16:55:23,491 INFO | BERT LAYER
2021-05-27 16:55:23,492 INFO | (200, 512)
2021-05-27 16:55:23,492 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,493 INFO | (200, 512)
2021-05-27 16:55:23,496 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,496 INFO | (200, 512)
2021-05-27 16:55:23,502 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,503 INFO | (200, 512)
2021-05-27 16:55:23,503 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,504 INFO | (200, 512)
2021-05-27 16:55:23,509 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,510 INFO | (200, 512)
2021-05-27 16:55:23,510 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,511 INFO | (200, 512)
2021-05-27 16:55:23,516 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,517 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  45%|████▌     | 261/574 [00:28<00:33,  9.27it/s]

2021-05-27 16:55:23,593 INFO | INITIAL
2021-05-27 16:55:23,594 INFO | (50, 200)
2021-05-27 16:55:23,599 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,599 INFO | (50, 200, 512)
2021-05-27 16:55:23,601 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,601 INFO | (50, 200, 512)
2021-05-27 16:55:23,602 INFO | BERT LAYER
2021-05-27 16:55:23,602 INFO | (200, 512)
2021-05-27 16:55:23,603 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,603 INFO | (200, 512)
2021-05-27 16:55:23,604 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,604 INFO | (200, 512)
2021-05-27 16:55:23,611 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,612 INFO | (200, 512)
2021-05-27 16:55:23,613 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,613 INFO | (200, 512)
2021-05-27 16:55:23,619 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,620 INFO | (200, 512)
2021-05-27 16:55:23,620 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,621 INFO | (200, 512)
2021-05-27 16:55:23,628 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,629 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  46%|████▌     | 262/574 [00:28<00:33,  9.24it/s]

2021-05-27 16:55:23,702 INFO | INITIAL
2021-05-27 16:55:23,703 INFO | (50, 200)
2021-05-27 16:55:23,709 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,710 INFO | (50, 200, 512)
2021-05-27 16:55:23,711 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,712 INFO | (50, 200, 512)
2021-05-27 16:55:23,713 INFO | BERT LAYER
2021-05-27 16:55:23,714 INFO | (200, 512)
2021-05-27 16:55:23,714 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,715 INFO | (200, 512)
2021-05-27 16:55:23,715 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,716 INFO | (200, 512)
2021-05-27 16:55:23,722 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,723 INFO | (200, 512)
2021-05-27 16:55:23,723 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,724 INFO | (200, 512)
2021-05-27 16:55:23,729 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,730 INFO | (200, 512)
2021-05-27 16:55:23,731 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,731 INFO | (200, 512)
2021-05-27 16:55:23,737 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,737 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  46%|████▌     | 263/574 [00:29<00:33,  9.15it/s]

2021-05-27 16:55:23,814 INFO | INITIAL
2021-05-27 16:55:23,815 INFO | (50, 200)
2021-05-27 16:55:23,820 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,820 INFO | (50, 200, 512)
2021-05-27 16:55:23,822 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,822 INFO | (50, 200, 512)
2021-05-27 16:55:23,823 INFO | BERT LAYER
2021-05-27 16:55:23,824 INFO | (200, 512)
2021-05-27 16:55:23,824 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,824 INFO | (200, 512)
2021-05-27 16:55:23,825 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,825 INFO | (200, 512)
2021-05-27 16:55:23,832 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,833 INFO | (200, 512)
2021-05-27 16:55:23,833 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,834 INFO | (200, 512)
2021-05-27 16:55:23,840 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,840 INFO | (200, 512)
2021-05-27 16:55:23,841 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,841 INFO | (200, 512)
2021-05-27 16:55:23,848 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,848 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  46%|████▌     | 264/574 [00:29<00:33,  9.17it/s]

2021-05-27 16:55:23,923 INFO | INITIAL
2021-05-27 16:55:23,923 INFO | (50, 200)
2021-05-27 16:55:23,931 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:23,931 INFO | (50, 200, 512)
2021-05-27 16:55:23,933 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:23,933 INFO | (50, 200, 512)
2021-05-27 16:55:23,934 INFO | BERT LAYER
2021-05-27 16:55:23,935 INFO | (200, 512)
2021-05-27 16:55:23,935 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,935 INFO | (200, 512)
2021-05-27 16:55:23,936 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,936 INFO | (200, 512)
2021-05-27 16:55:23,942 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,943 INFO | (200, 512)
2021-05-27 16:55:23,943 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,944 INFO | (200, 512)
2021-05-27 16:55:23,949 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,950 INFO | (200, 512)
2021-05-27 16:55:23,950 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:23,951 INFO | (200, 512)
2021-05-27 16:55:23,956 INFO | BERT LAYER LOOP
2021-05-27 16:55:23,956 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  46%|████▌     | 265/574 [00:29<00:33,  9.12it/s]

2021-05-27 16:55:24,034 INFO | INITIAL
2021-05-27 16:55:24,034 INFO | (50, 200)
2021-05-27 16:55:24,040 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,041 INFO | (50, 200, 512)
2021-05-27 16:55:24,042 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,043 INFO | (50, 200, 512)
2021-05-27 16:55:24,044 INFO | BERT LAYER
2021-05-27 16:55:24,045 INFO | (200, 512)
2021-05-27 16:55:24,045 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,046 INFO | (200, 512)
2021-05-27 16:55:24,047 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,047 INFO | (200, 512)
2021-05-27 16:55:24,055 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,055 INFO | (200, 512)
2021-05-27 16:55:24,055 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,056 INFO | (200, 512)
2021-05-27 16:55:24,064 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,064 INFO | (200, 512)
2021-05-27 16:55:24,065 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,065 INFO | (200, 512)
2021-05-27 16:55:24,070 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,071 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  46%|████▋     | 266/574 [00:29<00:33,  9.10it/s]

2021-05-27 16:55:24,144 INFO | INITIAL
2021-05-27 16:55:24,145 INFO | (50, 200)
2021-05-27 16:55:24,150 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,150 INFO | (50, 200, 512)
2021-05-27 16:55:24,152 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,152 INFO | (50, 200, 512)
2021-05-27 16:55:24,153 INFO | BERT LAYER
2021-05-27 16:55:24,153 INFO | (200, 512)
2021-05-27 16:55:24,154 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,154 INFO | (200, 512)
2021-05-27 16:55:24,155 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,155 INFO | (200, 512)
2021-05-27 16:55:24,163 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,163 INFO | (200, 512)
2021-05-27 16:55:24,164 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,165 INFO | (200, 512)
2021-05-27 16:55:24,171 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,171 INFO | (200, 512)
2021-05-27 16:55:24,172 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,172 INFO | (200, 512)
2021-05-27 16:55:24,178 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,179 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 267/574 [00:29<00:33,  9.06it/s]

2021-05-27 16:55:24,256 INFO | INITIAL
2021-05-27 16:55:24,256 INFO | (50, 200)
2021-05-27 16:55:24,263 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,263 INFO | (50, 200, 512)
2021-05-27 16:55:24,265 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,265 INFO | (50, 200, 512)
2021-05-27 16:55:24,266 INFO | BERT LAYER
2021-05-27 16:55:24,266 INFO | (200, 512)
2021-05-27 16:55:24,267 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,267 INFO | (200, 512)
2021-05-27 16:55:24,267 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,268 INFO | (200, 512)
2021-05-27 16:55:24,273 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,273 INFO | (200, 512)
2021-05-27 16:55:24,274 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,275 INFO | (200, 512)
2021-05-27 16:55:24,282 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,283 INFO | (200, 512)
2021-05-27 16:55:24,283 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,283 INFO | (200, 512)
2021-05-27 16:55:24,290 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,291 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 268/574 [00:29<00:34,  8.99it/s]

2021-05-27 16:55:24,369 INFO | INITIAL
2021-05-27 16:55:24,370 INFO | (50, 200)
2021-05-27 16:55:24,377 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,377 INFO | (50, 200, 512)
2021-05-27 16:55:24,379 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,380 INFO | (50, 200, 512)
2021-05-27 16:55:24,381 INFO | BERT LAYER
2021-05-27 16:55:24,381 INFO | (200, 512)
2021-05-27 16:55:24,382 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,382 INFO | (200, 512)
2021-05-27 16:55:24,383 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,383 INFO | (200, 512)
2021-05-27 16:55:24,389 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,390 INFO | (200, 512)
2021-05-27 16:55:24,390 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,391 INFO | (200, 512)
2021-05-27 16:55:24,398 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,398 INFO | (200, 512)
2021-05-27 16:55:24,399 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,399 INFO | (200, 512)
2021-05-27 16:55:24,406 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,406 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 269/574 [00:29<00:33,  8.99it/s]

2021-05-27 16:55:24,480 INFO | INITIAL
2021-05-27 16:55:24,480 INFO | (50, 200)
2021-05-27 16:55:24,486 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,486 INFO | (50, 200, 512)
2021-05-27 16:55:24,488 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,488 INFO | (50, 200, 512)
2021-05-27 16:55:24,489 INFO | BERT LAYER
2021-05-27 16:55:24,489 INFO | (200, 512)
2021-05-27 16:55:24,489 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,490 INFO | (200, 512)
2021-05-27 16:55:24,490 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,491 INFO | (200, 512)
2021-05-27 16:55:24,498 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,499 INFO | (200, 512)
2021-05-27 16:55:24,500 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,500 INFO | (200, 512)
2021-05-27 16:55:24,505 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,506 INFO | (200, 512)
2021-05-27 16:55:24,506 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,507 INFO | (200, 512)
2021-05-27 16:55:24,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,514 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 270/574 [00:29<00:32,  9.21it/s]

2021-05-27 16:55:24,582 INFO | INITIAL
2021-05-27 16:55:24,583 INFO | (50, 200)
2021-05-27 16:55:24,588 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,589 INFO | (50, 200, 512)
2021-05-27 16:55:24,591 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,592 INFO | (50, 200, 512)
2021-05-27 16:55:24,593 INFO | BERT LAYER
2021-05-27 16:55:24,593 INFO | (200, 512)
2021-05-27 16:55:24,594 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,594 INFO | (200, 512)
2021-05-27 16:55:24,594 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,595 INFO | (200, 512)
2021-05-27 16:55:24,601 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,601 INFO | (200, 512)
2021-05-27 16:55:24,602 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,602 INFO | (200, 512)
2021-05-27 16:55:24,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,608 INFO | (200, 512)
2021-05-27 16:55:24,609 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,609 INFO | (200, 512)
2021-05-27 16:55:24,616 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,617 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 271/574 [00:29<00:32,  9.33it/s]

2021-05-27 16:55:24,687 INFO | INITIAL
2021-05-27 16:55:24,687 INFO | (50, 200)
2021-05-27 16:55:24,694 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,695 INFO | (50, 200, 512)
2021-05-27 16:55:24,696 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,697 INFO | (50, 200, 512)
2021-05-27 16:55:24,698 INFO | BERT LAYER
2021-05-27 16:55:24,699 INFO | (200, 512)
2021-05-27 16:55:24,699 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,700 INFO | (200, 512)
2021-05-27 16:55:24,701 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,702 INFO | (200, 512)
2021-05-27 16:55:24,708 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,709 INFO | (200, 512)
2021-05-27 16:55:24,710 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,710 INFO | (200, 512)
2021-05-27 16:55:24,716 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,716 INFO | (200, 512)
2021-05-27 16:55:24,717 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,718 INFO | (200, 512)
2021-05-27 16:55:24,723 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,724 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  47%|████▋     | 272/574 [00:29<00:33,  9.13it/s]

2021-05-27 16:55:24,801 INFO | INITIAL
2021-05-27 16:55:24,802 INFO | (50, 200)
2021-05-27 16:55:24,812 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,813 INFO | (50, 200, 512)
2021-05-27 16:55:24,814 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,815 INFO | (50, 200, 512)
2021-05-27 16:55:24,816 INFO | BERT LAYER
2021-05-27 16:55:24,816 INFO | (200, 512)
2021-05-27 16:55:24,817 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,817 INFO | (200, 512)
2021-05-27 16:55:24,818 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,818 INFO | (200, 512)
2021-05-27 16:55:24,825 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,825 INFO | (200, 512)
2021-05-27 16:55:24,826 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,826 INFO | (200, 512)
2021-05-27 16:55:24,834 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,835 INFO | (200, 512)
2021-05-27 16:55:24,836 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,836 INFO | (200, 512)
2021-05-27 16:55:24,843 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,845 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 273/574 [00:30<00:34,  8.64it/s]

2021-05-27 16:55:24,932 INFO | INITIAL
2021-05-27 16:55:24,933 INFO | (50, 200)
2021-05-27 16:55:24,940 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:24,941 INFO | (50, 200, 512)
2021-05-27 16:55:24,943 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:24,943 INFO | (50, 200, 512)
2021-05-27 16:55:24,944 INFO | BERT LAYER
2021-05-27 16:55:24,944 INFO | (200, 512)
2021-05-27 16:55:24,944 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,945 INFO | (200, 512)
2021-05-27 16:55:24,945 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,946 INFO | (200, 512)
2021-05-27 16:55:24,953 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,953 INFO | (200, 512)
2021-05-27 16:55:24,954 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,954 INFO | (200, 512)
2021-05-27 16:55:24,961 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,962 INFO | (200, 512)
2021-05-27 16:55:24,962 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:24,963 INFO | (200, 512)
2021-05-27 16:55:24,970 INFO | BERT LAYER LOOP
2021-05-27 16:55:24,970 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 274/574 [00:30<00:34,  8.71it/s]

2021-05-27 16:55:25,044 INFO | INITIAL
2021-05-27 16:55:25,045 INFO | (50, 200)
2021-05-27 16:55:25,050 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,050 INFO | (50, 200, 512)
2021-05-27 16:55:25,052 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,052 INFO | (50, 200, 512)
2021-05-27 16:55:25,053 INFO | BERT LAYER
2021-05-27 16:55:25,053 INFO | (200, 512)
2021-05-27 16:55:25,054 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,054 INFO | (200, 512)
2021-05-27 16:55:25,055 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,056 INFO | (200, 512)
2021-05-27 16:55:25,062 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,063 INFO | (200, 512)
2021-05-27 16:55:25,063 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,064 INFO | (200, 512)
2021-05-27 16:55:25,071 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,071 INFO | (200, 512)
2021-05-27 16:55:25,072 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,072 INFO | (200, 512)
2021-05-27 16:55:25,080 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,080 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 275/574 [00:30<00:33,  8.85it/s]

2021-05-27 16:55:25,153 INFO | INITIAL
2021-05-27 16:55:25,155 INFO | (50, 200)
2021-05-27 16:55:25,161 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,162 INFO | (50, 200, 512)
2021-05-27 16:55:25,164 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,165 INFO | (50, 200, 512)
2021-05-27 16:55:25,166 INFO | BERT LAYER
2021-05-27 16:55:25,167 INFO | (200, 512)
2021-05-27 16:55:25,168 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,168 INFO | (200, 512)
2021-05-27 16:55:25,169 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,169 INFO | (200, 512)
2021-05-27 16:55:25,178 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,178 INFO | (200, 512)
2021-05-27 16:55:25,179 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,179 INFO | (200, 512)
2021-05-27 16:55:25,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,186 INFO | (200, 512)
2021-05-27 16:55:25,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,187 INFO | (200, 512)
2021-05-27 16:55:25,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,193 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 276/574 [00:30<00:33,  8.89it/s]

2021-05-27 16:55:25,265 INFO | INITIAL
2021-05-27 16:55:25,266 INFO | (50, 200)
2021-05-27 16:55:25,270 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,271 INFO | (50, 200, 512)
2021-05-27 16:55:25,272 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,272 INFO | (50, 200, 512)
2021-05-27 16:55:25,273 INFO | BERT LAYER
2021-05-27 16:55:25,273 INFO | (200, 512)
2021-05-27 16:55:25,274 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,274 INFO | (200, 512)
2021-05-27 16:55:25,275 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,276 INFO | (200, 512)
2021-05-27 16:55:25,281 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,281 INFO | (200, 512)
2021-05-27 16:55:25,281 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,282 INFO | (200, 512)
2021-05-27 16:55:25,288 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,289 INFO | (200, 512)
2021-05-27 16:55:25,289 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,290 INFO | (200, 512)
2021-05-27 16:55:25,298 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,298 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 277/574 [00:30<00:32,  9.07it/s]

2021-05-27 16:55:25,369 INFO | INITIAL
2021-05-27 16:55:25,370 INFO | (50, 200)
2021-05-27 16:55:25,377 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,377 INFO | (50, 200, 512)
2021-05-27 16:55:25,378 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,379 INFO | (50, 200, 512)
2021-05-27 16:55:25,380 INFO | BERT LAYER
2021-05-27 16:55:25,380 INFO | (200, 512)
2021-05-27 16:55:25,380 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,381 INFO | (200, 512)
2021-05-27 16:55:25,381 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,381 INFO | (200, 512)
2021-05-27 16:55:25,387 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,387 INFO | (200, 512)
2021-05-27 16:55:25,387 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,388 INFO | (200, 512)
2021-05-27 16:55:25,393 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,394 INFO | (200, 512)
2021-05-27 16:55:25,394 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,394 INFO | (200, 512)
2021-05-27 16:55:25,400 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,400 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  48%|████▊     | 278/574 [00:30<00:32,  9.24it/s]

2021-05-27 16:55:25,473 INFO | INITIAL
2021-05-27 16:55:25,473 INFO | (50, 200)
2021-05-27 16:55:25,482 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,483 INFO | (50, 200, 512)
2021-05-27 16:55:25,484 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,484 INFO | (50, 200, 512)
2021-05-27 16:55:25,485 INFO | BERT LAYER
2021-05-27 16:55:25,485 INFO | (200, 512)
2021-05-27 16:55:25,486 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,486 INFO | (200, 512)
2021-05-27 16:55:25,486 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,487 INFO | (200, 512)
2021-05-27 16:55:25,493 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,493 INFO | (200, 512)
2021-05-27 16:55:25,493 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,494 INFO | (200, 512)
2021-05-27 16:55:25,499 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,500 INFO | (200, 512)
2021-05-27 16:55:25,500 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,501 INFO | (200, 512)
2021-05-27 16:55:25,507 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,507 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▊     | 279/574 [00:30<00:31,  9.30it/s]

2021-05-27 16:55:25,580 INFO | INITIAL
2021-05-27 16:55:25,581 INFO | (50, 200)
2021-05-27 16:55:25,588 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,588 INFO | (50, 200, 512)
2021-05-27 16:55:25,590 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,590 INFO | (50, 200, 512)
2021-05-27 16:55:25,595 INFO | BERT LAYER
2021-05-27 16:55:25,596 INFO | (200, 512)
2021-05-27 16:55:25,596 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,597 INFO | (200, 512)
2021-05-27 16:55:25,599 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,599 INFO | (200, 512)
2021-05-27 16:55:25,606 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,606 INFO | (200, 512)
2021-05-27 16:55:25,607 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,607 INFO | (200, 512)
2021-05-27 16:55:25,614 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,615 INFO | (200, 512)
2021-05-27 16:55:25,615 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,616 INFO | (200, 512)
2021-05-27 16:55:25,623 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,623 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▉     | 280/574 [00:30<00:33,  8.88it/s]

2021-05-27 16:55:25,704 INFO | INITIAL
2021-05-27 16:55:25,704 INFO | (50, 200)
2021-05-27 16:55:25,711 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,712 INFO | (50, 200, 512)
2021-05-27 16:55:25,713 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,713 INFO | (50, 200, 512)
2021-05-27 16:55:25,714 INFO | BERT LAYER
2021-05-27 16:55:25,714 INFO | (200, 512)
2021-05-27 16:55:25,715 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,715 INFO | (200, 512)
2021-05-27 16:55:25,716 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,716 INFO | (200, 512)
2021-05-27 16:55:25,721 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,722 INFO | (200, 512)
2021-05-27 16:55:25,722 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,723 INFO | (200, 512)
2021-05-27 16:55:25,728 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,728 INFO | (200, 512)
2021-05-27 16:55:25,729 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,729 INFO | (200, 512)
2021-05-27 16:55:25,735 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,736 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▉     | 281/574 [00:30<00:32,  9.15it/s]

2021-05-27 16:55:25,805 INFO | INITIAL
2021-05-27 16:55:25,805 INFO | (50, 200)
2021-05-27 16:55:25,810 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,811 INFO | (50, 200, 512)
2021-05-27 16:55:25,812 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,812 INFO | (50, 200, 512)
2021-05-27 16:55:25,813 INFO | BERT LAYER
2021-05-27 16:55:25,814 INFO | (200, 512)
2021-05-27 16:55:25,814 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,814 INFO | (200, 512)
2021-05-27 16:55:25,815 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,815 INFO | (200, 512)
2021-05-27 16:55:25,821 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,821 INFO | (200, 512)
2021-05-27 16:55:25,822 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,822 INFO | (200, 512)
2021-05-27 16:55:25,828 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,828 INFO | (200, 512)
2021-05-27 16:55:25,828 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,829 INFO | (200, 512)
2021-05-27 16:55:25,836 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,837 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▉     | 282/574 [00:31<00:31,  9.32it/s]

2021-05-27 16:55:25,907 INFO | INITIAL
2021-05-27 16:55:25,908 INFO | (50, 200)
2021-05-27 16:55:25,919 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:25,920 INFO | (50, 200, 512)
2021-05-27 16:55:25,921 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:25,921 INFO | (50, 200, 512)
2021-05-27 16:55:25,923 INFO | BERT LAYER
2021-05-27 16:55:25,923 INFO | (200, 512)
2021-05-27 16:55:25,923 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,924 INFO | (200, 512)
2021-05-27 16:55:25,924 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,926 INFO | (200, 512)
2021-05-27 16:55:25,933 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,933 INFO | (200, 512)
2021-05-27 16:55:25,934 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,934 INFO | (200, 512)
2021-05-27 16:55:25,941 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,942 INFO | (200, 512)
2021-05-27 16:55:25,942 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:25,943 INFO | (200, 512)
2021-05-27 16:55:25,948 INFO | BERT LAYER LOOP
2021-05-27 16:55:25,949 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▉     | 283/574 [00:31<00:31,  9.15it/s]

2021-05-27 16:55:26,021 INFO | INITIAL
2021-05-27 16:55:26,022 INFO | (50, 200)
2021-05-27 16:55:26,028 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,029 INFO | (50, 200, 512)
2021-05-27 16:55:26,030 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,030 INFO | (50, 200, 512)
2021-05-27 16:55:26,031 INFO | BERT LAYER
2021-05-27 16:55:26,032 INFO | (200, 512)
2021-05-27 16:55:26,033 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,033 INFO | (200, 512)
2021-05-27 16:55:26,034 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,034 INFO | (200, 512)
2021-05-27 16:55:26,040 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,041 INFO | (200, 512)
2021-05-27 16:55:26,042 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,043 INFO | (200, 512)
2021-05-27 16:55:26,049 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,050 INFO | (200, 512)
2021-05-27 16:55:26,050 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,051 INFO | (200, 512)
2021-05-27 16:55:26,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,058 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  49%|████▉     | 284/574 [00:31<00:31,  9.15it/s]

2021-05-27 16:55:26,131 INFO | INITIAL
2021-05-27 16:55:26,132 INFO | (50, 200)
2021-05-27 16:55:26,138 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,139 INFO | (50, 200, 512)
2021-05-27 16:55:26,141 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,141 INFO | (50, 200, 512)
2021-05-27 16:55:26,142 INFO | BERT LAYER
2021-05-27 16:55:26,143 INFO | (200, 512)
2021-05-27 16:55:26,144 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,144 INFO | (200, 512)
2021-05-27 16:55:26,145 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,145 INFO | (200, 512)
2021-05-27 16:55:26,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,152 INFO | (200, 512)
2021-05-27 16:55:26,153 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,153 INFO | (200, 512)
2021-05-27 16:55:26,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,159 INFO | (200, 512)
2021-05-27 16:55:26,160 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,160 INFO | (200, 512)
2021-05-27 16:55:26,165 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,166 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  50%|████▉     | 285/574 [00:31<00:31,  9.15it/s]

2021-05-27 16:55:26,240 INFO | INITIAL
2021-05-27 16:55:26,242 INFO | (50, 200)
2021-05-27 16:55:26,249 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,249 INFO | (50, 200, 512)
2021-05-27 16:55:26,251 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,251 INFO | (50, 200, 512)
2021-05-27 16:55:26,252 INFO | BERT LAYER
2021-05-27 16:55:26,252 INFO | (200, 512)
2021-05-27 16:55:26,252 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,253 INFO | (200, 512)
2021-05-27 16:55:26,253 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,253 INFO | (200, 512)
2021-05-27 16:55:26,259 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,259 INFO | (200, 512)
2021-05-27 16:55:26,261 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,261 INFO | (200, 512)
2021-05-27 16:55:26,267 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,268 INFO | (200, 512)
2021-05-27 16:55:26,268 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,268 INFO | (200, 512)
2021-05-27 16:55:26,275 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,276 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  50%|████▉     | 286/574 [00:31<00:31,  9.19it/s]

2021-05-27 16:55:26,347 INFO | INITIAL
2021-05-27 16:55:26,348 INFO | (50, 200)
2021-05-27 16:55:26,353 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,354 INFO | (50, 200, 512)
2021-05-27 16:55:26,355 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,355 INFO | (50, 200, 512)
2021-05-27 16:55:26,356 INFO | BERT LAYER
2021-05-27 16:55:26,357 INFO | (200, 512)
2021-05-27 16:55:26,357 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,357 INFO | (200, 512)
2021-05-27 16:55:26,358 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,358 INFO | (200, 512)
2021-05-27 16:55:26,364 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,365 INFO | (200, 512)
2021-05-27 16:55:26,366 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,366 INFO | (200, 512)
2021-05-27 16:55:26,372 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,373 INFO | (200, 512)
2021-05-27 16:55:26,373 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,374 INFO | (200, 512)
2021-05-27 16:55:26,380 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,381 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  50%|█████     | 287/574 [00:31<00:30,  9.27it/s]

2021-05-27 16:55:26,453 INFO | INITIAL
2021-05-27 16:55:26,453 INFO | (50, 200)
2021-05-27 16:55:26,460 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,460 INFO | (50, 200, 512)
2021-05-27 16:55:26,461 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,462 INFO | (50, 200, 512)
2021-05-27 16:55:26,463 INFO | BERT LAYER
2021-05-27 16:55:26,463 INFO | (200, 512)
2021-05-27 16:55:26,464 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,464 INFO | (200, 512)
2021-05-27 16:55:26,464 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,465 INFO | (200, 512)
2021-05-27 16:55:26,471 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,471 INFO | (200, 512)
2021-05-27 16:55:26,472 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,472 INFO | (200, 512)
2021-05-27 16:55:26,479 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,479 INFO | (200, 512)
2021-05-27 16:55:26,480 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,480 INFO | (200, 512)
2021-05-27 16:55:26,486 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,486 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  50%|█████     | 288/574 [00:31<00:30,  9.37it/s]

2021-05-27 16:55:26,557 INFO | INITIAL
2021-05-27 16:55:26,558 INFO | (50, 200)
2021-05-27 16:55:26,566 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,566 INFO | (50, 200, 512)
2021-05-27 16:55:26,568 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,569 INFO | (50, 200, 512)
2021-05-27 16:55:26,569 INFO | BERT LAYER
2021-05-27 16:55:26,570 INFO | (200, 512)
2021-05-27 16:55:26,570 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,570 INFO | (200, 512)
2021-05-27 16:55:26,571 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,571 INFO | (200, 512)
2021-05-27 16:55:26,580 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,580 INFO | (200, 512)
2021-05-27 16:55:26,581 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,581 INFO | (200, 512)
2021-05-27 16:55:26,586 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,587 INFO | (200, 512)
2021-05-27 16:55:26,587 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,588 INFO | (200, 512)
2021-05-27 16:55:26,596 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,596 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  50%|█████     | 289/574 [00:31<00:31,  9.15it/s]

2021-05-27 16:55:26,672 INFO | INITIAL
2021-05-27 16:55:26,673 INFO | (50, 200)
2021-05-27 16:55:26,679 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,679 INFO | (50, 200, 512)
2021-05-27 16:55:26,680 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,681 INFO | (50, 200, 512)
2021-05-27 16:55:26,682 INFO | BERT LAYER
2021-05-27 16:55:26,682 INFO | (200, 512)
2021-05-27 16:55:26,682 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,683 INFO | (200, 512)
2021-05-27 16:55:26,683 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,683 INFO | (200, 512)
2021-05-27 16:55:26,688 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,689 INFO | (200, 512)
2021-05-27 16:55:26,689 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,690 INFO | (200, 512)
2021-05-27 16:55:26,695 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,696 INFO | (200, 512)
2021-05-27 16:55:26,696 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,697 INFO | (200, 512)
2021-05-27 16:55:26,703 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,704 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████     | 290/574 [00:31<00:30,  9.20it/s]

2021-05-27 16:55:26,780 INFO | INITIAL
2021-05-27 16:55:26,781 INFO | (50, 200)
2021-05-27 16:55:26,786 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,786 INFO | (50, 200, 512)
2021-05-27 16:55:26,788 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,788 INFO | (50, 200, 512)
2021-05-27 16:55:26,789 INFO | BERT LAYER
2021-05-27 16:55:26,789 INFO | (200, 512)
2021-05-27 16:55:26,790 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,790 INFO | (200, 512)
2021-05-27 16:55:26,790 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,791 INFO | (200, 512)
2021-05-27 16:55:26,799 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,799 INFO | (200, 512)
2021-05-27 16:55:26,800 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,800 INFO | (200, 512)
2021-05-27 16:55:26,805 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,805 INFO | (200, 512)
2021-05-27 16:55:26,806 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,806 INFO | (200, 512)
2021-05-27 16:55:26,812 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,812 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████     | 291/574 [00:32<00:30,  9.24it/s]

2021-05-27 16:55:26,887 INFO | INITIAL
2021-05-27 16:55:26,887 INFO | (50, 200)
2021-05-27 16:55:26,893 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,894 INFO | (50, 200, 512)
2021-05-27 16:55:26,896 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,896 INFO | (50, 200, 512)
2021-05-27 16:55:26,897 INFO | BERT LAYER
2021-05-27 16:55:26,897 INFO | (200, 512)
2021-05-27 16:55:26,898 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,898 INFO | (200, 512)
2021-05-27 16:55:26,898 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,899 INFO | (200, 512)
2021-05-27 16:55:26,906 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,906 INFO | (200, 512)
2021-05-27 16:55:26,907 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,907 INFO | (200, 512)
2021-05-27 16:55:26,913 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,914 INFO | (200, 512)
2021-05-27 16:55:26,914 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,915 INFO | (200, 512)
2021-05-27 16:55:26,920 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,920 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████     | 291/574 [00:32<00:30,  9.24it/s]

2021-05-27 16:55:26,986 INFO | INITIAL
2021-05-27 16:55:26,987 INFO | (50, 200)
2021-05-27 16:55:26,992 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:26,992 INFO | (50, 200, 512)
2021-05-27 16:55:26,994 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:26,994 INFO | (50, 200, 512)
2021-05-27 16:55:26,995 INFO | BERT LAYER
2021-05-27 16:55:26,995 INFO | (200, 512)
2021-05-27 16:55:26,995 INFO | BERT LAYER LOOP
2021-05-27 16:55:26,996 INFO | (200, 512)
2021-05-27 16:55:26,996 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:26,996 INFO | (200, 512)
2021-05-27 16:55:27,003 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,004 INFO | (200, 512)
2021-05-27 16:55:27,005 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,005 INFO | (200, 512)
2021-05-27 16:55:27,012 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,013 INFO | (200, 512)
2021-05-27 16:55:27,013 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,014 INFO | (200, 512)
2021-05-27 16:55:27,019 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,019 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████     | 293/574 [00:32<00:29,  9.63it/s]

2021-05-27 16:55:27,085 INFO | INITIAL
2021-05-27 16:55:27,085 INFO | (50, 200)
2021-05-27 16:55:27,090 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,092 INFO | (50, 200, 512)
2021-05-27 16:55:27,093 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,094 INFO | (50, 200, 512)
2021-05-27 16:55:27,095 INFO | BERT LAYER
2021-05-27 16:55:27,095 INFO | (200, 512)
2021-05-27 16:55:27,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,096 INFO | (200, 512)
2021-05-27 16:55:27,096 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,096 INFO | (200, 512)
2021-05-27 16:55:27,103 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,104 INFO | (200, 512)
2021-05-27 16:55:27,104 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,104 INFO | (200, 512)
2021-05-27 16:55:27,110 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,111 INFO | (200, 512)
2021-05-27 16:55:27,111 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,112 INFO | (200, 512)
2021-05-27 16:55:27,117 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,118 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████     | 294/574 [00:32<00:29,  9.65it/s]

2021-05-27 16:55:27,187 INFO | INITIAL
2021-05-27 16:55:27,188 INFO | (50, 200)
2021-05-27 16:55:27,193 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,194 INFO | (50, 200, 512)
2021-05-27 16:55:27,195 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,196 INFO | (50, 200, 512)
2021-05-27 16:55:27,196 INFO | BERT LAYER
2021-05-27 16:55:27,197 INFO | (200, 512)
2021-05-27 16:55:27,197 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,197 INFO | (200, 512)
2021-05-27 16:55:27,198 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,198 INFO | (200, 512)
2021-05-27 16:55:27,204 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,205 INFO | (200, 512)
2021-05-27 16:55:27,208 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,213 INFO | (200, 512)
2021-05-27 16:55:27,218 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,219 INFO | (200, 512)
2021-05-27 16:55:27,219 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,220 INFO | (200, 512)
2021-05-27 16:55:27,225 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,226 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  51%|█████▏    | 295/574 [00:32<00:29,  9.52it/s]

2021-05-27 16:55:27,296 INFO | INITIAL
2021-05-27 16:55:27,297 INFO | (50, 200)
2021-05-27 16:55:27,302 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,302 INFO | (50, 200, 512)
2021-05-27 16:55:27,304 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,304 INFO | (50, 200, 512)
2021-05-27 16:55:27,305 INFO | BERT LAYER
2021-05-27 16:55:27,305 INFO | (200, 512)
2021-05-27 16:55:27,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,306 INFO | (200, 512)
2021-05-27 16:55:27,307 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,307 INFO | (200, 512)
2021-05-27 16:55:27,314 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,315 INFO | (200, 512)
2021-05-27 16:55:27,316 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,316 INFO | (200, 512)
2021-05-27 16:55:27,323 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,324 INFO | (200, 512)
2021-05-27 16:55:27,325 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,325 INFO | (200, 512)
2021-05-27 16:55:27,331 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,331 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 296/574 [00:32<00:29,  9.42it/s]

2021-05-27 16:55:27,406 INFO | INITIAL
2021-05-27 16:55:27,406 INFO | (50, 200)
2021-05-27 16:55:27,415 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,415 INFO | (50, 200, 512)
2021-05-27 16:55:27,417 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,417 INFO | (50, 200, 512)
2021-05-27 16:55:27,418 INFO | BERT LAYER
2021-05-27 16:55:27,419 INFO | (200, 512)
2021-05-27 16:55:27,419 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,420 INFO | (200, 512)
2021-05-27 16:55:27,420 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,420 INFO | (200, 512)
2021-05-27 16:55:27,427 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,427 INFO | (200, 512)
2021-05-27 16:55:27,428 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,428 INFO | (200, 512)
2021-05-27 16:55:27,433 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,434 INFO | (200, 512)
2021-05-27 16:55:27,434 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,435 INFO | (200, 512)
2021-05-27 16:55:27,441 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,441 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 297/574 [00:32<00:29,  9.35it/s]

2021-05-27 16:55:27,515 INFO | INITIAL
2021-05-27 16:55:27,515 INFO | (50, 200)
2021-05-27 16:55:27,521 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,521 INFO | (50, 200, 512)
2021-05-27 16:55:27,523 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,523 INFO | (50, 200, 512)
2021-05-27 16:55:27,524 INFO | BERT LAYER
2021-05-27 16:55:27,525 INFO | (200, 512)
2021-05-27 16:55:27,526 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,528 INFO | (200, 512)
2021-05-27 16:55:27,529 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,530 INFO | (200, 512)
2021-05-27 16:55:27,535 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,536 INFO | (200, 512)
2021-05-27 16:55:27,536 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,536 INFO | (200, 512)
2021-05-27 16:55:27,542 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,542 INFO | (200, 512)
2021-05-27 16:55:27,543 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,543 INFO | (200, 512)
2021-05-27 16:55:27,548 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,549 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 298/574 [00:32<00:29,  9.34it/s]

2021-05-27 16:55:27,622 INFO | INITIAL
2021-05-27 16:55:27,623 INFO | (50, 200)
2021-05-27 16:55:27,629 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,630 INFO | (50, 200, 512)
2021-05-27 16:55:27,631 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,632 INFO | (50, 200, 512)
2021-05-27 16:55:27,633 INFO | BERT LAYER
2021-05-27 16:55:27,634 INFO | (200, 512)
2021-05-27 16:55:27,634 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,635 INFO | (200, 512)
2021-05-27 16:55:27,636 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,636 INFO | (200, 512)
2021-05-27 16:55:27,645 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,645 INFO | (200, 512)
2021-05-27 16:55:27,646 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,646 INFO | (200, 512)
2021-05-27 16:55:27,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,652 INFO | (200, 512)
2021-05-27 16:55:27,652 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,653 INFO | (200, 512)
2021-05-27 16:55:27,658 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,659 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 299/574 [00:32<00:29,  9.30it/s]

2021-05-27 16:55:27,731 INFO | INITIAL
2021-05-27 16:55:27,731 INFO | (50, 200)
2021-05-27 16:55:27,736 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,737 INFO | (50, 200, 512)
2021-05-27 16:55:27,738 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,739 INFO | (50, 200, 512)
2021-05-27 16:55:27,740 INFO | BERT LAYER
2021-05-27 16:55:27,740 INFO | (200, 512)
2021-05-27 16:55:27,740 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,741 INFO | (200, 512)
2021-05-27 16:55:27,742 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,742 INFO | (200, 512)
2021-05-27 16:55:27,748 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,749 INFO | (200, 512)
2021-05-27 16:55:27,749 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,749 INFO | (200, 512)
2021-05-27 16:55:27,757 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,757 INFO | (200, 512)
2021-05-27 16:55:27,758 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,758 INFO | (200, 512)
2021-05-27 16:55:27,764 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,765 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 300/574 [00:33<00:30,  9.09it/s]

2021-05-27 16:55:27,846 INFO | INITIAL
2021-05-27 16:55:27,847 INFO | (50, 200)
2021-05-27 16:55:27,852 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,853 INFO | (50, 200, 512)
2021-05-27 16:55:27,855 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,855 INFO | (50, 200, 512)
2021-05-27 16:55:27,856 INFO | BERT LAYER
2021-05-27 16:55:27,857 INFO | (200, 512)
2021-05-27 16:55:27,858 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,858 INFO | (200, 512)
2021-05-27 16:55:27,859 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,859 INFO | (200, 512)
2021-05-27 16:55:27,866 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,867 INFO | (200, 512)
2021-05-27 16:55:27,867 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,868 INFO | (200, 512)
2021-05-27 16:55:27,877 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,878 INFO | (200, 512)
2021-05-27 16:55:27,878 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,879 INFO | (200, 512)
2021-05-27 16:55:27,885 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,885 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  52%|█████▏    | 301/574 [00:33<00:30,  9.07it/s]

2021-05-27 16:55:27,957 INFO | INITIAL
2021-05-27 16:55:27,958 INFO | (50, 200)
2021-05-27 16:55:27,965 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:27,966 INFO | (50, 200, 512)
2021-05-27 16:55:27,967 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:27,968 INFO | (50, 200, 512)
2021-05-27 16:55:27,968 INFO | BERT LAYER
2021-05-27 16:55:27,969 INFO | (200, 512)
2021-05-27 16:55:27,969 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,970 INFO | (200, 512)
2021-05-27 16:55:27,970 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,971 INFO | (200, 512)
2021-05-27 16:55:27,977 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,978 INFO | (200, 512)
2021-05-27 16:55:27,978 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,978 INFO | (200, 512)
2021-05-27 16:55:27,983 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,984 INFO | (200, 512)
2021-05-27 16:55:27,984 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:27,984 INFO | (200, 512)
2021-05-27 16:55:27,990 INFO | BERT LAYER LOOP
2021-05-27 16:55:27,990 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 302/574 [00:33<00:29,  9.21it/s]

2021-05-27 16:55:28,062 INFO | INITIAL
2021-05-27 16:55:28,063 INFO | (50, 200)
2021-05-27 16:55:28,069 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,070 INFO | (50, 200, 512)
2021-05-27 16:55:28,071 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,071 INFO | (50, 200, 512)
2021-05-27 16:55:28,072 INFO | BERT LAYER
2021-05-27 16:55:28,073 INFO | (200, 512)
2021-05-27 16:55:28,073 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,073 INFO | (200, 512)
2021-05-27 16:55:28,074 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,074 INFO | (200, 512)
2021-05-27 16:55:28,081 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,081 INFO | (200, 512)
2021-05-27 16:55:28,081 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,082 INFO | (200, 512)
2021-05-27 16:55:28,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,089 INFO | (200, 512)
2021-05-27 16:55:28,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,089 INFO | (200, 512)
2021-05-27 16:55:28,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,095 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 303/574 [00:33<00:29,  9.18it/s]

2021-05-27 16:55:28,172 INFO | INITIAL
2021-05-27 16:55:28,173 INFO | (50, 200)
2021-05-27 16:55:28,181 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,181 INFO | (50, 200, 512)
2021-05-27 16:55:28,183 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,183 INFO | (50, 200, 512)
2021-05-27 16:55:28,184 INFO | BERT LAYER
2021-05-27 16:55:28,185 INFO | (200, 512)
2021-05-27 16:55:28,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,185 INFO | (200, 512)
2021-05-27 16:55:28,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,186 INFO | (200, 512)
2021-05-27 16:55:28,192 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,193 INFO | (200, 512)
2021-05-27 16:55:28,193 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,194 INFO | (200, 512)
2021-05-27 16:55:28,201 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,201 INFO | (200, 512)
2021-05-27 16:55:28,202 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,202 INFO | (200, 512)
2021-05-27 16:55:28,209 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,209 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 304/574 [00:33<00:29,  9.16it/s]

2021-05-27 16:55:28,282 INFO | INITIAL
2021-05-27 16:55:28,282 INFO | (50, 200)
2021-05-27 16:55:28,287 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,288 INFO | (50, 200, 512)
2021-05-27 16:55:28,289 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,290 INFO | (50, 200, 512)
2021-05-27 16:55:28,290 INFO | BERT LAYER
2021-05-27 16:55:28,291 INFO | (200, 512)
2021-05-27 16:55:28,291 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,292 INFO | (200, 512)
2021-05-27 16:55:28,292 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,293 INFO | (200, 512)
2021-05-27 16:55:28,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,299 INFO | (200, 512)
2021-05-27 16:55:28,299 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,300 INFO | (200, 512)
2021-05-27 16:55:28,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,307 INFO | (200, 512)
2021-05-27 16:55:28,308 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,308 INFO | (200, 512)
2021-05-27 16:55:28,314 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,314 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 305/574 [00:33<00:29,  9.26it/s]

2021-05-27 16:55:28,386 INFO | INITIAL
2021-05-27 16:55:28,387 INFO | (50, 200)
2021-05-27 16:55:28,392 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,393 INFO | (50, 200, 512)
2021-05-27 16:55:28,394 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,395 INFO | (50, 200, 512)
2021-05-27 16:55:28,396 INFO | BERT LAYER
2021-05-27 16:55:28,396 INFO | (200, 512)
2021-05-27 16:55:28,396 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,397 INFO | (200, 512)
2021-05-27 16:55:28,397 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,397 INFO | (200, 512)
2021-05-27 16:55:28,403 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,403 INFO | (200, 512)
2021-05-27 16:55:28,404 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,404 INFO | (200, 512)
2021-05-27 16:55:28,410 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,411 INFO | (200, 512)
2021-05-27 16:55:28,411 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,411 INFO | (200, 512)
2021-05-27 16:55:28,417 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,418 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 306/574 [00:33<00:28,  9.28it/s]

2021-05-27 16:55:28,494 INFO | INITIAL
2021-05-27 16:55:28,495 INFO | (50, 200)
2021-05-27 16:55:28,500 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,500 INFO | (50, 200, 512)
2021-05-27 16:55:28,502 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,502 INFO | (50, 200, 512)
2021-05-27 16:55:28,503 INFO | BERT LAYER
2021-05-27 16:55:28,504 INFO | (200, 512)
2021-05-27 16:55:28,504 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,504 INFO | (200, 512)
2021-05-27 16:55:28,505 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,505 INFO | (200, 512)
2021-05-27 16:55:28,512 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,513 INFO | (200, 512)
2021-05-27 16:55:28,513 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,513 INFO | (200, 512)
2021-05-27 16:55:28,520 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,520 INFO | (200, 512)
2021-05-27 16:55:28,521 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,521 INFO | (200, 512)
2021-05-27 16:55:28,527 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,528 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  53%|█████▎    | 307/574 [00:33<00:28,  9.43it/s]

2021-05-27 16:55:28,596 INFO | INITIAL
2021-05-27 16:55:28,596 INFO | (50, 200)
2021-05-27 16:55:28,601 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,602 INFO | (50, 200, 512)
2021-05-27 16:55:28,603 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,604 INFO | (50, 200, 512)
2021-05-27 16:55:28,605 INFO | BERT LAYER
2021-05-27 16:55:28,605 INFO | (200, 512)
2021-05-27 16:55:28,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,606 INFO | (200, 512)
2021-05-27 16:55:28,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,607 INFO | (200, 512)
2021-05-27 16:55:28,615 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,616 INFO | (200, 512)
2021-05-27 16:55:28,617 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,617 INFO | (200, 512)
2021-05-27 16:55:28,625 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,626 INFO | (200, 512)
2021-05-27 16:55:28,626 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,627 INFO | (200, 512)
2021-05-27 16:55:28,632 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,633 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  54%|█████▎    | 308/574 [00:33<00:28,  9.38it/s]

2021-05-27 16:55:28,705 INFO | INITIAL
2021-05-27 16:55:28,705 INFO | (50, 200)
2021-05-27 16:55:28,713 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,713 INFO | (50, 200, 512)
2021-05-27 16:55:28,715 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,715 INFO | (50, 200, 512)
2021-05-27 16:55:28,716 INFO | BERT LAYER
2021-05-27 16:55:28,719 INFO | (200, 512)
2021-05-27 16:55:28,719 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,720 INFO | (200, 512)
2021-05-27 16:55:28,720 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,722 INFO | (200, 512)
2021-05-27 16:55:28,729 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,729 INFO | (200, 512)
2021-05-27 16:55:28,730 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,730 INFO | (200, 512)
2021-05-27 16:55:28,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,737 INFO | (200, 512)
2021-05-27 16:55:28,737 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,737 INFO | (200, 512)
2021-05-27 16:55:28,744 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,744 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  54%|█████▍    | 309/574 [00:34<00:28,  9.24it/s]

2021-05-27 16:55:28,816 INFO | INITIAL
2021-05-27 16:55:28,816 INFO | (50, 200)
2021-05-27 16:55:28,821 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,822 INFO | (50, 200, 512)
2021-05-27 16:55:28,823 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,823 INFO | (50, 200, 512)
2021-05-27 16:55:28,824 INFO | BERT LAYER
2021-05-27 16:55:28,825 INFO | (200, 512)
2021-05-27 16:55:28,825 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,826 INFO | (200, 512)
2021-05-27 16:55:28,827 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,828 INFO | (200, 512)
2021-05-27 16:55:28,835 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,836 INFO | (200, 512)
2021-05-27 16:55:28,837 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,837 INFO | (200, 512)
2021-05-27 16:55:28,845 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,845 INFO | (200, 512)
2021-05-27 16:55:28,846 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,846 INFO | (200, 512)
2021-05-27 16:55:28,851 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,852 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  54%|█████▍    | 310/574 [00:34<00:28,  9.31it/s]

2021-05-27 16:55:28,921 INFO | INITIAL
2021-05-27 16:55:28,922 INFO | (50, 200)
2021-05-27 16:55:28,927 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:28,927 INFO | (50, 200, 512)
2021-05-27 16:55:28,929 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:28,929 INFO | (50, 200, 512)
2021-05-27 16:55:28,930 INFO | BERT LAYER
2021-05-27 16:55:28,931 INFO | (200, 512)
2021-05-27 16:55:28,931 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,931 INFO | (200, 512)
2021-05-27 16:55:28,932 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,932 INFO | (200, 512)
2021-05-27 16:55:28,937 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,938 INFO | (200, 512)
2021-05-27 16:55:28,938 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,939 INFO | (200, 512)
2021-05-27 16:55:28,945 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,946 INFO | (200, 512)
2021-05-27 16:55:28,946 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:28,947 INFO | (200, 512)
2021-05-27 16:55:28,952 INFO | BERT LAYER LOOP
2021-05-27 16:55:28,953 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  54%|█████▍    | 311/574 [00:34<00:27,  9.45it/s]

2021-05-27 16:55:29,024 INFO | INITIAL
2021-05-27 16:55:29,025 INFO | (50, 200)
2021-05-27 16:55:29,031 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,032 INFO | (50, 200, 512)
2021-05-27 16:55:29,033 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,033 INFO | (50, 200, 512)
2021-05-27 16:55:29,034 INFO | BERT LAYER
2021-05-27 16:55:29,034 INFO | (200, 512)
2021-05-27 16:55:29,035 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,035 INFO | (200, 512)
2021-05-27 16:55:29,036 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,036 INFO | (200, 512)
2021-05-27 16:55:29,043 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,044 INFO | (200, 512)
2021-05-27 16:55:29,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,045 INFO | (200, 512)
2021-05-27 16:55:29,050 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,051 INFO | (200, 512)
2021-05-27 16:55:29,051 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,052 INFO | (200, 512)
2021-05-27 16:55:29,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,058 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  54%|█████▍    | 312/574 [00:34<00:27,  9.39it/s]

2021-05-27 16:55:29,132 INFO | INITIAL
2021-05-27 16:55:29,132 INFO | (50, 200)
2021-05-27 16:55:29,138 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,139 INFO | (50, 200, 512)
2021-05-27 16:55:29,140 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,140 INFO | (50, 200, 512)
2021-05-27 16:55:29,142 INFO | BERT LAYER
2021-05-27 16:55:29,142 INFO | (200, 512)
2021-05-27 16:55:29,143 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,143 INFO | (200, 512)
2021-05-27 16:55:29,144 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,145 INFO | (200, 512)
2021-05-27 16:55:29,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,152 INFO | (200, 512)
2021-05-27 16:55:29,153 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,153 INFO | (200, 512)
2021-05-27 16:55:29,159 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,159 INFO | (200, 512)
2021-05-27 16:55:29,160 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,160 INFO | (200, 512)
2021-05-27 16:55:29,166 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,166 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▍    | 313/574 [00:34<00:28,  9.25it/s]

2021-05-27 16:55:29,244 INFO | INITIAL
2021-05-27 16:55:29,244 INFO | (50, 200)
2021-05-27 16:55:29,249 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,250 INFO | (50, 200, 512)
2021-05-27 16:55:29,251 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,252 INFO | (50, 200, 512)
2021-05-27 16:55:29,253 INFO | BERT LAYER
2021-05-27 16:55:29,253 INFO | (200, 512)
2021-05-27 16:55:29,253 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,254 INFO | (200, 512)
2021-05-27 16:55:29,254 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,255 INFO | (200, 512)
2021-05-27 16:55:29,262 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,263 INFO | (200, 512)
2021-05-27 16:55:29,263 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,264 INFO | (200, 512)
2021-05-27 16:55:29,270 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,270 INFO | (200, 512)
2021-05-27 16:55:29,271 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,271 INFO | (200, 512)
2021-05-27 16:55:29,277 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,278 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▍    | 314/574 [00:34<00:27,  9.30it/s]

2021-05-27 16:55:29,349 INFO | INITIAL
2021-05-27 16:55:29,350 INFO | (50, 200)
2021-05-27 16:55:29,354 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,355 INFO | (50, 200, 512)
2021-05-27 16:55:29,356 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,356 INFO | (50, 200, 512)
2021-05-27 16:55:29,357 INFO | BERT LAYER
2021-05-27 16:55:29,358 INFO | (200, 512)
2021-05-27 16:55:29,358 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,359 INFO | (200, 512)
2021-05-27 16:55:29,359 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,360 INFO | (200, 512)
2021-05-27 16:55:29,365 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,365 INFO | (200, 512)
2021-05-27 16:55:29,366 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,366 INFO | (200, 512)
2021-05-27 16:55:29,374 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,374 INFO | (200, 512)
2021-05-27 16:55:29,375 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,376 INFO | (200, 512)
2021-05-27 16:55:29,382 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,383 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▍    | 315/574 [00:34<00:27,  9.46it/s]

2021-05-27 16:55:29,452 INFO | INITIAL
2021-05-27 16:55:29,452 INFO | (50, 200)
2021-05-27 16:55:29,457 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,457 INFO | (50, 200, 512)
2021-05-27 16:55:29,458 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,459 INFO | (50, 200, 512)
2021-05-27 16:55:29,460 INFO | BERT LAYER
2021-05-27 16:55:29,460 INFO | (200, 512)
2021-05-27 16:55:29,460 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,461 INFO | (200, 512)
2021-05-27 16:55:29,461 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,462 INFO | (200, 512)
2021-05-27 16:55:29,469 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,469 INFO | (200, 512)
2021-05-27 16:55:29,470 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,470 INFO | (200, 512)
2021-05-27 16:55:29,476 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,477 INFO | (200, 512)
2021-05-27 16:55:29,477 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,478 INFO | (200, 512)
2021-05-27 16:55:29,484 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,484 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▌    | 316/574 [00:34<00:27,  9.50it/s]

2021-05-27 16:55:29,555 INFO | INITIAL
2021-05-27 16:55:29,556 INFO | (50, 200)
2021-05-27 16:55:29,561 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,562 INFO | (50, 200, 512)
2021-05-27 16:55:29,563 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,563 INFO | (50, 200, 512)
2021-05-27 16:55:29,564 INFO | BERT LAYER
2021-05-27 16:55:29,565 INFO | (200, 512)
2021-05-27 16:55:29,565 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,566 INFO | (200, 512)
2021-05-27 16:55:29,566 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,566 INFO | (200, 512)
2021-05-27 16:55:29,573 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,574 INFO | (200, 512)
2021-05-27 16:55:29,574 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,575 INFO | (200, 512)
2021-05-27 16:55:29,581 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,581 INFO | (200, 512)
2021-05-27 16:55:29,582 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,582 INFO | (200, 512)
2021-05-27 16:55:29,588 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,588 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▌    | 317/574 [00:34<00:26,  9.59it/s]

2021-05-27 16:55:29,657 INFO | INITIAL
2021-05-27 16:55:29,658 INFO | (50, 200)
2021-05-27 16:55:29,665 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,666 INFO | (50, 200, 512)
2021-05-27 16:55:29,667 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,667 INFO | (50, 200, 512)
2021-05-27 16:55:29,668 INFO | BERT LAYER
2021-05-27 16:55:29,668 INFO | (200, 512)
2021-05-27 16:55:29,669 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,669 INFO | (200, 512)
2021-05-27 16:55:29,670 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,671 INFO | (200, 512)
2021-05-27 16:55:29,678 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,678 INFO | (200, 512)
2021-05-27 16:55:29,679 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,679 INFO | (200, 512)
2021-05-27 16:55:29,685 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,686 INFO | (200, 512)
2021-05-27 16:55:29,686 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,686 INFO | (200, 512)
2021-05-27 16:55:29,692 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,692 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  55%|█████▌    | 318/574 [00:34<00:27,  9.48it/s]

2021-05-27 16:55:29,766 INFO | INITIAL
2021-05-27 16:55:29,766 INFO | (50, 200)
2021-05-27 16:55:29,771 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,772 INFO | (50, 200, 512)
2021-05-27 16:55:29,773 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,774 INFO | (50, 200, 512)
2021-05-27 16:55:29,775 INFO | BERT LAYER
2021-05-27 16:55:29,776 INFO | (200, 512)
2021-05-27 16:55:29,777 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,777 INFO | (200, 512)
2021-05-27 16:55:29,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,779 INFO | (200, 512)
2021-05-27 16:55:29,785 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,785 INFO | (200, 512)
2021-05-27 16:55:29,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,786 INFO | (200, 512)
2021-05-27 16:55:29,792 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,793 INFO | (200, 512)
2021-05-27 16:55:29,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,794 INFO | (200, 512)
2021-05-27 16:55:29,800 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,801 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▌    | 319/574 [00:35<00:26,  9.49it/s]

2021-05-27 16:55:29,871 INFO | INITIAL
2021-05-27 16:55:29,871 INFO | (50, 200)
2021-05-27 16:55:29,879 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,880 INFO | (50, 200, 512)
2021-05-27 16:55:29,881 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,881 INFO | (50, 200, 512)
2021-05-27 16:55:29,882 INFO | BERT LAYER
2021-05-27 16:55:29,883 INFO | (200, 512)
2021-05-27 16:55:29,883 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,884 INFO | (200, 512)
2021-05-27 16:55:29,884 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,885 INFO | (200, 512)
2021-05-27 16:55:29,890 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,890 INFO | (200, 512)
2021-05-27 16:55:29,891 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,892 INFO | (200, 512)
2021-05-27 16:55:29,897 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,897 INFO | (200, 512)
2021-05-27 16:55:29,898 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,898 INFO | (200, 512)
2021-05-27 16:55:29,903 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,904 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▌    | 320/574 [00:35<00:26,  9.49it/s]

2021-05-27 16:55:29,979 INFO | INITIAL
2021-05-27 16:55:29,979 INFO | (50, 200)
2021-05-27 16:55:29,985 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:29,986 INFO | (50, 200, 512)
2021-05-27 16:55:29,987 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:29,988 INFO | (50, 200, 512)
2021-05-27 16:55:29,988 INFO | BERT LAYER
2021-05-27 16:55:29,989 INFO | (200, 512)
2021-05-27 16:55:29,989 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,990 INFO | (200, 512)
2021-05-27 16:55:29,990 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,991 INFO | (200, 512)
2021-05-27 16:55:29,996 INFO | BERT LAYER LOOP
2021-05-27 16:55:29,997 INFO | (200, 512)
2021-05-27 16:55:29,998 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:29,998 INFO | (200, 512)
2021-05-27 16:55:30,003 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,003 INFO | (200, 512)
2021-05-27 16:55:30,003 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,004 INFO | (200, 512)
2021-05-27 16:55:30,009 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,010 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▌    | 321/574 [00:35<00:26,  9.52it/s]

2021-05-27 16:55:30,081 INFO | INITIAL
2021-05-27 16:55:30,081 INFO | (50, 200)
2021-05-27 16:55:30,086 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,086 INFO | (50, 200, 512)
2021-05-27 16:55:30,087 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,088 INFO | (50, 200, 512)
2021-05-27 16:55:30,088 INFO | BERT LAYER
2021-05-27 16:55:30,089 INFO | (200, 512)
2021-05-27 16:55:30,089 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,089 INFO | (200, 512)
2021-05-27 16:55:30,090 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,090 INFO | (200, 512)
2021-05-27 16:55:30,096 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,097 INFO | (200, 512)
2021-05-27 16:55:30,097 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,097 INFO | (200, 512)
2021-05-27 16:55:30,104 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,105 INFO | (200, 512)
2021-05-27 16:55:30,106 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,106 INFO | (200, 512)
2021-05-27 16:55:30,115 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,115 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▌    | 322/574 [00:35<00:26,  9.41it/s]

2021-05-27 16:55:30,189 INFO | INITIAL
2021-05-27 16:55:30,190 INFO | (50, 200)
2021-05-27 16:55:30,198 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,198 INFO | (50, 200, 512)
2021-05-27 16:55:30,199 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,200 INFO | (50, 200, 512)
2021-05-27 16:55:30,201 INFO | BERT LAYER
2021-05-27 16:55:30,201 INFO | (200, 512)
2021-05-27 16:55:30,201 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,202 INFO | (200, 512)
2021-05-27 16:55:30,202 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,202 INFO | (200, 512)
2021-05-27 16:55:30,208 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,209 INFO | (200, 512)
2021-05-27 16:55:30,209 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,210 INFO | (200, 512)
2021-05-27 16:55:30,214 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,215 INFO | (200, 512)
2021-05-27 16:55:30,215 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,216 INFO | (200, 512)
2021-05-27 16:55:30,223 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,224 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▋    | 323/574 [00:35<00:26,  9.39it/s]

2021-05-27 16:55:30,297 INFO | INITIAL
2021-05-27 16:55:30,297 INFO | (50, 200)
2021-05-27 16:55:30,302 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,303 INFO | (50, 200, 512)
2021-05-27 16:55:30,304 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,304 INFO | (50, 200, 512)
2021-05-27 16:55:30,306 INFO | BERT LAYER
2021-05-27 16:55:30,306 INFO | (200, 512)
2021-05-27 16:55:30,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,307 INFO | (200, 512)
2021-05-27 16:55:30,307 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,309 INFO | (200, 512)
2021-05-27 16:55:30,316 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,316 INFO | (200, 512)
2021-05-27 16:55:30,317 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,317 INFO | (200, 512)
2021-05-27 16:55:30,323 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,323 INFO | (200, 512)
2021-05-27 16:55:30,324 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,325 INFO | (200, 512)
2021-05-27 16:55:30,331 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,332 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  56%|█████▋    | 324/574 [00:35<00:27,  9.05it/s]

2021-05-27 16:55:30,416 INFO | INITIAL
2021-05-27 16:55:30,417 INFO | (50, 200)
2021-05-27 16:55:30,422 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,422 INFO | (50, 200, 512)
2021-05-27 16:55:30,424 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,424 INFO | (50, 200, 512)
2021-05-27 16:55:30,425 INFO | BERT LAYER
2021-05-27 16:55:30,425 INFO | (200, 512)
2021-05-27 16:55:30,426 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,426 INFO | (200, 512)
2021-05-27 16:55:30,427 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,427 INFO | (200, 512)
2021-05-27 16:55:30,433 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,433 INFO | (200, 512)
2021-05-27 16:55:30,434 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,434 INFO | (200, 512)
2021-05-27 16:55:30,441 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,441 INFO | (200, 512)
2021-05-27 16:55:30,442 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,443 INFO | (200, 512)
2021-05-27 16:55:30,451 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,451 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 325/574 [00:35<00:27,  9.06it/s]

2021-05-27 16:55:30,527 INFO | INITIAL
2021-05-27 16:55:30,527 INFO | (50, 200)
2021-05-27 16:55:30,532 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,533 INFO | (50, 200, 512)
2021-05-27 16:55:30,535 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,535 INFO | (50, 200, 512)
2021-05-27 16:55:30,536 INFO | BERT LAYER
2021-05-27 16:55:30,536 INFO | (200, 512)
2021-05-27 16:55:30,537 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,537 INFO | (200, 512)
2021-05-27 16:55:30,538 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,538 INFO | (200, 512)
2021-05-27 16:55:30,545 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,546 INFO | (200, 512)
2021-05-27 16:55:30,546 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,547 INFO | (200, 512)
2021-05-27 16:55:30,553 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,554 INFO | (200, 512)
2021-05-27 16:55:30,554 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,555 INFO | (200, 512)
2021-05-27 16:55:30,561 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,562 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 326/574 [00:35<00:27,  9.01it/s]

2021-05-27 16:55:30,639 INFO | INITIAL
2021-05-27 16:55:30,640 INFO | (50, 200)
2021-05-27 16:55:30,649 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,649 INFO | (50, 200, 512)
2021-05-27 16:55:30,651 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,651 INFO | (50, 200, 512)
2021-05-27 16:55:30,652 INFO | BERT LAYER
2021-05-27 16:55:30,653 INFO | (200, 512)
2021-05-27 16:55:30,654 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,654 INFO | (200, 512)
2021-05-27 16:55:30,655 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,656 INFO | (200, 512)
2021-05-27 16:55:30,663 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,664 INFO | (200, 512)
2021-05-27 16:55:30,665 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,665 INFO | (200, 512)
2021-05-27 16:55:30,672 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,673 INFO | (200, 512)
2021-05-27 16:55:30,673 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,674 INFO | (200, 512)
2021-05-27 16:55:30,681 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,682 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 327/574 [00:35<00:28,  8.68it/s]

2021-05-27 16:55:30,764 INFO | INITIAL
2021-05-27 16:55:30,764 INFO | (50, 200)
2021-05-27 16:55:30,769 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,770 INFO | (50, 200, 512)
2021-05-27 16:55:30,771 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,771 INFO | (50, 200, 512)
2021-05-27 16:55:30,772 INFO | BERT LAYER
2021-05-27 16:55:30,772 INFO | (200, 512)
2021-05-27 16:55:30,773 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,773 INFO | (200, 512)
2021-05-27 16:55:30,773 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,774 INFO | (200, 512)
2021-05-27 16:55:30,781 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,781 INFO | (200, 512)
2021-05-27 16:55:30,782 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,782 INFO | (200, 512)
2021-05-27 16:55:30,787 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,788 INFO | (200, 512)
2021-05-27 16:55:30,788 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,789 INFO | (200, 512)
2021-05-27 16:55:30,795 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,796 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 328/574 [00:36<00:27,  9.01it/s]

2021-05-27 16:55:30,865 INFO | INITIAL
2021-05-27 16:55:30,865 INFO | (50, 200)
2021-05-27 16:55:30,877 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,877 INFO | (50, 200, 512)
2021-05-27 16:55:30,879 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,879 INFO | (50, 200, 512)
2021-05-27 16:55:30,880 INFO | BERT LAYER
2021-05-27 16:55:30,880 INFO | (200, 512)
2021-05-27 16:55:30,881 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,881 INFO | (200, 512)
2021-05-27 16:55:30,881 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,882 INFO | (200, 512)
2021-05-27 16:55:30,887 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,887 INFO | (200, 512)
2021-05-27 16:55:30,887 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,888 INFO | (200, 512)
2021-05-27 16:55:30,893 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,894 INFO | (200, 512)
2021-05-27 16:55:30,894 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,894 INFO | (200, 512)
2021-05-27 16:55:30,900 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,900 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 329/574 [00:36<00:26,  9.13it/s]

2021-05-27 16:55:30,971 INFO | INITIAL
2021-05-27 16:55:30,971 INFO | (50, 200)
2021-05-27 16:55:30,977 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:30,977 INFO | (50, 200, 512)
2021-05-27 16:55:30,979 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:30,979 INFO | (50, 200, 512)
2021-05-27 16:55:30,980 INFO | BERT LAYER
2021-05-27 16:55:30,982 INFO | (200, 512)
2021-05-27 16:55:30,982 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,983 INFO | (200, 512)
2021-05-27 16:55:30,983 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,984 INFO | (200, 512)
2021-05-27 16:55:30,990 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,990 INFO | (200, 512)
2021-05-27 16:55:30,991 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,991 INFO | (200, 512)
2021-05-27 16:55:30,997 INFO | BERT LAYER LOOP
2021-05-27 16:55:30,998 INFO | (200, 512)
2021-05-27 16:55:30,998 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:30,999 INFO | (200, 512)
2021-05-27 16:55:31,004 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,004 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  57%|█████▋    | 330/574 [00:36<00:26,  9.11it/s]

2021-05-27 16:55:31,081 INFO | INITIAL
2021-05-27 16:55:31,082 INFO | (50, 200)
2021-05-27 16:55:31,087 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,088 INFO | (50, 200, 512)
2021-05-27 16:55:31,089 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,090 INFO | (50, 200, 512)
2021-05-27 16:55:31,091 INFO | BERT LAYER
2021-05-27 16:55:31,091 INFO | (200, 512)
2021-05-27 16:55:31,092 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,092 INFO | (200, 512)
2021-05-27 16:55:31,093 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,094 INFO | (200, 512)
2021-05-27 16:55:31,100 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,100 INFO | (200, 512)
2021-05-27 16:55:31,101 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,101 INFO | (200, 512)
2021-05-27 16:55:31,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,108 INFO | (200, 512)
2021-05-27 16:55:31,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,110 INFO | (200, 512)
2021-05-27 16:55:31,116 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,117 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  58%|█████▊    | 331/574 [00:36<00:26,  9.25it/s]

2021-05-27 16:55:31,186 INFO | INITIAL
2021-05-27 16:55:31,186 INFO | (50, 200)
2021-05-27 16:55:31,199 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,199 INFO | (50, 200, 512)
2021-05-27 16:55:31,201 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,201 INFO | (50, 200, 512)
2021-05-27 16:55:31,202 INFO | BERT LAYER
2021-05-27 16:55:31,203 INFO | (200, 512)
2021-05-27 16:55:31,203 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,203 INFO | (200, 512)
2021-05-27 16:55:31,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,204 INFO | (200, 512)
2021-05-27 16:55:31,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,212 INFO | (200, 512)
2021-05-27 16:55:31,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,213 INFO | (200, 512)
2021-05-27 16:55:31,218 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,218 INFO | (200, 512)
2021-05-27 16:55:31,219 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,219 INFO | (200, 512)
2021-05-27 16:55:31,224 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,224 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  58%|█████▊    | 332/574 [00:36<00:26,  9.22it/s]

2021-05-27 16:55:31,295 INFO | INITIAL
2021-05-27 16:55:31,295 INFO | (50, 200)
2021-05-27 16:55:31,300 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,301 INFO | (50, 200, 512)
2021-05-27 16:55:31,302 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,302 INFO | (50, 200, 512)
2021-05-27 16:55:31,303 INFO | BERT LAYER
2021-05-27 16:55:31,303 INFO | (200, 512)
2021-05-27 16:55:31,304 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,304 INFO | (200, 512)
2021-05-27 16:55:31,304 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,305 INFO | (200, 512)
2021-05-27 16:55:31,312 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,313 INFO | (200, 512)
2021-05-27 16:55:31,314 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,314 INFO | (200, 512)
2021-05-27 16:55:31,320 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,321 INFO | (200, 512)
2021-05-27 16:55:31,322 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,323 INFO | (200, 512)
2021-05-27 16:55:31,330 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,330 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  58%|█████▊    | 333/574 [00:36<00:26,  9.09it/s]

2021-05-27 16:55:31,409 INFO | INITIAL
2021-05-27 16:55:31,410 INFO | (50, 200)
2021-05-27 16:55:31,415 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,417 INFO | (50, 200, 512)
2021-05-27 16:55:31,418 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,419 INFO | (50, 200, 512)
2021-05-27 16:55:31,420 INFO | BERT LAYER
2021-05-27 16:55:31,420 INFO | (200, 512)
2021-05-27 16:55:31,421 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,421 INFO | (200, 512)
2021-05-27 16:55:31,421 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,422 INFO | (200, 512)
2021-05-27 16:55:31,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,430 INFO | (200, 512)
2021-05-27 16:55:31,430 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,431 INFO | (200, 512)
2021-05-27 16:55:31,437 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,438 INFO | (200, 512)
2021-05-27 16:55:31,438 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,439 INFO | (200, 512)
2021-05-27 16:55:31,446 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,447 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  58%|█████▊    | 334/574 [00:36<00:26,  9.07it/s]

2021-05-27 16:55:31,519 INFO | INITIAL
2021-05-27 16:55:31,520 INFO | (50, 200)
2021-05-27 16:55:31,526 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,527 INFO | (50, 200, 512)
2021-05-27 16:55:31,528 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,529 INFO | (50, 200, 512)
2021-05-27 16:55:31,529 INFO | BERT LAYER
2021-05-27 16:55:31,530 INFO | (200, 512)
2021-05-27 16:55:31,530 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,531 INFO | (200, 512)
2021-05-27 16:55:31,531 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,532 INFO | (200, 512)
2021-05-27 16:55:31,538 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,540 INFO | (200, 512)
2021-05-27 16:55:31,540 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,541 INFO | (200, 512)
2021-05-27 16:55:31,547 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,547 INFO | (200, 512)
2021-05-27 16:55:31,548 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,548 INFO | (200, 512)
2021-05-27 16:55:31,553 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,554 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  58%|█████▊    | 335/574 [00:36<00:25,  9.22it/s]

2021-05-27 16:55:31,624 INFO | INITIAL
2021-05-27 16:55:31,624 INFO | (50, 200)
2021-05-27 16:55:31,633 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,633 INFO | (50, 200, 512)
2021-05-27 16:55:31,634 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,635 INFO | (50, 200, 512)
2021-05-27 16:55:31,635 INFO | BERT LAYER
2021-05-27 16:55:31,636 INFO | (200, 512)
2021-05-27 16:55:31,636 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,636 INFO | (200, 512)
2021-05-27 16:55:31,637 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,637 INFO | (200, 512)
2021-05-27 16:55:31,644 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,644 INFO | (200, 512)
2021-05-27 16:55:31,645 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,646 INFO | (200, 512)
2021-05-27 16:55:31,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,653 INFO | (200, 512)
2021-05-27 16:55:31,653 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,654 INFO | (200, 512)
2021-05-27 16:55:31,660 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,661 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▊    | 336/574 [00:36<00:26,  9.09it/s]

2021-05-27 16:55:31,737 INFO | INITIAL
2021-05-27 16:55:31,738 INFO | (50, 200)
2021-05-27 16:55:31,744 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,745 INFO | (50, 200, 512)
2021-05-27 16:55:31,747 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,747 INFO | (50, 200, 512)
2021-05-27 16:55:31,748 INFO | BERT LAYER
2021-05-27 16:55:31,748 INFO | (200, 512)
2021-05-27 16:55:31,749 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,749 INFO | (200, 512)
2021-05-27 16:55:31,750 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,750 INFO | (200, 512)
2021-05-27 16:55:31,755 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,756 INFO | (200, 512)
2021-05-27 16:55:31,756 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,757 INFO | (200, 512)
2021-05-27 16:55:31,762 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,763 INFO | (200, 512)
2021-05-27 16:55:31,763 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,764 INFO | (200, 512)
2021-05-27 16:55:31,770 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,771 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▊    | 337/574 [00:37<00:25,  9.18it/s]

2021-05-27 16:55:31,845 INFO | INITIAL
2021-05-27 16:55:31,845 INFO | (50, 200)
2021-05-27 16:55:31,852 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,853 INFO | (50, 200, 512)
2021-05-27 16:55:31,854 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,854 INFO | (50, 200, 512)
2021-05-27 16:55:31,855 INFO | BERT LAYER
2021-05-27 16:55:31,856 INFO | (200, 512)
2021-05-27 16:55:31,856 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,857 INFO | (200, 512)
2021-05-27 16:55:31,857 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,858 INFO | (200, 512)
2021-05-27 16:55:31,865 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,866 INFO | (200, 512)
2021-05-27 16:55:31,866 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,867 INFO | (200, 512)
2021-05-27 16:55:31,873 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,874 INFO | (200, 512)
2021-05-27 16:55:31,875 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,875 INFO | (200, 512)
2021-05-27 16:55:31,881 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,881 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▉    | 338/574 [00:37<00:25,  9.25it/s]

2021-05-27 16:55:31,950 INFO | INITIAL
2021-05-27 16:55:31,951 INFO | (50, 200)
2021-05-27 16:55:31,957 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:31,957 INFO | (50, 200, 512)
2021-05-27 16:55:31,959 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:31,960 INFO | (50, 200, 512)
2021-05-27 16:55:31,960 INFO | BERT LAYER
2021-05-27 16:55:31,961 INFO | (200, 512)
2021-05-27 16:55:31,961 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,962 INFO | (200, 512)
2021-05-27 16:55:31,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,963 INFO | (200, 512)
2021-05-27 16:55:31,969 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,970 INFO | (200, 512)
2021-05-27 16:55:31,970 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,971 INFO | (200, 512)
2021-05-27 16:55:31,976 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,977 INFO | (200, 512)
2021-05-27 16:55:31,977 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:31,978 INFO | (200, 512)
2021-05-27 16:55:31,984 INFO | BERT LAYER LOOP
2021-05-27 16:55:31,984 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▉    | 339/574 [00:37<00:25,  9.26it/s]

2021-05-27 16:55:32,057 INFO | INITIAL
2021-05-27 16:55:32,058 INFO | (50, 200)
2021-05-27 16:55:32,065 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,065 INFO | (50, 200, 512)
2021-05-27 16:55:32,066 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,067 INFO | (50, 200, 512)
2021-05-27 16:55:32,068 INFO | BERT LAYER
2021-05-27 16:55:32,068 INFO | (200, 512)
2021-05-27 16:55:32,069 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,069 INFO | (200, 512)
2021-05-27 16:55:32,070 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,070 INFO | (200, 512)
2021-05-27 16:55:32,077 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,080 INFO | (200, 512)
2021-05-27 16:55:32,080 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,081 INFO | (200, 512)
2021-05-27 16:55:32,086 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,087 INFO | (200, 512)
2021-05-27 16:55:32,087 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,087 INFO | (200, 512)
2021-05-27 16:55:32,093 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,093 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▉    | 340/574 [00:37<00:25,  9.29it/s]

2021-05-27 16:55:32,164 INFO | INITIAL
2021-05-27 16:55:32,165 INFO | (50, 200)
2021-05-27 16:55:32,170 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,170 INFO | (50, 200, 512)
2021-05-27 16:55:32,171 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,172 INFO | (50, 200, 512)
2021-05-27 16:55:32,172 INFO | BERT LAYER
2021-05-27 16:55:32,172 INFO | (200, 512)
2021-05-27 16:55:32,173 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,173 INFO | (200, 512)
2021-05-27 16:55:32,174 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,175 INFO | (200, 512)
2021-05-27 16:55:32,184 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,185 INFO | (200, 512)
2021-05-27 16:55:32,185 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,186 INFO | (200, 512)
2021-05-27 16:55:32,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,194 INFO | (200, 512)
2021-05-27 16:55:32,194 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,195 INFO | (200, 512)
2021-05-27 16:55:32,200 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,201 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▉    | 341/574 [00:37<00:25,  9.28it/s]

2021-05-27 16:55:32,272 INFO | INITIAL
2021-05-27 16:55:32,273 INFO | (50, 200)
2021-05-27 16:55:32,279 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,280 INFO | (50, 200, 512)
2021-05-27 16:55:32,281 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,282 INFO | (50, 200, 512)
2021-05-27 16:55:32,283 INFO | BERT LAYER
2021-05-27 16:55:32,283 INFO | (200, 512)
2021-05-27 16:55:32,283 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,284 INFO | (200, 512)
2021-05-27 16:55:32,284 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,285 INFO | (200, 512)
2021-05-27 16:55:32,291 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,291 INFO | (200, 512)
2021-05-27 16:55:32,292 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,292 INFO | (200, 512)
2021-05-27 16:55:32,298 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,298 INFO | (200, 512)
2021-05-27 16:55:32,299 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,299 INFO | (200, 512)
2021-05-27 16:55:32,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,307 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  59%|█████▉    | 341/574 [00:37<00:25,  9.28it/s]

2021-05-27 16:55:32,372 INFO | INITIAL
2021-05-27 16:55:32,372 INFO | (50, 200)
2021-05-27 16:55:32,378 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,379 INFO | (50, 200, 512)
2021-05-27 16:55:32,381 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,382 INFO | (50, 200, 512)
2021-05-27 16:55:32,383 INFO | BERT LAYER
2021-05-27 16:55:32,384 INFO | (200, 512)
2021-05-27 16:55:32,384 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,384 INFO | (200, 512)
2021-05-27 16:55:32,385 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,385 INFO | (200, 512)
2021-05-27 16:55:32,392 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,392 INFO | (200, 512)
2021-05-27 16:55:32,393 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,393 INFO | (200, 512)
2021-05-27 16:55:32,399 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,400 INFO | (200, 512)
2021-05-27 16:55:32,400 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,400 INFO | (200, 512)
2021-05-27 16:55:32,406 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,407 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  60%|█████▉    | 343/574 [00:37<00:24,  9.51it/s]

2021-05-27 16:55:32,477 INFO | INITIAL
2021-05-27 16:55:32,478 INFO | (50, 200)
2021-05-27 16:55:32,484 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,485 INFO | (50, 200, 512)
2021-05-27 16:55:32,486 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,487 INFO | (50, 200, 512)
2021-05-27 16:55:32,488 INFO | BERT LAYER
2021-05-27 16:55:32,488 INFO | (200, 512)
2021-05-27 16:55:32,489 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,489 INFO | (200, 512)
2021-05-27 16:55:32,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,490 INFO | (200, 512)
2021-05-27 16:55:32,497 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,497 INFO | (200, 512)
2021-05-27 16:55:32,497 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,498 INFO | (200, 512)
2021-05-27 16:55:32,503 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,503 INFO | (200, 512)
2021-05-27 16:55:32,504 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,504 INFO | (200, 512)
2021-05-27 16:55:32,510 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,512 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  60%|█████▉    | 344/574 [00:37<00:24,  9.53it/s]

2021-05-27 16:55:32,581 INFO | INITIAL
2021-05-27 16:55:32,582 INFO | (50, 200)
2021-05-27 16:55:32,587 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,588 INFO | (50, 200, 512)
2021-05-27 16:55:32,589 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,589 INFO | (50, 200, 512)
2021-05-27 16:55:32,590 INFO | BERT LAYER
2021-05-27 16:55:32,591 INFO | (200, 512)
2021-05-27 16:55:32,591 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,591 INFO | (200, 512)
2021-05-27 16:55:32,592 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,592 INFO | (200, 512)
2021-05-27 16:55:32,598 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,598 INFO | (200, 512)
2021-05-27 16:55:32,599 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,599 INFO | (200, 512)
2021-05-27 16:55:32,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,605 INFO | (200, 512)
2021-05-27 16:55:32,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,606 INFO | (200, 512)
2021-05-27 16:55:32,616 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,617 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  60%|██████    | 345/574 [00:37<00:24,  9.47it/s]

2021-05-27 16:55:32,688 INFO | INITIAL
2021-05-27 16:55:32,689 INFO | (50, 200)
2021-05-27 16:55:32,694 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,695 INFO | (50, 200, 512)
2021-05-27 16:55:32,696 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,696 INFO | (50, 200, 512)
2021-05-27 16:55:32,697 INFO | BERT LAYER
2021-05-27 16:55:32,697 INFO | (200, 512)
2021-05-27 16:55:32,697 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,698 INFO | (200, 512)
2021-05-27 16:55:32,698 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,699 INFO | (200, 512)
2021-05-27 16:55:32,704 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,704 INFO | (200, 512)
2021-05-27 16:55:32,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,705 INFO | (200, 512)
2021-05-27 16:55:32,711 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,712 INFO | (200, 512)
2021-05-27 16:55:32,712 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,713 INFO | (200, 512)
2021-05-27 16:55:32,720 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,721 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  60%|██████    | 346/574 [00:37<00:23,  9.55it/s]

2021-05-27 16:55:32,791 INFO | INITIAL
2021-05-27 16:55:32,793 INFO | (50, 200)
2021-05-27 16:55:32,800 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,801 INFO | (50, 200, 512)
2021-05-27 16:55:32,802 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,802 INFO | (50, 200, 512)
2021-05-27 16:55:32,803 INFO | BERT LAYER
2021-05-27 16:55:32,804 INFO | (200, 512)
2021-05-27 16:55:32,804 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,805 INFO | (200, 512)
2021-05-27 16:55:32,805 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,805 INFO | (200, 512)
2021-05-27 16:55:32,811 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,811 INFO | (200, 512)
2021-05-27 16:55:32,812 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,813 INFO | (200, 512)
2021-05-27 16:55:32,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,819 INFO | (200, 512)
2021-05-27 16:55:32,819 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,820 INFO | (200, 512)
2021-05-27 16:55:32,825 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,826 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  60%|██████    | 347/574 [00:38<00:23,  9.58it/s]

2021-05-27 16:55:32,894 INFO | INITIAL
2021-05-27 16:55:32,895 INFO | (50, 200)
2021-05-27 16:55:32,900 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:32,900 INFO | (50, 200, 512)
2021-05-27 16:55:32,902 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:32,903 INFO | (50, 200, 512)
2021-05-27 16:55:32,904 INFO | BERT LAYER
2021-05-27 16:55:32,904 INFO | (200, 512)
2021-05-27 16:55:32,905 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,905 INFO | (200, 512)
2021-05-27 16:55:32,906 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,907 INFO | (200, 512)
2021-05-27 16:55:32,915 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,916 INFO | (200, 512)
2021-05-27 16:55:32,916 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,917 INFO | (200, 512)
2021-05-27 16:55:32,924 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,924 INFO | (200, 512)
2021-05-27 16:55:32,925 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:32,925 INFO | (200, 512)
2021-05-27 16:55:32,931 INFO | BERT LAYER LOOP
2021-05-27 16:55:32,931 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████    | 348/574 [00:38<00:23,  9.45it/s]

2021-05-27 16:55:33,004 INFO | INITIAL
2021-05-27 16:55:33,004 INFO | (50, 200)
2021-05-27 16:55:33,010 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,011 INFO | (50, 200, 512)
2021-05-27 16:55:33,013 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,013 INFO | (50, 200, 512)
2021-05-27 16:55:33,014 INFO | BERT LAYER
2021-05-27 16:55:33,015 INFO | (200, 512)
2021-05-27 16:55:33,015 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,015 INFO | (200, 512)
2021-05-27 16:55:33,016 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,017 INFO | (200, 512)
2021-05-27 16:55:33,022 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,023 INFO | (200, 512)
2021-05-27 16:55:33,024 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,024 INFO | (200, 512)
2021-05-27 16:55:33,030 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,030 INFO | (200, 512)
2021-05-27 16:55:33,031 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,031 INFO | (200, 512)
2021-05-27 16:55:33,036 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,036 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████    | 349/574 [00:38<00:23,  9.54it/s]

2021-05-27 16:55:33,106 INFO | INITIAL
2021-05-27 16:55:33,106 INFO | (50, 200)
2021-05-27 16:55:33,112 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,113 INFO | (50, 200, 512)
2021-05-27 16:55:33,115 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,115 INFO | (50, 200, 512)
2021-05-27 16:55:33,116 INFO | BERT LAYER
2021-05-27 16:55:33,116 INFO | (200, 512)
2021-05-27 16:55:33,117 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,123 INFO | (200, 512)
2021-05-27 16:55:33,123 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,124 INFO | (200, 512)
2021-05-27 16:55:33,129 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,130 INFO | (200, 512)
2021-05-27 16:55:33,130 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,131 INFO | (200, 512)
2021-05-27 16:55:33,136 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,137 INFO | (200, 512)
2021-05-27 16:55:33,137 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,137 INFO | (200, 512)
2021-05-27 16:55:33,144 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,145 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████    | 350/574 [00:38<00:24,  9.26it/s]

2021-05-27 16:55:33,222 INFO | INITIAL
2021-05-27 16:55:33,222 INFO | (50, 200)
2021-05-27 16:55:33,228 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,228 INFO | (50, 200, 512)
2021-05-27 16:55:33,229 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,230 INFO | (50, 200, 512)
2021-05-27 16:55:33,230 INFO | BERT LAYER
2021-05-27 16:55:33,231 INFO | (200, 512)
2021-05-27 16:55:33,231 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,231 INFO | (200, 512)
2021-05-27 16:55:33,232 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,232 INFO | (200, 512)
2021-05-27 16:55:33,237 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,238 INFO | (200, 512)
2021-05-27 16:55:33,238 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,238 INFO | (200, 512)
2021-05-27 16:55:33,245 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,246 INFO | (200, 512)
2021-05-27 16:55:33,247 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,247 INFO | (200, 512)
2021-05-27 16:55:33,255 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,255 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████    | 351/574 [00:38<00:24,  9.15it/s]

2021-05-27 16:55:33,334 INFO | INITIAL
2021-05-27 16:55:33,335 INFO | (50, 200)
2021-05-27 16:55:33,342 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,343 INFO | (50, 200, 512)
2021-05-27 16:55:33,345 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,345 INFO | (50, 200, 512)
2021-05-27 16:55:33,346 INFO | BERT LAYER
2021-05-27 16:55:33,346 INFO | (200, 512)
2021-05-27 16:55:33,347 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,347 INFO | (200, 512)
2021-05-27 16:55:33,347 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,348 INFO | (200, 512)
2021-05-27 16:55:33,353 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,354 INFO | (200, 512)
2021-05-27 16:55:33,354 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,355 INFO | (200, 512)
2021-05-27 16:55:33,360 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,361 INFO | (200, 512)
2021-05-27 16:55:33,361 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,362 INFO | (200, 512)
2021-05-27 16:55:33,367 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,367 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████▏   | 352/574 [00:38<00:23,  9.33it/s]

2021-05-27 16:55:33,436 INFO | INITIAL
2021-05-27 16:55:33,436 INFO | (50, 200)
2021-05-27 16:55:33,442 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,442 INFO | (50, 200, 512)
2021-05-27 16:55:33,444 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,445 INFO | (50, 200, 512)
2021-05-27 16:55:33,446 INFO | BERT LAYER
2021-05-27 16:55:33,449 INFO | (200, 512)
2021-05-27 16:55:33,450 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,450 INFO | (200, 512)
2021-05-27 16:55:33,451 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,452 INFO | (200, 512)
2021-05-27 16:55:33,459 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,460 INFO | (200, 512)
2021-05-27 16:55:33,460 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,461 INFO | (200, 512)
2021-05-27 16:55:33,467 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,467 INFO | (200, 512)
2021-05-27 16:55:33,467 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,468 INFO | (200, 512)
2021-05-27 16:55:33,474 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,474 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  61%|██████▏   | 353/574 [00:38<00:23,  9.23it/s]

2021-05-27 16:55:33,547 INFO | INITIAL
2021-05-27 16:55:33,548 INFO | (50, 200)
2021-05-27 16:55:33,552 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,552 INFO | (50, 200, 512)
2021-05-27 16:55:33,554 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,554 INFO | (50, 200, 512)
2021-05-27 16:55:33,555 INFO | BERT LAYER
2021-05-27 16:55:33,555 INFO | (200, 512)
2021-05-27 16:55:33,556 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,556 INFO | (200, 512)
2021-05-27 16:55:33,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,557 INFO | (200, 512)
2021-05-27 16:55:33,563 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,564 INFO | (200, 512)
2021-05-27 16:55:33,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,565 INFO | (200, 512)
2021-05-27 16:55:33,571 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,571 INFO | (200, 512)
2021-05-27 16:55:33,572 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,572 INFO | (200, 512)
2021-05-27 16:55:33,580 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,581 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 354/574 [00:38<00:24,  9.16it/s]

2021-05-27 16:55:33,659 INFO | INITIAL
2021-05-27 16:55:33,660 INFO | (50, 200)
2021-05-27 16:55:33,666 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,667 INFO | (50, 200, 512)
2021-05-27 16:55:33,668 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,668 INFO | (50, 200, 512)
2021-05-27 16:55:33,669 INFO | BERT LAYER
2021-05-27 16:55:33,669 INFO | (200, 512)
2021-05-27 16:55:33,670 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,670 INFO | (200, 512)
2021-05-27 16:55:33,671 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,671 INFO | (200, 512)
2021-05-27 16:55:33,678 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,678 INFO | (200, 512)
2021-05-27 16:55:33,679 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,679 INFO | (200, 512)
2021-05-27 16:55:33,685 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,686 INFO | (200, 512)
2021-05-27 16:55:33,686 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,687 INFO | (200, 512)
2021-05-27 16:55:33,693 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,694 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 355/574 [00:38<00:23,  9.23it/s]

2021-05-27 16:55:33,765 INFO | INITIAL
2021-05-27 16:55:33,766 INFO | (50, 200)
2021-05-27 16:55:33,771 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,772 INFO | (50, 200, 512)
2021-05-27 16:55:33,773 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,774 INFO | (50, 200, 512)
2021-05-27 16:55:33,775 INFO | BERT LAYER
2021-05-27 16:55:33,776 INFO | (200, 512)
2021-05-27 16:55:33,776 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,776 INFO | (200, 512)
2021-05-27 16:55:33,777 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,777 INFO | (200, 512)
2021-05-27 16:55:33,785 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,785 INFO | (200, 512)
2021-05-27 16:55:33,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,786 INFO | (200, 512)
2021-05-27 16:55:33,793 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,793 INFO | (200, 512)
2021-05-27 16:55:33,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,794 INFO | (200, 512)
2021-05-27 16:55:33,800 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,800 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 356/574 [00:39<00:23,  9.29it/s]

2021-05-27 16:55:33,871 INFO | INITIAL
2021-05-27 16:55:33,872 INFO | (50, 200)
2021-05-27 16:55:33,880 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,880 INFO | (50, 200, 512)
2021-05-27 16:55:33,882 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,882 INFO | (50, 200, 512)
2021-05-27 16:55:33,885 INFO | BERT LAYER
2021-05-27 16:55:33,885 INFO | (200, 512)
2021-05-27 16:55:33,886 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,886 INFO | (200, 512)
2021-05-27 16:55:33,887 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,887 INFO | (200, 512)
2021-05-27 16:55:33,895 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,895 INFO | (200, 512)
2021-05-27 16:55:33,896 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,896 INFO | (200, 512)
2021-05-27 16:55:33,901 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,901 INFO | (200, 512)
2021-05-27 16:55:33,902 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,902 INFO | (200, 512)
2021-05-27 16:55:33,907 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,907 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 357/574 [00:39<00:23,  9.32it/s]

2021-05-27 16:55:33,977 INFO | INITIAL
2021-05-27 16:55:33,978 INFO | (50, 200)
2021-05-27 16:55:33,986 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:33,986 INFO | (50, 200, 512)
2021-05-27 16:55:33,988 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:33,988 INFO | (50, 200, 512)
2021-05-27 16:55:33,989 INFO | BERT LAYER
2021-05-27 16:55:33,989 INFO | (200, 512)
2021-05-27 16:55:33,990 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,990 INFO | (200, 512)
2021-05-27 16:55:33,990 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,991 INFO | (200, 512)
2021-05-27 16:55:33,997 INFO | BERT LAYER LOOP
2021-05-27 16:55:33,998 INFO | (200, 512)
2021-05-27 16:55:33,998 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:33,998 INFO | (200, 512)
2021-05-27 16:55:34,005 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,005 INFO | (200, 512)
2021-05-27 16:55:34,006 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,006 INFO | (200, 512)
2021-05-27 16:55:34,013 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,014 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 358/574 [00:39<00:23,  9.25it/s]

2021-05-27 16:55:34,087 INFO | INITIAL
2021-05-27 16:55:34,088 INFO | (50, 200)
2021-05-27 16:55:34,093 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,093 INFO | (50, 200, 512)
2021-05-27 16:55:34,095 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,095 INFO | (50, 200, 512)
2021-05-27 16:55:34,095 INFO | BERT LAYER
2021-05-27 16:55:34,096 INFO | (200, 512)
2021-05-27 16:55:34,096 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,096 INFO | (200, 512)
2021-05-27 16:55:34,097 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,097 INFO | (200, 512)
2021-05-27 16:55:34,102 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,102 INFO | (200, 512)
2021-05-27 16:55:34,103 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,103 INFO | (200, 512)
2021-05-27 16:55:34,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,109 INFO | (200, 512)
2021-05-27 16:55:34,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,110 INFO | (200, 512)
2021-05-27 16:55:34,115 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,116 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  62%|██████▏   | 358/574 [00:39<00:23,  9.25it/s]

2021-05-27 16:55:34,185 INFO | INITIAL
2021-05-27 16:55:34,185 INFO | (50, 200)
2021-05-27 16:55:34,190 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,191 INFO | (50, 200, 512)
2021-05-27 16:55:34,193 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,193 INFO | (50, 200, 512)
2021-05-27 16:55:34,194 INFO | BERT LAYER
2021-05-27 16:55:34,195 INFO | (200, 512)
2021-05-27 16:55:34,195 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,196 INFO | (200, 512)
2021-05-27 16:55:34,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,197 INFO | (200, 512)
2021-05-27 16:55:34,204 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,205 INFO | (200, 512)
2021-05-27 16:55:34,206 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,207 INFO | (200, 512)
2021-05-27 16:55:34,215 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,215 INFO | (200, 512)
2021-05-27 16:55:34,216 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,216 INFO | (200, 512)
2021-05-27 16:55:34,223 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,223 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  63%|██████▎   | 360/574 [00:39<00:22,  9.42it/s]

2021-05-27 16:55:34,295 INFO | INITIAL
2021-05-27 16:55:34,295 INFO | (50, 200)
2021-05-27 16:55:34,300 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,300 INFO | (50, 200, 512)
2021-05-27 16:55:34,301 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,302 INFO | (50, 200, 512)
2021-05-27 16:55:34,302 INFO | BERT LAYER
2021-05-27 16:55:34,303 INFO | (200, 512)
2021-05-27 16:55:34,303 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,303 INFO | (200, 512)
2021-05-27 16:55:34,304 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,304 INFO | (200, 512)
2021-05-27 16:55:34,310 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,310 INFO | (200, 512)
2021-05-27 16:55:34,311 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,311 INFO | (200, 512)
2021-05-27 16:55:34,319 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,319 INFO | (200, 512)
2021-05-27 16:55:34,320 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,320 INFO | (200, 512)
2021-05-27 16:55:34,328 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,328 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  63%|██████▎   | 361/574 [00:39<00:22,  9.44it/s]

2021-05-27 16:55:34,401 INFO | INITIAL
2021-05-27 16:55:34,401 INFO | (50, 200)
2021-05-27 16:55:34,406 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,406 INFO | (50, 200, 512)
2021-05-27 16:55:34,407 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,408 INFO | (50, 200, 512)
2021-05-27 16:55:34,409 INFO | BERT LAYER
2021-05-27 16:55:34,409 INFO | (200, 512)
2021-05-27 16:55:34,409 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,410 INFO | (200, 512)
2021-05-27 16:55:34,410 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,410 INFO | (200, 512)
2021-05-27 16:55:34,416 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,417 INFO | (200, 512)
2021-05-27 16:55:34,417 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,418 INFO | (200, 512)
2021-05-27 16:55:34,423 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,423 INFO | (200, 512)
2021-05-27 16:55:34,424 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,424 INFO | (200, 512)
2021-05-27 16:55:34,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,430 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  63%|██████▎   | 361/574 [00:39<00:22,  9.44it/s]

2021-05-27 16:55:34,499 INFO | INITIAL
2021-05-27 16:55:34,500 INFO | (50, 200)
2021-05-27 16:55:34,505 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,506 INFO | (50, 200, 512)
2021-05-27 16:55:34,507 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,508 INFO | (50, 200, 512)
2021-05-27 16:55:34,509 INFO | BERT LAYER
2021-05-27 16:55:34,509 INFO | (200, 512)
2021-05-27 16:55:34,510 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,511 INFO | (200, 512)
2021-05-27 16:55:34,512 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,512 INFO | (200, 512)
2021-05-27 16:55:34,519 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,519 INFO | (200, 512)
2021-05-27 16:55:34,520 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,520 INFO | (200, 512)
2021-05-27 16:55:34,525 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,526 INFO | (200, 512)
2021-05-27 16:55:34,527 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,527 INFO | (200, 512)
2021-05-27 16:55:34,532 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,532 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  63%|██████▎   | 363/574 [00:39<00:21,  9.63it/s]

2021-05-27 16:55:34,602 INFO | INITIAL
2021-05-27 16:55:34,603 INFO | (50, 200)
2021-05-27 16:55:34,609 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,609 INFO | (50, 200, 512)
2021-05-27 16:55:34,610 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,611 INFO | (50, 200, 512)
2021-05-27 16:55:34,612 INFO | BERT LAYER
2021-05-27 16:55:34,612 INFO | (200, 512)
2021-05-27 16:55:34,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,613 INFO | (200, 512)
2021-05-27 16:55:34,614 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,614 INFO | (200, 512)
2021-05-27 16:55:34,622 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,623 INFO | (200, 512)
2021-05-27 16:55:34,623 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,624 INFO | (200, 512)
2021-05-27 16:55:34,631 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,631 INFO | (200, 512)
2021-05-27 16:55:34,632 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,632 INFO | (200, 512)
2021-05-27 16:55:34,637 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,637 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  63%|██████▎   | 364/574 [00:39<00:22,  9.53it/s]

2021-05-27 16:55:34,711 INFO | INITIAL
2021-05-27 16:55:34,712 INFO | (50, 200)
2021-05-27 16:55:34,717 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,718 INFO | (50, 200, 512)
2021-05-27 16:55:34,719 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,719 INFO | (50, 200, 512)
2021-05-27 16:55:34,720 INFO | BERT LAYER
2021-05-27 16:55:34,721 INFO | (200, 512)
2021-05-27 16:55:34,721 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,721 INFO | (200, 512)
2021-05-27 16:55:34,722 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,722 INFO | (200, 512)
2021-05-27 16:55:34,729 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,729 INFO | (200, 512)
2021-05-27 16:55:34,730 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,730 INFO | (200, 512)
2021-05-27 16:55:34,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,736 INFO | (200, 512)
2021-05-27 16:55:34,737 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,737 INFO | (200, 512)
2021-05-27 16:55:34,745 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,746 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▎   | 365/574 [00:40<00:22,  9.48it/s]

2021-05-27 16:55:34,819 INFO | INITIAL
2021-05-27 16:55:34,820 INFO | (50, 200)
2021-05-27 16:55:34,825 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,826 INFO | (50, 200, 512)
2021-05-27 16:55:34,828 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,828 INFO | (50, 200, 512)
2021-05-27 16:55:34,829 INFO | BERT LAYER
2021-05-27 16:55:34,830 INFO | (200, 512)
2021-05-27 16:55:34,830 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,831 INFO | (200, 512)
2021-05-27 16:55:34,831 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,831 INFO | (200, 512)
2021-05-27 16:55:34,838 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,838 INFO | (200, 512)
2021-05-27 16:55:34,839 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,839 INFO | (200, 512)
2021-05-27 16:55:34,847 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,848 INFO | (200, 512)
2021-05-27 16:55:34,848 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,849 INFO | (200, 512)
2021-05-27 16:55:34,855 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,856 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▍   | 366/574 [00:40<00:22,  9.41it/s]

2021-05-27 16:55:34,927 INFO | INITIAL
2021-05-27 16:55:34,927 INFO | (50, 200)
2021-05-27 16:55:34,933 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:34,933 INFO | (50, 200, 512)
2021-05-27 16:55:34,935 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:34,935 INFO | (50, 200, 512)
2021-05-27 16:55:34,936 INFO | BERT LAYER
2021-05-27 16:55:34,937 INFO | (200, 512)
2021-05-27 16:55:34,938 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,938 INFO | (200, 512)
2021-05-27 16:55:34,939 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,939 INFO | (200, 512)
2021-05-27 16:55:34,947 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,947 INFO | (200, 512)
2021-05-27 16:55:34,948 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,948 INFO | (200, 512)
2021-05-27 16:55:34,955 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,956 INFO | (200, 512)
2021-05-27 16:55:34,956 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:34,957 INFO | (200, 512)
2021-05-27 16:55:34,964 INFO | BERT LAYER LOOP
2021-05-27 16:55:34,964 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▍   | 367/574 [00:40<00:22,  9.29it/s]

2021-05-27 16:55:35,038 INFO | INITIAL
2021-05-27 16:55:35,039 INFO | (50, 200)
2021-05-27 16:55:35,046 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,046 INFO | (50, 200, 512)
2021-05-27 16:55:35,048 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,048 INFO | (50, 200, 512)
2021-05-27 16:55:35,049 INFO | BERT LAYER
2021-05-27 16:55:35,050 INFO | (200, 512)
2021-05-27 16:55:35,051 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,051 INFO | (200, 512)
2021-05-27 16:55:35,052 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,052 INFO | (200, 512)
2021-05-27 16:55:35,059 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,059 INFO | (200, 512)
2021-05-27 16:55:35,060 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,060 INFO | (200, 512)
2021-05-27 16:55:35,066 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,067 INFO | (200, 512)
2021-05-27 16:55:35,067 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,068 INFO | (200, 512)
2021-05-27 16:55:35,074 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,075 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▍   | 368/574 [00:40<00:22,  9.22it/s]

2021-05-27 16:55:35,149 INFO | INITIAL
2021-05-27 16:55:35,149 INFO | (50, 200)
2021-05-27 16:55:35,154 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,155 INFO | (50, 200, 512)
2021-05-27 16:55:35,156 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,156 INFO | (50, 200, 512)
2021-05-27 16:55:35,157 INFO | BERT LAYER
2021-05-27 16:55:35,157 INFO | (200, 512)
2021-05-27 16:55:35,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,158 INFO | (200, 512)
2021-05-27 16:55:35,159 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,159 INFO | (200, 512)
2021-05-27 16:55:35,164 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,165 INFO | (200, 512)
2021-05-27 16:55:35,165 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,165 INFO | (200, 512)
2021-05-27 16:55:35,171 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,171 INFO | (200, 512)
2021-05-27 16:55:35,172 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,172 INFO | (200, 512)
2021-05-27 16:55:35,178 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,179 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▍   | 368/574 [00:40<00:22,  9.22it/s]

2021-05-27 16:55:35,248 INFO | INITIAL
2021-05-27 16:55:35,248 INFO | (50, 200)
2021-05-27 16:55:35,255 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,255 INFO | (50, 200, 512)
2021-05-27 16:55:35,256 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,257 INFO | (50, 200, 512)
2021-05-27 16:55:35,258 INFO | BERT LAYER
2021-05-27 16:55:35,259 INFO | (200, 512)
2021-05-27 16:55:35,259 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,260 INFO | (200, 512)
2021-05-27 16:55:35,261 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,261 INFO | (200, 512)
2021-05-27 16:55:35,269 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,269 INFO | (200, 512)
2021-05-27 16:55:35,270 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,270 INFO | (200, 512)
2021-05-27 16:55:35,277 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,277 INFO | (200, 512)
2021-05-27 16:55:35,278 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,278 INFO | (200, 512)
2021-05-27 16:55:35,283 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,284 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  64%|██████▍   | 370/574 [00:40<00:21,  9.41it/s]

2021-05-27 16:55:35,356 INFO | INITIAL
2021-05-27 16:55:35,356 INFO | (50, 200)
2021-05-27 16:55:35,363 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,363 INFO | (50, 200, 512)
2021-05-27 16:55:35,365 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,365 INFO | (50, 200, 512)
2021-05-27 16:55:35,366 INFO | BERT LAYER
2021-05-27 16:55:35,366 INFO | (200, 512)
2021-05-27 16:55:35,367 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,367 INFO | (200, 512)
2021-05-27 16:55:35,368 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,369 INFO | (200, 512)
2021-05-27 16:55:35,373 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,374 INFO | (200, 512)
2021-05-27 16:55:35,374 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,375 INFO | (200, 512)
2021-05-27 16:55:35,381 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,381 INFO | (200, 512)
2021-05-27 16:55:35,381 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,382 INFO | (200, 512)
2021-05-27 16:55:35,388 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,388 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  65%|██████▍   | 371/574 [00:40<00:21,  9.38it/s]

2021-05-27 16:55:35,464 INFO | INITIAL
2021-05-27 16:55:35,464 INFO | (50, 200)
2021-05-27 16:55:35,469 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,471 INFO | (50, 200, 512)
2021-05-27 16:55:35,472 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,473 INFO | (50, 200, 512)
2021-05-27 16:55:35,473 INFO | BERT LAYER
2021-05-27 16:55:35,474 INFO | (200, 512)
2021-05-27 16:55:35,474 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,475 INFO | (200, 512)
2021-05-27 16:55:35,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,476 INFO | (200, 512)
2021-05-27 16:55:35,481 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,481 INFO | (200, 512)
2021-05-27 16:55:35,482 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,482 INFO | (200, 512)
2021-05-27 16:55:35,488 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,489 INFO | (200, 512)
2021-05-27 16:55:35,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,489 INFO | (200, 512)
2021-05-27 16:55:35,495 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,495 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  65%|██████▍   | 372/574 [00:40<00:21,  9.47it/s]

2021-05-27 16:55:35,566 INFO | INITIAL
2021-05-27 16:55:35,567 INFO | (50, 200)
2021-05-27 16:55:35,572 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,572 INFO | (50, 200, 512)
2021-05-27 16:55:35,573 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,574 INFO | (50, 200, 512)
2021-05-27 16:55:35,574 INFO | BERT LAYER
2021-05-27 16:55:35,575 INFO | (200, 512)
2021-05-27 16:55:35,575 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,576 INFO | (200, 512)
2021-05-27 16:55:35,576 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,576 INFO | (200, 512)
2021-05-27 16:55:35,585 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,586 INFO | (200, 512)
2021-05-27 16:55:35,587 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,587 INFO | (200, 512)
2021-05-27 16:55:35,595 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,595 INFO | (200, 512)
2021-05-27 16:55:35,596 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,597 INFO | (200, 512)
2021-05-27 16:55:35,603 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,604 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  65%|██████▍   | 373/574 [00:40<00:21,  9.26it/s]

2021-05-27 16:55:35,681 INFO | INITIAL
2021-05-27 16:55:35,681 INFO | (50, 200)
2021-05-27 16:55:35,686 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,687 INFO | (50, 200, 512)
2021-05-27 16:55:35,688 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,691 INFO | (50, 200, 512)
2021-05-27 16:55:35,692 INFO | BERT LAYER
2021-05-27 16:55:35,692 INFO | (200, 512)
2021-05-27 16:55:35,693 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,693 INFO | (200, 512)
2021-05-27 16:55:35,694 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,694 INFO | (200, 512)
2021-05-27 16:55:35,700 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,701 INFO | (200, 512)
2021-05-27 16:55:35,701 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,702 INFO | (200, 512)
2021-05-27 16:55:35,709 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,710 INFO | (200, 512)
2021-05-27 16:55:35,710 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,711 INFO | (200, 512)
2021-05-27 16:55:35,719 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,720 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  65%|██████▌   | 374/574 [00:40<00:21,  9.19it/s]

2021-05-27 16:55:35,792 INFO | INITIAL
2021-05-27 16:55:35,792 INFO | (50, 200)
2021-05-27 16:55:35,799 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,800 INFO | (50, 200, 512)
2021-05-27 16:55:35,801 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,801 INFO | (50, 200, 512)
2021-05-27 16:55:35,802 INFO | BERT LAYER
2021-05-27 16:55:35,802 INFO | (200, 512)
2021-05-27 16:55:35,803 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,803 INFO | (200, 512)
2021-05-27 16:55:35,803 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,804 INFO | (200, 512)
2021-05-27 16:55:35,809 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,810 INFO | (200, 512)
2021-05-27 16:55:35,810 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,811 INFO | (200, 512)
2021-05-27 16:55:35,816 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,816 INFO | (200, 512)
2021-05-27 16:55:35,816 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,817 INFO | (200, 512)
2021-05-27 16:55:35,824 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,824 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  65%|██████▌   | 375/574 [00:41<00:21,  9.25it/s]

2021-05-27 16:55:35,898 INFO | INITIAL
2021-05-27 16:55:35,899 INFO | (50, 200)
2021-05-27 16:55:35,904 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:35,905 INFO | (50, 200, 512)
2021-05-27 16:55:35,907 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:35,907 INFO | (50, 200, 512)
2021-05-27 16:55:35,908 INFO | BERT LAYER
2021-05-27 16:55:35,908 INFO | (200, 512)
2021-05-27 16:55:35,909 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,909 INFO | (200, 512)
2021-05-27 16:55:35,909 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,910 INFO | (200, 512)
2021-05-27 16:55:35,915 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,916 INFO | (200, 512)
2021-05-27 16:55:35,916 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,917 INFO | (200, 512)
2021-05-27 16:55:35,923 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,924 INFO | (200, 512)
2021-05-27 16:55:35,924 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:35,925 INFO | (200, 512)
2021-05-27 16:55:35,931 INFO | BERT LAYER LOOP
2021-05-27 16:55:35,931 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▌   | 376/574 [00:41<00:21,  9.36it/s]

2021-05-27 16:55:36,001 INFO | INITIAL
2021-05-27 16:55:36,002 INFO | (50, 200)
2021-05-27 16:55:36,006 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,007 INFO | (50, 200, 512)
2021-05-27 16:55:36,008 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,009 INFO | (50, 200, 512)
2021-05-27 16:55:36,010 INFO | BERT LAYER
2021-05-27 16:55:36,010 INFO | (200, 512)
2021-05-27 16:55:36,010 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,011 INFO | (200, 512)
2021-05-27 16:55:36,011 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,011 INFO | (200, 512)
2021-05-27 16:55:36,018 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,018 INFO | (200, 512)
2021-05-27 16:55:36,019 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,019 INFO | (200, 512)
2021-05-27 16:55:36,025 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,025 INFO | (200, 512)
2021-05-27 16:55:36,026 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,026 INFO | (200, 512)
2021-05-27 16:55:36,033 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,033 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▌   | 377/574 [00:41<00:20,  9.46it/s]

2021-05-27 16:55:36,104 INFO | INITIAL
2021-05-27 16:55:36,105 INFO | (50, 200)
2021-05-27 16:55:36,110 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,111 INFO | (50, 200, 512)
2021-05-27 16:55:36,112 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,113 INFO | (50, 200, 512)
2021-05-27 16:55:36,114 INFO | BERT LAYER
2021-05-27 16:55:36,114 INFO | (200, 512)
2021-05-27 16:55:36,115 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,115 INFO | (200, 512)
2021-05-27 16:55:36,117 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,117 INFO | (200, 512)
2021-05-27 16:55:36,124 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,124 INFO | (200, 512)
2021-05-27 16:55:36,125 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,125 INFO | (200, 512)
2021-05-27 16:55:36,131 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,132 INFO | (200, 512)
2021-05-27 16:55:36,133 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,133 INFO | (200, 512)
2021-05-27 16:55:36,138 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,138 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▌   | 378/574 [00:41<00:20,  9.49it/s]

2021-05-27 16:55:36,209 INFO | INITIAL
2021-05-27 16:55:36,210 INFO | (50, 200)
2021-05-27 16:55:36,215 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,216 INFO | (50, 200, 512)
2021-05-27 16:55:36,217 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,217 INFO | (50, 200, 512)
2021-05-27 16:55:36,218 INFO | BERT LAYER
2021-05-27 16:55:36,218 INFO | (200, 512)
2021-05-27 16:55:36,219 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,219 INFO | (200, 512)
2021-05-27 16:55:36,219 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,220 INFO | (200, 512)
2021-05-27 16:55:36,226 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,227 INFO | (200, 512)
2021-05-27 16:55:36,227 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,228 INFO | (200, 512)
2021-05-27 16:55:36,233 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,233 INFO | (200, 512)
2021-05-27 16:55:36,234 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,234 INFO | (200, 512)
2021-05-27 16:55:36,241 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,242 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▌   | 379/574 [00:41<00:20,  9.40it/s]

2021-05-27 16:55:36,319 INFO | INITIAL
2021-05-27 16:55:36,320 INFO | (50, 200)
2021-05-27 16:55:36,326 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,326 INFO | (50, 200, 512)
2021-05-27 16:55:36,327 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,328 INFO | (50, 200, 512)
2021-05-27 16:55:36,329 INFO | BERT LAYER
2021-05-27 16:55:36,329 INFO | (200, 512)
2021-05-27 16:55:36,329 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,330 INFO | (200, 512)
2021-05-27 16:55:36,330 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,331 INFO | (200, 512)
2021-05-27 16:55:36,336 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,336 INFO | (200, 512)
2021-05-27 16:55:36,337 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,337 INFO | (200, 512)
2021-05-27 16:55:36,343 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,344 INFO | (200, 512)
2021-05-27 16:55:36,345 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,346 INFO | (200, 512)
2021-05-27 16:55:36,351 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,351 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▌   | 380/574 [00:41<00:20,  9.39it/s]

2021-05-27 16:55:36,425 INFO | INITIAL
2021-05-27 16:55:36,425 INFO | (50, 200)
2021-05-27 16:55:36,432 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,432 INFO | (50, 200, 512)
2021-05-27 16:55:36,433 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,434 INFO | (50, 200, 512)
2021-05-27 16:55:36,435 INFO | BERT LAYER
2021-05-27 16:55:36,436 INFO | (200, 512)
2021-05-27 16:55:36,436 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,436 INFO | (200, 512)
2021-05-27 16:55:36,437 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,437 INFO | (200, 512)
2021-05-27 16:55:36,444 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,445 INFO | (200, 512)
2021-05-27 16:55:36,446 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,446 INFO | (200, 512)
2021-05-27 16:55:36,453 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,454 INFO | (200, 512)
2021-05-27 16:55:36,454 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,455 INFO | (200, 512)
2021-05-27 16:55:36,461 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,461 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▋   | 381/574 [00:41<00:20,  9.34it/s]

2021-05-27 16:55:36,533 INFO | INITIAL
2021-05-27 16:55:36,533 INFO | (50, 200)
2021-05-27 16:55:36,538 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,539 INFO | (50, 200, 512)
2021-05-27 16:55:36,540 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,540 INFO | (50, 200, 512)
2021-05-27 16:55:36,541 INFO | BERT LAYER
2021-05-27 16:55:36,541 INFO | (200, 512)
2021-05-27 16:55:36,542 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,542 INFO | (200, 512)
2021-05-27 16:55:36,542 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,543 INFO | (200, 512)
2021-05-27 16:55:36,548 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,549 INFO | (200, 512)
2021-05-27 16:55:36,549 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,550 INFO | (200, 512)
2021-05-27 16:55:36,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,555 INFO | (200, 512)
2021-05-27 16:55:36,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,556 INFO | (200, 512)
2021-05-27 16:55:36,562 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,562 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  66%|██████▋   | 381/574 [00:41<00:20,  9.34it/s]

2021-05-27 16:55:36,631 INFO | INITIAL
2021-05-27 16:55:36,631 INFO | (50, 200)
2021-05-27 16:55:36,636 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,637 INFO | (50, 200, 512)
2021-05-27 16:55:36,638 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,639 INFO | (50, 200, 512)
2021-05-27 16:55:36,639 INFO | BERT LAYER
2021-05-27 16:55:36,640 INFO | (200, 512)
2021-05-27 16:55:36,640 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,640 INFO | (200, 512)
2021-05-27 16:55:36,641 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,641 INFO | (200, 512)
2021-05-27 16:55:36,647 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,647 INFO | (200, 512)
2021-05-27 16:55:36,648 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,648 INFO | (200, 512)
2021-05-27 16:55:36,656 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,657 INFO | (200, 512)
2021-05-27 16:55:36,658 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,658 INFO | (200, 512)
2021-05-27 16:55:36,664 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,664 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  67%|██████▋   | 383/574 [00:41<00:20,  9.49it/s]

2021-05-27 16:55:36,739 INFO | INITIAL
2021-05-27 16:55:36,740 INFO | (50, 200)
2021-05-27 16:55:36,748 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,748 INFO | (50, 200, 512)
2021-05-27 16:55:36,750 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,750 INFO | (50, 200, 512)
2021-05-27 16:55:36,751 INFO | BERT LAYER
2021-05-27 16:55:36,751 INFO | (200, 512)
2021-05-27 16:55:36,751 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,752 INFO | (200, 512)
2021-05-27 16:55:36,752 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,753 INFO | (200, 512)
2021-05-27 16:55:36,759 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,759 INFO | (200, 512)
2021-05-27 16:55:36,760 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,760 INFO | (200, 512)
2021-05-27 16:55:36,766 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,767 INFO | (200, 512)
2021-05-27 16:55:36,767 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,767 INFO | (200, 512)
2021-05-27 16:55:36,773 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,773 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  67%|██████▋   | 384/574 [00:42<00:19,  9.58it/s]

2021-05-27 16:55:36,841 INFO | INITIAL
2021-05-27 16:55:36,842 INFO | (50, 200)
2021-05-27 16:55:36,848 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,849 INFO | (50, 200, 512)
2021-05-27 16:55:36,851 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,851 INFO | (50, 200, 512)
2021-05-27 16:55:36,852 INFO | BERT LAYER
2021-05-27 16:55:36,853 INFO | (200, 512)
2021-05-27 16:55:36,853 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,854 INFO | (200, 512)
2021-05-27 16:55:36,855 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,856 INFO | (200, 512)
2021-05-27 16:55:36,863 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,864 INFO | (200, 512)
2021-05-27 16:55:36,864 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,865 INFO | (200, 512)
2021-05-27 16:55:36,871 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,871 INFO | (200, 512)
2021-05-27 16:55:36,872 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,872 INFO | (200, 512)
2021-05-27 16:55:36,877 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,878 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  67%|██████▋   | 385/574 [00:42<00:20,  9.42it/s]

2021-05-27 16:55:36,953 INFO | INITIAL
2021-05-27 16:55:36,953 INFO | (50, 200)
2021-05-27 16:55:36,958 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:36,958 INFO | (50, 200, 512)
2021-05-27 16:55:36,960 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:36,961 INFO | (50, 200, 512)
2021-05-27 16:55:36,961 INFO | BERT LAYER
2021-05-27 16:55:36,962 INFO | (200, 512)
2021-05-27 16:55:36,962 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,963 INFO | (200, 512)
2021-05-27 16:55:36,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,964 INFO | (200, 512)
2021-05-27 16:55:36,969 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,970 INFO | (200, 512)
2021-05-27 16:55:36,970 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,971 INFO | (200, 512)
2021-05-27 16:55:36,977 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,977 INFO | (200, 512)
2021-05-27 16:55:36,978 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:36,978 INFO | (200, 512)
2021-05-27 16:55:36,985 INFO | BERT LAYER LOOP
2021-05-27 16:55:36,986 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  67%|██████▋   | 385/574 [00:42<00:20,  9.42it/s]

2021-05-27 16:55:37,052 INFO | INITIAL
2021-05-27 16:55:37,052 INFO | (50, 200)
2021-05-27 16:55:37,059 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,059 INFO | (50, 200, 512)
2021-05-27 16:55:37,061 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,061 INFO | (50, 200, 512)
2021-05-27 16:55:37,062 INFO | BERT LAYER
2021-05-27 16:55:37,063 INFO | (200, 512)
2021-05-27 16:55:37,064 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,065 INFO | (200, 512)
2021-05-27 16:55:37,065 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,066 INFO | (200, 512)
2021-05-27 16:55:37,072 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,073 INFO | (200, 512)
2021-05-27 16:55:37,073 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,073 INFO | (200, 512)
2021-05-27 16:55:37,079 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,079 INFO | (200, 512)
2021-05-27 16:55:37,080 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,081 INFO | (200, 512)
2021-05-27 16:55:37,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,088 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  67%|██████▋   | 387/574 [00:42<00:19,  9.48it/s]

2021-05-27 16:55:37,162 INFO | INITIAL
2021-05-27 16:55:37,162 INFO | (50, 200)
2021-05-27 16:55:37,168 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,168 INFO | (50, 200, 512)
2021-05-27 16:55:37,170 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,170 INFO | (50, 200, 512)
2021-05-27 16:55:37,171 INFO | BERT LAYER
2021-05-27 16:55:37,171 INFO | (200, 512)
2021-05-27 16:55:37,172 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,172 INFO | (200, 512)
2021-05-27 16:55:37,172 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,172 INFO | (200, 512)
2021-05-27 16:55:37,178 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,178 INFO | (200, 512)
2021-05-27 16:55:37,178 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,179 INFO | (200, 512)
2021-05-27 16:55:37,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,186 INFO | (200, 512)
2021-05-27 16:55:37,187 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,188 INFO | (200, 512)
2021-05-27 16:55:37,194 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,194 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 388/574 [00:42<00:19,  9.47it/s]

2021-05-27 16:55:37,268 INFO | INITIAL
2021-05-27 16:55:37,268 INFO | (50, 200)
2021-05-27 16:55:37,273 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,274 INFO | (50, 200, 512)
2021-05-27 16:55:37,275 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,275 INFO | (50, 200, 512)
2021-05-27 16:55:37,276 INFO | BERT LAYER
2021-05-27 16:55:37,276 INFO | (200, 512)
2021-05-27 16:55:37,277 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,277 INFO | (200, 512)
2021-05-27 16:55:37,277 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,283 INFO | (200, 512)
2021-05-27 16:55:37,290 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,290 INFO | (200, 512)
2021-05-27 16:55:37,291 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,291 INFO | (200, 512)
2021-05-27 16:55:37,297 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,297 INFO | (200, 512)
2021-05-27 16:55:37,298 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,298 INFO | (200, 512)
2021-05-27 16:55:37,305 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,305 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 389/574 [00:42<00:19,  9.35it/s]

2021-05-27 16:55:37,379 INFO | INITIAL
2021-05-27 16:55:37,379 INFO | (50, 200)
2021-05-27 16:55:37,384 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,385 INFO | (50, 200, 512)
2021-05-27 16:55:37,386 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,386 INFO | (50, 200, 512)
2021-05-27 16:55:37,387 INFO | BERT LAYER
2021-05-27 16:55:37,387 INFO | (200, 512)
2021-05-27 16:55:37,388 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,388 INFO | (200, 512)
2021-05-27 16:55:37,389 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,389 INFO | (200, 512)
2021-05-27 16:55:37,395 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,395 INFO | (200, 512)
2021-05-27 16:55:37,396 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,396 INFO | (200, 512)
2021-05-27 16:55:37,401 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,402 INFO | (200, 512)
2021-05-27 16:55:37,402 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,402 INFO | (200, 512)
2021-05-27 16:55:37,407 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,408 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 390/574 [00:42<00:19,  9.31it/s]

2021-05-27 16:55:37,487 INFO | INITIAL
2021-05-27 16:55:37,488 INFO | (50, 200)
2021-05-27 16:55:37,494 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,495 INFO | (50, 200, 512)
2021-05-27 16:55:37,496 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,496 INFO | (50, 200, 512)
2021-05-27 16:55:37,497 INFO | BERT LAYER
2021-05-27 16:55:37,497 INFO | (200, 512)
2021-05-27 16:55:37,498 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,498 INFO | (200, 512)
2021-05-27 16:55:37,499 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,499 INFO | (200, 512)
2021-05-27 16:55:37,504 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,504 INFO | (200, 512)
2021-05-27 16:55:37,505 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,505 INFO | (200, 512)
2021-05-27 16:55:37,510 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,510 INFO | (200, 512)
2021-05-27 16:55:37,511 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,511 INFO | (200, 512)
2021-05-27 16:55:37,518 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,519 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 391/574 [00:42<00:19,  9.47it/s]

2021-05-27 16:55:37,588 INFO | INITIAL
2021-05-27 16:55:37,588 INFO | (50, 200)
2021-05-27 16:55:37,594 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,594 INFO | (50, 200, 512)
2021-05-27 16:55:37,596 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,596 INFO | (50, 200, 512)
2021-05-27 16:55:37,597 INFO | BERT LAYER
2021-05-27 16:55:37,597 INFO | (200, 512)
2021-05-27 16:55:37,598 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,598 INFO | (200, 512)
2021-05-27 16:55:37,598 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,599 INFO | (200, 512)
2021-05-27 16:55:37,604 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,605 INFO | (200, 512)
2021-05-27 16:55:37,605 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,605 INFO | (200, 512)
2021-05-27 16:55:37,610 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,611 INFO | (200, 512)
2021-05-27 16:55:37,611 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,612 INFO | (200, 512)
2021-05-27 16:55:37,619 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,619 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 392/574 [00:42<00:19,  9.43it/s]

2021-05-27 16:55:37,695 INFO | INITIAL
2021-05-27 16:55:37,695 INFO | (50, 200)
2021-05-27 16:55:37,700 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,701 INFO | (50, 200, 512)
2021-05-27 16:55:37,702 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,702 INFO | (50, 200, 512)
2021-05-27 16:55:37,703 INFO | BERT LAYER
2021-05-27 16:55:37,703 INFO | (200, 512)
2021-05-27 16:55:37,704 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,704 INFO | (200, 512)
2021-05-27 16:55:37,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,705 INFO | (200, 512)
2021-05-27 16:55:37,710 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,710 INFO | (200, 512)
2021-05-27 16:55:37,710 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,711 INFO | (200, 512)
2021-05-27 16:55:37,717 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,718 INFO | (200, 512)
2021-05-27 16:55:37,718 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,719 INFO | (200, 512)
2021-05-27 16:55:37,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,725 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  68%|██████▊   | 392/574 [00:42<00:19,  9.43it/s]

2021-05-27 16:55:37,792 INFO | INITIAL
2021-05-27 16:55:37,792 INFO | (50, 200)
2021-05-27 16:55:37,798 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,799 INFO | (50, 200, 512)
2021-05-27 16:55:37,800 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,800 INFO | (50, 200, 512)
2021-05-27 16:55:37,801 INFO | BERT LAYER
2021-05-27 16:55:37,802 INFO | (200, 512)
2021-05-27 16:55:37,803 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,803 INFO | (200, 512)
2021-05-27 16:55:37,803 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,804 INFO | (200, 512)
2021-05-27 16:55:37,810 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,811 INFO | (200, 512)
2021-05-27 16:55:37,811 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,812 INFO | (200, 512)
2021-05-27 16:55:37,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,819 INFO | (200, 512)
2021-05-27 16:55:37,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,820 INFO | (200, 512)
2021-05-27 16:55:37,827 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,828 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  69%|██████▊   | 394/574 [00:43<00:18,  9.50it/s]

2021-05-27 16:55:37,904 INFO | INITIAL
2021-05-27 16:55:37,905 INFO | (50, 200)
2021-05-27 16:55:37,911 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:37,912 INFO | (50, 200, 512)
2021-05-27 16:55:37,913 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:37,914 INFO | (50, 200, 512)
2021-05-27 16:55:37,915 INFO | BERT LAYER
2021-05-27 16:55:37,916 INFO | (200, 512)
2021-05-27 16:55:37,916 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,917 INFO | (200, 512)
2021-05-27 16:55:37,918 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,918 INFO | (200, 512)
2021-05-27 16:55:37,925 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,926 INFO | (200, 512)
2021-05-27 16:55:37,927 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,927 INFO | (200, 512)
2021-05-27 16:55:37,933 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,933 INFO | (200, 512)
2021-05-27 16:55:37,933 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:37,934 INFO | (200, 512)
2021-05-27 16:55:37,939 INFO | BERT LAYER LOOP
2021-05-27 16:55:37,940 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  69%|██████▉   | 395/574 [00:43<00:19,  9.38it/s]

2021-05-27 16:55:38,015 INFO | INITIAL
2021-05-27 16:55:38,015 INFO | (50, 200)
2021-05-27 16:55:38,022 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,023 INFO | (50, 200, 512)
2021-05-27 16:55:38,025 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,025 INFO | (50, 200, 512)
2021-05-27 16:55:38,026 INFO | BERT LAYER
2021-05-27 16:55:38,026 INFO | (200, 512)
2021-05-27 16:55:38,026 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,027 INFO | (200, 512)
2021-05-27 16:55:38,027 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,028 INFO | (200, 512)
2021-05-27 16:55:38,034 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,035 INFO | (200, 512)
2021-05-27 16:55:38,035 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,035 INFO | (200, 512)
2021-05-27 16:55:38,043 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,043 INFO | (200, 512)
2021-05-27 16:55:38,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,044 INFO | (200, 512)
2021-05-27 16:55:38,050 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,051 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  69%|██████▉   | 396/574 [00:43<00:19,  9.21it/s]

2021-05-27 16:55:38,129 INFO | INITIAL
2021-05-27 16:55:38,129 INFO | (50, 200)
2021-05-27 16:55:38,135 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,135 INFO | (50, 200, 512)
2021-05-27 16:55:38,137 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,141 INFO | (50, 200, 512)
2021-05-27 16:55:38,142 INFO | BERT LAYER
2021-05-27 16:55:38,143 INFO | (200, 512)
2021-05-27 16:55:38,143 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,143 INFO | (200, 512)
2021-05-27 16:55:38,144 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,144 INFO | (200, 512)
2021-05-27 16:55:38,151 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,155 INFO | (200, 512)
2021-05-27 16:55:38,155 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,156 INFO | (200, 512)
2021-05-27 16:55:38,162 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,163 INFO | (200, 512)
2021-05-27 16:55:38,164 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,164 INFO | (200, 512)
2021-05-27 16:55:38,171 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,171 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  69%|██████▉   | 397/574 [00:43<00:19,  9.24it/s]

2021-05-27 16:55:38,237 INFO | INITIAL
2021-05-27 16:55:38,237 INFO | (50, 200)
2021-05-27 16:55:38,243 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,244 INFO | (50, 200, 512)
2021-05-27 16:55:38,245 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,245 INFO | (50, 200, 512)
2021-05-27 16:55:38,246 INFO | BERT LAYER
2021-05-27 16:55:38,247 INFO | (200, 512)
2021-05-27 16:55:38,247 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,247 INFO | (200, 512)
2021-05-27 16:55:38,248 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,248 INFO | (200, 512)
2021-05-27 16:55:38,256 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,256 INFO | (200, 512)
2021-05-27 16:55:38,257 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,257 INFO | (200, 512)
2021-05-27 16:55:38,263 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,263 INFO | (200, 512)
2021-05-27 16:55:38,264 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,264 INFO | (200, 512)
2021-05-27 16:55:38,271 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,271 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  69%|██████▉   | 398/574 [00:43<00:18,  9.34it/s]

2021-05-27 16:55:38,341 INFO | INITIAL
2021-05-27 16:55:38,341 INFO | (50, 200)
2021-05-27 16:55:38,349 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,349 INFO | (50, 200, 512)
2021-05-27 16:55:38,350 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,351 INFO | (50, 200, 512)
2021-05-27 16:55:38,352 INFO | BERT LAYER
2021-05-27 16:55:38,352 INFO | (200, 512)
2021-05-27 16:55:38,352 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,353 INFO | (200, 512)
2021-05-27 16:55:38,353 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,353 INFO | (200, 512)
2021-05-27 16:55:38,359 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,359 INFO | (200, 512)
2021-05-27 16:55:38,360 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,360 INFO | (200, 512)
2021-05-27 16:55:38,366 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,367 INFO | (200, 512)
2021-05-27 16:55:38,368 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,368 INFO | (200, 512)
2021-05-27 16:55:38,374 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,375 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|██████▉   | 399/574 [00:43<00:19,  9.13it/s]

2021-05-27 16:55:38,456 INFO | INITIAL
2021-05-27 16:55:38,456 INFO | (50, 200)
2021-05-27 16:55:38,464 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,464 INFO | (50, 200, 512)
2021-05-27 16:55:38,466 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,466 INFO | (50, 200, 512)
2021-05-27 16:55:38,467 INFO | BERT LAYER
2021-05-27 16:55:38,467 INFO | (200, 512)
2021-05-27 16:55:38,468 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,468 INFO | (200, 512)
2021-05-27 16:55:38,469 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,470 INFO | (200, 512)
2021-05-27 16:55:38,475 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,475 INFO | (200, 512)
2021-05-27 16:55:38,476 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,476 INFO | (200, 512)
2021-05-27 16:55:38,482 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,483 INFO | (200, 512)
2021-05-27 16:55:38,483 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,484 INFO | (200, 512)
2021-05-27 16:55:38,489 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,490 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|██████▉   | 400/574 [00:43<00:19,  9.16it/s]

2021-05-27 16:55:38,565 INFO | INITIAL
2021-05-27 16:55:38,565 INFO | (50, 200)
2021-05-27 16:55:38,570 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,571 INFO | (50, 200, 512)
2021-05-27 16:55:38,572 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,573 INFO | (50, 200, 512)
2021-05-27 16:55:38,574 INFO | BERT LAYER
2021-05-27 16:55:38,574 INFO | (200, 512)
2021-05-27 16:55:38,574 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,575 INFO | (200, 512)
2021-05-27 16:55:38,575 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,576 INFO | (200, 512)
2021-05-27 16:55:38,583 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,584 INFO | (200, 512)
2021-05-27 16:55:38,584 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,584 INFO | (200, 512)
2021-05-27 16:55:38,590 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,591 INFO | (200, 512)
2021-05-27 16:55:38,591 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,592 INFO | (200, 512)
2021-05-27 16:55:38,597 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,597 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|██████▉   | 401/574 [00:43<00:18,  9.20it/s]

2021-05-27 16:55:38,672 INFO | INITIAL
2021-05-27 16:55:38,672 INFO | (50, 200)
2021-05-27 16:55:38,679 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,679 INFO | (50, 200, 512)
2021-05-27 16:55:38,681 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,681 INFO | (50, 200, 512)
2021-05-27 16:55:38,682 INFO | BERT LAYER
2021-05-27 16:55:38,682 INFO | (200, 512)
2021-05-27 16:55:38,683 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,683 INFO | (200, 512)
2021-05-27 16:55:38,683 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,684 INFO | (200, 512)
2021-05-27 16:55:38,691 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,691 INFO | (200, 512)
2021-05-27 16:55:38,692 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,692 INFO | (200, 512)
2021-05-27 16:55:38,699 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,700 INFO | (200, 512)
2021-05-27 16:55:38,700 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,701 INFO | (200, 512)
2021-05-27 16:55:38,706 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,707 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|███████   | 402/574 [00:43<00:18,  9.31it/s]

2021-05-27 16:55:38,777 INFO | INITIAL
2021-05-27 16:55:38,777 INFO | (50, 200)
2021-05-27 16:55:38,783 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,783 INFO | (50, 200, 512)
2021-05-27 16:55:38,784 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,785 INFO | (50, 200, 512)
2021-05-27 16:55:38,785 INFO | BERT LAYER
2021-05-27 16:55:38,786 INFO | (200, 512)
2021-05-27 16:55:38,786 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,787 INFO | (200, 512)
2021-05-27 16:55:38,787 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,787 INFO | (200, 512)
2021-05-27 16:55:38,793 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,794 INFO | (200, 512)
2021-05-27 16:55:38,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,794 INFO | (200, 512)
2021-05-27 16:55:38,800 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,801 INFO | (200, 512)
2021-05-27 16:55:38,801 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,802 INFO | (200, 512)
2021-05-27 16:55:38,807 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,807 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|███████   | 403/574 [00:44<00:18,  9.37it/s]

2021-05-27 16:55:38,882 INFO | INITIAL
2021-05-27 16:55:38,883 INFO | (50, 200)
2021-05-27 16:55:38,889 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,889 INFO | (50, 200, 512)
2021-05-27 16:55:38,891 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,891 INFO | (50, 200, 512)
2021-05-27 16:55:38,893 INFO | BERT LAYER
2021-05-27 16:55:38,893 INFO | (200, 512)
2021-05-27 16:55:38,893 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,894 INFO | (200, 512)
2021-05-27 16:55:38,894 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,895 INFO | (200, 512)
2021-05-27 16:55:38,901 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,901 INFO | (200, 512)
2021-05-27 16:55:38,902 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,902 INFO | (200, 512)
2021-05-27 16:55:38,907 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,908 INFO | (200, 512)
2021-05-27 16:55:38,908 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,908 INFO | (200, 512)
2021-05-27 16:55:38,913 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,914 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  70%|███████   | 404/574 [00:44<00:17,  9.47it/s]

2021-05-27 16:55:38,985 INFO | INITIAL
2021-05-27 16:55:38,985 INFO | (50, 200)
2021-05-27 16:55:38,992 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:38,992 INFO | (50, 200, 512)
2021-05-27 16:55:38,994 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:38,995 INFO | (50, 200, 512)
2021-05-27 16:55:38,995 INFO | BERT LAYER
2021-05-27 16:55:38,996 INFO | (200, 512)
2021-05-27 16:55:38,996 INFO | BERT LAYER LOOP
2021-05-27 16:55:38,996 INFO | (200, 512)
2021-05-27 16:55:38,997 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:38,998 INFO | (200, 512)
2021-05-27 16:55:39,004 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,005 INFO | (200, 512)
2021-05-27 16:55:39,005 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,006 INFO | (200, 512)
2021-05-27 16:55:39,013 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,013 INFO | (200, 512)
2021-05-27 16:55:39,014 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,015 INFO | (200, 512)
2021-05-27 16:55:39,021 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,022 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████   | 405/574 [00:44<00:18,  9.23it/s]

2021-05-27 16:55:39,099 INFO | INITIAL
2021-05-27 16:55:39,100 INFO | (50, 200)
2021-05-27 16:55:39,105 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,105 INFO | (50, 200, 512)
2021-05-27 16:55:39,106 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,107 INFO | (50, 200, 512)
2021-05-27 16:55:39,107 INFO | BERT LAYER
2021-05-27 16:55:39,108 INFO | (200, 512)
2021-05-27 16:55:39,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,109 INFO | (200, 512)
2021-05-27 16:55:39,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,110 INFO | (200, 512)
2021-05-27 16:55:39,115 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,115 INFO | (200, 512)
2021-05-27 16:55:39,116 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,116 INFO | (200, 512)
2021-05-27 16:55:39,122 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,123 INFO | (200, 512)
2021-05-27 16:55:39,124 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,124 INFO | (200, 512)
2021-05-27 16:55:39,132 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,132 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████   | 406/574 [00:44<00:18,  9.26it/s]

2021-05-27 16:55:39,207 INFO | INITIAL
2021-05-27 16:55:39,208 INFO | (50, 200)
2021-05-27 16:55:39,216 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,217 INFO | (50, 200, 512)
2021-05-27 16:55:39,218 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,218 INFO | (50, 200, 512)
2021-05-27 16:55:39,219 INFO | BERT LAYER
2021-05-27 16:55:39,220 INFO | (200, 512)
2021-05-27 16:55:39,220 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,220 INFO | (200, 512)
2021-05-27 16:55:39,221 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,221 INFO | (200, 512)
2021-05-27 16:55:39,227 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,227 INFO | (200, 512)
2021-05-27 16:55:39,228 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,228 INFO | (200, 512)
2021-05-27 16:55:39,233 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,234 INFO | (200, 512)
2021-05-27 16:55:39,234 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,234 INFO | (200, 512)
2021-05-27 16:55:39,240 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,240 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████   | 407/574 [00:44<00:23,  7.22it/s]

2021-05-27 16:55:39,417 INFO | INITIAL
2021-05-27 16:55:39,418 INFO | (50, 200)
2021-05-27 16:55:39,424 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,424 INFO | (50, 200, 512)
2021-05-27 16:55:39,426 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,426 INFO | (50, 200, 512)
2021-05-27 16:55:39,428 INFO | BERT LAYER
2021-05-27 16:55:39,428 INFO | (200, 512)
2021-05-27 16:55:39,428 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,429 INFO | (200, 512)
2021-05-27 16:55:39,429 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,430 INFO | (200, 512)
2021-05-27 16:55:39,436 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,437 INFO | (200, 512)
2021-05-27 16:55:39,437 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,437 INFO | (200, 512)
2021-05-27 16:55:39,443 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,443 INFO | (200, 512)
2021-05-27 16:55:39,444 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,444 INFO | (200, 512)
2021-05-27 16:55:39,449 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,450 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████   | 408/574 [00:44<00:21,  7.80it/s]

2021-05-27 16:55:39,521 INFO | INITIAL
2021-05-27 16:55:39,522 INFO | (50, 200)
2021-05-27 16:55:39,528 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,528 INFO | (50, 200, 512)
2021-05-27 16:55:39,529 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,529 INFO | (50, 200, 512)
2021-05-27 16:55:39,530 INFO | BERT LAYER
2021-05-27 16:55:39,530 INFO | (200, 512)
2021-05-27 16:55:39,531 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,531 INFO | (200, 512)
2021-05-27 16:55:39,532 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,532 INFO | (200, 512)
2021-05-27 16:55:39,539 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,539 INFO | (200, 512)
2021-05-27 16:55:39,540 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,542 INFO | (200, 512)
2021-05-27 16:55:39,548 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,548 INFO | (200, 512)
2021-05-27 16:55:39,549 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,549 INFO | (200, 512)
2021-05-27 16:55:39,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,555 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████▏  | 409/574 [00:44<00:20,  8.11it/s]

2021-05-27 16:55:39,632 INFO | INITIAL
2021-05-27 16:55:39,632 INFO | (50, 200)
2021-05-27 16:55:39,638 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,638 INFO | (50, 200, 512)
2021-05-27 16:55:39,640 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,640 INFO | (50, 200, 512)
2021-05-27 16:55:39,641 INFO | BERT LAYER
2021-05-27 16:55:39,641 INFO | (200, 512)
2021-05-27 16:55:39,641 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,642 INFO | (200, 512)
2021-05-27 16:55:39,642 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,642 INFO | (200, 512)
2021-05-27 16:55:39,649 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,649 INFO | (200, 512)
2021-05-27 16:55:39,650 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,651 INFO | (200, 512)
2021-05-27 16:55:39,659 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,659 INFO | (200, 512)
2021-05-27 16:55:39,660 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,660 INFO | (200, 512)
2021-05-27 16:55:39,667 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,667 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  71%|███████▏  | 410/574 [00:44<00:19,  8.49it/s]

2021-05-27 16:55:39,737 INFO | INITIAL
2021-05-27 16:55:39,738 INFO | (50, 200)
2021-05-27 16:55:39,744 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,745 INFO | (50, 200, 512)
2021-05-27 16:55:39,746 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,746 INFO | (50, 200, 512)
2021-05-27 16:55:39,747 INFO | BERT LAYER
2021-05-27 16:55:39,748 INFO | (200, 512)
2021-05-27 16:55:39,749 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,749 INFO | (200, 512)
2021-05-27 16:55:39,750 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,750 INFO | (200, 512)
2021-05-27 16:55:39,756 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,756 INFO | (200, 512)
2021-05-27 16:55:39,756 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,757 INFO | (200, 512)
2021-05-27 16:55:39,762 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,763 INFO | (200, 512)
2021-05-27 16:55:39,763 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,764 INFO | (200, 512)
2021-05-27 16:55:39,769 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,770 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 411/574 [00:45<00:18,  8.82it/s]

2021-05-27 16:55:39,840 INFO | INITIAL
2021-05-27 16:55:39,840 INFO | (50, 200)
2021-05-27 16:55:39,847 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,847 INFO | (50, 200, 512)
2021-05-27 16:55:39,848 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,849 INFO | (50, 200, 512)
2021-05-27 16:55:39,850 INFO | BERT LAYER
2021-05-27 16:55:39,850 INFO | (200, 512)
2021-05-27 16:55:39,851 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,852 INFO | (200, 512)
2021-05-27 16:55:39,852 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,853 INFO | (200, 512)
2021-05-27 16:55:39,861 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,861 INFO | (200, 512)
2021-05-27 16:55:39,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,862 INFO | (200, 512)
2021-05-27 16:55:39,869 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,869 INFO | (200, 512)
2021-05-27 16:55:39,869 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,870 INFO | (200, 512)
2021-05-27 16:55:39,875 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,876 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 412/574 [00:45<00:17,  9.01it/s]

2021-05-27 16:55:39,946 INFO | INITIAL
2021-05-27 16:55:39,946 INFO | (50, 200)
2021-05-27 16:55:39,951 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:39,951 INFO | (50, 200, 512)
2021-05-27 16:55:39,953 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:39,953 INFO | (50, 200, 512)
2021-05-27 16:55:39,954 INFO | BERT LAYER
2021-05-27 16:55:39,954 INFO | (200, 512)
2021-05-27 16:55:39,955 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,955 INFO | (200, 512)
2021-05-27 16:55:39,955 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,956 INFO | (200, 512)
2021-05-27 16:55:39,962 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,962 INFO | (200, 512)
2021-05-27 16:55:39,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,963 INFO | (200, 512)
2021-05-27 16:55:39,968 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,969 INFO | (200, 512)
2021-05-27 16:55:39,969 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:39,969 INFO | (200, 512)
2021-05-27 16:55:39,974 INFO | BERT LAYER LOOP
2021-05-27 16:55:39,975 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 412/574 [00:45<00:17,  9.01it/s]

2021-05-27 16:55:40,043 INFO | INITIAL
2021-05-27 16:55:40,043 INFO | (50, 200)
2021-05-27 16:55:40,048 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,049 INFO | (50, 200, 512)
2021-05-27 16:55:40,050 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,050 INFO | (50, 200, 512)
2021-05-27 16:55:40,051 INFO | BERT LAYER
2021-05-27 16:55:40,052 INFO | (200, 512)
2021-05-27 16:55:40,052 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,053 INFO | (200, 512)
2021-05-27 16:55:40,054 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,054 INFO | (200, 512)
2021-05-27 16:55:40,060 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,061 INFO | (200, 512)
2021-05-27 16:55:40,061 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,062 INFO | (200, 512)
2021-05-27 16:55:40,067 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,068 INFO | (200, 512)
2021-05-27 16:55:40,068 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,069 INFO | (200, 512)
2021-05-27 16:55:40,074 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,074 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 414/574 [00:45<00:17,  9.36it/s]

2021-05-27 16:55:40,150 INFO | INITIAL
2021-05-27 16:55:40,150 INFO | (50, 200)
2021-05-27 16:55:40,157 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,157 INFO | (50, 200, 512)
2021-05-27 16:55:40,159 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,160 INFO | (50, 200, 512)
2021-05-27 16:55:40,160 INFO | BERT LAYER
2021-05-27 16:55:40,161 INFO | (200, 512)
2021-05-27 16:55:40,161 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,162 INFO | (200, 512)
2021-05-27 16:55:40,162 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,163 INFO | (200, 512)
2021-05-27 16:55:40,169 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,169 INFO | (200, 512)
2021-05-27 16:55:40,170 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,170 INFO | (200, 512)
2021-05-27 16:55:40,176 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,177 INFO | (200, 512)
2021-05-27 16:55:40,177 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,177 INFO | (200, 512)
2021-05-27 16:55:40,183 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,183 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 415/574 [00:45<00:16,  9.47it/s]

2021-05-27 16:55:40,251 INFO | INITIAL
2021-05-27 16:55:40,252 INFO | (50, 200)
2021-05-27 16:55:40,259 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,259 INFO | (50, 200, 512)
2021-05-27 16:55:40,262 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,262 INFO | (50, 200, 512)
2021-05-27 16:55:40,263 INFO | BERT LAYER
2021-05-27 16:55:40,263 INFO | (200, 512)
2021-05-27 16:55:40,264 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,264 INFO | (200, 512)
2021-05-27 16:55:40,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,265 INFO | (200, 512)
2021-05-27 16:55:40,272 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,272 INFO | (200, 512)
2021-05-27 16:55:40,273 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,274 INFO | (200, 512)
2021-05-27 16:55:40,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,281 INFO | (200, 512)
2021-05-27 16:55:40,281 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,281 INFO | (200, 512)
2021-05-27 16:55:40,287 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,288 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  72%|███████▏  | 416/574 [00:45<00:16,  9.41it/s]

2021-05-27 16:55:40,359 INFO | INITIAL
2021-05-27 16:55:40,360 INFO | (50, 200)
2021-05-27 16:55:40,366 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,367 INFO | (50, 200, 512)
2021-05-27 16:55:40,368 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,368 INFO | (50, 200, 512)
2021-05-27 16:55:40,369 INFO | BERT LAYER
2021-05-27 16:55:40,369 INFO | (200, 512)
2021-05-27 16:55:40,369 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,370 INFO | (200, 512)
2021-05-27 16:55:40,370 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,370 INFO | (200, 512)
2021-05-27 16:55:40,377 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,378 INFO | (200, 512)
2021-05-27 16:55:40,378 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,379 INFO | (200, 512)
2021-05-27 16:55:40,385 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,385 INFO | (200, 512)
2021-05-27 16:55:40,386 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,386 INFO | (200, 512)
2021-05-27 16:55:40,394 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,394 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  73%|███████▎  | 417/574 [00:45<00:16,  9.31it/s]

2021-05-27 16:55:40,470 INFO | INITIAL
2021-05-27 16:55:40,471 INFO | (50, 200)
2021-05-27 16:55:40,477 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,478 INFO | (50, 200, 512)
2021-05-27 16:55:40,479 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,480 INFO | (50, 200, 512)
2021-05-27 16:55:40,481 INFO | BERT LAYER
2021-05-27 16:55:40,481 INFO | (200, 512)
2021-05-27 16:55:40,482 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,482 INFO | (200, 512)
2021-05-27 16:55:40,482 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,483 INFO | (200, 512)
2021-05-27 16:55:40,489 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,489 INFO | (200, 512)
2021-05-27 16:55:40,490 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,490 INFO | (200, 512)
2021-05-27 16:55:40,496 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,496 INFO | (200, 512)
2021-05-27 16:55:40,497 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,497 INFO | (200, 512)
2021-05-27 16:55:40,503 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,504 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  73%|███████▎  | 418/574 [00:45<00:16,  9.35it/s]

2021-05-27 16:55:40,576 INFO | INITIAL
2021-05-27 16:55:40,576 INFO | (50, 200)
2021-05-27 16:55:40,582 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,582 INFO | (50, 200, 512)
2021-05-27 16:55:40,583 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,584 INFO | (50, 200, 512)
2021-05-27 16:55:40,585 INFO | BERT LAYER
2021-05-27 16:55:40,585 INFO | (200, 512)
2021-05-27 16:55:40,586 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,587 INFO | (200, 512)
2021-05-27 16:55:40,587 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,588 INFO | (200, 512)
2021-05-27 16:55:40,596 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,597 INFO | (200, 512)
2021-05-27 16:55:40,598 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,598 INFO | (200, 512)
2021-05-27 16:55:40,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,608 INFO | (200, 512)
2021-05-27 16:55:40,609 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,609 INFO | (200, 512)
2021-05-27 16:55:40,616 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,616 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  73%|███████▎  | 419/574 [00:45<00:17,  9.05it/s]

2021-05-27 16:55:40,695 INFO | INITIAL
2021-05-27 16:55:40,695 INFO | (50, 200)
2021-05-27 16:55:40,701 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,701 INFO | (50, 200, 512)
2021-05-27 16:55:40,703 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,703 INFO | (50, 200, 512)
2021-05-27 16:55:40,704 INFO | BERT LAYER
2021-05-27 16:55:40,704 INFO | (200, 512)
2021-05-27 16:55:40,705 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,705 INFO | (200, 512)
2021-05-27 16:55:40,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,706 INFO | (200, 512)
2021-05-27 16:55:40,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,712 INFO | (200, 512)
2021-05-27 16:55:40,713 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,714 INFO | (200, 512)
2021-05-27 16:55:40,719 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,720 INFO | (200, 512)
2021-05-27 16:55:40,721 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,721 INFO | (200, 512)
2021-05-27 16:55:40,728 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,729 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  73%|███████▎  | 420/574 [00:45<00:17,  9.02it/s]

2021-05-27 16:55:40,807 INFO | INITIAL
2021-05-27 16:55:40,807 INFO | (50, 200)
2021-05-27 16:55:40,815 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,815 INFO | (50, 200, 512)
2021-05-27 16:55:40,816 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,817 INFO | (50, 200, 512)
2021-05-27 16:55:40,818 INFO | BERT LAYER
2021-05-27 16:55:40,818 INFO | (200, 512)
2021-05-27 16:55:40,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,819 INFO | (200, 512)
2021-05-27 16:55:40,819 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,819 INFO | (200, 512)
2021-05-27 16:55:40,825 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,826 INFO | (200, 512)
2021-05-27 16:55:40,826 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,826 INFO | (200, 512)
2021-05-27 16:55:40,833 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,834 INFO | (200, 512)
2021-05-27 16:55:40,834 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,834 INFO | (200, 512)
2021-05-27 16:55:40,840 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,840 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  73%|███████▎  | 421/574 [00:46<00:16,  9.19it/s]

2021-05-27 16:55:40,911 INFO | INITIAL
2021-05-27 16:55:40,911 INFO | (50, 200)
2021-05-27 16:55:40,917 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:40,917 INFO | (50, 200, 512)
2021-05-27 16:55:40,919 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:40,920 INFO | (50, 200, 512)
2021-05-27 16:55:40,922 INFO | BERT LAYER
2021-05-27 16:55:40,923 INFO | (200, 512)
2021-05-27 16:55:40,923 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,924 INFO | (200, 512)
2021-05-27 16:55:40,924 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,925 INFO | (200, 512)
2021-05-27 16:55:40,933 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,934 INFO | (200, 512)
2021-05-27 16:55:40,934 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,934 INFO | (200, 512)
2021-05-27 16:55:40,941 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,942 INFO | (200, 512)
2021-05-27 16:55:40,942 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:40,943 INFO | (200, 512)
2021-05-27 16:55:40,948 INFO | BERT LAYER LOOP
2021-05-27 16:55:40,949 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▎  | 422/574 [00:46<00:16,  9.06it/s]

2021-05-27 16:55:41,025 INFO | INITIAL
2021-05-27 16:55:41,025 INFO | (50, 200)
2021-05-27 16:55:41,032 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,032 INFO | (50, 200, 512)
2021-05-27 16:55:41,033 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,034 INFO | (50, 200, 512)
2021-05-27 16:55:41,035 INFO | BERT LAYER
2021-05-27 16:55:41,035 INFO | (200, 512)
2021-05-27 16:55:41,036 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,036 INFO | (200, 512)
2021-05-27 16:55:41,036 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,037 INFO | (200, 512)
2021-05-27 16:55:41,045 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,046 INFO | (200, 512)
2021-05-27 16:55:41,046 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,047 INFO | (200, 512)
2021-05-27 16:55:41,054 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,054 INFO | (200, 512)
2021-05-27 16:55:41,055 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,055 INFO | (200, 512)
2021-05-27 16:55:41,062 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,063 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▎  | 423/574 [00:46<00:16,  9.07it/s]

2021-05-27 16:55:41,135 INFO | INITIAL
2021-05-27 16:55:41,135 INFO | (50, 200)
2021-05-27 16:55:41,141 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,141 INFO | (50, 200, 512)
2021-05-27 16:55:41,142 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,143 INFO | (50, 200, 512)
2021-05-27 16:55:41,144 INFO | BERT LAYER
2021-05-27 16:55:41,144 INFO | (200, 512)
2021-05-27 16:55:41,145 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,145 INFO | (200, 512)
2021-05-27 16:55:41,146 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,146 INFO | (200, 512)
2021-05-27 16:55:41,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,152 INFO | (200, 512)
2021-05-27 16:55:41,152 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,153 INFO | (200, 512)
2021-05-27 16:55:41,158 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,159 INFO | (200, 512)
2021-05-27 16:55:41,159 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,159 INFO | (200, 512)
2021-05-27 16:55:41,165 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,166 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▍  | 424/574 [00:46<00:16,  9.31it/s]

2021-05-27 16:55:41,235 INFO | INITIAL
2021-05-27 16:55:41,236 INFO | (50, 200)
2021-05-27 16:55:41,242 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,243 INFO | (50, 200, 512)
2021-05-27 16:55:41,244 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,245 INFO | (50, 200, 512)
2021-05-27 16:55:41,246 INFO | BERT LAYER
2021-05-27 16:55:41,247 INFO | (200, 512)
2021-05-27 16:55:41,247 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,248 INFO | (200, 512)
2021-05-27 16:55:41,248 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,249 INFO | (200, 512)
2021-05-27 16:55:41,255 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,255 INFO | (200, 512)
2021-05-27 16:55:41,256 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,256 INFO | (200, 512)
2021-05-27 16:55:41,262 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,262 INFO | (200, 512)
2021-05-27 16:55:41,263 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,263 INFO | (200, 512)
2021-05-27 16:55:41,269 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,270 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▍  | 425/574 [00:46<00:16,  9.31it/s]

2021-05-27 16:55:41,343 INFO | INITIAL
2021-05-27 16:55:41,343 INFO | (50, 200)
2021-05-27 16:55:41,348 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,349 INFO | (50, 200, 512)
2021-05-27 16:55:41,350 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,351 INFO | (50, 200, 512)
2021-05-27 16:55:41,351 INFO | BERT LAYER
2021-05-27 16:55:41,352 INFO | (200, 512)
2021-05-27 16:55:41,352 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,352 INFO | (200, 512)
2021-05-27 16:55:41,353 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,353 INFO | (200, 512)
2021-05-27 16:55:41,360 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,360 INFO | (200, 512)
2021-05-27 16:55:41,361 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,361 INFO | (200, 512)
2021-05-27 16:55:41,367 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,367 INFO | (200, 512)
2021-05-27 16:55:41,368 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,368 INFO | (200, 512)
2021-05-27 16:55:41,374 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,374 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▍  | 426/574 [00:46<00:15,  9.34it/s]

2021-05-27 16:55:41,449 INFO | INITIAL
2021-05-27 16:55:41,450 INFO | (50, 200)
2021-05-27 16:55:41,455 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,455 INFO | (50, 200, 512)
2021-05-27 16:55:41,457 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,457 INFO | (50, 200, 512)
2021-05-27 16:55:41,458 INFO | BERT LAYER
2021-05-27 16:55:41,459 INFO | (200, 512)
2021-05-27 16:55:41,459 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,460 INFO | (200, 512)
2021-05-27 16:55:41,461 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,462 INFO | (200, 512)
2021-05-27 16:55:41,467 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,468 INFO | (200, 512)
2021-05-27 16:55:41,468 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,469 INFO | (200, 512)
2021-05-27 16:55:41,474 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,475 INFO | (200, 512)
2021-05-27 16:55:41,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,476 INFO | (200, 512)
2021-05-27 16:55:41,482 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,482 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  74%|███████▍  | 427/574 [00:46<00:15,  9.38it/s]

2021-05-27 16:55:41,554 INFO | INITIAL
2021-05-27 16:55:41,555 INFO | (50, 200)
2021-05-27 16:55:41,561 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,562 INFO | (50, 200, 512)
2021-05-27 16:55:41,563 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,564 INFO | (50, 200, 512)
2021-05-27 16:55:41,565 INFO | BERT LAYER
2021-05-27 16:55:41,565 INFO | (200, 512)
2021-05-27 16:55:41,566 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,566 INFO | (200, 512)
2021-05-27 16:55:41,567 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,567 INFO | (200, 512)
2021-05-27 16:55:41,572 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,573 INFO | (200, 512)
2021-05-27 16:55:41,573 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,573 INFO | (200, 512)
2021-05-27 16:55:41,579 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,579 INFO | (200, 512)
2021-05-27 16:55:41,579 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,580 INFO | (200, 512)
2021-05-27 16:55:41,587 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,587 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▍  | 428/574 [00:46<00:15,  9.44it/s]

2021-05-27 16:55:41,659 INFO | INITIAL
2021-05-27 16:55:41,661 INFO | (50, 200)
2021-05-27 16:55:41,667 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,667 INFO | (50, 200, 512)
2021-05-27 16:55:41,669 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,669 INFO | (50, 200, 512)
2021-05-27 16:55:41,670 INFO | BERT LAYER
2021-05-27 16:55:41,671 INFO | (200, 512)
2021-05-27 16:55:41,672 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,672 INFO | (200, 512)
2021-05-27 16:55:41,673 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,673 INFO | (200, 512)
2021-05-27 16:55:41,681 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,681 INFO | (200, 512)
2021-05-27 16:55:41,682 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,682 INFO | (200, 512)
2021-05-27 16:55:41,688 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,688 INFO | (200, 512)
2021-05-27 16:55:41,689 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,689 INFO | (200, 512)
2021-05-27 16:55:41,696 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,696 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▍  | 429/574 [00:46<00:15,  9.33it/s]

2021-05-27 16:55:41,769 INFO | INITIAL
2021-05-27 16:55:41,770 INFO | (50, 200)
2021-05-27 16:55:41,775 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,775 INFO | (50, 200, 512)
2021-05-27 16:55:41,776 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,777 INFO | (50, 200, 512)
2021-05-27 16:55:41,778 INFO | BERT LAYER
2021-05-27 16:55:41,778 INFO | (200, 512)
2021-05-27 16:55:41,778 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,779 INFO | (200, 512)
2021-05-27 16:55:41,779 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,779 INFO | (200, 512)
2021-05-27 16:55:41,785 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,785 INFO | (200, 512)
2021-05-27 16:55:41,786 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,786 INFO | (200, 512)
2021-05-27 16:55:41,794 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,795 INFO | (200, 512)
2021-05-27 16:55:41,796 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,796 INFO | (200, 512)
2021-05-27 16:55:41,803 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,803 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▍  | 430/574 [00:47<00:15,  9.22it/s]

2021-05-27 16:55:41,881 INFO | INITIAL
2021-05-27 16:55:41,881 INFO | (50, 200)
2021-05-27 16:55:41,887 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,888 INFO | (50, 200, 512)
2021-05-27 16:55:41,889 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,889 INFO | (50, 200, 512)
2021-05-27 16:55:41,890 INFO | BERT LAYER
2021-05-27 16:55:41,890 INFO | (200, 512)
2021-05-27 16:55:41,891 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,891 INFO | (200, 512)
2021-05-27 16:55:41,891 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,892 INFO | (200, 512)
2021-05-27 16:55:41,897 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,897 INFO | (200, 512)
2021-05-27 16:55:41,898 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,898 INFO | (200, 512)
2021-05-27 16:55:41,903 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,904 INFO | (200, 512)
2021-05-27 16:55:41,904 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,904 INFO | (200, 512)
2021-05-27 16:55:41,909 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,910 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▌  | 431/574 [00:47<00:15,  9.35it/s]

2021-05-27 16:55:41,984 INFO | INITIAL
2021-05-27 16:55:41,985 INFO | (50, 200)
2021-05-27 16:55:41,990 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:41,991 INFO | (50, 200, 512)
2021-05-27 16:55:41,992 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:41,992 INFO | (50, 200, 512)
2021-05-27 16:55:41,993 INFO | BERT LAYER
2021-05-27 16:55:41,994 INFO | (200, 512)
2021-05-27 16:55:41,994 INFO | BERT LAYER LOOP
2021-05-27 16:55:41,994 INFO | (200, 512)
2021-05-27 16:55:41,995 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:41,996 INFO | (200, 512)
2021-05-27 16:55:42,003 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,004 INFO | (200, 512)
2021-05-27 16:55:42,004 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,005 INFO | (200, 512)
2021-05-27 16:55:42,011 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,012 INFO | (200, 512)
2021-05-27 16:55:42,012 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,013 INFO | (200, 512)
2021-05-27 16:55:42,020 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,020 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▌  | 432/574 [00:47<00:15,  9.46it/s]

2021-05-27 16:55:42,087 INFO | INITIAL
2021-05-27 16:55:42,087 INFO | (50, 200)
2021-05-27 16:55:42,094 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,094 INFO | (50, 200, 512)
2021-05-27 16:55:42,096 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,096 INFO | (50, 200, 512)
2021-05-27 16:55:42,097 INFO | BERT LAYER
2021-05-27 16:55:42,097 INFO | (200, 512)
2021-05-27 16:55:42,097 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,098 INFO | (200, 512)
2021-05-27 16:55:42,098 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,098 INFO | (200, 512)
2021-05-27 16:55:42,104 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,104 INFO | (200, 512)
2021-05-27 16:55:42,105 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,105 INFO | (200, 512)
2021-05-27 16:55:42,111 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,111 INFO | (200, 512)
2021-05-27 16:55:42,112 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,112 INFO | (200, 512)
2021-05-27 16:55:42,117 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,118 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  75%|███████▌  | 433/574 [00:47<00:14,  9.60it/s]

2021-05-27 16:55:42,187 INFO | INITIAL
2021-05-27 16:55:42,188 INFO | (50, 200)
2021-05-27 16:55:42,193 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,194 INFO | (50, 200, 512)
2021-05-27 16:55:42,195 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,197 INFO | (50, 200, 512)
2021-05-27 16:55:42,199 INFO | BERT LAYER
2021-05-27 16:55:42,199 INFO | (200, 512)
2021-05-27 16:55:42,200 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,201 INFO | (200, 512)
2021-05-27 16:55:42,201 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,202 INFO | (200, 512)
2021-05-27 16:55:42,209 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,210 INFO | (200, 512)
2021-05-27 16:55:42,211 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,211 INFO | (200, 512)
2021-05-27 16:55:42,219 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,219 INFO | (200, 512)
2021-05-27 16:55:42,219 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,220 INFO | (200, 512)
2021-05-27 16:55:42,228 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,228 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▌  | 434/574 [00:47<00:15,  9.31it/s]

2021-05-27 16:55:42,302 INFO | INITIAL
2021-05-27 16:55:42,303 INFO | (50, 200)
2021-05-27 16:55:42,308 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,309 INFO | (50, 200, 512)
2021-05-27 16:55:42,310 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,311 INFO | (50, 200, 512)
2021-05-27 16:55:42,312 INFO | BERT LAYER
2021-05-27 16:55:42,312 INFO | (200, 512)
2021-05-27 16:55:42,312 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,312 INFO | (200, 512)
2021-05-27 16:55:42,313 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,313 INFO | (200, 512)
2021-05-27 16:55:42,320 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,321 INFO | (200, 512)
2021-05-27 16:55:42,321 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,321 INFO | (200, 512)
2021-05-27 16:55:42,328 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,329 INFO | (200, 512)
2021-05-27 16:55:42,330 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,330 INFO | (200, 512)
2021-05-27 16:55:42,336 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,336 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▌  | 435/574 [00:47<00:14,  9.40it/s]

2021-05-27 16:55:42,406 INFO | INITIAL
2021-05-27 16:55:42,406 INFO | (50, 200)
2021-05-27 16:55:42,413 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,413 INFO | (50, 200, 512)
2021-05-27 16:55:42,415 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,415 INFO | (50, 200, 512)
2021-05-27 16:55:42,416 INFO | BERT LAYER
2021-05-27 16:55:42,416 INFO | (200, 512)
2021-05-27 16:55:42,417 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,417 INFO | (200, 512)
2021-05-27 16:55:42,418 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,418 INFO | (200, 512)
2021-05-27 16:55:42,424 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,424 INFO | (200, 512)
2021-05-27 16:55:42,425 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,425 INFO | (200, 512)
2021-05-27 16:55:42,433 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,433 INFO | (200, 512)
2021-05-27 16:55:42,434 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,434 INFO | (200, 512)
2021-05-27 16:55:42,440 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,441 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▌  | 436/574 [00:47<00:14,  9.28it/s]

2021-05-27 16:55:42,517 INFO | INITIAL
2021-05-27 16:55:42,518 INFO | (50, 200)
2021-05-27 16:55:42,524 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,525 INFO | (50, 200, 512)
2021-05-27 16:55:42,528 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,528 INFO | (50, 200, 512)
2021-05-27 16:55:42,529 INFO | BERT LAYER
2021-05-27 16:55:42,531 INFO | (200, 512)
2021-05-27 16:55:42,532 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,532 INFO | (200, 512)
2021-05-27 16:55:42,533 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,533 INFO | (200, 512)
2021-05-27 16:55:42,539 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,539 INFO | (200, 512)
2021-05-27 16:55:42,540 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,540 INFO | (200, 512)
2021-05-27 16:55:42,546 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,546 INFO | (200, 512)
2021-05-27 16:55:42,547 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,547 INFO | (200, 512)
2021-05-27 16:55:42,553 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,553 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▌  | 437/574 [00:47<00:15,  9.10it/s]

2021-05-27 16:55:42,633 INFO | INITIAL
2021-05-27 16:55:42,633 INFO | (50, 200)
2021-05-27 16:55:42,639 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,640 INFO | (50, 200, 512)
2021-05-27 16:55:42,641 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,642 INFO | (50, 200, 512)
2021-05-27 16:55:42,643 INFO | BERT LAYER
2021-05-27 16:55:42,644 INFO | (200, 512)
2021-05-27 16:55:42,644 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,645 INFO | (200, 512)
2021-05-27 16:55:42,645 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,646 INFO | (200, 512)
2021-05-27 16:55:42,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,652 INFO | (200, 512)
2021-05-27 16:55:42,653 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,653 INFO | (200, 512)
2021-05-27 16:55:42,660 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,662 INFO | (200, 512)
2021-05-27 16:55:42,663 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,664 INFO | (200, 512)
2021-05-27 16:55:42,671 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,671 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▋  | 438/574 [00:47<00:15,  8.98it/s]

2021-05-27 16:55:42,747 INFO | INITIAL
2021-05-27 16:55:42,747 INFO | (50, 200)
2021-05-27 16:55:42,752 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,753 INFO | (50, 200, 512)
2021-05-27 16:55:42,754 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,755 INFO | (50, 200, 512)
2021-05-27 16:55:42,756 INFO | BERT LAYER
2021-05-27 16:55:42,756 INFO | (200, 512)
2021-05-27 16:55:42,757 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,758 INFO | (200, 512)
2021-05-27 16:55:42,759 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,759 INFO | (200, 512)
2021-05-27 16:55:42,765 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,766 INFO | (200, 512)
2021-05-27 16:55:42,766 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,767 INFO | (200, 512)
2021-05-27 16:55:42,774 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,775 INFO | (200, 512)
2021-05-27 16:55:42,775 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,776 INFO | (200, 512)
2021-05-27 16:55:42,783 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,784 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  76%|███████▋  | 439/574 [00:48<00:15,  8.95it/s]

2021-05-27 16:55:42,860 INFO | INITIAL
2021-05-27 16:55:42,862 INFO | (50, 200)
2021-05-27 16:55:42,869 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,869 INFO | (50, 200, 512)
2021-05-27 16:55:42,871 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,871 INFO | (50, 200, 512)
2021-05-27 16:55:42,872 INFO | BERT LAYER
2021-05-27 16:55:42,873 INFO | (200, 512)
2021-05-27 16:55:42,873 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,873 INFO | (200, 512)
2021-05-27 16:55:42,874 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,874 INFO | (200, 512)
2021-05-27 16:55:42,882 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,882 INFO | (200, 512)
2021-05-27 16:55:42,883 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,883 INFO | (200, 512)
2021-05-27 16:55:42,889 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,889 INFO | (200, 512)
2021-05-27 16:55:42,890 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,890 INFO | (200, 512)
2021-05-27 16:55:42,896 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,896 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  77%|███████▋  | 440/574 [00:48<00:14,  9.08it/s]

2021-05-27 16:55:42,966 INFO | INITIAL
2021-05-27 16:55:42,967 INFO | (50, 200)
2021-05-27 16:55:42,973 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:42,973 INFO | (50, 200, 512)
2021-05-27 16:55:42,975 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:42,975 INFO | (50, 200, 512)
2021-05-27 16:55:42,976 INFO | BERT LAYER
2021-05-27 16:55:42,977 INFO | (200, 512)
2021-05-27 16:55:42,977 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,977 INFO | (200, 512)
2021-05-27 16:55:42,977 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,978 INFO | (200, 512)
2021-05-27 16:55:42,984 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,984 INFO | (200, 512)
2021-05-27 16:55:42,984 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,985 INFO | (200, 512)
2021-05-27 16:55:42,991 INFO | BERT LAYER LOOP
2021-05-27 16:55:42,992 INFO | (200, 512)
2021-05-27 16:55:42,992 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:42,993 INFO | (200, 512)
2021-05-27 16:55:43,000 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,001 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  77%|███████▋  | 441/574 [00:48<00:14,  9.26it/s]

2021-05-27 16:55:43,069 INFO | INITIAL
2021-05-27 16:55:43,069 INFO | (50, 200)
2021-05-27 16:55:43,075 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,076 INFO | (50, 200, 512)
2021-05-27 16:55:43,077 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,078 INFO | (50, 200, 512)
2021-05-27 16:55:43,079 INFO | BERT LAYER
2021-05-27 16:55:43,079 INFO | (200, 512)
2021-05-27 16:55:43,080 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,080 INFO | (200, 512)
2021-05-27 16:55:43,081 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,081 INFO | (200, 512)
2021-05-27 16:55:43,089 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,090 INFO | (200, 512)
2021-05-27 16:55:43,091 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,091 INFO | (200, 512)
2021-05-27 16:55:43,099 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,100 INFO | (200, 512)
2021-05-27 16:55:43,100 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,100 INFO | (200, 512)
2021-05-27 16:55:43,106 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,107 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  77%|███████▋  | 442/574 [00:48<00:14,  9.11it/s]

2021-05-27 16:55:43,183 INFO | INITIAL
2021-05-27 16:55:43,184 INFO | (50, 200)
2021-05-27 16:55:43,189 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,190 INFO | (50, 200, 512)
2021-05-27 16:55:43,191 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,192 INFO | (50, 200, 512)
2021-05-27 16:55:43,193 INFO | BERT LAYER
2021-05-27 16:55:43,194 INFO | (200, 512)
2021-05-27 16:55:43,194 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,195 INFO | (200, 512)
2021-05-27 16:55:43,195 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,196 INFO | (200, 512)
2021-05-27 16:55:43,203 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,204 INFO | (200, 512)
2021-05-27 16:55:43,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,205 INFO | (200, 512)
2021-05-27 16:55:43,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,211 INFO | (200, 512)
2021-05-27 16:55:43,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,213 INFO | (200, 512)
2021-05-27 16:55:43,219 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,220 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  77%|███████▋  | 443/574 [00:48<00:14,  9.04it/s]

2021-05-27 16:55:43,296 INFO | INITIAL
2021-05-27 16:55:43,296 INFO | (50, 200)
2021-05-27 16:55:43,301 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,302 INFO | (50, 200, 512)
2021-05-27 16:55:43,304 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,304 INFO | (50, 200, 512)
2021-05-27 16:55:43,305 INFO | BERT LAYER
2021-05-27 16:55:43,306 INFO | (200, 512)
2021-05-27 16:55:43,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,307 INFO | (200, 512)
2021-05-27 16:55:43,308 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,308 INFO | (200, 512)
2021-05-27 16:55:43,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,316 INFO | (200, 512)
2021-05-27 16:55:43,316 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,316 INFO | (200, 512)
2021-05-27 16:55:43,323 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,324 INFO | (200, 512)
2021-05-27 16:55:43,325 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,325 INFO | (200, 512)
2021-05-27 16:55:43,335 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,335 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  77%|███████▋  | 444/574 [00:48<00:14,  8.97it/s]

2021-05-27 16:55:43,410 INFO | INITIAL
2021-05-27 16:55:43,410 INFO | (50, 200)
2021-05-27 16:55:43,417 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,418 INFO | (50, 200, 512)
2021-05-27 16:55:43,419 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,419 INFO | (50, 200, 512)
2021-05-27 16:55:43,420 INFO | BERT LAYER
2021-05-27 16:55:43,420 INFO | (200, 512)
2021-05-27 16:55:43,421 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,421 INFO | (200, 512)
2021-05-27 16:55:43,422 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,422 INFO | (200, 512)
2021-05-27 16:55:43,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,429 INFO | (200, 512)
2021-05-27 16:55:43,430 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,430 INFO | (200, 512)
2021-05-27 16:55:43,435 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,436 INFO | (200, 512)
2021-05-27 16:55:43,436 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,437 INFO | (200, 512)
2021-05-27 16:55:43,442 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,443 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 445/574 [00:48<00:14,  9.10it/s]

2021-05-27 16:55:43,515 INFO | INITIAL
2021-05-27 16:55:43,515 INFO | (50, 200)
2021-05-27 16:55:43,521 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,521 INFO | (50, 200, 512)
2021-05-27 16:55:43,522 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,523 INFO | (50, 200, 512)
2021-05-27 16:55:43,523 INFO | BERT LAYER
2021-05-27 16:55:43,524 INFO | (200, 512)
2021-05-27 16:55:43,525 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,525 INFO | (200, 512)
2021-05-27 16:55:43,527 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,527 INFO | (200, 512)
2021-05-27 16:55:43,533 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,533 INFO | (200, 512)
2021-05-27 16:55:43,534 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,534 INFO | (200, 512)
2021-05-27 16:55:43,540 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,540 INFO | (200, 512)
2021-05-27 16:55:43,541 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,541 INFO | (200, 512)
2021-05-27 16:55:43,547 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,548 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 446/574 [00:48<00:13,  9.15it/s]

2021-05-27 16:55:43,623 INFO | INITIAL
2021-05-27 16:55:43,624 INFO | (50, 200)
2021-05-27 16:55:43,631 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,632 INFO | (50, 200, 512)
2021-05-27 16:55:43,633 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,634 INFO | (50, 200, 512)
2021-05-27 16:55:43,635 INFO | BERT LAYER
2021-05-27 16:55:43,635 INFO | (200, 512)
2021-05-27 16:55:43,635 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,636 INFO | (200, 512)
2021-05-27 16:55:43,636 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,637 INFO | (200, 512)
2021-05-27 16:55:43,643 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,643 INFO | (200, 512)
2021-05-27 16:55:43,643 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,644 INFO | (200, 512)
2021-05-27 16:55:43,649 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,650 INFO | (200, 512)
2021-05-27 16:55:43,650 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,651 INFO | (200, 512)
2021-05-27 16:55:43,656 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,658 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 447/574 [00:48<00:13,  9.35it/s]

2021-05-27 16:55:43,725 INFO | INITIAL
2021-05-27 16:55:43,725 INFO | (50, 200)
2021-05-27 16:55:43,732 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,733 INFO | (50, 200, 512)
2021-05-27 16:55:43,734 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,735 INFO | (50, 200, 512)
2021-05-27 16:55:43,735 INFO | BERT LAYER
2021-05-27 16:55:43,736 INFO | (200, 512)
2021-05-27 16:55:43,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,737 INFO | (200, 512)
2021-05-27 16:55:43,737 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,738 INFO | (200, 512)
2021-05-27 16:55:43,744 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,745 INFO | (200, 512)
2021-05-27 16:55:43,745 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,746 INFO | (200, 512)
2021-05-27 16:55:43,752 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,753 INFO | (200, 512)
2021-05-27 16:55:43,753 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,754 INFO | (200, 512)
2021-05-27 16:55:43,761 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,761 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 448/574 [00:49<00:13,  9.26it/s]

2021-05-27 16:55:43,835 INFO | INITIAL
2021-05-27 16:55:43,836 INFO | (50, 200)
2021-05-27 16:55:43,842 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,843 INFO | (50, 200, 512)
2021-05-27 16:55:43,844 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,844 INFO | (50, 200, 512)
2021-05-27 16:55:43,845 INFO | BERT LAYER
2021-05-27 16:55:43,846 INFO | (200, 512)
2021-05-27 16:55:43,846 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,847 INFO | (200, 512)
2021-05-27 16:55:43,848 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,848 INFO | (200, 512)
2021-05-27 16:55:43,854 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,855 INFO | (200, 512)
2021-05-27 16:55:43,855 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,856 INFO | (200, 512)
2021-05-27 16:55:43,864 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,865 INFO | (200, 512)
2021-05-27 16:55:43,865 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,865 INFO | (200, 512)
2021-05-27 16:55:43,871 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,871 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 449/574 [00:49<00:13,  9.30it/s]

2021-05-27 16:55:43,942 INFO | INITIAL
2021-05-27 16:55:43,943 INFO | (50, 200)
2021-05-27 16:55:43,948 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:43,949 INFO | (50, 200, 512)
2021-05-27 16:55:43,950 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:43,950 INFO | (50, 200, 512)
2021-05-27 16:55:43,951 INFO | BERT LAYER
2021-05-27 16:55:43,952 INFO | (200, 512)
2021-05-27 16:55:43,952 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,952 INFO | (200, 512)
2021-05-27 16:55:43,953 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,953 INFO | (200, 512)
2021-05-27 16:55:43,959 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,960 INFO | (200, 512)
2021-05-27 16:55:43,960 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,960 INFO | (200, 512)
2021-05-27 16:55:43,967 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,968 INFO | (200, 512)
2021-05-27 16:55:43,969 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:43,969 INFO | (200, 512)
2021-05-27 16:55:43,976 INFO | BERT LAYER LOOP
2021-05-27 16:55:43,976 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  78%|███████▊  | 450/574 [00:49<00:13,  9.32it/s]

2021-05-27 16:55:44,049 INFO | INITIAL
2021-05-27 16:55:44,050 INFO | (50, 200)
2021-05-27 16:55:44,055 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,055 INFO | (50, 200, 512)
2021-05-27 16:55:44,057 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,057 INFO | (50, 200, 512)
2021-05-27 16:55:44,059 INFO | BERT LAYER
2021-05-27 16:55:44,059 INFO | (200, 512)
2021-05-27 16:55:44,060 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,060 INFO | (200, 512)
2021-05-27 16:55:44,061 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,062 INFO | (200, 512)
2021-05-27 16:55:44,070 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,070 INFO | (200, 512)
2021-05-27 16:55:44,071 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,071 INFO | (200, 512)
2021-05-27 16:55:44,077 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,078 INFO | (200, 512)
2021-05-27 16:55:44,078 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,079 INFO | (200, 512)
2021-05-27 16:55:44,085 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,085 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▊  | 451/574 [00:49<00:13,  9.27it/s]

2021-05-27 16:55:44,158 INFO | INITIAL
2021-05-27 16:55:44,160 INFO | (50, 200)
2021-05-27 16:55:44,165 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,166 INFO | (50, 200, 512)
2021-05-27 16:55:44,167 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,168 INFO | (50, 200, 512)
2021-05-27 16:55:44,169 INFO | BERT LAYER
2021-05-27 16:55:44,169 INFO | (200, 512)
2021-05-27 16:55:44,170 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,170 INFO | (200, 512)
2021-05-27 16:55:44,170 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,171 INFO | (200, 512)
2021-05-27 16:55:44,177 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,178 INFO | (200, 512)
2021-05-27 16:55:44,178 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,179 INFO | (200, 512)
2021-05-27 16:55:44,184 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,185 INFO | (200, 512)
2021-05-27 16:55:44,185 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,186 INFO | (200, 512)
2021-05-27 16:55:44,192 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,193 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▊  | 452/574 [00:49<00:13,  9.15it/s]

2021-05-27 16:55:44,270 INFO | INITIAL
2021-05-27 16:55:44,270 INFO | (50, 200)
2021-05-27 16:55:44,276 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,277 INFO | (50, 200, 512)
2021-05-27 16:55:44,278 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,278 INFO | (50, 200, 512)
2021-05-27 16:55:44,279 INFO | BERT LAYER
2021-05-27 16:55:44,280 INFO | (200, 512)
2021-05-27 16:55:44,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,280 INFO | (200, 512)
2021-05-27 16:55:44,281 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,281 INFO | (200, 512)
2021-05-27 16:55:44,286 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,287 INFO | (200, 512)
2021-05-27 16:55:44,287 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,287 INFO | (200, 512)
2021-05-27 16:55:44,293 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,293 INFO | (200, 512)
2021-05-27 16:55:44,294 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,294 INFO | (200, 512)
2021-05-27 16:55:44,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,299 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▊  | 452/574 [00:49<00:13,  9.15it/s]

2021-05-27 16:55:44,369 INFO | INITIAL
2021-05-27 16:55:44,370 INFO | (50, 200)
2021-05-27 16:55:44,375 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,375 INFO | (50, 200, 512)
2021-05-27 16:55:44,377 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,377 INFO | (50, 200, 512)
2021-05-27 16:55:44,378 INFO | BERT LAYER
2021-05-27 16:55:44,378 INFO | (200, 512)
2021-05-27 16:55:44,379 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,379 INFO | (200, 512)
2021-05-27 16:55:44,380 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,380 INFO | (200, 512)
2021-05-27 16:55:44,387 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,387 INFO | (200, 512)
2021-05-27 16:55:44,388 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,388 INFO | (200, 512)
2021-05-27 16:55:44,393 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,394 INFO | (200, 512)
2021-05-27 16:55:44,394 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,395 INFO | (200, 512)
2021-05-27 16:55:44,402 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,402 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▉  | 454/574 [00:49<00:12,  9.37it/s]

2021-05-27 16:55:44,478 INFO | INITIAL
2021-05-27 16:55:44,479 INFO | (50, 200)
2021-05-27 16:55:44,484 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,484 INFO | (50, 200, 512)
2021-05-27 16:55:44,485 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,486 INFO | (50, 200, 512)
2021-05-27 16:55:44,487 INFO | BERT LAYER
2021-05-27 16:55:44,487 INFO | (200, 512)
2021-05-27 16:55:44,488 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,488 INFO | (200, 512)
2021-05-27 16:55:44,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,489 INFO | (200, 512)
2021-05-27 16:55:44,496 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,499 INFO | (200, 512)
2021-05-27 16:55:44,500 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,500 INFO | (200, 512)
2021-05-27 16:55:44,506 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,506 INFO | (200, 512)
2021-05-27 16:55:44,507 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,507 INFO | (200, 512)
2021-05-27 16:55:44,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,513 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▉  | 455/574 [00:49<00:12,  9.34it/s]

2021-05-27 16:55:44,586 INFO | INITIAL
2021-05-27 16:55:44,586 INFO | (50, 200)
2021-05-27 16:55:44,593 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,594 INFO | (50, 200, 512)
2021-05-27 16:55:44,595 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,596 INFO | (50, 200, 512)
2021-05-27 16:55:44,597 INFO | BERT LAYER
2021-05-27 16:55:44,597 INFO | (200, 512)
2021-05-27 16:55:44,598 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,599 INFO | (200, 512)
2021-05-27 16:55:44,599 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,600 INFO | (200, 512)
2021-05-27 16:55:44,606 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,606 INFO | (200, 512)
2021-05-27 16:55:44,607 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,607 INFO | (200, 512)
2021-05-27 16:55:44,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,613 INFO | (200, 512)
2021-05-27 16:55:44,614 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,614 INFO | (200, 512)
2021-05-27 16:55:44,620 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,620 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  79%|███████▉  | 456/574 [00:49<00:12,  9.23it/s]

2021-05-27 16:55:44,697 INFO | INITIAL
2021-05-27 16:55:44,698 INFO | (50, 200)
2021-05-27 16:55:44,704 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,704 INFO | (50, 200, 512)
2021-05-27 16:55:44,705 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,706 INFO | (50, 200, 512)
2021-05-27 16:55:44,707 INFO | BERT LAYER
2021-05-27 16:55:44,708 INFO | (200, 512)
2021-05-27 16:55:44,708 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,709 INFO | (200, 512)
2021-05-27 16:55:44,709 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,709 INFO | (200, 512)
2021-05-27 16:55:44,716 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,716 INFO | (200, 512)
2021-05-27 16:55:44,717 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,717 INFO | (200, 512)
2021-05-27 16:55:44,722 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,723 INFO | (200, 512)
2021-05-27 16:55:44,723 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,724 INFO | (200, 512)
2021-05-27 16:55:44,730 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,730 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|███████▉  | 457/574 [00:49<00:12,  9.26it/s]

2021-05-27 16:55:44,804 INFO | INITIAL
2021-05-27 16:55:44,805 INFO | (50, 200)
2021-05-27 16:55:44,811 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,811 INFO | (50, 200, 512)
2021-05-27 16:55:44,812 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,813 INFO | (50, 200, 512)
2021-05-27 16:55:44,813 INFO | BERT LAYER
2021-05-27 16:55:44,814 INFO | (200, 512)
2021-05-27 16:55:44,814 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,815 INFO | (200, 512)
2021-05-27 16:55:44,815 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,816 INFO | (200, 512)
2021-05-27 16:55:44,822 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,823 INFO | (200, 512)
2021-05-27 16:55:44,824 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,825 INFO | (200, 512)
2021-05-27 16:55:44,831 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,832 INFO | (200, 512)
2021-05-27 16:55:44,832 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,833 INFO | (200, 512)
2021-05-27 16:55:44,839 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,840 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|███████▉  | 458/574 [00:50<00:12,  9.17it/s]

2021-05-27 16:55:44,916 INFO | INITIAL
2021-05-27 16:55:44,917 INFO | (50, 200)
2021-05-27 16:55:44,922 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:44,922 INFO | (50, 200, 512)
2021-05-27 16:55:44,924 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:44,924 INFO | (50, 200, 512)
2021-05-27 16:55:44,925 INFO | BERT LAYER
2021-05-27 16:55:44,925 INFO | (200, 512)
2021-05-27 16:55:44,926 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,926 INFO | (200, 512)
2021-05-27 16:55:44,927 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,927 INFO | (200, 512)
2021-05-27 16:55:44,934 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,934 INFO | (200, 512)
2021-05-27 16:55:44,935 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,935 INFO | (200, 512)
2021-05-27 16:55:44,942 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,942 INFO | (200, 512)
2021-05-27 16:55:44,943 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:44,943 INFO | (200, 512)
2021-05-27 16:55:44,950 INFO | BERT LAYER LOOP
2021-05-27 16:55:44,950 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|███████▉  | 459/574 [00:50<00:12,  9.26it/s]

2021-05-27 16:55:45,022 INFO | INITIAL
2021-05-27 16:55:45,022 INFO | (50, 200)
2021-05-27 16:55:45,029 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,030 INFO | (50, 200, 512)
2021-05-27 16:55:45,031 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,031 INFO | (50, 200, 512)
2021-05-27 16:55:45,032 INFO | BERT LAYER
2021-05-27 16:55:45,033 INFO | (200, 512)
2021-05-27 16:55:45,033 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,034 INFO | (200, 512)
2021-05-27 16:55:45,034 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,035 INFO | (200, 512)
2021-05-27 16:55:45,042 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,043 INFO | (200, 512)
2021-05-27 16:55:45,043 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,044 INFO | (200, 512)
2021-05-27 16:55:45,051 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,051 INFO | (200, 512)
2021-05-27 16:55:45,051 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,052 INFO | (200, 512)
2021-05-27 16:55:45,058 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,058 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|████████  | 460/574 [00:50<00:12,  9.31it/s]

2021-05-27 16:55:45,128 INFO | INITIAL
2021-05-27 16:55:45,128 INFO | (50, 200)
2021-05-27 16:55:45,135 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,136 INFO | (50, 200, 512)
2021-05-27 16:55:45,137 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,138 INFO | (50, 200, 512)
2021-05-27 16:55:45,138 INFO | BERT LAYER
2021-05-27 16:55:45,139 INFO | (200, 512)
2021-05-27 16:55:45,139 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,139 INFO | (200, 512)
2021-05-27 16:55:45,140 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,140 INFO | (200, 512)
2021-05-27 16:55:45,146 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,147 INFO | (200, 512)
2021-05-27 16:55:45,147 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,148 INFO | (200, 512)
2021-05-27 16:55:45,153 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,153 INFO | (200, 512)
2021-05-27 16:55:45,154 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,154 INFO | (200, 512)
2021-05-27 16:55:45,159 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,161 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|████████  | 461/574 [00:50<00:12,  9.33it/s]

2021-05-27 16:55:45,234 INFO | INITIAL
2021-05-27 16:55:45,235 INFO | (50, 200)
2021-05-27 16:55:45,239 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,240 INFO | (50, 200, 512)
2021-05-27 16:55:45,241 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,242 INFO | (50, 200, 512)
2021-05-27 16:55:45,243 INFO | BERT LAYER
2021-05-27 16:55:45,243 INFO | (200, 512)
2021-05-27 16:55:45,243 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,244 INFO | (200, 512)
2021-05-27 16:55:45,244 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,244 INFO | (200, 512)
2021-05-27 16:55:45,249 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,250 INFO | (200, 512)
2021-05-27 16:55:45,250 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,250 INFO | (200, 512)
2021-05-27 16:55:45,255 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,256 INFO | (200, 512)
2021-05-27 16:55:45,256 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,256 INFO | (200, 512)
2021-05-27 16:55:45,262 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,263 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  80%|████████  | 462/574 [00:50<00:12,  9.06it/s]

2021-05-27 16:55:45,352 INFO | INITIAL
2021-05-27 16:55:45,353 INFO | (50, 200)
2021-05-27 16:55:45,359 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,360 INFO | (50, 200, 512)
2021-05-27 16:55:45,362 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,362 INFO | (50, 200, 512)
2021-05-27 16:55:45,363 INFO | BERT LAYER
2021-05-27 16:55:45,364 INFO | (200, 512)
2021-05-27 16:55:45,365 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,365 INFO | (200, 512)
2021-05-27 16:55:45,366 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,367 INFO | (200, 512)
2021-05-27 16:55:45,373 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,373 INFO | (200, 512)
2021-05-27 16:55:45,374 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,374 INFO | (200, 512)
2021-05-27 16:55:45,381 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,382 INFO | (200, 512)
2021-05-27 16:55:45,382 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,382 INFO | (200, 512)
2021-05-27 16:55:45,387 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,388 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  81%|████████  | 463/574 [00:50<00:12,  9.08it/s]

2021-05-27 16:55:45,462 INFO | INITIAL
2021-05-27 16:55:45,462 INFO | (50, 200)
2021-05-27 16:55:45,470 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,471 INFO | (50, 200, 512)
2021-05-27 16:55:45,472 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,472 INFO | (50, 200, 512)
2021-05-27 16:55:45,473 INFO | BERT LAYER
2021-05-27 16:55:45,474 INFO | (200, 512)
2021-05-27 16:55:45,474 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,474 INFO | (200, 512)
2021-05-27 16:55:45,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,476 INFO | (200, 512)
2021-05-27 16:55:45,481 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,482 INFO | (200, 512)
2021-05-27 16:55:45,482 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,483 INFO | (200, 512)
2021-05-27 16:55:45,488 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,489 INFO | (200, 512)
2021-05-27 16:55:45,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,489 INFO | (200, 512)
2021-05-27 16:55:45,494 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,497 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  81%|████████  | 464/574 [00:50<00:12,  9.07it/s]

2021-05-27 16:55:45,572 INFO | INITIAL
2021-05-27 16:55:45,573 INFO | (50, 200)
2021-05-27 16:55:45,579 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,580 INFO | (50, 200, 512)
2021-05-27 16:55:45,581 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,581 INFO | (50, 200, 512)
2021-05-27 16:55:45,582 INFO | BERT LAYER
2021-05-27 16:55:45,582 INFO | (200, 512)
2021-05-27 16:55:45,583 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,583 INFO | (200, 512)
2021-05-27 16:55:45,584 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,584 INFO | (200, 512)
2021-05-27 16:55:45,589 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,590 INFO | (200, 512)
2021-05-27 16:55:45,590 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,591 INFO | (200, 512)
2021-05-27 16:55:45,600 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,601 INFO | (200, 512)
2021-05-27 16:55:45,601 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,602 INFO | (200, 512)
2021-05-27 16:55:45,608 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,609 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  81%|████████  | 465/574 [00:50<00:11,  9.11it/s]

2021-05-27 16:55:45,681 INFO | INITIAL
2021-05-27 16:55:45,683 INFO | (50, 200)
2021-05-27 16:55:45,688 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,689 INFO | (50, 200, 512)
2021-05-27 16:55:45,690 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,691 INFO | (50, 200, 512)
2021-05-27 16:55:45,692 INFO | BERT LAYER
2021-05-27 16:55:45,692 INFO | (200, 512)
2021-05-27 16:55:45,693 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,693 INFO | (200, 512)
2021-05-27 16:55:45,694 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,694 INFO | (200, 512)
2021-05-27 16:55:45,699 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,700 INFO | (200, 512)
2021-05-27 16:55:45,700 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,701 INFO | (200, 512)
2021-05-27 16:55:45,707 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,708 INFO | (200, 512)
2021-05-27 16:55:45,708 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,709 INFO | (200, 512)
2021-05-27 16:55:45,714 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,715 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  81%|████████  | 466/574 [00:50<00:11,  9.23it/s]

2021-05-27 16:55:45,786 INFO | INITIAL
2021-05-27 16:55:45,786 INFO | (50, 200)
2021-05-27 16:55:45,791 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,792 INFO | (50, 200, 512)
2021-05-27 16:55:45,793 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,793 INFO | (50, 200, 512)
2021-05-27 16:55:45,794 INFO | BERT LAYER
2021-05-27 16:55:45,794 INFO | (200, 512)
2021-05-27 16:55:45,795 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,795 INFO | (200, 512)
2021-05-27 16:55:45,795 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,796 INFO | (200, 512)
2021-05-27 16:55:45,802 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,803 INFO | (200, 512)
2021-05-27 16:55:45,804 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,804 INFO | (200, 512)
2021-05-27 16:55:45,809 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,810 INFO | (200, 512)
2021-05-27 16:55:45,810 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,811 INFO | (200, 512)
2021-05-27 16:55:45,816 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,816 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  81%|████████▏ | 467/574 [00:51<00:11,  9.37it/s]

2021-05-27 16:55:45,889 INFO | INITIAL
2021-05-27 16:55:45,891 INFO | (50, 200)
2021-05-27 16:55:45,900 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:45,901 INFO | (50, 200, 512)
2021-05-27 16:55:45,903 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:45,903 INFO | (50, 200, 512)
2021-05-27 16:55:45,904 INFO | BERT LAYER
2021-05-27 16:55:45,905 INFO | (200, 512)
2021-05-27 16:55:45,905 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,905 INFO | (200, 512)
2021-05-27 16:55:45,906 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,906 INFO | (200, 512)
2021-05-27 16:55:45,913 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,914 INFO | (200, 512)
2021-05-27 16:55:45,914 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,915 INFO | (200, 512)
2021-05-27 16:55:45,921 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,922 INFO | (200, 512)
2021-05-27 16:55:45,923 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:45,923 INFO | (200, 512)
2021-05-27 16:55:45,929 INFO | BERT LAYER LOOP
2021-05-27 16:55:45,929 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 468/574 [00:51<00:11,  9.27it/s]

2021-05-27 16:55:46,000 INFO | INITIAL
2021-05-27 16:55:46,001 INFO | (50, 200)
2021-05-27 16:55:46,007 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,007 INFO | (50, 200, 512)
2021-05-27 16:55:46,009 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,009 INFO | (50, 200, 512)
2021-05-27 16:55:46,010 INFO | BERT LAYER
2021-05-27 16:55:46,010 INFO | (200, 512)
2021-05-27 16:55:46,011 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,011 INFO | (200, 512)
2021-05-27 16:55:46,012 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,012 INFO | (200, 512)
2021-05-27 16:55:46,018 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,019 INFO | (200, 512)
2021-05-27 16:55:46,019 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,019 INFO | (200, 512)
2021-05-27 16:55:46,026 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,027 INFO | (200, 512)
2021-05-27 16:55:46,028 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,029 INFO | (200, 512)
2021-05-27 16:55:46,036 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,037 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 469/574 [00:51<00:11,  9.21it/s]

2021-05-27 16:55:46,110 INFO | INITIAL
2021-05-27 16:55:46,110 INFO | (50, 200)
2021-05-27 16:55:46,116 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,116 INFO | (50, 200, 512)
2021-05-27 16:55:46,117 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,118 INFO | (50, 200, 512)
2021-05-27 16:55:46,119 INFO | BERT LAYER
2021-05-27 16:55:46,119 INFO | (200, 512)
2021-05-27 16:55:46,120 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,120 INFO | (200, 512)
2021-05-27 16:55:46,120 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,120 INFO | (200, 512)
2021-05-27 16:55:46,126 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,126 INFO | (200, 512)
2021-05-27 16:55:46,126 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,127 INFO | (200, 512)
2021-05-27 16:55:46,132 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,133 INFO | (200, 512)
2021-05-27 16:55:46,133 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,133 INFO | (200, 512)
2021-05-27 16:55:46,140 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,140 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 470/574 [00:51<00:11,  9.36it/s]

2021-05-27 16:55:46,213 INFO | INITIAL
2021-05-27 16:55:46,213 INFO | (50, 200)
2021-05-27 16:55:46,220 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,221 INFO | (50, 200, 512)
2021-05-27 16:55:46,222 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,223 INFO | (50, 200, 512)
2021-05-27 16:55:46,224 INFO | BERT LAYER
2021-05-27 16:55:46,224 INFO | (200, 512)
2021-05-27 16:55:46,225 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,225 INFO | (200, 512)
2021-05-27 16:55:46,225 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,226 INFO | (200, 512)
2021-05-27 16:55:46,232 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,233 INFO | (200, 512)
2021-05-27 16:55:46,233 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,234 INFO | (200, 512)
2021-05-27 16:55:46,239 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,240 INFO | (200, 512)
2021-05-27 16:55:46,240 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,241 INFO | (200, 512)
2021-05-27 16:55:46,246 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,246 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 471/574 [00:51<00:10,  9.41it/s]

2021-05-27 16:55:46,317 INFO | INITIAL
2021-05-27 16:55:46,318 INFO | (50, 200)
2021-05-27 16:55:46,323 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,323 INFO | (50, 200, 512)
2021-05-27 16:55:46,324 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,325 INFO | (50, 200, 512)
2021-05-27 16:55:46,325 INFO | BERT LAYER
2021-05-27 16:55:46,326 INFO | (200, 512)
2021-05-27 16:55:46,326 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,326 INFO | (200, 512)
2021-05-27 16:55:46,327 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,327 INFO | (200, 512)
2021-05-27 16:55:46,334 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,335 INFO | (200, 512)
2021-05-27 16:55:46,335 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,336 INFO | (200, 512)
2021-05-27 16:55:46,340 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,341 INFO | (200, 512)
2021-05-27 16:55:46,342 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,342 INFO | (200, 512)
2021-05-27 16:55:46,347 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,348 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 472/574 [00:51<00:10,  9.47it/s]

2021-05-27 16:55:46,421 INFO | INITIAL
2021-05-27 16:55:46,422 INFO | (50, 200)
2021-05-27 16:55:46,428 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,429 INFO | (50, 200, 512)
2021-05-27 16:55:46,430 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,430 INFO | (50, 200, 512)
2021-05-27 16:55:46,431 INFO | BERT LAYER
2021-05-27 16:55:46,431 INFO | (200, 512)
2021-05-27 16:55:46,432 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,432 INFO | (200, 512)
2021-05-27 16:55:46,433 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,433 INFO | (200, 512)
2021-05-27 16:55:46,439 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,439 INFO | (200, 512)
2021-05-27 16:55:46,441 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,441 INFO | (200, 512)
2021-05-27 16:55:46,447 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,448 INFO | (200, 512)
2021-05-27 16:55:46,448 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,448 INFO | (200, 512)
2021-05-27 16:55:46,454 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,454 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  82%|████████▏ | 473/574 [00:51<00:10,  9.49it/s]

2021-05-27 16:55:46,526 INFO | INITIAL
2021-05-27 16:55:46,526 INFO | (50, 200)
2021-05-27 16:55:46,533 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,534 INFO | (50, 200, 512)
2021-05-27 16:55:46,535 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,536 INFO | (50, 200, 512)
2021-05-27 16:55:46,537 INFO | BERT LAYER
2021-05-27 16:55:46,537 INFO | (200, 512)
2021-05-27 16:55:46,538 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,538 INFO | (200, 512)
2021-05-27 16:55:46,539 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,539 INFO | (200, 512)
2021-05-27 16:55:46,545 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,545 INFO | (200, 512)
2021-05-27 16:55:46,546 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,546 INFO | (200, 512)
2021-05-27 16:55:46,552 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,552 INFO | (200, 512)
2021-05-27 16:55:46,553 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,553 INFO | (200, 512)
2021-05-27 16:55:46,559 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,559 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 474/574 [00:51<00:10,  9.38it/s]

2021-05-27 16:55:46,636 INFO | INITIAL
2021-05-27 16:55:46,636 INFO | (50, 200)
2021-05-27 16:55:46,643 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,643 INFO | (50, 200, 512)
2021-05-27 16:55:46,645 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,645 INFO | (50, 200, 512)
2021-05-27 16:55:46,646 INFO | BERT LAYER
2021-05-27 16:55:46,647 INFO | (200, 512)
2021-05-27 16:55:46,647 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,647 INFO | (200, 512)
2021-05-27 16:55:46,648 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,648 INFO | (200, 512)
2021-05-27 16:55:46,654 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,655 INFO | (200, 512)
2021-05-27 16:55:46,655 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,656 INFO | (200, 512)
2021-05-27 16:55:46,662 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,663 INFO | (200, 512)
2021-05-27 16:55:46,664 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,665 INFO | (200, 512)
2021-05-27 16:55:46,670 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,671 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 475/574 [00:51<00:10,  9.42it/s]

2021-05-27 16:55:46,741 INFO | INITIAL
2021-05-27 16:55:46,741 INFO | (50, 200)
2021-05-27 16:55:46,748 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,749 INFO | (50, 200, 512)
2021-05-27 16:55:46,750 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,751 INFO | (50, 200, 512)
2021-05-27 16:55:46,751 INFO | BERT LAYER
2021-05-27 16:55:46,752 INFO | (200, 512)
2021-05-27 16:55:46,752 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,753 INFO | (200, 512)
2021-05-27 16:55:46,753 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,754 INFO | (200, 512)
2021-05-27 16:55:46,759 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,760 INFO | (200, 512)
2021-05-27 16:55:46,760 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,761 INFO | (200, 512)
2021-05-27 16:55:46,767 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,767 INFO | (200, 512)
2021-05-27 16:55:46,768 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,768 INFO | (200, 512)
2021-05-27 16:55:46,774 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,774 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 476/574 [00:52<00:10,  9.48it/s]

2021-05-27 16:55:46,845 INFO | INITIAL
2021-05-27 16:55:46,845 INFO | (50, 200)
2021-05-27 16:55:46,851 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,852 INFO | (50, 200, 512)
2021-05-27 16:55:46,853 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,854 INFO | (50, 200, 512)
2021-05-27 16:55:46,855 INFO | BERT LAYER
2021-05-27 16:55:46,855 INFO | (200, 512)
2021-05-27 16:55:46,856 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,856 INFO | (200, 512)
2021-05-27 16:55:46,857 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,857 INFO | (200, 512)
2021-05-27 16:55:46,864 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,865 INFO | (200, 512)
2021-05-27 16:55:46,866 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,866 INFO | (200, 512)
2021-05-27 16:55:46,873 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,873 INFO | (200, 512)
2021-05-27 16:55:46,874 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,874 INFO | (200, 512)
2021-05-27 16:55:46,880 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,880 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 477/574 [00:52<00:10,  9.54it/s]

2021-05-27 16:55:46,948 INFO | INITIAL
2021-05-27 16:55:46,949 INFO | (50, 200)
2021-05-27 16:55:46,954 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:46,954 INFO | (50, 200, 512)
2021-05-27 16:55:46,956 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:46,956 INFO | (50, 200, 512)
2021-05-27 16:55:46,957 INFO | BERT LAYER
2021-05-27 16:55:46,957 INFO | (200, 512)
2021-05-27 16:55:46,958 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,958 INFO | (200, 512)
2021-05-27 16:55:46,958 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,959 INFO | (200, 512)
2021-05-27 16:55:46,965 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,966 INFO | (200, 512)
2021-05-27 16:55:46,966 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,967 INFO | (200, 512)
2021-05-27 16:55:46,973 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,974 INFO | (200, 512)
2021-05-27 16:55:46,974 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:46,975 INFO | (200, 512)
2021-05-27 16:55:46,981 INFO | BERT LAYER LOOP
2021-05-27 16:55:46,982 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 478/574 [00:52<00:10,  9.45it/s]

2021-05-27 16:55:47,056 INFO | INITIAL
2021-05-27 16:55:47,057 INFO | (50, 200)
2021-05-27 16:55:47,063 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,063 INFO | (50, 200, 512)
2021-05-27 16:55:47,065 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,065 INFO | (50, 200, 512)
2021-05-27 16:55:47,066 INFO | BERT LAYER
2021-05-27 16:55:47,067 INFO | (200, 512)
2021-05-27 16:55:47,068 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,068 INFO | (200, 512)
2021-05-27 16:55:47,069 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,069 INFO | (200, 512)
2021-05-27 16:55:47,075 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,075 INFO | (200, 512)
2021-05-27 16:55:47,076 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,076 INFO | (200, 512)
2021-05-27 16:55:47,082 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,083 INFO | (200, 512)
2021-05-27 16:55:47,083 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,083 INFO | (200, 512)
2021-05-27 16:55:47,091 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,091 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  83%|████████▎ | 479/574 [00:52<00:10,  9.37it/s]

2021-05-27 16:55:47,165 INFO | INITIAL
2021-05-27 16:55:47,165 INFO | (50, 200)
2021-05-27 16:55:47,171 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,171 INFO | (50, 200, 512)
2021-05-27 16:55:47,172 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,173 INFO | (50, 200, 512)
2021-05-27 16:55:47,174 INFO | BERT LAYER
2021-05-27 16:55:47,175 INFO | (200, 512)
2021-05-27 16:55:47,176 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,176 INFO | (200, 512)
2021-05-27 16:55:47,177 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,177 INFO | (200, 512)
2021-05-27 16:55:47,184 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,184 INFO | (200, 512)
2021-05-27 16:55:47,185 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,185 INFO | (200, 512)
2021-05-27 16:55:47,192 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,192 INFO | (200, 512)
2021-05-27 16:55:47,193 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,193 INFO | (200, 512)
2021-05-27 16:55:47,200 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,200 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▎ | 480/574 [00:52<00:10,  9.37it/s]

2021-05-27 16:55:47,271 INFO | INITIAL
2021-05-27 16:55:47,272 INFO | (50, 200)
2021-05-27 16:55:47,278 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,278 INFO | (50, 200, 512)
2021-05-27 16:55:47,280 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,281 INFO | (50, 200, 512)
2021-05-27 16:55:47,282 INFO | BERT LAYER
2021-05-27 16:55:47,282 INFO | (200, 512)
2021-05-27 16:55:47,283 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,283 INFO | (200, 512)
2021-05-27 16:55:47,284 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,285 INFO | (200, 512)
2021-05-27 16:55:47,291 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,292 INFO | (200, 512)
2021-05-27 16:55:47,293 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,293 INFO | (200, 512)
2021-05-27 16:55:47,299 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,300 INFO | (200, 512)
2021-05-27 16:55:47,300 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,300 INFO | (200, 512)
2021-05-27 16:55:47,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,307 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▍ | 481/574 [00:52<00:10,  9.28it/s]

2021-05-27 16:55:47,382 INFO | INITIAL
2021-05-27 16:55:47,382 INFO | (50, 200)
2021-05-27 16:55:47,387 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,387 INFO | (50, 200, 512)
2021-05-27 16:55:47,388 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,389 INFO | (50, 200, 512)
2021-05-27 16:55:47,390 INFO | BERT LAYER
2021-05-27 16:55:47,390 INFO | (200, 512)
2021-05-27 16:55:47,391 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,391 INFO | (200, 512)
2021-05-27 16:55:47,391 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,392 INFO | (200, 512)
2021-05-27 16:55:47,398 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,398 INFO | (200, 512)
2021-05-27 16:55:47,399 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,399 INFO | (200, 512)
2021-05-27 16:55:47,405 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,406 INFO | (200, 512)
2021-05-27 16:55:47,406 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,407 INFO | (200, 512)
2021-05-27 16:55:47,412 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,412 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▍ | 482/574 [00:52<00:09,  9.31it/s]

2021-05-27 16:55:47,489 INFO | INITIAL
2021-05-27 16:55:47,490 INFO | (50, 200)
2021-05-27 16:55:47,498 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,499 INFO | (50, 200, 512)
2021-05-27 16:55:47,501 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,502 INFO | (50, 200, 512)
2021-05-27 16:55:47,503 INFO | BERT LAYER
2021-05-27 16:55:47,503 INFO | (200, 512)
2021-05-27 16:55:47,504 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,504 INFO | (200, 512)
2021-05-27 16:55:47,504 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,505 INFO | (200, 512)
2021-05-27 16:55:47,511 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,511 INFO | (200, 512)
2021-05-27 16:55:47,512 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,512 INFO | (200, 512)
2021-05-27 16:55:47,518 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,518 INFO | (200, 512)
2021-05-27 16:55:47,519 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,519 INFO | (200, 512)
2021-05-27 16:55:47,524 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,524 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▍ | 483/574 [00:52<00:09,  9.30it/s]

2021-05-27 16:55:47,596 INFO | INITIAL
2021-05-27 16:55:47,597 INFO | (50, 200)
2021-05-27 16:55:47,604 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,604 INFO | (50, 200, 512)
2021-05-27 16:55:47,606 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,606 INFO | (50, 200, 512)
2021-05-27 16:55:47,607 INFO | BERT LAYER
2021-05-27 16:55:47,608 INFO | (200, 512)
2021-05-27 16:55:47,608 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,609 INFO | (200, 512)
2021-05-27 16:55:47,610 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,610 INFO | (200, 512)
2021-05-27 16:55:47,618 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,619 INFO | (200, 512)
2021-05-27 16:55:47,619 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,620 INFO | (200, 512)
2021-05-27 16:55:47,627 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,630 INFO | (200, 512)
2021-05-27 16:55:47,630 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,633 INFO | (200, 512)
2021-05-27 16:55:47,638 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,638 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▍ | 484/574 [00:52<00:09,  9.22it/s]

2021-05-27 16:55:47,707 INFO | INITIAL
2021-05-27 16:55:47,707 INFO | (50, 200)
2021-05-27 16:55:47,714 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,715 INFO | (50, 200, 512)
2021-05-27 16:55:47,716 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,716 INFO | (50, 200, 512)
2021-05-27 16:55:47,717 INFO | BERT LAYER
2021-05-27 16:55:47,718 INFO | (200, 512)
2021-05-27 16:55:47,718 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,718 INFO | (200, 512)
2021-05-27 16:55:47,719 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,719 INFO | (200, 512)
2021-05-27 16:55:47,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,724 INFO | (200, 512)
2021-05-27 16:55:47,725 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,725 INFO | (200, 512)
2021-05-27 16:55:47,730 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,731 INFO | (200, 512)
2021-05-27 16:55:47,731 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,732 INFO | (200, 512)
2021-05-27 16:55:47,737 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,738 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  84%|████████▍ | 484/574 [00:52<00:09,  9.22it/s]

2021-05-27 16:55:47,807 INFO | INITIAL
2021-05-27 16:55:47,807 INFO | (50, 200)
2021-05-27 16:55:47,813 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,813 INFO | (50, 200, 512)
2021-05-27 16:55:47,815 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,815 INFO | (50, 200, 512)
2021-05-27 16:55:47,816 INFO | BERT LAYER
2021-05-27 16:55:47,816 INFO | (200, 512)
2021-05-27 16:55:47,816 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,817 INFO | (200, 512)
2021-05-27 16:55:47,817 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,817 INFO | (200, 512)
2021-05-27 16:55:47,825 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,825 INFO | (200, 512)
2021-05-27 16:55:47,826 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,826 INFO | (200, 512)
2021-05-27 16:55:47,831 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,832 INFO | (200, 512)
2021-05-27 16:55:47,833 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,833 INFO | (200, 512)
2021-05-27 16:55:47,841 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,841 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  85%|████████▍ | 486/574 [00:53<00:09,  9.39it/s]

2021-05-27 16:55:47,915 INFO | INITIAL
2021-05-27 16:55:47,916 INFO | (50, 200)
2021-05-27 16:55:47,923 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:47,923 INFO | (50, 200, 512)
2021-05-27 16:55:47,924 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:47,925 INFO | (50, 200, 512)
2021-05-27 16:55:47,926 INFO | BERT LAYER
2021-05-27 16:55:47,926 INFO | (200, 512)
2021-05-27 16:55:47,927 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,927 INFO | (200, 512)
2021-05-27 16:55:47,927 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,928 INFO | (200, 512)
2021-05-27 16:55:47,935 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,935 INFO | (200, 512)
2021-05-27 16:55:47,936 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,936 INFO | (200, 512)
2021-05-27 16:55:47,942 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,942 INFO | (200, 512)
2021-05-27 16:55:47,943 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:47,943 INFO | (200, 512)
2021-05-27 16:55:47,949 INFO | BERT LAYER LOOP
2021-05-27 16:55:47,950 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  85%|████████▍ | 487/574 [00:53<00:09,  9.48it/s]

2021-05-27 16:55:48,018 INFO | INITIAL
2021-05-27 16:55:48,019 INFO | (50, 200)
2021-05-27 16:55:48,027 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,027 INFO | (50, 200, 512)
2021-05-27 16:55:48,029 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,029 INFO | (50, 200, 512)
2021-05-27 16:55:48,030 INFO | BERT LAYER
2021-05-27 16:55:48,030 INFO | (200, 512)
2021-05-27 16:55:48,030 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,031 INFO | (200, 512)
2021-05-27 16:55:48,032 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,032 INFO | (200, 512)
2021-05-27 16:55:48,038 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,039 INFO | (200, 512)
2021-05-27 16:55:48,039 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,039 INFO | (200, 512)
2021-05-27 16:55:48,045 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,045 INFO | (200, 512)
2021-05-27 16:55:48,046 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,046 INFO | (200, 512)
2021-05-27 16:55:48,051 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,052 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  85%|████████▌ | 488/574 [00:53<00:09,  9.50it/s]

2021-05-27 16:55:48,123 INFO | INITIAL
2021-05-27 16:55:48,123 INFO | (50, 200)
2021-05-27 16:55:48,129 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,130 INFO | (50, 200, 512)
2021-05-27 16:55:48,131 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,131 INFO | (50, 200, 512)
2021-05-27 16:55:48,132 INFO | BERT LAYER
2021-05-27 16:55:48,133 INFO | (200, 512)
2021-05-27 16:55:48,134 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,134 INFO | (200, 512)
2021-05-27 16:55:48,135 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,135 INFO | (200, 512)
2021-05-27 16:55:48,141 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,142 INFO | (200, 512)
2021-05-27 16:55:48,143 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,143 INFO | (200, 512)
2021-05-27 16:55:48,149 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,149 INFO | (200, 512)
2021-05-27 16:55:48,150 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,151 INFO | (200, 512)
2021-05-27 16:55:48,156 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,156 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  85%|████████▌ | 489/574 [00:53<00:09,  9.36it/s]

2021-05-27 16:55:48,234 INFO | INITIAL
2021-05-27 16:55:48,234 INFO | (50, 200)
2021-05-27 16:55:48,240 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,240 INFO | (50, 200, 512)
2021-05-27 16:55:48,242 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,243 INFO | (50, 200, 512)
2021-05-27 16:55:48,244 INFO | BERT LAYER
2021-05-27 16:55:48,244 INFO | (200, 512)
2021-05-27 16:55:48,244 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,245 INFO | (200, 512)
2021-05-27 16:55:48,245 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,246 INFO | (200, 512)
2021-05-27 16:55:48,251 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,252 INFO | (200, 512)
2021-05-27 16:55:48,253 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,253 INFO | (200, 512)
2021-05-27 16:55:48,258 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,259 INFO | (200, 512)
2021-05-27 16:55:48,259 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,259 INFO | (200, 512)
2021-05-27 16:55:48,265 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,266 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  85%|████████▌ | 490/574 [00:53<00:08,  9.40it/s]

2021-05-27 16:55:48,339 INFO | INITIAL
2021-05-27 16:55:48,339 INFO | (50, 200)
2021-05-27 16:55:48,346 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,346 INFO | (50, 200, 512)
2021-05-27 16:55:48,348 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,348 INFO | (50, 200, 512)
2021-05-27 16:55:48,349 INFO | BERT LAYER
2021-05-27 16:55:48,349 INFO | (200, 512)
2021-05-27 16:55:48,350 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,350 INFO | (200, 512)
2021-05-27 16:55:48,351 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,351 INFO | (200, 512)
2021-05-27 16:55:48,357 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,358 INFO | (200, 512)
2021-05-27 16:55:48,359 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,360 INFO | (200, 512)
2021-05-27 16:55:48,366 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,366 INFO | (200, 512)
2021-05-27 16:55:48,367 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,367 INFO | (200, 512)
2021-05-27 16:55:48,374 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,374 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▌ | 491/574 [00:53<00:08,  9.47it/s]

2021-05-27 16:55:48,443 INFO | INITIAL
2021-05-27 16:55:48,443 INFO | (50, 200)
2021-05-27 16:55:48,449 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,449 INFO | (50, 200, 512)
2021-05-27 16:55:48,450 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,451 INFO | (50, 200, 512)
2021-05-27 16:55:48,452 INFO | BERT LAYER
2021-05-27 16:55:48,452 INFO | (200, 512)
2021-05-27 16:55:48,452 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,453 INFO | (200, 512)
2021-05-27 16:55:48,453 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,454 INFO | (200, 512)
2021-05-27 16:55:48,459 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,459 INFO | (200, 512)
2021-05-27 16:55:48,460 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,460 INFO | (200, 512)
2021-05-27 16:55:48,466 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,467 INFO | (200, 512)
2021-05-27 16:55:48,468 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,469 INFO | (200, 512)
2021-05-27 16:55:48,475 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,476 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▌ | 492/574 [00:53<00:08,  9.58it/s]

2021-05-27 16:55:48,544 INFO | INITIAL
2021-05-27 16:55:48,545 INFO | (50, 200)
2021-05-27 16:55:48,549 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,550 INFO | (50, 200, 512)
2021-05-27 16:55:48,551 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,552 INFO | (50, 200, 512)
2021-05-27 16:55:48,553 INFO | BERT LAYER
2021-05-27 16:55:48,553 INFO | (200, 512)
2021-05-27 16:55:48,553 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,554 INFO | (200, 512)
2021-05-27 16:55:48,554 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,554 INFO | (200, 512)
2021-05-27 16:55:48,562 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,562 INFO | (200, 512)
2021-05-27 16:55:48,563 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,563 INFO | (200, 512)
2021-05-27 16:55:48,570 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,571 INFO | (200, 512)
2021-05-27 16:55:48,571 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,571 INFO | (200, 512)
2021-05-27 16:55:48,578 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,579 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▌ | 493/574 [00:53<00:08,  9.42it/s]

2021-05-27 16:55:48,654 INFO | INITIAL
2021-05-27 16:55:48,655 INFO | (50, 200)
2021-05-27 16:55:48,660 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,661 INFO | (50, 200, 512)
2021-05-27 16:55:48,662 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,663 INFO | (50, 200, 512)
2021-05-27 16:55:48,664 INFO | BERT LAYER
2021-05-27 16:55:48,664 INFO | (200, 512)
2021-05-27 16:55:48,664 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,665 INFO | (200, 512)
2021-05-27 16:55:48,665 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,666 INFO | (200, 512)
2021-05-27 16:55:48,673 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,673 INFO | (200, 512)
2021-05-27 16:55:48,674 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,674 INFO | (200, 512)
2021-05-27 16:55:48,679 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,680 INFO | (200, 512)
2021-05-27 16:55:48,681 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,681 INFO | (200, 512)
2021-05-27 16:55:48,686 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,687 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▌ | 494/574 [00:53<00:08,  9.37it/s]

2021-05-27 16:55:48,762 INFO | INITIAL
2021-05-27 16:55:48,763 INFO | (50, 200)
2021-05-27 16:55:48,775 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,776 INFO | (50, 200, 512)
2021-05-27 16:55:48,777 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,778 INFO | (50, 200, 512)
2021-05-27 16:55:48,779 INFO | BERT LAYER
2021-05-27 16:55:48,783 INFO | (200, 512)
2021-05-27 16:55:48,783 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,786 INFO | (200, 512)
2021-05-27 16:55:48,786 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,787 INFO | (200, 512)
2021-05-27 16:55:48,800 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,803 INFO | (200, 512)
2021-05-27 16:55:48,803 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,804 INFO | (200, 512)
2021-05-27 16:55:48,811 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,811 INFO | (200, 512)
2021-05-27 16:55:48,812 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,813 INFO | (200, 512)
2021-05-27 16:55:48,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,818 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▌ | 495/574 [00:54<00:08,  8.83it/s]

2021-05-27 16:55:48,891 INFO | INITIAL
2021-05-27 16:55:48,892 INFO | (50, 200)
2021-05-27 16:55:48,899 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:48,900 INFO | (50, 200, 512)
2021-05-27 16:55:48,901 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:48,901 INFO | (50, 200, 512)
2021-05-27 16:55:48,902 INFO | BERT LAYER
2021-05-27 16:55:48,902 INFO | (200, 512)
2021-05-27 16:55:48,902 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,903 INFO | (200, 512)
2021-05-27 16:55:48,903 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,903 INFO | (200, 512)
2021-05-27 16:55:48,909 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,909 INFO | (200, 512)
2021-05-27 16:55:48,910 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,911 INFO | (200, 512)
2021-05-27 16:55:48,918 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,919 INFO | (200, 512)
2021-05-27 16:55:48,919 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:48,920 INFO | (200, 512)
2021-05-27 16:55:48,926 INFO | BERT LAYER LOOP
2021-05-27 16:55:48,927 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  86%|████████▋ | 496/574 [00:54<00:08,  9.09it/s]

2021-05-27 16:55:48,994 INFO | INITIAL
2021-05-27 16:55:48,995 INFO | (50, 200)
2021-05-27 16:55:49,003 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,003 INFO | (50, 200, 512)
2021-05-27 16:55:49,005 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,006 INFO | (50, 200, 512)
2021-05-27 16:55:49,007 INFO | BERT LAYER
2021-05-27 16:55:49,007 INFO | (200, 512)
2021-05-27 16:55:49,009 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,009 INFO | (200, 512)
2021-05-27 16:55:49,010 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,010 INFO | (200, 512)
2021-05-27 16:55:49,016 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,017 INFO | (200, 512)
2021-05-27 16:55:49,017 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,018 INFO | (200, 512)
2021-05-27 16:55:49,024 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,024 INFO | (200, 512)
2021-05-27 16:55:49,025 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,025 INFO | (200, 512)
2021-05-27 16:55:49,030 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,031 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 497/574 [00:54<00:08,  9.21it/s]

2021-05-27 16:55:49,099 INFO | INITIAL
2021-05-27 16:55:49,099 INFO | (50, 200)
2021-05-27 16:55:49,105 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,105 INFO | (50, 200, 512)
2021-05-27 16:55:49,106 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,107 INFO | (50, 200, 512)
2021-05-27 16:55:49,107 INFO | BERT LAYER
2021-05-27 16:55:49,108 INFO | (200, 512)
2021-05-27 16:55:49,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,108 INFO | (200, 512)
2021-05-27 16:55:49,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,110 INFO | (200, 512)
2021-05-27 16:55:49,114 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,114 INFO | (200, 512)
2021-05-27 16:55:49,115 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,115 INFO | (200, 512)
2021-05-27 16:55:49,120 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,120 INFO | (200, 512)
2021-05-27 16:55:49,120 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,121 INFO | (200, 512)
2021-05-27 16:55:49,127 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,128 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 498/574 [00:54<00:08,  9.37it/s]

2021-05-27 16:55:49,201 INFO | INITIAL
2021-05-27 16:55:49,203 INFO | (50, 200)
2021-05-27 16:55:49,208 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,209 INFO | (50, 200, 512)
2021-05-27 16:55:49,210 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,210 INFO | (50, 200, 512)
2021-05-27 16:55:49,211 INFO | BERT LAYER
2021-05-27 16:55:49,212 INFO | (200, 512)
2021-05-27 16:55:49,213 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,213 INFO | (200, 512)
2021-05-27 16:55:49,214 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,215 INFO | (200, 512)
2021-05-27 16:55:49,221 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,222 INFO | (200, 512)
2021-05-27 16:55:49,222 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,223 INFO | (200, 512)
2021-05-27 16:55:49,229 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,230 INFO | (200, 512)
2021-05-27 16:55:49,230 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,231 INFO | (200, 512)
2021-05-27 16:55:49,237 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,238 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 499/574 [00:54<00:08,  9.35it/s]

2021-05-27 16:55:49,309 INFO | INITIAL
2021-05-27 16:55:49,309 INFO | (50, 200)
2021-05-27 16:55:49,315 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,315 INFO | (50, 200, 512)
2021-05-27 16:55:49,317 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,317 INFO | (50, 200, 512)
2021-05-27 16:55:49,318 INFO | BERT LAYER
2021-05-27 16:55:49,318 INFO | (200, 512)
2021-05-27 16:55:49,319 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,319 INFO | (200, 512)
2021-05-27 16:55:49,319 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,320 INFO | (200, 512)
2021-05-27 16:55:49,325 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,326 INFO | (200, 512)
2021-05-27 16:55:49,326 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,326 INFO | (200, 512)
2021-05-27 16:55:49,332 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,333 INFO | (200, 512)
2021-05-27 16:55:49,333 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,334 INFO | (200, 512)
2021-05-27 16:55:49,340 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,340 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 500/574 [00:54<00:07,  9.48it/s]

2021-05-27 16:55:49,411 INFO | INITIAL
2021-05-27 16:55:49,411 INFO | (50, 200)
2021-05-27 16:55:49,417 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,417 INFO | (50, 200, 512)
2021-05-27 16:55:49,419 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,419 INFO | (50, 200, 512)
2021-05-27 16:55:49,420 INFO | BERT LAYER
2021-05-27 16:55:49,420 INFO | (200, 512)
2021-05-27 16:55:49,421 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,421 INFO | (200, 512)
2021-05-27 16:55:49,422 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,422 INFO | (200, 512)
2021-05-27 16:55:49,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,429 INFO | (200, 512)
2021-05-27 16:55:49,430 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,431 INFO | (200, 512)
2021-05-27 16:55:49,438 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,438 INFO | (200, 512)
2021-05-27 16:55:49,439 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,439 INFO | (200, 512)
2021-05-27 16:55:49,445 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,446 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 501/574 [00:54<00:07,  9.51it/s]

2021-05-27 16:55:49,515 INFO | INITIAL
2021-05-27 16:55:49,515 INFO | (50, 200)
2021-05-27 16:55:49,520 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,521 INFO | (50, 200, 512)
2021-05-27 16:55:49,522 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,523 INFO | (50, 200, 512)
2021-05-27 16:55:49,523 INFO | BERT LAYER
2021-05-27 16:55:49,524 INFO | (200, 512)
2021-05-27 16:55:49,524 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,524 INFO | (200, 512)
2021-05-27 16:55:49,525 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,525 INFO | (200, 512)
2021-05-27 16:55:49,531 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,532 INFO | (200, 512)
2021-05-27 16:55:49,532 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,533 INFO | (200, 512)
2021-05-27 16:55:49,540 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,540 INFO | (200, 512)
2021-05-27 16:55:49,541 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,541 INFO | (200, 512)
2021-05-27 16:55:49,546 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,547 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  87%|████████▋ | 502/574 [00:54<00:07,  9.58it/s]

2021-05-27 16:55:49,618 INFO | INITIAL
2021-05-27 16:55:49,618 INFO | (50, 200)
2021-05-27 16:55:49,624 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,625 INFO | (50, 200, 512)
2021-05-27 16:55:49,626 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,626 INFO | (50, 200, 512)
2021-05-27 16:55:49,627 INFO | BERT LAYER
2021-05-27 16:55:49,627 INFO | (200, 512)
2021-05-27 16:55:49,628 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,628 INFO | (200, 512)
2021-05-27 16:55:49,628 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,629 INFO | (200, 512)
2021-05-27 16:55:49,635 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,636 INFO | (200, 512)
2021-05-27 16:55:49,636 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,637 INFO | (200, 512)
2021-05-27 16:55:49,644 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,645 INFO | (200, 512)
2021-05-27 16:55:49,646 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,646 INFO | (200, 512)
2021-05-27 16:55:49,653 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,653 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  88%|████████▊ | 503/574 [00:54<00:07,  9.55it/s]

2021-05-27 16:55:49,723 INFO | INITIAL
2021-05-27 16:55:49,724 INFO | (50, 200)
2021-05-27 16:55:49,730 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,731 INFO | (50, 200, 512)
2021-05-27 16:55:49,732 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,733 INFO | (50, 200, 512)
2021-05-27 16:55:49,734 INFO | BERT LAYER
2021-05-27 16:55:49,735 INFO | (200, 512)
2021-05-27 16:55:49,735 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,736 INFO | (200, 512)
2021-05-27 16:55:49,736 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,737 INFO | (200, 512)
2021-05-27 16:55:49,743 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,744 INFO | (200, 512)
2021-05-27 16:55:49,745 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,745 INFO | (200, 512)
2021-05-27 16:55:49,750 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,750 INFO | (200, 512)
2021-05-27 16:55:49,751 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,752 INFO | (200, 512)
2021-05-27 16:55:49,756 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,757 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  88%|████████▊ | 504/574 [00:55<00:07,  9.51it/s]

2021-05-27 16:55:49,829 INFO | INITIAL
2021-05-27 16:55:49,830 INFO | (50, 200)
2021-05-27 16:55:49,834 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,835 INFO | (50, 200, 512)
2021-05-27 16:55:49,837 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,838 INFO | (50, 200, 512)
2021-05-27 16:55:49,839 INFO | BERT LAYER
2021-05-27 16:55:49,839 INFO | (200, 512)
2021-05-27 16:55:49,840 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,840 INFO | (200, 512)
2021-05-27 16:55:49,841 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,841 INFO | (200, 512)
2021-05-27 16:55:49,847 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,848 INFO | (200, 512)
2021-05-27 16:55:49,848 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,848 INFO | (200, 512)
2021-05-27 16:55:49,853 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,854 INFO | (200, 512)
2021-05-27 16:55:49,855 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,855 INFO | (200, 512)
2021-05-27 16:55:49,863 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,864 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  88%|████████▊ | 505/574 [00:55<00:07,  9.49it/s]

2021-05-27 16:55:49,935 INFO | INITIAL
2021-05-27 16:55:49,936 INFO | (50, 200)
2021-05-27 16:55:49,943 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:49,944 INFO | (50, 200, 512)
2021-05-27 16:55:49,945 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:49,946 INFO | (50, 200, 512)
2021-05-27 16:55:49,947 INFO | BERT LAYER
2021-05-27 16:55:49,948 INFO | (200, 512)
2021-05-27 16:55:49,948 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,948 INFO | (200, 512)
2021-05-27 16:55:49,949 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,949 INFO | (200, 512)
2021-05-27 16:55:49,955 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,956 INFO | (200, 512)
2021-05-27 16:55:49,956 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,957 INFO | (200, 512)
2021-05-27 16:55:49,962 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,963 INFO | (200, 512)
2021-05-27 16:55:49,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:49,964 INFO | (200, 512)
2021-05-27 16:55:49,969 INFO | BERT LAYER LOOP
2021-05-27 16:55:49,969 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  88%|████████▊ | 506/574 [00:55<00:07,  9.42it/s]

2021-05-27 16:55:50,043 INFO | INITIAL
2021-05-27 16:55:50,044 INFO | (50, 200)
2021-05-27 16:55:50,049 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,049 INFO | (50, 200, 512)
2021-05-27 16:55:50,051 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,051 INFO | (50, 200, 512)
2021-05-27 16:55:50,052 INFO | BERT LAYER
2021-05-27 16:55:50,052 INFO | (200, 512)
2021-05-27 16:55:50,053 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,053 INFO | (200, 512)
2021-05-27 16:55:50,054 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,055 INFO | (200, 512)
2021-05-27 16:55:50,059 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,060 INFO | (200, 512)
2021-05-27 16:55:50,060 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,061 INFO | (200, 512)
2021-05-27 16:55:50,067 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,068 INFO | (200, 512)
2021-05-27 16:55:50,068 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,069 INFO | (200, 512)
2021-05-27 16:55:50,078 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,079 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  88%|████████▊ | 507/574 [00:55<00:07,  9.25it/s]

2021-05-27 16:55:50,156 INFO | INITIAL
2021-05-27 16:55:50,157 INFO | (50, 200)
2021-05-27 16:55:50,165 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,165 INFO | (50, 200, 512)
2021-05-27 16:55:50,166 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,167 INFO | (50, 200, 512)
2021-05-27 16:55:50,168 INFO | BERT LAYER
2021-05-27 16:55:50,168 INFO | (200, 512)
2021-05-27 16:55:50,168 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,168 INFO | (200, 512)
2021-05-27 16:55:50,169 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,169 INFO | (200, 512)
2021-05-27 16:55:50,175 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,176 INFO | (200, 512)
2021-05-27 16:55:50,176 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,176 INFO | (200, 512)
2021-05-27 16:55:50,181 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,182 INFO | (200, 512)
2021-05-27 16:55:50,182 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,183 INFO | (200, 512)
2021-05-27 16:55:50,188 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,189 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▊ | 508/574 [00:55<00:07,  9.42it/s]

2021-05-27 16:55:50,258 INFO | INITIAL
2021-05-27 16:55:50,258 INFO | (50, 200)
2021-05-27 16:55:50,265 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,266 INFO | (50, 200, 512)
2021-05-27 16:55:50,268 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,268 INFO | (50, 200, 512)
2021-05-27 16:55:50,269 INFO | BERT LAYER
2021-05-27 16:55:50,270 INFO | (200, 512)
2021-05-27 16:55:50,271 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,271 INFO | (200, 512)
2021-05-27 16:55:50,271 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,272 INFO | (200, 512)
2021-05-27 16:55:50,278 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,278 INFO | (200, 512)
2021-05-27 16:55:50,279 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,280 INFO | (200, 512)
2021-05-27 16:55:50,286 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,287 INFO | (200, 512)
2021-05-27 16:55:50,287 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,288 INFO | (200, 512)
2021-05-27 16:55:50,294 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,295 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▊ | 509/574 [00:55<00:06,  9.38it/s]

2021-05-27 16:55:50,365 INFO | INITIAL
2021-05-27 16:55:50,366 INFO | (50, 200)
2021-05-27 16:55:50,373 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,373 INFO | (50, 200, 512)
2021-05-27 16:55:50,374 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,375 INFO | (50, 200, 512)
2021-05-27 16:55:50,376 INFO | BERT LAYER
2021-05-27 16:55:50,376 INFO | (200, 512)
2021-05-27 16:55:50,377 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,377 INFO | (200, 512)
2021-05-27 16:55:50,377 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,378 INFO | (200, 512)
2021-05-27 16:55:50,383 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,384 INFO | (200, 512)
2021-05-27 16:55:50,384 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,384 INFO | (200, 512)
2021-05-27 16:55:50,389 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,390 INFO | (200, 512)
2021-05-27 16:55:50,390 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,391 INFO | (200, 512)
2021-05-27 16:55:50,396 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,397 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▉ | 510/574 [00:55<00:06,  9.43it/s]

2021-05-27 16:55:50,471 INFO | INITIAL
2021-05-27 16:55:50,472 INFO | (50, 200)
2021-05-27 16:55:50,477 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,478 INFO | (50, 200, 512)
2021-05-27 16:55:50,479 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,480 INFO | (50, 200, 512)
2021-05-27 16:55:50,481 INFO | BERT LAYER
2021-05-27 16:55:50,482 INFO | (200, 512)
2021-05-27 16:55:50,483 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,483 INFO | (200, 512)
2021-05-27 16:55:50,484 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,484 INFO | (200, 512)
2021-05-27 16:55:50,492 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,493 INFO | (200, 512)
2021-05-27 16:55:50,493 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,494 INFO | (200, 512)
2021-05-27 16:55:50,501 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,501 INFO | (200, 512)
2021-05-27 16:55:50,502 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,502 INFO | (200, 512)
2021-05-27 16:55:50,507 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,507 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▉ | 511/574 [00:55<00:06,  9.32it/s]

2021-05-27 16:55:50,580 INFO | INITIAL
2021-05-27 16:55:50,580 INFO | (50, 200)
2021-05-27 16:55:50,586 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,586 INFO | (50, 200, 512)
2021-05-27 16:55:50,587 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,588 INFO | (50, 200, 512)
2021-05-27 16:55:50,589 INFO | BERT LAYER
2021-05-27 16:55:50,589 INFO | (200, 512)
2021-05-27 16:55:50,589 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,590 INFO | (200, 512)
2021-05-27 16:55:50,590 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,590 INFO | (200, 512)
2021-05-27 16:55:50,597 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,597 INFO | (200, 512)
2021-05-27 16:55:50,598 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,598 INFO | (200, 512)
2021-05-27 16:55:50,605 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,606 INFO | (200, 512)
2021-05-27 16:55:50,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,607 INFO | (200, 512)
2021-05-27 16:55:50,613 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,614 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▉ | 512/574 [00:55<00:06,  9.30it/s]

2021-05-27 16:55:50,692 INFO | INITIAL
2021-05-27 16:55:50,692 INFO | (50, 200)
2021-05-27 16:55:50,700 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,701 INFO | (50, 200, 512)
2021-05-27 16:55:50,702 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,703 INFO | (50, 200, 512)
2021-05-27 16:55:50,703 INFO | BERT LAYER
2021-05-27 16:55:50,704 INFO | (200, 512)
2021-05-27 16:55:50,704 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,705 INFO | (200, 512)
2021-05-27 16:55:50,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,705 INFO | (200, 512)
2021-05-27 16:55:50,711 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,711 INFO | (200, 512)
2021-05-27 16:55:50,712 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,712 INFO | (200, 512)
2021-05-27 16:55:50,717 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,718 INFO | (200, 512)
2021-05-27 16:55:50,718 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,719 INFO | (200, 512)
2021-05-27 16:55:50,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,724 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  89%|████████▉ | 513/574 [00:55<00:06,  9.30it/s]

2021-05-27 16:55:50,795 INFO | INITIAL
2021-05-27 16:55:50,796 INFO | (50, 200)
2021-05-27 16:55:50,802 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,803 INFO | (50, 200, 512)
2021-05-27 16:55:50,804 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,805 INFO | (50, 200, 512)
2021-05-27 16:55:50,806 INFO | BERT LAYER
2021-05-27 16:55:50,806 INFO | (200, 512)
2021-05-27 16:55:50,807 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,807 INFO | (200, 512)
2021-05-27 16:55:50,808 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,808 INFO | (200, 512)
2021-05-27 16:55:50,814 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,815 INFO | (200, 512)
2021-05-27 16:55:50,816 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,816 INFO | (200, 512)
2021-05-27 16:55:50,824 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,824 INFO | (200, 512)
2021-05-27 16:55:50,825 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,826 INFO | (200, 512)
2021-05-27 16:55:50,834 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,834 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|████████▉ | 514/574 [00:56<00:06,  9.24it/s]

2021-05-27 16:55:50,906 INFO | INITIAL
2021-05-27 16:55:50,906 INFO | (50, 200)
2021-05-27 16:55:50,913 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:50,913 INFO | (50, 200, 512)
2021-05-27 16:55:50,915 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:50,916 INFO | (50, 200, 512)
2021-05-27 16:55:50,917 INFO | BERT LAYER
2021-05-27 16:55:50,917 INFO | (200, 512)
2021-05-27 16:55:50,918 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,918 INFO | (200, 512)
2021-05-27 16:55:50,918 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,919 INFO | (200, 512)
2021-05-27 16:55:50,924 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,925 INFO | (200, 512)
2021-05-27 16:55:50,925 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,926 INFO | (200, 512)
2021-05-27 16:55:50,932 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,932 INFO | (200, 512)
2021-05-27 16:55:50,933 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:50,933 INFO | (200, 512)
2021-05-27 16:55:50,939 INFO | BERT LAYER LOOP
2021-05-27 16:55:50,940 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|████████▉ | 515/574 [00:56<00:06,  9.23it/s]

2021-05-27 16:55:51,014 INFO | INITIAL
2021-05-27 16:55:51,015 INFO | (50, 200)
2021-05-27 16:55:51,020 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,020 INFO | (50, 200, 512)
2021-05-27 16:55:51,022 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,022 INFO | (50, 200, 512)
2021-05-27 16:55:51,023 INFO | BERT LAYER
2021-05-27 16:55:51,023 INFO | (200, 512)
2021-05-27 16:55:51,024 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,024 INFO | (200, 512)
2021-05-27 16:55:51,024 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,025 INFO | (200, 512)
2021-05-27 16:55:51,032 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,032 INFO | (200, 512)
2021-05-27 16:55:51,033 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,033 INFO | (200, 512)
2021-05-27 16:55:51,041 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,042 INFO | (200, 512)
2021-05-27 16:55:51,042 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,043 INFO | (200, 512)
2021-05-27 16:55:51,049 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,050 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|████████▉ | 516/574 [00:56<00:06,  9.28it/s]

2021-05-27 16:55:51,121 INFO | INITIAL
2021-05-27 16:55:51,122 INFO | (50, 200)
2021-05-27 16:55:51,127 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,128 INFO | (50, 200, 512)
2021-05-27 16:55:51,129 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,129 INFO | (50, 200, 512)
2021-05-27 16:55:51,130 INFO | BERT LAYER
2021-05-27 16:55:51,130 INFO | (200, 512)
2021-05-27 16:55:51,131 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,131 INFO | (200, 512)
2021-05-27 16:55:51,131 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,132 INFO | (200, 512)
2021-05-27 16:55:51,138 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,139 INFO | (200, 512)
2021-05-27 16:55:51,139 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,139 INFO | (200, 512)
2021-05-27 16:55:51,146 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,146 INFO | (200, 512)
2021-05-27 16:55:51,147 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,147 INFO | (200, 512)
2021-05-27 16:55:51,153 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,153 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|█████████ | 517/574 [00:56<00:06,  9.37it/s]

2021-05-27 16:55:51,225 INFO | INITIAL
2021-05-27 16:55:51,226 INFO | (50, 200)
2021-05-27 16:55:51,232 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,233 INFO | (50, 200, 512)
2021-05-27 16:55:51,234 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,234 INFO | (50, 200, 512)
2021-05-27 16:55:51,235 INFO | BERT LAYER
2021-05-27 16:55:51,235 INFO | (200, 512)
2021-05-27 16:55:51,236 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,236 INFO | (200, 512)
2021-05-27 16:55:51,237 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,237 INFO | (200, 512)
2021-05-27 16:55:51,243 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,244 INFO | (200, 512)
2021-05-27 16:55:51,244 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,244 INFO | (200, 512)
2021-05-27 16:55:51,250 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,251 INFO | (200, 512)
2021-05-27 16:55:51,251 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,252 INFO | (200, 512)
2021-05-27 16:55:51,259 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,260 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|█████████ | 518/574 [00:56<00:05,  9.49it/s]

2021-05-27 16:55:51,328 INFO | INITIAL
2021-05-27 16:55:51,328 INFO | (50, 200)
2021-05-27 16:55:51,334 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,335 INFO | (50, 200, 512)
2021-05-27 16:55:51,336 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,337 INFO | (50, 200, 512)
2021-05-27 16:55:51,338 INFO | BERT LAYER
2021-05-27 16:55:51,339 INFO | (200, 512)
2021-05-27 16:55:51,339 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,340 INFO | (200, 512)
2021-05-27 16:55:51,340 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,341 INFO | (200, 512)
2021-05-27 16:55:51,348 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,349 INFO | (200, 512)
2021-05-27 16:55:51,349 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,350 INFO | (200, 512)
2021-05-27 16:55:51,356 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,357 INFO | (200, 512)
2021-05-27 16:55:51,357 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,358 INFO | (200, 512)
2021-05-27 16:55:51,364 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,364 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  90%|█████████ | 519/574 [00:56<00:05,  9.38it/s]

2021-05-27 16:55:51,437 INFO | INITIAL
2021-05-27 16:55:51,437 INFO | (50, 200)
2021-05-27 16:55:51,442 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,443 INFO | (50, 200, 512)
2021-05-27 16:55:51,444 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,444 INFO | (50, 200, 512)
2021-05-27 16:55:51,445 INFO | BERT LAYER
2021-05-27 16:55:51,446 INFO | (200, 512)
2021-05-27 16:55:51,446 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,446 INFO | (200, 512)
2021-05-27 16:55:51,447 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,447 INFO | (200, 512)
2021-05-27 16:55:51,454 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,454 INFO | (200, 512)
2021-05-27 16:55:51,455 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,455 INFO | (200, 512)
2021-05-27 16:55:51,462 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,462 INFO | (200, 512)
2021-05-27 16:55:51,463 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,463 INFO | (200, 512)
2021-05-27 16:55:51,472 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,472 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████ | 520/574 [00:56<00:05,  9.22it/s]

2021-05-27 16:55:51,549 INFO | INITIAL
2021-05-27 16:55:51,550 INFO | (50, 200)
2021-05-27 16:55:51,555 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,556 INFO | (50, 200, 512)
2021-05-27 16:55:51,557 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,558 INFO | (50, 200, 512)
2021-05-27 16:55:51,560 INFO | BERT LAYER
2021-05-27 16:55:51,560 INFO | (200, 512)
2021-05-27 16:55:51,561 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,561 INFO | (200, 512)
2021-05-27 16:55:51,562 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,562 INFO | (200, 512)
2021-05-27 16:55:51,569 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,570 INFO | (200, 512)
2021-05-27 16:55:51,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,571 INFO | (200, 512)
2021-05-27 16:55:51,577 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,578 INFO | (200, 512)
2021-05-27 16:55:51,578 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,578 INFO | (200, 512)
2021-05-27 16:55:51,585 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,585 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████ | 521/574 [00:56<00:05,  9.22it/s]

2021-05-27 16:55:51,658 INFO | INITIAL
2021-05-27 16:55:51,658 INFO | (50, 200)
2021-05-27 16:55:51,665 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,666 INFO | (50, 200, 512)
2021-05-27 16:55:51,667 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,668 INFO | (50, 200, 512)
2021-05-27 16:55:51,669 INFO | BERT LAYER
2021-05-27 16:55:51,669 INFO | (200, 512)
2021-05-27 16:55:51,670 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,670 INFO | (200, 512)
2021-05-27 16:55:51,671 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,671 INFO | (200, 512)
2021-05-27 16:55:51,678 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,678 INFO | (200, 512)
2021-05-27 16:55:51,679 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,679 INFO | (200, 512)
2021-05-27 16:55:51,685 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,685 INFO | (200, 512)
2021-05-27 16:55:51,686 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,686 INFO | (200, 512)
2021-05-27 16:55:51,694 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,694 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████ | 522/574 [00:56<00:05,  9.24it/s]

2021-05-27 16:55:51,766 INFO | INITIAL
2021-05-27 16:55:51,766 INFO | (50, 200)
2021-05-27 16:55:51,773 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,774 INFO | (50, 200, 512)
2021-05-27 16:55:51,775 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,775 INFO | (50, 200, 512)
2021-05-27 16:55:51,776 INFO | BERT LAYER
2021-05-27 16:55:51,777 INFO | (200, 512)
2021-05-27 16:55:51,777 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,777 INFO | (200, 512)
2021-05-27 16:55:51,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,778 INFO | (200, 512)
2021-05-27 16:55:51,784 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,784 INFO | (200, 512)
2021-05-27 16:55:51,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,785 INFO | (200, 512)
2021-05-27 16:55:51,790 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,790 INFO | (200, 512)
2021-05-27 16:55:51,791 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,791 INFO | (200, 512)
2021-05-27 16:55:51,796 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,797 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████ | 523/574 [00:57<00:05,  9.45it/s]

2021-05-27 16:55:51,866 INFO | INITIAL
2021-05-27 16:55:51,866 INFO | (50, 200)
2021-05-27 16:55:51,872 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,872 INFO | (50, 200, 512)
2021-05-27 16:55:51,874 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,874 INFO | (50, 200, 512)
2021-05-27 16:55:51,875 INFO | BERT LAYER
2021-05-27 16:55:51,876 INFO | (200, 512)
2021-05-27 16:55:51,876 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,877 INFO | (200, 512)
2021-05-27 16:55:51,878 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,879 INFO | (200, 512)
2021-05-27 16:55:51,885 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,886 INFO | (200, 512)
2021-05-27 16:55:51,886 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,887 INFO | (200, 512)
2021-05-27 16:55:51,893 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,894 INFO | (200, 512)
2021-05-27 16:55:51,894 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,895 INFO | (200, 512)
2021-05-27 16:55:51,901 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,902 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████▏| 524/574 [00:57<00:05,  9.39it/s]

2021-05-27 16:55:51,974 INFO | INITIAL
2021-05-27 16:55:51,974 INFO | (50, 200)
2021-05-27 16:55:51,978 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:51,979 INFO | (50, 200, 512)
2021-05-27 16:55:51,980 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:51,981 INFO | (50, 200, 512)
2021-05-27 16:55:51,982 INFO | BERT LAYER
2021-05-27 16:55:51,982 INFO | (200, 512)
2021-05-27 16:55:51,983 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,983 INFO | (200, 512)
2021-05-27 16:55:51,983 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,983 INFO | (200, 512)
2021-05-27 16:55:51,988 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,989 INFO | (200, 512)
2021-05-27 16:55:51,989 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,990 INFO | (200, 512)
2021-05-27 16:55:51,995 INFO | BERT LAYER LOOP
2021-05-27 16:55:51,995 INFO | (200, 512)
2021-05-27 16:55:51,996 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:51,996 INFO | (200, 512)
2021-05-27 16:55:52,002 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,003 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  91%|█████████▏| 524/574 [00:57<00:05,  9.39it/s]

2021-05-27 16:55:52,072 INFO | INITIAL
2021-05-27 16:55:52,073 INFO | (50, 200)
2021-05-27 16:55:52,079 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,080 INFO | (50, 200, 512)
2021-05-27 16:55:52,081 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,082 INFO | (50, 200, 512)
2021-05-27 16:55:52,083 INFO | BERT LAYER
2021-05-27 16:55:52,083 INFO | (200, 512)
2021-05-27 16:55:52,084 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,084 INFO | (200, 512)
2021-05-27 16:55:52,085 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,085 INFO | (200, 512)
2021-05-27 16:55:52,091 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,091 INFO | (200, 512)
2021-05-27 16:55:52,092 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,092 INFO | (200, 512)
2021-05-27 16:55:52,099 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,099 INFO | (200, 512)
2021-05-27 16:55:52,100 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,100 INFO | (200, 512)
2021-05-27 16:55:52,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,108 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  92%|█████████▏| 526/574 [00:57<00:05,  9.46it/s]

2021-05-27 16:55:52,183 INFO | INITIAL
2021-05-27 16:55:52,184 INFO | (50, 200)
2021-05-27 16:55:52,189 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,189 INFO | (50, 200, 512)
2021-05-27 16:55:52,191 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,191 INFO | (50, 200, 512)
2021-05-27 16:55:52,192 INFO | BERT LAYER
2021-05-27 16:55:52,192 INFO | (200, 512)
2021-05-27 16:55:52,193 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,193 INFO | (200, 512)
2021-05-27 16:55:52,193 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,194 INFO | (200, 512)
2021-05-27 16:55:52,199 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,199 INFO | (200, 512)
2021-05-27 16:55:52,200 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,200 INFO | (200, 512)
2021-05-27 16:55:52,207 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,207 INFO | (200, 512)
2021-05-27 16:55:52,208 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,209 INFO | (200, 512)
2021-05-27 16:55:52,215 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,215 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  92%|█████████▏| 527/574 [00:57<00:04,  9.55it/s]

2021-05-27 16:55:52,285 INFO | INITIAL
2021-05-27 16:55:52,285 INFO | (50, 200)
2021-05-27 16:55:52,290 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,291 INFO | (50, 200, 512)
2021-05-27 16:55:52,292 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,293 INFO | (50, 200, 512)
2021-05-27 16:55:52,293 INFO | BERT LAYER
2021-05-27 16:55:52,294 INFO | (200, 512)
2021-05-27 16:55:52,294 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,294 INFO | (200, 512)
2021-05-27 16:55:52,295 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,295 INFO | (200, 512)
2021-05-27 16:55:52,301 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,301 INFO | (200, 512)
2021-05-27 16:55:52,302 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,302 INFO | (200, 512)
2021-05-27 16:55:52,308 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,309 INFO | (200, 512)
2021-05-27 16:55:52,309 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,310 INFO | (200, 512)
2021-05-27 16:55:52,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,316 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  92%|█████████▏| 528/574 [00:57<00:04,  9.36it/s]

2021-05-27 16:55:52,398 INFO | INITIAL
2021-05-27 16:55:52,399 INFO | (50, 200)
2021-05-27 16:55:52,404 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,404 INFO | (50, 200, 512)
2021-05-27 16:55:52,406 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,406 INFO | (50, 200, 512)
2021-05-27 16:55:52,407 INFO | BERT LAYER
2021-05-27 16:55:52,408 INFO | (200, 512)
2021-05-27 16:55:52,408 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,409 INFO | (200, 512)
2021-05-27 16:55:52,410 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,411 INFO | (200, 512)
2021-05-27 16:55:52,417 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,418 INFO | (200, 512)
2021-05-27 16:55:52,418 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,418 INFO | (200, 512)
2021-05-27 16:55:52,425 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,426 INFO | (200, 512)
2021-05-27 16:55:52,426 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,426 INFO | (200, 512)
2021-05-27 16:55:52,431 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,432 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  92%|█████████▏| 529/574 [00:57<00:04,  9.27it/s]

2021-05-27 16:55:52,509 INFO | INITIAL
2021-05-27 16:55:52,509 INFO | (50, 200)
2021-05-27 16:55:52,515 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,516 INFO | (50, 200, 512)
2021-05-27 16:55:52,517 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,517 INFO | (50, 200, 512)
2021-05-27 16:55:52,518 INFO | BERT LAYER
2021-05-27 16:55:52,518 INFO | (200, 512)
2021-05-27 16:55:52,519 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,519 INFO | (200, 512)
2021-05-27 16:55:52,519 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,521 INFO | (200, 512)
2021-05-27 16:55:52,526 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,526 INFO | (200, 512)
2021-05-27 16:55:52,527 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,527 INFO | (200, 512)
2021-05-27 16:55:52,533 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,533 INFO | (200, 512)
2021-05-27 16:55:52,534 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,534 INFO | (200, 512)
2021-05-27 16:55:52,540 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,541 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  92%|█████████▏| 530/574 [00:57<00:04,  9.31it/s]

2021-05-27 16:55:52,615 INFO | INITIAL
2021-05-27 16:55:52,615 INFO | (50, 200)
2021-05-27 16:55:52,622 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,622 INFO | (50, 200, 512)
2021-05-27 16:55:52,627 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,628 INFO | (50, 200, 512)
2021-05-27 16:55:52,629 INFO | BERT LAYER
2021-05-27 16:55:52,629 INFO | (200, 512)
2021-05-27 16:55:52,630 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,631 INFO | (200, 512)
2021-05-27 16:55:52,632 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,633 INFO | (200, 512)
2021-05-27 16:55:52,639 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,640 INFO | (200, 512)
2021-05-27 16:55:52,642 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,643 INFO | (200, 512)
2021-05-27 16:55:52,649 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,649 INFO | (200, 512)
2021-05-27 16:55:52,650 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,650 INFO | (200, 512)
2021-05-27 16:55:52,655 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,656 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 531/574 [00:57<00:04,  9.20it/s]

2021-05-27 16:55:52,727 INFO | INITIAL
2021-05-27 16:55:52,727 INFO | (50, 200)
2021-05-27 16:55:52,732 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,733 INFO | (50, 200, 512)
2021-05-27 16:55:52,734 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,734 INFO | (50, 200, 512)
2021-05-27 16:55:52,735 INFO | BERT LAYER
2021-05-27 16:55:52,736 INFO | (200, 512)
2021-05-27 16:55:52,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,736 INFO | (200, 512)
2021-05-27 16:55:52,737 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,737 INFO | (200, 512)
2021-05-27 16:55:52,748 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,749 INFO | (200, 512)
2021-05-27 16:55:52,749 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,749 INFO | (200, 512)
2021-05-27 16:55:52,755 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,756 INFO | (200, 512)
2021-05-27 16:55:52,756 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,757 INFO | (200, 512)
2021-05-27 16:55:52,763 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,764 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 532/574 [00:58<00:04,  9.42it/s]

2021-05-27 16:55:52,827 INFO | INITIAL
2021-05-27 16:55:52,827 INFO | (50, 200)
2021-05-27 16:55:52,834 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,835 INFO | (50, 200, 512)
2021-05-27 16:55:52,836 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,837 INFO | (50, 200, 512)
2021-05-27 16:55:52,838 INFO | BERT LAYER
2021-05-27 16:55:52,838 INFO | (200, 512)
2021-05-27 16:55:52,838 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,839 INFO | (200, 512)
2021-05-27 16:55:52,839 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,839 INFO | (200, 512)
2021-05-27 16:55:52,846 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,846 INFO | (200, 512)
2021-05-27 16:55:52,847 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,847 INFO | (200, 512)
2021-05-27 16:55:52,853 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,853 INFO | (200, 512)
2021-05-27 16:55:52,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,854 INFO | (200, 512)
2021-05-27 16:55:52,859 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,859 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 532/574 [00:58<00:04,  9.42it/s]

2021-05-27 16:55:52,925 INFO | INITIAL
2021-05-27 16:55:52,926 INFO | (50, 200)
2021-05-27 16:55:52,932 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:52,933 INFO | (50, 200, 512)
2021-05-27 16:55:52,934 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:52,934 INFO | (50, 200, 512)
2021-05-27 16:55:52,935 INFO | BERT LAYER
2021-05-27 16:55:52,935 INFO | (200, 512)
2021-05-27 16:55:52,936 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,937 INFO | (200, 512)
2021-05-27 16:55:52,937 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,938 INFO | (200, 512)
2021-05-27 16:55:52,944 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,945 INFO | (200, 512)
2021-05-27 16:55:52,945 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,946 INFO | (200, 512)
2021-05-27 16:55:52,951 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,952 INFO | (200, 512)
2021-05-27 16:55:52,952 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:52,952 INFO | (200, 512)
2021-05-27 16:55:52,957 INFO | BERT LAYER LOOP
2021-05-27 16:55:52,958 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 534/574 [00:58<00:04,  9.53it/s]

2021-05-27 16:55:53,034 INFO | INITIAL
2021-05-27 16:55:53,034 INFO | (50, 200)
2021-05-27 16:55:53,039 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,040 INFO | (50, 200, 512)
2021-05-27 16:55:53,041 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,042 INFO | (50, 200, 512)
2021-05-27 16:55:53,043 INFO | BERT LAYER
2021-05-27 16:55:53,043 INFO | (200, 512)
2021-05-27 16:55:53,043 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,044 INFO | (200, 512)
2021-05-27 16:55:53,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,044 INFO | (200, 512)
2021-05-27 16:55:53,051 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,052 INFO | (200, 512)
2021-05-27 16:55:53,053 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,053 INFO | (200, 512)
2021-05-27 16:55:53,060 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,060 INFO | (200, 512)
2021-05-27 16:55:53,061 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,061 INFO | (200, 512)
2021-05-27 16:55:53,068 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,068 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 535/574 [00:58<00:04,  9.57it/s]

2021-05-27 16:55:53,138 INFO | INITIAL
2021-05-27 16:55:53,140 INFO | (50, 200)
2021-05-27 16:55:53,148 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,148 INFO | (50, 200, 512)
2021-05-27 16:55:53,150 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,150 INFO | (50, 200, 512)
2021-05-27 16:55:53,151 INFO | BERT LAYER
2021-05-27 16:55:53,151 INFO | (200, 512)
2021-05-27 16:55:53,152 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,152 INFO | (200, 512)
2021-05-27 16:55:53,153 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,153 INFO | (200, 512)
2021-05-27 16:55:53,159 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,160 INFO | (200, 512)
2021-05-27 16:55:53,160 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,160 INFO | (200, 512)
2021-05-27 16:55:53,167 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,167 INFO | (200, 512)
2021-05-27 16:55:53,168 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,169 INFO | (200, 512)
2021-05-27 16:55:53,175 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,175 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  93%|█████████▎| 536/574 [00:58<00:04,  9.42it/s]

2021-05-27 16:55:53,248 INFO | INITIAL
2021-05-27 16:55:53,248 INFO | (50, 200)
2021-05-27 16:55:53,253 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,253 INFO | (50, 200, 512)
2021-05-27 16:55:53,255 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,255 INFO | (50, 200, 512)
2021-05-27 16:55:53,256 INFO | BERT LAYER
2021-05-27 16:55:53,256 INFO | (200, 512)
2021-05-27 16:55:53,256 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,257 INFO | (200, 512)
2021-05-27 16:55:53,257 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,257 INFO | (200, 512)
2021-05-27 16:55:53,263 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,264 INFO | (200, 512)
2021-05-27 16:55:53,264 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,265 INFO | (200, 512)
2021-05-27 16:55:53,271 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,271 INFO | (200, 512)
2021-05-27 16:55:53,272 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,272 INFO | (200, 512)
2021-05-27 16:55:53,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,281 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▎| 537/574 [00:58<00:03,  9.48it/s]

2021-05-27 16:55:53,351 INFO | INITIAL
2021-05-27 16:55:53,352 INFO | (50, 200)
2021-05-27 16:55:53,357 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,358 INFO | (50, 200, 512)
2021-05-27 16:55:53,359 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,360 INFO | (50, 200, 512)
2021-05-27 16:55:53,360 INFO | BERT LAYER
2021-05-27 16:55:53,361 INFO | (200, 512)
2021-05-27 16:55:53,361 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,362 INFO | (200, 512)
2021-05-27 16:55:53,362 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,363 INFO | (200, 512)
2021-05-27 16:55:53,368 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,369 INFO | (200, 512)
2021-05-27 16:55:53,369 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,370 INFO | (200, 512)
2021-05-27 16:55:53,375 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,376 INFO | (200, 512)
2021-05-27 16:55:53,376 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,376 INFO | (200, 512)
2021-05-27 16:55:53,381 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,382 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▎| 537/574 [00:58<00:03,  9.48it/s]

2021-05-27 16:55:53,448 INFO | INITIAL
2021-05-27 16:55:53,449 INFO | (50, 200)
2021-05-27 16:55:53,453 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,454 INFO | (50, 200, 512)
2021-05-27 16:55:53,455 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,455 INFO | (50, 200, 512)
2021-05-27 16:55:53,456 INFO | BERT LAYER
2021-05-27 16:55:53,456 INFO | (200, 512)
2021-05-27 16:55:53,457 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,457 INFO | (200, 512)
2021-05-27 16:55:53,457 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,458 INFO | (200, 512)
2021-05-27 16:55:53,463 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,463 INFO | (200, 512)
2021-05-27 16:55:53,464 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,464 INFO | (200, 512)
2021-05-27 16:55:53,470 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,470 INFO | (200, 512)
2021-05-27 16:55:53,471 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,471 INFO | (200, 512)
2021-05-27 16:55:53,478 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,479 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▍| 539/574 [00:58<00:03,  9.82it/s]

2021-05-27 16:55:53,546 INFO | INITIAL
2021-05-27 16:55:53,546 INFO | (50, 200)
2021-05-27 16:55:53,552 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,552 INFO | (50, 200, 512)
2021-05-27 16:55:53,553 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,554 INFO | (50, 200, 512)
2021-05-27 16:55:53,555 INFO | BERT LAYER
2021-05-27 16:55:53,555 INFO | (200, 512)
2021-05-27 16:55:53,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,556 INFO | (200, 512)
2021-05-27 16:55:53,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,557 INFO | (200, 512)
2021-05-27 16:55:53,562 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,563 INFO | (200, 512)
2021-05-27 16:55:53,563 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,563 INFO | (200, 512)
2021-05-27 16:55:53,569 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,569 INFO | (200, 512)
2021-05-27 16:55:53,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,570 INFO | (200, 512)
2021-05-27 16:55:53,575 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,575 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▍| 540/574 [00:58<00:03,  9.77it/s]

2021-05-27 16:55:53,650 INFO | INITIAL
2021-05-27 16:55:53,650 INFO | (50, 200)
2021-05-27 16:55:53,655 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,655 INFO | (50, 200, 512)
2021-05-27 16:55:53,656 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,657 INFO | (50, 200, 512)
2021-05-27 16:55:53,658 INFO | BERT LAYER
2021-05-27 16:55:53,658 INFO | (200, 512)
2021-05-27 16:55:53,658 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,659 INFO | (200, 512)
2021-05-27 16:55:53,659 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,660 INFO | (200, 512)
2021-05-27 16:55:53,667 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,668 INFO | (200, 512)
2021-05-27 16:55:53,668 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,669 INFO | (200, 512)
2021-05-27 16:55:53,676 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,676 INFO | (200, 512)
2021-05-27 16:55:53,677 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,677 INFO | (200, 512)
2021-05-27 16:55:53,683 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,683 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▍| 541/574 [00:58<00:03,  9.69it/s]

2021-05-27 16:55:53,755 INFO | INITIAL
2021-05-27 16:55:53,756 INFO | (50, 200)
2021-05-27 16:55:53,763 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,764 INFO | (50, 200, 512)
2021-05-27 16:55:53,765 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,765 INFO | (50, 200, 512)
2021-05-27 16:55:53,766 INFO | BERT LAYER
2021-05-27 16:55:53,766 INFO | (200, 512)
2021-05-27 16:55:53,767 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,767 INFO | (200, 512)
2021-05-27 16:55:53,768 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,769 INFO | (200, 512)
2021-05-27 16:55:53,775 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,775 INFO | (200, 512)
2021-05-27 16:55:53,776 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,776 INFO | (200, 512)
2021-05-27 16:55:53,784 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,784 INFO | (200, 512)
2021-05-27 16:55:53,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,785 INFO | (200, 512)
2021-05-27 16:55:53,792 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,792 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  94%|█████████▍| 542/574 [00:59<00:03,  9.50it/s]

2021-05-27 16:55:53,867 INFO | INITIAL
2021-05-27 16:55:53,867 INFO | (50, 200)
2021-05-27 16:55:53,874 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,874 INFO | (50, 200, 512)
2021-05-27 16:55:53,876 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,876 INFO | (50, 200, 512)
2021-05-27 16:55:53,878 INFO | BERT LAYER
2021-05-27 16:55:53,879 INFO | (200, 512)
2021-05-27 16:55:53,881 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,881 INFO | (200, 512)
2021-05-27 16:55:53,882 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,883 INFO | (200, 512)
2021-05-27 16:55:53,889 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,890 INFO | (200, 512)
2021-05-27 16:55:53,890 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,891 INFO | (200, 512)
2021-05-27 16:55:53,896 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,897 INFO | (200, 512)
2021-05-27 16:55:53,897 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,898 INFO | (200, 512)
2021-05-27 16:55:53,904 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,904 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▍| 543/574 [00:59<00:03,  9.40it/s]

2021-05-27 16:55:53,976 INFO | INITIAL
2021-05-27 16:55:53,976 INFO | (50, 200)
2021-05-27 16:55:53,982 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:53,983 INFO | (50, 200, 512)
2021-05-27 16:55:53,984 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:53,985 INFO | (50, 200, 512)
2021-05-27 16:55:53,986 INFO | BERT LAYER
2021-05-27 16:55:53,986 INFO | (200, 512)
2021-05-27 16:55:53,987 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,987 INFO | (200, 512)
2021-05-27 16:55:53,987 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,988 INFO | (200, 512)
2021-05-27 16:55:53,993 INFO | BERT LAYER LOOP
2021-05-27 16:55:53,994 INFO | (200, 512)
2021-05-27 16:55:53,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:53,995 INFO | (200, 512)
2021-05-27 16:55:54,001 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,001 INFO | (200, 512)
2021-05-27 16:55:54,002 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,002 INFO | (200, 512)
2021-05-27 16:55:54,011 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,011 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▍| 544/574 [00:59<00:03,  9.29it/s]

2021-05-27 16:55:54,087 INFO | INITIAL
2021-05-27 16:55:54,088 INFO | (50, 200)
2021-05-27 16:55:54,093 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,094 INFO | (50, 200, 512)
2021-05-27 16:55:54,095 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,096 INFO | (50, 200, 512)
2021-05-27 16:55:54,097 INFO | BERT LAYER
2021-05-27 16:55:54,097 INFO | (200, 512)
2021-05-27 16:55:54,097 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,098 INFO | (200, 512)
2021-05-27 16:55:54,099 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,099 INFO | (200, 512)
2021-05-27 16:55:54,105 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,106 INFO | (200, 512)
2021-05-27 16:55:54,106 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,107 INFO | (200, 512)
2021-05-27 16:55:54,113 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,114 INFO | (200, 512)
2021-05-27 16:55:54,114 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,114 INFO | (200, 512)
2021-05-27 16:55:54,120 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,120 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▍| 545/574 [00:59<00:03,  9.21it/s]

2021-05-27 16:55:54,198 INFO | INITIAL
2021-05-27 16:55:54,198 INFO | (50, 200)
2021-05-27 16:55:54,203 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,204 INFO | (50, 200, 512)
2021-05-27 16:55:54,205 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,206 INFO | (50, 200, 512)
2021-05-27 16:55:54,207 INFO | BERT LAYER
2021-05-27 16:55:54,207 INFO | (200, 512)
2021-05-27 16:55:54,208 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,208 INFO | (200, 512)
2021-05-27 16:55:54,209 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,209 INFO | (200, 512)
2021-05-27 16:55:54,215 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,216 INFO | (200, 512)
2021-05-27 16:55:54,216 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,217 INFO | (200, 512)
2021-05-27 16:55:54,224 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,224 INFO | (200, 512)
2021-05-27 16:55:54,225 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,225 INFO | (200, 512)
2021-05-27 16:55:54,232 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,233 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▌| 546/574 [00:59<00:03,  9.17it/s]

2021-05-27 16:55:54,308 INFO | INITIAL
2021-05-27 16:55:54,309 INFO | (50, 200)
2021-05-27 16:55:54,316 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,317 INFO | (50, 200, 512)
2021-05-27 16:55:54,318 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,318 INFO | (50, 200, 512)
2021-05-27 16:55:54,319 INFO | BERT LAYER
2021-05-27 16:55:54,319 INFO | (200, 512)
2021-05-27 16:55:54,320 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,320 INFO | (200, 512)
2021-05-27 16:55:54,320 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,320 INFO | (200, 512)
2021-05-27 16:55:54,326 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,326 INFO | (200, 512)
2021-05-27 16:55:54,327 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,327 INFO | (200, 512)
2021-05-27 16:55:54,332 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,332 INFO | (200, 512)
2021-05-27 16:55:54,333 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,333 INFO | (200, 512)
2021-05-27 16:55:54,339 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,339 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▌| 547/574 [00:59<00:02,  9.26it/s]

2021-05-27 16:55:54,413 INFO | INITIAL
2021-05-27 16:55:54,413 INFO | (50, 200)
2021-05-27 16:55:54,419 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,419 INFO | (50, 200, 512)
2021-05-27 16:55:54,420 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,420 INFO | (50, 200, 512)
2021-05-27 16:55:54,421 INFO | BERT LAYER
2021-05-27 16:55:54,422 INFO | (200, 512)
2021-05-27 16:55:54,422 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,423 INFO | (200, 512)
2021-05-27 16:55:54,423 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,424 INFO | (200, 512)
2021-05-27 16:55:54,430 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,430 INFO | (200, 512)
2021-05-27 16:55:54,431 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,431 INFO | (200, 512)
2021-05-27 16:55:54,437 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,437 INFO | (200, 512)
2021-05-27 16:55:54,438 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,438 INFO | (200, 512)
2021-05-27 16:55:54,447 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,448 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  95%|█████████▌| 548/574 [00:59<00:02,  9.16it/s]

2021-05-27 16:55:54,525 INFO | INITIAL
2021-05-27 16:55:54,526 INFO | (50, 200)
2021-05-27 16:55:54,533 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,533 INFO | (50, 200, 512)
2021-05-27 16:55:54,535 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,535 INFO | (50, 200, 512)
2021-05-27 16:55:54,536 INFO | BERT LAYER
2021-05-27 16:55:54,536 INFO | (200, 512)
2021-05-27 16:55:54,537 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,537 INFO | (200, 512)
2021-05-27 16:55:54,539 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,539 INFO | (200, 512)
2021-05-27 16:55:54,546 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,547 INFO | (200, 512)
2021-05-27 16:55:54,547 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,548 INFO | (200, 512)
2021-05-27 16:55:54,555 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,555 INFO | (200, 512)
2021-05-27 16:55:54,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,556 INFO | (200, 512)
2021-05-27 16:55:54,562 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,562 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  96%|█████████▌| 549/574 [00:59<00:02,  9.05it/s]

2021-05-27 16:55:54,639 INFO | INITIAL
2021-05-27 16:55:54,639 INFO | (50, 200)
2021-05-27 16:55:54,646 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,647 INFO | (50, 200, 512)
2021-05-27 16:55:54,648 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,648 INFO | (50, 200, 512)
2021-05-27 16:55:54,649 INFO | BERT LAYER
2021-05-27 16:55:54,650 INFO | (200, 512)
2021-05-27 16:55:54,650 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,650 INFO | (200, 512)
2021-05-27 16:55:54,651 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,651 INFO | (200, 512)
2021-05-27 16:55:54,656 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,657 INFO | (200, 512)
2021-05-27 16:55:54,657 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,658 INFO | (200, 512)
2021-05-27 16:55:54,665 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,666 INFO | (200, 512)
2021-05-27 16:55:54,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,667 INFO | (200, 512)
2021-05-27 16:55:54,674 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,674 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  96%|█████████▌| 550/574 [00:59<00:02,  9.01it/s]

2021-05-27 16:55:54,751 INFO | INITIAL
2021-05-27 16:55:54,751 INFO | (50, 200)
2021-05-27 16:55:54,759 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,759 INFO | (50, 200, 512)
2021-05-27 16:55:54,760 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,761 INFO | (50, 200, 512)
2021-05-27 16:55:54,762 INFO | BERT LAYER
2021-05-27 16:55:54,762 INFO | (200, 512)
2021-05-27 16:55:54,763 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,763 INFO | (200, 512)
2021-05-27 16:55:54,764 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,764 INFO | (200, 512)
2021-05-27 16:55:54,770 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,771 INFO | (200, 512)
2021-05-27 16:55:54,771 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,772 INFO | (200, 512)
2021-05-27 16:55:54,777 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,777 INFO | (200, 512)
2021-05-27 16:55:54,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,778 INFO | (200, 512)
2021-05-27 16:55:54,784 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,784 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  96%|█████████▌| 551/574 [01:00<00:02,  9.18it/s]

2021-05-27 16:55:54,855 INFO | INITIAL
2021-05-27 16:55:54,855 INFO | (50, 200)
2021-05-27 16:55:54,862 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,863 INFO | (50, 200, 512)
2021-05-27 16:55:54,864 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,864 INFO | (50, 200, 512)
2021-05-27 16:55:54,865 INFO | BERT LAYER
2021-05-27 16:55:54,866 INFO | (200, 512)
2021-05-27 16:55:54,866 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,866 INFO | (200, 512)
2021-05-27 16:55:54,867 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,867 INFO | (200, 512)
2021-05-27 16:55:54,873 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,874 INFO | (200, 512)
2021-05-27 16:55:54,874 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,875 INFO | (200, 512)
2021-05-27 16:55:54,882 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,882 INFO | (200, 512)
2021-05-27 16:55:54,883 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,883 INFO | (200, 512)
2021-05-27 16:55:54,889 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,890 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  96%|█████████▌| 552/574 [01:00<00:02,  9.16it/s]

2021-05-27 16:55:54,965 INFO | INITIAL
2021-05-27 16:55:54,966 INFO | (50, 200)
2021-05-27 16:55:54,971 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:54,972 INFO | (50, 200, 512)
2021-05-27 16:55:54,973 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:54,974 INFO | (50, 200, 512)
2021-05-27 16:55:54,975 INFO | BERT LAYER
2021-05-27 16:55:54,975 INFO | (200, 512)
2021-05-27 16:55:54,976 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,976 INFO | (200, 512)
2021-05-27 16:55:54,977 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,977 INFO | (200, 512)
2021-05-27 16:55:54,985 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,985 INFO | (200, 512)
2021-05-27 16:55:54,986 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,986 INFO | (200, 512)
2021-05-27 16:55:54,993 INFO | BERT LAYER LOOP
2021-05-27 16:55:54,993 INFO | (200, 512)
2021-05-27 16:55:54,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:54,994 INFO | (200, 512)
2021-05-27 16:55:55,000 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,001 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  96%|█████████▋| 553/574 [01:00<00:02,  9.11it/s]

2021-05-27 16:55:55,076 INFO | INITIAL
2021-05-27 16:55:55,076 INFO | (50, 200)
2021-05-27 16:55:55,084 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,084 INFO | (50, 200, 512)
2021-05-27 16:55:55,085 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,086 INFO | (50, 200, 512)
2021-05-27 16:55:55,086 INFO | BERT LAYER
2021-05-27 16:55:55,087 INFO | (200, 512)
2021-05-27 16:55:55,087 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,087 INFO | (200, 512)
2021-05-27 16:55:55,088 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,088 INFO | (200, 512)
2021-05-27 16:55:55,093 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,094 INFO | (200, 512)
2021-05-27 16:55:55,094 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,095 INFO | (200, 512)
2021-05-27 16:55:55,100 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,100 INFO | (200, 512)
2021-05-27 16:55:55,101 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,101 INFO | (200, 512)
2021-05-27 16:55:55,106 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,107 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 554/574 [01:00<00:02,  9.34it/s]

2021-05-27 16:55:55,177 INFO | INITIAL
2021-05-27 16:55:55,177 INFO | (50, 200)
2021-05-27 16:55:55,182 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,183 INFO | (50, 200, 512)
2021-05-27 16:55:55,184 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,184 INFO | (50, 200, 512)
2021-05-27 16:55:55,185 INFO | BERT LAYER
2021-05-27 16:55:55,185 INFO | (200, 512)
2021-05-27 16:55:55,185 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,186 INFO | (200, 512)
2021-05-27 16:55:55,186 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,186 INFO | (200, 512)
2021-05-27 16:55:55,192 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,192 INFO | (200, 512)
2021-05-27 16:55:55,193 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,193 INFO | (200, 512)
2021-05-27 16:55:55,198 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,199 INFO | (200, 512)
2021-05-27 16:55:55,199 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,199 INFO | (200, 512)
2021-05-27 16:55:55,204 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,205 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 555/574 [01:00<00:02,  9.36it/s]

2021-05-27 16:55:55,283 INFO | INITIAL
2021-05-27 16:55:55,284 INFO | (50, 200)
2021-05-27 16:55:55,290 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,291 INFO | (50, 200, 512)
2021-05-27 16:55:55,292 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,293 INFO | (50, 200, 512)
2021-05-27 16:55:55,294 INFO | BERT LAYER
2021-05-27 16:55:55,295 INFO | (200, 512)
2021-05-27 16:55:55,295 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,295 INFO | (200, 512)
2021-05-27 16:55:55,296 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,297 INFO | (200, 512)
2021-05-27 16:55:55,306 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,306 INFO | (200, 512)
2021-05-27 16:55:55,307 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,308 INFO | (200, 512)
2021-05-27 16:55:55,314 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,315 INFO | (200, 512)
2021-05-27 16:55:55,315 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,316 INFO | (200, 512)
2021-05-27 16:55:55,321 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,321 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 556/574 [01:00<00:01,  9.13it/s]

2021-05-27 16:55:55,399 INFO | INITIAL
2021-05-27 16:55:55,399 INFO | (50, 200)
2021-05-27 16:55:55,404 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,404 INFO | (50, 200, 512)
2021-05-27 16:55:55,405 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,406 INFO | (50, 200, 512)
2021-05-27 16:55:55,406 INFO | BERT LAYER
2021-05-27 16:55:55,407 INFO | (200, 512)
2021-05-27 16:55:55,407 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,408 INFO | (200, 512)
2021-05-27 16:55:55,409 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,409 INFO | (200, 512)
2021-05-27 16:55:55,415 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,416 INFO | (200, 512)
2021-05-27 16:55:55,416 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,417 INFO | (200, 512)
2021-05-27 16:55:55,422 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,422 INFO | (200, 512)
2021-05-27 16:55:55,423 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,423 INFO | (200, 512)
2021-05-27 16:55:55,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,430 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 557/574 [01:00<00:01,  9.33it/s]

2021-05-27 16:55:55,501 INFO | INITIAL
2021-05-27 16:55:55,501 INFO | (50, 200)
2021-05-27 16:55:55,508 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,509 INFO | (50, 200, 512)
2021-05-27 16:55:55,510 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,511 INFO | (50, 200, 512)
2021-05-27 16:55:55,512 INFO | BERT LAYER
2021-05-27 16:55:55,513 INFO | (200, 512)
2021-05-27 16:55:55,513 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,514 INFO | (200, 512)
2021-05-27 16:55:55,514 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,514 INFO | (200, 512)
2021-05-27 16:55:55,521 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,522 INFO | (200, 512)
2021-05-27 16:55:55,522 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,522 INFO | (200, 512)
2021-05-27 16:55:55,528 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,528 INFO | (200, 512)
2021-05-27 16:55:55,529 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,529 INFO | (200, 512)
2021-05-27 16:55:55,535 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,535 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 558/574 [01:00<00:01,  9.31it/s]

2021-05-27 16:55:55,609 INFO | INITIAL
2021-05-27 16:55:55,610 INFO | (50, 200)
2021-05-27 16:55:55,619 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,620 INFO | (50, 200, 512)
2021-05-27 16:55:55,621 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,622 INFO | (50, 200, 512)
2021-05-27 16:55:55,623 INFO | BERT LAYER
2021-05-27 16:55:55,623 INFO | (200, 512)
2021-05-27 16:55:55,624 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,624 INFO | (200, 512)
2021-05-27 16:55:55,624 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,625 INFO | (200, 512)
2021-05-27 16:55:55,631 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,631 INFO | (200, 512)
2021-05-27 16:55:55,632 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,632 INFO | (200, 512)
2021-05-27 16:55:55,636 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,637 INFO | (200, 512)
2021-05-27 16:55:55,637 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,638 INFO | (200, 512)
2021-05-27 16:55:55,643 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,644 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  97%|█████████▋| 559/574 [01:00<00:01,  9.30it/s]

2021-05-27 16:55:55,716 INFO | INITIAL
2021-05-27 16:55:55,717 INFO | (50, 200)
2021-05-27 16:55:55,723 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,723 INFO | (50, 200, 512)
2021-05-27 16:55:55,726 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,726 INFO | (50, 200, 512)
2021-05-27 16:55:55,727 INFO | BERT LAYER
2021-05-27 16:55:55,728 INFO | (200, 512)
2021-05-27 16:55:55,728 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,729 INFO | (200, 512)
2021-05-27 16:55:55,729 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,730 INFO | (200, 512)
2021-05-27 16:55:55,736 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,736 INFO | (200, 512)
2021-05-27 16:55:55,737 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,738 INFO | (200, 512)
2021-05-27 16:55:55,746 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,746 INFO | (200, 512)
2021-05-27 16:55:55,747 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,747 INFO | (200, 512)
2021-05-27 16:55:55,753 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,753 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 560/574 [01:01<00:01,  9.18it/s]

2021-05-27 16:55:55,828 INFO | INITIAL
2021-05-27 16:55:55,829 INFO | (50, 200)
2021-05-27 16:55:55,834 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,835 INFO | (50, 200, 512)
2021-05-27 16:55:55,836 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,836 INFO | (50, 200, 512)
2021-05-27 16:55:55,837 INFO | BERT LAYER
2021-05-27 16:55:55,837 INFO | (200, 512)
2021-05-27 16:55:55,837 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,838 INFO | (200, 512)
2021-05-27 16:55:55,838 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,839 INFO | (200, 512)
2021-05-27 16:55:55,845 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,845 INFO | (200, 512)
2021-05-27 16:55:55,846 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,846 INFO | (200, 512)
2021-05-27 16:55:55,852 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,853 INFO | (200, 512)
2021-05-27 16:55:55,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,853 INFO | (200, 512)
2021-05-27 16:55:55,859 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,859 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 561/574 [01:01<00:01,  9.35it/s]

2021-05-27 16:55:55,931 INFO | INITIAL
2021-05-27 16:55:55,932 INFO | (50, 200)
2021-05-27 16:55:55,936 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:55,936 INFO | (50, 200, 512)
2021-05-27 16:55:55,938 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:55,938 INFO | (50, 200, 512)
2021-05-27 16:55:55,939 INFO | BERT LAYER
2021-05-27 16:55:55,939 INFO | (200, 512)
2021-05-27 16:55:55,940 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,940 INFO | (200, 512)
2021-05-27 16:55:55,941 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,941 INFO | (200, 512)
2021-05-27 16:55:55,948 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,948 INFO | (200, 512)
2021-05-27 16:55:55,948 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,949 INFO | (200, 512)
2021-05-27 16:55:55,954 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,954 INFO | (200, 512)
2021-05-27 16:55:55,954 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:55,955 INFO | (200, 512)
2021-05-27 16:55:55,960 INFO | BERT LAYER LOOP
2021-05-27 16:55:55,961 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 562/574 [01:01<00:01,  9.39it/s]

2021-05-27 16:55:56,036 INFO | INITIAL
2021-05-27 16:55:56,037 INFO | (50, 200)
2021-05-27 16:55:56,043 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,044 INFO | (50, 200, 512)
2021-05-27 16:55:56,045 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,046 INFO | (50, 200, 512)
2021-05-27 16:55:56,047 INFO | BERT LAYER
2021-05-27 16:55:56,047 INFO | (200, 512)
2021-05-27 16:55:56,047 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,048 INFO | (200, 512)
2021-05-27 16:55:56,048 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,049 INFO | (200, 512)
2021-05-27 16:55:56,055 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,057 INFO | (200, 512)
2021-05-27 16:55:56,058 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,059 INFO | (200, 512)
2021-05-27 16:55:56,066 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,067 INFO | (200, 512)
2021-05-27 16:55:56,067 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,067 INFO | (200, 512)
2021-05-27 16:55:56,073 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,073 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 563/574 [01:01<00:01,  9.17it/s]

2021-05-27 16:55:56,152 INFO | INITIAL
2021-05-27 16:55:56,152 INFO | (50, 200)
2021-05-27 16:55:56,157 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,158 INFO | (50, 200, 512)
2021-05-27 16:55:56,160 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,160 INFO | (50, 200, 512)
2021-05-27 16:55:56,161 INFO | BERT LAYER
2021-05-27 16:55:56,162 INFO | (200, 512)
2021-05-27 16:55:56,162 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,163 INFO | (200, 512)
2021-05-27 16:55:56,163 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,164 INFO | (200, 512)
2021-05-27 16:55:56,170 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,171 INFO | (200, 512)
2021-05-27 16:55:56,171 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,171 INFO | (200, 512)
2021-05-27 16:55:56,177 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,178 INFO | (200, 512)
2021-05-27 16:55:56,178 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,179 INFO | (200, 512)
2021-05-27 16:55:56,184 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,184 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 564/574 [01:01<00:01,  9.31it/s]

2021-05-27 16:55:56,255 INFO | INITIAL
2021-05-27 16:55:56,255 INFO | (50, 200)
2021-05-27 16:55:56,261 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,261 INFO | (50, 200, 512)
2021-05-27 16:55:56,263 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,263 INFO | (50, 200, 512)
2021-05-27 16:55:56,264 INFO | BERT LAYER
2021-05-27 16:55:56,265 INFO | (200, 512)
2021-05-27 16:55:56,265 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,265 INFO | (200, 512)
2021-05-27 16:55:56,266 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,266 INFO | (200, 512)
2021-05-27 16:55:56,271 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,272 INFO | (200, 512)
2021-05-27 16:55:56,272 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,273 INFO | (200, 512)
2021-05-27 16:55:56,280 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,281 INFO | (200, 512)
2021-05-27 16:55:56,282 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,283 INFO | (200, 512)
2021-05-27 16:55:56,290 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,291 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  98%|█████████▊| 565/574 [01:01<00:00,  9.49it/s]

2021-05-27 16:55:56,356 INFO | INITIAL
2021-05-27 16:55:56,356 INFO | (50, 200)
2021-05-27 16:55:56,363 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,363 INFO | (50, 200, 512)
2021-05-27 16:55:56,364 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,365 INFO | (50, 200, 512)
2021-05-27 16:55:56,366 INFO | BERT LAYER
2021-05-27 16:55:56,367 INFO | (200, 512)
2021-05-27 16:55:56,367 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,367 INFO | (200, 512)
2021-05-27 16:55:56,368 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,369 INFO | (200, 512)
2021-05-27 16:55:56,377 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,378 INFO | (200, 512)
2021-05-27 16:55:56,378 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,379 INFO | (200, 512)
2021-05-27 16:55:56,384 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,385 INFO | (200, 512)
2021-05-27 16:55:56,385 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,386 INFO | (200, 512)
2021-05-27 16:55:56,392 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,393 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▊| 566/574 [01:01<00:00,  9.52it/s]

2021-05-27 16:55:56,460 INFO | INITIAL
2021-05-27 16:55:56,461 INFO | (50, 200)
2021-05-27 16:55:56,468 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,468 INFO | (50, 200, 512)
2021-05-27 16:55:56,470 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,470 INFO | (50, 200, 512)
2021-05-27 16:55:56,471 INFO | BERT LAYER
2021-05-27 16:55:56,471 INFO | (200, 512)
2021-05-27 16:55:56,472 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,472 INFO | (200, 512)
2021-05-27 16:55:56,473 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,473 INFO | (200, 512)
2021-05-27 16:55:56,479 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,480 INFO | (200, 512)
2021-05-27 16:55:56,480 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,480 INFO | (200, 512)
2021-05-27 16:55:56,488 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,488 INFO | (200, 512)
2021-05-27 16:55:56,489 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,489 INFO | (200, 512)
2021-05-27 16:55:56,496 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,497 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▉| 567/574 [01:01<00:00,  9.40it/s]

2021-05-27 16:55:56,569 INFO | INITIAL
2021-05-27 16:55:56,570 INFO | (50, 200)
2021-05-27 16:55:56,575 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,577 INFO | (50, 200, 512)
2021-05-27 16:55:56,579 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,579 INFO | (50, 200, 512)
2021-05-27 16:55:56,580 INFO | BERT LAYER
2021-05-27 16:55:56,580 INFO | (200, 512)
2021-05-27 16:55:56,580 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,581 INFO | (200, 512)
2021-05-27 16:55:56,581 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,581 INFO | (200, 512)
2021-05-27 16:55:56,587 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,587 INFO | (200, 512)
2021-05-27 16:55:56,588 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,588 INFO | (200, 512)
2021-05-27 16:55:56,594 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,594 INFO | (200, 512)
2021-05-27 16:55:56,595 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,595 INFO | (200, 512)
2021-05-27 16:55:56,601 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,602 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▉| 568/574 [01:01<00:00,  9.43it/s]

2021-05-27 16:55:56,676 INFO | INITIAL
2021-05-27 16:55:56,676 INFO | (50, 200)
2021-05-27 16:55:56,683 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,684 INFO | (50, 200, 512)
2021-05-27 16:55:56,686 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,687 INFO | (50, 200, 512)
2021-05-27 16:55:56,688 INFO | BERT LAYER
2021-05-27 16:55:56,689 INFO | (200, 512)
2021-05-27 16:55:56,689 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,690 INFO | (200, 512)
2021-05-27 16:55:56,691 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,691 INFO | (200, 512)
2021-05-27 16:55:56,697 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,697 INFO | (200, 512)
2021-05-27 16:55:56,698 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,699 INFO | (200, 512)
2021-05-27 16:55:56,706 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,707 INFO | (200, 512)
2021-05-27 16:55:56,707 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,707 INFO | (200, 512)
2021-05-27 16:55:56,714 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,715 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▉| 569/574 [01:01<00:00,  9.27it/s]

2021-05-27 16:55:56,787 INFO | INITIAL
2021-05-27 16:55:56,787 INFO | (50, 200)
2021-05-27 16:55:56,792 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,792 INFO | (50, 200, 512)
2021-05-27 16:55:56,794 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,794 INFO | (50, 200, 512)
2021-05-27 16:55:56,794 INFO | BERT LAYER
2021-05-27 16:55:56,795 INFO | (200, 512)
2021-05-27 16:55:56,795 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,796 INFO | (200, 512)
2021-05-27 16:55:56,797 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,797 INFO | (200, 512)
2021-05-27 16:55:56,802 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,803 INFO | (200, 512)
2021-05-27 16:55:56,803 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,803 INFO | (200, 512)
2021-05-27 16:55:56,809 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,809 INFO | (200, 512)
2021-05-27 16:55:56,810 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,810 INFO | (200, 512)
2021-05-27 16:55:56,816 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,816 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▉| 570/574 [01:02<00:00,  9.46it/s]

2021-05-27 16:55:56,887 INFO | INITIAL
2021-05-27 16:55:56,888 INFO | (50, 200)
2021-05-27 16:55:56,893 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:56,894 INFO | (50, 200, 512)
2021-05-27 16:55:56,895 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:56,895 INFO | (50, 200, 512)
2021-05-27 16:55:56,896 INFO | BERT LAYER
2021-05-27 16:55:56,896 INFO | (200, 512)
2021-05-27 16:55:56,897 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,897 INFO | (200, 512)
2021-05-27 16:55:56,897 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,898 INFO | (200, 512)
2021-05-27 16:55:56,904 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,904 INFO | (200, 512)
2021-05-27 16:55:56,904 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,905 INFO | (200, 512)
2021-05-27 16:55:56,912 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,912 INFO | (200, 512)
2021-05-27 16:55:56,913 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:56,914 INFO | (200, 512)
2021-05-27 16:55:56,920 INFO | BERT LAYER LOOP
2021-05-27 16:55:56,920 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% :  99%|█████████▉| 571/574 [01:02<00:00,  9.58it/s]

2021-05-27 16:55:56,989 INFO | INITIAL
2021-05-27 16:55:56,989 INFO | (50, 200)
2021-05-27 16:55:57,000 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,000 INFO | (50, 200, 512)
2021-05-27 16:55:57,002 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,002 INFO | (50, 200, 512)
2021-05-27 16:55:57,003 INFO | BERT LAYER
2021-05-27 16:55:57,004 INFO | (200, 512)
2021-05-27 16:55:57,004 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,005 INFO | (200, 512)
2021-05-27 16:55:57,005 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,005 INFO | (200, 512)
2021-05-27 16:55:57,012 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,012 INFO | (200, 512)
2021-05-27 16:55:57,013 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,013 INFO | (200, 512)
2021-05-27 16:55:57,020 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,020 INFO | (200, 512)
2021-05-27 16:55:57,021 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,021 INFO | (200, 512)
2021-05-27 16:55:57,027 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,027 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% : 100%|█████████▉| 572/574 [01:02<00:00,  9.44it/s]

2021-05-27 16:55:57,099 INFO | INITIAL
2021-05-27 16:55:57,099 INFO | (50, 200)
2021-05-27 16:55:57,105 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,105 INFO | (50, 200, 512)
2021-05-27 16:55:57,107 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,107 INFO | (50, 200, 512)
2021-05-27 16:55:57,108 INFO | BERT LAYER
2021-05-27 16:55:57,108 INFO | (200, 512)
2021-05-27 16:55:57,108 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,109 INFO | (200, 512)
2021-05-27 16:55:57,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,109 INFO | (200, 512)
2021-05-27 16:55:57,115 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,115 INFO | (200, 512)
2021-05-27 16:55:57,115 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,116 INFO | (200, 512)
2021-05-27 16:55:57,121 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,122 INFO | (200, 512)
2021-05-27 16:55:57,122 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,123 INFO | (200, 512)
2021-05-27 16:55:57,128 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,129 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% : 100%|█████████▉| 573/574 [01:02<00:00,  9.53it/s]

2021-05-27 16:55:57,201 INFO | INITIAL
2021-05-27 16:55:57,201 INFO | (50, 200)
2021-05-27 16:55:57,207 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,208 INFO | (50, 200, 512)
2021-05-27 16:55:57,210 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,210 INFO | (50, 200, 512)
2021-05-27 16:55:57,211 INFO | BERT LAYER
2021-05-27 16:55:57,214 INFO | (200, 512)
2021-05-27 16:55:57,214 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,215 INFO | (200, 512)
2021-05-27 16:55:57,215 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,216 INFO | (200, 512)
2021-05-27 16:55:57,224 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,225 INFO | (200, 512)
2021-05-27 16:55:57,225 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,226 INFO | (200, 512)
2021-05-27 16:55:57,233 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,233 INFO | (200, 512)
2021-05-27 16:55:57,234 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,234 INFO | (200, 512)
2021-05-27 16:55:57,239 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,240 INFO | (200, 

Epoch: 000, Loss: 0.000, Accuracy: 0.000% : 100%|██████████| 574/574 [01:02<00:00,  9.18it/s]
Epoch: 000, Loss: 0.000, Accuracy: 0.000%:   0%|          | 0/574 [00:00<?, ?it/s]

2021-05-27 16:55:57,314 INFO | INITIAL
2021-05-27 16:55:57,315 INFO | (50, 200)
2021-05-27 16:55:57,320 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,321 INFO | (50, 200, 512)
2021-05-27 16:55:57,322 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,322 INFO | (50, 200, 512)
2021-05-27 16:55:57,323 INFO | BERT LAYER
2021-05-27 16:55:57,323 INFO | (200, 512)
2021-05-27 16:55:57,324 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,324 INFO | (200, 512)
2021-05-27 16:55:57,324 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,325 INFO | (200, 512)
2021-05-27 16:55:57,331 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,331 INFO | (200, 512)
2021-05-27 16:55:57,331 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,332 INFO | (200, 512)
2021-05-27 16:55:57,337 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,337 INFO | (200, 512)
2021-05-27 16:55:57,338 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,338 INFO | (200, 512)
2021-05-27 16:55:57,344 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,345 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   0%|          | 1/574 [00:00<00:59,  9.66it/s]

2021-05-27 16:55:57,419 INFO | INITIAL
2021-05-27 16:55:57,419 INFO | (50, 200)
2021-05-27 16:55:57,425 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,425 INFO | (50, 200, 512)
2021-05-27 16:55:57,427 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,427 INFO | (50, 200, 512)
2021-05-27 16:55:57,428 INFO | BERT LAYER
2021-05-27 16:55:57,428 INFO | (200, 512)
2021-05-27 16:55:57,429 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,429 INFO | (200, 512)
2021-05-27 16:55:57,430 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,430 INFO | (200, 512)
2021-05-27 16:55:57,438 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,439 INFO | (200, 512)
2021-05-27 16:55:57,440 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,440 INFO | (200, 512)
2021-05-27 16:55:57,448 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,449 INFO | (200, 512)
2021-05-27 16:55:57,449 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,450 INFO | (200, 512)
2021-05-27 16:55:57,456 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,457 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   0%|          | 2/574 [00:00<01:02,  9.13it/s]

2021-05-27 16:55:57,532 INFO | INITIAL
2021-05-27 16:55:57,532 INFO | (50, 200)
2021-05-27 16:55:57,537 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,537 INFO | (50, 200, 512)
2021-05-27 16:55:57,539 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,539 INFO | (50, 200, 512)
2021-05-27 16:55:57,540 INFO | BERT LAYER
2021-05-27 16:55:57,540 INFO | (200, 512)
2021-05-27 16:55:57,541 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,541 INFO | (200, 512)
2021-05-27 16:55:57,542 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,542 INFO | (200, 512)
2021-05-27 16:55:57,549 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,550 INFO | (200, 512)
2021-05-27 16:55:57,550 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,551 INFO | (200, 512)
2021-05-27 16:55:57,557 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,557 INFO | (200, 512)
2021-05-27 16:55:57,558 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,559 INFO | (200, 512)
2021-05-27 16:55:57,565 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,566 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|          | 3/574 [00:00<01:01,  9.27it/s]

2021-05-27 16:55:57,638 INFO | INITIAL
2021-05-27 16:55:57,638 INFO | (50, 200)
2021-05-27 16:55:57,645 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,646 INFO | (50, 200, 512)
2021-05-27 16:55:57,647 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,647 INFO | (50, 200, 512)
2021-05-27 16:55:57,648 INFO | BERT LAYER
2021-05-27 16:55:57,649 INFO | (200, 512)
2021-05-27 16:55:57,649 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,649 INFO | (200, 512)
2021-05-27 16:55:57,650 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,650 INFO | (200, 512)
2021-05-27 16:55:57,656 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,657 INFO | (200, 512)
2021-05-27 16:55:57,657 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,658 INFO | (200, 512)
2021-05-27 16:55:57,666 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,666 INFO | (200, 512)
2021-05-27 16:55:57,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,667 INFO | (200, 512)
2021-05-27 16:55:57,675 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,675 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|          | 4/574 [00:00<01:01,  9.26it/s]

2021-05-27 16:55:57,747 INFO | INITIAL
2021-05-27 16:55:57,748 INFO | (50, 200)
2021-05-27 16:55:57,755 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,756 INFO | (50, 200, 512)
2021-05-27 16:55:57,757 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,757 INFO | (50, 200, 512)
2021-05-27 16:55:57,758 INFO | BERT LAYER
2021-05-27 16:55:57,759 INFO | (200, 512)
2021-05-27 16:55:57,759 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,760 INFO | (200, 512)
2021-05-27 16:55:57,760 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,761 INFO | (200, 512)
2021-05-27 16:55:57,768 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,768 INFO | (200, 512)
2021-05-27 16:55:57,769 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,769 INFO | (200, 512)
2021-05-27 16:55:57,774 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,774 INFO | (200, 512)
2021-05-27 16:55:57,775 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,775 INFO | (200, 512)
2021-05-27 16:55:57,781 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,781 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|          | 5/574 [00:00<01:01,  9.29it/s]

2021-05-27 16:55:57,853 INFO | INITIAL
2021-05-27 16:55:57,853 INFO | (50, 200)
2021-05-27 16:55:57,858 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,859 INFO | (50, 200, 512)
2021-05-27 16:55:57,860 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,860 INFO | (50, 200, 512)
2021-05-27 16:55:57,861 INFO | BERT LAYER
2021-05-27 16:55:57,861 INFO | (200, 512)
2021-05-27 16:55:57,862 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,862 INFO | (200, 512)
2021-05-27 16:55:57,862 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,863 INFO | (200, 512)
2021-05-27 16:55:57,869 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,869 INFO | (200, 512)
2021-05-27 16:55:57,870 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,871 INFO | (200, 512)
2021-05-27 16:55:57,877 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,878 INFO | (200, 512)
2021-05-27 16:55:57,879 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,879 INFO | (200, 512)
2021-05-27 16:55:57,888 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,888 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|          | 6/574 [00:00<01:00,  9.40it/s]

2021-05-27 16:55:57,957 INFO | INITIAL
2021-05-27 16:55:57,958 INFO | (50, 200)
2021-05-27 16:55:57,967 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:57,968 INFO | (50, 200, 512)
2021-05-27 16:55:57,969 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:57,970 INFO | (50, 200, 512)
2021-05-27 16:55:57,971 INFO | BERT LAYER
2021-05-27 16:55:57,971 INFO | (200, 512)
2021-05-27 16:55:57,972 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,972 INFO | (200, 512)
2021-05-27 16:55:57,973 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,973 INFO | (200, 512)
2021-05-27 16:55:57,980 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,981 INFO | (200, 512)
2021-05-27 16:55:57,981 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,982 INFO | (200, 512)
2021-05-27 16:55:57,988 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,988 INFO | (200, 512)
2021-05-27 16:55:57,989 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:57,989 INFO | (200, 512)
2021-05-27 16:55:57,995 INFO | BERT LAYER LOOP
2021-05-27 16:55:57,996 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|          | 7/574 [00:00<01:01,  9.22it/s]

2021-05-27 16:55:58,070 INFO | INITIAL
2021-05-27 16:55:58,071 INFO | (50, 200)
2021-05-27 16:55:58,076 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,076 INFO | (50, 200, 512)
2021-05-27 16:55:58,078 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,078 INFO | (50, 200, 512)
2021-05-27 16:55:58,079 INFO | BERT LAYER
2021-05-27 16:55:58,080 INFO | (200, 512)
2021-05-27 16:55:58,081 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,081 INFO | (200, 512)
2021-05-27 16:55:58,082 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,082 INFO | (200, 512)
2021-05-27 16:55:58,088 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,088 INFO | (200, 512)
2021-05-27 16:55:58,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,089 INFO | (200, 512)
2021-05-27 16:55:58,095 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,096 INFO | (200, 512)
2021-05-27 16:55:58,096 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,097 INFO | (200, 512)
2021-05-27 16:55:58,103 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,104 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   1%|▏         | 8/574 [00:00<01:01,  9.27it/s]

2021-05-27 16:55:58,176 INFO | INITIAL
2021-05-27 16:55:58,177 INFO | (50, 200)
2021-05-27 16:55:58,182 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,183 INFO | (50, 200, 512)
2021-05-27 16:55:58,184 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,185 INFO | (50, 200, 512)
2021-05-27 16:55:58,185 INFO | BERT LAYER
2021-05-27 16:55:58,185 INFO | (200, 512)
2021-05-27 16:55:58,186 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,186 INFO | (200, 512)
2021-05-27 16:55:58,187 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,188 INFO | (200, 512)
2021-05-27 16:55:58,195 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,196 INFO | (200, 512)
2021-05-27 16:55:58,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,196 INFO | (200, 512)
2021-05-27 16:55:58,203 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,203 INFO | (200, 512)
2021-05-27 16:55:58,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,204 INFO | (200, 512)
2021-05-27 16:55:58,211 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,212 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 9/574 [00:00<01:01,  9.13it/s]

2021-05-27 16:55:58,289 INFO | INITIAL
2021-05-27 16:55:58,290 INFO | (50, 200)
2021-05-27 16:55:58,297 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,298 INFO | (50, 200, 512)
2021-05-27 16:55:58,299 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,299 INFO | (50, 200, 512)
2021-05-27 16:55:58,300 INFO | BERT LAYER
2021-05-27 16:55:58,301 INFO | (200, 512)
2021-05-27 16:55:58,301 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,301 INFO | (200, 512)
2021-05-27 16:55:58,302 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,302 INFO | (200, 512)
2021-05-27 16:55:58,308 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,309 INFO | (200, 512)
2021-05-27 16:55:58,309 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,309 INFO | (200, 512)
2021-05-27 16:55:58,315 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,315 INFO | (200, 512)
2021-05-27 16:55:58,316 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,316 INFO | (200, 512)
2021-05-27 16:55:58,322 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,323 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 10/574 [00:01<01:00,  9.31it/s]

2021-05-27 16:55:58,392 INFO | INITIAL
2021-05-27 16:55:58,392 INFO | (50, 200)
2021-05-27 16:55:58,398 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,399 INFO | (50, 200, 512)
2021-05-27 16:55:58,400 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,400 INFO | (50, 200, 512)
2021-05-27 16:55:58,401 INFO | BERT LAYER
2021-05-27 16:55:58,401 INFO | (200, 512)
2021-05-27 16:55:58,402 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,402 INFO | (200, 512)
2021-05-27 16:55:58,402 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,403 INFO | (200, 512)
2021-05-27 16:55:58,408 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,409 INFO | (200, 512)
2021-05-27 16:55:58,409 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,410 INFO | (200, 512)
2021-05-27 16:55:58,415 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,416 INFO | (200, 512)
2021-05-27 16:55:58,416 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,417 INFO | (200, 512)
2021-05-27 16:55:58,424 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,424 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 11/574 [00:01<01:03,  8.80it/s]

2021-05-27 16:55:58,520 INFO | INITIAL
2021-05-27 16:55:58,520 INFO | (50, 200)
2021-05-27 16:55:58,525 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,526 INFO | (50, 200, 512)
2021-05-27 16:55:58,527 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,528 INFO | (50, 200, 512)
2021-05-27 16:55:58,528 INFO | BERT LAYER
2021-05-27 16:55:58,529 INFO | (200, 512)
2021-05-27 16:55:58,529 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,530 INFO | (200, 512)
2021-05-27 16:55:58,530 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,531 INFO | (200, 512)
2021-05-27 16:55:58,538 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,539 INFO | (200, 512)
2021-05-27 16:55:58,539 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,540 INFO | (200, 512)
2021-05-27 16:55:58,547 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,548 INFO | (200, 512)
2021-05-27 16:55:58,549 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,550 INFO | (200, 512)
2021-05-27 16:55:58,557 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,557 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 12/574 [00:01<01:03,  8.86it/s]

2021-05-27 16:55:58,631 INFO | INITIAL
2021-05-27 16:55:58,631 INFO | (50, 200)
2021-05-27 16:55:58,636 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,637 INFO | (50, 200, 512)
2021-05-27 16:55:58,638 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,638 INFO | (50, 200, 512)
2021-05-27 16:55:58,639 INFO | BERT LAYER
2021-05-27 16:55:58,639 INFO | (200, 512)
2021-05-27 16:55:58,639 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,640 INFO | (200, 512)
2021-05-27 16:55:58,640 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,640 INFO | (200, 512)
2021-05-27 16:55:58,646 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,647 INFO | (200, 512)
2021-05-27 16:55:58,647 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,647 INFO | (200, 512)
2021-05-27 16:55:58,652 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,653 INFO | (200, 512)
2021-05-27 16:55:58,653 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,654 INFO | (200, 512)
2021-05-27 16:55:58,660 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,660 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 12/574 [00:01<01:03,  8.86it/s]

2021-05-27 16:55:58,730 INFO | INITIAL
2021-05-27 16:55:58,731 INFO | (50, 200)
2021-05-27 16:55:58,736 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,736 INFO | (50, 200, 512)
2021-05-27 16:55:58,737 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,738 INFO | (50, 200, 512)
2021-05-27 16:55:58,738 INFO | BERT LAYER
2021-05-27 16:55:58,739 INFO | (200, 512)
2021-05-27 16:55:58,740 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,740 INFO | (200, 512)
2021-05-27 16:55:58,740 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,741 INFO | (200, 512)
2021-05-27 16:55:58,749 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,749 INFO | (200, 512)
2021-05-27 16:55:58,750 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,750 INFO | (200, 512)
2021-05-27 16:55:58,756 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,756 INFO | (200, 512)
2021-05-27 16:55:58,757 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,757 INFO | (200, 512)
2021-05-27 16:55:58,764 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,765 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   2%|▏         | 14/574 [00:01<01:01,  9.12it/s]

2021-05-27 16:55:58,844 INFO | INITIAL
2021-05-27 16:55:58,844 INFO | (50, 200)
2021-05-27 16:55:58,849 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,850 INFO | (50, 200, 512)
2021-05-27 16:55:58,851 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,851 INFO | (50, 200, 512)
2021-05-27 16:55:58,852 INFO | BERT LAYER
2021-05-27 16:55:58,853 INFO | (200, 512)
2021-05-27 16:55:58,853 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,853 INFO | (200, 512)
2021-05-27 16:55:58,854 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,854 INFO | (200, 512)
2021-05-27 16:55:58,860 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,860 INFO | (200, 512)
2021-05-27 16:55:58,860 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,861 INFO | (200, 512)
2021-05-27 16:55:58,866 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,867 INFO | (200, 512)
2021-05-27 16:55:58,867 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,867 INFO | (200, 512)
2021-05-27 16:55:58,874 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,874 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 15/574 [00:01<01:00,  9.25it/s]

2021-05-27 16:55:58,947 INFO | INITIAL
2021-05-27 16:55:58,948 INFO | (50, 200)
2021-05-27 16:55:58,954 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:58,955 INFO | (50, 200, 512)
2021-05-27 16:55:58,956 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:58,957 INFO | (50, 200, 512)
2021-05-27 16:55:58,958 INFO | BERT LAYER
2021-05-27 16:55:58,958 INFO | (200, 512)
2021-05-27 16:55:58,959 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,959 INFO | (200, 512)
2021-05-27 16:55:58,960 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,961 INFO | (200, 512)
2021-05-27 16:55:58,967 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,968 INFO | (200, 512)
2021-05-27 16:55:58,969 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,969 INFO | (200, 512)
2021-05-27 16:55:58,975 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,976 INFO | (200, 512)
2021-05-27 16:55:58,976 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:58,977 INFO | (200, 512)
2021-05-27 16:55:58,983 INFO | BERT LAYER LOOP
2021-05-27 16:55:58,983 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 16/574 [00:01<01:00,  9.22it/s]

2021-05-27 16:55:59,056 INFO | INITIAL
2021-05-27 16:55:59,056 INFO | (50, 200)
2021-05-27 16:55:59,063 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,064 INFO | (50, 200, 512)
2021-05-27 16:55:59,065 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,066 INFO | (50, 200, 512)
2021-05-27 16:55:59,067 INFO | BERT LAYER
2021-05-27 16:55:59,067 INFO | (200, 512)
2021-05-27 16:55:59,067 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,068 INFO | (200, 512)
2021-05-27 16:55:59,068 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,068 INFO | (200, 512)
2021-05-27 16:55:59,074 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,075 INFO | (200, 512)
2021-05-27 16:55:59,076 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,076 INFO | (200, 512)
2021-05-27 16:55:59,081 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,082 INFO | (200, 512)
2021-05-27 16:55:59,082 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,082 INFO | (200, 512)
2021-05-27 16:55:59,089 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,089 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 17/574 [00:01<01:00,  9.21it/s]

2021-05-27 16:55:59,165 INFO | INITIAL
2021-05-27 16:55:59,165 INFO | (50, 200)
2021-05-27 16:55:59,170 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,171 INFO | (50, 200, 512)
2021-05-27 16:55:59,172 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,173 INFO | (50, 200, 512)
2021-05-27 16:55:59,173 INFO | BERT LAYER
2021-05-27 16:55:59,174 INFO | (200, 512)
2021-05-27 16:55:59,174 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,175 INFO | (200, 512)
2021-05-27 16:55:59,176 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,176 INFO | (200, 512)
2021-05-27 16:55:59,183 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,183 INFO | (200, 512)
2021-05-27 16:55:59,184 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,184 INFO | (200, 512)
2021-05-27 16:55:59,190 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,191 INFO | (200, 512)
2021-05-27 16:55:59,192 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,192 INFO | (200, 512)
2021-05-27 16:55:59,198 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,198 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 18/574 [00:01<00:59,  9.32it/s]

2021-05-27 16:55:59,269 INFO | INITIAL
2021-05-27 16:55:59,269 INFO | (50, 200)
2021-05-27 16:55:59,274 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,275 INFO | (50, 200, 512)
2021-05-27 16:55:59,277 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,277 INFO | (50, 200, 512)
2021-05-27 16:55:59,278 INFO | BERT LAYER
2021-05-27 16:55:59,278 INFO | (200, 512)
2021-05-27 16:55:59,279 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,281 INFO | (200, 512)
2021-05-27 16:55:59,281 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,282 INFO | (200, 512)
2021-05-27 16:55:59,287 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,288 INFO | (200, 512)
2021-05-27 16:55:59,289 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,289 INFO | (200, 512)
2021-05-27 16:55:59,297 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,298 INFO | (200, 512)
2021-05-27 16:55:59,298 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,299 INFO | (200, 512)
2021-05-27 16:55:59,304 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,305 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 19/574 [00:02<00:59,  9.32it/s]

2021-05-27 16:55:59,376 INFO | INITIAL
2021-05-27 16:55:59,377 INFO | (50, 200)
2021-05-27 16:55:59,383 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,383 INFO | (50, 200, 512)
2021-05-27 16:55:59,384 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,385 INFO | (50, 200, 512)
2021-05-27 16:55:59,385 INFO | BERT LAYER
2021-05-27 16:55:59,385 INFO | (200, 512)
2021-05-27 16:55:59,386 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,386 INFO | (200, 512)
2021-05-27 16:55:59,386 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,387 INFO | (200, 512)
2021-05-27 16:55:59,394 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,394 INFO | (200, 512)
2021-05-27 16:55:59,395 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,395 INFO | (200, 512)
2021-05-27 16:55:59,400 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,400 INFO | (200, 512)
2021-05-27 16:55:59,401 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,401 INFO | (200, 512)
2021-05-27 16:55:59,406 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,406 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   3%|▎         | 20/574 [00:02<00:59,  9.37it/s]

2021-05-27 16:55:59,481 INFO | INITIAL
2021-05-27 16:55:59,482 INFO | (50, 200)
2021-05-27 16:55:59,488 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,488 INFO | (50, 200, 512)
2021-05-27 16:55:59,490 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,490 INFO | (50, 200, 512)
2021-05-27 16:55:59,491 INFO | BERT LAYER
2021-05-27 16:55:59,492 INFO | (200, 512)
2021-05-27 16:55:59,493 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,493 INFO | (200, 512)
2021-05-27 16:55:59,494 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,494 INFO | (200, 512)
2021-05-27 16:55:59,500 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,501 INFO | (200, 512)
2021-05-27 16:55:59,501 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,502 INFO | (200, 512)
2021-05-27 16:55:59,508 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,508 INFO | (200, 512)
2021-05-27 16:55:59,509 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,509 INFO | (200, 512)
2021-05-27 16:55:59,518 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,518 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   4%|▎         | 21/574 [00:02<00:59,  9.33it/s]

2021-05-27 16:55:59,590 INFO | INITIAL
2021-05-27 16:55:59,590 INFO | (50, 200)
2021-05-27 16:55:59,597 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,598 INFO | (50, 200, 512)
2021-05-27 16:55:59,599 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,599 INFO | (50, 200, 512)
2021-05-27 16:55:59,600 INFO | BERT LAYER
2021-05-27 16:55:59,600 INFO | (200, 512)
2021-05-27 16:55:59,601 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,601 INFO | (200, 512)
2021-05-27 16:55:59,601 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,602 INFO | (200, 512)
2021-05-27 16:55:59,607 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,608 INFO | (200, 512)
2021-05-27 16:55:59,608 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,609 INFO | (200, 512)
2021-05-27 16:55:59,616 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,617 INFO | (200, 512)
2021-05-27 16:55:59,618 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,618 INFO | (200, 512)
2021-05-27 16:55:59,625 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,626 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 22/574 [00:02<00:58,  9.38it/s]

2021-05-27 16:55:59,695 INFO | INITIAL
2021-05-27 16:55:59,696 INFO | (50, 200)
2021-05-27 16:55:59,701 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,701 INFO | (50, 200, 512)
2021-05-27 16:55:59,702 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,703 INFO | (50, 200, 512)
2021-05-27 16:55:59,704 INFO | BERT LAYER
2021-05-27 16:55:59,704 INFO | (200, 512)
2021-05-27 16:55:59,705 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,705 INFO | (200, 512)
2021-05-27 16:55:59,705 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,706 INFO | (200, 512)
2021-05-27 16:55:59,712 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,712 INFO | (200, 512)
2021-05-27 16:55:59,713 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,713 INFO | (200, 512)
2021-05-27 16:55:59,718 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,718 INFO | (200, 512)
2021-05-27 16:55:59,719 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,719 INFO | (200, 512)
2021-05-27 16:55:59,724 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,725 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 22/574 [00:02<00:58,  9.38it/s]

2021-05-27 16:55:59,792 INFO | INITIAL
2021-05-27 16:55:59,792 INFO | (50, 200)
2021-05-27 16:55:59,797 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,798 INFO | (50, 200, 512)
2021-05-27 16:55:59,799 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,799 INFO | (50, 200, 512)
2021-05-27 16:55:59,800 INFO | BERT LAYER
2021-05-27 16:55:59,800 INFO | (200, 512)
2021-05-27 16:55:59,801 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,801 INFO | (200, 512)
2021-05-27 16:55:59,801 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,802 INFO | (200, 512)
2021-05-27 16:55:59,808 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,808 INFO | (200, 512)
2021-05-27 16:55:59,809 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,810 INFO | (200, 512)
2021-05-27 16:55:59,818 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,819 INFO | (200, 512)
2021-05-27 16:55:59,820 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,820 INFO | (200, 512)
2021-05-27 16:55:59,828 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,829 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 24/574 [00:02<00:57,  9.56it/s]

2021-05-27 16:55:59,900 INFO | INITIAL
2021-05-27 16:55:59,900 INFO | (50, 200)
2021-05-27 16:55:59,905 INFO | POST EMBEDDING LAYER
2021-05-27 16:55:59,905 INFO | (50, 200, 512)
2021-05-27 16:55:59,906 INFO | POST POSITIONAL ENCODING
2021-05-27 16:55:59,907 INFO | (50, 200, 512)
2021-05-27 16:55:59,908 INFO | BERT LAYER
2021-05-27 16:55:59,908 INFO | (200, 512)
2021-05-27 16:55:59,909 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,911 INFO | (200, 512)
2021-05-27 16:55:59,912 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,912 INFO | (200, 512)
2021-05-27 16:55:59,918 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,918 INFO | (200, 512)
2021-05-27 16:55:59,919 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,919 INFO | (200, 512)
2021-05-27 16:55:59,926 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,926 INFO | (200, 512)
2021-05-27 16:55:59,927 INFO | MULTIHEADED ATTENTION
2021-05-27 16:55:59,927 INFO | (200, 512)
2021-05-27 16:55:59,934 INFO | BERT LAYER LOOP
2021-05-27 16:55:59,934 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   4%|▍         | 25/574 [00:02<00:57,  9.55it/s]

2021-05-27 16:56:00,005 INFO | INITIAL
2021-05-27 16:56:00,006 INFO | (50, 200)
2021-05-27 16:56:00,012 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,012 INFO | (50, 200, 512)
2021-05-27 16:56:00,013 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,014 INFO | (50, 200, 512)
2021-05-27 16:56:00,015 INFO | BERT LAYER
2021-05-27 16:56:00,015 INFO | (200, 512)
2021-05-27 16:56:00,015 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,016 INFO | (200, 512)
2021-05-27 16:56:00,016 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,016 INFO | (200, 512)
2021-05-27 16:56:00,023 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,023 INFO | (200, 512)
2021-05-27 16:56:00,024 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,024 INFO | (200, 512)
2021-05-27 16:56:00,031 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,031 INFO | (200, 512)
2021-05-27 16:56:00,032 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,033 INFO | (200, 512)
2021-05-27 16:56:00,037 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,038 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 26/574 [00:02<00:57,  9.50it/s]

2021-05-27 16:56:00,112 INFO | INITIAL
2021-05-27 16:56:00,112 INFO | (50, 200)
2021-05-27 16:56:00,118 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,118 INFO | (50, 200, 512)
2021-05-27 16:56:00,120 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,120 INFO | (50, 200, 512)
2021-05-27 16:56:00,121 INFO | BERT LAYER
2021-05-27 16:56:00,121 INFO | (200, 512)
2021-05-27 16:56:00,121 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,122 INFO | (200, 512)
2021-05-27 16:56:00,122 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,122 INFO | (200, 512)
2021-05-27 16:56:00,129 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,130 INFO | (200, 512)
2021-05-27 16:56:00,130 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,130 INFO | (200, 512)
2021-05-27 16:56:00,136 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,137 INFO | (200, 512)
2021-05-27 16:56:00,137 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,137 INFO | (200, 512)
2021-05-27 16:56:00,144 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,144 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 27/574 [00:02<00:58,  9.36it/s]

2021-05-27 16:56:00,223 INFO | INITIAL
2021-05-27 16:56:00,223 INFO | (50, 200)
2021-05-27 16:56:00,230 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,231 INFO | (50, 200, 512)
2021-05-27 16:56:00,232 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,233 INFO | (50, 200, 512)
2021-05-27 16:56:00,233 INFO | BERT LAYER
2021-05-27 16:56:00,234 INFO | (200, 512)
2021-05-27 16:56:00,234 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,235 INFO | (200, 512)
2021-05-27 16:56:00,235 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,236 INFO | (200, 512)
2021-05-27 16:56:00,243 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,243 INFO | (200, 512)
2021-05-27 16:56:00,244 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,244 INFO | (200, 512)
2021-05-27 16:56:00,252 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,253 INFO | (200, 512)
2021-05-27 16:56:00,253 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,253 INFO | (200, 512)
2021-05-27 16:56:00,260 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,261 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▍         | 28/574 [00:03<00:58,  9.28it/s]

2021-05-27 16:56:00,333 INFO | INITIAL
2021-05-27 16:56:00,333 INFO | (50, 200)
2021-05-27 16:56:00,338 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,338 INFO | (50, 200, 512)
2021-05-27 16:56:00,340 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,340 INFO | (50, 200, 512)
2021-05-27 16:56:00,341 INFO | BERT LAYER
2021-05-27 16:56:00,341 INFO | (200, 512)
2021-05-27 16:56:00,342 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,342 INFO | (200, 512)
2021-05-27 16:56:00,343 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,344 INFO | (200, 512)
2021-05-27 16:56:00,350 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,351 INFO | (200, 512)
2021-05-27 16:56:00,351 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,352 INFO | (200, 512)
2021-05-27 16:56:00,359 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,359 INFO | (200, 512)
2021-05-27 16:56:00,360 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,360 INFO | (200, 512)
2021-05-27 16:56:00,367 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,367 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 29/574 [00:03<00:58,  9.31it/s]

2021-05-27 16:56:00,439 INFO | INITIAL
2021-05-27 16:56:00,440 INFO | (50, 200)
2021-05-27 16:56:00,446 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,446 INFO | (50, 200, 512)
2021-05-27 16:56:00,448 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,448 INFO | (50, 200, 512)
2021-05-27 16:56:00,449 INFO | BERT LAYER
2021-05-27 16:56:00,449 INFO | (200, 512)
2021-05-27 16:56:00,449 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,450 INFO | (200, 512)
2021-05-27 16:56:00,451 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,451 INFO | (200, 512)
2021-05-27 16:56:00,456 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,456 INFO | (200, 512)
2021-05-27 16:56:00,457 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,458 INFO | (200, 512)
2021-05-27 16:56:00,465 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,466 INFO | (200, 512)
2021-05-27 16:56:00,466 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,467 INFO | (200, 512)
2021-05-27 16:56:00,474 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,475 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 30/574 [00:03<00:58,  9.24it/s]

2021-05-27 16:56:00,550 INFO | INITIAL
2021-05-27 16:56:00,550 INFO | (50, 200)
2021-05-27 16:56:00,556 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,556 INFO | (50, 200, 512)
2021-05-27 16:56:00,558 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,559 INFO | (50, 200, 512)
2021-05-27 16:56:00,560 INFO | BERT LAYER
2021-05-27 16:56:00,560 INFO | (200, 512)
2021-05-27 16:56:00,560 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,561 INFO | (200, 512)
2021-05-27 16:56:00,562 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,562 INFO | (200, 512)
2021-05-27 16:56:00,567 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,568 INFO | (200, 512)
2021-05-27 16:56:00,568 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,568 INFO | (200, 512)
2021-05-27 16:56:00,574 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,575 INFO | (200, 512)
2021-05-27 16:56:00,575 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,576 INFO | (200, 512)
2021-05-27 16:56:00,581 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,581 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 31/574 [00:03<00:58,  9.30it/s]

2021-05-27 16:56:00,655 INFO | INITIAL
2021-05-27 16:56:00,656 INFO | (50, 200)
2021-05-27 16:56:00,661 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,661 INFO | (50, 200, 512)
2021-05-27 16:56:00,662 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,663 INFO | (50, 200, 512)
2021-05-27 16:56:00,664 INFO | BERT LAYER
2021-05-27 16:56:00,664 INFO | (200, 512)
2021-05-27 16:56:00,664 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,664 INFO | (200, 512)
2021-05-27 16:56:00,665 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,665 INFO | (200, 512)
2021-05-27 16:56:00,670 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,670 INFO | (200, 512)
2021-05-27 16:56:00,671 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,671 INFO | (200, 512)
2021-05-27 16:56:00,677 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,677 INFO | (200, 512)
2021-05-27 16:56:00,678 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,678 INFO | (200, 512)
2021-05-27 16:56:00,683 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,684 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   5%|▌         | 31/574 [00:03<00:58,  9.30it/s]

2021-05-27 16:56:00,751 INFO | INITIAL
2021-05-27 16:56:00,752 INFO | (50, 200)
2021-05-27 16:56:00,758 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,758 INFO | (50, 200, 512)
2021-05-27 16:56:00,760 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,760 INFO | (50, 200, 512)
2021-05-27 16:56:00,761 INFO | BERT LAYER
2021-05-27 16:56:00,761 INFO | (200, 512)
2021-05-27 16:56:00,762 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,762 INFO | (200, 512)
2021-05-27 16:56:00,762 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,763 INFO | (200, 512)
2021-05-27 16:56:00,770 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,770 INFO | (200, 512)
2021-05-27 16:56:00,771 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,771 INFO | (200, 512)
2021-05-27 16:56:00,780 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,780 INFO | (200, 512)
2021-05-27 16:56:00,781 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,782 INFO | (200, 512)
2021-05-27 16:56:00,788 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,788 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 33/574 [00:03<00:56,  9.50it/s]

2021-05-27 16:56:00,861 INFO | INITIAL
2021-05-27 16:56:00,862 INFO | (50, 200)
2021-05-27 16:56:00,867 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,867 INFO | (50, 200, 512)
2021-05-27 16:56:00,869 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,869 INFO | (50, 200, 512)
2021-05-27 16:56:00,869 INFO | BERT LAYER
2021-05-27 16:56:00,870 INFO | (200, 512)
2021-05-27 16:56:00,870 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,871 INFO | (200, 512)
2021-05-27 16:56:00,871 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,871 INFO | (200, 512)
2021-05-27 16:56:00,878 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,878 INFO | (200, 512)
2021-05-27 16:56:00,879 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,879 INFO | (200, 512)
2021-05-27 16:56:00,886 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,887 INFO | (200, 512)
2021-05-27 16:56:00,887 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,888 INFO | (200, 512)
2021-05-27 16:56:00,893 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,894 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 34/574 [00:03<00:56,  9.50it/s]

2021-05-27 16:56:00,965 INFO | INITIAL
2021-05-27 16:56:00,966 INFO | (50, 200)
2021-05-27 16:56:00,971 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:00,971 INFO | (50, 200, 512)
2021-05-27 16:56:00,973 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:00,973 INFO | (50, 200, 512)
2021-05-27 16:56:00,975 INFO | BERT LAYER
2021-05-27 16:56:00,976 INFO | (200, 512)
2021-05-27 16:56:00,977 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,977 INFO | (200, 512)
2021-05-27 16:56:00,978 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,978 INFO | (200, 512)
2021-05-27 16:56:00,984 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,984 INFO | (200, 512)
2021-05-27 16:56:00,985 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,985 INFO | (200, 512)
2021-05-27 16:56:00,992 INFO | BERT LAYER LOOP
2021-05-27 16:56:00,992 INFO | (200, 512)
2021-05-27 16:56:00,993 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:00,994 INFO | (200, 512)
2021-05-27 16:56:01,000 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,000 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   6%|▌         | 35/574 [00:03<00:56,  9.59it/s]

2021-05-27 16:56:01,067 INFO | INITIAL
2021-05-27 16:56:01,067 INFO | (50, 200)
2021-05-27 16:56:01,073 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,073 INFO | (50, 200, 512)
2021-05-27 16:56:01,074 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,075 INFO | (50, 200, 512)
2021-05-27 16:56:01,076 INFO | BERT LAYER
2021-05-27 16:56:01,077 INFO | (200, 512)
2021-05-27 16:56:01,077 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,078 INFO | (200, 512)
2021-05-27 16:56:01,078 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,079 INFO | (200, 512)
2021-05-27 16:56:01,087 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,088 INFO | (200, 512)
2021-05-27 16:56:01,088 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,088 INFO | (200, 512)
2021-05-27 16:56:01,094 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,094 INFO | (200, 512)
2021-05-27 16:56:01,095 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,096 INFO | (200, 512)
2021-05-27 16:56:01,101 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,102 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   6%|▋         | 36/574 [00:03<00:56,  9.58it/s]

2021-05-27 16:56:01,172 INFO | INITIAL
2021-05-27 16:56:01,173 INFO | (50, 200)
2021-05-27 16:56:01,180 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,181 INFO | (50, 200, 512)
2021-05-27 16:56:01,182 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,183 INFO | (50, 200, 512)
2021-05-27 16:56:01,184 INFO | BERT LAYER
2021-05-27 16:56:01,184 INFO | (200, 512)
2021-05-27 16:56:01,184 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,185 INFO | (200, 512)
2021-05-27 16:56:01,185 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,185 INFO | (200, 512)
2021-05-27 16:56:01,190 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,191 INFO | (200, 512)
2021-05-27 16:56:01,192 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,192 INFO | (200, 512)
2021-05-27 16:56:01,198 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,198 INFO | (200, 512)
2021-05-27 16:56:01,199 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,199 INFO | (200, 512)
2021-05-27 16:56:01,204 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,205 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   6%|▋         | 37/574 [00:03<00:56,  9.49it/s]

2021-05-27 16:56:01,280 INFO | INITIAL
2021-05-27 16:56:01,280 INFO | (50, 200)
2021-05-27 16:56:01,285 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,286 INFO | (50, 200, 512)
2021-05-27 16:56:01,288 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,288 INFO | (50, 200, 512)
2021-05-27 16:56:01,289 INFO | BERT LAYER
2021-05-27 16:56:01,289 INFO | (200, 512)
2021-05-27 16:56:01,290 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,290 INFO | (200, 512)
2021-05-27 16:56:01,291 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,292 INFO | (200, 512)
2021-05-27 16:56:01,298 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,298 INFO | (200, 512)
2021-05-27 16:56:01,299 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,299 INFO | (200, 512)
2021-05-27 16:56:01,306 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,306 INFO | (200, 512)
2021-05-27 16:56:01,307 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,309 INFO | (200, 512)
2021-05-27 16:56:01,316 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,317 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 38/574 [00:04<00:56,  9.54it/s]

2021-05-27 16:56:01,384 INFO | INITIAL
2021-05-27 16:56:01,385 INFO | (50, 200)
2021-05-27 16:56:01,392 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,393 INFO | (50, 200, 512)
2021-05-27 16:56:01,394 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,394 INFO | (50, 200, 512)
2021-05-27 16:56:01,395 INFO | BERT LAYER
2021-05-27 16:56:01,396 INFO | (200, 512)
2021-05-27 16:56:01,396 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,396 INFO | (200, 512)
2021-05-27 16:56:01,397 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,397 INFO | (200, 512)
2021-05-27 16:56:01,403 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,403 INFO | (200, 512)
2021-05-27 16:56:01,404 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,404 INFO | (200, 512)
2021-05-27 16:56:01,412 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,412 INFO | (200, 512)
2021-05-27 16:56:01,413 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,413 INFO | (200, 512)
2021-05-27 16:56:01,421 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,421 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 39/574 [00:04<00:56,  9.44it/s]

2021-05-27 16:56:01,492 INFO | INITIAL
2021-05-27 16:56:01,493 INFO | (50, 200)
2021-05-27 16:56:01,499 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,499 INFO | (50, 200, 512)
2021-05-27 16:56:01,500 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,501 INFO | (50, 200, 512)
2021-05-27 16:56:01,501 INFO | BERT LAYER
2021-05-27 16:56:01,502 INFO | (200, 512)
2021-05-27 16:56:01,502 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,502 INFO | (200, 512)
2021-05-27 16:56:01,503 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,503 INFO | (200, 512)
2021-05-27 16:56:01,509 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,510 INFO | (200, 512)
2021-05-27 16:56:01,510 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,511 INFO | (200, 512)
2021-05-27 16:56:01,515 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,517 INFO | (200, 512)
2021-05-27 16:56:01,518 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,518 INFO | (200, 512)
2021-05-27 16:56:01,523 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,524 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 40/574 [00:04<00:57,  9.33it/s]

2021-05-27 16:56:01,602 INFO | INITIAL
2021-05-27 16:56:01,602 INFO | (50, 200)
2021-05-27 16:56:01,607 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,608 INFO | (50, 200, 512)
2021-05-27 16:56:01,609 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,610 INFO | (50, 200, 512)
2021-05-27 16:56:01,611 INFO | BERT LAYER
2021-05-27 16:56:01,611 INFO | (200, 512)
2021-05-27 16:56:01,611 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,612 INFO | (200, 512)
2021-05-27 16:56:01,612 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,612 INFO | (200, 512)
2021-05-27 16:56:01,620 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,621 INFO | (200, 512)
2021-05-27 16:56:01,621 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,622 INFO | (200, 512)
2021-05-27 16:56:01,629 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,630 INFO | (200, 512)
2021-05-27 16:56:01,631 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,631 INFO | (200, 512)
2021-05-27 16:56:01,638 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,638 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 41/574 [00:04<00:56,  9.37it/s]

2021-05-27 16:56:01,708 INFO | INITIAL
2021-05-27 16:56:01,709 INFO | (50, 200)
2021-05-27 16:56:01,717 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,717 INFO | (50, 200, 512)
2021-05-27 16:56:01,719 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,719 INFO | (50, 200, 512)
2021-05-27 16:56:01,720 INFO | BERT LAYER
2021-05-27 16:56:01,720 INFO | (200, 512)
2021-05-27 16:56:01,721 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,721 INFO | (200, 512)
2021-05-27 16:56:01,722 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,722 INFO | (200, 512)
2021-05-27 16:56:01,729 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,729 INFO | (200, 512)
2021-05-27 16:56:01,730 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,730 INFO | (200, 512)
2021-05-27 16:56:01,735 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,735 INFO | (200, 512)
2021-05-27 16:56:01,736 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,736 INFO | (200, 512)
2021-05-27 16:56:01,742 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,742 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 42/574 [00:04<00:57,  9.23it/s]

2021-05-27 16:56:01,820 INFO | INITIAL
2021-05-27 16:56:01,820 INFO | (50, 200)
2021-05-27 16:56:01,826 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,827 INFO | (50, 200, 512)
2021-05-27 16:56:01,828 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,828 INFO | (50, 200, 512)
2021-05-27 16:56:01,829 INFO | BERT LAYER
2021-05-27 16:56:01,830 INFO | (200, 512)
2021-05-27 16:56:01,830 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,830 INFO | (200, 512)
2021-05-27 16:56:01,831 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,831 INFO | (200, 512)
2021-05-27 16:56:01,836 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,836 INFO | (200, 512)
2021-05-27 16:56:01,837 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,837 INFO | (200, 512)
2021-05-27 16:56:01,843 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,844 INFO | (200, 512)
2021-05-27 16:56:01,845 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,846 INFO | (200, 512)
2021-05-27 16:56:01,851 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,852 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   7%|▋         | 43/574 [00:04<00:56,  9.37it/s]

2021-05-27 16:56:01,923 INFO | INITIAL
2021-05-27 16:56:01,923 INFO | (50, 200)
2021-05-27 16:56:01,931 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:01,932 INFO | (50, 200, 512)
2021-05-27 16:56:01,933 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:01,934 INFO | (50, 200, 512)
2021-05-27 16:56:01,934 INFO | BERT LAYER
2021-05-27 16:56:01,935 INFO | (200, 512)
2021-05-27 16:56:01,935 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,936 INFO | (200, 512)
2021-05-27 16:56:01,936 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,936 INFO | (200, 512)
2021-05-27 16:56:01,942 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,943 INFO | (200, 512)
2021-05-27 16:56:01,944 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,944 INFO | (200, 512)
2021-05-27 16:56:01,950 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,950 INFO | (200, 512)
2021-05-27 16:56:01,951 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:01,951 INFO | (200, 512)
2021-05-27 16:56:01,956 INFO | BERT LAYER LOOP
2021-05-27 16:56:01,957 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 44/574 [00:04<00:55,  9.52it/s]

2021-05-27 16:56:02,024 INFO | INITIAL
2021-05-27 16:56:02,024 INFO | (50, 200)
2021-05-27 16:56:02,031 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,031 INFO | (50, 200, 512)
2021-05-27 16:56:02,033 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,033 INFO | (50, 200, 512)
2021-05-27 16:56:02,034 INFO | BERT LAYER
2021-05-27 16:56:02,034 INFO | (200, 512)
2021-05-27 16:56:02,034 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,035 INFO | (200, 512)
2021-05-27 16:56:02,035 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,035 INFO | (200, 512)
2021-05-27 16:56:02,040 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,041 INFO | (200, 512)
2021-05-27 16:56:02,041 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,042 INFO | (200, 512)
2021-05-27 16:56:02,047 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,048 INFO | (200, 512)
2021-05-27 16:56:02,048 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,049 INFO | (200, 512)
2021-05-27 16:56:02,055 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,055 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 45/574 [00:04<00:55,  9.50it/s]

2021-05-27 16:56:02,129 INFO | INITIAL
2021-05-27 16:56:02,130 INFO | (50, 200)
2021-05-27 16:56:02,134 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,135 INFO | (50, 200, 512)
2021-05-27 16:56:02,136 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,136 INFO | (50, 200, 512)
2021-05-27 16:56:02,137 INFO | BERT LAYER
2021-05-27 16:56:02,137 INFO | (200, 512)
2021-05-27 16:56:02,138 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,138 INFO | (200, 512)
2021-05-27 16:56:02,139 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,139 INFO | (200, 512)
2021-05-27 16:56:02,145 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,146 INFO | (200, 512)
2021-05-27 16:56:02,146 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,146 INFO | (200, 512)
2021-05-27 16:56:02,153 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,153 INFO | (200, 512)
2021-05-27 16:56:02,154 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,154 INFO | (200, 512)
2021-05-27 16:56:02,160 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,161 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 46/574 [00:04<00:55,  9.49it/s]

2021-05-27 16:56:02,235 INFO | INITIAL
2021-05-27 16:56:02,236 INFO | (50, 200)
2021-05-27 16:56:02,242 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,243 INFO | (50, 200, 512)
2021-05-27 16:56:02,245 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,246 INFO | (50, 200, 512)
2021-05-27 16:56:02,247 INFO | BERT LAYER
2021-05-27 16:56:02,248 INFO | (200, 512)
2021-05-27 16:56:02,248 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,249 INFO | (200, 512)
2021-05-27 16:56:02,249 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,250 INFO | (200, 512)
2021-05-27 16:56:02,256 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,257 INFO | (200, 512)
2021-05-27 16:56:02,257 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,258 INFO | (200, 512)
2021-05-27 16:56:02,264 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,264 INFO | (200, 512)
2021-05-27 16:56:02,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,265 INFO | (200, 512)
2021-05-27 16:56:02,270 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,271 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 47/574 [00:05<00:55,  9.45it/s]

2021-05-27 16:56:02,342 INFO | INITIAL
2021-05-27 16:56:02,343 INFO | (50, 200)
2021-05-27 16:56:02,348 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,349 INFO | (50, 200, 512)
2021-05-27 16:56:02,350 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,351 INFO | (50, 200, 512)
2021-05-27 16:56:02,352 INFO | BERT LAYER
2021-05-27 16:56:02,352 INFO | (200, 512)
2021-05-27 16:56:02,353 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,353 INFO | (200, 512)
2021-05-27 16:56:02,354 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,354 INFO | (200, 512)
2021-05-27 16:56:02,361 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,361 INFO | (200, 512)
2021-05-27 16:56:02,362 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,362 INFO | (200, 512)
2021-05-27 16:56:02,368 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,368 INFO | (200, 512)
2021-05-27 16:56:02,369 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,369 INFO | (200, 512)
2021-05-27 16:56:02,375 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,375 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   8%|▊         | 48/574 [00:05<00:55,  9.56it/s]

2021-05-27 16:56:02,444 INFO | INITIAL
2021-05-27 16:56:02,444 INFO | (50, 200)
2021-05-27 16:56:02,451 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,451 INFO | (50, 200, 512)
2021-05-27 16:56:02,453 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,453 INFO | (50, 200, 512)
2021-05-27 16:56:02,454 INFO | BERT LAYER
2021-05-27 16:56:02,454 INFO | (200, 512)
2021-05-27 16:56:02,454 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,455 INFO | (200, 512)
2021-05-27 16:56:02,455 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,456 INFO | (200, 512)
2021-05-27 16:56:02,463 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,463 INFO | (200, 512)
2021-05-27 16:56:02,464 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,464 INFO | (200, 512)
2021-05-27 16:56:02,472 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,472 INFO | (200, 512)
2021-05-27 16:56:02,473 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,473 INFO | (200, 512)
2021-05-27 16:56:02,480 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,480 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▊         | 49/574 [00:05<00:54,  9.56it/s]

2021-05-27 16:56:02,549 INFO | INITIAL
2021-05-27 16:56:02,549 INFO | (50, 200)
2021-05-27 16:56:02,556 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,557 INFO | (50, 200, 512)
2021-05-27 16:56:02,559 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,560 INFO | (50, 200, 512)
2021-05-27 16:56:02,562 INFO | BERT LAYER
2021-05-27 16:56:02,562 INFO | (200, 512)
2021-05-27 16:56:02,563 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,563 INFO | (200, 512)
2021-05-27 16:56:02,564 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,564 INFO | (200, 512)
2021-05-27 16:56:02,570 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,571 INFO | (200, 512)
2021-05-27 16:56:02,571 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,572 INFO | (200, 512)
2021-05-27 16:56:02,577 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,578 INFO | (200, 512)
2021-05-27 16:56:02,578 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,578 INFO | (200, 512)
2021-05-27 16:56:02,584 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,584 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▊         | 50/574 [00:05<00:55,  9.51it/s]

2021-05-27 16:56:02,655 INFO | INITIAL
2021-05-27 16:56:02,656 INFO | (50, 200)
2021-05-27 16:56:02,664 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,665 INFO | (50, 200, 512)
2021-05-27 16:56:02,667 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,667 INFO | (50, 200, 512)
2021-05-27 16:56:02,668 INFO | BERT LAYER
2021-05-27 16:56:02,669 INFO | (200, 512)
2021-05-27 16:56:02,670 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,670 INFO | (200, 512)
2021-05-27 16:56:02,671 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,672 INFO | (200, 512)
2021-05-27 16:56:02,678 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,679 INFO | (200, 512)
2021-05-27 16:56:02,680 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,681 INFO | (200, 512)
2021-05-27 16:56:02,688 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,688 INFO | (200, 512)
2021-05-27 16:56:02,689 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,689 INFO | (200, 512)
2021-05-27 16:56:02,696 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,696 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 51/574 [00:05<00:56,  9.24it/s]

2021-05-27 16:56:02,770 INFO | INITIAL
2021-05-27 16:56:02,771 INFO | (50, 200)
2021-05-27 16:56:02,776 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,777 INFO | (50, 200, 512)
2021-05-27 16:56:02,778 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,778 INFO | (50, 200, 512)
2021-05-27 16:56:02,779 INFO | BERT LAYER
2021-05-27 16:56:02,779 INFO | (200, 512)
2021-05-27 16:56:02,780 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,780 INFO | (200, 512)
2021-05-27 16:56:02,781 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,781 INFO | (200, 512)
2021-05-27 16:56:02,787 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,787 INFO | (200, 512)
2021-05-27 16:56:02,788 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,788 INFO | (200, 512)
2021-05-27 16:56:02,793 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,794 INFO | (200, 512)
2021-05-27 16:56:02,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,795 INFO | (200, 512)
2021-05-27 16:56:02,800 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,800 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 51/574 [00:05<00:56,  9.24it/s]

2021-05-27 16:56:02,868 INFO | INITIAL
2021-05-27 16:56:02,869 INFO | (50, 200)
2021-05-27 16:56:02,874 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,875 INFO | (50, 200, 512)
2021-05-27 16:56:02,876 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,876 INFO | (50, 200, 512)
2021-05-27 16:56:02,877 INFO | BERT LAYER
2021-05-27 16:56:02,878 INFO | (200, 512)
2021-05-27 16:56:02,878 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,878 INFO | (200, 512)
2021-05-27 16:56:02,879 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,879 INFO | (200, 512)
2021-05-27 16:56:02,886 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,887 INFO | (200, 512)
2021-05-27 16:56:02,888 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,888 INFO | (200, 512)
2021-05-27 16:56:02,896 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,897 INFO | (200, 512)
2021-05-27 16:56:02,897 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,898 INFO | (200, 512)
2021-05-27 16:56:02,904 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,905 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 53/574 [00:05<00:55,  9.43it/s]

2021-05-27 16:56:02,978 INFO | INITIAL
2021-05-27 16:56:02,979 INFO | (50, 200)
2021-05-27 16:56:02,985 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:02,985 INFO | (50, 200, 512)
2021-05-27 16:56:02,986 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:02,987 INFO | (50, 200, 512)
2021-05-27 16:56:02,987 INFO | BERT LAYER
2021-05-27 16:56:02,988 INFO | (200, 512)
2021-05-27 16:56:02,989 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,989 INFO | (200, 512)
2021-05-27 16:56:02,990 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,990 INFO | (200, 512)
2021-05-27 16:56:02,996 INFO | BERT LAYER LOOP
2021-05-27 16:56:02,997 INFO | (200, 512)
2021-05-27 16:56:02,997 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:02,998 INFO | (200, 512)
2021-05-27 16:56:03,004 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,004 INFO | (200, 512)
2021-05-27 16:56:03,005 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,005 INFO | (200, 512)
2021-05-27 16:56:03,011 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,012 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :   9%|▉         | 54/574 [00:05<00:55,  9.41it/s]

2021-05-27 16:56:03,084 INFO | INITIAL
2021-05-27 16:56:03,085 INFO | (50, 200)
2021-05-27 16:56:03,091 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,091 INFO | (50, 200, 512)
2021-05-27 16:56:03,093 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,093 INFO | (50, 200, 512)
2021-05-27 16:56:03,094 INFO | BERT LAYER
2021-05-27 16:56:03,095 INFO | (200, 512)
2021-05-27 16:56:03,095 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,096 INFO | (200, 512)
2021-05-27 16:56:03,096 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,096 INFO | (200, 512)
2021-05-27 16:56:03,103 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,103 INFO | (200, 512)
2021-05-27 16:56:03,104 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,104 INFO | (200, 512)
2021-05-27 16:56:03,111 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,111 INFO | (200, 512)
2021-05-27 16:56:03,112 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,112 INFO | (200, 512)
2021-05-27 16:56:03,117 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,118 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 55/574 [00:05<00:54,  9.45it/s]

2021-05-27 16:56:03,189 INFO | INITIAL
2021-05-27 16:56:03,189 INFO | (50, 200)
2021-05-27 16:56:03,195 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,196 INFO | (50, 200, 512)
2021-05-27 16:56:03,197 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,198 INFO | (50, 200, 512)
2021-05-27 16:56:03,199 INFO | BERT LAYER
2021-05-27 16:56:03,199 INFO | (200, 512)
2021-05-27 16:56:03,199 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,200 INFO | (200, 512)
2021-05-27 16:56:03,200 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,200 INFO | (200, 512)
2021-05-27 16:56:03,206 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,207 INFO | (200, 512)
2021-05-27 16:56:03,207 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,208 INFO | (200, 512)
2021-05-27 16:56:03,214 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,214 INFO | (200, 512)
2021-05-27 16:56:03,215 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,215 INFO | (200, 512)
2021-05-27 16:56:03,222 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,222 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 55/574 [00:05<00:54,  9.45it/s]

2021-05-27 16:56:03,288 INFO | INITIAL
2021-05-27 16:56:03,289 INFO | (50, 200)
2021-05-27 16:56:03,294 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,295 INFO | (50, 200, 512)
2021-05-27 16:56:03,296 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,298 INFO | (50, 200, 512)
2021-05-27 16:56:03,299 INFO | BERT LAYER
2021-05-27 16:56:03,300 INFO | (200, 512)
2021-05-27 16:56:03,300 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,301 INFO | (200, 512)
2021-05-27 16:56:03,301 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,302 INFO | (200, 512)
2021-05-27 16:56:03,307 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,308 INFO | (200, 512)
2021-05-27 16:56:03,308 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,309 INFO | (200, 512)
2021-05-27 16:56:03,314 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,316 INFO | (200, 512)
2021-05-27 16:56:03,317 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,317 INFO | (200, 512)
2021-05-27 16:56:03,322 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,323 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|▉         | 57/574 [00:06<00:54,  9.45it/s]

2021-05-27 16:56:03,401 INFO | INITIAL
2021-05-27 16:56:03,401 INFO | (50, 200)
2021-05-27 16:56:03,406 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,406 INFO | (50, 200, 512)
2021-05-27 16:56:03,408 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,409 INFO | (50, 200, 512)
2021-05-27 16:56:03,410 INFO | BERT LAYER
2021-05-27 16:56:03,410 INFO | (200, 512)
2021-05-27 16:56:03,410 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,411 INFO | (200, 512)
2021-05-27 16:56:03,411 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,412 INFO | (200, 512)
2021-05-27 16:56:03,417 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,418 INFO | (200, 512)
2021-05-27 16:56:03,418 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,419 INFO | (200, 512)
2021-05-27 16:56:03,425 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,425 INFO | (200, 512)
2021-05-27 16:56:03,426 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,426 INFO | (200, 512)
2021-05-27 16:56:03,433 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,433 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 58/574 [00:06<00:54,  9.39it/s]

2021-05-27 16:56:03,510 INFO | INITIAL
2021-05-27 16:56:03,510 INFO | (50, 200)
2021-05-27 16:56:03,516 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,517 INFO | (50, 200, 512)
2021-05-27 16:56:03,518 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,518 INFO | (50, 200, 512)
2021-05-27 16:56:03,519 INFO | BERT LAYER
2021-05-27 16:56:03,519 INFO | (200, 512)
2021-05-27 16:56:03,520 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,520 INFO | (200, 512)
2021-05-27 16:56:03,520 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,521 INFO | (200, 512)
2021-05-27 16:56:03,527 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,527 INFO | (200, 512)
2021-05-27 16:56:03,528 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,529 INFO | (200, 512)
2021-05-27 16:56:03,535 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,535 INFO | (200, 512)
2021-05-27 16:56:03,536 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,536 INFO | (200, 512)
2021-05-27 16:56:03,543 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,543 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 59/574 [00:06<00:54,  9.44it/s]

2021-05-27 16:56:03,614 INFO | INITIAL
2021-05-27 16:56:03,614 INFO | (50, 200)
2021-05-27 16:56:03,620 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,620 INFO | (50, 200, 512)
2021-05-27 16:56:03,622 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,622 INFO | (50, 200, 512)
2021-05-27 16:56:03,623 INFO | BERT LAYER
2021-05-27 16:56:03,623 INFO | (200, 512)
2021-05-27 16:56:03,624 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,624 INFO | (200, 512)
2021-05-27 16:56:03,625 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,625 INFO | (200, 512)
2021-05-27 16:56:03,632 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,633 INFO | (200, 512)
2021-05-27 16:56:03,633 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,634 INFO | (200, 512)
2021-05-27 16:56:03,639 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,639 INFO | (200, 512)
2021-05-27 16:56:03,640 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,640 INFO | (200, 512)
2021-05-27 16:56:03,647 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,647 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  10%|█         | 60/574 [00:06<00:53,  9.53it/s]

2021-05-27 16:56:03,715 INFO | INITIAL
2021-05-27 16:56:03,716 INFO | (50, 200)
2021-05-27 16:56:03,721 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,721 INFO | (50, 200, 512)
2021-05-27 16:56:03,722 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,723 INFO | (50, 200, 512)
2021-05-27 16:56:03,723 INFO | BERT LAYER
2021-05-27 16:56:03,725 INFO | (200, 512)
2021-05-27 16:56:03,726 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,726 INFO | (200, 512)
2021-05-27 16:56:03,726 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,727 INFO | (200, 512)
2021-05-27 16:56:03,732 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,732 INFO | (200, 512)
2021-05-27 16:56:03,733 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,733 INFO | (200, 512)
2021-05-27 16:56:03,738 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,738 INFO | (200, 512)
2021-05-27 16:56:03,739 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,739 INFO | (200, 512)
2021-05-27 16:56:03,744 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,745 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 61/574 [00:06<00:53,  9.65it/s]

2021-05-27 16:56:03,816 INFO | INITIAL
2021-05-27 16:56:03,816 INFO | (50, 200)
2021-05-27 16:56:03,820 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,821 INFO | (50, 200, 512)
2021-05-27 16:56:03,822 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,822 INFO | (50, 200, 512)
2021-05-27 16:56:03,823 INFO | BERT LAYER
2021-05-27 16:56:03,824 INFO | (200, 512)
2021-05-27 16:56:03,825 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,825 INFO | (200, 512)
2021-05-27 16:56:03,826 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,826 INFO | (200, 512)
2021-05-27 16:56:03,832 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,832 INFO | (200, 512)
2021-05-27 16:56:03,833 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,833 INFO | (200, 512)
2021-05-27 16:56:03,839 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,839 INFO | (200, 512)
2021-05-27 16:56:03,840 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,841 INFO | (200, 512)
2021-05-27 16:56:03,846 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,846 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 62/574 [00:06<00:53,  9.61it/s]

2021-05-27 16:56:03,921 INFO | INITIAL
2021-05-27 16:56:03,921 INFO | (50, 200)
2021-05-27 16:56:03,926 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:03,927 INFO | (50, 200, 512)
2021-05-27 16:56:03,928 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:03,929 INFO | (50, 200, 512)
2021-05-27 16:56:03,930 INFO | BERT LAYER
2021-05-27 16:56:03,930 INFO | (200, 512)
2021-05-27 16:56:03,931 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,931 INFO | (200, 512)
2021-05-27 16:56:03,932 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,932 INFO | (200, 512)
2021-05-27 16:56:03,936 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,937 INFO | (200, 512)
2021-05-27 16:56:03,937 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,938 INFO | (200, 512)
2021-05-27 16:56:03,944 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,945 INFO | (200, 512)
2021-05-27 16:56:03,945 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:03,945 INFO | (200, 512)
2021-05-27 16:56:03,950 INFO | BERT LAYER LOOP
2021-05-27 16:56:03,951 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 63/574 [00:06<00:53,  9.64it/s]

2021-05-27 16:56:04,024 INFO | INITIAL
2021-05-27 16:56:04,025 INFO | (50, 200)
2021-05-27 16:56:04,031 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,032 INFO | (50, 200, 512)
2021-05-27 16:56:04,034 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,034 INFO | (50, 200, 512)
2021-05-27 16:56:04,035 INFO | BERT LAYER
2021-05-27 16:56:04,035 INFO | (200, 512)
2021-05-27 16:56:04,036 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,036 INFO | (200, 512)
2021-05-27 16:56:04,036 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,037 INFO | (200, 512)
2021-05-27 16:56:04,043 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,044 INFO | (200, 512)
2021-05-27 16:56:04,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,045 INFO | (200, 512)
2021-05-27 16:56:04,052 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,053 INFO | (200, 512)
2021-05-27 16:56:04,053 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,054 INFO | (200, 512)
2021-05-27 16:56:04,060 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,061 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█         | 64/574 [00:06<00:53,  9.53it/s]

2021-05-27 16:56:04,132 INFO | INITIAL
2021-05-27 16:56:04,132 INFO | (50, 200)
2021-05-27 16:56:04,137 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,138 INFO | (50, 200, 512)
2021-05-27 16:56:04,139 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,142 INFO | (50, 200, 512)
2021-05-27 16:56:04,143 INFO | BERT LAYER
2021-05-27 16:56:04,144 INFO | (200, 512)
2021-05-27 16:56:04,144 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,144 INFO | (200, 512)
2021-05-27 16:56:04,145 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,146 INFO | (200, 512)
2021-05-27 16:56:04,153 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,153 INFO | (200, 512)
2021-05-27 16:56:04,154 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,154 INFO | (200, 512)
2021-05-27 16:56:04,161 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,161 INFO | (200, 512)
2021-05-27 16:56:04,162 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,163 INFO | (200, 512)
2021-05-27 16:56:04,170 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,171 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█▏        | 65/574 [00:06<00:54,  9.34it/s]

2021-05-27 16:56:04,244 INFO | INITIAL
2021-05-27 16:56:04,245 INFO | (50, 200)
2021-05-27 16:56:04,251 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,251 INFO | (50, 200, 512)
2021-05-27 16:56:04,253 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,253 INFO | (50, 200, 512)
2021-05-27 16:56:04,254 INFO | BERT LAYER
2021-05-27 16:56:04,254 INFO | (200, 512)
2021-05-27 16:56:04,255 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,255 INFO | (200, 512)
2021-05-27 16:56:04,256 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,256 INFO | (200, 512)
2021-05-27 16:56:04,263 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,264 INFO | (200, 512)
2021-05-27 16:56:04,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,265 INFO | (200, 512)
2021-05-27 16:56:04,271 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,271 INFO | (200, 512)
2021-05-27 16:56:04,272 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,272 INFO | (200, 512)
2021-05-27 16:56:04,279 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,279 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  11%|█▏        | 66/574 [00:07<01:08,  7.44it/s]

2021-05-27 16:56:04,444 INFO | INITIAL
2021-05-27 16:56:04,444 INFO | (50, 200)
2021-05-27 16:56:04,450 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,450 INFO | (50, 200, 512)
2021-05-27 16:56:04,452 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,452 INFO | (50, 200, 512)
2021-05-27 16:56:04,453 INFO | BERT LAYER
2021-05-27 16:56:04,454 INFO | (200, 512)
2021-05-27 16:56:04,454 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,455 INFO | (200, 512)
2021-05-27 16:56:04,455 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,456 INFO | (200, 512)
2021-05-27 16:56:04,462 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,462 INFO | (200, 512)
2021-05-27 16:56:04,462 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,463 INFO | (200, 512)
2021-05-27 16:56:04,468 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,468 INFO | (200, 512)
2021-05-27 16:56:04,469 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,469 INFO | (200, 512)
2021-05-27 16:56:04,474 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,474 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 67/574 [00:07<01:03,  8.02it/s]

2021-05-27 16:56:04,545 INFO | INITIAL
2021-05-27 16:56:04,545 INFO | (50, 200)
2021-05-27 16:56:04,550 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,550 INFO | (50, 200, 512)
2021-05-27 16:56:04,552 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,552 INFO | (50, 200, 512)
2021-05-27 16:56:04,553 INFO | BERT LAYER
2021-05-27 16:56:04,553 INFO | (200, 512)
2021-05-27 16:56:04,553 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,554 INFO | (200, 512)
2021-05-27 16:56:04,554 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,554 INFO | (200, 512)
2021-05-27 16:56:04,560 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,561 INFO | (200, 512)
2021-05-27 16:56:04,561 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,562 INFO | (200, 512)
2021-05-27 16:56:04,568 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,569 INFO | (200, 512)
2021-05-27 16:56:04,569 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,569 INFO | (200, 512)
2021-05-27 16:56:04,575 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,575 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 68/574 [00:07<01:00,  8.43it/s]

2021-05-27 16:56:04,650 INFO | INITIAL
2021-05-27 16:56:04,650 INFO | (50, 200)
2021-05-27 16:56:04,655 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,655 INFO | (50, 200, 512)
2021-05-27 16:56:04,657 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,658 INFO | (50, 200, 512)
2021-05-27 16:56:04,659 INFO | BERT LAYER
2021-05-27 16:56:04,660 INFO | (200, 512)
2021-05-27 16:56:04,660 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,660 INFO | (200, 512)
2021-05-27 16:56:04,661 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,661 INFO | (200, 512)
2021-05-27 16:56:04,666 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,667 INFO | (200, 512)
2021-05-27 16:56:04,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,668 INFO | (200, 512)
2021-05-27 16:56:04,673 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,674 INFO | (200, 512)
2021-05-27 16:56:04,674 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,675 INFO | (200, 512)
2021-05-27 16:56:04,680 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,680 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 69/574 [00:07<00:57,  8.78it/s]

2021-05-27 16:56:04,752 INFO | INITIAL
2021-05-27 16:56:04,753 INFO | (50, 200)
2021-05-27 16:56:04,758 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,759 INFO | (50, 200, 512)
2021-05-27 16:56:04,761 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,762 INFO | (50, 200, 512)
2021-05-27 16:56:04,763 INFO | BERT LAYER
2021-05-27 16:56:04,763 INFO | (200, 512)
2021-05-27 16:56:04,764 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,764 INFO | (200, 512)
2021-05-27 16:56:04,765 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,766 INFO | (200, 512)
2021-05-27 16:56:04,773 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,773 INFO | (200, 512)
2021-05-27 16:56:04,774 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,775 INFO | (200, 512)
2021-05-27 16:56:04,783 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,783 INFO | (200, 512)
2021-05-27 16:56:04,784 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,784 INFO | (200, 512)
2021-05-27 16:56:04,790 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,791 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 70/574 [00:07<00:57,  8.81it/s]

2021-05-27 16:56:04,865 INFO | INITIAL
2021-05-27 16:56:04,865 INFO | (50, 200)
2021-05-27 16:56:04,870 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,871 INFO | (50, 200, 512)
2021-05-27 16:56:04,872 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:04,879 INFO | (50, 200, 512)
2021-05-27 16:56:04,885 INFO | BERT LAYER
2021-05-27 16:56:04,886 INFO | (200, 512)
2021-05-27 16:56:04,892 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,893 INFO | (200, 512)
2021-05-27 16:56:04,895 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,895 INFO | (200, 512)
2021-05-27 16:56:04,902 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,903 INFO | (200, 512)
2021-05-27 16:56:04,903 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,904 INFO | (200, 512)
2021-05-27 16:56:04,911 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,912 INFO | (200, 512)
2021-05-27 16:56:04,912 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:04,913 INFO | (200, 512)
2021-05-27 16:56:04,918 INFO | BERT LAYER LOOP
2021-05-27 16:56:04,918 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  12%|█▏        | 71/574 [00:07<00:58,  8.57it/s]

2021-05-27 16:56:04,989 INFO | INITIAL
2021-05-27 16:56:04,990 INFO | (50, 200)
2021-05-27 16:56:04,999 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:04,999 INFO | (50, 200, 512)
2021-05-27 16:56:05,001 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,001 INFO | (50, 200, 512)
2021-05-27 16:56:05,002 INFO | BERT LAYER
2021-05-27 16:56:05,002 INFO | (200, 512)
2021-05-27 16:56:05,003 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,003 INFO | (200, 512)
2021-05-27 16:56:05,003 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,004 INFO | (200, 512)
2021-05-27 16:56:05,010 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,011 INFO | (200, 512)
2021-05-27 16:56:05,011 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,012 INFO | (200, 512)
2021-05-27 16:56:05,017 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,017 INFO | (200, 512)
2021-05-27 16:56:05,018 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,018 INFO | (200, 512)
2021-05-27 16:56:05,024 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,025 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 72/574 [00:07<00:58,  8.59it/s]

2021-05-27 16:56:05,105 INFO | INITIAL
2021-05-27 16:56:05,105 INFO | (50, 200)
2021-05-27 16:56:05,113 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,113 INFO | (50, 200, 512)
2021-05-27 16:56:05,115 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,115 INFO | (50, 200, 512)
2021-05-27 16:56:05,116 INFO | BERT LAYER
2021-05-27 16:56:05,117 INFO | (200, 512)
2021-05-27 16:56:05,117 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,117 INFO | (200, 512)
2021-05-27 16:56:05,118 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,119 INFO | (200, 512)
2021-05-27 16:56:05,125 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,126 INFO | (200, 512)
2021-05-27 16:56:05,127 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,127 INFO | (200, 512)
2021-05-27 16:56:05,132 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,133 INFO | (200, 512)
2021-05-27 16:56:05,133 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,134 INFO | (200, 512)
2021-05-27 16:56:05,139 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,140 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 73/574 [00:07<00:58,  8.60it/s]

2021-05-27 16:56:05,221 INFO | INITIAL
2021-05-27 16:56:05,221 INFO | (50, 200)
2021-05-27 16:56:05,228 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,228 INFO | (50, 200, 512)
2021-05-27 16:56:05,230 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,231 INFO | (50, 200, 512)
2021-05-27 16:56:05,232 INFO | BERT LAYER
2021-05-27 16:56:05,233 INFO | (200, 512)
2021-05-27 16:56:05,234 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,234 INFO | (200, 512)
2021-05-27 16:56:05,235 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,236 INFO | (200, 512)
2021-05-27 16:56:05,242 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,242 INFO | (200, 512)
2021-05-27 16:56:05,243 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,243 INFO | (200, 512)
2021-05-27 16:56:05,250 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,251 INFO | (200, 512)
2021-05-27 16:56:05,251 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,252 INFO | (200, 512)
2021-05-27 16:56:05,258 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,259 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 74/574 [00:08<00:57,  8.68it/s]

2021-05-27 16:56:05,333 INFO | INITIAL
2021-05-27 16:56:05,334 INFO | (50, 200)
2021-05-27 16:56:05,339 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,340 INFO | (50, 200, 512)
2021-05-27 16:56:05,341 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,342 INFO | (50, 200, 512)
2021-05-27 16:56:05,343 INFO | BERT LAYER
2021-05-27 16:56:05,343 INFO | (200, 512)
2021-05-27 16:56:05,344 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,345 INFO | (200, 512)
2021-05-27 16:56:05,345 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,345 INFO | (200, 512)
2021-05-27 16:56:05,351 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,352 INFO | (200, 512)
2021-05-27 16:56:05,352 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,352 INFO | (200, 512)
2021-05-27 16:56:05,358 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,359 INFO | (200, 512)
2021-05-27 16:56:05,359 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,360 INFO | (200, 512)
2021-05-27 16:56:05,367 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,368 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 75/574 [00:08<00:55,  8.92it/s]

2021-05-27 16:56:05,438 INFO | INITIAL
2021-05-27 16:56:05,439 INFO | (50, 200)
2021-05-27 16:56:05,444 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,445 INFO | (50, 200, 512)
2021-05-27 16:56:05,446 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,447 INFO | (50, 200, 512)
2021-05-27 16:56:05,448 INFO | BERT LAYER
2021-05-27 16:56:05,448 INFO | (200, 512)
2021-05-27 16:56:05,448 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,449 INFO | (200, 512)
2021-05-27 16:56:05,449 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,450 INFO | (200, 512)
2021-05-27 16:56:05,456 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,457 INFO | (200, 512)
2021-05-27 16:56:05,458 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,459 INFO | (200, 512)
2021-05-27 16:56:05,466 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,466 INFO | (200, 512)
2021-05-27 16:56:05,467 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,468 INFO | (200, 512)
2021-05-27 16:56:05,475 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,475 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 76/574 [00:08<00:55,  9.03it/s]

2021-05-27 16:56:05,546 INFO | INITIAL
2021-05-27 16:56:05,546 INFO | (50, 200)
2021-05-27 16:56:05,552 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,552 INFO | (50, 200, 512)
2021-05-27 16:56:05,554 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,554 INFO | (50, 200, 512)
2021-05-27 16:56:05,555 INFO | BERT LAYER
2021-05-27 16:56:05,555 INFO | (200, 512)
2021-05-27 16:56:05,556 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,556 INFO | (200, 512)
2021-05-27 16:56:05,556 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,558 INFO | (200, 512)
2021-05-27 16:56:05,565 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,566 INFO | (200, 512)
2021-05-27 16:56:05,567 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,567 INFO | (200, 512)
2021-05-27 16:56:05,575 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,575 INFO | (200, 512)
2021-05-27 16:56:05,576 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,576 INFO | (200, 512)
2021-05-27 16:56:05,583 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,583 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  13%|█▎        | 77/574 [00:08<00:55,  8.95it/s]

2021-05-27 16:56:05,661 INFO | INITIAL
2021-05-27 16:56:05,662 INFO | (50, 200)
2021-05-27 16:56:05,667 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,668 INFO | (50, 200, 512)
2021-05-27 16:56:05,669 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,670 INFO | (50, 200, 512)
2021-05-27 16:56:05,670 INFO | BERT LAYER
2021-05-27 16:56:05,671 INFO | (200, 512)
2021-05-27 16:56:05,671 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,672 INFO | (200, 512)
2021-05-27 16:56:05,672 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,672 INFO | (200, 512)
2021-05-27 16:56:05,679 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,680 INFO | (200, 512)
2021-05-27 16:56:05,680 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,681 INFO | (200, 512)
2021-05-27 16:56:05,686 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,686 INFO | (200, 512)
2021-05-27 16:56:05,687 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,687 INFO | (200, 512)
2021-05-27 16:56:05,693 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,694 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▎        | 78/574 [00:08<00:55,  9.00it/s]

2021-05-27 16:56:05,770 INFO | INITIAL
2021-05-27 16:56:05,770 INFO | (50, 200)
2021-05-27 16:56:05,776 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,777 INFO | (50, 200, 512)
2021-05-27 16:56:05,778 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,779 INFO | (50, 200, 512)
2021-05-27 16:56:05,780 INFO | BERT LAYER
2021-05-27 16:56:05,781 INFO | (200, 512)
2021-05-27 16:56:05,781 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,782 INFO | (200, 512)
2021-05-27 16:56:05,782 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,783 INFO | (200, 512)
2021-05-27 16:56:05,789 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,790 INFO | (200, 512)
2021-05-27 16:56:05,792 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,794 INFO | (200, 512)
2021-05-27 16:56:05,801 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,801 INFO | (200, 512)
2021-05-27 16:56:05,801 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,802 INFO | (200, 512)
2021-05-27 16:56:05,808 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,809 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 79/574 [00:08<00:54,  9.02it/s]

2021-05-27 16:56:05,880 INFO | INITIAL
2021-05-27 16:56:05,880 INFO | (50, 200)
2021-05-27 16:56:05,885 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,886 INFO | (50, 200, 512)
2021-05-27 16:56:05,887 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,888 INFO | (50, 200, 512)
2021-05-27 16:56:05,888 INFO | BERT LAYER
2021-05-27 16:56:05,889 INFO | (200, 512)
2021-05-27 16:56:05,889 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,889 INFO | (200, 512)
2021-05-27 16:56:05,890 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,890 INFO | (200, 512)
2021-05-27 16:56:05,896 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,897 INFO | (200, 512)
2021-05-27 16:56:05,897 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,898 INFO | (200, 512)
2021-05-27 16:56:05,903 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,904 INFO | (200, 512)
2021-05-27 16:56:05,904 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,904 INFO | (200, 512)
2021-05-27 16:56:05,910 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,911 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 80/574 [00:08<00:53,  9.20it/s]

2021-05-27 16:56:05,983 INFO | INITIAL
2021-05-27 16:56:05,984 INFO | (50, 200)
2021-05-27 16:56:05,989 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:05,990 INFO | (50, 200, 512)
2021-05-27 16:56:05,992 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:05,992 INFO | (50, 200, 512)
2021-05-27 16:56:05,993 INFO | BERT LAYER
2021-05-27 16:56:05,994 INFO | (200, 512)
2021-05-27 16:56:05,994 INFO | BERT LAYER LOOP
2021-05-27 16:56:05,995 INFO | (200, 512)
2021-05-27 16:56:05,996 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:05,996 INFO | (200, 512)
2021-05-27 16:56:06,001 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,002 INFO | (200, 512)
2021-05-27 16:56:06,002 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,004 INFO | (200, 512)
2021-05-27 16:56:06,009 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,009 INFO | (200, 512)
2021-05-27 16:56:06,011 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,011 INFO | (200, 512)
2021-05-27 16:56:06,017 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,018 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 81/574 [00:08<00:53,  9.16it/s]

2021-05-27 16:56:06,094 INFO | INITIAL
2021-05-27 16:56:06,095 INFO | (50, 200)
2021-05-27 16:56:06,102 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,103 INFO | (50, 200, 512)
2021-05-27 16:56:06,104 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,105 INFO | (50, 200, 512)
2021-05-27 16:56:06,106 INFO | BERT LAYER
2021-05-27 16:56:06,107 INFO | (200, 512)
2021-05-27 16:56:06,108 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,108 INFO | (200, 512)
2021-05-27 16:56:06,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,109 INFO | (200, 512)
2021-05-27 16:56:06,116 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,117 INFO | (200, 512)
2021-05-27 16:56:06,117 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,118 INFO | (200, 512)
2021-05-27 16:56:06,123 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,123 INFO | (200, 512)
2021-05-27 16:56:06,124 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,125 INFO | (200, 512)
2021-05-27 16:56:06,131 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,131 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 82/574 [00:08<00:54,  9.05it/s]

2021-05-27 16:56:06,208 INFO | INITIAL
2021-05-27 16:56:06,209 INFO | (50, 200)
2021-05-27 16:56:06,215 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,215 INFO | (50, 200, 512)
2021-05-27 16:56:06,217 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,217 INFO | (50, 200, 512)
2021-05-27 16:56:06,218 INFO | BERT LAYER
2021-05-27 16:56:06,219 INFO | (200, 512)
2021-05-27 16:56:06,219 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,219 INFO | (200, 512)
2021-05-27 16:56:06,220 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,220 INFO | (200, 512)
2021-05-27 16:56:06,228 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,228 INFO | (200, 512)
2021-05-27 16:56:06,229 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,229 INFO | (200, 512)
2021-05-27 16:56:06,237 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,237 INFO | (200, 512)
2021-05-27 16:56:06,238 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,238 INFO | (200, 512)
2021-05-27 16:56:06,246 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,247 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  14%|█▍        | 83/574 [00:08<00:53,  9.16it/s]

2021-05-27 16:56:06,314 INFO | INITIAL
2021-05-27 16:56:06,314 INFO | (50, 200)
2021-05-27 16:56:06,319 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,319 INFO | (50, 200, 512)
2021-05-27 16:56:06,321 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,321 INFO | (50, 200, 512)
2021-05-27 16:56:06,322 INFO | BERT LAYER
2021-05-27 16:56:06,322 INFO | (200, 512)
2021-05-27 16:56:06,322 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,323 INFO | (200, 512)
2021-05-27 16:56:06,323 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,324 INFO | (200, 512)
2021-05-27 16:56:06,331 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,332 INFO | (200, 512)
2021-05-27 16:56:06,332 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,333 INFO | (200, 512)
2021-05-27 16:56:06,340 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,340 INFO | (200, 512)
2021-05-27 16:56:06,341 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,341 INFO | (200, 512)
2021-05-27 16:56:06,347 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,348 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 84/574 [00:09<00:53,  9.20it/s]

2021-05-27 16:56:06,421 INFO | INITIAL
2021-05-27 16:56:06,421 INFO | (50, 200)
2021-05-27 16:56:06,427 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,428 INFO | (50, 200, 512)
2021-05-27 16:56:06,430 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,430 INFO | (50, 200, 512)
2021-05-27 16:56:06,431 INFO | BERT LAYER
2021-05-27 16:56:06,432 INFO | (200, 512)
2021-05-27 16:56:06,434 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,434 INFO | (200, 512)
2021-05-27 16:56:06,435 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,435 INFO | (200, 512)
2021-05-27 16:56:06,442 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,443 INFO | (200, 512)
2021-05-27 16:56:06,443 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,444 INFO | (200, 512)
2021-05-27 16:56:06,449 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,449 INFO | (200, 512)
2021-05-27 16:56:06,450 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,450 INFO | (200, 512)
2021-05-27 16:56:06,455 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,455 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 85/574 [00:09<00:53,  9.15it/s]

2021-05-27 16:56:06,532 INFO | INITIAL
2021-05-27 16:56:06,532 INFO | (50, 200)
2021-05-27 16:56:06,537 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,537 INFO | (50, 200, 512)
2021-05-27 16:56:06,539 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,539 INFO | (50, 200, 512)
2021-05-27 16:56:06,540 INFO | BERT LAYER
2021-05-27 16:56:06,540 INFO | (200, 512)
2021-05-27 16:56:06,540 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,541 INFO | (200, 512)
2021-05-27 16:56:06,541 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,542 INFO | (200, 512)
2021-05-27 16:56:06,548 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,548 INFO | (200, 512)
2021-05-27 16:56:06,549 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,549 INFO | (200, 512)
2021-05-27 16:56:06,554 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,554 INFO | (200, 512)
2021-05-27 16:56:06,555 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,555 INFO | (200, 512)
2021-05-27 16:56:06,560 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,561 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  15%|█▍        | 86/574 [00:09<00:52,  9.21it/s]

2021-05-27 16:56:06,639 INFO | INITIAL
2021-05-27 16:56:06,639 INFO | (50, 200)
2021-05-27 16:56:06,645 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,646 INFO | (50, 200, 512)
2021-05-27 16:56:06,647 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,648 INFO | (50, 200, 512)
2021-05-27 16:56:06,648 INFO | BERT LAYER
2021-05-27 16:56:06,649 INFO | (200, 512)
2021-05-27 16:56:06,650 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,650 INFO | (200, 512)
2021-05-27 16:56:06,651 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,651 INFO | (200, 512)
2021-05-27 16:56:06,657 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,657 INFO | (200, 512)
2021-05-27 16:56:06,658 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,658 INFO | (200, 512)
2021-05-27 16:56:06,665 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,666 INFO | (200, 512)
2021-05-27 16:56:06,666 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,667 INFO | (200, 512)
2021-05-27 16:56:06,673 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,673 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  15%|█▌        | 87/574 [00:09<00:54,  9.01it/s]

2021-05-27 16:56:06,755 INFO | INITIAL
2021-05-27 16:56:06,756 INFO | (50, 200)
2021-05-27 16:56:06,764 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,764 INFO | (50, 200, 512)
2021-05-27 16:56:06,766 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,767 INFO | (50, 200, 512)
2021-05-27 16:56:06,767 INFO | BERT LAYER
2021-05-27 16:56:06,768 INFO | (200, 512)
2021-05-27 16:56:06,768 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,768 INFO | (200, 512)
2021-05-27 16:56:06,769 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,770 INFO | (200, 512)
2021-05-27 16:56:06,777 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,777 INFO | (200, 512)
2021-05-27 16:56:06,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,778 INFO | (200, 512)
2021-05-27 16:56:06,784 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,785 INFO | (200, 512)
2021-05-27 16:56:06,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,785 INFO | (200, 512)
2021-05-27 16:56:06,792 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,792 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  15%|█▌        | 88/574 [00:09<00:53,  9.00it/s]

2021-05-27 16:56:06,867 INFO | INITIAL
2021-05-27 16:56:06,867 INFO | (50, 200)
2021-05-27 16:56:06,874 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,874 INFO | (50, 200, 512)
2021-05-27 16:56:06,876 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,876 INFO | (50, 200, 512)
2021-05-27 16:56:06,877 INFO | BERT LAYER
2021-05-27 16:56:06,877 INFO | (200, 512)
2021-05-27 16:56:06,878 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,878 INFO | (200, 512)
2021-05-27 16:56:06,879 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,879 INFO | (200, 512)
2021-05-27 16:56:06,886 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,886 INFO | (200, 512)
2021-05-27 16:56:06,887 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,887 INFO | (200, 512)
2021-05-27 16:56:06,895 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,896 INFO | (200, 512)
2021-05-27 16:56:06,896 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,897 INFO | (200, 512)
2021-05-27 16:56:06,904 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,905 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 89/574 [00:09<00:53,  9.06it/s]

2021-05-27 16:56:06,975 INFO | INITIAL
2021-05-27 16:56:06,976 INFO | (50, 200)
2021-05-27 16:56:06,981 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:06,982 INFO | (50, 200, 512)
2021-05-27 16:56:06,983 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:06,984 INFO | (50, 200, 512)
2021-05-27 16:56:06,984 INFO | BERT LAYER
2021-05-27 16:56:06,985 INFO | (200, 512)
2021-05-27 16:56:06,985 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,985 INFO | (200, 512)
2021-05-27 16:56:06,986 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,986 INFO | (200, 512)
2021-05-27 16:56:06,993 INFO | BERT LAYER LOOP
2021-05-27 16:56:06,994 INFO | (200, 512)
2021-05-27 16:56:06,994 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:06,995 INFO | (200, 512)
2021-05-27 16:56:07,001 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,002 INFO | (200, 512)
2021-05-27 16:56:07,002 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,003 INFO | (200, 512)
2021-05-27 16:56:07,010 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,010 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 90/574 [00:09<00:52,  9.17it/s]

2021-05-27 16:56:07,081 INFO | INITIAL
2021-05-27 16:56:07,082 INFO | (50, 200)
2021-05-27 16:56:07,087 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,087 INFO | (50, 200, 512)
2021-05-27 16:56:07,089 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,089 INFO | (50, 200, 512)
2021-05-27 16:56:07,091 INFO | BERT LAYER
2021-05-27 16:56:07,092 INFO | (200, 512)
2021-05-27 16:56:07,092 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,093 INFO | (200, 512)
2021-05-27 16:56:07,093 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,094 INFO | (200, 512)
2021-05-27 16:56:07,099 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,100 INFO | (200, 512)
2021-05-27 16:56:07,100 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,101 INFO | (200, 512)
2021-05-27 16:56:07,108 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,109 INFO | (200, 512)
2021-05-27 16:56:07,109 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,110 INFO | (200, 512)
2021-05-27 16:56:07,116 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,117 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 91/574 [00:09<00:53,  8.97it/s]

2021-05-27 16:56:07,199 INFO | INITIAL
2021-05-27 16:56:07,199 INFO | (50, 200)
2021-05-27 16:56:07,205 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,205 INFO | (50, 200, 512)
2021-05-27 16:56:07,207 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,208 INFO | (50, 200, 512)
2021-05-27 16:56:07,209 INFO | BERT LAYER
2021-05-27 16:56:07,210 INFO | (200, 512)
2021-05-27 16:56:07,211 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,211 INFO | (200, 512)
2021-05-27 16:56:07,212 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,212 INFO | (200, 512)
2021-05-27 16:56:07,219 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,219 INFO | (200, 512)
2021-05-27 16:56:07,220 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,220 INFO | (200, 512)
2021-05-27 16:56:07,227 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,227 INFO | (200, 512)
2021-05-27 16:56:07,228 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,228 INFO | (200, 512)
2021-05-27 16:56:07,235 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,236 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 92/574 [00:10<00:55,  8.72it/s]

2021-05-27 16:56:07,321 INFO | INITIAL
2021-05-27 16:56:07,321 INFO | (50, 200)
2021-05-27 16:56:07,328 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,328 INFO | (50, 200, 512)
2021-05-27 16:56:07,330 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,330 INFO | (50, 200, 512)
2021-05-27 16:56:07,331 INFO | BERT LAYER
2021-05-27 16:56:07,331 INFO | (200, 512)
2021-05-27 16:56:07,332 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,332 INFO | (200, 512)
2021-05-27 16:56:07,333 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,333 INFO | (200, 512)
2021-05-27 16:56:07,340 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,340 INFO | (200, 512)
2021-05-27 16:56:07,341 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,341 INFO | (200, 512)
2021-05-27 16:56:07,349 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,349 INFO | (200, 512)
2021-05-27 16:56:07,349 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,350 INFO | (200, 512)
2021-05-27 16:56:07,356 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,357 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▌        | 93/574 [00:10<00:54,  8.81it/s]

2021-05-27 16:56:07,432 INFO | INITIAL
2021-05-27 16:56:07,432 INFO | (50, 200)
2021-05-27 16:56:07,439 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,441 INFO | (50, 200, 512)
2021-05-27 16:56:07,442 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,443 INFO | (50, 200, 512)
2021-05-27 16:56:07,444 INFO | BERT LAYER
2021-05-27 16:56:07,445 INFO | (200, 512)
2021-05-27 16:56:07,445 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,446 INFO | (200, 512)
2021-05-27 16:56:07,446 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,447 INFO | (200, 512)
2021-05-27 16:56:07,453 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,454 INFO | (200, 512)
2021-05-27 16:56:07,454 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,454 INFO | (200, 512)
2021-05-27 16:56:07,461 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,462 INFO | (200, 512)
2021-05-27 16:56:07,462 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,463 INFO | (200, 512)
2021-05-27 16:56:07,468 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,469 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  16%|█▋        | 94/574 [00:10<00:53,  8.89it/s]

2021-05-27 16:56:07,542 INFO | INITIAL
2021-05-27 16:56:07,542 INFO | (50, 200)
2021-05-27 16:56:07,548 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,548 INFO | (50, 200, 512)
2021-05-27 16:56:07,550 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,550 INFO | (50, 200, 512)
2021-05-27 16:56:07,551 INFO | BERT LAYER
2021-05-27 16:56:07,551 INFO | (200, 512)
2021-05-27 16:56:07,552 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,552 INFO | (200, 512)
2021-05-27 16:56:07,552 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,553 INFO | (200, 512)
2021-05-27 16:56:07,558 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,559 INFO | (200, 512)
2021-05-27 16:56:07,559 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,560 INFO | (200, 512)
2021-05-27 16:56:07,565 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,565 INFO | (200, 512)
2021-05-27 16:56:07,566 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,566 INFO | (200, 512)
2021-05-27 16:56:07,572 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,573 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 95/574 [00:10<00:53,  8.92it/s]

2021-05-27 16:56:07,653 INFO | INITIAL
2021-05-27 16:56:07,653 INFO | (50, 200)
2021-05-27 16:56:07,660 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,661 INFO | (50, 200, 512)
2021-05-27 16:56:07,664 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,664 INFO | (50, 200, 512)
2021-05-27 16:56:07,665 INFO | BERT LAYER
2021-05-27 16:56:07,665 INFO | (200, 512)
2021-05-27 16:56:07,666 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,666 INFO | (200, 512)
2021-05-27 16:56:07,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,668 INFO | (200, 512)
2021-05-27 16:56:07,675 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,675 INFO | (200, 512)
2021-05-27 16:56:07,676 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,676 INFO | (200, 512)
2021-05-27 16:56:07,683 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,684 INFO | (200, 512)
2021-05-27 16:56:07,685 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,685 INFO | (200, 512)
2021-05-27 16:56:07,691 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,691 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 96/574 [00:10<00:53,  8.92it/s]

2021-05-27 16:56:07,765 INFO | INITIAL
2021-05-27 16:56:07,766 INFO | (50, 200)
2021-05-27 16:56:07,773 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,774 INFO | (50, 200, 512)
2021-05-27 16:56:07,775 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,776 INFO | (50, 200, 512)
2021-05-27 16:56:07,777 INFO | BERT LAYER
2021-05-27 16:56:07,777 INFO | (200, 512)
2021-05-27 16:56:07,778 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,778 INFO | (200, 512)
2021-05-27 16:56:07,778 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,779 INFO | (200, 512)
2021-05-27 16:56:07,784 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,785 INFO | (200, 512)
2021-05-27 16:56:07,785 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,786 INFO | (200, 512)
2021-05-27 16:56:07,792 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,793 INFO | (200, 512)
2021-05-27 16:56:07,794 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,794 INFO | (200, 512)
2021-05-27 16:56:07,801 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,801 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 97/574 [00:10<00:52,  9.01it/s]

2021-05-27 16:56:07,874 INFO | INITIAL
2021-05-27 16:56:07,874 INFO | (50, 200)
2021-05-27 16:56:07,880 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,881 INFO | (50, 200, 512)
2021-05-27 16:56:07,882 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:07,883 INFO | (50, 200, 512)
2021-05-27 16:56:07,884 INFO | BERT LAYER
2021-05-27 16:56:07,885 INFO | (200, 512)
2021-05-27 16:56:07,885 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,886 INFO | (200, 512)
2021-05-27 16:56:07,886 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,887 INFO | (200, 512)
2021-05-27 16:56:07,894 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,895 INFO | (200, 512)
2021-05-27 16:56:07,895 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,896 INFO | (200, 512)
2021-05-27 16:56:07,904 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,905 INFO | (200, 512)
2021-05-27 16:56:07,906 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:07,907 INFO | (200, 512)
2021-05-27 16:56:07,914 INFO | BERT LAYER LOOP
2021-05-27 16:56:07,915 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 98/574 [00:10<00:53,  8.87it/s]

2021-05-27 16:56:07,990 INFO | INITIAL
2021-05-27 16:56:07,991 INFO | (50, 200)
2021-05-27 16:56:07,998 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:07,999 INFO | (50, 200, 512)
2021-05-27 16:56:08,002 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,003 INFO | (50, 200, 512)
2021-05-27 16:56:08,004 INFO | BERT LAYER
2021-05-27 16:56:08,004 INFO | (200, 512)
2021-05-27 16:56:08,005 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,005 INFO | (200, 512)
2021-05-27 16:56:08,005 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,006 INFO | (200, 512)
2021-05-27 16:56:08,013 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,013 INFO | (200, 512)
2021-05-27 16:56:08,014 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,014 INFO | (200, 512)
2021-05-27 16:56:08,021 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,022 INFO | (200, 512)
2021-05-27 16:56:08,022 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,022 INFO | (200, 512)
2021-05-27 16:56:08,029 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,029 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 99/574 [00:10<00:53,  8.83it/s]

2021-05-27 16:56:08,105 INFO | INITIAL
2021-05-27 16:56:08,106 INFO | (50, 200)
2021-05-27 16:56:08,113 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,114 INFO | (50, 200, 512)
2021-05-27 16:56:08,115 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,116 INFO | (50, 200, 512)
2021-05-27 16:56:08,117 INFO | BERT LAYER
2021-05-27 16:56:08,118 INFO | (200, 512)
2021-05-27 16:56:08,118 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,118 INFO | (200, 512)
2021-05-27 16:56:08,119 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,120 INFO | (200, 512)
2021-05-27 16:56:08,126 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,127 INFO | (200, 512)
2021-05-27 16:56:08,127 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,128 INFO | (200, 512)
2021-05-27 16:56:08,135 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,135 INFO | (200, 512)
2021-05-27 16:56:08,136 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,136 INFO | (200, 512)
2021-05-27 16:56:08,142 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,142 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  17%|█▋        | 100/574 [00:10<00:53,  8.94it/s]

2021-05-27 16:56:08,213 INFO | INITIAL
2021-05-27 16:56:08,214 INFO | (50, 200)
2021-05-27 16:56:08,219 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,220 INFO | (50, 200, 512)
2021-05-27 16:56:08,221 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,221 INFO | (50, 200, 512)
2021-05-27 16:56:08,222 INFO | BERT LAYER
2021-05-27 16:56:08,223 INFO | (200, 512)
2021-05-27 16:56:08,224 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,224 INFO | (200, 512)
2021-05-27 16:56:08,225 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,225 INFO | (200, 512)
2021-05-27 16:56:08,233 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,234 INFO | (200, 512)
2021-05-27 16:56:08,234 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,235 INFO | (200, 512)
2021-05-27 16:56:08,243 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,244 INFO | (200, 512)
2021-05-27 16:56:08,244 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,244 INFO | (200, 512)
2021-05-27 16:56:08,251 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,251 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 101/574 [00:11<00:52,  9.03it/s]

2021-05-27 16:56:08,322 INFO | INITIAL
2021-05-27 16:56:08,322 INFO | (50, 200)
2021-05-27 16:56:08,330 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,331 INFO | (50, 200, 512)
2021-05-27 16:56:08,332 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,333 INFO | (50, 200, 512)
2021-05-27 16:56:08,334 INFO | BERT LAYER
2021-05-27 16:56:08,334 INFO | (200, 512)
2021-05-27 16:56:08,334 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,335 INFO | (200, 512)
2021-05-27 16:56:08,335 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,335 INFO | (200, 512)
2021-05-27 16:56:08,343 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,344 INFO | (200, 512)
2021-05-27 16:56:08,345 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,345 INFO | (200, 512)
2021-05-27 16:56:08,352 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,353 INFO | (200, 512)
2021-05-27 16:56:08,353 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,354 INFO | (200, 512)
2021-05-27 16:56:08,361 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,361 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 102/574 [00:11<00:51,  9.08it/s]

2021-05-27 16:56:08,430 INFO | INITIAL
2021-05-27 16:56:08,430 INFO | (50, 200)
2021-05-27 16:56:08,437 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,437 INFO | (50, 200, 512)
2021-05-27 16:56:08,439 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,440 INFO | (50, 200, 512)
2021-05-27 16:56:08,441 INFO | BERT LAYER
2021-05-27 16:56:08,442 INFO | (200, 512)
2021-05-27 16:56:08,442 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,443 INFO | (200, 512)
2021-05-27 16:56:08,444 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,444 INFO | (200, 512)
2021-05-27 16:56:08,451 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,451 INFO | (200, 512)
2021-05-27 16:56:08,451 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,452 INFO | (200, 512)
2021-05-27 16:56:08,457 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,458 INFO | (200, 512)
2021-05-27 16:56:08,459 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,459 INFO | (200, 512)
2021-05-27 16:56:08,465 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,466 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 103/574 [00:11<00:51,  9.16it/s]

2021-05-27 16:56:08,537 INFO | INITIAL
2021-05-27 16:56:08,539 INFO | (50, 200)
2021-05-27 16:56:08,546 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,546 INFO | (50, 200, 512)
2021-05-27 16:56:08,548 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,548 INFO | (50, 200, 512)
2021-05-27 16:56:08,549 INFO | BERT LAYER
2021-05-27 16:56:08,550 INFO | (200, 512)
2021-05-27 16:56:08,550 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,551 INFO | (200, 512)
2021-05-27 16:56:08,551 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,552 INFO | (200, 512)
2021-05-27 16:56:08,558 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,558 INFO | (200, 512)
2021-05-27 16:56:08,559 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,559 INFO | (200, 512)
2021-05-27 16:56:08,564 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,565 INFO | (200, 512)
2021-05-27 16:56:08,565 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,566 INFO | (200, 512)
2021-05-27 16:56:08,572 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,573 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 104/574 [00:11<00:51,  9.10it/s]

2021-05-27 16:56:08,648 INFO | INITIAL
2021-05-27 16:56:08,649 INFO | (50, 200)
2021-05-27 16:56:08,654 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,654 INFO | (50, 200, 512)
2021-05-27 16:56:08,656 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,656 INFO | (50, 200, 512)
2021-05-27 16:56:08,657 INFO | BERT LAYER
2021-05-27 16:56:08,657 INFO | (200, 512)
2021-05-27 16:56:08,658 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,658 INFO | (200, 512)
2021-05-27 16:56:08,659 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,659 INFO | (200, 512)
2021-05-27 16:56:08,666 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,666 INFO | (200, 512)
2021-05-27 16:56:08,667 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,667 INFO | (200, 512)
2021-05-27 16:56:08,672 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,673 INFO | (200, 512)
2021-05-27 16:56:08,673 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,674 INFO | (200, 512)
2021-05-27 16:56:08,679 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,680 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 105/574 [00:11<00:50,  9.31it/s]

2021-05-27 16:56:08,750 INFO | INITIAL
2021-05-27 16:56:08,750 INFO | (50, 200)
2021-05-27 16:56:08,755 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,756 INFO | (50, 200, 512)
2021-05-27 16:56:08,757 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,757 INFO | (50, 200, 512)
2021-05-27 16:56:08,758 INFO | BERT LAYER
2021-05-27 16:56:08,758 INFO | (200, 512)
2021-05-27 16:56:08,759 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,759 INFO | (200, 512)
2021-05-27 16:56:08,759 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,760 INFO | (200, 512)
2021-05-27 16:56:08,765 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,766 INFO | (200, 512)
2021-05-27 16:56:08,766 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,767 INFO | (200, 512)
2021-05-27 16:56:08,774 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,774 INFO | (200, 512)
2021-05-27 16:56:08,775 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,775 INFO | (200, 512)
2021-05-27 16:56:08,781 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,782 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  18%|█▊        | 105/574 [00:11<00:50,  9.31it/s]

2021-05-27 16:56:08,848 INFO | INITIAL
2021-05-27 16:56:08,849 INFO | (50, 200)
2021-05-27 16:56:08,854 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,854 INFO | (50, 200, 512)
2021-05-27 16:56:08,856 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,856 INFO | (50, 200, 512)
2021-05-27 16:56:08,857 INFO | BERT LAYER
2021-05-27 16:56:08,857 INFO | (200, 512)
2021-05-27 16:56:08,857 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,858 INFO | (200, 512)
2021-05-27 16:56:08,858 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,859 INFO | (200, 512)
2021-05-27 16:56:08,867 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,868 INFO | (200, 512)
2021-05-27 16:56:08,869 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,869 INFO | (200, 512)
2021-05-27 16:56:08,876 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,876 INFO | (200, 512)
2021-05-27 16:56:08,877 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,878 INFO | (200, 512)
2021-05-27 16:56:08,884 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,884 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  19%|█▊        | 107/574 [00:11<00:49,  9.51it/s]

2021-05-27 16:56:08,955 INFO | INITIAL
2021-05-27 16:56:08,956 INFO | (50, 200)
2021-05-27 16:56:08,961 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:08,962 INFO | (50, 200, 512)
2021-05-27 16:56:08,963 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:08,964 INFO | (50, 200, 512)
2021-05-27 16:56:08,964 INFO | BERT LAYER
2021-05-27 16:56:08,965 INFO | (200, 512)
2021-05-27 16:56:08,965 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,965 INFO | (200, 512)
2021-05-27 16:56:08,966 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,967 INFO | (200, 512)
2021-05-27 16:56:08,972 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,973 INFO | (200, 512)
2021-05-27 16:56:08,973 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,974 INFO | (200, 512)
2021-05-27 16:56:08,980 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,980 INFO | (200, 512)
2021-05-27 16:56:08,981 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:08,981 INFO | (200, 512)
2021-05-27 16:56:08,987 INFO | BERT LAYER LOOP
2021-05-27 16:56:08,987 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 108/574 [00:11<00:49,  9.50it/s]

2021-05-27 16:56:09,061 INFO | INITIAL
2021-05-27 16:56:09,062 INFO | (50, 200)
2021-05-27 16:56:09,067 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,067 INFO | (50, 200, 512)
2021-05-27 16:56:09,069 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,070 INFO | (50, 200, 512)
2021-05-27 16:56:09,070 INFO | BERT LAYER
2021-05-27 16:56:09,071 INFO | (200, 512)
2021-05-27 16:56:09,071 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,072 INFO | (200, 512)
2021-05-27 16:56:09,072 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,073 INFO | (200, 512)
2021-05-27 16:56:09,080 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,081 INFO | (200, 512)
2021-05-27 16:56:09,081 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,082 INFO | (200, 512)
2021-05-27 16:56:09,088 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,089 INFO | (200, 512)
2021-05-27 16:56:09,089 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,090 INFO | (200, 512)
2021-05-27 16:56:09,096 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,096 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 109/574 [00:11<00:49,  9.44it/s]

2021-05-27 16:56:09,168 INFO | INITIAL
2021-05-27 16:56:09,169 INFO | (50, 200)
2021-05-27 16:56:09,176 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,177 INFO | (50, 200, 512)
2021-05-27 16:56:09,178 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,179 INFO | (50, 200, 512)
2021-05-27 16:56:09,180 INFO | BERT LAYER
2021-05-27 16:56:09,180 INFO | (200, 512)
2021-05-27 16:56:09,180 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,181 INFO | (200, 512)
2021-05-27 16:56:09,181 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,182 INFO | (200, 512)
2021-05-27 16:56:09,187 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,187 INFO | (200, 512)
2021-05-27 16:56:09,188 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,188 INFO | (200, 512)
2021-05-27 16:56:09,195 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,195 INFO | (200, 512)
2021-05-27 16:56:09,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,196 INFO | (200, 512)
2021-05-27 16:56:09,202 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,202 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 110/574 [00:11<00:49,  9.35it/s]

2021-05-27 16:56:09,278 INFO | INITIAL
2021-05-27 16:56:09,279 INFO | (50, 200)
2021-05-27 16:56:09,284 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,284 INFO | (50, 200, 512)
2021-05-27 16:56:09,285 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,286 INFO | (50, 200, 512)
2021-05-27 16:56:09,287 INFO | BERT LAYER
2021-05-27 16:56:09,287 INFO | (200, 512)
2021-05-27 16:56:09,287 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,288 INFO | (200, 512)
2021-05-27 16:56:09,288 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,288 INFO | (200, 512)
2021-05-27 16:56:09,294 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,295 INFO | (200, 512)
2021-05-27 16:56:09,295 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,296 INFO | (200, 512)
2021-05-27 16:56:09,302 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,302 INFO | (200, 512)
2021-05-27 16:56:09,302 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,303 INFO | (200, 512)
2021-05-27 16:56:09,310 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,310 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  19%|█▉        | 111/574 [00:12<00:49,  9.41it/s]

2021-05-27 16:56:09,383 INFO | INITIAL
2021-05-27 16:56:09,383 INFO | (50, 200)
2021-05-27 16:56:09,388 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,389 INFO | (50, 200, 512)
2021-05-27 16:56:09,390 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,390 INFO | (50, 200, 512)
2021-05-27 16:56:09,391 INFO | BERT LAYER
2021-05-27 16:56:09,392 INFO | (200, 512)
2021-05-27 16:56:09,392 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,392 INFO | (200, 512)
2021-05-27 16:56:09,393 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,393 INFO | (200, 512)
2021-05-27 16:56:09,400 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,400 INFO | (200, 512)
2021-05-27 16:56:09,401 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,401 INFO | (200, 512)
2021-05-27 16:56:09,408 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,408 INFO | (200, 512)
2021-05-27 16:56:09,409 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,410 INFO | (200, 512)
2021-05-27 16:56:09,416 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,417 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 112/574 [00:12<00:49,  9.40it/s]

2021-05-27 16:56:09,489 INFO | INITIAL
2021-05-27 16:56:09,490 INFO | (50, 200)
2021-05-27 16:56:09,497 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,497 INFO | (50, 200, 512)
2021-05-27 16:56:09,498 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,498 INFO | (50, 200, 512)
2021-05-27 16:56:09,499 INFO | BERT LAYER
2021-05-27 16:56:09,500 INFO | (200, 512)
2021-05-27 16:56:09,500 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,500 INFO | (200, 512)
2021-05-27 16:56:09,501 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,501 INFO | (200, 512)
2021-05-27 16:56:09,508 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,509 INFO | (200, 512)
2021-05-27 16:56:09,509 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,510 INFO | (200, 512)
2021-05-27 16:56:09,517 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,518 INFO | (200, 512)
2021-05-27 16:56:09,518 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,519 INFO | (200, 512)
2021-05-27 16:56:09,526 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,526 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 113/574 [00:12<00:49,  9.29it/s]

2021-05-27 16:56:09,600 INFO | INITIAL
2021-05-27 16:56:09,601 INFO | (50, 200)
2021-05-27 16:56:09,606 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,606 INFO | (50, 200, 512)
2021-05-27 16:56:09,608 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,608 INFO | (50, 200, 512)
2021-05-27 16:56:09,609 INFO | BERT LAYER
2021-05-27 16:56:09,610 INFO | (200, 512)
2021-05-27 16:56:09,610 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,611 INFO | (200, 512)
2021-05-27 16:56:09,611 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,612 INFO | (200, 512)
2021-05-27 16:56:09,617 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,617 INFO | (200, 512)
2021-05-27 16:56:09,618 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,618 INFO | (200, 512)
2021-05-27 16:56:09,624 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,625 INFO | (200, 512)
2021-05-27 16:56:09,625 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,626 INFO | (200, 512)
2021-05-27 16:56:09,632 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,633 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|█▉        | 114/574 [00:12<00:49,  9.37it/s]

2021-05-27 16:56:09,705 INFO | INITIAL
2021-05-27 16:56:09,706 INFO | (50, 200)
2021-05-27 16:56:09,712 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,712 INFO | (50, 200, 512)
2021-05-27 16:56:09,714 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,714 INFO | (50, 200, 512)
2021-05-27 16:56:09,715 INFO | BERT LAYER
2021-05-27 16:56:09,716 INFO | (200, 512)
2021-05-27 16:56:09,716 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,717 INFO | (200, 512)
2021-05-27 16:56:09,717 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,717 INFO | (200, 512)
2021-05-27 16:56:09,723 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,724 INFO | (200, 512)
2021-05-27 16:56:09,724 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,725 INFO | (200, 512)
2021-05-27 16:56:09,731 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,732 INFO | (200, 512)
2021-05-27 16:56:09,732 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,732 INFO | (200, 512)
2021-05-27 16:56:09,737 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,738 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 115/574 [00:12<00:49,  9.26it/s]

2021-05-27 16:56:09,816 INFO | INITIAL
2021-05-27 16:56:09,816 INFO | (50, 200)
2021-05-27 16:56:09,821 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,821 INFO | (50, 200, 512)
2021-05-27 16:56:09,822 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,823 INFO | (50, 200, 512)
2021-05-27 16:56:09,824 INFO | BERT LAYER
2021-05-27 16:56:09,824 INFO | (200, 512)
2021-05-27 16:56:09,824 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,825 INFO | (200, 512)
2021-05-27 16:56:09,825 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,826 INFO | (200, 512)
2021-05-27 16:56:09,831 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,831 INFO | (200, 512)
2021-05-27 16:56:09,832 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,832 INFO | (200, 512)
2021-05-27 16:56:09,839 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,839 INFO | (200, 512)
2021-05-27 16:56:09,840 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,840 INFO | (200, 512)
2021-05-27 16:56:09,846 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,847 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 116/574 [00:12<00:48,  9.41it/s]

2021-05-27 16:56:09,918 INFO | INITIAL
2021-05-27 16:56:09,919 INFO | (50, 200)
2021-05-27 16:56:09,924 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:09,924 INFO | (50, 200, 512)
2021-05-27 16:56:09,926 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:09,926 INFO | (50, 200, 512)
2021-05-27 16:56:09,927 INFO | BERT LAYER
2021-05-27 16:56:09,927 INFO | (200, 512)
2021-05-27 16:56:09,928 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,928 INFO | (200, 512)
2021-05-27 16:56:09,928 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,929 INFO | (200, 512)
2021-05-27 16:56:09,934 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,935 INFO | (200, 512)
2021-05-27 16:56:09,935 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,935 INFO | (200, 512)
2021-05-27 16:56:09,941 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,942 INFO | (200, 512)
2021-05-27 16:56:09,942 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:09,943 INFO | (200, 512)
2021-05-27 16:56:09,948 INFO | BERT LAYER LOOP
2021-05-27 16:56:09,949 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  20%|██        | 116/574 [00:12<00:48,  9.41it/s]

2021-05-27 16:56:10,017 INFO | INITIAL
2021-05-27 16:56:10,018 INFO | (50, 200)
2021-05-27 16:56:10,023 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,023 INFO | (50, 200, 512)
2021-05-27 16:56:10,025 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,025 INFO | (50, 200, 512)
2021-05-27 16:56:10,026 INFO | BERT LAYER
2021-05-27 16:56:10,026 INFO | (200, 512)
2021-05-27 16:56:10,027 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,027 INFO | (200, 512)
2021-05-27 16:56:10,028 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,028 INFO | (200, 512)
2021-05-27 16:56:10,034 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,034 INFO | (200, 512)
2021-05-27 16:56:10,035 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,036 INFO | (200, 512)
2021-05-27 16:56:10,042 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,043 INFO | (200, 512)
2021-05-27 16:56:10,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,044 INFO | (200, 512)
2021-05-27 16:56:10,049 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,050 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 118/574 [00:12<00:48,  9.37it/s]

2021-05-27 16:56:10,132 INFO | INITIAL
2021-05-27 16:56:10,133 INFO | (50, 200)
2021-05-27 16:56:10,138 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,139 INFO | (50, 200, 512)
2021-05-27 16:56:10,140 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,141 INFO | (50, 200, 512)
2021-05-27 16:56:10,142 INFO | BERT LAYER
2021-05-27 16:56:10,142 INFO | (200, 512)
2021-05-27 16:56:10,143 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,143 INFO | (200, 512)
2021-05-27 16:56:10,143 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,144 INFO | (200, 512)
2021-05-27 16:56:10,151 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,151 INFO | (200, 512)
2021-05-27 16:56:10,152 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,152 INFO | (200, 512)
2021-05-27 16:56:10,158 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,159 INFO | (200, 512)
2021-05-27 16:56:10,159 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,160 INFO | (200, 512)
2021-05-27 16:56:10,166 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,166 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 119/574 [00:12<00:48,  9.36it/s]

2021-05-27 16:56:10,240 INFO | INITIAL
2021-05-27 16:56:10,240 INFO | (50, 200)
2021-05-27 16:56:10,247 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,247 INFO | (50, 200, 512)
2021-05-27 16:56:10,249 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,249 INFO | (50, 200, 512)
2021-05-27 16:56:10,250 INFO | BERT LAYER
2021-05-27 16:56:10,251 INFO | (200, 512)
2021-05-27 16:56:10,251 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,251 INFO | (200, 512)
2021-05-27 16:56:10,252 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,252 INFO | (200, 512)
2021-05-27 16:56:10,260 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,260 INFO | (200, 512)
2021-05-27 16:56:10,261 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,261 INFO | (200, 512)
2021-05-27 16:56:10,267 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,267 INFO | (200, 512)
2021-05-27 16:56:10,268 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,268 INFO | (200, 512)
2021-05-27 16:56:10,274 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,275 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 120/574 [00:13<00:48,  9.32it/s]

2021-05-27 16:56:10,348 INFO | INITIAL
2021-05-27 16:56:10,348 INFO | (50, 200)
2021-05-27 16:56:10,354 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,355 INFO | (50, 200, 512)
2021-05-27 16:56:10,356 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,356 INFO | (50, 200, 512)
2021-05-27 16:56:10,357 INFO | BERT LAYER
2021-05-27 16:56:10,358 INFO | (200, 512)
2021-05-27 16:56:10,358 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,359 INFO | (200, 512)
2021-05-27 16:56:10,359 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,360 INFO | (200, 512)
2021-05-27 16:56:10,367 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,367 INFO | (200, 512)
2021-05-27 16:56:10,368 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,368 INFO | (200, 512)
2021-05-27 16:56:10,375 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,377 INFO | (200, 512)
2021-05-27 16:56:10,377 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,378 INFO | (200, 512)
2021-05-27 16:56:10,383 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,384 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██        | 121/574 [00:13<00:48,  9.39it/s]

2021-05-27 16:56:10,453 INFO | INITIAL
2021-05-27 16:56:10,453 INFO | (50, 200)
2021-05-27 16:56:10,459 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,460 INFO | (50, 200, 512)
2021-05-27 16:56:10,462 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,463 INFO | (50, 200, 512)
2021-05-27 16:56:10,464 INFO | BERT LAYER
2021-05-27 16:56:10,464 INFO | (200, 512)
2021-05-27 16:56:10,465 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,465 INFO | (200, 512)
2021-05-27 16:56:10,466 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,466 INFO | (200, 512)
2021-05-27 16:56:10,474 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,474 INFO | (200, 512)
2021-05-27 16:56:10,475 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,475 INFO | (200, 512)
2021-05-27 16:56:10,481 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,481 INFO | (200, 512)
2021-05-27 16:56:10,482 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,482 INFO | (200, 512)
2021-05-27 16:56:10,487 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,488 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██▏       | 122/574 [00:13<00:47,  9.45it/s]

2021-05-27 16:56:10,557 INFO | INITIAL
2021-05-27 16:56:10,557 INFO | (50, 200)
2021-05-27 16:56:10,564 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,565 INFO | (50, 200, 512)
2021-05-27 16:56:10,566 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,566 INFO | (50, 200, 512)
2021-05-27 16:56:10,567 INFO | BERT LAYER
2021-05-27 16:56:10,568 INFO | (200, 512)
2021-05-27 16:56:10,568 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,568 INFO | (200, 512)
2021-05-27 16:56:10,569 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,570 INFO | (200, 512)
2021-05-27 16:56:10,576 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,577 INFO | (200, 512)
2021-05-27 16:56:10,577 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,578 INFO | (200, 512)
2021-05-27 16:56:10,583 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,583 INFO | (200, 512)
2021-05-27 16:56:10,584 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,584 INFO | (200, 512)
2021-05-27 16:56:10,589 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,589 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  21%|██▏       | 123/574 [00:13<00:48,  9.37it/s]

2021-05-27 16:56:10,666 INFO | INITIAL
2021-05-27 16:56:10,666 INFO | (50, 200)
2021-05-27 16:56:10,671 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,672 INFO | (50, 200, 512)
2021-05-27 16:56:10,673 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,674 INFO | (50, 200, 512)
2021-05-27 16:56:10,675 INFO | BERT LAYER
2021-05-27 16:56:10,675 INFO | (200, 512)
2021-05-27 16:56:10,676 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,676 INFO | (200, 512)
2021-05-27 16:56:10,676 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,677 INFO | (200, 512)
2021-05-27 16:56:10,684 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,684 INFO | (200, 512)
2021-05-27 16:56:10,685 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,685 INFO | (200, 512)
2021-05-27 16:56:10,692 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,693 INFO | (200, 512)
2021-05-27 16:56:10,693 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,694 INFO | (200, 512)
2021-05-27 16:56:10,699 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,700 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 124/574 [00:13<00:47,  9.42it/s]

2021-05-27 16:56:10,771 INFO | INITIAL
2021-05-27 16:56:10,771 INFO | (50, 200)
2021-05-27 16:56:10,777 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,778 INFO | (50, 200, 512)
2021-05-27 16:56:10,779 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,780 INFO | (50, 200, 512)
2021-05-27 16:56:10,781 INFO | BERT LAYER
2021-05-27 16:56:10,781 INFO | (200, 512)
2021-05-27 16:56:10,782 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,782 INFO | (200, 512)
2021-05-27 16:56:10,782 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,783 INFO | (200, 512)
2021-05-27 16:56:10,789 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,789 INFO | (200, 512)
2021-05-27 16:56:10,790 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,791 INFO | (200, 512)
2021-05-27 16:56:10,797 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,798 INFO | (200, 512)
2021-05-27 16:56:10,798 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,798 INFO | (200, 512)
2021-05-27 16:56:10,805 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,805 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 125/574 [00:13<00:47,  9.51it/s]

2021-05-27 16:56:10,873 INFO | INITIAL
2021-05-27 16:56:10,873 INFO | (50, 200)
2021-05-27 16:56:10,879 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,879 INFO | (50, 200, 512)
2021-05-27 16:56:10,880 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,881 INFO | (50, 200, 512)
2021-05-27 16:56:10,881 INFO | BERT LAYER
2021-05-27 16:56:10,882 INFO | (200, 512)
2021-05-27 16:56:10,882 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,882 INFO | (200, 512)
2021-05-27 16:56:10,883 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,883 INFO | (200, 512)
2021-05-27 16:56:10,888 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,888 INFO | (200, 512)
2021-05-27 16:56:10,889 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,889 INFO | (200, 512)
2021-05-27 16:56:10,894 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,895 INFO | (200, 512)
2021-05-27 16:56:10,895 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,895 INFO | (200, 512)
2021-05-27 16:56:10,901 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,902 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 125/574 [00:13<00:47,  9.51it/s]

2021-05-27 16:56:10,969 INFO | INITIAL
2021-05-27 16:56:10,969 INFO | (50, 200)
2021-05-27 16:56:10,976 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:10,977 INFO | (50, 200, 512)
2021-05-27 16:56:10,978 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:10,978 INFO | (50, 200, 512)
2021-05-27 16:56:10,979 INFO | BERT LAYER
2021-05-27 16:56:10,980 INFO | (200, 512)
2021-05-27 16:56:10,980 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,981 INFO | (200, 512)
2021-05-27 16:56:10,981 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,982 INFO | (200, 512)
2021-05-27 16:56:10,988 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,989 INFO | (200, 512)
2021-05-27 16:56:10,989 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,990 INFO | (200, 512)
2021-05-27 16:56:10,996 INFO | BERT LAYER LOOP
2021-05-27 16:56:10,997 INFO | (200, 512)
2021-05-27 16:56:10,997 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:10,998 INFO | (200, 512)
2021-05-27 16:56:11,004 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,005 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 127/574 [00:13<00:46,  9.55it/s]

2021-05-27 16:56:11,082 INFO | INITIAL
2021-05-27 16:56:11,082 INFO | (50, 200)
2021-05-27 16:56:11,087 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,087 INFO | (50, 200, 512)
2021-05-27 16:56:11,089 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,089 INFO | (50, 200, 512)
2021-05-27 16:56:11,090 INFO | BERT LAYER
2021-05-27 16:56:11,090 INFO | (200, 512)
2021-05-27 16:56:11,091 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,091 INFO | (200, 512)
2021-05-27 16:56:11,092 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,092 INFO | (200, 512)
2021-05-27 16:56:11,098 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,098 INFO | (200, 512)
2021-05-27 16:56:11,098 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,099 INFO | (200, 512)
2021-05-27 16:56:11,105 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,106 INFO | (200, 512)
2021-05-27 16:56:11,106 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,107 INFO | (200, 512)
2021-05-27 16:56:11,115 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,115 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 128/574 [00:13<00:46,  9.62it/s]

2021-05-27 16:56:11,183 INFO | INITIAL
2021-05-27 16:56:11,184 INFO | (50, 200)
2021-05-27 16:56:11,189 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,190 INFO | (50, 200, 512)
2021-05-27 16:56:11,191 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,192 INFO | (50, 200, 512)
2021-05-27 16:56:11,193 INFO | BERT LAYER
2021-05-27 16:56:11,194 INFO | (200, 512)
2021-05-27 16:56:11,195 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,195 INFO | (200, 512)
2021-05-27 16:56:11,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,196 INFO | (200, 512)
2021-05-27 16:56:11,203 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,205 INFO | (200, 512)
2021-05-27 16:56:11,206 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,206 INFO | (200, 512)
2021-05-27 16:56:11,213 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,213 INFO | (200, 512)
2021-05-27 16:56:11,214 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,214 INFO | (200, 512)
2021-05-27 16:56:11,220 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,221 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  22%|██▏       | 129/574 [00:13<00:46,  9.60it/s]

2021-05-27 16:56:11,288 INFO | INITIAL
2021-05-27 16:56:11,289 INFO | (50, 200)
2021-05-27 16:56:11,297 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,297 INFO | (50, 200, 512)
2021-05-27 16:56:11,299 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,299 INFO | (50, 200, 512)
2021-05-27 16:56:11,300 INFO | BERT LAYER
2021-05-27 16:56:11,301 INFO | (200, 512)
2021-05-27 16:56:11,301 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,301 INFO | (200, 512)
2021-05-27 16:56:11,302 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,302 INFO | (200, 512)
2021-05-27 16:56:11,310 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,310 INFO | (200, 512)
2021-05-27 16:56:11,311 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,311 INFO | (200, 512)
2021-05-27 16:56:11,317 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,318 INFO | (200, 512)
2021-05-27 16:56:11,318 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,318 INFO | (200, 512)
2021-05-27 16:56:11,325 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,325 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 130/574 [00:14<00:46,  9.48it/s]

2021-05-27 16:56:11,397 INFO | INITIAL
2021-05-27 16:56:11,397 INFO | (50, 200)
2021-05-27 16:56:11,402 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,402 INFO | (50, 200, 512)
2021-05-27 16:56:11,403 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,404 INFO | (50, 200, 512)
2021-05-27 16:56:11,404 INFO | BERT LAYER
2021-05-27 16:56:11,405 INFO | (200, 512)
2021-05-27 16:56:11,405 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,406 INFO | (200, 512)
2021-05-27 16:56:11,406 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,406 INFO | (200, 512)
2021-05-27 16:56:11,414 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,414 INFO | (200, 512)
2021-05-27 16:56:11,414 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,415 INFO | (200, 512)
2021-05-27 16:56:11,420 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,420 INFO | (200, 512)
2021-05-27 16:56:11,420 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,421 INFO | (200, 512)
2021-05-27 16:56:11,428 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,428 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 131/574 [00:14<00:47,  9.34it/s]

2021-05-27 16:56:11,510 INFO | INITIAL
2021-05-27 16:56:11,510 INFO | (50, 200)
2021-05-27 16:56:11,516 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,517 INFO | (50, 200, 512)
2021-05-27 16:56:11,518 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,519 INFO | (50, 200, 512)
2021-05-27 16:56:11,520 INFO | BERT LAYER
2021-05-27 16:56:11,520 INFO | (200, 512)
2021-05-27 16:56:11,521 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,521 INFO | (200, 512)
2021-05-27 16:56:11,521 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,522 INFO | (200, 512)
2021-05-27 16:56:11,528 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,528 INFO | (200, 512)
2021-05-27 16:56:11,528 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,529 INFO | (200, 512)
2021-05-27 16:56:11,534 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,535 INFO | (200, 512)
2021-05-27 16:56:11,535 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,535 INFO | (200, 512)
2021-05-27 16:56:11,542 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,543 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 132/574 [00:14<00:47,  9.38it/s]

2021-05-27 16:56:11,613 INFO | INITIAL
2021-05-27 16:56:11,614 INFO | (50, 200)
2021-05-27 16:56:11,619 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,620 INFO | (50, 200, 512)
2021-05-27 16:56:11,621 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,622 INFO | (50, 200, 512)
2021-05-27 16:56:11,622 INFO | BERT LAYER
2021-05-27 16:56:11,623 INFO | (200, 512)
2021-05-27 16:56:11,623 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,624 INFO | (200, 512)
2021-05-27 16:56:11,625 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,625 INFO | (200, 512)
2021-05-27 16:56:11,633 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,633 INFO | (200, 512)
2021-05-27 16:56:11,634 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,634 INFO | (200, 512)
2021-05-27 16:56:11,642 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,643 INFO | (200, 512)
2021-05-27 16:56:11,644 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,644 INFO | (200, 512)
2021-05-27 16:56:11,651 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,651 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 133/574 [00:14<00:47,  9.33it/s]

2021-05-27 16:56:11,723 INFO | INITIAL
2021-05-27 16:56:11,723 INFO | (50, 200)
2021-05-27 16:56:11,730 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,731 INFO | (50, 200, 512)
2021-05-27 16:56:11,732 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,733 INFO | (50, 200, 512)
2021-05-27 16:56:11,734 INFO | BERT LAYER
2021-05-27 16:56:11,734 INFO | (200, 512)
2021-05-27 16:56:11,735 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,735 INFO | (200, 512)
2021-05-27 16:56:11,736 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,736 INFO | (200, 512)
2021-05-27 16:56:11,743 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,743 INFO | (200, 512)
2021-05-27 16:56:11,744 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,744 INFO | (200, 512)
2021-05-27 16:56:11,749 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,750 INFO | (200, 512)
2021-05-27 16:56:11,750 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,751 INFO | (200, 512)
2021-05-27 16:56:11,757 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,758 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  23%|██▎       | 134/574 [00:14<00:47,  9.36it/s]

2021-05-27 16:56:11,828 INFO | INITIAL
2021-05-27 16:56:11,829 INFO | (50, 200)
2021-05-27 16:56:11,834 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,834 INFO | (50, 200, 512)
2021-05-27 16:56:11,835 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,836 INFO | (50, 200, 512)
2021-05-27 16:56:11,837 INFO | BERT LAYER
2021-05-27 16:56:11,837 INFO | (200, 512)
2021-05-27 16:56:11,837 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,838 INFO | (200, 512)
2021-05-27 16:56:11,838 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,838 INFO | (200, 512)
2021-05-27 16:56:11,845 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,846 INFO | (200, 512)
2021-05-27 16:56:11,846 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,847 INFO | (200, 512)
2021-05-27 16:56:11,852 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,852 INFO | (200, 512)
2021-05-27 16:56:11,852 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,853 INFO | (200, 512)
2021-05-27 16:56:11,858 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,858 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▎       | 135/574 [00:14<00:46,  9.44it/s]

2021-05-27 16:56:11,932 INFO | INITIAL
2021-05-27 16:56:11,932 INFO | (50, 200)
2021-05-27 16:56:11,937 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:11,938 INFO | (50, 200, 512)
2021-05-27 16:56:11,941 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:11,941 INFO | (50, 200, 512)
2021-05-27 16:56:11,942 INFO | BERT LAYER
2021-05-27 16:56:11,942 INFO | (200, 512)
2021-05-27 16:56:11,943 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,943 INFO | (200, 512)
2021-05-27 16:56:11,944 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,944 INFO | (200, 512)
2021-05-27 16:56:11,950 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,950 INFO | (200, 512)
2021-05-27 16:56:11,951 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,951 INFO | (200, 512)
2021-05-27 16:56:11,956 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,956 INFO | (200, 512)
2021-05-27 16:56:11,957 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:11,958 INFO | (200, 512)
2021-05-27 16:56:11,962 INFO | BERT LAYER LOOP
2021-05-27 16:56:11,963 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▎       | 136/574 [00:14<00:45,  9.58it/s]

2021-05-27 16:56:12,033 INFO | INITIAL
2021-05-27 16:56:12,033 INFO | (50, 200)
2021-05-27 16:56:12,039 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,039 INFO | (50, 200, 512)
2021-05-27 16:56:12,041 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,041 INFO | (50, 200, 512)
2021-05-27 16:56:12,043 INFO | BERT LAYER
2021-05-27 16:56:12,043 INFO | (200, 512)
2021-05-27 16:56:12,044 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,044 INFO | (200, 512)
2021-05-27 16:56:12,045 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,045 INFO | (200, 512)
2021-05-27 16:56:12,051 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,052 INFO | (200, 512)
2021-05-27 16:56:12,052 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,052 INFO | (200, 512)
2021-05-27 16:56:12,058 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,059 INFO | (200, 512)
2021-05-27 16:56:12,059 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,060 INFO | (200, 512)
2021-05-27 16:56:12,066 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,066 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 137/574 [00:14<00:45,  9.60it/s]

2021-05-27 16:56:12,136 INFO | INITIAL
2021-05-27 16:56:12,137 INFO | (50, 200)
2021-05-27 16:56:12,142 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,142 INFO | (50, 200, 512)
2021-05-27 16:56:12,144 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,145 INFO | (50, 200, 512)
2021-05-27 16:56:12,145 INFO | BERT LAYER
2021-05-27 16:56:12,146 INFO | (200, 512)
2021-05-27 16:56:12,146 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,146 INFO | (200, 512)
2021-05-27 16:56:12,147 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,147 INFO | (200, 512)
2021-05-27 16:56:12,154 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,155 INFO | (200, 512)
2021-05-27 16:56:12,155 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,155 INFO | (200, 512)
2021-05-27 16:56:12,162 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,163 INFO | (200, 512)
2021-05-27 16:56:12,164 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,164 INFO | (200, 512)
2021-05-27 16:56:12,171 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,171 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 138/574 [00:14<00:45,  9.48it/s]

2021-05-27 16:56:12,245 INFO | INITIAL
2021-05-27 16:56:12,246 INFO | (50, 200)
2021-05-27 16:56:12,252 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,253 INFO | (50, 200, 512)
2021-05-27 16:56:12,254 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,254 INFO | (50, 200, 512)
2021-05-27 16:56:12,255 INFO | BERT LAYER
2021-05-27 16:56:12,255 INFO | (200, 512)
2021-05-27 16:56:12,256 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,257 INFO | (200, 512)
2021-05-27 16:56:12,257 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,258 INFO | (200, 512)
2021-05-27 16:56:12,265 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,265 INFO | (200, 512)
2021-05-27 16:56:12,265 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,266 INFO | (200, 512)
2021-05-27 16:56:12,272 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,272 INFO | (200, 512)
2021-05-27 16:56:12,273 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,274 INFO | (200, 512)
2021-05-27 16:56:12,280 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,281 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 139/574 [00:15<00:46,  9.41it/s]

2021-05-27 16:56:12,353 INFO | INITIAL
2021-05-27 16:56:12,353 INFO | (50, 200)
2021-05-27 16:56:12,359 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,359 INFO | (50, 200, 512)
2021-05-27 16:56:12,360 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,361 INFO | (50, 200, 512)
2021-05-27 16:56:12,362 INFO | BERT LAYER
2021-05-27 16:56:12,362 INFO | (200, 512)
2021-05-27 16:56:12,362 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,363 INFO | (200, 512)
2021-05-27 16:56:12,363 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,364 INFO | (200, 512)
2021-05-27 16:56:12,370 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,370 INFO | (200, 512)
2021-05-27 16:56:12,371 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,371 INFO | (200, 512)
2021-05-27 16:56:12,377 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,378 INFO | (200, 512)
2021-05-27 16:56:12,378 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,378 INFO | (200, 512)
2021-05-27 16:56:12,384 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,384 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 140/574 [00:15<00:45,  9.51it/s]

2021-05-27 16:56:12,456 INFO | INITIAL
2021-05-27 16:56:12,456 INFO | (50, 200)
2021-05-27 16:56:12,461 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,462 INFO | (50, 200, 512)
2021-05-27 16:56:12,463 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,463 INFO | (50, 200, 512)
2021-05-27 16:56:12,464 INFO | BERT LAYER
2021-05-27 16:56:12,464 INFO | (200, 512)
2021-05-27 16:56:12,464 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,465 INFO | (200, 512)
2021-05-27 16:56:12,465 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,466 INFO | (200, 512)
2021-05-27 16:56:12,471 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,472 INFO | (200, 512)
2021-05-27 16:56:12,472 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,472 INFO | (200, 512)
2021-05-27 16:56:12,478 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,479 INFO | (200, 512)
2021-05-27 16:56:12,479 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,479 INFO | (200, 512)
2021-05-27 16:56:12,484 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,485 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  24%|██▍       | 140/574 [00:15<00:45,  9.51it/s]

2021-05-27 16:56:12,552 INFO | INITIAL
2021-05-27 16:56:12,553 INFO | (50, 200)
2021-05-27 16:56:12,558 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,559 INFO | (50, 200, 512)
2021-05-27 16:56:12,560 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,561 INFO | (50, 200, 512)
2021-05-27 16:56:12,562 INFO | BERT LAYER
2021-05-27 16:56:12,562 INFO | (200, 512)
2021-05-27 16:56:12,563 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,563 INFO | (200, 512)
2021-05-27 16:56:12,563 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,564 INFO | (200, 512)
2021-05-27 16:56:12,569 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,570 INFO | (200, 512)
2021-05-27 16:56:12,570 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,571 INFO | (200, 512)
2021-05-27 16:56:12,577 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,578 INFO | (200, 512)
2021-05-27 16:56:12,578 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,578 INFO | (200, 512)
2021-05-27 16:56:12,585 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,585 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  25%|██▍       | 142/574 [00:15<00:44,  9.68it/s]

2021-05-27 16:56:12,658 INFO | INITIAL
2021-05-27 16:56:12,658 INFO | (50, 200)
2021-05-27 16:56:12,665 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,665 INFO | (50, 200, 512)
2021-05-27 16:56:12,667 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,667 INFO | (50, 200, 512)
2021-05-27 16:56:12,668 INFO | BERT LAYER
2021-05-27 16:56:12,669 INFO | (200, 512)
2021-05-27 16:56:12,669 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,669 INFO | (200, 512)
2021-05-27 16:56:12,670 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,670 INFO | (200, 512)
2021-05-27 16:56:12,677 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,678 INFO | (200, 512)
2021-05-27 16:56:12,678 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,679 INFO | (200, 512)
2021-05-27 16:56:12,686 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,686 INFO | (200, 512)
2021-05-27 16:56:12,686 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,687 INFO | (200, 512)
2021-05-27 16:56:12,694 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,694 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  25%|██▍       | 143/574 [00:15<00:44,  9.65it/s]

2021-05-27 16:56:12,763 INFO | INITIAL
2021-05-27 16:56:12,763 INFO | (50, 200)
2021-05-27 16:56:12,768 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,769 INFO | (50, 200, 512)
2021-05-27 16:56:12,771 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,771 INFO | (50, 200, 512)
2021-05-27 16:56:12,772 INFO | BERT LAYER
2021-05-27 16:56:12,773 INFO | (200, 512)
2021-05-27 16:56:12,773 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,774 INFO | (200, 512)
2021-05-27 16:56:12,775 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,776 INFO | (200, 512)
2021-05-27 16:56:12,782 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,782 INFO | (200, 512)
2021-05-27 16:56:12,783 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,783 INFO | (200, 512)
2021-05-27 16:56:12,789 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,790 INFO | (200, 512)
2021-05-27 16:56:12,790 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,791 INFO | (200, 512)
2021-05-27 16:56:12,796 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,797 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 144/574 [00:15<00:44,  9.67it/s]

2021-05-27 16:56:12,865 INFO | INITIAL
2021-05-27 16:56:12,866 INFO | (50, 200)
2021-05-27 16:56:12,870 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,871 INFO | (50, 200, 512)
2021-05-27 16:56:12,872 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,873 INFO | (50, 200, 512)
2021-05-27 16:56:12,874 INFO | BERT LAYER
2021-05-27 16:56:12,874 INFO | (200, 512)
2021-05-27 16:56:12,875 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,875 INFO | (200, 512)
2021-05-27 16:56:12,876 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,877 INFO | (200, 512)
2021-05-27 16:56:12,882 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,882 INFO | (200, 512)
2021-05-27 16:56:12,882 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,883 INFO | (200, 512)
2021-05-27 16:56:12,887 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,888 INFO | (200, 512)
2021-05-27 16:56:12,888 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,889 INFO | (200, 512)
2021-05-27 16:56:12,894 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,895 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 145/574 [00:15<00:44,  9.64it/s]

2021-05-27 16:56:12,970 INFO | INITIAL
2021-05-27 16:56:12,971 INFO | (50, 200)
2021-05-27 16:56:12,978 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:12,978 INFO | (50, 200, 512)
2021-05-27 16:56:12,980 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:12,980 INFO | (50, 200, 512)
2021-05-27 16:56:12,981 INFO | BERT LAYER
2021-05-27 16:56:12,982 INFO | (200, 512)
2021-05-27 16:56:12,982 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,982 INFO | (200, 512)
2021-05-27 16:56:12,983 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,983 INFO | (200, 512)
2021-05-27 16:56:12,989 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,990 INFO | (200, 512)
2021-05-27 16:56:12,991 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:12,991 INFO | (200, 512)
2021-05-27 16:56:12,998 INFO | BERT LAYER LOOP
2021-05-27 16:56:12,999 INFO | (200, 512)
2021-05-27 16:56:13,000 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,000 INFO | (200, 512)
2021-05-27 16:56:13,007 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,008 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  25%|██▌       | 146/574 [00:15<00:44,  9.59it/s]

2021-05-27 16:56:13,076 INFO | INITIAL
2021-05-27 16:56:13,077 INFO | (50, 200)
2021-05-27 16:56:13,083 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,083 INFO | (50, 200, 512)
2021-05-27 16:56:13,085 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,085 INFO | (50, 200, 512)
2021-05-27 16:56:13,086 INFO | BERT LAYER
2021-05-27 16:56:13,086 INFO | (200, 512)
2021-05-27 16:56:13,086 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,087 INFO | (200, 512)
2021-05-27 16:56:13,087 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,087 INFO | (200, 512)
2021-05-27 16:56:13,094 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,094 INFO | (200, 512)
2021-05-27 16:56:13,095 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,095 INFO | (200, 512)
2021-05-27 16:56:13,102 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,102 INFO | (200, 512)
2021-05-27 16:56:13,103 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,103 INFO | (200, 512)
2021-05-27 16:56:13,110 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,112 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 147/574 [00:15<00:45,  9.44it/s]

2021-05-27 16:56:13,186 INFO | INITIAL
2021-05-27 16:56:13,186 INFO | (50, 200)
2021-05-27 16:56:13,191 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,192 INFO | (50, 200, 512)
2021-05-27 16:56:13,193 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,194 INFO | (50, 200, 512)
2021-05-27 16:56:13,195 INFO | BERT LAYER
2021-05-27 16:56:13,195 INFO | (200, 512)
2021-05-27 16:56:13,196 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,196 INFO | (200, 512)
2021-05-27 16:56:13,196 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,197 INFO | (200, 512)
2021-05-27 16:56:13,203 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,203 INFO | (200, 512)
2021-05-27 16:56:13,204 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,204 INFO | (200, 512)
2021-05-27 16:56:13,210 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,211 INFO | (200, 512)
2021-05-27 16:56:13,211 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,212 INFO | (200, 512)
2021-05-27 16:56:13,218 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,218 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 148/574 [00:15<00:44,  9.54it/s]

2021-05-27 16:56:13,288 INFO | INITIAL
2021-05-27 16:56:13,288 INFO | (50, 200)
2021-05-27 16:56:13,294 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,294 INFO | (50, 200, 512)
2021-05-27 16:56:13,296 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,296 INFO | (50, 200, 512)
2021-05-27 16:56:13,297 INFO | BERT LAYER
2021-05-27 16:56:13,297 INFO | (200, 512)
2021-05-27 16:56:13,298 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,298 INFO | (200, 512)
2021-05-27 16:56:13,298 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,299 INFO | (200, 512)
2021-05-27 16:56:13,305 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,306 INFO | (200, 512)
2021-05-27 16:56:13,307 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,307 INFO | (200, 512)
2021-05-27 16:56:13,315 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,315 INFO | (200, 512)
2021-05-27 16:56:13,315 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,316 INFO | (200, 512)
2021-05-27 16:56:13,322 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,323 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 149/574 [00:16<00:44,  9.47it/s]

2021-05-27 16:56:13,395 INFO | INITIAL
2021-05-27 16:56:13,396 INFO | (50, 200)
2021-05-27 16:56:13,401 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,401 INFO | (50, 200, 512)
2021-05-27 16:56:13,403 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,403 INFO | (50, 200, 512)
2021-05-27 16:56:13,404 INFO | BERT LAYER
2021-05-27 16:56:13,405 INFO | (200, 512)
2021-05-27 16:56:13,406 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,406 INFO | (200, 512)
2021-05-27 16:56:13,407 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,407 INFO | (200, 512)
2021-05-27 16:56:13,413 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,413 INFO | (200, 512)
2021-05-27 16:56:13,413 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,414 INFO | (200, 512)
2021-05-27 16:56:13,419 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,419 INFO | (200, 512)
2021-05-27 16:56:13,420 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,420 INFO | (200, 512)
2021-05-27 16:56:13,425 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,426 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▌       | 150/574 [00:16<00:44,  9.53it/s]

2021-05-27 16:56:13,498 INFO | INITIAL
2021-05-27 16:56:13,499 INFO | (50, 200)
2021-05-27 16:56:13,504 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,504 INFO | (50, 200, 512)
2021-05-27 16:56:13,506 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,506 INFO | (50, 200, 512)
2021-05-27 16:56:13,507 INFO | BERT LAYER
2021-05-27 16:56:13,508 INFO | (200, 512)
2021-05-27 16:56:13,509 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,509 INFO | (200, 512)
2021-05-27 16:56:13,510 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,510 INFO | (200, 512)
2021-05-27 16:56:13,516 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,517 INFO | (200, 512)
2021-05-27 16:56:13,517 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,518 INFO | (200, 512)
2021-05-27 16:56:13,524 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,525 INFO | (200, 512)
2021-05-27 16:56:13,526 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,526 INFO | (200, 512)
2021-05-27 16:56:13,532 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,533 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▋       | 151/574 [00:16<00:44,  9.57it/s]

2021-05-27 16:56:13,602 INFO | INITIAL
2021-05-27 16:56:13,602 INFO | (50, 200)
2021-05-27 16:56:13,608 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,608 INFO | (50, 200, 512)
2021-05-27 16:56:13,610 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,611 INFO | (50, 200, 512)
2021-05-27 16:56:13,612 INFO | BERT LAYER
2021-05-27 16:56:13,612 INFO | (200, 512)
2021-05-27 16:56:13,612 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,613 INFO | (200, 512)
2021-05-27 16:56:13,613 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,614 INFO | (200, 512)
2021-05-27 16:56:13,621 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,622 INFO | (200, 512)
2021-05-27 16:56:13,623 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,623 INFO | (200, 512)
2021-05-27 16:56:13,630 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,630 INFO | (200, 512)
2021-05-27 16:56:13,631 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,632 INFO | (200, 512)
2021-05-27 16:56:13,638 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,638 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  26%|██▋       | 152/574 [00:16<00:43,  9.62it/s]

2021-05-27 16:56:13,705 INFO | INITIAL
2021-05-27 16:56:13,706 INFO | (50, 200)
2021-05-27 16:56:13,714 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,715 INFO | (50, 200, 512)
2021-05-27 16:56:13,716 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,717 INFO | (50, 200, 512)
2021-05-27 16:56:13,718 INFO | BERT LAYER
2021-05-27 16:56:13,718 INFO | (200, 512)
2021-05-27 16:56:13,719 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,719 INFO | (200, 512)
2021-05-27 16:56:13,719 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,720 INFO | (200, 512)
2021-05-27 16:56:13,726 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,727 INFO | (200, 512)
2021-05-27 16:56:13,728 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,728 INFO | (200, 512)
2021-05-27 16:56:13,734 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,734 INFO | (200, 512)
2021-05-27 16:56:13,735 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,736 INFO | (200, 512)
2021-05-27 16:56:13,741 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,742 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 153/574 [00:16<00:44,  9.38it/s]

2021-05-27 16:56:13,818 INFO | INITIAL
2021-05-27 16:56:13,818 INFO | (50, 200)
2021-05-27 16:56:13,824 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,825 INFO | (50, 200, 512)
2021-05-27 16:56:13,826 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,827 INFO | (50, 200, 512)
2021-05-27 16:56:13,828 INFO | BERT LAYER
2021-05-27 16:56:13,829 INFO | (200, 512)
2021-05-27 16:56:13,830 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,830 INFO | (200, 512)
2021-05-27 16:56:13,831 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,831 INFO | (200, 512)
2021-05-27 16:56:13,837 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,838 INFO | (200, 512)
2021-05-27 16:56:13,838 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,838 INFO | (200, 512)
2021-05-27 16:56:13,849 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,849 INFO | (200, 512)
2021-05-27 16:56:13,849 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,850 INFO | (200, 512)
2021-05-27 16:56:13,856 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,857 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 154/574 [00:16<00:46,  9.04it/s]

2021-05-27 16:56:13,938 INFO | INITIAL
2021-05-27 16:56:13,938 INFO | (50, 200)
2021-05-27 16:56:13,944 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:13,944 INFO | (50, 200, 512)
2021-05-27 16:56:13,946 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:13,946 INFO | (50, 200, 512)
2021-05-27 16:56:13,947 INFO | BERT LAYER
2021-05-27 16:56:13,948 INFO | (200, 512)
2021-05-27 16:56:13,948 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,948 INFO | (200, 512)
2021-05-27 16:56:13,949 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,949 INFO | (200, 512)
2021-05-27 16:56:13,954 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,955 INFO | (200, 512)
2021-05-27 16:56:13,955 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,956 INFO | (200, 512)
2021-05-27 16:56:13,962 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,962 INFO | (200, 512)
2021-05-27 16:56:13,963 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:13,963 INFO | (200, 512)
2021-05-27 16:56:13,969 INFO | BERT LAYER LOOP
2021-05-27 16:56:13,970 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 155/574 [00:16<00:45,  9.18it/s]

2021-05-27 16:56:14,043 INFO | INITIAL
2021-05-27 16:56:14,044 INFO | (50, 200)
2021-05-27 16:56:14,050 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,051 INFO | (50, 200, 512)
2021-05-27 16:56:14,052 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,053 INFO | (50, 200, 512)
2021-05-27 16:56:14,054 INFO | BERT LAYER
2021-05-27 16:56:14,054 INFO | (200, 512)
2021-05-27 16:56:14,055 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,055 INFO | (200, 512)
2021-05-27 16:56:14,056 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,056 INFO | (200, 512)
2021-05-27 16:56:14,063 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,064 INFO | (200, 512)
2021-05-27 16:56:14,065 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,065 INFO | (200, 512)
2021-05-27 16:56:14,072 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,073 INFO | (200, 512)
2021-05-27 16:56:14,073 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,074 INFO | (200, 512)
2021-05-27 16:56:14,080 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,081 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 156/574 [00:16<00:45,  9.11it/s]

2021-05-27 16:56:14,154 INFO | INITIAL
2021-05-27 16:56:14,154 INFO | (50, 200)
2021-05-27 16:56:14,160 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,160 INFO | (50, 200, 512)
2021-05-27 16:56:14,161 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,162 INFO | (50, 200, 512)
2021-05-27 16:56:14,163 INFO | BERT LAYER
2021-05-27 16:56:14,163 INFO | (200, 512)
2021-05-27 16:56:14,164 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,164 INFO | (200, 512)
2021-05-27 16:56:14,164 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,165 INFO | (200, 512)
2021-05-27 16:56:14,171 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,171 INFO | (200, 512)
2021-05-27 16:56:14,172 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,172 INFO | (200, 512)
2021-05-27 16:56:14,180 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,180 INFO | (200, 512)
2021-05-27 16:56:14,181 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,181 INFO | (200, 512)
2021-05-27 16:56:14,186 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,187 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  27%|██▋       | 156/574 [00:16<00:45,  9.11it/s]

2021-05-27 16:56:14,253 INFO | INITIAL
2021-05-27 16:56:14,254 INFO | (50, 200)
2021-05-27 16:56:14,259 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,259 INFO | (50, 200, 512)
2021-05-27 16:56:14,261 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,261 INFO | (50, 200, 512)
2021-05-27 16:56:14,262 INFO | BERT LAYER
2021-05-27 16:56:14,262 INFO | (200, 512)
2021-05-27 16:56:14,263 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,263 INFO | (200, 512)
2021-05-27 16:56:14,264 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,264 INFO | (200, 512)
2021-05-27 16:56:14,270 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,270 INFO | (200, 512)
2021-05-27 16:56:14,271 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,271 INFO | (200, 512)
2021-05-27 16:56:14,278 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,278 INFO | (200, 512)
2021-05-27 16:56:14,279 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,279 INFO | (200, 512)
2021-05-27 16:56:14,285 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,286 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 158/574 [00:17<00:44,  9.43it/s]

2021-05-27 16:56:14,357 INFO | INITIAL
2021-05-27 16:56:14,358 INFO | (50, 200)
2021-05-27 16:56:14,364 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,364 INFO | (50, 200, 512)
2021-05-27 16:56:14,366 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,366 INFO | (50, 200, 512)
2021-05-27 16:56:14,367 INFO | BERT LAYER
2021-05-27 16:56:14,368 INFO | (200, 512)
2021-05-27 16:56:14,368 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,368 INFO | (200, 512)
2021-05-27 16:56:14,369 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,369 INFO | (200, 512)
2021-05-27 16:56:14,375 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,376 INFO | (200, 512)
2021-05-27 16:56:14,376 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,377 INFO | (200, 512)
2021-05-27 16:56:14,384 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,385 INFO | (200, 512)
2021-05-27 16:56:14,385 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,386 INFO | (200, 512)
2021-05-27 16:56:14,393 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,396 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 159/574 [00:17<00:44,  9.22it/s]

2021-05-27 16:56:14,473 INFO | INITIAL
2021-05-27 16:56:14,474 INFO | (50, 200)
2021-05-27 16:56:14,479 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,480 INFO | (50, 200, 512)
2021-05-27 16:56:14,481 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,481 INFO | (50, 200, 512)
2021-05-27 16:56:14,482 INFO | BERT LAYER
2021-05-27 16:56:14,482 INFO | (200, 512)
2021-05-27 16:56:14,483 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,484 INFO | (200, 512)
2021-05-27 16:56:14,484 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,484 INFO | (200, 512)
2021-05-27 16:56:14,489 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,490 INFO | (200, 512)
2021-05-27 16:56:14,491 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,491 INFO | (200, 512)
2021-05-27 16:56:14,497 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,498 INFO | (200, 512)
2021-05-27 16:56:14,498 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,499 INFO | (200, 512)
2021-05-27 16:56:14,505 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,506 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 160/574 [00:17<00:45,  9.12it/s]

2021-05-27 16:56:14,586 INFO | INITIAL
2021-05-27 16:56:14,587 INFO | (50, 200)
2021-05-27 16:56:14,593 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,594 INFO | (50, 200, 512)
2021-05-27 16:56:14,596 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,596 INFO | (50, 200, 512)
2021-05-27 16:56:14,597 INFO | BERT LAYER
2021-05-27 16:56:14,598 INFO | (200, 512)
2021-05-27 16:56:14,598 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,599 INFO | (200, 512)
2021-05-27 16:56:14,600 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,600 INFO | (200, 512)
2021-05-27 16:56:14,605 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,606 INFO | (200, 512)
2021-05-27 16:56:14,606 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,607 INFO | (200, 512)
2021-05-27 16:56:14,615 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,616 INFO | (200, 512)
2021-05-27 16:56:14,617 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,617 INFO | (200, 512)
2021-05-27 16:56:14,624 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,625 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 161/574 [00:17<00:46,  8.93it/s]

2021-05-27 16:56:14,705 INFO | INITIAL
2021-05-27 16:56:14,705 INFO | (50, 200)
2021-05-27 16:56:14,712 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,713 INFO | (50, 200, 512)
2021-05-27 16:56:14,715 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,716 INFO | (50, 200, 512)
2021-05-27 16:56:14,717 INFO | BERT LAYER
2021-05-27 16:56:14,717 INFO | (200, 512)
2021-05-27 16:56:14,717 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,718 INFO | (200, 512)
2021-05-27 16:56:14,719 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,719 INFO | (200, 512)
2021-05-27 16:56:14,725 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,725 INFO | (200, 512)
2021-05-27 16:56:14,727 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,727 INFO | (200, 512)
2021-05-27 16:56:14,734 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,734 INFO | (200, 512)
2021-05-27 16:56:14,735 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,735 INFO | (200, 512)
2021-05-27 16:56:14,742 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,743 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 162/574 [00:17<00:46,  8.79it/s]

2021-05-27 16:56:14,823 INFO | INITIAL
2021-05-27 16:56:14,823 INFO | (50, 200)
2021-05-27 16:56:14,830 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,831 INFO | (50, 200, 512)
2021-05-27 16:56:14,832 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,832 INFO | (50, 200, 512)
2021-05-27 16:56:14,833 INFO | BERT LAYER
2021-05-27 16:56:14,834 INFO | (200, 512)
2021-05-27 16:56:14,834 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,834 INFO | (200, 512)
2021-05-27 16:56:14,835 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,836 INFO | (200, 512)
2021-05-27 16:56:14,844 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,844 INFO | (200, 512)
2021-05-27 16:56:14,845 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,846 INFO | (200, 512)
2021-05-27 16:56:14,852 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,852 INFO | (200, 512)
2021-05-27 16:56:14,853 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,853 INFO | (200, 512)
2021-05-27 16:56:14,860 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,861 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 163/574 [00:17<00:46,  8.77it/s]

2021-05-27 16:56:14,938 INFO | INITIAL
2021-05-27 16:56:14,938 INFO | (50, 200)
2021-05-27 16:56:14,944 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:14,944 INFO | (50, 200, 512)
2021-05-27 16:56:14,946 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:14,946 INFO | (50, 200, 512)
2021-05-27 16:56:14,947 INFO | BERT LAYER
2021-05-27 16:56:14,947 INFO | (200, 512)
2021-05-27 16:56:14,947 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,948 INFO | (200, 512)
2021-05-27 16:56:14,948 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,948 INFO | (200, 512)
2021-05-27 16:56:14,954 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,954 INFO | (200, 512)
2021-05-27 16:56:14,955 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,955 INFO | (200, 512)
2021-05-27 16:56:14,960 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,961 INFO | (200, 512)
2021-05-27 16:56:14,961 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:14,962 INFO | (200, 512)
2021-05-27 16:56:14,967 INFO | BERT LAYER LOOP
2021-05-27 16:56:14,967 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  28%|██▊       | 163/574 [00:17<00:46,  8.77it/s]

2021-05-27 16:56:15,032 INFO | INITIAL
2021-05-27 16:56:15,032 INFO | (50, 200)
2021-05-27 16:56:15,037 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:15,037 INFO | (50, 200, 512)
2021-05-27 16:56:15,039 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:15,040 INFO | (50, 200, 512)
2021-05-27 16:56:15,041 INFO | BERT LAYER
2021-05-27 16:56:15,042 INFO | (200, 512)
2021-05-27 16:56:15,042 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,043 INFO | (200, 512)
2021-05-27 16:56:15,044 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,044 INFO | (200, 512)
2021-05-27 16:56:15,050 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,051 INFO | (200, 512)
2021-05-27 16:56:15,054 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,054 INFO | (200, 512)
2021-05-27 16:56:15,062 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,063 INFO | (200, 512)
2021-05-27 16:56:15,064 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,064 INFO | (200, 512)
2021-05-27 16:56:15,071 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,072 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  29%|██▊       | 165/574 [00:17<00:45,  9.07it/s]

2021-05-27 16:56:15,149 INFO | INITIAL
2021-05-27 16:56:15,150 INFO | (50, 200)
2021-05-27 16:56:15,155 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:15,155 INFO | (50, 200, 512)
2021-05-27 16:56:15,157 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:15,158 INFO | (50, 200, 512)
2021-05-27 16:56:15,158 INFO | BERT LAYER
2021-05-27 16:56:15,159 INFO | (200, 512)
2021-05-27 16:56:15,159 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,160 INFO | (200, 512)
2021-05-27 16:56:15,161 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,161 INFO | (200, 512)
2021-05-27 16:56:15,168 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,169 INFO | (200, 512)
2021-05-27 16:56:15,169 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,170 INFO | (200, 512)
2021-05-27 16:56:15,178 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,178 INFO | (200, 512)
2021-05-27 16:56:15,179 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,180 INFO | (200, 512)
2021-05-27 16:56:15,187 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,187 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 166/574 [00:17<00:44,  9.13it/s]

2021-05-27 16:56:15,256 INFO | INITIAL
2021-05-27 16:56:15,257 INFO | (50, 200)
2021-05-27 16:56:15,263 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:15,264 INFO | (50, 200, 512)
2021-05-27 16:56:15,265 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:15,266 INFO | (50, 200, 512)
2021-05-27 16:56:15,266 INFO | BERT LAYER
2021-05-27 16:56:15,267 INFO | (200, 512)
2021-05-27 16:56:15,267 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,267 INFO | (200, 512)
2021-05-27 16:56:15,268 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,268 INFO | (200, 512)
2021-05-27 16:56:15,273 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,274 INFO | (200, 512)
2021-05-27 16:56:15,274 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,275 INFO | (200, 512)
2021-05-27 16:56:15,281 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,281 INFO | (200, 512)
2021-05-27 16:56:15,282 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,282 INFO | (200, 512)
2021-05-27 16:56:15,287 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,287 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 166/574 [00:18<00:44,  9.13it/s]

2021-05-27 16:56:15,355 INFO | INITIAL
2021-05-27 16:56:15,356 INFO | (50, 200)
2021-05-27 16:56:15,361 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:15,362 INFO | (50, 200, 512)
2021-05-27 16:56:15,363 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:15,363 INFO | (50, 200, 512)
2021-05-27 16:56:15,364 INFO | BERT LAYER
2021-05-27 16:56:15,364 INFO | (200, 512)
2021-05-27 16:56:15,365 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,365 INFO | (200, 512)
2021-05-27 16:56:15,366 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,366 INFO | (200, 512)
2021-05-27 16:56:15,372 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,373 INFO | (200, 512)
2021-05-27 16:56:15,374 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,374 INFO | (200, 512)
2021-05-27 16:56:15,382 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,382 INFO | (200, 512)
2021-05-27 16:56:15,383 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,383 INFO | (200, 512)
2021-05-27 16:56:15,388 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,389 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 168/574 [00:18<00:43,  9.38it/s]

2021-05-27 16:56:15,461 INFO | INITIAL
2021-05-27 16:56:15,461 INFO | (50, 200)
2021-05-27 16:56:15,467 INFO | POST EMBEDDING LAYER
2021-05-27 16:56:15,467 INFO | (50, 200, 512)
2021-05-27 16:56:15,469 INFO | POST POSITIONAL ENCODING
2021-05-27 16:56:15,469 INFO | (50, 200, 512)
2021-05-27 16:56:15,470 INFO | BERT LAYER
2021-05-27 16:56:15,470 INFO | (200, 512)
2021-05-27 16:56:15,471 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,471 INFO | (200, 512)
2021-05-27 16:56:15,471 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,472 INFO | (200, 512)
2021-05-27 16:56:15,477 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,478 INFO | (200, 512)
2021-05-27 16:56:15,478 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,479 INFO | (200, 512)
2021-05-27 16:56:15,484 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,485 INFO | (200, 512)
2021-05-27 16:56:15,485 INFO | MULTIHEADED ATTENTION
2021-05-27 16:56:15,485 INFO | (200, 512)
2021-05-27 16:56:15,491 INFO | BERT LAYER LOOP
2021-05-27 16:56:15,491 INFO | (200, 

Epoch: 001, Loss: 0.000, Accuracy: 0.000% :  29%|██▉       | 168/574 [00:18<00:44,  9.21it/s]


KeyboardInterrupt: 