In [1]:
cfg = {
    'learning_rate': 0.001,
    'epochs': 100,
    'embedding_dim': 50,
    'batch_size': 32,
    'dropout': 0.2,
    'optimizer': 'Adam',
    'num_layers': 2,
    'num_heads': 2,
    'context_size': 64
}

In [2]:
LEARNING_RATE = cfg['learning_rate']
EPOCHS = cfg['epochs']
EMBEDDING_DIM = cfg['embedding_dim']
BATCH_SIZE = cfg['batch_size']
DROPOUT = cfg['dropout']
OPTIMIZER = cfg['optimizer']
NUM_LAYERS = cfg['num_layers']
NUM_HEADS = cfg['num_heads']
CONTEXT_SIZE = cfg['context_size']

DIR = '/scratch/shu7bh/RES/1/'

In [3]:
import os
if not os.path.exists(DIR):
    os.makedirs(DIR)

In [4]:
import torch

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


### Preprocessing

In [5]:
from nltk.tokenize import word_tokenize
import unicodedata
import re

def normalize_unicode(text: str) -> str:
    return unicodedata.normalize('NFD', text)


def clean_data_en(text: str) -> str:
    text = normalize_unicode(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    text = re.sub(r"(['])", r" \1", text)
    return text


def clean_data_fr(text: str) -> str:
    text = normalize_unicode(text.lower().strip())
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Zàâçéèêëîïôûùüÿñæœ.!?]+", r" ", text)
    text = re.sub(r"(['])", r" \1", text)
    return text


def tokenize_data_en(text: str, unique_words_en: list) -> list:
    tokens = word_tokenize(text)

    if unique_words_en is not None:
        tokens = [token if token in unique_words_en else '<unk>' for token in tokens]

    return tokens


def tokenize_data_fr(text: str, unique_words_fr: list) -> list:
    tokens = word_tokenize(text, language='french')

    if unique_words_fr is not None:
        tokens = [token if token in unique_words_fr else '<unk>' for token in tokens]

    return tokens


def read_data(path: str, unique_words_en: list, unique_words_fr: list):
    data_en = []

    with open(path + '.en', 'r') as f:
        data_en = f.read().split('\n')

    data_en = [tokenize_data_en(clean_data_en(line), unique_words_en) for line in data_en]

    data_fr = []

    with open(path + '.fr', 'r') as f:
        data_fr = f.read().split('\n')

    data_fr = [tokenize_data_fr(clean_data_fr(line), unique_words_fr) for line in data_fr]

    return data_en, data_fr

In [6]:
train_en, train_fr = read_data('data/train', None, None)

In [7]:
unique_words_en = set()
unique_words_fr = set()

for line in train_en:
    unique_words_en.update(line)

for line in train_fr:
    unique_words_fr.update(line)

unique_words_en = list(unique_words_en)
unique_words_fr = list(unique_words_fr)

In [8]:
dev_en, dev_fr = read_data('data/dev', unique_words_en, unique_words_fr)

In [9]:
from icecream import ic

Word to Index

In [10]:
words_to_idx_en = {word: idx + 1 for idx, word in enumerate(unique_words_en)}

words_to_idx_en['<pad>'] = 0
words_to_idx_en['<unk>'] = len(words_to_idx_en)
words_to_idx_en['<sos>'] = len(words_to_idx_en)
words_to_idx_en['<eos>'] = len(words_to_idx_en)

idx_to_words_en = {idx: word for word, idx in words_to_idx_en.items()}

words_to_idx_fr = {word: idx + 1 for idx, word in enumerate(unique_words_fr)}

words_to_idx_fr['<pad>'] = 0
words_to_idx_fr['<unk>'] = len(words_to_idx_fr)
words_to_idx_fr['<sos>'] = len(words_to_idx_fr)
words_to_idx_fr['<eos>'] = len(words_to_idx_fr)

idx_to_words_fr = {idx: word for word, idx in words_to_idx_fr.items()}

ic(len(words_to_idx_en))
ic(len(words_to_idx_fr))

ic| len(words_to_idx_en): 20993
ic| len(words_to_idx_fr): 24037


24037

In [11]:
words_to_idx_fr['<pad>']

0

### Dataset

In [12]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    def __init__(self, data_en, data_fr, words_to_idx_en, words_to_idx_fr):
        self.data_en = []
        self.data_fr = []
        
        for sentence in data_en:
            self.data_en.append(sentence[:CONTEXT_SIZE - 2])

        for sentence in data_fr:
            self.data_fr.append(sentence[:CONTEXT_SIZE - 2])

        self.data_y = [[] for _ in range(len(self.data_fr))]

        for i in range(len(self.data_en)):
            self.data_en[i] = self.__add_padding(*self.__convert_to_tokens(self.data_en[i], words_to_idx_en))
            self.data_fr[i] = self.__add_padding(*self.__convert_to_tokens(self.data_fr[i], words_to_idx_fr))
            self.data_y[i]  = self.data_fr[i][1:] + [words_to_idx_fr['<pad>']]

        self.data_en = torch.tensor(self.data_en)
        self.data_fr = torch.tensor(self.data_fr)
        self.data_y = torch.tensor(self.data_y)


    def __len__(self):
        return len(self.data_en)

    def __getitem__(self, idx):
        en = self.data_en[idx]
        fr = self.data_fr[idx]
        y = self.data_y[idx]

        len_en = torch.tensor(len(en))
        len_fr = torch.tensor(len(fr))

        return en, fr, y, len_en, len_fr

    def __convert_to_tokens(self, sentence, words_to_idx):
        return [words_to_idx['<sos>']] + [words_to_idx[word] for word in sentence] + [words_to_idx['<eos>']], words_to_idx
    
    def __add_padding(self, sentence, words_to_idx):
        return sentence + [words_to_idx['<pad>']] * (CONTEXT_SIZE - len(sentence))

In [13]:
train_dataset = TranslationDataset(train_en, train_fr, words_to_idx_en, words_to_idx_fr)
dev_dataset = TranslationDataset(dev_en, dev_fr, words_to_idx_en, words_to_idx_fr)

In [14]:
# en, fr, y, l1, l2 = train_dataset.__getitem__(0)

# for i in range(len(en)):
#     print(idx_to_words_en[en[i].item()], end=' ')
# print()
# for i in range(len(fr)):
#     print(idx_to_words_fr[fr[i].item()], end=' ')
# print()
# for i in range(len(y)):
#     print(idx_to_words_fr[y[i].item()], end=' ')
# print()

In [15]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
from torch import nn
from torch.nn import functional as F

In [17]:
# B = 5
# H = 2
# C = 6
# D = 3

# a = torch.randn(B, H, C, D)
# ic(a.shape)
# l = ic(torch.Tensor([4, 3, 2, 6, 1]))

In [18]:
# create mask to ignore padding in the input sequence of length l
# mask = ic(torch.arange(C)[None, :] < l[:, None])
# mask = mask.float()
# mask = mask.unsqueeze(1)
# ic(mask.shape)

# ic.disable()
# mask = ic(mask.transpose(1, 2) @ mask)
# ic.enable()
# ic(mask.shape)

# mask[0]

In [19]:
# qk = a @ a.transpose(2, 3)
# ic(qk.shape)

# qk = qk / (a.size(2) ** 0.5)
# ic(qk.shape)

# qk = qk.permute(1, 0, 2, 3)
# qk = ic(qk.masked_fill(mask == 0, float('-inf')))
# qk = qk.permute(1, 0, 2, 3)

# ic(qk[0])

# qk = ic(F.softmax(qk, dim=-1))

# # whichever element is nan in qk, set it to 0
# qk[qk != qk] = 0
# qk

### Transformer

In [20]:
def Positional_Encoding(x, EMBEDDING_DIM, CONTEXT_SIZE):
    pos = torch.arange(0, CONTEXT_SIZE, device=x.device).unsqueeze(1)

    PE = torch.zeros(CONTEXT_SIZE, EMBEDDING_DIM, device=x.device)

    PE[:, 0::2] = torch.sin(pos / (10000 ** (2 * torch.arange(0, EMBEDDING_DIM, 2, device=x.device) / EMBEDDING_DIM)))
    PE[:, 1::2] = torch.cos(pos / (10000 ** (2 * torch.arange(1, EMBEDDING_DIM, 2, device=x.device) / EMBEDDING_DIM)))

    PE = PE.unsqueeze(0)
    return PE

In [21]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embedding_dim: int, num_heads: int, dropout: float, mask: bool) -> None:
        
        super(MultiHeadSelfAttention, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.mask = mask

        self.W = nn.Linear(embedding_dim, 3 * embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, l):
        batch_size = x.size(0)
        context_size = x.size(1)

        qkv = self.W(x)
        qkv = qkv.view(batch_size, context_size, 3, self.num_heads, self.embedding_dim // self.num_heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = q @ k.permute(0, 1, 3, 2)
        attn = attn / (self.embedding_dim ** 0.5)

        mask = (torch.arange(context_size, device=l.device)[None, :] < l[:, None]).float().unsqueeze(1)
        mask = mask.transpose(1, 2) @ mask

        attn = attn.permute(1, 0, 2, 3)
        attn = attn.masked_fill(mask == 0, float('-inf'))
        attn = attn.permute(1, 0, 2, 3)

        if self.mask:
            mask = torch.tril(torch.ones(context_size, context_size, device=attn.device))[None, :, :]
            attn = attn.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(attn, dim=-1)

        attn = attn.nan_to_num()

        attn = self.dropout(attn)

        x = attn @ v
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, context_size, self.embedding_dim)

        return x

In [22]:
class EncoderLayer(nn.Module):
    def __init__(
        self, 
        embedding_dim: int,
        num_heads: int,
        context_size: int,
        dropout: float,
    ) -> None:
        
        super(EncoderLayer, self).__init__()

        self.multi_head_self_attention = MultiHeadSelfAttention(embedding_dim, num_heads, dropout, mask=False)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(embedding_dim, embedding_dim)
        self.activation = nn.ReLU()

    def forward(self, input: tuple) -> torch.Tensor:
        en, l = input
        rc = en.clone()
        en = self.multi_head_self_attention(en, l)
        en = self.dropout(en)
        en = self.layer_norm(en + rc)
        rc = en.clone()
        en = self.fc(en)
        en = self.activation(en)
        en = self.dropout(en)
        en = self.layer_norm(en + rc)
        return (en, l)

In [23]:
class Encoder(nn.Module):
    def __init__(
        self, 
        vocab_size: int,
        embedding_dim: int,
        num_heads: int,
        context_size: int,
        dropout: float,
        filename: str = None
    ) -> None:
        
        super(Encoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = Positional_Encoding
        self.layers = nn.ModuleList([EncoderLayer(embedding_dim, num_heads, context_size, dropout) for _ in range(NUM_LAYERS)])
        self.layers = nn.Sequential(*self.layers)
        # ic(self.layers)
        self.context_size = context_size
        self.embedding_dim = embedding_dim

        if filename is not None:
            self.load_state_dict(torch.load(filename))

    def forward(self, en: torch.Tensor, l: torch.Tensor) -> torch.Tensor:
        en = self.embedding(en)
        en = en + self.positional_encoding(en, self.embedding_dim, self.context_size)
        # ic(en.shape)
        # ic(l.shape)
        en, _ = self.layers((en, l))
        return en

In [24]:
# q = ic(torch.randn(1, 6, 3))
# k = ic(torch.randn(4, 6, 3))

# qk = ic(q @ k.transpose(1, 2))

In [25]:
# l_en = ic(torch.Tensor([3, 2, 5, 1]))
# l_fr = ic(torch.Tensor([4]))

# mask_en = ic((torch.arange(6)[None, :] < l_en[:, None]).float())
# mask_de = ic((torch.arange(6)[None, :] < l_fr[:, None]).float())

# ic(mask_de.shape)
# mask_de = mask_de.unsqueeze(1)
# mask_en = mask_en.unsqueeze(1)

# mask = ic(mask_de.transpose(1, 2) @ mask_en)

In [26]:
class EncoderDecoderAttention(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        dropout: float,
    ) -> None:
        
        super(EncoderDecoderAttention, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dropout = dropout

        self.W_Q = nn.Linear(embedding_dim, embedding_dim)
        self.W_KV = nn.Linear(embedding_dim, 2 * embedding_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, en: torch.Tensor, fr: torch.Tensor, l_en: torch.Tensor, l_fr: torch.Tensor) -> torch.Tensor:
        batch_size = en.size(0)
        context_size = en.size(1)

        q = self.W_Q(fr).view(batch_size, context_size, self.num_heads, self.embedding_dim // self.num_heads).permute(0, 2, 1, 3)
        kv = self.W_KV(en)
        k, v = kv.view(batch_size, context_size, 2, self.num_heads, self.embedding_dim // self.num_heads).permute(2, 0, 3, 1, 4)

        attn = q @ k.permute(0, 1, 3, 2)
        attn = attn / (self.embedding_dim ** 0.5)

        mask_en = (torch.arange(context_size, device=l_en.device)[None, :] < l_en[:, None]).float().unsqueeze(1)
        mask_fr = (torch.arange(context_size, device=l_fr.device)[None, :] < l_fr[:, None]).float().unsqueeze(1)

        mask = (mask_fr.transpose(1, 2) @ mask_en)

        attn = attn.permute(1, 0, 2, 3)
        attn = attn.masked_fill(mask == 0, float('-inf'))
        attn = attn.permute(1, 0, 2, 3)

        attn = F.softmax(attn, dim=-1)
        attn = attn.nan_to_num()

        attn = self.dropout(attn)

        en = attn @ v
        en = en.permute(0, 2, 1, 3).contiguous()
        en = en.view(batch_size, context_size, self.embedding_dim)

        return en

In [27]:
class DecoderLayer(nn.Module):
    def __init__(
        self, 
        embedding_dim: int,
        num_heads: int,
        context_size: int,
        dropout: float,
    ) -> None:
        
        super(DecoderLayer, self).__init__()

        self.multi_head_self_attention = MultiHeadSelfAttention(embedding_dim, num_heads, dropout, mask=True)
        self.encoder_decoder_attention = EncoderDecoderAttention(embedding_dim, num_heads, dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.fc = nn.Linear(embedding_dim, embedding_dim)
        self.activation = nn.ReLU()
        
    def forward(self, input: tuple) -> torch.Tensor:
        en, fr, l_en, l_fr = input
        rc = fr.clone()
        fr = self.multi_head_self_attention(fr, l_fr)
        fr = self.dropout(fr)
        fr = self.layer_norm(fr + rc)
        rc = fr.clone()
        fr = self.encoder_decoder_attention(en, fr, l_en, l_fr)
        fr = self.dropout(fr)
        fr = self.layer_norm(fr + rc)
        rc = fr.clone()
        fr = self.fc(fr)
        fr = self.activation(fr)
        fr = self.dropout(fr)
        fr = self.layer_norm(fr + rc)
        return (en, fr, l_en, l_fr)

In [28]:
class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        num_heads: int,
        context_size: int,
        dropout: float,
        filename: str = None
    ) -> None:
        
        super(Decoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = Positional_Encoding
        self.layers = nn.ModuleList([DecoderLayer(embedding_dim, num_heads, context_size, dropout) for _ in range(NUM_LAYERS)])
        self.layers = nn.Sequential(*self.layers)
        self.context_size = context_size
        self.embedding_dim = embedding_dim

        if filename is not None:
            self.load_state_dict(torch.load(filename))

    def forward(self, en: torch.Tensor, fr: torch.Tensor, l_en: torch.Tensor, l_fr: torch.Tensor) -> torch.Tensor:
        fr = self.embedding(fr)
        fr = fr + self.positional_encoding(fr, self.embedding_dim, self.context_size)
        _, fr, _, _ = self.layers((en, fr, l_en, l_fr))
        return fr

Early Stopping

In [29]:
import numpy as np

class EarlyStopping:
    def __init__(self, patience:int = 3, delta:float = 0.001):
        self.patience = patience
        self.counter = 0
        self.best_loss:float = np.inf
        self.best_model_pth = 0
        self.delta = delta

    def __call__(self, loss, epoch: int):
        should_stop = False

        if loss >= self.best_loss - self.delta:
            self.counter += 1
            if self.counter > self.patience:
                should_stop = True
        else:
            self.best_loss = loss
            self.counter = 0
            self.best_model_pth = epoch
        return should_stop

In [30]:
from tqdm import tqdm
import wandb

In [31]:
class Transformer(nn.Module):
    def __init__(self, vocab_size_en: int, vocab_size_fr: int, embedding_dim: int, num_heads: int, context_size: int, dropout: float, filename: str = None) -> None:

        super(Transformer, self).__init__()

        self.encoder = Encoder(vocab_size_en, embedding_dim, num_heads, context_size, dropout, filename)
        self.decoder = Decoder(vocab_size_fr, embedding_dim, num_heads, context_size, dropout, filename)
        self.fc = nn.Linear(embedding_dim, vocab_size_fr)

    def forward(self, en: torch.Tensor, fr: torch.Tensor, len_en: torch.Tensor, len_fr: torch.Tensor) -> torch.Tensor:
        en = self.encoder(en, len_en)
        en = self.decoder(en, fr, len_en, len_fr)
        en = self.fc(en)
        return en

    def fit(self, train_loader: DataLoader, validation_loader: DataLoader, epochs: int, learning_rate: float, filename: str) -> None:
        self.es = EarlyStopping()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss(ignore_index=words_to_idx_fr['<pad>'])

        for epoch in range(epochs):
        # for epoch in tqdm(range(epochs)):
            print(f'Epoch: {epoch + 1}/{epochs}')

            self.__train(train_loader)
            loss = self.__validate(validation_loader)

            if self.es(loss, epoch):
                break
            if self.es.counter == 0:
                torch.save(self.state_dict(), filename)


    def __train(self, train_loader: DataLoader) -> None:
        self.train()
        total_loss = []

        pbar = tqdm(train_loader, total=len(train_loader))
        for en, fr, y, len_en, len_fr in pbar:
            loss = self.__call(en, fr, y, len_en, len_fr)
            total_loss.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            pbar.set_description(f'T Loss: {loss.item():7.4f}, Avg Loss: {np.mean(total_loss):7.4f}, Counter: {self.es.counter}, Best Loss: {self.es.best_loss:7.4f}')

        # wandb.log({'train_loss': np.mean(total_loss)})

    def __validate(self, validation_loader: DataLoader) -> None:
        self.eval()
        total_loss = []

        with torch.no_grad():
            pbar = tqdm(validation_loader, total=len(validation_loader))
            for en, fr, y, len_en, len_fr in pbar:
                loss = self.__call(en, fr, y, len_en, len_fr)
                total_loss.append(loss.item())

                pbar.set_description(f'V Loss: {loss.item():7.4f}, Avg Loss: {np.mean(total_loss):7.4f}, Counter: {self.es.counter}, Best Loss: {self.es.best_loss:7.4f}')

        # wandb.log({'validation_loss': np.mean(total_loss)})
        return np.mean(total_loss)

    def __call(self, en: torch.Tensor, fr: torch.Tensor, y: torch.Tensor, len_en: torch.Tensor, len_fr: torch.Tensor) -> torch.Tensor:

        en = en.to(DEVICE)
        fr = fr.to(DEVICE)
        y = y.to(DEVICE)
        len_en = len_en.to(DEVICE)
        len_fr = len_fr.to(DEVICE)

        output = self(en, fr, len_en, len_fr)
        output = output.view(-1, output.size(-1))
        y = y.view(-1)

        loss = self.criterion(output, y)

        return loss

    # def __evaluate(self, test_loader: DataLoader) -> None:
    #     self.eval()
    #     total_loss = []

    #     with torch.no_grad():
    #         pbar = tqdm(test_loader, total=len(test_loader))
    #         for en, fr, y, len_en, len_fr in pbar:
    #             loss = self.__call(en, fr, y, len_en, len_fr)
    #             total_loss.append(loss.item())

    #             pbar.set_description(f'T Loss: {loss.item():7.4f}, Avg Loss: {np.mean(total_loss):7.4f}')

    #     return np.mean(total_loss)

Initiate Model

In [32]:
model = ic(Transformer(len(words_to_idx_en), len(words_to_idx_fr), EMBEDDING_DIM, NUM_HEADS, CONTEXT_SIZE, DROPOUT, filename=None).to(DEVICE))

ic| Transformer(len(words_to_idx_en), len(words_to_idx_fr), EMBEDDING_DIM, NUM_HEADS, CONTEXT_SIZE, DROPOUT, filename=None).to(DEVICE): Transformer(
                                                                                                                                          (encoder): Encoder(
                                                                                                                                            (embedding): Embedding(20993, 50)
                                                                                                                                            (layers): Sequential(
                                                                                                                                              (0): EncoderLayer(
                                                                                                                                                (multi_head_self_attention): MultiHeadSelfAttention(

In [33]:
from torchinfo import summary

summary(model, device=DEVICE)

Layer (type:depth-idx)                             Param #
Transformer                                        --
├─Encoder: 1-1                                     --
│    └─Embedding: 2-1                              1,049,650
│    └─Sequential: 2-2                             --
│    │    └─EncoderLayer: 3-1                      10,300
│    │    └─EncoderLayer: 3-2                      10,300
├─Decoder: 1-2                                     --
│    └─Embedding: 2-3                              1,201,850
│    └─Sequential: 2-4                             --
│    │    └─DecoderLayer: 3-3                      17,950
│    │    └─DecoderLayer: 3-4                      17,950
├─Linear: 1-3                                      1,225,887
Total params: 3,533,887
Trainable params: 3,533,887
Non-trainable params: 0

In [34]:
model.fit(train_loader, dev_loader, EPOCHS, LEARNING_RATE, DIR + 'best_model.pth')

Epoch: 1/100


T Loss:  5.1695, Avg Loss:  5.9274, Counter: 0, Best Loss:     inf: 100%|██████████| 938/938 [00:28<00:00, 32.85it/s]
V Loss:  5.5179, Avg Loss:  5.3846, Counter: 0, Best Loss:     inf: 100%|██████████| 28/28 [00:00<00:00, 81.30it/s]


Epoch: 2/100


T Loss:  4.7902, Avg Loss:  4.9150, Counter: 0, Best Loss:  5.3846: 100%|██████████| 938/938 [00:27<00:00, 34.52it/s]
V Loss:  5.0901, Avg Loss:  5.0893, Counter: 0, Best Loss:  5.3846: 100%|██████████| 28/28 [00:00<00:00, 81.28it/s]


Epoch: 3/100


T Loss:  4.7689, Avg Loss:  4.5859, Counter: 0, Best Loss:  5.0893: 100%|██████████| 938/938 [00:27<00:00, 34.09it/s]
V Loss:  4.6970, Avg Loss:  4.9542, Counter: 0, Best Loss:  5.0893: 100%|██████████| 28/28 [00:00<00:00, 80.24it/s]


Epoch: 4/100


T Loss:  3.9076, Avg Loss:  4.3678, Counter: 0, Best Loss:  4.9542: 100%|██████████| 938/938 [00:27<00:00, 34.14it/s]
V Loss:  4.7980, Avg Loss:  4.8672, Counter: 0, Best Loss:  4.9542: 100%|██████████| 28/28 [00:00<00:00, 81.61it/s]


Epoch: 5/100


T Loss:  4.2882, Avg Loss:  4.1926, Counter: 0, Best Loss:  4.8672: 100%|██████████| 938/938 [00:27<00:00, 34.10it/s]
V Loss:  4.5537, Avg Loss:  4.8069, Counter: 0, Best Loss:  4.8672: 100%|██████████| 28/28 [00:00<00:00, 81.39it/s]


Epoch: 6/100


T Loss:  4.2343, Avg Loss:  4.0432, Counter: 0, Best Loss:  4.8069: 100%|██████████| 938/938 [00:27<00:00, 34.09it/s]
V Loss:  4.4109, Avg Loss:  4.7746, Counter: 0, Best Loss:  4.8069: 100%|██████████| 28/28 [00:00<00:00, 81.50it/s]


Epoch: 7/100


T Loss:  3.5453, Avg Loss:  3.9033, Counter: 0, Best Loss:  4.7746: 100%|██████████| 938/938 [00:27<00:00, 34.24it/s]
V Loss:  4.3243, Avg Loss:  4.7423, Counter: 0, Best Loss:  4.7746: 100%|██████████| 28/28 [00:00<00:00, 80.90it/s]


Epoch: 8/100


T Loss:  3.9463, Avg Loss:  3.7803, Counter: 0, Best Loss:  4.7423: 100%|██████████| 938/938 [00:27<00:00, 34.29it/s]
V Loss:  4.8085, Avg Loss:  4.7294, Counter: 0, Best Loss:  4.7423: 100%|██████████| 28/28 [00:00<00:00, 81.53it/s]


Epoch: 9/100


T Loss:  3.8117, Avg Loss:  3.6656, Counter: 0, Best Loss:  4.7294: 100%|██████████| 938/938 [00:27<00:00, 34.20it/s]
V Loss:  4.1182, Avg Loss:  4.7015, Counter: 0, Best Loss:  4.7294: 100%|██████████| 28/28 [00:00<00:00, 81.30it/s]


Epoch: 10/100


T Loss:  3.8348, Avg Loss:  3.5579, Counter: 0, Best Loss:  4.7015: 100%|██████████| 938/938 [00:27<00:00, 34.30it/s]
V Loss:  4.3283, Avg Loss:  4.7048, Counter: 0, Best Loss:  4.7015: 100%|██████████| 28/28 [00:00<00:00, 80.95it/s]


Epoch: 11/100


T Loss:  2.8992, Avg Loss:  3.4597, Counter: 1, Best Loss:  4.7015: 100%|██████████| 938/938 [00:27<00:00, 34.19it/s]
V Loss:  4.6829, Avg Loss:  4.7042, Counter: 1, Best Loss:  4.7015: 100%|██████████| 28/28 [00:00<00:00, 81.08it/s]


Epoch: 12/100


T Loss:  3.2841, Avg Loss:  3.3700, Counter: 2, Best Loss:  4.7015: 100%|██████████| 938/938 [00:27<00:00, 34.26it/s]
V Loss:  4.6659, Avg Loss:  4.6791, Counter: 2, Best Loss:  4.7015: 100%|██████████| 28/28 [00:00<00:00, 81.29it/s]


Epoch: 13/100


T Loss:  3.2928, Avg Loss:  3.2891, Counter: 0, Best Loss:  4.6791: 100%|██████████| 938/938 [00:27<00:00, 34.61it/s]
V Loss:  4.8016, Avg Loss:  4.6896, Counter: 0, Best Loss:  4.6791: 100%|██████████| 28/28 [00:00<00:00, 81.60it/s]


Epoch: 14/100


T Loss:  3.3116, Avg Loss:  3.2150, Counter: 1, Best Loss:  4.6791: 100%|██████████| 938/938 [00:27<00:00, 34.43it/s]
V Loss:  4.5351, Avg Loss:  4.7040, Counter: 1, Best Loss:  4.6791: 100%|██████████| 28/28 [00:00<00:00, 81.38it/s]


Epoch: 15/100


T Loss:  3.0005, Avg Loss:  3.1447, Counter: 2, Best Loss:  4.6791: 100%|██████████| 938/938 [00:27<00:00, 34.24it/s]
V Loss:  4.4663, Avg Loss:  4.6959, Counter: 2, Best Loss:  4.6791: 100%|██████████| 28/28 [00:00<00:00, 81.95it/s]


Epoch: 16/100


T Loss:  3.2328, Avg Loss:  3.0804, Counter: 3, Best Loss:  4.6791: 100%|██████████| 938/938 [00:27<00:00, 34.22it/s]
V Loss:  4.6505, Avg Loss:  4.6804, Counter: 3, Best Loss:  4.6791: 100%|██████████| 28/28 [00:00<00:00, 81.92it/s]
