### Importing The Required Libraries

In [1]:
!pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0+cu121 \
  --index-url https://download.pytorch.org/whl/cu121
!pip install torchtext==0.18.0 --no-deps
  

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.3.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp312-cp312-linux_x86_64.whl (780.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.9/780.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision==0.18.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.18.0%2Bcu121-cp312-cp312-linux_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchaudio==2.3.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.3.0%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0+cu121)
  Dow

In [2]:
import torchtext
torchtext.disable_torchtext_deprecation_warning()
import torch
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import Transformer
from transformers import BertTokenizer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torchtext.vocab import Vocab,build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
import random
from itertools import chain
import pandas as pd
from copy import deepcopy
import csv
import json
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')
import numpy as np
import torch.optim as optim
import time 

### Loading The CSV Dataset

In [3]:
class BERTCSVDataset(Dataset):
    def __init__(self,filename):
        self.data=pd.read_csv(filename)
        self.tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        row = self.data.iloc[idx]
        try:
            
            bert_input = torch.tensor(json.loads(row['BERT Input']), dtype=torch.long)
            bert_label = torch.tensor(json.loads(row['BERT Label']), dtype=torch.long)
            segment_label = torch.tensor([int(x) for x in row['Segment Label'].split(',')], dtype=torch.long)
            is_next = torch.tensor(row['Is Next'], dtype=torch.long)
            original_text = row['Original Text']  # If you want to use it
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for row {idx}: {e}")
            print("BERT Input:", row['BERT Input'])
            print("BERT Label:", row['BERT Label'])
            return None  

            # Tokenizing the original text with BERT
        encoded_input = self.tokenizer.encode_plus(
            original_text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        input_ids = encoded_input['input_ids'].squeeze()
        attention_mask = encoded_input['attention_mask'].squeeze()

        return(bert_input, bert_label, segment_label, is_next, input_ids, attention_mask, original_text)


In [4]:
PAD_IDX = 0
def collate_batch(batch):
    bert_inputs_batch, bert_labels_batch, segment_labels_batch, is_nexts_batch,input_ids_batch,attention_mask_batch,original_text_battch = [], [], [], [],[],[],[]
    for bert_input, bert_label, segment_label, is_next,input_ids,attention_mask,original_text in batch:
        bert_inputs_batch.append(torch.tensor(bert_input[:512], dtype=torch.long))
        bert_labels_batch.append(torch.tensor(bert_label[:512], dtype=torch.long))
        segment_labels_batch.append(torch.tensor(segment_label[:512], dtype=torch.long))
        is_nexts_batch.append(is_next)
        input_ids_batch.append(input_ids)
        attention_mask_batch.append(attention_mask)
        original_text_battch.append(original_text)
    bert_inputs_final = pad_sequence(bert_inputs_batch, padding_value=PAD_IDX, batch_first=True)
    bert_labels_final = pad_sequence(bert_labels_batch, padding_value=PAD_IDX, batch_first=True)
    segment_labels_final = pad_sequence(segment_labels_batch, padding_value=PAD_IDX, batch_first=True)
    is_nexts_batch = torch.tensor(is_nexts_batch, dtype=torch.long)
    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_batch

In [33]:
BATCH_SIZE = 6

train_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_train_data.csv'
test_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_test_data.csv'

train_dataset = BERTCSVDataset(train_dataset_path)
test_dataset = BERTCSVDataset(test_dataset_path)
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_batch)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_batch)

### Model Creation

In [6]:
EMBEDDING_DIM=10
class TokenEmbedding(nn.Module):
    def __init__(self,vocab_size,embed_dim):
        super(TokenEmbedding,self).__init__()
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.embed_dim=embed_dim
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.embed_dim)

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 512):
        super().__init__()

        position = torch.arange(0, maxlen).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, emb_size, 2) * (-math.log(10000.0) / emb_size)
        )

        pe = torch.zeros(maxlen, emb_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        self.register_buffer("pos_embedding", pe)
        self.dropout = nn.Dropout(dropout)

    def forward(self, token_embedding: torch.Tensor):
        """
        token_embedding: [batch_size, seq_len, emb_size]
        """
        seq_len = token_embedding.size(1)
        token_embedding = token_embedding + self.pos_embedding[:, :seq_len, :]
        return self.dropout(token_embedding)


In [8]:
class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, dropout=0.1):
        super().__init__()

        self.token_embedding = TokenEmbedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, dropout)
        self.segment_embedding = nn.Embedding(3, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, bert_inputs, segment_labels):
        """
        bert_inputs:   [batch_size, seq_len]
        segment_labels:[batch_size, seq_len]
        """

        token_embeddings = self.token_embedding(bert_inputs)
        position_embeddings = self.positional_encoding(token_embeddings)
        segment_embeddings = self.segment_embedding(segment_labels)

        x = token_embeddings + position_embeddings + segment_embeddings
        x = self.dropout(x)

        return x


In [9]:
VOCAB_SIZE=147161
batch=2
count=0
for batch in train_dataloader:
    bert_input,bert_label,segement_label,is_next=[b for b in batch]
    
    count+=1
    if count==5:
        break


In [10]:
bert_input.shape

torch.Size([2, 78])

In [11]:
segement_label.shape

torch.Size([2, 78])

In [12]:
bert_label.shape

torch.Size([2, 78])

In [13]:
is_next.shape

torch.Size([2])

In [14]:
is_next

tensor([0, 1])

In [15]:
bert_input

tensor([[    1,    21,    13,     7,     3,    12,     3,    52,   619,   322,
          1926,    48,     3,     6,     2,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,    38,   464,   438,     3,
             3, 36586,   158,    35,     3,   151,   198,     5,  3487,    27,
             7,    38,   338,     7,    38,  2278,     3,     8,    38,  9685,
             6,     2,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [    1,    31,     3,   939,   519,    17,   123,   435,   406,   188,
           775, 10428,     3,    25,     3,     3,   125,  2193,    13,  1739,
            11,   118,   238, 20191,     7,    35,     3,   901,  1328,   195,
             9,  4368,   217,     6,     2,     0,     0,     0,     0,     3,
          2938,   545,   197,     5,  2585,  1286,     3,     5, 

In [16]:
segement_label

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2]])

In [17]:
# Instantiate the TokenEmbedding 
token_embedding = TokenEmbedding(VOCAB_SIZE, embed_dim=EMBEDDING_DIM )

# Get the token embeddings for a sample input
t_embeddings = token_embedding(bert_input)
#Each token is transformed into a tensor of size emb_size
print(f"Dimensions of token embeddings: {t_embeddings.size()}") # Expected: (sequence_length, batch_size, EMBEDDING_DIM)
#Check the embedded vectors for first 3 tokens of the first sample in the batch
# you get embeddings[i,0,:] where i refers to the i'th token of the first sample in the batch (b=0)
for i in range(2):
    print(f"Token Embeddings for the {i}th token of the first sample: {t_embeddings[i,0,:]}")

Dimensions of token embeddings: torch.Size([2, 78, 10])
Token Embeddings for the 0th token of the first sample: tensor([-1.9143,  0.6985,  4.7210, -1.6886, -2.6555,  1.8882, -1.5148,  4.5452,
         0.8053,  3.1830], grad_fn=<SliceBackward0>)
Token Embeddings for the 1th token of the first sample: tensor([-1.9143,  0.6985,  4.7210, -1.6886, -2.6555,  1.8882, -1.5148,  4.5452,
         0.8053,  3.1830], grad_fn=<SliceBackward0>)


### Bert Model Architecture

In [18]:
class BERT(nn.Module):
    def __init__(self,vocab_size,embed_dim,num_layers,n_head,dropout):
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads
        self.embedding=BERTEmbedding(vocab_size,embed_dim)
        encoder_layer=nn.TransformerEncoderLayer(d_model=embed_dim,nhead=n_head,dropout=dropout,batch_first=True)
        self.encoder=nn.TransformerEncoder(encoder_layer,num_layers=num_layers)
        self.NextSentencePrediction=nn.Linear(embed_dim,2)
        self.MaskedPrediction=nn.Linear(embed_dim,vocab_size)
    def forward(self,bert_input,segement_label):
        padding_mask=(bert_input==PAD_IDX)
        x=self.embedding(bert_input,segement_label)
        values_after_encoding=self.encoder(x,src_key_padding_mask=padding_mask)
        next_sentence=self.NextSentencePrediction(values_after_encoding[:,0,:])
        masked_language=self.MaskedPrediction(values_after_encoding)
        return next_sentence,masked_language 

In [19]:
EMBEDDING_DIM = 10
vocab_size = 147161
d_model = EMBEDDING_DIM  
n_layers = 2
initial_heads = 12
initial_heads = 2
heads = initial_heads - d_model % initial_heads
dropout = 0.1  
model = BERT(vocab_size, d_model, n_layers, heads, dropout)

In [20]:
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using 2 GPUs!


DataParallel(
  (module): BERT(
    (embedding): BERTEmbedding(
      (token_embedding): TokenEmbedding(
        (embedding): Embedding(147161, 10)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (segment_embedding): Embedding(3, 10)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=10, out_features=10, bias=True)
          )
          (linear1): Linear(in_features=10, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=10, bias=True)
          (norm1): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1,

### Evaluation

In [21]:
PAD_IDX=0
loss_ns=nn.CrossEntropyLoss()
loss_mlm=nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [22]:
def evaluate(dataloader, model, loss_ns, loss_mlm):
    model.eval()
    total_loss = 0
    
    for batch in dataloader:
        bert_input, bert_label, segment_label, is_next = [b.to(device) for b in batch]
        
        next_sentence_prediction, masked_language = model(bert_input, segment_label)
        
        l_ns = loss_ns(next_sentence_prediction, is_next.view(-1))
        l_mlm = loss_mlm(masked_language.reshape(-1, masked_language.size(-1)), bert_label.reshape(-1))
        
        loss = l_ns + l_mlm
        total_loss += loss.item()
        
    return total_loss / len(dataloader)

### Training

In [35]:
BATCH_SIZE = 6

train_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_train_data_sampled.csv'
test_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_test_data_sampled.csv'

train_dataset = BERTCSVDataset(train_dataset_path)
test_dataset = BERTCSVDataset(test_dataset_path)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [24]:
loss=evaluate(train_dataloader,model,loss_ns,loss_mlm)
loss

12.768256727218628

In [None]:
import time
from tqdm import tqdm
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup

from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
num_epochs=100
optimizer = Adam(model.parameters(), lr=5e-5, weight_decay=0.01)
total_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * 0.1), num_training_steps=total_steps)
check_loss=10
for epoch in range(num_epochs):
    model.train()
    start_time=time.time()
    total_loss = 0.0
    

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")
    
    for batch in progress_bar:

        bert_inputs, bert_labels, segment_labels, is_nexts = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        
        # Forward pass
        next_pred, mask_pred = model(bert_inputs, segment_labels)
        
        # Calculate individual losses
        loss_n = loss_ns(next_pred, is_nexts)
        loss_m = loss_mlm(mask_pred.view(-1, mask_pred.size(-1)), bert_labels.view(-1))
        
        # Combine losses
        loss = loss_n + loss_m
        
        if torch.isnan(loss):
            print("Warning: NaN loss detected!")
            continue
        
        # Backward pass
        loss.backward()
        
        # Stability: Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        # Optional: Update progress bar with current loss
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    end_time=time.time()
    avg_epoch_loss = total_loss / len(train_dataloader)
    print("Time Taken for the epoch",(int)(end_time-start_time))
    print(f"Epoch {epoch+1} Complete - Average Loss: {avg_epoch_loss:.4f}")
    if(avg_epoch_loss<check_loss):
        torch.save(model.state_dict(), "bert.pt")
        print("Model saved at epoch",epoch)
        check_loss=avg_epoch_loss

Epoch 1: 100%|██████████| 1667/1667 [03:02<00:00,  9.12it/s, loss=11.7924]


Time Taken for the epoch 182
Epoch 1 Complete - Average Loss: 11.5558


Epoch 2: 100%|██████████| 1667/1667 [03:00<00:00,  9.25it/s, loss=11.3332]


Time Taken for the epoch 180
Epoch 2 Complete - Average Loss: 11.5257


Epoch 3: 100%|██████████| 1667/1667 [02:59<00:00,  9.31it/s, loss=10.8581]


Time Taken for the epoch 179
Epoch 3 Complete - Average Loss: 11.4632


Epoch 4: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=10.9780]


Time Taken for the epoch 177
Epoch 4 Complete - Average Loss: 11.3654


Epoch 5: 100%|██████████| 1667/1667 [02:58<00:00,  9.35it/s, loss=11.0431]


Time Taken for the epoch 178
Epoch 5 Complete - Average Loss: 11.2293


Epoch 6: 100%|██████████| 1667/1667 [02:58<00:00,  9.32it/s, loss=10.6366]


Time Taken for the epoch 178
Epoch 6 Complete - Average Loss: 11.0543


Epoch 7: 100%|██████████| 1667/1667 [02:59<00:00,  9.31it/s, loss=10.7951]


Time Taken for the epoch 179
Epoch 7 Complete - Average Loss: 10.8592


Epoch 8: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=10.3345]


Time Taken for the epoch 177
Epoch 8 Complete - Average Loss: 10.6602


Epoch 9: 100%|██████████| 1667/1667 [02:58<00:00,  9.32it/s, loss=10.1665]


Time Taken for the epoch 178
Epoch 9 Complete - Average Loss: 10.4816


Epoch 10: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=10.6151]


Time Taken for the epoch 177
Epoch 10 Complete - Average Loss: 10.3355


Epoch 11: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=10.6051]


Time Taken for the epoch 177
Epoch 11 Complete - Average Loss: 10.2212


Epoch 12: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=9.8430] 


Time Taken for the epoch 177
Epoch 12 Complete - Average Loss: 10.1400


Epoch 13: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=9.4483] 


Time Taken for the epoch 177
Epoch 13 Complete - Average Loss: 10.0697


Epoch 14: 100%|██████████| 1667/1667 [02:57<00:00,  9.42it/s, loss=9.7246] 


Time Taken for the epoch 177
Epoch 14 Complete - Average Loss: 10.0171


Epoch 15: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=10.5532]


Time Taken for the epoch 177
Epoch 15 Complete - Average Loss: 9.9757
Model saved at epoch 14


Epoch 16: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=10.1220]


Time Taken for the epoch 177
Epoch 16 Complete - Average Loss: 9.9491
Model saved at epoch 15


Epoch 17: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=9.2464] 


Time Taken for the epoch 177
Epoch 17 Complete - Average Loss: 9.9180
Model saved at epoch 16


Epoch 18: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=11.0779]


Time Taken for the epoch 176
Epoch 18 Complete - Average Loss: 9.8861
Model saved at epoch 17


Epoch 19: 100%|██████████| 1667/1667 [02:57<00:00,  9.42it/s, loss=10.0991]


Time Taken for the epoch 177
Epoch 19 Complete - Average Loss: 9.8728
Model saved at epoch 18


Epoch 20: 100%|██████████| 1667/1667 [02:58<00:00,  9.35it/s, loss=9.3954] 


Time Taken for the epoch 178
Epoch 20 Complete - Average Loss: 9.8502
Model saved at epoch 19


Epoch 21: 100%|██████████| 1667/1667 [02:57<00:00,  9.41it/s, loss=8.5922] 


Time Taken for the epoch 177
Epoch 21 Complete - Average Loss: 9.8347
Model saved at epoch 20


Epoch 22: 100%|██████████| 1667/1667 [02:58<00:00,  9.35it/s, loss=9.9479] 


Time Taken for the epoch 178
Epoch 22 Complete - Average Loss: 9.8259
Model saved at epoch 21


Epoch 23: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=9.2367] 


Time Taken for the epoch 176
Epoch 23 Complete - Average Loss: 9.8131
Model saved at epoch 22


Epoch 24: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=9.3633] 


Time Taken for the epoch 177
Epoch 24 Complete - Average Loss: 9.8031
Model saved at epoch 23


Epoch 25: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.6524] 


Time Taken for the epoch 176
Epoch 25 Complete - Average Loss: 9.7973
Model saved at epoch 24


Epoch 26: 100%|██████████| 1667/1667 [02:56<00:00,  9.44it/s, loss=8.4176] 


Time Taken for the epoch 176
Epoch 26 Complete - Average Loss: 9.7837
Model saved at epoch 25


Epoch 27: 100%|██████████| 1667/1667 [02:56<00:00,  9.45it/s, loss=10.5796]


Time Taken for the epoch 176
Epoch 27 Complete - Average Loss: 9.7846


Epoch 28: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=10.2866]


Time Taken for the epoch 176
Epoch 28 Complete - Average Loss: 9.7677
Model saved at epoch 27


Epoch 29: 100%|██████████| 1667/1667 [02:58<00:00,  9.32it/s, loss=9.6662] 


Time Taken for the epoch 178
Epoch 29 Complete - Average Loss: 9.7697


Epoch 30: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=9.0754] 


Time Taken for the epoch 177
Epoch 30 Complete - Average Loss: 9.7642
Model saved at epoch 29


Epoch 31: 100%|██████████| 1667/1667 [02:59<00:00,  9.31it/s, loss=10.9829]


Time Taken for the epoch 179
Epoch 31 Complete - Average Loss: 9.7578
Model saved at epoch 30


Epoch 32: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=9.8492] 


Time Taken for the epoch 177
Epoch 32 Complete - Average Loss: 9.7617


Epoch 33: 100%|██████████| 1667/1667 [02:55<00:00,  9.49it/s, loss=10.1768]


Time Taken for the epoch 175
Epoch 33 Complete - Average Loss: 9.7513
Model saved at epoch 32


Epoch 34: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=8.9346] 


Time Taken for the epoch 177
Epoch 34 Complete - Average Loss: 9.7440
Model saved at epoch 33


Epoch 35: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=10.8880]


Time Taken for the epoch 177
Epoch 35 Complete - Average Loss: 9.7451


Epoch 36: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.8405] 


Time Taken for the epoch 176
Epoch 36 Complete - Average Loss: 9.7438
Model saved at epoch 35


Epoch 37: 100%|██████████| 1667/1667 [02:57<00:00,  9.41it/s, loss=9.4687] 


Time Taken for the epoch 177
Epoch 37 Complete - Average Loss: 9.7400
Model saved at epoch 36


Epoch 38: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=9.4942] 


Time Taken for the epoch 177
Epoch 38 Complete - Average Loss: 9.7384
Model saved at epoch 37


Epoch 39: 100%|██████████| 1667/1667 [02:56<00:00,  9.45it/s, loss=10.4423]


Time Taken for the epoch 176
Epoch 39 Complete - Average Loss: 9.7343
Model saved at epoch 38


Epoch 40: 100%|██████████| 1667/1667 [02:55<00:00,  9.51it/s, loss=10.3031]


Time Taken for the epoch 175
Epoch 40 Complete - Average Loss: 9.7329
Model saved at epoch 39


Epoch 41: 100%|██████████| 1667/1667 [02:55<00:00,  9.49it/s, loss=9.1215] 


Time Taken for the epoch 175
Epoch 41 Complete - Average Loss: 9.7254
Model saved at epoch 40


Epoch 42: 100%|██████████| 1667/1667 [02:55<00:00,  9.51it/s, loss=9.4364] 


Time Taken for the epoch 175
Epoch 42 Complete - Average Loss: 9.7261


Epoch 43: 100%|██████████| 1667/1667 [02:55<00:00,  9.52it/s, loss=10.0196]


Time Taken for the epoch 175
Epoch 43 Complete - Average Loss: 9.7300


Epoch 44: 100%|██████████| 1667/1667 [02:55<00:00,  9.48it/s, loss=10.2131]


Time Taken for the epoch 175
Epoch 44 Complete - Average Loss: 9.7223
Model saved at epoch 43


Epoch 45: 100%|██████████| 1667/1667 [02:55<00:00,  9.50it/s, loss=10.2663]


Time Taken for the epoch 175
Epoch 45 Complete - Average Loss: 9.7235


Epoch 46: 100%|██████████| 1667/1667 [02:55<00:00,  9.48it/s, loss=10.8488]


Time Taken for the epoch 175
Epoch 46 Complete - Average Loss: 9.7245


Epoch 47: 100%|██████████| 1667/1667 [02:56<00:00,  9.47it/s, loss=10.7290]


Time Taken for the epoch 176
Epoch 47 Complete - Average Loss: 9.7241


Epoch 48: 100%|██████████| 1667/1667 [02:55<00:00,  9.49it/s, loss=10.6480]


Time Taken for the epoch 175
Epoch 48 Complete - Average Loss: 9.7153
Model saved at epoch 47


Epoch 49: 100%|██████████| 1667/1667 [02:56<00:00,  9.47it/s, loss=9.6051] 


Time Taken for the epoch 176
Epoch 49 Complete - Average Loss: 9.7155


Epoch 50: 100%|██████████| 1667/1667 [02:56<00:00,  9.46it/s, loss=10.3551]


Time Taken for the epoch 176
Epoch 50 Complete - Average Loss: 9.7136
Model saved at epoch 49


Epoch 51: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=9.8205] 


Time Taken for the epoch 177
Epoch 51 Complete - Average Loss: 9.7068
Model saved at epoch 50


Epoch 52: 100%|██████████| 1667/1667 [02:59<00:00,  9.31it/s, loss=10.5395]


Time Taken for the epoch 179
Epoch 52 Complete - Average Loss: 9.7148


Epoch 53: 100%|██████████| 1667/1667 [02:59<00:00,  9.30it/s, loss=11.0825]


Time Taken for the epoch 179
Epoch 53 Complete - Average Loss: 9.7079


Epoch 54: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.3020] 


Time Taken for the epoch 176
Epoch 54 Complete - Average Loss: 9.7007
Model saved at epoch 53


Epoch 55: 100%|██████████| 1667/1667 [02:56<00:00,  9.44it/s, loss=9.5079] 


Time Taken for the epoch 176
Epoch 55 Complete - Average Loss: 9.7016


Epoch 56: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=8.7766] 


Time Taken for the epoch 176
Epoch 56 Complete - Average Loss: 9.7002
Model saved at epoch 55


Epoch 57: 100%|██████████| 1667/1667 [02:56<00:00,  9.46it/s, loss=10.2187]


Time Taken for the epoch 176
Epoch 57 Complete - Average Loss: 9.6991
Model saved at epoch 56


Epoch 58: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.3279] 


Time Taken for the epoch 176
Epoch 58 Complete - Average Loss: 9.7001


Epoch 59: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=10.5127]


Time Taken for the epoch 176
Epoch 59 Complete - Average Loss: 9.6940
Model saved at epoch 58


Epoch 60: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.9732] 


Time Taken for the epoch 176
Epoch 60 Complete - Average Loss: 9.6933
Model saved at epoch 59


Epoch 61: 100%|██████████| 1667/1667 [02:56<00:00,  9.47it/s, loss=9.5161] 


Time Taken for the epoch 176
Epoch 61 Complete - Average Loss: 9.6915
Model saved at epoch 60


Epoch 62: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=9.1433] 


Time Taken for the epoch 176
Epoch 62 Complete - Average Loss: 9.6911
Model saved at epoch 61


Epoch 63: 100%|██████████| 1667/1667 [02:57<00:00,  9.42it/s, loss=8.8610] 


Time Taken for the epoch 177
Epoch 63 Complete - Average Loss: 9.6903
Model saved at epoch 62


Epoch 64: 100%|██████████| 1667/1667 [02:56<00:00,  9.44it/s, loss=10.8476]


Time Taken for the epoch 176
Epoch 64 Complete - Average Loss: 9.6884
Model saved at epoch 63


Epoch 65: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=9.0008] 


Time Taken for the epoch 177
Epoch 65 Complete - Average Loss: 9.6871
Model saved at epoch 64


Epoch 66: 100%|██████████| 1667/1667 [02:57<00:00,  9.42it/s, loss=9.5333] 


Time Taken for the epoch 177
Epoch 66 Complete - Average Loss: 9.6808
Model saved at epoch 65


Epoch 67: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=10.3694]


Time Taken for the epoch 176
Epoch 67 Complete - Average Loss: 9.6840


Epoch 68: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=10.0634]


Time Taken for the epoch 177
Epoch 68 Complete - Average Loss: 9.6845


Epoch 69: 100%|██████████| 1667/1667 [02:58<00:00,  9.34it/s, loss=9.7839] 


Time Taken for the epoch 178
Epoch 69 Complete - Average Loss: 9.6824


Epoch 70: 100%|██████████| 1667/1667 [03:00<00:00,  9.23it/s, loss=10.6859]


Time Taken for the epoch 180
Epoch 70 Complete - Average Loss: 9.6768
Model saved at epoch 69


Epoch 71: 100%|██████████| 1667/1667 [02:58<00:00,  9.31it/s, loss=9.9756] 


Time Taken for the epoch 178
Epoch 71 Complete - Average Loss: 9.6801


Epoch 72: 100%|██████████| 1667/1667 [02:57<00:00,  9.37it/s, loss=10.8645]


Time Taken for the epoch 177
Epoch 72 Complete - Average Loss: 9.6745
Model saved at epoch 71


Epoch 73: 100%|██████████| 1667/1667 [02:58<00:00,  9.36it/s, loss=8.7249] 


Time Taken for the epoch 178
Epoch 73 Complete - Average Loss: 9.6744
Model saved at epoch 72


Epoch 74: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=9.6614] 


Time Taken for the epoch 177
Epoch 74 Complete - Average Loss: 9.6742
Model saved at epoch 73


Epoch 75: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=9.3674] 


Time Taken for the epoch 177
Epoch 75 Complete - Average Loss: 9.6779


Epoch 76: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.7142] 


Time Taken for the epoch 176
Epoch 76 Complete - Average Loss: 9.6725
Model saved at epoch 75


Epoch 77: 100%|██████████| 1667/1667 [02:58<00:00,  9.33it/s, loss=9.4806] 


Time Taken for the epoch 178
Epoch 77 Complete - Average Loss: 9.6681
Model saved at epoch 76


Epoch 78: 100%|██████████| 1667/1667 [02:58<00:00,  9.35it/s, loss=9.0464] 


Time Taken for the epoch 178
Epoch 78 Complete - Average Loss: 9.6730


Epoch 79: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=8.7664] 


Time Taken for the epoch 177
Epoch 79 Complete - Average Loss: 9.6711


Epoch 80: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=9.2978] 


Time Taken for the epoch 177
Epoch 80 Complete - Average Loss: 9.6668
Model saved at epoch 79


Epoch 81: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.3726] 


Time Taken for the epoch 176
Epoch 81 Complete - Average Loss: 9.6729


Epoch 82: 100%|██████████| 1667/1667 [02:59<00:00,  9.30it/s, loss=10.4611]


Time Taken for the epoch 179
Epoch 82 Complete - Average Loss: 9.6707


Epoch 83: 100%|██████████| 1667/1667 [03:00<00:00,  9.25it/s, loss=10.3548]


Time Taken for the epoch 180
Epoch 83 Complete - Average Loss: 9.6690


Epoch 84: 100%|██████████| 1667/1667 [02:56<00:00,  9.43it/s, loss=10.6710]


Time Taken for the epoch 176
Epoch 84 Complete - Average Loss: 9.6653
Model saved at epoch 83


Epoch 85: 100%|██████████| 1667/1667 [02:57<00:00,  9.40it/s, loss=10.5335]


Time Taken for the epoch 177
Epoch 85 Complete - Average Loss: 9.6657


Epoch 86: 100%|██████████| 1667/1667 [02:56<00:00,  9.42it/s, loss=9.9060] 


Time Taken for the epoch 176
Epoch 86 Complete - Average Loss: 9.6665


Epoch 89: 100%|██████████| 1667/1667 [02:57<00:00,  9.38it/s, loss=10.7882]


Time Taken for the epoch 177
Epoch 89 Complete - Average Loss: 9.6671


Epoch 95: 100%|██████████| 1667/1667 [02:57<00:00,  9.39it/s, loss=9.2717] 


Time Taken for the epoch 177
Epoch 95 Complete - Average Loss: 9.6650


Epoch 99:  96%|█████████▌| 1602/1667 [02:52<00:08,  7.84it/s, loss=9.7880] 