### Importing The Required Libraries

In [1]:
!pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0+cu121 \
  --index-url https://download.pytorch.org/whl/cu121
!pip install torchtext==0.18.0 --no-deps
  

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.3.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp312-cp312-linux_x86_64.whl (780.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.9/780.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision==0.18.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.18.0%2Bcu121-cp312-cp312-linux_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting torchaudio==2.3.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.3.0%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0+cu121)
  Downloading

In [2]:
import torchtext
torchtext.disable_torchtext_deprecation_warning()
import torch
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import Transformer
from transformers import BertTokenizer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torchtext.vocab import Vocab,build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
import random
from itertools import chain
import pandas as pd
from copy import deepcopy
import csv
import json
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')
import numpy as np
import torch.optim as optim
import time 

### Loading The CSV Dataset

In [27]:
class BERTCSVDataset(Dataset):
    def __init__(self,filename):
        self.data=pd.read_csv(filename)
        self.tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    def __len__(self):
        return len(self.data)
    def __getitem__(self,idx):
        row = self.data.iloc[idx]
        try:
            
            bert_input = torch.tensor(json.loads(row['BERT Input']), dtype=torch.long)
            bert_label = torch.tensor(json.loads(row['BERT Label']), dtype=torch.long)
            segment_label = torch.tensor([int(x) for x in row['Segment Label'].split(',')], dtype=torch.long)
            is_next = torch.tensor(row['Is Next'], dtype=torch.long)
            original_text = row['Original Text']  # If you want to use it
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for row {idx}: {e}")
            print("BERT Input:", row['BERT Input'])
            print("BERT Label:", row['BERT Label'])
            return None  

            # Tokenizing the original text with BERT
        encoded_input = self.tokenizer.encode_plus(
            original_text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        input_ids = encoded_input['input_ids'].squeeze()
        attention_mask = encoded_input['attention_mask'].squeeze()

        return(bert_input, bert_label, segment_label, is_next, input_ids, attention_mask, original_text)


In [51]:
PAD_IDX = 0
def collate_batch(batch):
    bert_inputs_batch, bert_labels_batch, segment_labels_batch, is_nexts_batch,input_ids_batch,attention_mask_batch,original_text_battch = [], [], [], [],[],[],[]
    for bert_input, bert_label, segment_label, is_next,input_ids,attention_mask,original_text in batch:
        bert_inputs_batch.append(torch.tensor(bert_input[:512], dtype=torch.long))
        bert_labels_batch.append(torch.tensor(bert_label[:512], dtype=torch.long))
        segment_labels_batch.append(torch.tensor(segment_label[:512], dtype=torch.long))
        is_nexts_batch.append(is_next)
        input_ids_batch.append(input_ids)
        attention_mask_batch.append(attention_mask)
        original_text_battch.append(original_text)
    bert_inputs_final = pad_sequence(bert_inputs_batch, padding_value=PAD_IDX, batch_first=True)
    bert_labels_final = pad_sequence(bert_labels_batch, padding_value=PAD_IDX, batch_first=True)
    segment_labels_final = pad_sequence(segment_labels_batch, padding_value=PAD_IDX, batch_first=True)
    is_nexts_batch = torch.tensor(is_nexts_batch, dtype=torch.long)
    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_batch

In [52]:
BATCH_SIZE = 2

train_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_train_data.csv'
test_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_test_data.csv'

train_dataset = BERTCSVDataset(train_dataset_path)
test_dataset = BERTCSVDataset(test_dataset_path)
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_batch)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_batch)

### Model Creation

In [53]:
EMBEDDING_DIM=10
class TokenEmbedding(nn.Module):
    def __init__(self,vocab_size,embed_dim):
        super(TokenEmbedding,self).__init__()
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.embed_dim=embed_dim
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.embed_dim)

In [54]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 512):
        super().__init__()

        position = torch.arange(0, maxlen).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, emb_size, 2) * (-math.log(10000.0) / emb_size)
        )

        pe = torch.zeros(maxlen, emb_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)

        self.register_buffer("pos_embedding", pe)
        self.dropout = nn.Dropout(dropout)

    def forward(self, token_embedding: torch.Tensor):
        """
        token_embedding: [batch_size, seq_len, emb_size]
        """
        seq_len = token_embedding.size(1)
        token_embedding = token_embedding + self.pos_embedding[:, :seq_len, :]
        return self.dropout(token_embedding)


In [55]:
class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, dropout=0.1):
        super().__init__()

        self.token_embedding = TokenEmbedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, dropout)
        self.segment_embedding = nn.Embedding(3, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, bert_inputs, segment_labels):
        """
        bert_inputs:   [batch_size, seq_len]
        segment_labels:[batch_size, seq_len]
        """

        token_embeddings = self.token_embedding(bert_inputs)
        position_embeddings = self.positional_encoding(token_embeddings)
        segment_embeddings = self.segment_embedding(segment_labels)

        x = token_embeddings + position_embeddings + segment_embeddings
        x = self.dropout(x)

        return x


In [56]:
VOCAB_SIZE=147161
batch=2
count=0
for batch in train_dataloader:
    bert_input,bert_label,segement_label,is_next=[b for b in batch]
    
    count+=1
    if count==5:
        break


In [57]:
bert_input.shape

torch.Size([2, 82])

In [58]:
segement_label.shape

torch.Size([2, 82])

In [59]:
bert_label.shape

torch.Size([2, 82])

In [60]:
is_next.shape

torch.Size([2])

In [61]:
is_next

tensor([1, 1])

In [62]:
bert_input

tensor([[    1,     3,    20,   538,    44,   580,    11,  9535,   142,     3,
             5,  1005,    10,  7645,     3,     8,    49,    36,     3,   264,
           292,    18,    36,  4079,    15,   399, 39504,     3,   210,   112,
             7,    25,    17,     3,    70,   122,    36,   940,     6,     2,
             0,  1043,     3,    22,    47,   119,   164,   264,    70,    32,
            66,     3,     5,     3,     3,    63,    42, 12506,     5,    24,
             7,    14,    13, 17904,    18,     3,   212,    42,   107,     3,
             9,   131,    31,   264,    23,     3,  2052,   699,  1352,   197,
             6,     2],
        [    1,  2580,    12,    19,   617,    21,     3, 30250,    10,  1770,
            22,     3,   440,     6,     2,    28,   632,   603,   164,     3,
            32,     3,     3,    10,    44,  1222,   242,     6,     2,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0

In [63]:
segement_label

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [64]:
# Instantiate the TokenEmbedding 
token_embedding = TokenEmbedding(VOCAB_SIZE, embed_dim=EMBEDDING_DIM )

# Get the token embeddings for a sample input
t_embeddings = token_embedding(bert_input)
#Each token is transformed into a tensor of size emb_size
print(f"Dimensions of token embeddings: {t_embeddings.size()}") # Expected: (sequence_length, batch_size, EMBEDDING_DIM)
#Check the embedded vectors for first 3 tokens of the first sample in the batch
# you get embeddings[i,0,:] where i refers to the i'th token of the first sample in the batch (b=0)
for i in range(2):
    print(f"Token Embeddings for the {i}th token of the first sample: {t_embeddings[i,0,:]}")

Dimensions of token embeddings: torch.Size([2, 82, 10])
Token Embeddings for the 0th token of the first sample: tensor([-3.3162, -2.3625, -4.0100, -1.6696,  2.6844, -2.4756, -1.2127, -0.9825,
         0.1128,  0.8203], grad_fn=<SliceBackward0>)
Token Embeddings for the 1th token of the first sample: tensor([-3.3162, -2.3625, -4.0100, -1.6696,  2.6844, -2.4756, -1.2127, -0.9825,
         0.1128,  0.8203], grad_fn=<SliceBackward0>)


### Bert Model Architecture

In [65]:
class BERT(nn.Module):
    def __init__(self,vocab_size,embed_dim,num_layers,n_head,dropout):
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads
        self.embedding=BERTEmbedding(vocab_size,embed_dim)
        encoder_layer=nn.TransformerEncoderLayer(d_model=embed_dim,nhead=n_head,dropout=dropout,batch_first=True)
        self.encoder=nn.TransformerEncoder(encoder_layer,num_layers=num_layers)
        self.NextSentencePrediction=nn.Linear(embed_dim,2)
        self.MaskedPrediction=nn.Linear(embed_dim,vocab_size)
    def forward(self,bert_input,segement_label):
        padding_mask=(bert_input==PAD_IDX)
        x=self.embedding(bert_input,segement_label)
        values_after_encoding=self.encoder(x,src_key_padding_mask=padding_mask)
        next_sentence=self.NextSentencePrediction(values_after_encoding[:,0,:])
        masked_language=self.MaskedPrediction(values_after_encoding)
        return next_sentence,masked_language 

In [66]:
EMBEDDING_DIM = 10
vocab_size = 147161
d_model = EMBEDDING_DIM  
n_layers = 2
initial_heads = 12
initial_heads = 2
heads = initial_heads - d_model % initial_heads
dropout = 0.1  
model = BERT(vocab_size, d_model, n_layers, heads, dropout)

In [67]:
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using 2 GPUs!


DataParallel(
  (module): BERT(
    (embedding): BERTEmbedding(
      (token_embedding): TokenEmbedding(
        (embedding): Embedding(147161, 10)
      )
      (positional_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (segment_embedding): Embedding(3, 10)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=10, out_features=10, bias=True)
          )
          (linear1): Linear(in_features=10, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=10, bias=True)
          (norm1): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1,

### Evaluation

In [68]:
PAD_IDX=0
loss_ns=nn.CrossEntropyLoss()
loss_mlm=nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [69]:
def evaluate(dataloader, model, loss_ns, loss_mlm):
    model.eval()
    total_loss = 0
    
    for batch in dataloader:
        bert_input, bert_label, segment_label, is_next = [b.to(device) for b in batch]
        
        next_sentence_prediction, masked_language = model(bert_input, segment_label)
        
        l_ns = loss_ns(next_sentence_prediction, is_next.view(-1))
        l_mlm = loss_mlm(masked_language.reshape(-1, masked_language.size(-1)), bert_label.reshape(-1))
        
        loss = l_ns + l_mlm
        total_loss += loss.item()
        
    return total_loss / len(dataloader)

### Training

In [72]:
BATCH_SIZE = 2

train_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_train_data_sampled.csv'
test_dataset_path = '/kaggle/input/bert-dataset/bert_dataset/bert_test_data_sampled.csv'

train_dataset = BERTCSVDataset(train_dataset_path)
test_dataset = BERTCSVDataset(test_dataset_path)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [73]:
loss=evaluate(train_dataloader,model,loss_ns,loss_mlm)
loss

13.080436336517334

In [74]:
import time
from tqdm import tqdm
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup

from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
num_epochs=25
optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=0.01)
total_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps * 0.1), num_training_steps=total_steps)
check_loss=10
for epoch in range(num_epochs):
    model.train()
    start_time=time.time()
    total_loss = 0.0
    

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")
    
    for batch in progress_bar:

        bert_inputs, bert_labels, segment_labels, is_nexts = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        
        # Forward pass
        next_pred, mask_pred = model(bert_inputs, segment_labels)
        
        # Calculate individual losses
        loss_n = loss_ns(next_pred, is_nexts)
        loss_m = loss_mlm(mask_pred.view(-1, mask_pred.size(-1)), bert_labels.view(-1))
        
        # Combine losses
        loss = loss_n + loss_m
        
        if torch.isnan(loss):
            print("Warning: NaN loss detected!")
            continue
        
        # Backward pass
        loss.backward()
        
        # Stability: Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        # Optional: Update progress bar with current loss
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    end_time=time.time()
    avg_epoch_loss = total_loss / len(train_dataloader)
    print("Time Taken for the epoch",(int)(end_time-start_time))
    print(f"Epoch {epoch+1} Complete - Average Loss: {avg_epoch_loss:.4f}")
    if(avg_epoch_loss<check_loss):
        torch.save(model.state_dict(), "bert.pt")
        print("Model saved at epoch",epoch)
        check_loss=avg_epoch_loss

Epoch 1: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=12.3792]


Time Taken for the epoch 190
Epoch 1 Complete - Average Loss: 12.6317


Epoch 2:  25%|██▌       | 1261/5000 [00:47<02:20, 26.52it/s, loss=12.5705]



Epoch 2: 100%|██████████| 5000/5000 [03:10<00:00, 26.19it/s, loss=12.0744]


Time Taken for the epoch 190
Epoch 2 Complete - Average Loss: 12.1916


Epoch 3: 100%|██████████| 5000/5000 [03:11<00:00, 26.13it/s, loss=11.8983]


Time Taken for the epoch 191
Epoch 3 Complete - Average Loss: 11.3157


Epoch 4: 100%|██████████| 5000/5000 [03:11<00:00, 26.17it/s, loss=9.6708] 


Time Taken for the epoch 191
Epoch 4 Complete - Average Loss: 10.4902


Epoch 5:  70%|██████▉   | 3492/5000 [02:13<00:58, 25.56it/s, loss=9.8337] 



Epoch 5: 100%|██████████| 5000/5000 [03:11<00:00, 26.16it/s, loss=9.2737] 


Time Taken for the epoch 191
Epoch 5 Complete - Average Loss: 10.2227


Epoch 6: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=9.6485] 


Time Taken for the epoch 190
Epoch 6 Complete - Average Loss: 10.1184


Epoch 7: 100%|██████████| 5000/5000 [03:10<00:00, 26.18it/s, loss=10.8710]


Time Taken for the epoch 190
Epoch 7 Complete - Average Loss: 10.0550


Epoch 8: 100%|██████████| 5000/5000 [03:10<00:00, 26.24it/s, loss=10.0097]


Time Taken for the epoch 190
Epoch 8 Complete - Average Loss: 10.0247


Epoch 9:  38%|███▊      | 1879/5000 [01:12<01:51, 27.94it/s, loss=9.2977] 



Epoch 9:  65%|██████▌   | 3267/5000 [02:05<01:03, 27.39it/s, loss=10.8853]



Epoch 9: 100%|██████████| 5000/5000 [03:11<00:00, 26.15it/s, loss=10.6056]


Time Taken for the epoch 191
Epoch 9 Complete - Average Loss: 9.9998
Model saved at epoch 8


Epoch 10: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=8.9049] 


Time Taken for the epoch 190
Epoch 10 Complete - Average Loss: 9.9812
Model saved at epoch 9


Epoch 11: 100%|██████████| 5000/5000 [03:11<00:00, 26.12it/s, loss=7.4476] 


Time Taken for the epoch 191
Epoch 11 Complete - Average Loss: 9.9665
Model saved at epoch 10


Epoch 12:  43%|████▎     | 2126/5000 [01:21<01:45, 27.34it/s, loss=10.2382]



Epoch 12: 100%|██████████| 5000/5000 [03:11<00:00, 26.15it/s, loss=9.6645] 


Time Taken for the epoch 191
Epoch 12 Complete - Average Loss: 9.9560
Model saved at epoch 11


Epoch 13: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=9.9357] 


Time Taken for the epoch 190
Epoch 13 Complete - Average Loss: 9.9360
Model saved at epoch 12


Epoch 14:  80%|███████▉  | 3978/5000 [02:31<00:37, 27.10it/s, loss=8.0623] 



Epoch 14: 100%|██████████| 5000/5000 [03:10<00:00, 26.23it/s, loss=8.9258] 


Time Taken for the epoch 190
Epoch 14 Complete - Average Loss: 9.9240
Model saved at epoch 13


Epoch 15: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=10.4354]


Time Taken for the epoch 190
Epoch 15 Complete - Average Loss: 9.9211
Model saved at epoch 14


Epoch 16:  62%|██████▏   | 3113/5000 [01:58<01:07, 27.87it/s, loss=9.2893] 



Epoch 16: 100%|██████████| 5000/5000 [03:10<00:00, 26.19it/s, loss=8.6344] 


Time Taken for the epoch 190
Epoch 16 Complete - Average Loss: 9.9080
Model saved at epoch 15


Epoch 17: 100%|██████████| 5000/5000 [03:11<00:00, 26.15it/s, loss=8.7018] 


Time Taken for the epoch 191
Epoch 17 Complete - Average Loss: 9.9116
Model saved at epoch 16


Epoch 18:   2%|▏         | 83/5000 [00:03<03:05, 26.51it/s, loss=11.3664]



Epoch 18: 100%|██████████| 5000/5000 [03:10<00:00, 26.20it/s, loss=9.8348] 


Time Taken for the epoch 190
Epoch 18 Complete - Average Loss: 9.9041
Model saved at epoch 17


Epoch 19: 100%|██████████| 5000/5000 [03:10<00:00, 26.18it/s, loss=9.6436] 


Time Taken for the epoch 190
Epoch 19 Complete - Average Loss: 9.8999
Model saved at epoch 18


Epoch 20: 100%|██████████| 5000/5000 [03:10<00:00, 26.19it/s, loss=9.1760] 


Time Taken for the epoch 190
Epoch 20 Complete - Average Loss: 9.8984
Model saved at epoch 19


Epoch 21: 100%|██████████| 5000/5000 [03:10<00:00, 26.23it/s, loss=9.5356] 


Time Taken for the epoch 190
Epoch 21 Complete - Average Loss: 9.9020
Model saved at epoch 20


Epoch 22:  72%|███████▏  | 3598/5000 [02:17<00:53, 26.24it/s, loss=7.7761] 


KeyboardInterrupt: 

In [None]:
loss=evaluate(test_dataloader,model,loss_ns,loss_mlm)
loss

In [None]:
test_dataset_actual= '/kaggle/input/bert-dataset/bert_dataset/bert_test_data.csv'
test_dataloader = DataLoader(test_dataset_actual, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
loss=evaluate(test_dataloader,model,loss_ns,loss_mlm)
loss