### Importing The Required Libraries

In [22]:
import torchtext
torchtext.disable_torchtext_deprecation_warning()
import torch
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import Transformer
from transformers import BertTokenizer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torchtext.vocab import Vocab,build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
import random
from itertools import chain
import pandas as pd
from copy import deepcopy
import csv
import json
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
import warnings
# Replace the module-level function so later ``warnings.warn(...)`` calls are silently dropped.
# NOTE(review): this is process-wide and also hides deprecation notices from libraries.
warnings.warn = warn
# Additionally install a blanket "ignore" filter in the warnings machinery.
warnings.filterwarnings('ignore')
import numpy as np

### Loading The CSV Dataset

In [2]:
class BERTCSVDataset(Dataset):
    """Map-style dataset over a CSV file of pre-processed BERT pre-training samples.

    Every row must provide the columns ``BERT Input`` and ``BERT Label``
    (JSON-encoded sequences), ``Segment Label`` (comma-separated integers),
    ``Is Next`` (integer flag) and ``Original Text`` (raw string).
    """

    def __init__(self, filename):
        """Read the CSV at ``filename`` and load the ``bert-base-uncased`` tokenizer."""
        self.data = pd.read_csv(filename)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at row ``idx``.

        Returns:
            A 7-tuple ``(bert_input, bert_label, segment_label, is_next,
            input_ids, attention_mask, original_text)``. The first four and
            the two tokenizer outputs are ``torch.long`` tensors;
            ``input_ids`` / ``attention_mask`` are padded or truncated to 512
            entries. ``None`` is returned when ``BERT Input`` or ``BERT Label``
            is not valid JSON, so consumers (e.g. a collate function) must be
            prepared to skip ``None`` items.
        """
        row = self.data.iloc[idx]
        try:
            bert_input = torch.tensor(json.loads(row['BERT Input']), dtype=torch.long)
            bert_label = torch.tensor(json.loads(row['BERT Label']), dtype=torch.long)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for row {idx}: {e}")
            print("BERT Input:", row['BERT Input'])
            print("BERT Label:", row['BERT Label'])
            # Best-effort: signal the unusable row instead of aborting the whole epoch.
            return None

        segment_label = torch.tensor(
            [int(x) for x in row['Segment Label'].split(',')], dtype=torch.long
        )
        is_next = torch.tensor(row['Is Next'], dtype=torch.long)
        original_text = row['Original Text']

        # Tokenize the original text with the BERT tokenizer. Calling the
        # tokenizer directly replaces the deprecated ``encode_plus`` method.
        encoded_input = self.tokenizer(
            original_text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension added by return_tensors="pt".
        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)

        return (bert_input, bert_label, segment_label, is_next, input_ids, attention_mask, original_text)

In [3]:
PAD_IDX = 0


def collate_batch(batch):
    """Collate dataset samples into padded, sequence-first ``torch.long`` tensors.

    Args:
        batch: iterable of samples. Each sample is a tuple whose first four
            entries are ``(bert_input, bert_label, segment_label, is_next)``;
            any further entries (tokenizer outputs, raw text) are ignored.
            ``None`` samples (rows the dataset could not decode) are skipped.

    Returns:
        ``(bert_inputs, bert_labels, segment_labels, is_nexts)`` where the
        first three have shape ``(max_seq_len, n_samples)`` padded with
        ``PAD_IDX`` and ``is_nexts`` has shape ``(n_samples,)``.

    Raises:
        ValueError: if the batch contains no usable sample.
    """
    samples = [sample for sample in batch if sample is not None]
    if not samples:
        raise ValueError("collate_batch received no valid samples (every item in the batch was None)")

    bert_inputs_batch, bert_labels_batch, segment_labels_batch, is_nexts_batch = [], [], [], []
    for bert_input, bert_label, segment_label, is_next, *_ in samples:
        # as_tensor reuses an existing tensor instead of copy-constructing it
        # (torch.tensor(tensor) emits a warning) and still accepts plain lists.
        bert_inputs_batch.append(torch.as_tensor(bert_input, dtype=torch.long))
        bert_labels_batch.append(torch.as_tensor(bert_label, dtype=torch.long))
        segment_labels_batch.append(torch.as_tensor(segment_label, dtype=torch.long))
        is_nexts_batch.append(int(is_next))

    # Pad the sequences in the batch; batch_first=False yields (seq_len, batch).
    bert_inputs_final = pad_sequence(bert_inputs_batch, padding_value=PAD_IDX, batch_first=False)
    bert_labels_final = pad_sequence(bert_labels_batch, padding_value=PAD_IDX, batch_first=False)
    segment_labels_final = pad_sequence(segment_labels_batch, padding_value=PAD_IDX, batch_first=False)
    is_nexts_batch = torch.tensor(is_nexts_batch, dtype=torch.long)

    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_batch

In [4]:
BATCH_SIZE = 2

train_dataset_path = './bert_dataset/bert_train_data.csv'
test_dataset_path = './bert_dataset/bert_test_data.csv'

train_dataset = BERTCSVDataset(train_dataset_path)
test_dataset = BERTCSVDataset(test_dataset_path)

# Shuffle only the training split; the test split is iterated in file order so
# that evaluation runs are reproducible from one execution to the next.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

### Model Creation

In [5]:
EMBEDDING_DIM = 10


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output vectors are scaled by ``sqrt(embed_dim)``."""

    def __init__(self, vocab_size, embed_dim):
        """Create a ``vocab_size`` x ``embed_dim`` embedding table."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dim = embed_dim

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and return the scaled embeddings."""
        token_ids = tokens.long()
        looked_up = self.embedding(token_ids)
        scale = math.sqrt(self.embed_dim)
        return looked_up * scale

In [6]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to sequence-first embeddings.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is stored as the non-trainable buffer
    ``pos_embedding`` of shape ``(maxlen, 1, emb_size)``.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        """Pre-compute the encoding table.

        Args:
            emb_size: embedding width (even or odd).
            dropout: dropout probability applied to the summed embeddings.
            maxlen: number of positions to pre-compute.
        """
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd column than even columns,
        # so trim the frequency axis; for an even emb_size this slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles[:, : emb_size // 2])
        # Insert a singleton batch axis so the table broadcasts over the batch.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        """Return ``dropout(token_embedding + positions)``.

        ``token_embedding`` is indexed along dim 0 as the sequence axis; the
        first ``token_embedding.size(0)`` rows of the table are added.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [19]:
class BERTEmbedding(nn.Module):
    """BERT input embedding: token + positional + segment embeddings with dropout."""

    def __init__(self,vocab_size,embed_dim,dropout=0.1,train=True):
        """Build the three embedding components.

        Args:
            vocab_size: number of rows in the token embedding table.
            embed_dim: embedding width shared by all components.
            dropout: dropout probability.
            train: accepted for backward compatibility and ignored. It must
                not be stored as ``self.train`` because that would shadow
                ``nn.Module.train()``; use ``.train()`` / ``.eval()`` on the
                module to control dropout instead.
        """
        super(BERTEmbedding,self).__init__()
        self.token_embedding=TokenEmbedding(vocab_size,embed_dim)
        self.positional_encoding=PositionalEncoding(embed_dim,dropout)
        self.segment_embedding = nn.Embedding(3, embed_dim)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self,bert_inputs,segement_labels=False):
        """Embed a batch of token ids.

        Args:
            bert_inputs: integer token ids.
            segement_labels: integer segment ids with the same shape as
                ``bert_inputs`` (values index a 3-row table). When left at the
                default ``False`` (or ``None``) no segment embedding is added.

        Returns:
            The embedded batch, with one extra trailing dimension of size
            ``embed_dim``.
        """
        token_embeddings=self.token_embedding(bert_inputs)
        # positional_encoding already returns token embeddings + positions,
        # so the token embeddings must not be added a second time.
        x = self.positional_encoding(token_embeddings)
        if segement_labels is not None and segement_labels is not False:
            x = x + self.segment_embedding(segement_labels)
        # nn.Dropout is a no-op in eval mode, so no explicit train/eval branch is needed.
        return self.dropout(x)

In [8]:
VOCAB_SIZE = 147161
batch = 2
count = 0
# Walk the training loader and stop on the fifth batch; the unpacked tensors of
# the last batch visited stay in the notebook namespace for the cells below.
for count, batch in enumerate(train_dataloader, start=1):
    bert_input, bert_label, segement_label, is_next = batch
    if count == 5:
        break


In [9]:
bert_input.shape

torch.Size([46, 2])

In [10]:
bert_label.shape

torch.Size([46, 2])

In [11]:
segement_label.shape

torch.Size([46, 2])

In [12]:
is_next.shape

torch.Size([2])

In [13]:
is_next

tensor([1, 1])

In [14]:
bert_input

tensor([[    1,     1],
        [    8,    16],
        [    3,     3],
        [   13,     9],
        [    3,    96],
        [93745,   541],
        [   10,     3],
        [   35,    12],
        [  127, 38782],
        [  125,     3],
        [ 2643,    19],
        [  134,   131],
        [   39,    18],
        [    2,    20],
        [    0,     9],
        [   98,   359],
        [   17,     7],
        [  287,   683],
        [   12, 10579],
        [    3,    43],
        [   66,   348],
        [ 2382,     6],
        [   22,     2],
        [    3,    14],
        [ 1756,    12],
        [ 1390,    19],
        [    3,    47],
        [   81,   619],
        [    6,    11],
        [    2,    77],
        [    0,    18],
        [    0,   348],
        [    0, 10675],
        [    0,   562],
        [    0,   221],
        [    0,     3],
        [    0,    23],
        [    0,   153],
        [    0,   208],
        [    0,   143],
        [    0,   502],
        [    0, 

In [15]:
segement_label

tensor([[1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 1],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 2],
        [0, 0],
        [0, 0],
        [0, 0]])

In [16]:
# Instantiate the TokenEmbedding layer with the notebook's vocabulary size and embedding width
token_embedding = TokenEmbedding(VOCAB_SIZE, embed_dim=EMBEDDING_DIM )

# Get the token embeddings for the sample batch captured from the training dataloader
t_embeddings = token_embedding(bert_input)
# Each token id is mapped to a vector of size EMBEDDING_DIM
print(f"Dimensions of token embeddings: {t_embeddings.size()}") # Expected: (sequence_length, batch_size, EMBEDDING_DIM)
# Inspect the embedded vectors of the first 3 tokens of the first sample in the batch:
# t_embeddings[i, 0, :] is the vector of the i-th token of sample b=0
for i in range(3):
    print(f"Token Embeddings for the {i}th token of the first sample: {t_embeddings[i,0,:]}")

Dimensions of token embeddings: torch.Size([46, 2, 10])
Token Embeddings for the 0th token of the first sample: tensor([-5.2643,  2.1061,  0.6387,  3.3791, -2.7117,  4.6467,  0.3394,  3.1973,
        -4.3253, -0.3849], grad_fn=<SliceBackward0>)
Token Embeddings for the 1th token of the first sample: tensor([-1.6590,  1.3288,  6.5083, -6.4037, -6.3066, -1.2353,  0.4244, -0.3644,
         2.0359, -0.9853], grad_fn=<SliceBackward0>)
Token Embeddings for the 2th token of the first sample: tensor([-0.5932,  1.2142, -3.1347, -1.2464,  0.6761, -3.4110,  1.2599, -1.0130,
         2.8166,  3.5121], grad_fn=<SliceBackward0>)


### BERT Model Architecture

In [20]:
class BERT(nn.Module):
    """Transformer encoder with next-sentence and masked-token prediction heads."""

    def __init__(self,vocab_size,embed_dim,num_layers,n_head,dropout):
        """Build the embedding, encoder stack and the two prediction heads.

        Args:
            vocab_size: vocabulary size (embedding rows and masked-LM output width).
            embed_dim: model width; must be divisible by ``n_head``.
            num_layers: number of ``nn.TransformerEncoderLayer`` blocks.
            n_head: attention heads per layer.
            dropout: dropout probability for the embedding and encoder layers.
        """
        super().__init__()
        # Record the hyper-parameters from the constructor arguments rather
        # than from same-named notebook globals.
        self.d_model = embed_dim
        self.n_layers = num_layers
        self.heads = n_head
        self.embedding=BERTEmbedding(vocab_size,embed_dim,dropout)
        encoder_layer=nn.TransformerEncoderLayer(d_model=embed_dim,nhead=n_head,dropout=dropout)
        self.encoder=nn.TransformerEncoder(encoder_layer,num_layers=num_layers)
        #Next Sentence Prediction
        self.NextSentencePrediction=nn.Linear(embed_dim,2)
        #Masked Word Prediction
        self.MaskedPrediction=nn.Linear(embed_dim,vocab_size)

    def forward(self,bert_input,segement_label):
        """Run the encoder and both heads.

        Args:
            bert_input: token ids, sequence-first ``(seq_len, batch)``.
            segement_label: segment ids with the same shape as ``bert_input``.

        Returns:
            ``(next_sentence, masked_language)``: logits of shape
            ``(batch, 2)`` computed from the first sequence position, and
            logits of shape ``(seq_len, batch, vocab_size)``.
        """
        # src_key_padding_mask is batch-first: True marks PAD positions to ignore.
        padding_mask=(bert_input==PAD_IDX).transpose(0,1)
        x=self.embedding(bert_input,segement_label)
        values_after_encoding=self.encoder(x,src_key_padding_mask=padding_mask)
        # The first position of every sequence feeds the sentence-level head.
        next_sentence=self.NextSentencePrediction(values_after_encoding[0,:])
        masked_language=self.MaskedPrediction(values_after_encoding)
        return next_sentence,masked_language

In [21]:
EMBEDDING_DIM = 10
vocab_size = 147161
d_model = EMBEDDING_DIM
n_layers = 2
# Upper bound on the number of attention heads.
initial_heads = 2
# Multi-head attention needs d_model to be divisible by the head count, so use
# the largest head count <= initial_heads that divides d_model (1 always does).
heads = next(h for h in range(initial_heads, 0, -1) if d_model % h == 0)
dropout = 0.1
model = BERT(vocab_size, d_model, n_layers, heads, dropout)