In [1]:
# !pip install transformers
# !pip install datasets

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import transformers
from transformers import BertConfig, BertPreTrainedModel, BertModel

import json


In [3]:
file_path = "/mnt/data/shared/sambhav/bert_node_degree.json"
device="cpu"
initial_epochs,total_epochs =0,10
max_len=512

In [4]:
class BertDataset(Dataset):
  def __init__(self, json_file,max_len,tokenizer):
    self.max_len = max_len
    self.tokenizer = tokenizer
    with open(json_file, 'r') as f:
      self.data = json.load(f)

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx]
    input,output= sample['input'], sample['output']
    inputs = self.tokenizer.encode_plus(
            input,
            None,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True
        )
    return {
        'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
        'mask':torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0),
        'target':torch.tensor(int(output), dtype=torch.long)
    }

In [5]:

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", return_tensors='pt')
dataset = BertDataset(file_path, max_len, tokenizer)



In [6]:
# dataset[0]

In [7]:
# tokenizer.decode(dataset[0]['ids'])

In [8]:
train_size = int(0.95 * len(dataset))
val_size = int(0.025 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size], generator=torch.Generator().manual_seed(42))

In [9]:
print(f"Train Dataset Length: {len(train_dataset)}")
print(f"Validation Dataset Length: {len(val_dataset)}")
print(f"Test Dataset Length: {len(test_dataset)}")
train_dataloader=DataLoader(dataset=train_dataset, batch_size=32)
val_dataloader=DataLoader(dataset=val_dataset, batch_size=1)
test_dataloader=DataLoader(dataset=test_dataset, batch_size=1)

Train Dataset Length: 1829863
Validation Dataset Length: 48154
Test Dataset Length: 48155


In [10]:
class RegressionLayer(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.linear = nn.Linear(input_dim, output_dim)
        
  def forward(self, x):
    return self.linear(x)    ###########relu


class BERT(nn.Module):
  def __init__(self,max_len,config):
    super(BERT, self).__init__()
    self.config=config
    self.bert_model = BertModel(self.config)
    self.regression_layer = RegressionLayer(max_len*768,1)

  def forward(self, ids, mask):
    batch_size = len(mask)
    token_embeddings, cls_embedding= self.bert_model(ids,attention_mask=mask, return_dict=False)
    token_embeddings=token_embeddings.reshape(batch_size,-1)
    out= self.regression_layer(token_embeddings)
    return out


In [11]:
# bert_model2 = BertForMaskedLM(config)

In [12]:
# bert_model2.forward

In [13]:
config = BertConfig(
    vocab_size=tokenizer.vocab_size, 
    max_position_embeddings=1024,
    hidden_size=768,
    num_attention_heads=4,
    num_hidden_layers=4,
    type_vocab_size=1
)
model=BERT(max_len,config)

In [14]:
model.to(device)

BERT(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [15]:
# model

In [16]:
assert config==model.config

In [17]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 53564929 || all params: 53564929 || trainable%: 100.0


In [18]:
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr =2e-4)

In [47]:

def val(model,val_dataloader):
    model.eval()
    with torch.no_grad():
        loop = tqdm(val_dataloader,leave=False, total=len(val_dataloader))
        total=0
        correct=0
        total_loss=0
        for batch in loop:
            assert batch['ids'].shape[0]==1
            ids=batch['ids'].to(device)
            mask=batch['mask'].to(device)
            label=batch['target'].float().unsqueeze(1).to(device)
            output=model(ids,mask)
            loss = loss_fn(output, label)
            total_loss += loss
            total+=batch['ids'].shape[0]
            correct += output[(torch.round(output)==label)].shape[0]
        print(f"  Val accuracy: {correct/total *100}%, Val loss(avg): {total_loss/total}")
        

In [49]:
for epoch in range(initial_epochs,total_epochs):
    loop = tqdm(train_dataloader,leave=False, total=len(train_dataloader))
    total=0
    total_loss=0
    correct=0
    for batch in loop:
        model.train()
        ids=batch['ids'].to(device)
        mask=batch['mask'].to(device)
        label=batch['target'].float().unsqueeze(1).to(device)
        optimizer.zero_grad()
        output=model(ids,mask)
        loss = loss_fn(output, label)
        total_loss+=loss
        loss.backward()
        optimizer.step()
        # print("cheking   len is  ", batch['ids'].shape[0])
        total+=batch['ids'].shape[0]
        correct += output[(torch.round(output)==label)].shape[0]
    print(f"Epoch: {epoch}:- \n  Training accuracy: {correct/total *100}%, Training loss(avg): {total_loss/total }")
    val(model,val_dataloader)
    model_save_path = f"/data/sambhav/LLM4Graph/experiments/BERT/model_epoch_{epoch}.pth"
    torch.save(model.state_dict(), model_save_path)

    
    


                                         

cheking   len is   32
Epoch: 0:- 
  Training accuracy: 0.0%, Training loss(avg): 3.569014072418213


                                         

 Val accuracy: 0.0%, Val loss(avg): 1.7376817464828491


                                         

cheking   len is   32
Epoch: 1:- 
  Training accuracy: 12.5%, Training loss(avg): 0.5692562460899353


                                         

 Val accuracy: 0.0%, Val loss(avg): 48.343143463134766


                                         

KeyboardInterrupt: 

32