In [None]:
# from transformers import AutoModelForMaskedLM, AutoTokenizer
# import torch

In [None]:
# from transformers import GPT2Config
# model_name = "sberbank-ai/mGPT"
# model_config = GPT2Config.from_pretrained(model_name)
# model = AutoModelForMaskedLM.from_pretrained(model_name, config=model_config)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# dataset = ["Your", "unlabeled", "text", "data", "..."]
# inputs = tokenizer(dataset, padding=True, truncation=True, return_tensors="pt")
# labels = inputs.input_ids.detach().clone()

# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# num_epochs = 3
# batch_size = 8
# num_batches = len(dataset) // batch_size
# dataloader = torch.utils.data.DataLoader(
#     list(zip(inputs.input_ids, labels)), batch_size=batch_size, shuffle=True
# )
# model.train()
# for epoch in range(num_epochs):
#     for batch in dataloader:
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

# # Save your model
# # model.save_pretrained("./your_model_directory")


In [None]:
# import numpy as np
# import pandas as pd

# df = pd.read_csv("/home/ubuntu/Project_Files/Finetune/Data/sentences.csv")
# # select fist 500 rows
# df = df.iloc[:5000]
# # save to csv
# df.to_csv("/home/ubuntu/Project_Files/Finetune/Data/sentences_5000.csv", index=False)

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.nn import DataParallel
from tqdm.auto import tqdm


class CSVDataset(Dataset):
    def __init__(self, filename, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(filename)
        self.texts = self.data['Sentence'].tolist()  # Ensure the column name matches your CSV
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        return {'input_ids': inputs['input_ids'][0], 'attention_mask': inputs['attention_mask'][0]}  
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask} 

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
print(model)

# dataset = CSVDataset("/home/ubuntu/Project_Files/Finetune/Data/sentences.csv", tokenizer) # full
dataset = CSVDataset("/home/ubuntu/Project_Files/Finetune/Data/sentences_5000.csv", tokenizer) # only 500
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
print("Data loaded")

# Utilize multiple GPUs
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = DataParallel(model)
    model.to(device)
else:
    device = torch.device("cpu")

dataloader = list(dataloader)  
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
num_epochs = 3
print_every_n_batches = 100  
print("Starting training")


for epoch in range(num_epochs):
    loop = tqdm(dataloader, leave=True)
    total_loss = 0.0
    num_batches = len(dataloader)
    
    for i, batch in enumerate(loop):
        batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
        labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
        loss = outputs.loss
        
        # Compute the mean loss for the entire batch and accumulate
        total_loss += loss.mean().item()
        
        loss.mean().backward()  # Compute the mean loss gradient
        optimizer.step()
        optimizer.zero_grad()
        loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
        loop.set_postfix(loss=total_loss / (i + 1))  # Compute and display the average loss
        
        if (i + 1) % print_every_n_batches == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{num_batches}], Loss: {total_loss / (i + 1):.4f}")


model.module.save_pretrained("/home/ubuntu/Project_Files/Finetune/Data/trained_model.pth")  


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

Epoch 1/3:  64%|██████▎   | 100/157 [01:05<00:35,  1.59it/s, loss=0.988]

Epoch [1/3], Step [100/157], Loss: 0.9881


Epoch 1/3: 100%|██████████| 157/157 [01:41<00:00,  1.55it/s, loss=0.635]
Epoch 2/3:  64%|██████▎   | 100/157 [01:03<00:35,  1.60it/s, loss=0.00818]

Epoch [2/3], Step [100/157], Loss: 0.0082


Epoch 2/3: 100%|██████████| 157/157 [01:38<00:00,  1.59it/s, loss=0.00706]
Epoch 3/3:  64%|██████▎   | 100/157 [01:03<00:36,  1.58it/s, loss=0.00356]

Epoch [3/3], Step [100/157], Loss: 0.0036


Epoch 3/3: 100%|██████████| 157/157 [01:38<00:00,  1.59it/s, loss=0.00322]


In [None]:

# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     for i, batch in enumerate(loop):
#         inputs = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**inputs)
#         # print(outputs)
#         loss = outputs.loss
#         print(loss)
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=loss.item())

#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")

# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     for i, batch in enumerate(loop):
#         batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
#         labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
#         outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=loss.item())

#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")


# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     total_loss = 0.0
#     num_batches = len(dataloader)
    
#     for i, batch in enumerate(loop):
#         batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
#         labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
#         outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
#         loss = outputs.loss
        
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=total_loss / (i + 1))  # Compute and display the average loss
        
#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{num_batches}], Loss: {total_loss / (i + 1):.4f}")


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tqdm.auto import tqdm
import pandas as pd

# Define a Dataset class for testing on the same sentences data
class TestCSVDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        inputs = self.tokenizer(sentence, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        return {'input_ids': inputs['input_ids'][0], 'attention_mask': inputs['attention_mask'][0]}

# Load the trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the device for testing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained model and move it to the device
model = AutoModelForMaskedLM.from_pretrained("/home/ubuntu/Project_Files/Finetune/Data/trained_model.pth").to(device)

# # Prepare the test sentences
# test_sentences = [
#     "This is a test sentence.",
#     "Another example sentence.",
#     "BERT is a powerful model.",
# ]

csv_file = "/home/ubuntu/Project_Files/Finetune/Data/sentences.csv"
df = pd.read_csv(csv_file)
test_sentences = df["Sentence"][:10].tolist()


# Create a DataLoader for testing
test_dataset = TestCSVDataset(test_sentences, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=1)  # Batch size 1 for one sentence at a time

# Set the model to evaluation mode
model.eval()

# Test the model on the test sentences
for i, batch in enumerate(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Get predicted tokens
    predicted_token_ids = torch.argmax(outputs.logits, dim=2)
    
    # Convert token IDs to tokens
    predicted_tokens = tokenizer.decode(predicted_token_ids[0].tolist(), skip_special_tokens=True)
    
    print(f"Test Sentence {i+1}:")
    print("Input Sentence:", test_sentences[i])
    print("Predicted Sentence:", predicted_tokens)
    print()


Test Sentence 1:
Input Sentence: In the PHYHIP, which is a type of gene/protein, there is a noted ppi of the gene/protein KIF15.
Predicted Sentence: in the phyhip, which is a type of gene / protein, there is a noted ppi of the gene / protein kif15.

Test Sentence 2:
Input Sentence: In the GPANK1, which is a type of gene/protein, there is a noted ppi of the gene/protein PNMA1.
Predicted Sentence: in the gpank1, which is a type of gene / protein, there is a noted ppi of the gene / protein pnma1.

Test Sentence 3:
Input Sentence: In the ZRSR2, which is a type of gene/protein, there is a noted ppi of the gene/protein TTC33.
Predicted Sentence: in the zrsr2, which is a type of gene / protein, there is a noted ppi of the gene / protein ttc33.

Test Sentence 4:
Input Sentence: In the NRF1, which is a type of gene/protein, there is a noted ppi of the gene/protein MAN1B1.
Predicted Sentence: in the nrf1, which is a type of gene / protein, there is a noted ppi of the gene / protein man1b1.

Test