In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
import json
from sklearn.model_selection import train_test_split

# Load dataset
def load_data(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path to a UTF-8 text file holding one JSON document per line.
            A leading byte-order mark, if present, is ignored.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well as UTF-8 with a BOM; with plain
    # 'utf-8' the BOM would stay glued to the first line and break json.loads.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        # Blank or whitespace-only lines (for example an extra newline at the
        # end of the file) are not records, so they are skipped, not parsed.
        return [json.loads(line) for line in file if line.strip()]

# Dataset class
class SimilarityDataset(Dataset):
    """Sentence-pair dataset that tokenizes on access.

    Each record in ``data`` is indexed with the keys ``'sentence1'``,
    ``'sentence2'`` and ``'score'``. Indexing the dataset returns a tuple
    ``(inputs, label)`` where ``inputs`` maps each tokenizer output name to a
    tensor with its leading dimension squeezed away, and ``label`` is the
    record's score as a float tensor.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Store the records, the tokenizer callable and the padded length."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the sentence pair at ``idx`` and return ``(inputs, label)``."""
        record = self.data[idx]
        encoded = self.tokenizer(
            record['sentence1'],
            record['sentence2'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer is asked for 'pt' tensors; squeeze(0) removes the
        # leading size-1 dimension so the DataLoader can stack items itself.
        features = {}
        for name, tensor in encoded.items():
            features[name] = tensor.squeeze(0)
        target = torch.tensor(record['score'], dtype=torch.float)
        return features, target

# Load tokenizer
# The model is created further down, once BertForSentenceSimilarity has been
# defined. A plain BertModel loaded here would be replaced by that later
# assignment before ever being used, costing load time and memory for nothing.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Add regression layer to BERT
class BertForSentenceSimilarity(BertModel):
    """BertModel subclass with a one-output linear layer on the pooled output."""

    def __init__(self, config):
        """Build the base model from ``config`` and attach the regressor."""
        super().__init__(config)
        # Projects a vector of width config.hidden_size down to a single value.
        self.regressor = torch.nn.Linear(
            in_features=config.hidden_size,
            out_features=1,
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the base model and return the regressor applied to its pooler output."""
        encoded = super().forward(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.regressor(encoded.pooler_output)

# Build the regression model from the pretrained checkpoint. The regressor
# weights are not part of that checkpoint, so they start out newly initialized
# (the loader prints a notice about this).
model = BertForSentenceSimilarity.from_pretrained(
    'bert-base-uncased',
)

# Read the training records, hold out 10% of them for validation (fixed seed
# so the split is reproducible), and wrap both splits in batched loaders.
data = load_data(r"C:\Users\sevan\Desktop\IIIT-H\stsbenchmark-sts\train.jsonl\train.jsonl")
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset, val_dataset = (
    SimilarityDataset(split, tokenizer) for split in (train_data, val_data)
)
# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

# Optimizer over every model parameter, learning rate 1e-5.
# NOTE(review): the AdamW exported by transformers is reportedly deprecated in
# favour of torch.optim.AdamW - confirm against the installed version.
optimizer = AdamW(
    params=model.parameters(),
    lr=1e-5,
)

# Training loop: three passes over train_loader, minimizing the mean squared
# error between the model's single output per pair and the target score.
model.train()
# Batches are moved to whichever device the model's parameters live on, so the
# loop works unchanged whether or not the model was placed on an accelerator.
device = next(model.parameters()).device
for epoch in range(3):  # Number of epochs
    running_loss = 0.0
    num_batches = 0
    for inputs, labels in train_loader:
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)
        # The model returns one column per example; flatten it to line up
        # with the 1-D label tensor.
        loss = torch.nn.functional.mse_loss(outputs.view(-1), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than the last batch's loss,
    # which is noisy and is undefined when the loader yields no batches.
    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch+1}, Loss: {mean_loss}')



Some weights of BertForSentenceSimilarity were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.regressor.bias', 'bert.regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 