In [1]:
!pip uninstall -y numpy datasets

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: datasets 2.14.5
Uninstalling datasets-2.14.5:
  Successfully uninstalled datasets-2.14.5


In [2]:
!pip install --force-reinstall numpy==1.26.4 datasets==2.14.5

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata (61 kB)
Collecting datasets==2.14.5
  Using cached datasets-2.14.5-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=8.0.0 (from datasets==2.14.5)
  Using cached pyarrow-20.0.0-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.5)
  Using cached dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting pandas (from datasets==2.14.5)
  Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting requests>=2.19.0 (from datasets==2.14.5)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from datasets==2.14.5)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets==2.14.5)
  Using cached xxhash-3.5.0-cp39-cp39-win_amd64.whl.metadata (13 kB)
Collecting multiprocess (from datasets==2.14.5)
  Using cached multiprocess-0.70.18-py39-none-any.whl.metadata (7.5 kB)
Collect

In [None]:
# Semantic Textual Similarity with BERT (STS-B)
# Runs in a Python 3.9 environment
# Import libraries and setup
from transformers import AutoTokenizer, BertModel, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import os
# Disable wandb logging by setting the WANDB_DISABLED environment variable
# for this process (done here, before any Trainer is created further down).
os.environ["WANDB_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the tokenizer for the BERT checkpoint and the STS-B configuration of GLUE
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset(path="glue", name="stsb")

In [5]:
# CELL 3: Tokenization function
def tokenize_function(example):
    """Encode the ``sentence1``/``sentence2`` fields of ``example`` as a pair.

    Uses the notebook-level ``tokenizer``; every encoding is padded to, and
    truncated at, 128 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, **encode_options)

# Tokenize in batches, drop the raw text/index columns, and rename the target
# column to "labels".
tokenized_dataset = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Expose only these columns, as torch tensors, when the dataset is indexed.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=tensor_columns)

In [6]:
# Define custom regression model class
class STSModel(nn.Module):
    """BERT encoder with a sigmoid regression head for similarity scoring.

    The encoder's pooled output goes through dropout, a single linear unit and
    a sigmoid, and is then multiplied by ``max_score`` so every prediction lies
    in ``[0, max_score]``.

    Args:
        model_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        dropout: Dropout probability applied before the linear layer.
        max_score: Upper bound of the predicted score range.
    """

    def __init__(self, model_name="bert-base-uncased", dropout=0.1, max_score=5.0):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.max_score = max_score
        # Size the head from the encoder's config instead of hard-coding 768,
        # so checkpoints with a different hidden width work unchanged.
        hidden_size = self.bert.config.hidden_size
        # Layer order (dropout, linear, sigmoid) is kept so the state_dict
        # keys of the head stay the same.
        self.regressor = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Predict similarity scores for a batch of encoded sentence pairs.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            labels: Accepted for interface compatibility but not used here;
                no loss is computed inside the model.
            token_type_ids: Optional segment ids forwarded to the encoder.
                When omitted (the default) the encoder falls back to its own
                default segment ids, which matches the previous behaviour.

        Returns:
            A tensor of scores in ``[0, max_score]`` whose last dimension has
            size 1 (the output of the single-unit linear layer).
        """
        encoder_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            encoder_kwargs["token_type_ids"] = token_type_ids
        outputs = self.bert(**encoder_kwargs)
        cls_output = outputs.pooler_output
        # Sigmoid yields values in [0, 1]; rescale to the score range.
        logits = self.regressor(cls_output) * self.max_score
        return logits

# Build the model instance; the constructor loads the pretrained
# "bert-base-uncased" encoder via BertModel.from_pretrained.
model = STSModel()

In [7]:
#  Define metric function using Pearson correlation
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and gold scores.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of NumPy arrays holding one
            value per example in any shape (e.g. ``(n, 1)`` and ``(n,)``).
            ``predictions`` may also be a tuple, in which case its first
            element is used.

    Returns:
        dict: ``{"pearson": r}`` where ``r`` is a built-in ``float``.
    """
    predictions, labels = eval_pred
    # Multi-output models hand predictions over as a tuple; the scores are
    # expected to be the first entry.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Flatten explicitly: unlike squeeze(), reshape(-1) always yields a 1-D
    # array, even for a single example.
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    # Convert the NumPy scalar to a plain float so the metric can always be
    # logged and serialised.
    return {"pearson": float(pearsonr(predictions, labels)[0])}

In [8]:
# Environment check: print the installed accelerate version.
import accelerate
print(accelerate.__version__)


1.7.0


In [9]:
# CELL 6: Collect the training hyperparameters, then build TrainingArguments
# NOTE(review): eval_steps is set but no evaluation strategy is passed here;
# confirm whether periodic evaluation during training was intended.
training_config = dict(
    # Output locations and reporting
    output_dir="./sts_results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging / checkpoint / evaluation cadence (in optimisation steps)
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    do_eval=True,
)
training_args = TrainingArguments(**training_config)

In [10]:
class STSTrainer(Trainer):
    """Trainer that optimises mean-squared error between predicted and gold scores."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE loss for one batch.

        Args:
            model: Model called with every entry of ``inputs`` except
                ``"labels"``; expected to return a tensor of predictions.
            inputs: Batch mapping that contains a ``"labels"`` entry. It is
                not modified.
            return_outputs: When true, also return the model predictions.
            **kwargs: Extra keyword arguments passed by the Trainer; accepted
                and ignored.

        Returns:
            The scalar loss, or ``(loss, {"logits": predictions})`` when
            ``return_outputs`` is true.
        """
        labels = inputs["labels"]
        # Build a label-free copy instead of popping from the caller's batch.
        features = {key: value for key, value in inputs.items() if key != "labels"}
        predictions = model(**features)
        # Cast labels to the prediction dtype so the loss also works when the
        # two tensors differ in floating-point precision.
        targets = labels.view(-1).to(predictions.dtype)
        loss = nn.MSELoss()(predictions.view(-1), targets)
        if not return_outputs:
            return loss
        # Trainer.prediction_step treats a non-dict `outputs` as a tuple whose
        # first item is the loss and slices it with `outputs[1:]`; a bare
        # tensor would silently lose its first row per batch. Returning a dict
        # keeps every prediction.
        return loss, {"logits": predictions}

In [11]:
# CELL 7: Instantiate the regression trainer on the train/validation splits
trainer = STSTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)


In [None]:
# CELL 9: Train model
# Calls train() on the `trainer` object created in the previous cell.
trainer.train()

Step,Training Loss
10,2.0575
20,2.059
30,1.6804
40,1.5922
50,1.376
60,1.2425
70,1.1266
80,1.0835
90,0.8105
100,0.9469


In [36]:
# CELL 10: Persist the tokenizer files and the model weights in one directory
model_path = "./bert_sts_model"
weights_file = f"{model_path}/model.pt"
os.makedirs(model_path, exist_ok=True)
tokenizer.save_pretrained(model_path)
# Only the state_dict is written, not the module object itself.
torch.save(model.state_dict(), weights_file)