# fine-tuning a DistilBERT model on the STS-B (Semantic Textual Similarity Benchmark) dataset, which is a regression task to predict sentence similarity scores.

You need to run in terminal:

```sh
conda env create -f env.yml
```
Then, you need to activate text_classification environment

In [1]:
import datasets
import pandas as pd
import torch
import numpy as np
from datasets import load_dataset
from transformers import DistilBertConfig, DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from scipy.stats import pearsonr, spearmanr
from transformers.integrations import TensorBoardCallback

  from .autonotebook import tqdm as notebook_tqdm


## 1- Load DistilBERT model and Tokenizer

In [2]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=1) # num_labels=1 for regression 
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config= config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 2- load datasets

In [3]:
train_data = load_dataset('glue', 'stsb', split="train")
val_data = load_dataset('glue', 'stsb', split="validation")
test_data = load_dataset('glue', 'stsb', split="test")

print(f"Train samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")
pd.DataFrame(train_data).head()

Train samples: 5749
Validation samples: 1500
Test samples: 1379


Unnamed: 0,sentence1,sentence2,label,idx
0,A plane is taking off.,An air plane is taking off.,5.0,0
1,A man is playing a large flute.,A man is playing a flute.,3.8,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8,2
3,Three men are playing chess.,Two men are playing chess.,2.6,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4


## 3- Tokenize the datasets

In [4]:
# Function to preprocess data - tokenize and prepare labels
def preprocess_function(examples):
    # Tokenize the sentence pairs
    tokenized = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Convert labels to proper format (float) for regression
    tokenized["labels"] = examples["label"]
    return tokenized

# Using map() applies the tokenizer function row-by-row to the dataset
# Better memory efficiency than using  tokenizer(train_data['sentence1'], train_data['sentence2'], truncation=True, padding=True)
# More efficient for larger datasets 
# Avoids list processing issues

# Apply preprocessing with better batching
train_encodings = train_data.map(
    preprocess_function,
    batched=True,
    batch_size=1000,
    remove_columns=['idx', 'sentence1', 'sentence2', 'label']
)

val_encodings = val_data.map(
    preprocess_function,
    batched=True,
    batch_size=1000,
    remove_columns=['idx', 'sentence1', 'sentence2', 'label']
)

test_encodings = test_data.map(
    preprocess_function,
    batched=True,
    batch_size=1000,
    remove_columns=['idx', 'sentence1', 'sentence2', 'label']
)


# show some samples
pd.DataFrame(train_encodings).head()

Map: 100%|██████████| 1379/1379 [00:00<00:00, 22140.01 examples/s]


Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 1037, 4946, 2003, 2635, 2125, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5.0
1,"[101, 1037, 2158, 2003, 2652, 1037, 2312, 8928...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3.8
2,"[101, 1037, 2158, 2003, 9359, 14021, 5596, 209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3.8
3,"[101, 2093, 2273, 2024, 2652, 7433, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2.6
4,"[101, 1037, 2158, 2003, 2652, 1996, 10145, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.25


## 4- Fine-tuning the model

In [5]:
training_args = TrainingArguments(
    output_dir='./results/distilbert',
    num_train_epochs=50, # train for 50 epochs
    per_device_eval_batch_size=32,  # Increased eval batch size for efficiency
    warmup_ratio=0.1,  # Ratio instead of steps for flexibility
    weight_decay=0.01, # Regularization
    logging_dir='./logs/distilbert',
    logging_strategy='steps',
    logging_steps=100,  # Less frequent logging
    save_strategy='steps', #
    save_steps=100, # Save model checkpoint every 100 steps
    evaluation_strategy='steps',
    eval_steps=100,  # Less frequent evaluation
    fp16=torch.cuda.is_available(),  # Use mixed precision when available
    load_best_model_at_end=True,
    metric_for_best_model='Pearson_corr',  # Specifically optimize for Pearson correlation
    greater_is_better=True,  # Higher correlation is better
    save_total_limit=1,  # Only keep one model checkpoint
    report_to=["tensorboard"],
    gradient_accumulation_steps=2,  # Effective batch size of 32
)



In [6]:
def compute_metrics(pred):
    preds = np.squeeze(pred.predictions)
    labels = pred.label_ids
    
    mse = ((preds - labels) ** 2).mean().item()
    rmse = np.sqrt(mse)
    mae = np.abs(preds - labels).mean().item()
    
    # Correlation metrics
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    
    # R2 score
    r2 = 1 - mse / np.var(labels)
    
    return {
        'MAE': mae,
        'RMSE': rmse,
        'MSE': mse,
        'Pearson_corr': pearson_corr,
        'Spearman_corr': spearman_corr,
        'R2': r2
    }

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[
        TensorBoardCallback(),
    ]
)
results = trainer.train()
print("Training completed!") 


  trainer = Trainer(
You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback


Step,Training Loss,Validation Loss,Mae,Rmse,Mse,Pearson Corr,Spearman Corr,R2
100,8.4562,4.389685,1.742131,2.095157,4.389685,0.10143,0.104101,-0.951009
200,2.7177,1.776982,1.123375,1.333035,1.776982,0.550702,0.539864,0.210215
300,1.1547,0.703642,0.675999,0.838834,0.703642,0.832668,0.833299,0.687264
400,0.812,0.821159,0.719822,0.906179,0.821159,0.830902,0.834628,0.635033
500,0.7072,0.64358,0.636997,0.802234,0.64358,0.846773,0.845962,0.713959
600,0.693,0.594413,0.608878,0.770982,0.594413,0.861019,0.859522,0.735811
700,0.6887,0.69656,0.648483,0.834601,0.69656,0.848136,0.849553,0.690412
800,0.5124,0.604975,0.594026,0.777801,0.604975,0.859417,0.858101,0.731117
900,0.5137,0.59349,0.595081,0.770383,0.59349,0.860839,0.858054,0.736221
1000,0.4727,0.592499,0.605572,0.76974,0.592499,0.859781,0.857073,0.736662


Training completed!


## 5- Evaluate the fine-tuned model

In [8]:
# Evaluate the model on all datasets
datasets_to_evaluate = {
    "train": train_encodings,
    "validation": val_encodings,
    "test": test_encodings
}

eval_results = {}
for name, dataset in datasets_to_evaluate.items():
    metrics = trainer.evaluate(eval_dataset=dataset)
    eval_results[name] = metrics
    print(f"Evaluation on {name} set:")
    print(metrics)

# Create a summary dataframe
metrics_df = pd.DataFrame(eval_results).T
print("\nSummary of evaluation metrics:")
print(metrics_df)

Evaluation on train set:
{'eval_loss': 0.04725569114089012, 'eval_MAE': 0.16986903548240662, 'eval_RMSE': 0.21738374166641378, 'eval_MSE': 0.04725569114089012, 'eval_Pearson_corr': 0.9930166006088257, 'eval_Spearman_corr': 0.9930803685686271, 'eval_R2': 0.9779599905014038, 'eval_runtime': 3.9144, 'eval_samples_per_second': 1468.682, 'eval_steps_per_second': 45.984, 'epoch': 49.86230876216968}
Evaluation on validation set:
{'eval_loss': 0.5404831767082214, 'eval_MAE': 0.5579813122749329, 'eval_RMSE': 0.7351756094350665, 'eval_MSE': 0.5404831767082214, 'eval_Pearson_corr': 0.8769494295120239, 'eval_Spearman_corr': 0.8745449215468816, 'eval_R2': 0.7597805857658386, 'eval_runtime': 1.0451, 'eval_samples_per_second': 1435.309, 'eval_steps_per_second': 44.973, 'epoch': 49.86230876216968}
Evaluation on test set:
{'eval_loss': 13.125692367553711, 'eval_MAE': 3.4147353172302246, 'eval_RMSE': 3.6229396094717607, 'eval_MSE': 13.125691413879395, 'eval_Pearson_corr': nan, 'eval_Spearman_corr': nan,

  pearson_corr = pearsonr(preds, labels)[0]
  spearman_corr = spearmanr(preds, labels)[0]
  r2 = 1 - mse / np.var(labels)


## 6- Save the fine-tuned model

In [9]:
# Save the model
model_save_path = "./saved_models/distilbert-finetuned"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to ./saved_models/distilbert-finetuned


## 7- Example of how to use the model for prediction

In [10]:
def predict_similarity(sentence1, sentence2):
    inputs = tokenizer(
        sentence1, 
        sentence2, 
        return_tensors="pt", 
        padding=True, 
        truncation=True
    )
    
    # Move to GPU if available
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model.cuda()
    
    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the similarity score (scale 0-5)
    similarity_score = outputs.logits.item()
    return similarity_score

# Example usage
example1_1 = "The cat sat on the mat."
example1_2 = "The feline rested on the rug."
similarity = predict_similarity(example1_1, example1_2)
print(f"\nPredicted similarity between example sentences: {similarity:.2f} (scale 0-5)")


example2_1 = "The men are playing soccer."
example2_2 = "The match result is 2 Vs. 3."
similarity = predict_similarity(example2_1, example2_2)
print(f"\nPredicted similarity between example sentences: {similarity:.2f} (scale 0-5)")

example2_1 = "Hi How are you?"
example2_2 = "I am fine."
similarity = predict_similarity(example2_1, example2_2)
print(f"\nPredicted similarity between example sentences: {similarity:.2f} (scale 0-5)")



Predicted similarity between example sentences: 0.07 (scale 0-5)

Predicted similarity between example sentences: 0.55 (scale 0-5)

Predicted similarity between example sentences: 1.04 (scale 0-5)
