# Training STS Benchmark
This is a re-implementation of https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py

### Importing Libraries

In [1]:
!pip install tqdm



In [2]:
import csv
import gzip
import logging
import math
import os
import shutil
from datetime import datetime

import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

2022-11-29 17:26:18.355372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-29 17:26:18.466415: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-29 17:26:18.488492: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-29 17:26:18.894646: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

### Loading the data

In [3]:
sts_dataset_path = "stsbenchmark.tsv.gz"
sts_dataset_url = "https://sbert.net/datasets/stsbenchmark.tsv.gz"

# Download the dataset only if it is not already present locally.
if not os.path.exists(sts_dataset_path):
    # Stream into a temporary file first, so an interrupted download never leaves
    # a truncated file at sts_dataset_path that later runs would treat as complete.
    partial_path = sts_dataset_path + ".part"

    with requests.get(sts_dataset_url, stream=True) as response:
        # Fail fast on HTTP errors instead of saving an error page as the dataset.
        response.raise_for_status()

        # Content-Length is optional; with total=None tqdm shows an open-ended bar.
        content_length = response.headers.get("Content-Length")
        total_length = int(content_length) if content_length is not None else None

        # Wrap the raw stream's read() so each copied chunk advances the progress bar.
        with tqdm.wrapattr(response.raw, "read", total=total_length, desc="") as raw:
            with open(partial_path, "wb") as output:
                shutil.copyfileobj(raw, output)

    # Publish the finished download under the exact path that the existence check
    # above (and the loading code) uses.
    os.replace(partial_path, sts_dataset_path)

### Set Model Params

In [5]:
# Base checkpoint and training hyper-parameters for this run.
model_name = "bert-base-uncased"
train_batch_size = 16
num_epochs = 4

# Output directory name: any '/' in the model name is replaced by '-', and the
# current timestamp is appended so every run gets its own path.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f"output/training_stsbenchmark_{model_name.replace('/', '-')}-{run_timestamp}"

# Display the resolved path as the cell output.
model_save_path

'output/training_stsbenchmark_bert-base-uncased-2022-11-29_17-26-35'

### Create Model Modules

In [6]:
# Transformer module loaded from the checkpoint named by model_name.
word_embedding_model = models.Transformer(model_name)

# Pooling module sized to the transformer's word-embedding dimension.
# Only the mean-tokens mode is enabled; CLS-token and max-token pooling are off.
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dimension,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain the two modules (transformer first, then pooling) into one model.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Creating the DataLoader

In [7]:
# Create the dataloader
train_data = []
dev_data = []
test_data = []

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as file:
    df = pd.read_csv("/home/farzad/Downloads/stsbenchmark.tsv", delimiter='\t', quoting=csv.QUOTE_NONE)
    for _, row in df.iterrows():
        # We should normalize the scores b/w 0-1 (by default the values are b/w 0-5)
        score = row["score"] / 5.0
        datapoint = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score)
        
        if row["split"] == "train":
            train_data.append(datapoint)
        elif row["split"] == "dev":
            dev_data.append(datapoint)
        else:
            test_data.append(datapoint)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model)

evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_data, name="sts-dev")

### Configuring the Training Process

In [8]:
# Reserve 10% of all training steps (batches per epoch x number of epochs) for
# warmup, rounded up to a whole step.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

### Train the Model

In [9]:
# Single training objective: the training dataloader paired with its loss.
training_objectives = [(train_dataloader, train_loss)]

# Train for num_epochs epochs, passing the dev evaluator (evaluation_steps=1000),
# the computed warmup step count and model_save_path as the output path.
model.fit(
    train_objectives=training_objectives,
    evaluator=evaluator,
    evaluation_steps=1000,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [10]:
# To evaluate a previously saved model without re-training, load it first:
# model = SentenceTransformer(model_save_path)

# Build an evaluator from the held-out test examples.
test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    test_data,
    name='sts-test',
)

# Run it on the model; the returned value is shown as the cell output.
test_evaluator(model, output_path=model_save_path)

0.8413889311406069

## References
[1] https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py

[2] [How To: Progress Bars for Python Downloads](https://www.alpharithms.com/progress-bars-for-python-downloads-580122/)