In [None]:
!pip install -U datasets transformers sentence_transformers

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch
from google.colab import drive
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

## Comparison of our model, the original BERT, and an existing sentence-similarity model

<table>
  <tr>
    <th> </th>
    <th>Our model after fine-tuning</th>
    <th>BERT model before fine-tuning</th>
    <th>An existing sentence-similarity model from Hugging Face</th>
  </tr>
  <tr>
    <td>MSE</td>
    <td>0.4616</td>    
    <td>8.3817</td>
    <td>8.3652</td>
  </tr>
  <tr>
    <td>R^2</td>
    <td>0.7948</td>    
    <td>-2.7252</td>
    <td>-2.7179</td>
  </tr>
    <tr>
    <td>Pearson Correlation</td>
    <td>0.90</td>    
    <td>-0.01</td>
    <td>-</td>
  </tr>
</table>

## BERT model performance before fine-tuning

In [None]:
# Baseline: the pretrained BERT checkpoint with no task-specific fine-tuning.
model_id = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_id)
# BertModel is the bare encoder (no task head); the evaluation cell below
# compares sentences through its pooler_output vectors.
model = BertModel.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def tokenize_fun(example):
  """Tokenize a batch of sentence pairs and rescale its labels.

  Used with `dataset.map(..., batched=True)`, so each field of `example`
  holds a list of values. Every label is divided by 5.0 (presumably mapping
  the 0-5 similarity score onto 0-1 -- confirm against the training
  notebook), and `sentence1`/`sentence2` are encoded together as pairs.
  """
  rescaled_labels = []
  for score in example['label']:
    rescaled_labels.append(score / 5.0)
  example['label'] = rescaled_labels
  return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# GLUE STS-B: sentence pairs annotated with a similarity score.
dataset = load_dataset("glue", "stsb")
tokenized_datasets = dataset.map(tokenize_fun, batched=True)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [None]:
# Baseline evaluation: score every validation pair by the cosine similarity of
# the two sentences' pooler_output vectors from the un-fine-tuned BERT encoder.
model.eval()

# Predicted similarities, multiplied by 5 so they can be compared with the labels.
scaled_similarities_before = []

# Inference only: without no_grad() every forward pass records an autograd graph
# that is never used, which costs time and memory for no benefit.
with torch.no_grad():
    for example in tqdm(dataset['validation']):
        # The two sentences are encoded as a padded batch of two separate inputs.
        tokens = tokenizer(
            [example['sentence1'], example['sentence2']],
            return_tensors='pt', padding=True, truncation=True,
        )
        output = model(**tokens)
        # Slicing (rather than indexing) keeps the leading batch dimension.
        similarity = torch.nn.functional.cosine_similarity(
            output.pooler_output[0:1], output.pooler_output[1:2]
        )
        scaled_similarities_before.append(similarity.item() * 5)

# Compare the scaled similarities with the gold labels.
labels = dataset['validation']['label']
# Mean squared error
mse = mean_squared_error(labels, scaled_similarities_before)
print(f"Mean Squared Error: {mse}")
# Coefficient of determination
r2 = r2_score(labels, scaled_similarities_before)
print(f"Coefficient of Determination (R^2): {r2}")
# Pearson correlation
pearson_corr, _ = pearsonr(labels, scaled_similarities_before)
print("Pearson Correlation:", pearson_corr)

100%|██████████| 1500/1500 [04:16<00:00,  5.85it/s]

Mean Squared Error: 8.381712652997892
Coefficient of Determination (R^2): -2.725278141315342
Pearson Correlation: -0.015633827759218517





## An existing sentence-similarity model from Hugging Face

In [None]:
# Reference point: an off-the-shelf sentence-embedding model, scored on the same
# validation pairs with the same metrics as the BERT baseline.
model_exist = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

model_exist.eval()

# Predicted similarities, multiplied by 5 so they can be compared with the labels.
scaled_similarities_exist = []

for example in tqdm(dataset['validation']):
    # Encode both sentences in one call; row 0 is sentence1, row 1 is sentence2.
    embeddings = model_exist.encode(
        [example['sentence1'], example['sentence2']], convert_to_tensor=True
    )
    # cosine_similarity divides by the vector norms itself, so a separate
    # L2-normalisation step beforehand is redundant.
    cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0:1], embeddings[1:2])

    # BUG FIX: this used to scale `similarity`, a stale tensor left over from the
    # previous evaluation cell, so every prediction was the same constant. That is
    # why the recorded run shows "Pearson Correlation: nan" and an MSE almost equal
    # to the BERT baseline; those numbers (and the summary table) must be
    # regenerated by re-running this cell.
    scaled_similarities_exist.append(cosine_sim.item() * 5)

# Compare the scaled similarities with the gold labels.
labels = dataset['validation']['label']
# Mean squared error
mse = mean_squared_error(labels, scaled_similarities_exist)
print(f"Mean Squared Error: {mse}")
# Coefficient of determination
r2 = r2_score(labels, scaled_similarities_exist)
print(f"Coefficient of Determination (R^2): {r2}")
# Pearson correlation
pearson_corr, _ = pearsonr(labels, scaled_similarities_exist)
print("Pearson Correlation:", pearson_corr)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 1500/1500 [00:25<00:00, 59.71it/s]

Mean Squared Error: 8.3652804976575
Coefficient of Determination (R^2): -2.7179748189946498
Pearson Correlation: nan





### Our model: evaluation of the model fine-tuned with Trainer

In [None]:
# Load the fine-tuned sequence-classification checkpoint from Google Drive.
model_id = "bert-base-uncased"
model_name = f"{model_id}-finetuned"
drive.mount('/content/drive')
# Named `finetuned_dir` rather than `dir` so the Python builtin dir() is not shadowed.
# The path is relative, so it assumes the working directory is /content.
finetuned_dir = f'./drive/MyDrive/ml_class_group_project/Lili/final/{model_name}'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BertForSequenceClassification adds the task head whose logits are read as the
# similarity score in the cells below.
model_trainer = BertForSequenceClassification.from_pretrained(finetuned_dir).to(device)
# The tokenizer comes from the base checkpoint id, not from the fine-tuned directory.
tokenizer_trainer = BertTokenizer.from_pretrained(model_id)

Mounted at /content/drive


In [None]:
# Smoke test of the fine-tuned model on two hand-picked sentence pairs.
# test_sentences[0] holds the first sentence of each pair and test_sentences[1] the
# second, so the pairs are read column-wise:
#   pair 0: "A man is finding something."  /  "A woman is slicing something."
#   pair 1: "A plane is taking off."       /  "An air plane is taking off."

test_sentences = [["A man is finding something.",   "A plane is taking off."],
                  ["A woman is slicing something.", "An air plane is taking off."]]
tokens_test = tokenizer_trainer(test_sentences[0], test_sentences[1], return_tensors='pt', padding=True, truncation=True)
output_test = model_trainer(**tokens_test.to(device))
# The classification head's logits are used directly as the similarity score,
# one row per pair (the cosine-similarity route used for the bare encoder does
# not apply to this model).
similarity_test = output_test.logits
similarity_test

tensor([[0.1140],
        [0.9848]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
# Evaluate the fine-tuned model on the validation split with the same metrics
# as the baselines.
model_trainer.eval()

# Predicted similarities, multiplied by 5 so they can be compared with the labels.
scaled_similarities = []

# Inference only: without no_grad() every forward pass records an autograd graph
# that is never used, which costs time and GPU memory for no benefit.
with torch.no_grad():
    for example in tqdm(dataset['validation']):
        # Both sentences are encoded together as a single sentence-pair input.
        tokens = tokenizer_trainer(
            example['sentence1'], example['sentence2'],
            return_tensors='pt', padding=True, truncation=True,
        )
        output = model_trainer(**tokens.to(device))
        # The single logit is the predicted similarity (presumably on the label/5
        # scale used during fine-tuning -- confirm against the training notebook).
        similarity = output.logits
        scaled_similarities.append(similarity.item() * 5)

# Compare the scaled similarities with the gold labels.
labels = dataset['validation']['label']
# Mean squared error
mse = mean_squared_error(labels, scaled_similarities)
print(f"Mean Squared Error: {mse}")
# Coefficient of determination
r2 = r2_score(labels, scaled_similarities)
print(f"Coefficient of Determination (R^2): {r2}")
# Pearson correlation
pearson_corr, _ = pearsonr(labels, scaled_similarities)
print("Pearson Correlation:", pearson_corr)

100%|██████████| 1500/1500 [00:31<00:00, 47.52it/s]

Mean Squared Error: 0.4616096184762641
Coefficient of Determination (R^2): 0.7948361757646891
Pearson Correlation: 0.8998347571683616





### Good example

In [None]:
# Sanity check on a near-paraphrase pair using the baseline (not fine-tuned)
# `model` and `tokenizer`.
# label = 5/5
# NOTE(review): the token ids recorded in the next cell's output look like they
# come from a cased vocabulary rather than bert-base-uncased, which suggests this
# cell was last run after `tokenizer` had been rebound -- confirm by re-running
# the notebook top to bottom on a fresh kernel.
test_sentences_good = ["A plane is taking off.", "An air plane is taking off."]
tokens_good = tokenizer(test_sentences_good, return_tensors='pt', padding=True, truncation=True)
output_good = model(**tokens_good)

In [None]:
# Inspect the encoded batch: input_ids, token_type_ids and attention_mask.
tokens_good

{'input_ids': tensor([[ 101,  138, 4261, 1110, 1781, 1228,  119,  102,    0],
        [ 101, 1760, 1586, 4261, 1110, 1781, 1228,  119,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Shapes of the per-token hidden states and of the pooled per-sentence vectors.
output_good.last_hidden_state.shape, output_good.pooler_output.shape

(torch.Size([2, 9, 768]), torch.Size([2, 768]))

In [None]:
# unsqueeze(0) restores the leading batch dimension dropped by indexing row 0.
output_good.pooler_output[0].unsqueeze(0).shape

torch.Size([1, 768])

In [None]:
# Cosine similarity between the two pooled sentence vectors, multiplied by 5
# in the same way as in the evaluation loops.
pooled_good = output_good.pooler_output
similarity_good = torch.nn.functional.cosine_similarity(pooled_good[0:1], pooled_good[1:2])
similarity_good.item()*5

4.99158650636673

### Bad example

In [None]:
# Counter-example for the baseline encoder: a pair whose gold label is low.
# label = 0.5/5
test_sentences_bad = ["A man is smoking.", "A man is skating."]
tokens_bad = tokenizer(test_sentences_bad, return_tensors='pt', padding=True, truncation=True)
output_bad = model(**tokens_bad)
# Compare the two pooled sentence vectors; slicing keeps the batch dimension.
pooled_bad = output_bad.pooler_output
similarity_bad = torch.nn.functional.cosine_similarity(pooled_bad[0:1], pooled_bad[1:2])
similarity_bad

tensor([0.9904], grad_fn=<SumBackward1>)

## Old fine-tuned model

In [None]:
# Load the earlier fine-tuned checkpoint (epoch 5) from Google Drive.
# Note that this rebinds `model_id` to the cased base model for the cells below.
model_id = "bert-base-cased"
model_name = f"{model_id}-finetuned"
drive.mount('/content/drive')
# Named `old_finetuned_dir` rather than `dir` so the Python builtin dir() is not
# shadowed. The path is relative, so it assumes the working directory is /content.
old_finetuned_dir = f'./drive/MyDrive/ml_class_group_project/Lili/{model_name}/epoch_5'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loaded as the bare encoder, so the cells below compare pooler_output vectors.
model_afterFinetuning = BertModel.from_pretrained(old_finetuned_dir).to(device)
# The tokenizer comes from the base checkpoint id, not from the fine-tuned directory.
tokenizer_after = BertTokenizer.from_pretrained(model_id)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Smoke test of the old fine-tuned encoder on a single sentence pair.
# Alternative pair: ["A girl is styling her hair.", "A girl is brushing her hair."]
test_sentences =["A man is smoking.", "A man is skating."]
# The two sentences are encoded as a padded batch of two separate inputs.
tokens_test = tokenizer_after(test_sentences, return_tensors='pt', padding=True, truncation=True)
output_test = model_afterFinetuning(**tokens_test.to(device))
# Displays the full model output (hidden states and pooled vectors).
output_test

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.5810, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8961],
         [ 0.5811, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8961],
         [ 0.5810, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8961],
         ...,
         [ 0.5810, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8961],
         [ 0.5811, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8961],
         [ 0.5810, -0.1719, -0.1210,  ...,  1.4477, -0.4954,  0.8962]],

        [[ 0.5810, -0.1719, -0.1207,  ...,  1.4475, -0.4956,  0.8962],
         [ 0.5810, -0.1719, -0.1207,  ...,  1.4475, -0.4956,  0.8962],
         [ 0.5810, -0.1719, -0.1207,  ...,  1.4475, -0.4956,  0.8962],
         ...,
         [ 0.5810, -0.1718, -0.1207,  ...,  1.4475, -0.4955,  0.8963],
         [ 0.5810, -0.1719, -0.1207,  ...,  1.4476, -0.4956,  0.8962],
         [ 0.5810, -0.1719, -0.1207,  ...,  1.4475, -0.4955,  0.8963]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackw

In [None]:
# Cosine similarity between the two pooled sentence vectors of the old model;
# unsqueeze(0) restores the batch dimension that indexing a single row drops.
similarity_test = torch.nn.functional.cosine_similarity(output_test.pooler_output[0].unsqueeze(0), output_test.pooler_output[1].unsqueeze(0))
similarity_test

tensor([1.], device='cuda:0', grad_fn=<SumBackward1>)

## Old fine-tuned model evaluation

In [None]:

# Evaluate the old fine-tuned encoder on the validation split: each pair is scored
# by the cosine similarity of the two sentences' pooler_output vectors.
model_afterFinetuning.eval()

# Predicted similarities, multiplied by 5 so they can be compared with the labels.
scaled_similarities = []

# Inference only: without no_grad() every forward pass records an autograd graph
# that is never used, which costs time and GPU memory for no benefit.
with torch.no_grad():
    for example in tqdm(dataset['validation']):
        # The two sentences are encoded as a padded batch of two separate inputs.
        tokens = tokenizer_after(
            [example['sentence1'], example['sentence2']],
            return_tensors='pt', padding=True, truncation=True,
        )
        output = model_afterFinetuning(**tokens.to(device))
        # Slicing (rather than indexing) keeps the leading batch dimension.
        similarity = torch.nn.functional.cosine_similarity(
            output.pooler_output[0:1], output.pooler_output[1:2]
        )
        scaled_similarities.append(similarity.item() * 5)

# Compare the scaled similarities with the gold labels.
labels = dataset['validation']['label']
# Mean squared error
mse = mean_squared_error(labels, scaled_similarities)
print(f"Mean Squared Error: {mse}")
# Coefficient of determination
r2 = r2_score(labels, scaled_similarities)
print(f"Coefficient of Determination (R^2): {r2}")

100%|██████████| 1500/1500 [00:33<00:00, 44.49it/s]

Mean Squared Error: 9.198915682455997
Coefficient of Determination (R^2): -3.0884865581021064



