In [None]:
# Install with the %pip magic so the packages land in the environment of the
# running kernel; `!pip` uses whichever pip happens to be first on the shell PATH.
# NOTE(review): versions are unpinned. The recorded run resolved datasets 1.12.1;
# consider pinning both packages for reproducibility.
%pip install datasets
%pip install sentence-transformers


Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[?25l[K     |█▏                              | 10 kB 24.2 MB/s eta 0:00:01[K     |██▍                             | 20 kB 24.9 MB/s eta 0:00:01[K     |███▋                            | 30 kB 11.3 MB/s eta 0:00:01[K     |████▉                           | 40 kB 8.9 MB/s eta 0:00:01[K     |██████                          | 51 kB 5.1 MB/s eta 0:00:01[K     |███████▎                        | 61 kB 5.4 MB/s eta 0:00:01[K     |████████▌                       | 71 kB 5.8 MB/s eta 0:00:01[K     |█████████▊                      | 81 kB 6.5 MB/s eta 0:00:01[K     |███████████                     | 92 kB 6.6 MB/s eta 0:00:01[K     |████████████▏                   | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████████▍                  | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████▋                 | 122 kB 5.3 MB/s eta 0:00:01[K     |███████████████▊                | 133 kB 5.3 MB/s eta 0:00:01

In [None]:
"""
MODIFIED: (efv) Use STSb-multi-mt Spanish
source: https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark.py

---

This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the STS benchmark from scratch. It generates sentence embeddings
that can be compared using cosine similarity to measure how similar two sentences are.

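Usage (of the original script; in this notebook the model is chosen via the MODEL constant instead):
python training_stsbenchmark.py

OR
python training_stsbenchmark.py pretrained_transformer_model_name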
"""
from torch.utils.data import DataLoader
from torch import cuda
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

from datasets import load_dataset

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

logging.info(f"CUDA Device Name:{cuda.get_device_name()}")

2021-09-16 06:56:44 - CUDA Device Name:Tesla K80


In [None]:
MODEL = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'

# Any Hugging Face transformers checkpoint can be used here, for example
# bert-base-uncased, roberta-base or xlm-roberta-base.
model_name = MODEL

# Training hyperparameters
train_batch_size = 16
num_epochs = 4

# Directory later handed to model.fit(...) and the test evaluator as output path.
# "/" in the model id is replaced by "-" and a timestamp is appended so every
# run gets its own directory.
# NOTE(review): there is no path separator after 'sentence similarity', so the
# model name is glued onto that prefix ("…/sentence similaritymrm8488-…").
# Kept unchanged because the checkpoint loaded later in this notebook follows
# the same naming; confirm whether a sub-folder was intended.
model_save_path = '/content/drive/MyDrive/sentence similarity'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Token embeddings followed by pooling form the sentence encoder
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Dataset loading starts right after this log line
logging.info("Read stsb-multi-mt train dataset")

def samples_from_dataset(dataset, max_score=5.0):
    """Turn sentence-pair rows into ``InputExample`` objects with scaled labels.

    Args:
        dataset: Iterable of mappings that provide the keys ``'sentence1'``,
            ``'sentence2'`` and ``'similarity_score'``.
        max_score: Divisor applied to ``'similarity_score'`` to obtain the
            label. Defaults to 5, the divisor that was previously hard-coded
            (presumably the top of the score scale; confirm against the
            dataset card).

    Returns:
        A list with one ``InputExample`` per row, in iteration order, whose
        ``texts`` are ``[sentence1, sentence2]`` and whose ``label`` is
        ``similarity_score / max_score``.
    """
    return [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=row['similarity_score'] / max_score,
        )
        for row in dataset
    ]

# Build the example lists for each split of the Spanish STSb-multi-mt dataset
train_samples = samples_from_dataset(load_dataset("stsb_multi_mt", name="es", split="train"))
dev_samples = samples_from_dataset(load_dataset("stsb_multi_mt", name="es", split="dev"))
test_samples = samples_from_dataset(load_dataset("stsb_multi_mt", name="es", split="test"))

# Shuffled training batches and the cosine-similarity training objective
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Baseline: score the model on the test split before any fine-tuning
# (write_csv=False, so nothing is written to disk for this run)
initial_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, write_csv=False)
initial_evaluator(model)

logging.info("Read stsb-multi-mt dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Configure the training. The dev evaluator above is passed to fit() and runs
# during training.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of all training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


## Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

# Reload from disk so the test scores describe the checkpoint that was actually
# stored in model_save_path. In the recorded run no checkpoint was saved after
# the last epoch (its dev score was slightly below the previous epoch), so the
# in-memory model and the stored one can differ.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='stsb-multi-mt-test')
test_evaluator(model, output_path=model_save_path)


2021-09-16 07:33:50 - Lock 140037730352656 acquired on /root/.cache/huggingface/transformers/17330f67d8c327c0b1699be552404022f63be5db79858b26484fc847da416eb9.2e4532ea7d3ba93d791168876c978107ea0cba47d2b0736de7c9139e9670eff4.lock


Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

2021-09-16 07:33:51 - Lock 140037730352656 released on /root/.cache/huggingface/transformers/17330f67d8c327c0b1699be552404022f63be5db79858b26484fc847da416eb9.2e4532ea7d3ba93d791168876c978107ea0cba47d2b0736de7c9139e9670eff4.lock
2021-09-16 07:33:51 - Lock 140037699426960 acquired on /root/.cache/huggingface/transformers/7966a0423b1c913c4e68d5399e17e4296eb2a7445564ae9ec574ae547efbe8bd.14d8bb83a1f0f787ccc04af18ea2125ec4a26e94474747d8b5834fb315e2caa4.lock


Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

2021-09-16 07:34:06 - Lock 140037699426960 released on /root/.cache/huggingface/transformers/7966a0423b1c913c4e68d5399e17e4296eb2a7445564ae9ec574ae547efbe8bd.14d8bb83a1f0f787ccc04af18ea2125ec4a26e94474747d8b5834fb315e2caa4.lock


Some weights of the model checkpoint at mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es were not used when initializing BertModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2021-09-16 07:34:07 - Lock 140037731048592 acquired on /root/.cache/huggingface/transformers/4d0cfa842922c935f9584d98c1de673525620c32f5749db976f4dd568d90bc76.f57c45f436182a8fb3a56f7b1c341ed2943046fed9922b6963a46c869a9196aa.lock


Downloading:   0%|          | 0.00/135 [00:00<?, ?B/s]

2021-09-16 07:34:07 - Lock 140037731048592 released on /root/.cache/huggingface/transformers/4d0cfa842922c935f9584d98c1de673525620c32f5749db976f4dd568d90bc76.f57c45f436182a8fb3a56f7b1c341ed2943046fed9922b6963a46c869a9196aa.lock
2021-09-16 07:34:08 - Lock 140037735292048 acquired on /root/.cache/huggingface/transformers/2c511a62e569bb7e3623cdadba0823aa6ac3953d13dc7401f40a47794cea3079.dafbd6e6622cfaafea54bfe717b14fcacdaa069149af8fae4086afa5a9629ec3.lock


Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

2021-09-16 07:34:08 - Lock 140037735292048 released on /root/.cache/huggingface/transformers/2c511a62e569bb7e3623cdadba0823aa6ac3953d13dc7401f40a47794cea3079.dafbd6e6622cfaafea54bfe717b14fcacdaa069149af8fae4086afa5a9629ec3.lock
2021-09-16 07:34:08 - Lock 140037734185040 acquired on /root/.cache/huggingface/transformers/9ee3712830b330cf2407b46bba34b1ca9dbeab6c887b79991d4053ca40501c8f.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

2021-09-16 07:34:09 - Lock 140037734185040 released on /root/.cache/huggingface/transformers/9ee3712830b330cf2407b46bba34b1ca9dbeab6c887b79991d4053ca40501c8f.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-09-16 07:34:09 - Use pytorch device: cuda
2021-09-16 07:34:09 - Read stsb-multi-mt train dataset
2021-09-16 07:34:10 - Reusing dataset stsb_multi_mt (/root/.cache/huggingface/datasets/stsb_multi_mt/es/1.0.0/bc6de0eaa8d97c28a4c22a07e851b05879ae62c60b0b69dd6b331339e8020f07)
2021-09-16 07:34:10 - Reusing dataset stsb_multi_mt (/root/.cache/huggingface/datasets/stsb_multi_mt/es/1.0.0/bc6de0eaa8d97c28a4c22a07e851b05879ae62c60b0b69dd6b331339e8020f07)
2021-09-16 07:34:11 - Reusing dataset stsb_multi_mt (/root/.cache/huggingface/datasets/stsb_multi_mt/es/1.0.0/bc6de0eaa8d97c28a4c22a07e851b05879ae62c60b0b69dd6b331339e8020f07)
2021-09-16 07:34:11 - EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:
2021-09-16 07:34:21 - Cosine-Similarity :	Pearson: 0.452

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-09-16 07:37:25 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:
2021-09-16 07:37:36 - Cosine-Similarity :	Pearson: 0.8095	Spearman: 0.8069
2021-09-16 07:37:36 - Manhattan-Distance:	Pearson: 0.7947	Spearman: 0.7957
2021-09-16 07:37:36 - Euclidean-Distance:	Pearson: 0.7955	Spearman: 0.7963
2021-09-16 07:37:36 - Dot-Product-Similarity:	Pearson: 0.7619	Spearman: 0.7619
2021-09-16 07:37:36 - Save model to /content/drive/MyDrive/sentence similaritymrm8488-distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es-2021-09-16_07-33-50


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-09-16 07:40:40 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 1:
2021-09-16 07:40:52 - Cosine-Similarity :	Pearson: 0.8240	Spearman: 0.8239
2021-09-16 07:40:52 - Manhattan-Distance:	Pearson: 0.8149	Spearman: 0.8164
2021-09-16 07:40:52 - Euclidean-Distance:	Pearson: 0.8154	Spearman: 0.8168
2021-09-16 07:40:52 - Dot-Product-Similarity:	Pearson: 0.7830	Spearman: 0.7842
2021-09-16 07:40:52 - Save model to /content/drive/MyDrive/sentence similaritymrm8488-distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es-2021-09-16_07-33-50


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-09-16 07:43:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 2:
2021-09-16 07:44:07 - Cosine-Similarity :	Pearson: 0.8317	Spearman: 0.8315
2021-09-16 07:44:07 - Manhattan-Distance:	Pearson: 0.8186	Spearman: 0.8204
2021-09-16 07:44:07 - Euclidean-Distance:	Pearson: 0.8189	Spearman: 0.8207
2021-09-16 07:44:07 - Dot-Product-Similarity:	Pearson: 0.8011	Spearman: 0.8022
2021-09-16 07:44:07 - Save model to /content/drive/MyDrive/sentence similaritymrm8488-distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es-2021-09-16_07-33-50


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-09-16 07:47:11 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 3:
2021-09-16 07:47:22 - Cosine-Similarity :	Pearson: 0.8320	Spearman: 0.8314
2021-09-16 07:47:22 - Manhattan-Distance:	Pearson: 0.8201	Spearman: 0.8214
2021-09-16 07:47:22 - Euclidean-Distance:	Pearson: 0.8205	Spearman: 0.8216
2021-09-16 07:47:22 - Dot-Product-Similarity:	Pearson: 0.7998	Spearman: 0.7997
2021-09-16 07:47:22 - EmbeddingSimilarityEvaluator: Evaluating the model on stsb-multi-mt-test dataset:
2021-09-16 07:47:32 - Cosine-Similarity :	Pearson: 0.8056	Spearman: 0.7993
2021-09-16 07:47:32 - Manhattan-Distance:	Pearson: 0.7986	Spearman: 0.7953
2021-09-16 07:47:32 - Euclidean-Distance:	Pearson: 0.7991	Spearman: 0.7960
2021-09-16 07:47:32 - Dot-Product-Similarity:	Pearson: 0.7658	Spearman: 0.7542


0.7992855280336078

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import scipy

In [None]:
# Load a stored SentenceTransformer from /content/drive/MyDrive and rebind `model` to it.
# The path names a dccuchile-bert-base-spanish-wwm-cased run stamped 2021-09-16_06-58-50,
# i.e. not the mrm8488 model trained above (presumably saved by an earlier run of the
# training cell; confirm the directory exists before running).
model = SentenceTransformer('/content/drive/MyDrive/sentence similaritydccuchile-bert-base-spanish-wwm-cased-2021-09-16_06-58-50')

2021-09-16 07:52:27 - Load pretrained SentenceTransformer: /content/drive/MyDrive/sentence similaritydccuchile-bert-base-spanish-wwm-cased-2021-09-16_06-58-50
2021-09-16 07:52:28 - Use pytorch device: cuda


In [None]:
# Reference sentences that the queries in the next cell are compared against
sentences = [
    "mi nombre es Siddhartha",
    "¿viajas a kathmandu?",
]

# Encode the reference sentences with the loaded model and show the raw vectors
sentence_embeddings = model.encode(sentences)
print(sentence_embeddings)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[-0.26081175 -0.6888043  -0.00603448 ... -0.18238701 -0.4568976
   0.9363467 ]
 [-1.0143832  -0.14042018 -0.322978   ...  0.49780977 -0.3989648
   0.27396184]]


In [None]:
# Import the function explicitly: on older SciPy releases a bare `import scipy`
# does not load `scipy.spatial`, so `scipy.spatial.distance.cdist` only worked
# when some other package had already imported that submodule.
from scipy.spatial.distance import cdist

queries = [
    'escuché que tu nombre es siddhartha',
    'Ellas dijeron que eres Siddhartha',
    'Amo hacer deporte.',
    'Viajo a Kathmandú.',
    'Siddhartha viajas a Kathmandú.',
]
query_embeddings = model.encode(queries)

# For every query, print the reference sentences ordered from most to least
# similar, as a {sentence: cosine similarity} dict (one line per query, in the
# same order as `queries`).
for query_embedding in query_embeddings:
    # Cosine distance from this query to each reference sentence
    distances = cdist([query_embedding], sentence_embeddings, "cosine")[0]
    # Smallest distance first, i.e. most similar sentence first
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    # similarity = 1 - cosine distance
    final_result = {sentences[index].strip(): 1 - distance for index, distance in ranked}
    print(final_result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'mi nombre es Siddhartha': 0.7912700592356047, '¿viajas a kathmandu?': 0.3860766430227439}
{'mi nombre es Siddhartha': 0.743372832305824, '¿viajas a kathmandu?': 0.3468786395865391}
{'¿viajas a kathmandu?': 0.16227890220390317, 'mi nombre es Siddhartha': 0.1201195806587878}
{'¿viajas a kathmandu?': 0.8232840087330114, 'mi nombre es Siddhartha': 0.3805481967190456}
{'¿viajas a kathmandu?': 0.7391887846542984, 'mi nombre es Siddhartha': 0.6079428403632269}


BETO cased
```
{'mi nombre es Siddhartha': 0.7912700592356047, '¿viajas a kathmandu?': 0.3860766430227439}
{'mi nombre es Siddhartha': 0.743372832305824, '¿viajas a kathmandu?': 0.3468786395865391}
{'¿viajas a kathmandu?': 0.16227890220390317, 'mi nombre es Siddhartha': 0.1201195806587878}
{'¿viajas a kathmandu?': 0.8232840087330114, 'mi nombre es Siddhartha': 0.3805481967190456}
{'¿viajas a kathmandu?': 0.7391887846542984, 'mi nombre es Siddhartha': 0.6079428403632269}
```
BETO uncased
```
{'mi nombre es Siddhartha': 0.8914966772911828, '¿viajas a kathmandu?': 0.353371834667227}
{'mi nombre es Siddhartha': 0.8332292937739496, '¿viajas a kathmandu?': 0.3653644360021573}
{'¿viajas a kathmandu?': 0.08264144223735836, 'mi nombre es Siddhartha': 0.0009075415377219898}
{'¿viajas a kathmandu?': 0.5626156682788024, 'mi nombre es Siddhartha': 0.1754298156049111}
{'¿viajas a kathmandu?': 0.757876368191109, 'mi nombre es Siddhartha': 0.6245562735281108}
```
mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es
```
{'mi nombre es Siddhartha': 0.9496377321757722, '¿viajas a kathmandu?': -0.011538960759532646}
{'mi nombre es Siddhartha': 0.873893744636982, '¿viajas a kathmandu?': 0.0364520392014831}
{'¿viajas a kathmandu?': 0.1880492277160054, 'mi nombre es Siddhartha': 0.10451364033851163}
{'¿viajas a kathmandu?': 0.6593797282828646, 'mi nombre es Siddhartha': -0.1055048312607405}
{'mi nombre es Siddhartha': 0.7633602917502282, '¿viajas a kathmandu?': 0.23936921891254825}
```