# Sentence embeddings

In [27]:
# Enable IPython's autoreload extension and switch it to mode 2.
# NOTE(review): re-running this cell in a live kernel prints an
# "already loaded" notice (see the recorded output); that notice is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import csv
import math
import os
import gzip
from datetime import datetime
from pathlib import Path

In [4]:
import numpy as np

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, losses, util, InputExample, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer, SentencePieceBPETokenizer, CharBPETokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# --- Sizes passed to the model / dense head -------------------------------
vocab_size = 512
max_seq_len = 16
embed_dim = 8
output_features = 8

# --- Transformer settings --------------------------------------------------
n_heads = 4
n_tlayers = 1
dropout_rate = 0.1

# --- Quantum-layer settings (names as consumed by GPTQ) --------------------
n_qlayers = 1
n_qubits = 5  # must be odd and > 3 (ie query, key, value)
# Selected backend. Other backends previously listed as options:
# "default.qubit", "qulacs.simulator", "braket.aws.qubit", "rigetti.qvm".
q_device = "lightning.gpu"

# --- Optimiser -------------------------------------------------------------
lr = 1e-3

In [6]:
# Run identity and training schedule.
model_name = 'gptq'
num_epochs = 2
train_batch_size = 16

# Output directory name: fixed prefix + model name + current timestamp.
_run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f'output/training_stsbenchmark_continue_training-{model_name}-{_run_stamp}'

In [7]:
# Special token strings; the order of the list is kept as originally written.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

In [8]:
# Containers for the three STS benchmark splits, filled by the loop below.
train_samples = []
dev_samples = []
test_samples = []
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

# Fetch the dataset on first use so the notebook also runs on a fresh checkout
# (previously this cell raised FileNotFoundError when the file was absent).
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    # Tab-separated file with a header row; QUOTE_NONE keeps quote characters
    # inside sentences as literal text.
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

        # Route by the 'split' column; anything that is not 'dev' or 'test'
        # is treated as training data.
        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

In [9]:
# hack to reduce the number of training examples: keep only the first n items
# of the train and test splits.
# NOTE(review): dev_samples is not truncated here, so the evaluator built from
# it still sees the full dev split — confirm that is intended.
n = 5
train_samples, test_samples = train_samples[:n], test_samples[:n]

In [52]:
from models import GPTQ

# Resolve the target device in this cell so it runs on a fresh kernel instead
# of relying on a `device` variable left over from an earlier session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Project-local GPTQ module; every hyperparameter comes from the config cell.
gptq = GPTQ(
    embed_dim=embed_dim,
    tgt_vocab=vocab_size,
    n_heads=n_heads,
    dropout_rate=dropout_rate,
    n_tlayers=n_tlayers,
    max_seq_len=max_seq_len,
    n_qlayers=n_qlayers,
    n_qubits=n_qubits,
    q_device=q_device,
    batch_first=True,
).to(device)

# Pooling over the word embeddings produced by `gptq`.
pooling_model = models.Pooling(gptq.get_word_embedding_dimension()).to(device)

# Dense head mapping the pooled embedding to `output_features` with a Tanh.
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=output_features,
    activation_function=torch.nn.Tanh(),
).to(device)

# Pass the device explicitly so the SentenceTransformer's own target device
# matches the one the sub-modules were moved to, rather than being chosen
# independently by the library.
# NOTE(review): the recorded run ended in a cuda/cpu device-mismatch error;
# its origin is not visible from this notebook (possibly inside GPTQ) — verify.
model = SentenceTransformer(
    modules=[gptq, pooling_model, dense_model],
    device=str(device),
)

In [53]:
# Move the assembled model to `device`, then display where it ended up.
model = model.to(device)
model.device

device(type='cuda', index=0)

In [54]:
# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Cosine-similarity loss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model)

In [55]:
# Evaluator built from the dev split; results are labelled 'sts-dev'.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

In [56]:
# Warm up over the first 10% of all optimisation steps instead of a hard-coded
# constant, so the value follows the dataset size and epoch count.
# With the current settings (1 batch per epoch, 2 epochs) this is still 1.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# Train with the cosine-similarity objective, running the dev evaluator after
# every training step and writing results to `model_save_path`.
# NOTE(review): evaluation_steps=1 evaluates after each step; with an
# untruncated dev split this may dominate run time — confirm it is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|                                                                                                                                                                                                                | 0/2 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                                                                                            | 0/1 [00:01<?, ?it/s][A
Epoch:   0%|                                                                                                                                                                                                                | 0/2 [00:01<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!