<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/llm/e5largeembedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Blog](https://hironsan.hatenablog.com/entry/2023/07/05/073150)

## Installation

In [None]:
!pip install -q transformers accelerate bitsandbytes

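## Load model

Define the mean-pooling helper and load the `intfloat/multilingual-e5-small` tokenizer and model.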

In [None]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: Token-level hidden states. Dim 1 is the axis that
            is reduced, and the mask is broadcast over the trailing axis.
        attention_mask: Mask aligned with the leading two axes of
            ``last_hidden_states``; non-zero marks positions to keep, zero
            marks positions to ignore.

    Returns:
        One pooled vector per row: the sum of the kept token vectors divided
        by the number of kept tokens. A row with no kept tokens yields a zero
        vector instead of NaN.
    """
    # Zero the ignored positions so they contribute nothing to the sum.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    # Clamp the count to at least 1: a fully masked row would otherwise
    # compute 0 / 0 and propagate NaN into every downstream similarity.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)
    return last_hidden.sum(dim=1) / token_counts[..., None]

# Single source of truth for the checkpoint, so the tokenizer and the encoder
# are always loaded from the same model id.
MODEL_NAME = 'intfloat/multilingual-e5-small'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

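## Inference

Embed three similar Japanese sentences and compare the first one against the other two using cosine similarity.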

In [20]:
import torch
import torch.nn as nn

# Three near-paraphrases; the first is compared against the other two below.
# Every text carries the 'query: ' prefix.
input_texts = [
    'query: ここの宿は駅から車で３０分かかります。温泉があって気持ちいいです。',
    'query: 宿から駅までは３０分かかります。車が一番いいです。温泉からの見晴らしが最高です。',
    'query: 宿からのアクセスは車が一番良くて、温泉までは６０分かかります。見晴らしは最高です。'
]

# Pad to the longest text in the batch and truncate anything over 512 tokens.
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive for the forward pass and pooling.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Cosine similarity of sentence 0 against sentences 1 and 2.
cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
output1 = cos(embeddings[0], embeddings[1])
output2 = cos(embeddings[0], embeddings[2])
print(output1.cpu().numpy(), output2.cpu().numpy())

0.9467675 0.9253449
