In [4]:
from transformers import MLukeTokenizer, LukeModel
import torch

import pandas as pd

In [5]:
class SentenceLukeJapanese:
    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = MLukeTokenizer.from_pretrained(model_name_or_path)
        self.model = LukeModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


    def encode(self, sentences, batch_size=8):
        all_embeddings = []
        iterator = range(0, len(sentences), batch_size)
        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest", 
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')

            all_embeddings.extend(sentence_embeddings)

        return torch.stack(all_embeddings)


MODEL_NAME = "sonoisa/sentence-luke-japanese-base-lite"
model = SentenceLukeJapanese(MODEL_NAME)

sentences = ["暴走したAI", "暴走した人工知能"]
sentence_embeddings = model.encode(sentences, batch_size=8)

print("Sentence embeddings:", sentence_embeddings)

Sentence embeddings: tensor([[ 0.0655, -0.4808,  0.1523,  ..., -0.2441, -0.4688,  0.0680],
        [ 0.1478, -0.9534,  0.2588,  ..., -0.1766, -0.2504, -0.2852]],
       grad_fn=<StackBackward0>)


In [6]:
# 何かしら素敵なデータをCSVで用意しておく
df = pd.read_csv('resources/data.csv', engine='python')

In [7]:
titles = df['title'].to_list()

In [8]:
# 動作確認用
titles = ["新型コロナワクチン予防接種について", "胃ガンの手術"]

In [9]:
sentence_embeddings = model.encode(titles, batch_size=8)
print("Sentence embeddings:", sentence_embeddings)

Sentence embeddings: tensor([[ 0.6404, -0.0040,  0.2891,  ...,  0.1226, -0.4075,  0.2455],
        [-0.1630,  0.1910,  0.3638,  ..., -0.0770, -0.8056,  0.2798]],
       grad_fn=<StackBackward0>)


In [11]:
for e in sentence_embeddings.tolist():
    print(e)

[0.6404397487640381, -0.00399373285472393, 0.28911447525024414, -0.6502490639686584, -0.24632003903388977, 0.8617591261863708, -0.06187582015991211, 0.2735499441623688, -0.029005732387304306, -0.05127977579832077, -0.10227715969085693, 0.368375301361084, -0.6578912734985352, -0.26168882846832275, -0.2594558000564575, -0.0005442678229883313, 0.17626960575580597, 0.548218309879303, 0.18795673549175262, -0.47122642397880554, 0.3729400038719177, -0.5997788310050964, -0.23593217134475708, -0.2800194025039673, 0.23475676774978638, 0.39039146900177, -1.1767302751541138, -0.4244050681591034, -0.289577841758728, -0.45351946353912354, -0.26150113344192505, 0.0391676165163517, 0.1776231825351715, 0.06740084290504456, 0.027892887592315674, -0.28531110286712646, -0.6897222995758057, -0.041239555925130844, -0.16392093896865845, 0.3853260278701782, 0.006775832735002041, -0.732776403427124, -0.2470153123140335, 0.3238266706466675, 0.19063009321689606, -0.7805945873260498, 0.3931370973587036, 0.2100081