In [4]:
%pip install transformers sentencepiece fugashi ipadic gensim
from transformers import BertJapaneseTokenizer, BertModel
import torch
import torch.nn.functional as F
import pandas as pd




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [5]:
from transformers import BertJapaneseTokenizer, BertModel
import torch


class SentenceBertJapanese:
    """Sentence encoder that mean-pools the token embeddings of a BERT model.

    The tokenizer and model are loaded with ``from_pretrained``; the model is
    switched to eval mode and moved to the selected device once, at
    construction time.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load the tokenizer/model pair and place the model on ``device``.

        Args:
            model_name_or_path: Value forwarded to both ``from_pretrained`` calls.
            device: Anything accepted by ``torch.device``. When ``None``,
                "cuda" is chosen if ``torch.cuda.is_available()``, else "cpu".
        """
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked positions.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Mask with one entry per token; entries of 0 are
                excluded from the average.

        Returns:
            One pooled vector per sequence.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return masked_sum / token_counts

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        """Encode sentences into mean-pooled embeddings, ``batch_size`` at a time.

        Args:
            sentences: An iterable of strings, or a single string (treated as
                a one-sentence input).
            batch_size: Number of sentences tokenized and run through the
                model per step. Must be at least 1.

        Returns:
            A CPU tensor with one row per input sentence, in input order. For
            empty input the result has shape ``(0, hidden_size)``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # A bare string would otherwise be sliced character by character.
        if isinstance(sentences, str):
            sentences = [sentences]
        else:
            sentences = list(sentences)

        # Nothing to encode: return an empty matrix instead of failing on an
        # empty concatenation.
        if not sentences:
            return torch.empty((0, self.model.config.hidden_size))

        batch_embeddings = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            # Calling the tokenizer directly replaces the legacy batch_encode_plus.
            encoded_input = self.tokenizer(
                batch, padding="longest", truncation=True, return_tensors="pt"
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Keep results on the CPU so device memory is released per batch.
            batch_embeddings.append(pooled.to("cpu"))

        return torch.cat(batch_embeddings, dim=0)


In [14]:
# Checkpoint used for every embedding computed below.
# Alternative checkpoints (currently disabled):
#   "sonoisa/sentence-bert-base-ja-mean-tokens"
#   "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
model = SentenceBertJapanese(model_name_or_path="tohoku-nlp/bert-base-japanese")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at tohoku-nlp/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect

In [15]:
# Sample sentences to embed; the first entry is the one the later similarity
# cell compares all the others against.
input_docs = [
    "動物行動学者コンラート・ローレンツは、動物が大好き",
    "幼い頃から動物に囲まれて育った",
    "大人になっても、彼の屋敷ではカラス、オウム、ガン、サルなどの動物が放し飼いにされていた",
    "ただ、中には大型で危険な動物たちもいるし、屋敷には幼い長女もいる",
    "子供と一緒にするわけにはいかない",
    "そこで娘を守るために庭に檻をつくって入れた",
    "動物ではなく娘を、である",
    "私は猫が大好きです",
    "彼は犬が好きです",
    "彼は犬が大好きです",
    "彼は猫が好きです",
    "彼は猫が大好きです",
]

# All 12 sentences fit in one batch, so the batch size equals the list length.
vecs = model.encode(input_docs, batch_size=len(input_docs))

In [16]:
# Cosine similarity of every sentence embedding against the first one.
# vecs[:1] keeps the query two-dimensional so it broadcasts row-wise over vecs.
sim = F.cosine_similarity(vecs[:1], vecs, dim=1).tolist()

# Pair each sentence with its similarity score; the bare expression renders the table.
pd.DataFrame({"文章": input_docs, "類似度": sim})

Unnamed: 0,文章,類似度
0,動物行動学者コンラート・ローレンツは、動物が大好き,1.0
1,幼い頃から動物に囲まれて育った,0.780914
2,大人になっても、彼の屋敷ではカラス、オウム、ガン、サルなどの動物が放し飼いにされていた,0.821809
3,ただ、中には大型で危険な動物たちもいるし、屋敷には幼い長女もいる,0.78602
4,子供と一緒にするわけにはいかない,0.722192
5,そこで娘を守るために庭に檻をつくって入れた,0.765243
6,動物ではなく娘を、である,0.757
7,私は猫が大好きです,0.846038
8,彼は犬が好きです,0.850346
9,彼は犬が大好きです,0.845109


In [None]:
doc1 = 