In [1]:
%pip install transformers fugashi ipadic sentencepiece
from transformers import BertJapaneseTokenizer, BertModel
import torch


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
class SentenceBertJapanese:
    """Sentence encoder that mean-pools the token embeddings of a Japanese BERT model.

    The tokenizer and the model are both loaded with ``from_pretrained`` from the
    same ``model_name_or_path``. The model is switched to eval mode and moved to
    the selected device once, at construction time.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load tokenizer and model, and place the model on ``device``.

        Args:
            model_name_or_path: Forwarded unchanged to
                ``BertJapaneseTokenizer.from_pretrained`` and
                ``BertModel.from_pretrained``.
            device: Anything ``torch.device`` accepts. When ``None``, ``"cuda"``
                is used if ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
        """
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked positions.

        Args:
            model_output: Indexable model output whose first element holds the
                token embeddings.
            attention_mask: Mask with one entry per token; it is broadcast over
                the embedding dimension and used as the averaging weight.

        Returns:
            One pooled vector per sequence (the token dimension is summed out).
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    @torch.no_grad()
    def encode(self, sentences, batch_size=8, max_length=512, padding="max_length"):
        """Encode sentences into mean-pooled embeddings, one row per sentence.

        Args:
            sentences: Sequence of strings. A single ``str`` is treated as a
                one-sentence batch rather than being split into characters.
            batch_size: Number of sentences tokenized and run through the model
                per forward pass.
            max_length: Forwarded to the tokenizer; with ``truncation=True``
                longer inputs are cut to this many tokens.
            padding: Forwarded to the tokenizer. The default ``"max_length"``
                pads every batch to ``max_length``; ``"longest"`` pads only to
                the longest sentence of each batch, which is cheaper.

        Returns:
            A CPU tensor with one row per input sentence. For empty input the
            result has shape ``(0, self.model.config.hidden_size)``.
        """
        if isinstance(sentences, str):
            # Slicing a bare string would hand single characters to the tokenizer.
            sentences = [sentences]
        if len(sentences) == 0:
            # torch.cat / torch.stack reject an empty list, so answer directly.
            return torch.empty((0, self.model.config.hidden_size))

        batch_embeddings = []
        for start in range(0, len(sentences), batch_size):
            # list(...) so tuples, arrays or Series slices are accepted as a batch.
            batch = list(sentences[start:start + batch_size])
            encoded_input = self.tokenizer(
                batch,
                padding=padding,
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])
            # Move each batch to the CPU right away so results do not pile up on the device.
            batch_embeddings.append(pooled.to("cpu"))

        return torch.cat(batch_embeddings)

In [None]:
# Checkpoint used for the embeddings below. Swap in one of the alternatives
# to compare results:
#   "sonoisa/sentence-bert-base-ja-mean-tokens"
#   "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
#   "sonoisa/sentence-bert-base-ja-en-mean-tokens"
#   "sonoisa/sentence-bert-base-ja-en-mean-tokens-v2"
MODEL_NAME = "tohoku-nlp/bert-base-japanese"
model = SentenceBertJapanese(MODEL_NAME)

Some weights of the model checkpoint at tohoku-nlp/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import numpy as np

def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))`` with NumPy. No guard is
    applied for zero-length vectors, so the division follows NumPy's own rules.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [5]:
# Ten consecutive sentences of one passage, in reading order.
sentences = [
    "日本の風土の特質が日本の歴史と文化に大きな影響を与える。",
    "島国の日本は、四周を海にかこまれ世界から隔てられている。",
    "この地理的条件によって、日本は外敵の侵略と異民族の支配がなく、周囲から文化や技術などを吸収し、「島国」の中でそれを融和させて、独自の文化を磨きあげ築きあげてきたのである。",
    "外国との交通が海に隔てられ、発信的な文化型は形成しにくいので、自ら外国の文化を積極的に受けいれる受信的な文化型を形成してきた。",
    "日本文化の本質は受信文化である。",
    "外来文化の受信能力が世界一である日本から発信することはほとんどない。",
    "日本は外来文化を選択して受信するだけにとどまらず、さらに受信した外来文化を巧みに融合して日本化する。",
    # Alternative for the next entry (s8), kept for comparison: "豊臣秀吉は偉大な武家だった。"
    "古代朝鮮や中国から漢字や儒教、仏教、道教など宗教思想を導入し、近代欧米の新文明を吸収し、更にそれを自国に適応しようと、工夫に努めた。",
    "受信文化の特徴として、選択の可能性があること。",
    "日本は海に守られて外来侵略と異民族の支配がないため、受け入れる側の意識や都合を無視されるような押し付け、強制的な外来文化の受信は全くない。",
]

# Keep the individual names bound as well, one per sentence.
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 = sentences

# One embedding row per sentence, in the same order as `sentences`.
s = model.encode(sentences)
                                                    

In [6]:

# Print the cosine similarity between every sentence and the one that follows it.
# Driving the labels and the row indices from one loop variable keeps them in
# sync and follows the number of encoded sentences automatically.
for i in range(len(s) - 1):
    similarity = cos_sim(s[i].numpy(), s[i + 1].numpy())
    print(f's{i + 1} vs s{i + 2}:{similarity}')



s1 vs s2:0.8407385349273682
s2 vs s3:0.8795011639595032
s3 vs s4:0.9389750361442566
s4 vs s5:0.8583738207817078
s5 vs s6:0.8412156105041504
s6 vs s7:0.886039674282074
s7 vs s8:0.9106302261352539
s8 vs s9:0.8608620762825012
s9 vs s10:0.8704507946968079
