In [3]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertJapaneseTokenizer, BertForMaskedLM

## モデルの読み込み

In [4]:
# Load the pretrained Japanese BERT checkpoint three ways: the bare encoder,
# its tokenizer, and the masked-language-model head. All three come from the
# same checkpoint name so they share one vocabulary and one set of weights.
PRETRAINED_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

model = BertModel.from_pretrained(PRETRAINED_NAME)                  # encoder used for feature extraction
tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_NAME)  # text -> tokens -> vocabulary ids
maskedLM = BertForMaskedLM.from_pretrained(PRETRAINED_NAME)         # encoder + masked-LM head

# Put the encoder in evaluation mode (disables its dropout layers). As the last
# expression of the cell, this also displays the module structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## 短文の処理

## 複数の文を処理

In [None]:
# Example sentences to embed.
string = ["新型コロナウイルスによるロックダウン（封鎖措置）の影響で電力需要や価格が低下。",
          "2019年末時点の米金利が年初以下になるとの投資家の予測は、米国経済への信頼感の後退をうかがわせる兆しだ。",
          "調達資金約３億円は広告費や開発費、人材採用などの費用に充てる。",
          "ノムラは、観光業の低迷などを理由に、タイの19年の経済成長率は18年の4.1％から3.2％に減速すると予想している。",
          "ゼネコンなど大口需要家は先安観から様子見姿勢を強め、厳しい価格要請を展開する。",
          "現在300社以上のエコシステムの基盤となっており、すでに90種類のアプリが誕生している。"]
n = len(string)

# Tokenize every sentence and wrap it as [CLS] tokens... [SEP].
max_len = 512   # the model's position-embedding table has 512 entries
cls = tokenizer.vocab["[CLS]"]
sep = tokenizer.vocab["[SEP]"]
pad = tokenizer.vocab["[PAD]"]
convert_id = []
for sentence in string:
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    # Truncate the sentence body *before* adding the special tokens, so an
    # over-long sentence still ends with [SEP] and never exceeds max_len.
    token_ids = token_ids[:max_len - 2]
    # Build the id array with an explicit integer dtype; an empty sentence
    # would otherwise produce a float array that cannot fill a long tensor.
    convert_id.append(np.array([cls, *token_ids, sep], dtype=np.int64))
m = np.array([len(ids) for ids in convert_id], dtype=np.int64)   # wrapped length of each sentence

# Right-pad all sentences to the longest one in the batch.
max_word = int(m.max())
word_id = torch.full((n, max_word), fill_value=pad, dtype=torch.long)
for i, ids in enumerate(convert_id):
    word_id[i, :len(ids)] = torch.as_tensor(ids, dtype=torch.long)

# Attention mask: 1 on real tokens, 0 on padding.
mask = (word_id != pad).long()

# Feature extraction only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    result = model(word_id, attention_mask=mask)

# result[1] is the second output of BertModel (the pooled output); no detach()
# is needed because the tensor was produced under no_grad.
bert_features = result[1].numpy()
bert_features

In [5]:
# Read the sentence-level news CSV and pull its "text" column out as a NumPy array.
path = "D:/Statistics/data/DJ_news_data/DJNML/csv_sentence"
news_csv = path + "/DJDNA_20191008_201901-201909.csv"
news = pd.read_csv(news_csv)
news_text = np.array(news["text"])

In [38]:
# Select rows 100-199 of the news text as the batch to embed, and record its size.
string = news_text[100:200]
n = string.shape[0]

In [39]:
%%time

# Tokenize every sentence and wrap it as [CLS] tokens... [SEP].
max_len = 512   # the model's position-embedding table has 512 entries
cls = tokenizer.vocab["[CLS]"]
sep = tokenizer.vocab["[SEP]"]
pad = tokenizer.vocab["[PAD]"]
convert_id = []
for sentence in string:
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    # Truncate the sentence body *before* adding the special tokens, so an
    # over-long sentence still ends with [SEP] and never exceeds max_len.
    token_ids = token_ids[:max_len - 2]
    # Build the id array with an explicit integer dtype; an empty sentence
    # would otherwise produce a float array that cannot fill a long tensor.
    convert_id.append(np.array([cls, *token_ids, sep], dtype=np.int64))
m = np.array([len(ids) for ids in convert_id], dtype=np.int64)   # wrapped length of each sentence

# Right-pad all sentences to the longest one in the batch.
max_word = int(m.max())
word_id = torch.full((n, max_word), fill_value=pad, dtype=torch.long)
for i, ids in enumerate(convert_id):
    word_id[i, :len(ids)] = torch.as_tensor(ids, dtype=torch.long)

# Attention mask: 1 on real tokens, 0 on padding.
mask = (word_id != pad).long()

# Feature extraction only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    result = model(word_id, attention_mask=mask)

# result[1] is the second output of BertModel (the pooled output); no detach()
# is needed because the tensor was produced under no_grad.
bert_features = result[1].numpy()
bert_features

Wall time: 11.8 s


array([[-0.0971389 , -0.10152563, -0.0346058 , ...,  0.04234225,
        -0.03199498, -0.11452407],
       [-0.61564916, -0.28342882, -0.23810245, ...,  0.23771279,
         0.22977383, -0.311549  ],
       [-0.38588268, -0.18735667, -0.33128387, ...,  0.38868266,
         0.34908777, -0.2503171 ],
       ...,
       [-0.2141081 , -0.30196947, -0.56280345, ...,  0.363316  ,
         0.22282387, -0.49278072],
       [ 0.65135455, -0.2127428 , -0.3980419 , ...,  0.36201078,
         0.20860331, -0.05991581],
       [-0.6604341 , -0.45805657, -0.38299242, ...,  0.6709919 ,
         0.01456887, -0.48533824]], dtype=float32)

In [42]:
%%time
# Timing comparison: feed the sentences through BERT one at a time (no padding,
# no attention mask) instead of as a single padded batch.
max_len = 512   # the model's position-embedding table has 512 entries
cls = tokenizer.vocab["[CLS]"]
sep = tokenizer.vocab["[SEP]"]
pad = tokenizer.vocab["[PAD]"]   # not needed for single-sentence input; kept so the cell defines the same names as before

# Inference only: skip autograd bookkeeping for every forward pass in the loop.
with torch.no_grad():
    for i in range(n):
        tokenized_string = tokenizer.tokenize(string[i])
        # Enforce max_len on the sentence body so that, with [CLS] and [SEP]
        # added, the input never exceeds the 512 positions the model supports.
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_string)[:max_len - 2]
        # Shape (1, sequence_length): a batch holding this one sentence.
        word_id = torch.as_tensor([[cls, *token_ids, sep]], dtype=torch.long)
        # Only the last sentence's output survives the loop; the cell exists to measure run time.
        result = model(word_id)

Wall time: 5.49 s


In [33]:
# Display the current number of sentences held in `n`.
# NOTE(review): the saved output below comes from execution count 33, which ran
# before `string`/`n` were reassigned in execution 38 — re-run to refresh it.
n

32