In [1]:
import numpy as np
import pandas as pd
import os
import torch

from transformers import (
    BertModel,
    BertConfig,
    BertTokenizer,
    BertForPreTraining
)

from pyknp import Juman

from gensim import models

In [2]:
WORD2VEC_ROOT = "./models/word2vec"
BERT_ROOT = "/home/jovyan/work/BERT/models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers"

# Setup 

## Download Bert PreTrained Japanese Model
モデルのダウンロード

In [23]:
!wget -P "./models/bert"  "http://lotus.kuee.kyoto-u.ac.jp/nl-resource/JapaneseBertPretrainedModel/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip"

--2021-12-19 00:58:45--  http://lotus.kuee.kyoto-u.ac.jp/nl-resource/JapaneseBertPretrainedModel/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip
Resolving lotus.kuee.kyoto-u.ac.jp (lotus.kuee.kyoto-u.ac.jp)... 130.54.208.131
Connecting to lotus.kuee.kyoto-u.ac.jp (lotus.kuee.kyoto-u.ac.jp)|130.54.208.131|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 411210545 (392M) [application/zip]
Saving to: ‘./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip’


2021-12-19 01:06:29 (865 KB/s) - ‘./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip’ saved [411210545/411210545]



モデルの解凍

In [29]:
!unzip "./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip" -d "./models/bert"

Archive:  ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers.zip
   creating: ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers/
  inflating: ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers/pytorch_model.bin  
  inflating: ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers/vocab.txt  
  inflating: ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers/config.json  
  inflating: ./models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers/tokenizer_config.json  


## JUMAN++ のコンストラクト

In [3]:
jm = Juman()

# Do BERT
BERTの実行

## Load Pre-Trained Model
事前学習モデルの読み込み

In [4]:
model = BertModel.from_pretrained(BERT_ROOT)
model.eval()

Some weights of the model checkpoint at /home/jovyan/work/BERT/models/bert/Japanese_L-12_H-768_A-12_E-30_BPE_WWM_transformers were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32006, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
vocab_file_path = os.path.join(BERT_ROOT, 'vocab.txt')
bert_tokenizer = BertTokenizer(vocab_file_path, do_lower_case=False, do_basic_tokenize=False)

In [6]:
# サンプルテキスト
text = "大きなのっぽの古時計を購入した"

# 形態素解析
result = jm.analysis(text)
# 分かち書き
tokens = [mrph.midasi for mrph in result.mrph_list()]

# トークン化
bert_tokens = bert_tokenizer.tokenize(" ".join(["[CLS]"] + tokens + ["[SEP]"]))
token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)

tokens_tensor = torch.tensor(token_ids).unsqueeze(0)
outputs = model(tokens_tensor)

In [7]:
outputs['last_hidden_state']

tensor([[[ 0.6167,  0.1015,  0.4946,  ..., -0.8182,  0.2268,  0.3858],
         [-0.3297, -0.0672,  1.3605,  ..., -0.8458,  0.1461, -0.2418],
         [-0.6516,  0.5089,  0.3705,  ..., -0.0826, -0.6668, -0.5334],
         ...,
         [ 0.1957,  0.4445,  0.8782,  ..., -0.8787, -0.4205, -0.6016],
         [ 0.1796, -0.5159, -0.0943,  ..., -0.6935,  0.9005, -0.3178],
         [ 0.9526,  0.4663,  0.4639,  ..., -0.3355,  0.6858,  0.0713]]],
       grad_fn=<NativeLayerNormBackward0>)

In [8]:
outputs['last_hidden_state'].size()

torch.Size([1, 11, 768])