In [1]:
import numpy as np
import tensorflow as tf
import transformers

In [2]:
# Model names are taken from the Hugging Face pretrained-models list
# (cf. https://huggingface.co/transformers/pretrained_models.html)
#model_name = "cl-tohoku/bert-base-japanese"
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
# Tokenizer loaded from the same checkpoint name; to_features() reads this module-level object.
tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(model_name)

# Quick sanity check: display how a short Japanese sentence is split into tokens.
tokenizer.tokenize('猫がかわいです')

['猫', 'が', 'かわい', 'です']

In [3]:
def build_model(model_name, num_classes, max_length):
    """Build and compile a softmax classifier on top of a frozen BERT encoder.

    Args:
        model_name: Checkpoint identifier passed to
            ``transformers.TFBertModel.from_pretrained``.
        num_classes: Number of units of the softmax output layer.
        max_length: Fixed sequence length of each of the three int32 inputs.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are
        ``[input_ids, attention_mask, token_type_ids]`` (each of shape
        ``(batch, max_length)``) and whose single output is the softmax over
        ``num_classes`` classes.
    """
    input_shape = (max_length, )
    # Creation order is kept stable because it determines the order of model inputs.
    input_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)

    bert_model = transformers.TFBertModel.from_pretrained(model_name)
    # Freeze the encoder: only the Dense head added below is trained.
    bert_model.trainable = False
    base_model_output = bert_model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )
    # Only the pooled output feeds the classification head; the per-token
    # hidden states are not needed here.
    pooler_output = base_model_output.pooler_output
    output = tf.keras.layers.Dense(num_classes, activation="softmax")(pooler_output)

    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=[output])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    # "categorical_crossentropy" is paired with one-hot encoded targets.
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
    return model

# Binary classification: the dataset labels each text as 1 (cm) or 0 (noncm).
num_classes = 2
# Fixed token length of the model inputs; the same value is used when encoding texts.
max_length = 128
model = build_model(model_name, num_classes=num_classes, max_length=max_length)
# Show the layer-by-layer architecture and parameter counts.
model.summary()

Some layers from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1106173   ['input_1[0][0]',             
 )                           ngAndCrossAttentions(last_   44         'input_2[0][0]',         

In [4]:
def to_features(texts, max_length):
    """Convert a list of texts into fixed-length input arrays for transformers.

    Every text is encoded with the module-level ``tokenizer``, explicitly
    truncated to at most ``max_length`` tokens and padded up to exactly
    ``max_length``. The meaning of input_ids, attention_mask and
    token_type_ids is described in the glossary
    (cf. https://huggingface.co/transformers/glossary.html).

    Args:
        texts: Sequence of strings to encode.
        max_length: Length of every encoded row.

    Returns:
        ``[input_ids, attention_mask, token_type_ids]``, each an int32 array
        of shape ``(len(texts), max_length)``.
    """
    shape = (len(texts), max_length)
    input_ids = np.zeros(shape, dtype="int32")
    attention_mask = np.zeros(shape, dtype="int32")
    token_type_ids = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
        # `truncation=True` makes the previously implicit truncation explicit
        # and silences the tokenizer warning.
        encoded_dict = tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids[i] = encoded_dict["input_ids"]
        attention_mask[i] = encoded_dict["attention_mask"]
        token_type_ids[i] = encoded_dict["token_type_ids"]
    return [input_ids, attention_mask, token_type_ids]

In [5]:
from pathlib import Path
import json
import random
from sklearn.utils import shuffle

def LoadDataset(path: Path, seed=None):
    """Load the speech dataset and return shuffled texts with their labels.

    The JSON file must contain an object with two lists, ``"cm"`` and
    ``"noncm"``. Both lists are shuffled, the non-cm list is cut down to at
    most ``len(cm)`` entries, and the combined samples are shuffled again.

    Args:
        path: Location of the JSON dataset file.
        seed: Optional seed for reproducible shuffling. With the default
            ``None`` the global ``random`` state and sklearn's default
            random state are used, exactly as before.

    Returns:
        ``[texts, labels]`` as returned by ``sklearn.utils.shuffle``; cm texts
        carry label 1 and non-cm texts label 0.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding (which is not UTF-8 on many Windows setups).
    with path.open('r', encoding='utf-8') as f:
        dataset = json.load(f)
    cm = dataset['cm']
    noncm = dataset['noncm']

    # Use a dedicated generator only when a seed is requested so the default
    # behaviour (global random state) stays unchanged.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(cm)
    rng.shuffle(noncm)
    # Drop surplus non-cm clips so they never outnumber the cm clips.
    noncm = noncm[:len(cm)]

    texts = cm + noncm
    # Count the non-cm labels from the list actually kept: when there are
    # fewer non-cm than cm clips, using len(cm) would produce more labels
    # than texts.
    labels = [1] * len(cm) + [0] * len(noncm)
    return shuffle(texts, labels, random_state=seed)

# Build the path from components: a backslash literal such as r'..\speech.json'
# only resolves to the parent directory on Windows.
texts, labels = LoadDataset(Path('..') / 'speech.json')

# Peek at the first five samples and their labels (1 = cm, 0 = noncm).
texts[:5], labels[:5]


(['♬～ ≪(観客の拍手) (女の子Ａ･女の子Ｂ)久しぶり～！ (女の子Ａ)大丈夫｡ (女の子Ｂ)うん｡ (男性)ただいま｡ (女性)おかえり｡ (母)ありがとうございます｡',
  'また むずい… 難しい｡ (林田)ただいま！ えっ？ ああ お… おかえり…｡ (階段を上がる音) えっ？ ♬～',
  '(店主)いらっしゃい かがわ (香川)じゃあ 上ロース 卵つけて さかい (堺)と ｢パーフェクトサントリービール｣ ２つ ピーエスビー はい ＰＳＢ ２丁!! からの～ 乾杯したい ですよね う! う! うまい!うまい! うまい!! <ＰＳＢ ｢パーフェクト サントリービール｣> 圧巻 どぇす!',
  '(ﾊﾞｰﾙ)《キリヲ》 (ｷﾘｦ)フッ',
  '♬～ みやざわ やまだ (宮沢)＜がんばった一年だもの｡＞ (山田)あ ＜きっと今 日本中が 同じ気持ちです｡＞ かわぐち (川口)あ おぐり (友人)あ 雪！ (小栗)おっ ごほうびごほうび～♪ 結構がんばってんだよね (父親)みたいだな 見てるぞ～'],
 [1, 0, 1, 1, 1])

In [6]:
# Encode all texts into [input_ids, attention_mask, token_type_ids] arrays of width max_length.
x_train = to_features(texts, max_length)
# One-hot encode the integer labels to pair with the "categorical_crossentropy" loss.
y_train = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Hyperparameters for the first training run.
batch_size = 64
epochs = 10

# Train on all encoded samples; no validation data is passed, so only
# training loss/accuracy are reported.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1a600690580>

In [8]:
# Second training run on the same `model` object and data, with a hard-coded
# epoch count of 20 instead of the `epochs` variable.
# NOTE(review): Keras `fit` normally continues from the current weights, so
# this would extend the 10 epochs trained above rather than restart — confirm
# that this is intended.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1a60ba4ecd0>