In [12]:
import os
# True when the COLAB_GPU environment variable is defined (whatever its value);
# the other cells use this flag to switch on their Colab-specific branches.
ON_COLAB = os.environ.get("COLAB_GPU") is not None

In [2]:
from datetime import date

# Checkpoint location: a path relative to the working directory whose name
# carries today's date formatted as yy-mm-dd.
modelpath = f"./model_{date.today().strftime('%y-%m-%d')}"

if ON_COLAB:
    # On Colab, mount Google Drive and prefix the relative path with a fixed
    # Drive folder.
    from google.colab import drive

    drive.mount('/content/drive')
    modelpath = "".join(
        ('/content/drive/MyDrive/ColabFolder/NLPwithBERT/Section6/', modelpath)
    )

## ライブラリインストール

In [3]:
# if ON_COLAB:
#     ! pip install -U pip 2>&1 >/dev/null
#     ! pip install \
#         transformers==4.5.0 \
#         fugashi==1.1.0 \
#         ipadic==1.0.0  \
#         pytorch-lightning==1.2.7 2>&1 >/dev/null 

In [2]:
import random
import glob
import json
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertModel
# import pytorch_lightning as pl

# Name of the pretrained checkpoint handed to from_pretrained() for both the
# tokenizer and the BERT encoder used in this notebook.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

In [7]:
class BertForSequenceClassificationMultiLabel(torch.nn.Module):
    """BERT encoder followed by a linear head for multi-label classification.

    The encoder's last hidden states are averaged over the non-padding tokens
    and projected to one logit per label. Each logit is scored independently
    with a sigmoid-based binary cross-entropy loss, so any number of labels can
    be active for a single input.
    """

    def __init__(self, model_name, num_labels) -> None:
        """Load the pretrained encoder and create the classification head.

        Args:
            model_name: Checkpoint name or path passed to
                ``BertModel.from_pretrained``.
            num_labels: Number of output logits (one per label).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
    ):
        """Compute per-label logits and, when labels are supplied, the loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: 1 for real tokens and 0 for padding. When omitted,
                every position is counted in the mean.
            token_type_ids: Segment ids forwarded to the encoder.
            labels: Optional 0/1 targets with the same shape as the logits.

        Returns:
            An object exposing ``logits`` and, only when ``labels`` is given,
            ``loss`` as attributes.
        """
        # Local import keeps this block self-contained.
        from types import SimpleNamespace

        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        last_hidden_state = bert_output.last_hidden_state

        # The signature allows attention_mask=None; without this fallback the
        # pooling below would fail on None.
        if attention_mask is None:
            attention_mask = torch.ones(
                last_hidden_state.shape[:2],
                dtype=torch.long,
                device=last_hidden_state.device,
            )

        # Mean over the non-[PAD] tokens. The token count is clamped to 1 so a
        # row whose mask is all zeros yields zeros instead of NaN (0 / 0).
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        token_count = mask.sum(1).clamp(min=1)
        average_hidden_state = (last_hidden_state * mask).sum(1) / token_count

        # Linear projection to one logit per label
        scores = self.linear(average_hidden_state)

        # Output format: attribute access, as with the Hugging Face model outputs
        output = {"logits": scores}
        if labels is not None:
            output["loss"] = torch.nn.BCEWithLogitsLoss()(scores, labels.float())

        # Return an instance rather than a dynamically created class.
        return SimpleNamespace(**output)

In [8]:
# Load the tokenizer and build the two-label classifier from the same checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
bert_scml = BertForSequenceClassificationMultiLabel(MODEL_NAME, num_labels=2)

# On Colab, run the model on the GPU when one is available; the model and its
# input tensors must be on the same device for the forward pass to succeed.
if ON_COLAB and torch.cuda.is_available():
    bert_scml = bert_scml.cuda()

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Two example sentences, each with a row of two binary labels.
text_list = [
    "今日の仕事はうまくいったが、体調があまり良くない。",
    "昨日は楽しかった。",
]
labels_list = [
    [1, 1],
    [0, 1],
]

# Tokenize as a single batch, padding to the longest sentence.
encoding = tokenizer(text_list, padding="longest", return_tensors="pt")
labels = torch.tensor(labels_list)

# Place the inputs on whichever device the model's parameters live on, so the
# forward pass never mixes CPU and GPU tensors.
device = next(bert_scml.parameters()).device
encoding = {k: v.to(device) for k, v in encoding.items()}
labels = labels.to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    output = bert_scml(**encoding)
scores = output.logits

# A logit above 0 corresponds to a sigmoid probability above 0.5.
labels_predicted = (scores > 0).int()

# Exact-match accuracy: a sentence counts as correct only if all of its labels
# are predicted correctly.
num_correct = (labels_predicted == labels).all(-1).sum().item()
accuracy = num_correct / labels.size(0)

print(labels_predicted)
print(accuracy)

tensor([[1, 0],
        [1, 0]], dtype=torch.int32)
0.0


In [17]:
# Re-tokenize the example batch and attach the labels so the model also
# returns a loss.
encoding = tokenizer(text_list, padding="longest", return_tensors="pt")
encoding["labels"] = torch.tensor(labels_list)

# Place every tensor on the device the model's parameters live on, so the
# forward pass never mixes CPU and GPU tensors.
device = next(bert_scml.parameters()).device
encoding = {k: v.to(device) for k, v in encoding.items()}

output = bert_scml(**encoding)
loss = output.loss
print(loss)


tensor(0.7120, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
