<a href="https://colab.research.google.com/github/straxFromIbr/NLP_with_BERT/blob/main/Section6/6_BERTwithLiveDoor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# True when the COLAB_GPU environment variable is present
# (presumably only set inside Google Colab runtimes — TODO confirm).
ON_COLAB = "COLAB_GPU" in os.environ

## ライブラリインストール

In [4]:
if ON_COLAB:
    !pip install -U pip 2>&1 >/dev/null
    !pip install \
        transformers==4.5.0 \
        fugashi==1.1.0 \
        ipadic==1.0.0  \
        pytorch-lightning==1.2.7 2>&1 >/dev/null 



## データセットのダウンロードと解凍

In [5]:
if not os.path.exists('ldcc-20140209.tar.gz'):
    ! wget 'https://rondhuit.com/download/ldcc-20140209.tar.gz'  >/dev/null 2>&1
    ! tar -zxf 'ldcc-20140209.tar.gz'  >/dev/null 2>&1

In [None]:
# Show the first seven lines of one article file to inspect the raw layout.
! head -n7 './text/it-life-hack/it-life-hack-6342280.txt'

In [6]:
import random
import glob
from tqdm import tqdm

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification

# Name of the pretrained checkpoint; used in this notebook for both the
# tokenizer and the classification model.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

In [7]:
# Label list: each category's position in this list is used as its integer
# class label when the dataset is built.
category_list = [
    "dokujo-tsushin",
    "it-life-hack",
    "kaden-channel",
    "livedoor-homme",
    "movie-enter",
    "peachy",
    "smax",
    "sports-watch",
    "topic-news",
]
# Load the tokenizer that belongs to the pretrained checkpoint MODEL_NAME.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)


Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [8]:
# Data preparation: tokenize every article and attach its integer class label.
max_length = 128  # each text is truncated / padded to exactly this many tokens
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    # sorted() keeps the pre-shuffle order independent of the order in which
    # the filesystem happens to list the files.
    for file in sorted(glob.glob(f"./text/{category}/{category}*")):
        # Use a context manager so the handle is closed promptly, and name the
        # encoding instead of relying on the locale default
        # (corpus assumed to be UTF-8 — TODO confirm).
        with open(file, encoding="utf-8") as f:
            lines = f.read().splitlines()
        # Drop the first three lines (presumably URL / timestamp / title
        # header lines — verify against the sample shown earlier) and keep the
        # rest as the text to classify.
        text = "\n".join(lines[3:])
        encoding = tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        encoding["labels"] = label
        # Convert every field to a tensor so DataLoader can batch the dicts.
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)

# Fail with a clear message instead of an IndexError when nothing was loaded.
assert dataset_for_loader, "no articles found under ./text — was the corpus downloaded and extracted?"
print(dataset_for_loader[0])


100%|██████████| 9/9 [00:36<00:00,  4.03s/it]

{'input_ids': tensor([    2,    70,  1134,  5328,    49,  2198,   633,     5, 10341,    64,
           12,     6,    25,   181,     5,  2198,    11,   830,   969,    13,
         4534,  7393,    45,    14,  3731,    16,    33,     8, 13601,     5,
          881,    28,   130,     6,  4292,    11, 13097,    82,     7, 11211,
           11,    15,    16,    33,   969,    28,  8326,   312,     6,  6175,
           14,  2198,     7,   297,     6,   811,    13, 16333,    26,   191,
           16,    33,   969,    28,     8,  2198,    11,    25,   181,   830,
         1515,     9,   976,    75,    14,     6,  7309,     6, 12453,    12,
           88, 28580, 28478, 28580,    13,    15,    10, 14901,    11,    15,
           16,    33,     5,    14,  5648,    81,     8,  7033,     6,    88,
        28580, 28478, 28580,    13,    15,    16,    33,     5,    29,  2935,
           25,   181,     5,  2198,    11,    34,    99, 28659,     9,  2935,
           25,   181,     5,  2198,    11,   830, 




In [9]:
# Shuffle with a dedicated, seeded generator so the train/val/test split is
# repeatable for a given input order, without touching the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset_for_loader)

# 60% training / 20% validation / remainder for test.
n = len(dataset_for_loader)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train : n_train + n_val]
dataset_test = dataset_for_loader[n_train + n_val :]

# Only the training loader reshuffles; the evaluation loaders use larger batches.
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)


## PyTorch Lightningで文章分類モデルを構築する

In [13]:
class BertForSequenceClassification_pl(pl.LightningModule):
    """Lightning module wrapping a BERT sequence-classification model.

    The wrapped model is stored in ``self.bert_sc``. Every step method receives
    a batch dict and forwards its entries to the model as keyword arguments.
    """

    def __init__(self, model_name, num_labels, lr) -> None:
        """
        Args:
            model_name: name of the pretrained model to load.
            num_labels: number of labels, forwarded to ``from_pretrained``.
            lr: learning rate handed to the optimizer.
        """
        super().__init__()
        # Stores the ``__init__`` arguments; ``lr`` is read back later through
        # ``self.hparams``.
        self.save_hyperparameters()

        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def training_step(self, batch, batch_idx):
        """Called for each training step: logs the loss as ``train_loss`` and returns it."""
        train_loss = self.bert_sc(**batch).loss
        self.log("train_loss", train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Called for each validation step: logs the loss as ``val_loss`` and returns it."""
        val_loss = self.bert_sc(**batch).loss
        self.log("val_loss", val_loss)
        return val_loss

    def test_step(self, batch, batch_idx):
        """Called for each test step: computes and logs the batch's label accuracy."""
        # Take the labels out of the batch before calling the model; only the
        # logits are needed here.
        targets = batch.pop("labels")
        logits = self.bert_sc(**batch).logits
        hits = logits.argmax(-1).eq(targets).sum().item()
        # Fraction of correct predictions in this batch. The metric key is
        # spelled "accurancy" and is kept unchanged so existing readers of the
        # logged value keep working.
        self.log("accurancy", hits / targets.size(0))

    def configure_optimizers(self):
        """Return an Adam optimizer over the module's parameters with the stored learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


## CheckpointのCallbackとTrainerの作成
- kerasとノリが似てる

In [14]:
# Keep only the single checkpoint (weights only) with the lowest validation
# loss, written under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    save_weights_only=True,
    dirpath="model/",
)

# Always create the trainer so the training cell also works outside Colab, and
# request a GPU only when CUDA is actually available.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=10,
    callbacks=[checkpoint],
)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Create the model; the number of labels is derived from the category list so
# the two cannot drift apart.
model = BertForSequenceClassification_pl(
    MODEL_NAME, num_labels=len(category_list), lr=1e-5
)

# Train (fine-tune) on the training loader, validating on the validation loader.
hist = trainer.fit(model, dataloader_train, dataloader_val)


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Report where the best checkpoint was saved and the validation loss it achieved.
best_model_path = checkpoint.best_model_path
print('最良モデルのチェックポイント', best_model_path)
# Attribute name repaired (`best_model_score`); the label's mis-converted
# 裁量 is corrected to 最良 ("best").
print('最良モデルでの検証データに対する損失', checkpoint.best_model_score)