In [2]:
import random
import re
import unicodedata
from types import SimpleNamespace

import polars as pl
import pytorch_lightning as L
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import BertJapaneseTokenizer, BertModel

# Pre-trained Japanese model; this name is passed to `from_pretrained` below.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 7

In [61]:
# Read the processed train and test CSV files into polars DataFrames.
df_train, df_test = (
    pl.read_csv(csv_path)
    for csv_path in ("../data/processed/train.csv", "../data/processed/test.csv")
)

In [73]:
class BertForSequenceClassificationMultiLabel(torch.nn.Module):
  """BERT encoder followed by mean pooling and a linear multi-label head.

  The final-layer hidden states are averaged over the non-padding tokens
  (as indicated by `attention_mask`) and projected to `num_labels` logits.
  When `labels` is supplied, a `BCEWithLogitsLoss` is computed as well.
  """

  def __init__(self, modl_name, num_labels):
    """Build the encoder and the classification head.

    Args:
      modl_name: model name or path handed to `BertModel.from_pretrained`.
        (The misspelled parameter name is kept so existing keyword callers
        keep working.)
      num_labels: number of output logits (one per label).
    """
    super().__init__()
    # Load the encoder named by the caller instead of silently falling back
    # to a module-level constant.
    self.bert = BertModel.from_pretrained(modl_name)
    # Linear classification head on top of the pooled representation.
    self.linear = torch.nn.Linear(
        self.bert.config.hidden_size, num_labels
    )

  def forward(
      self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      labels=None
  ):
    """Run the encoder and return logits (and the loss when labels are given).

    Args:
      input_ids: token ids forwarded to the encoder.
      attention_mask: 1 for real tokens, 0 for padding. When omitted, every
        position is treated as a real token.
      token_type_ids: segment ids forwarded to the encoder.
      labels: optional targets; cast to float for `BCEWithLogitsLoss`.

    Returns:
      An object exposing `.logits` and, when `labels` is not None, `.loss`.
    """
    # Feed the batch through BERT and take the final-layer hidden states.
    bert_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids)
    last_hidden_state = bert_output.last_hidden_state

    # No mask supplied: average over every position.
    if attention_mask is None:
      attention_mask = torch.ones(
          last_hidden_state.shape[:2],
          dtype=torch.long,
          device=last_hidden_state.device,
      )

    # Average the hidden states over non-[PAD] tokens only. The token count
    # is clamped to 1 so a fully masked row yields zeros instead of NaN.
    token_counts = attention_mask.sum(1, keepdim=True).clamp(min=1)
    averaged_hidden_state = \
      (last_hidden_state * attention_mask.unsqueeze(-1)).sum(1) \
      / token_counts

    # Linear projection to one logit per label.
    scores = self.linear(averaged_hidden_state)

    output = {'logits': scores}

    # Compute the loss only when labels are part of the input.
    if labels is not None:
      output['loss'] = torch.nn.BCEWithLogitsLoss()(scores, labels.float())

    # Return an instance with attribute access (`.logits`, `.loss`) rather
    # than creating a brand-new class on every forward call.
    return SimpleNamespace(**output)


In [74]:
class BertForSequenceClassificationMultiLabel_pl(L.LightningModule):
  """Lightning wrapper around `BertForSequenceClassificationMultiLabel`.

  Logs `train_loss` and `val_loss` during fitting and an exact-match
  `accuracy` during testing; optimizes all parameters with Adam.
  """

  def __init__(self, model_name, num_labels, lr):
    """Store the hyperparameters and build the wrapped classifier.

    Args:
      model_name: forwarded as the first argument of the wrapped model.
      num_labels: number of output logits of the wrapped model.
      lr: learning rate read back in `configure_optimizers`.
    """
    super().__init__()
    # Makes model_name / num_labels / lr available through `self.hparams`.
    self.save_hyperparameters()
    self.bert_scml = BertForSequenceClassificationMultiLabel(
        model_name, num_labels=num_labels
    )

  def training_step(self, batch, batch_idx):
    """Return the training loss for one batch and log it as `train_loss`."""
    loss = self.bert_scml(**batch).loss
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    """Log the validation loss for one batch as `val_loss`."""
    self.log('val_loss', self.bert_scml(**batch).loss)

  def test_step(self, batch, batch_idx):
    """Log the share of samples whose labels are all predicted correctly."""
    # Labels are removed from the batch so the model computes logits only.
    gold = batch.pop('labels')
    logits = self.bert_scml(**batch).logits
    # A positive logit is read as "label present".
    predicted = (logits > 0).int()
    # A sample counts as correct only if every one of its labels matches.
    exact_matches = (predicted == gold).all(-1).sum().item()
    self.log('accuracy', exact_matches / logits.size(0))

  def configure_optimizers(self):
    """Adam over all parameters with the stored learning rate."""
    return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

# Keep only the single checkpoint with the lowest `val_loss` (weights only),
# written under `model/`.
checkpoint = L.callbacks.ModelCheckpoint(
    dirpath='model/',
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
)

# Train for five epochs with the checkpoint callback attached.
trainer = L.Trainer(max_epochs=5, callbacks=[checkpoint])


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [63]:
def _encode_fi_labels(fi_values, fi_code="G06V30"):
    """Return a 0/1 list: 1 where `fi_code` occurs in the FI string, else 0.

    Args:
        fi_values: iterable of FI strings, one per document.
        fi_code: substring to look for in each FI string.
    """
    return [int(fi_code in fi) for fi in fi_values]


# Binary targets for the train and test splits, built by one shared helper
# instead of two copy-pasted loops.
G06V30_l_train = _encode_fi_labels(df_train["FI"])
G06V30_l_test = _encode_fi_labels(df_test["FI"])

In [71]:
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
# Every text is padded / truncated to this many tokens.
max_length = 256
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42


def _build_examples(texts, labels, text_tokenizer, pad_to):
  """Tokenize each text and pair it with its label as a dict of tensors.

  Args:
    texts: iterable of input strings.
    labels: iterable of scalar labels, aligned with `texts`.
    text_tokenizer: callable tokenizer applied to each text.
    pad_to: length every encoding is padded / truncated to.

  Returns:
    A list with one `{name: torch.Tensor}` dict per example; the label is
    stored under 'labels' as a one-element tensor.

  Raises:
    ValueError: if `texts` and `labels` differ in length.
  """
  if len(texts) != len(labels):
    raise ValueError(
        f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
    )
  examples = []
  for text, label in zip(texts, labels):
    encoding = text_tokenizer(
        text,
        max_length=pad_to,
        padding='max_length',
        truncation=True
    )
    encoding['labels'] = [label]
    examples.append({k: torch.tensor(v) for k, v in encoding.items()})
  return examples


# Bring both splits into the format expected by the data loaders.
train_dataset_for_loader = _build_examples(
    df_train["summary"], G06V30_l_train, tokenizer, max_length
)
test_dataset_for_loader = _build_examples(
    df_test["summary"], G06V30_l_test, tokenizer, max_length
)

# Split the training data 70/30 into train and validation, using a seeded
# shuffle so the split is reproducible.
random.Random(SPLIT_SEED).shuffle(train_dataset_for_loader)
n = len(train_dataset_for_loader)
n_train = int(0.7*n)
dataset_train = train_dataset_for_loader[:n_train]  # training data
dataset_val = train_dataset_for_loader[n_train:]  # validation data
dataset_test = test_dataset_for_loader  # test data

# Wrap the datasets in data loaders; only the training loader shuffles.
dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)


In [75]:
model = BertForSequenceClassificationMultiLabel_pl(
    MODEL_NAME,
    num_labels=1,   # number of target FI codes
    lr=1e-5
)
trainer.fit(model, dataloader_train, dataloader_val)
# Evaluate the best checkpoint tracked by the ModelCheckpoint callback,
# stated explicitly rather than relying on the implicit default.
test = trainer.test(dataloaders=dataloader_test, ckpt_path='best')
# Four decimals: with two, a value such as 0.9952 is displayed as 1.00.
print(f'Accuracy: {test[0]["accuracy"]:.4f}')



  | Name      | Type                                    | Params | Mode 
------------------------------------------------------------------------------
0 | bert_scml | BertForSequenceClassificationMultiLabel | 110 M  | train
------------------------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.472   Total estimated model params size (MB)
2         Modules in train mode
228       Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (43) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 4: 100%|██████████| 43/43 [14:30<00:00,  0.05it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 43/43 [14:30<00:00,  0.05it/s, v_num=0]

Restoring states from the checkpoint path at /Users/suizushinsaku/develop/patent/notebook/model/epoch=4-step=215.ckpt
Loaded model weights from the checkpoint at /Users/suizushinsaku/develop/patent/notebook/model/epoch=4-step=215.ckpt





/Users/suizushinsaku/develop/patent/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1/1 [00:39<00:00,  0.03it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.9951691031455994
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Accuracy: 1.00


In [88]:
# Load the best checkpoint and switch the model to evaluation mode.
best_model_path = checkpoint.best_model_path
model = BertForSequenceClassificationMultiLabel_pl.load_from_checkpoint(best_model_path)
model.eval()
bert_scml = model.bert_scml

# One test summary whose FI contains "G06V30" (the second such row).
text_list = [df_test.filter(pl.col("FI").str.contains("G06V30"))["summary"][1]]
# Encode the text. Truncate to the same `max_length` used when building the
# training data so an over-long summary cannot exceed the encoder's limit.
encoding = tokenizer(
    text_list,
    max_length=max_length,
    padding='longest',
    truncation=True,
    return_tensors='pt'
)
# Place the inputs on the same device as the loaded model.
encoding = {k: v.to(model.device) for k, v in encoding.items()}

# Feed the data to BERT and obtain the classification scores.
with torch.no_grad():
  output = bert_scml(**encoding)
scores = output.logits
# A positive logit is read as "label present".
labels_predicted = (scores > 0).int().tolist()

# Show the results.
for text, label in zip(text_list, labels_predicted):
  print('--')
  print(f'入力 : {text}')
  print(f'出力 : {label}')

--
入力 : テキストシーケンス方式であっても、文字認識結果として得られた文字が、認識対象における文字列画像のどの位置にあるかを把握するまでの時間を低減する。文字列を表す文字列画像から文字を認識する文字列画像認識において当該文字列画像に記載された文字列、および前記文字列画像における1文字以上の文字が含まれる部分文字列の位置を学習する学習モデル学習システムであって、前記部分文字列が2つ以上含まれる1行の文字列が書かれた文字列画像、前記文字列画像上に書かれている文字列及び前記文字列画像における各部分文字列の始端・終端位置情報を用いて学習することで学習モデルを生成する学習部を備える。
出力 : [1]


In [89]:
# Display the raw logits from the inference cell; values above 0 were mapped to label 1.
scores

tensor([[3.4610]])

In [85]:
# Display the summary text that was classified. `sample` is not defined in any
# cell of this notebook, so read the text from `text_list` instead.
text_list[0]

'テキストシーケンス方式であっても、文字認識結果として得られた文字が、認識対象における文字列画像のどの位置にあるかを把握するまでの時間を低減する。文字列を表す文字列画像から文字を認識する文字列画像認識において当該文字列画像に記載された文字列、および前記文字列画像における1文字以上の文字が含まれる部分文字列の位置を学習する学習モデル学習システムであって、前記部分文字列が2つ以上含まれる1行の文字列が書かれた文字列画像、前記文字列画像上に書かれている文字列及び前記文字列画像における各部分文字列の始端・終端位置情報を用いて学習することで学習モデルを生成する学習部を備える。'