In [1]:
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
from transformers import AutoTokenizer, BertJapaneseTokenizer
from src.utils import load_dataset
from src.train import SingleLabelBERTTrainer

In [2]:
# CONFIG
# Shared settings for every model comparison cell in this notebook.
TRAIN_DATA_DIR = Path('../data/train')
SEED = 2023
FOLDS = 5
# Prefer the second GPU (the original target). On a machine with a single
# visible GPU 'cuda:1' is not a valid device, so fall back to 'cuda:0';
# without CUDA, run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    DEVICE = torch.device('cpu')
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 100  # forwarded as `epochs`; presumably an upper bound given early stopping -- confirm in src.train
MAX_LENGTH = 512
EARLY_STOPPING_ROUNDS = 5
CRITERION = nn.CrossEntropyLoss()

# Read Data
df = load_dataset(TRAIN_DATA_DIR)
# Combined target: the string forms of the T, N and M columns concatenated.
df['TNM'] = df['T'].astype(str) + df['N'].astype(str) + df['M'].astype(str)
# Stratified, shuffled folds with a fixed seed so every model sees the same splits.
fold = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
# Integer id per distinct TNM string, numbered in order of first appearance in df.
labelTNM2id = {v: i for i, v in enumerate(df['TNM'].unique())}

In [3]:
# Tohoku University BERT v2 (base)
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v2'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertM

BEST Epochs: [12, 17, 10, 14, 13]
macro F1: 0.1649 Accuracy: 0.4762


In [4]:
# UTH-BERT (local checkpoint; MeCab is pointed at the NEologd and MANBYO dictionaries)
MODEL_NAME = '../data/UTH_BERT_BASE_512_MC_BPE_WWM_V25000_352K'
TOKENIZER = BertJapaneseTokenizer.from_pretrained(
    MODEL_NAME,
    mecab_kwargs={
        'mecab_option': '-d ../data/mecab-unidic-neologd -u ../data/MANBYO_201907_Dic-utf8.dic',
    },
)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at ../data/UTH_BERT_BASE_512_MC_BPE_WWM_V25000_352K were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ../data/UTH_BERT_BASE_512_MC_BPE_WWM_V25000_352K wer

BEST Epochs: [18, 21, 14, 19, 12]
macro F1: 0.1719 Accuracy: 0.4683


In [5]:
# JMedRoBERTa (sentencepiece vocabulary)
MODEL_NAME = 'alabnii/jmedroberta-base-sentencepiece-vocab50000'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at alabnii/jmedroberta-base-sentencepiece-vocab50000 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were no

BEST Epochs: [20, 18, 15, 18, 23]
macro F1: 0.1812 Accuracy: 0.4921


In [6]:
# JMedRoBERTa (MANBYO wordpiece vocabulary; MeCab loads the MANBYO user dictionary)
MODEL_NAME = 'alabnii/jmedroberta-base-manbyo-wordpiece-vocab50000'
TOKENIZER = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    mecab_kwargs={'mecab_option': '-u ../data/MANBYO_201907_Dic-utf8.dic'},
)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at alabnii/jmedroberta-base-manbyo-wordpiece-vocab50000 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at alabnii/jmedroberta-base-manbyo-wordpiece-vocab50000 and are newly initi

BEST Epochs: [13, 26, 13, 15, 15]
macro F1: 0.2194 Accuracy: 0.5238


In [7]:
# Tohoku University BERT v3 (base)
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v3'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertM

BEST Epochs: [14, 20, 10, 14, 12]
macro F1: 0.2176 Accuracy: 0.5000


In [8]:
# Tohoku University BERT v3, character-level (base)
MODEL_NAME = 'cl-tohoku/bert-base-japanese-char-v3'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-char-v3 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-char-v3 were not used when initiali

BEST Epochs: [17, 20, 17, 14, 12]
macro F1: 0.2986 Accuracy: 0.5952


In [9]:
# Tohoku University BERT v2 (large)
MODEL_NAME = 'cl-tohoku/bert-large-japanese-v2'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese-v2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-large-japanese-v2 were not used when initializing Ber

BEST Epochs: [7, 10, 6, 9, 9]
macro F1: 0.2070 Accuracy: 0.5317


In [10]:
# Tohoku University BERT v2, character-level (large)
MODEL_NAME = 'cl-tohoku/bert-large-japanese-char-v2'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

label_name = 'TNM'
# fold.split returns a one-shot generator, so it is rebuilt for each model.
cv = fold.split(df['text'], df[label_name])
trainer = SingleLabelBERTTrainer(
    model_name=MODEL_NAME,
    tokenizer=TOKENIZER,
    criterion=CRITERION,
    device=DEVICE,
    seed=SEED,
)
cv_predsTNM = trainer.training_cv(
    df,
    label_name,
    cv,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    max_length=MAX_LENGTH,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    label2id=labelTNM2id,
)

Some weights of the model checkpoint at cl-tohoku/bert-large-japanese-char-v2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-large-japanese-char-v2 were not used when initia

BEST Epochs: [12, 10, 8, 9, 6]
macro F1: 0.1980 Accuracy: 0.5159
