<a href="https://colab.research.google.com/github/qawnaoya/ColabGPT2Example/blob/main/bert_train001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[ja]==4.3.3 torch==1.8.1 sentencepiece==0.1.91



In [2]:
from google.colab import drive
import pandas as pd

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
training_data = pd.read_csv('/content/drive/MyDrive/Texts/negaposi.csv')
training_data.head()

Unnamed: 0,query,title,label
0,negaposi,面白かった,0
1,negaposi,楽しかった,0
2,negaposi,退屈だった,1
3,negaposi,悲しかった,1
4,negaposi,満喫した,0


In [5]:
print(len(training_data["query"].unique()))
training_data[["title", "label"]].groupby("label").count()

1


Unnamed: 0_level_0,title
label,Unnamed: 1_level_1
0,3
1,3


In [6]:
from sklearn.model_selection import train_test_split
train_queries, val_queries, train_docs, val_docs, train_labels, val_labels = train_test_split(
    training_data["query"].tolist(),
    training_data["title"].tolist(),
    training_data["label"].tolist(),
    test_size=.2
)

In [7]:
from transformers import BertJapaneseTokenizer, BertForMaskedLM, pipeline

In [8]:
from transformers import BertJapaneseTokenizer, BertForMaskedLM

model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_queries, train_docs, truncation=True, padding='max_length', max_length=128)
val_encodings = tokenizer(val_queries, val_docs, truncation=True, padding='max_length', max_length=128)

In [9]:
import torch

class NPDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NPDataset(train_encodings, train_labels)
val_dataset = NPDataset(val_encodings, val_labels)

In [10]:
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [11]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Models/',          # output directory
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,              # limit the total amount of checkpoints. Deletes the older checkpoints.
)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.622909,0.5949,3.362
2,No log,0.622928,0.6312,3.169
3,No log,0.622966,0.6101,3.278
4,No log,0.623018,0.5908,3.385
5,No log,0.623088,0.6255,3.198
6,No log,0.623177,0.6145,3.255
7,No log,0.623285,0.6047,3.308
8,No log,0.623407,0.5998,3.335
9,No log,0.623546,0.6097,3.28
10,No log,0.623706,0.6042,3.31


TrainOutput(global_step=10, training_loss=0.7404057502746582, metrics={'train_runtime': 17.9831, 'train_samples_per_second': 0.556, 'total_flos': 3398212055040, 'epoch': 10.0})

In [14]:
# 実験
nlp = pipeline("sentiment-analysis",model=model,tokenizer=tokenizer)

In [15]:
nlp("楽しかった")

[{'label': 'LABEL_0', 'score': 0.5115618705749512}]