In [10]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import pickle

In [None]:
with open("../saved_data/e028_use_markov/train_yad_no_list.pkl", "rb") as f:
    train_data = pickle.load(f)
with open("../saved_data/e028_use_markov/test_yad_no_list.pkl", "rb") as f:
    test_data = pickle.load(f)

In [19]:
import torch
import random


class NumbersDataset(Dataset):
    def __init__(self, sequences, tokenizer, max_length=11):
        self.sequences = sequences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        sequence = self.sequences[index]
        sequence_str = " ".join(map(str, sequence))
        inputs = self.tokenizer(
            sequence_str,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # Randomly mask a token
        mask_idx = random.randint(0, self.max_length - 1)
        inputs.input_ids[0][mask_idx] = self.tokenizer.mask_token_id
        return {key: val[0] for key, val in inputs.items()}


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = NumbersDataset(sequences=train_data, tokenizer=tokenizer)

In [23]:
next(iter(dataset))

{'input_ids': tensor([  101, 23688,  2629, 19151,   103,   102,     0,     0,     0,     0,
             0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])}

In [21]:
train_data

[[2395, 4101],
 [13535, 8253],
 [123, 4863],
 [8475, 1652],
 [96, 898, 96],
 [6868, 4823],
 [8602, 10378],
 [13106, 10362],
 [12062, 1227],
 [4866, 175],
 [2043, 1209],
 [756, 8822, 756],
 [7614, 13754],
 [7561, 3848],
 [143, 7913, 143],
 [13678, 2220],
 [569, 7246, 569],
 [7641, 7062],
 [10620, 7394],
 [2441, 13106],
 [3228, 4985],
 [1383, 7600, 1383],
 [2452, 11796],
 [9549, 11234],
 [8833, 8322],
 [13770, 8567],
 [4177, 13555],
 [3079, 7920],
 [5583, 6001, 5583],
 [13700, 9163, 13700],
 [712, 6787, 712, 6787],
 [13776, 12125],
 [10483, 6333],
 [635, 10834],
 [9020, 11125],
 [10749, 13728],
 [2927, 11037, 2927, 11037, 2927, 11037, 2927],
 [12211, 3238],
 [13579, 10923],
 [11237, 12862, 11237],
 [13120, 7725],
 [9859, 9419],
 [7178, 12843, 7178],
 [4885, 1448],
 [12333, 8346, 12333],
 [12707, 11273],
 [2278, 7123, 2278],
 [2753, 2222, 1024],
 [5710, 10006],
 [618, 1199],
 [6441, 307, 6441, 307],
 [9290, 4323],
 [13079, 1092, 7694],
 [3901, 13537, 3901],
 [3404, 3424],
 [9515, 13463],


In [12]:
from transformers import BertForMaskedLM, AdamW

model = BertForMaskedLM.from_pretrained("bert-base-uncased")
optimizer = AdamW(model.parameters(), lr=1e-5)

# トレーニングループ
num_epochs = 5
batch_size = 2
for epoch in range(num_epochs):
    for inputs in DataLoader(dataset, batch_size=batch_size, shuffle=True):
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'NoneType' object has no attribute 'backward'

In [14]:
inputs

{'input_ids': tensor([[103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103],
         [103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [None]:
with torch.no_grad():
    for test_sequence in test_data:
        inputs = tokenizer(test_sequence, return_tensors="pt")
        outputs = model(**inputs)
        predictions = outputs.logits
        # 予測されたトークンのIDを取得し、それを数字に変換
        predicted_indices = torch.topk(predictions[0, -1, :], 10).indices
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)