In [None]:
pip install tokenizers

In [None]:
pip install transformers

In [None]:
import string

# Load the raw training corpus: one list entry per line, trailing newlines kept.
# The `with` block closes the file handle as soon as the lines have been read.
with open("train.fo1.txt", "r", encoding="utf-8") as train_file:
  train_sents = list(train_file)

# Placeholder string; not used by any of the cells visible in this notebook.
train_data  = " "

In [None]:
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Train a byte-level BPE tokenizer directly on the corpus file.
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
    files="train.fo1.txt",
    vocab_size=33_278,
    min_frequency=2,
    # The masking code later in the notebook relies on the ids of these tokens,
    # so the order of this list matters.
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

In [None]:
# save_model() writes the trained vocabulary files into an existing directory,
# so create the target first to let a fresh run succeed.
import os

os.makedirs('tokenizer', exist_ok=True)
tokenizer.save_model('tokenizer')

In [None]:
from transformers import RobertaTokenizer

In [None]:
# Reload the vocabulary saved in the 'tokenizer' directory as a Transformers RoBERTa
# tokenizer. From here on `tokenizer` refers to this object, not the BPE trainer.
# `model_max_length` is the current name of the deprecated `max_len` keyword.
tokenizer = RobertaTokenizer.from_pretrained('tokenizer', model_max_length=512)

In [None]:
# Smoke test: encode a short string with the reloaded tokenizer and display the result.
tokens = tokenizer('as')
tokens

In [None]:
#@title Masked-LM inputs and labels
# labels == input_ids (the original, unmasked token ids)
# input_ids -> MLM (a copy of those ids with random tokens masked)

In [None]:
import torch
# By default 15% of the eligible tokens are masked.
def mLm(tensor, mask_prob=0.15, mask_token_id=4):
  """Randomly replace token ids with the mask token id, in place.

  Each position is selected independently with probability ``mask_prob``.
  Positions holding an id of 2 or lower are never selected; assuming ids
  follow the order of the special-token list used when the tokenizer is
  trained in this notebook, those are ``<s>`` (0), ``<pad>`` (1) and
  ``</s>`` (2).

  Args:
    tensor: integer tensor of token ids, of any shape. It is modified in
      place, so pass a clone when the original ids are still needed.
    mask_prob: probability of masking each eligible position.
    mask_token_id: id written into the selected positions (4 is the index
      of ``<mask>`` in the notebook's special-token list).

  Returns:
    The same tensor object, after masking.
  """
  # Draw the random numbers on the tensor's own device so the comparison below
  # never mixes devices.
  rand = torch.rand(tensor.shape, device=tensor.device)
  mask_arr = (rand < mask_prob) & (tensor > 2)
  # One boolean-mask assignment replaces the per-row Python loop and works for
  # tensors of any rank.
  tensor[mask_arr] = mask_token_id
  return tensor


In [None]:
# Tokenize the whole corpus once and derive the three tensors used for MLM training.
# Each result is kept in a one-element list because the next cell merges the lists
# with torch.cat.
input_ids = []
mask = []
labels = []

# NOTE(review): this resets the list filled by the earlier corpus-reading cell;
# nothing below reads it.
train_sents = []
with open("train.fo1.txt", "r", encoding = "utf8") as file:
  # Drop blank lines, including the empty string that a trailing newline leaves
  # after split('\n'); they would otherwise become all-padding samples.
  sents = [line for line in file.read().split('\n') if line.strip()]

if not sents:
  raise ValueError("train.fo1.txt contains no non-empty lines to tokenize")

# Tokenize after the file has been closed; every sentence is padded or truncated to 512 ids.
sample = tokenizer(sents, max_length=512, padding='max_length', truncation=True,return_tensors='pt')

# NOTE(review): labels are the full id tensor, padding included, so the model's loss
# also covers <pad> positions. Hugging Face models ignore label -100 if that is not
# intended — confirm.
labels.append(sample.input_ids)
mask.append(sample.attention_mask)
# mLm masks in place, so it is given a detached copy and the labels stay intact.
input_ids.append(mLm(sample.input_ids.detach().clone()))




In [None]:
# Collapse each list of tensors into a single tensor along the first axis.
# NOTE: run this cell once per run of the encoding cell — afterwards the names
# refer to tensors, no longer to lists.
input_ids, mask, labels = (
    torch.cat(chunks) for chunks in (input_ids, mask, labels)
)

In [None]:
# Compare the first ten ids of the masked inputs with the untouched labels.
# Both are printed explicitly: a notebook cell only echoes its last bare expression,
# so the first slice would otherwise never be shown.
print(input_ids[0][:10])
print(labels[0][:10])

tensor([    0,  3221, 27251,   546,  7757,   474,   539,   225,     3,  7775])

In [None]:
# Bundle the three tensors under the keys that the training loop reads from each batch.
encodings = dict(
    input_ids=input_ids,
    attention_mask=mask,
    labels=labels,
)

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Map-style dataset over a dict of pre-built tensors.

  `encodings` maps a field name to a tensor; sample `i` is the dict of every
  field indexed at `i`. The sample count is taken from the 'input_ids' entry
  (assumes all tensors share the same first dimension).
  """

  def __init__(self, encodings):
    # Stored as given; no copy is made.
    self.encodings = encodings

  def __len__(self):
    # The size of the first axis of 'input_ids' is the number of samples.
    ids = self.encodings['input_ids']
    return ids.shape[0]

  def __getitem__(self, i):
    # Index every field at the same position so a sample keeps all its parts.
    item = {}
    for key, tensor in self.encodings.items():
      item[key] = tensor[i]
    return item

In [None]:
# Wrap the encodings dict so samples can be fetched by integer index.
dataset = Dataset(encodings)

In [None]:
# Serve the dataset in shuffled mini-batches of 16 samples.
dataloader = torch.utils.data.DataLoader(dataset, batch_size = 16, shuffle = True)

In [None]:
from transformers import RobertaConfig

In [None]:
# Architecture of the model that is trained from scratch below.
config = RobertaConfig(
    vocab_size=33_278,            # same value as requested when training the tokenizer
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    # NOTE(review): 2 more than the 512-token max_length used for encoding —
    # presumably RoBERTa's position offset; confirm.
    max_position_embeddings=514,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaForMaskedLM

In [None]:
# Build a RoBERTa masked-LM model from the config alone; no pretrained checkpoint is loaded here.
model = RobertaForMaskedLM(config)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the model to the selected device. As the cell's last expression, the
# returned module is also echoed in the cell output.
model.to(device)

In [None]:
from transformers import AdamW

In [None]:
# Switch the model to training mode before optimisation starts.
model.train()

In [None]:
# AdamW over all model parameters with a fixed learning rate of 1e-6.
# NOTE(review): the AdamW shipped with transformers is deprecated upstream in favour
# of torch.optim.AdamW — confirm against the installed transformers version.
optim = AdamW(model.parameters(), lr=1e-6)

In [None]:
import torch.nn as nn
# Standalone cross-entropy criterion. The training loop below takes its loss from
# `outputs.loss`, so this object is not used there.
criterion = nn.CrossEntropyLoss()

In [None]:
# Number of full passes over the dataloader.
epochs = 2
# Step counter; the training loop in this notebook neither updates nor reads it.
step = 0

In [None]:
from tqdm import tqdm

# Train for `epochs` passes over the dataloader, with one tqdm progress bar per epoch.
for epoch in range(epochs):
  loop = tqdm(dataloader, leave=True, desc=f'Epoch {epoch}')
  for batch in loop:
    optim.zero_grad()

    # Batch-local names: reusing `input_ids` / `mask` / `labels` here would overwrite
    # the full-dataset tensors of the same names built earlier in the notebook.
    batch_input_ids = batch['input_ids'].to(device)
    batch_mask = batch['attention_mask'].to(device)
    batch_labels = batch['labels'].to(device)

    # Passing `labels` makes the model return the loss along with its predictions.
    outputs = model(batch_input_ids, attention_mask=batch_mask, labels=batch_labels)
    loss = outputs.loss
    loss.backward()
    optim.step()

    # Show the latest loss next to the progress bar.
    loop.set_postfix(loss=loss.item())

In [None]:
# Write the trained model into './tokenizer', the same directory the tokenizer
# files were saved to earlier in the notebook.
model.save_pretrained('./tokenizer')

In [None]:
import gc
# Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()