In [1]:
from transformers import BertTokenizer, BertForPreTraining, XLNetTokenizer, XLNetModel, AlbertForPreTraining, AlbertTokenizer, BertForMaskedLM, AlbertForMaskedLM, RobertaForMaskedLM, RobertaTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch import linalg as LA
import os
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
model = AutoModelForMaskedLM.from_pretrained('albert-base-v2')
model_name = 'albert_debiased'
file_name = "finetuning_sentences.txt"
#tok = 'Tokenizer_roberta_finetune_100s0a'

with open('data/{}'.format(file_name), 'r') as fp:
    text = fp.read().split('\n')

In [2]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [3]:
#bag_size

In [4]:

import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [sentence for sentence in paragraph.split('.') if sentence != '']
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)


In [5]:
#len(sentence_a)

In [6]:
#for i in range(3):
    #print(label[i])
    #print(sentence_a[i] + '\n---')
    #print(sentence_b[i] + '\n')

In [7]:
import warnings
warnings.filterwarnings('ignore')
sentences = [sent for sent in text]
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
#inputs = tokenizer(sentences, return_tensors='pt', max_length=128, truncation=True, padding='max_length')

In [8]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

inputs.next_sentence_label[:10]

tensor([[1],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1]])

In [10]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [11]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)


In [12]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())

selection[:2]

[[1, 8, 11], [0, 4, 9]]

In [13]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [14]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

dataset = OurDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [15]:
print(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

cuda


AlbertForMaskedLM(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
  

In [16]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

In [17]:
from tqdm import tqdm  # for our progress bar

if (file_name == "finetuning_sentences.txt"):
    epochs = 1
else:
    epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optimizer.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        #token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        #next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        #token_type_ids=token_type_ids,
                        #next_sentence_label=next_sentence_label,
                        #sentence_order_label=next_sentence_label,
                        labels=labels
                       )
        # extract loss
        loss = outputs[0]
        #loss = LA.vector_norm(LA.matrix_norm(outputs[0]))
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optimizer.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|████████████████████████████████████████████████████████| 12926/12926 [54:20<00:00,  3.96it/s, loss=1.13]


In [18]:
model.save_pretrained('models\{}'.format(model_name))
tokenizer.save_pretrained('tokenizers\{}'.format(model_name))

('tokenizers\\albert_debiased\\tokenizer_config.json',
 'tokenizers\\albert_debiased\\special_tokens_map.json',
 'tokenizers\\albert_debiased\\spiece.model',
 'tokenizers\\albert_debiased\\added_tokens.json',
 'tokenizers\\albert_debiased\\tokenizer.json')

In [19]:
torch.cuda.empty_cache()