In [1]:
import os
import random

import sentencepiece as spm
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import ReformerConfig, ReformerModelWithLMHead, ReformerTokenizer

# Training hyper-parameters. Of these, only SEQ_LEN is referenced by the cells
# visible in this notebook; the others are presumably consumed by a training
# loop that is not shown here -- TODO confirm.
NUM_BATCHES = None
BATCH_SIZE = 6
GRADIENT_ACCUMULATE_EVERY = 3
LEARNING_RATE = 0.01
VALIDATE_EVERY  = 20
# Passed to the tokenizer as model_max_length and to the model config as
# max_position_embeddings. 4608 == 64 * 72, the axial_pos_shape set on the config.
SEQ_LEN = 4608

In [2]:
import torch

# Encoding
def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0):
    """Convert a batch of strings into padded byte-level id tensors.

    Every item is turned into bytes (``str`` items are UTF-8 encoded, ``bytes``
    items are used as-is) and each byte value is shifted by 2 to form its id.
    Rows are right-padded with ``pad_token_id`` up to the longest byte
    sequence in the batch.

    Args:
        list_of_strings: iterable of ``str`` or ``bytes`` items.
        pad_to_max_length: accepted for interface compatibility; it is not
            consulted -- rows are always padded to the longest item.
        pad_token_id: id written into the padded positions.

    Returns:
        ``(input_ids, attention_masks)``: two ``torch.long`` tensors of shape
        ``(number_of_items, longest_byte_length)``. The mask holds 1 for real
        bytes and 0 for padding. An empty batch yields two ``(0, 0)`` tensors.
    """
    # Convert to bytes *before* measuring. A non-ASCII character is a single
    # str element but several UTF-8 bytes, so sizing the rows from the str
    # length would make them too short for the byte ids written below.
    byte_strings = [
        item if isinstance(item, bytes) else str.encode(item)
        for item in list_of_strings
    ]
    max_length = max((len(item) for item in byte_strings), default=0)

    # create empty tensors
    attention_masks = torch.zeros((len(byte_strings), max_length), dtype=torch.long)
    input_ids = torch.full((len(byte_strings), max_length), pad_token_id, dtype=torch.long)

    for idx, item in enumerate(byte_strings):
        # Iterating over bytes yields ints in 0..255; ids 0 and 1 stay free
        # (decode() drops every id below 2).
        input_ids[idx, :len(item)] = torch.tensor([x + 2 for x in item], dtype=torch.long)
        attention_masks[idx, :len(item)] = 1

    return input_ids, attention_masks

# Decoding
def decode(outputs_ids):
    decoded_outputs = []
    for output_ids in outputs_ids.tolist():
        # transform id back to char IDs < 2 are simply transformed to ""
        decoded_outputs.append("".join([chr(x - 2) if x > 1 else "" for x in output_ids]))
    return decoded_outputs

In [3]:
encode(['ABCDEF'])

(tensor([[67, 68, 69, 70, 71, 72]]), tensor([[1, 1, 1, 1, 1, 1]]))

In [4]:
# Train a character-level SentencePiece model (28-entry vocabulary) on the
# residue file. The trainer is given the prefix "sequence_tokenizer"; the
# resulting "sequence_tokenizer.model" file is what the tokenizer loads below.
# The backslash-newlines sit inside the string literal, so the flags arrive as
# a single string separated by the continuation lines' leading spaces.
# NOTE(review): this retrains the tokenizer every time the cell runs.
spm.SentencePieceTrainer.Train("--input=./data/tokenizer_training/AAresiduals.txt \
                                --vocab_size=28 \
                                --model_prefix=sequence_tokenizer \
                                --model_type=char \
                                --character_coverage=1.0")
# Wrap the trained model in a Reformer tokenizer whose maximum length is SEQ_LEN.
tokenizer = ReformerTokenizer(vocab_file="sequence_tokenizer.model", do_lower_case=False, model_max_length=SEQ_LEN)

In [5]:
# Start from the published enwik8 Reformer configuration and adapt it to this
# notebook's sequence length and vocabulary.
configuration = ReformerConfig.from_pretrained("google/reformer-enwik8")
# 64 * 72 == 4608 == SEQ_LEN, so the axial position grid matches max_position_embeddings.
configuration.axial_pos_shape = (64, 72)
configuration.max_position_embeddings=SEQ_LEN
configuration.vocab_size=tokenizer.vocab_size
# Persist the modified configuration and read it back from disk.
configuration.save_pretrained('model/config_enwik8_modified/')
configuration = ReformerConfig.from_pretrained('model/config_enwik8_modified/')
# The model is constructed from the configuration object only; presumably no
# pretrained weights are loaded here -- TODO confirm.
model = ReformerModelWithLMHead(configuration)

In [6]:
model.train()

ReformerModelWithLMHead(
  (reformer): ReformerModel(
    (embeddings): ReformerEmbeddings(
      (word_embeddings): Embedding(28, 1024)
      (position_embeddings): AxialPositionEmbeddings(
        (weights): ParameterList(
            (0): Parameter containing: [torch.FloatTensor of size 64x1x256]
            (1): Parameter containing: [torch.FloatTensor of size 1x72x768]
        )
      )
    )
    (encoder): ReformerEncoder(
      (layers): ModuleList(
        (0): ReformerLayer(
          (attention): ReformerAttention(
            (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (self_attention): LocalSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (output): ReformerSelfOutput(
              (dense): Linear(in_features=1024,

In [12]:
tokenizer.max_len

4608

In [13]:
# Encode one short sequence padded to the tokenizer's maximum length, then add
# a leading batch dimension (the next cell shows shape [1, 4608]).
input_id = torch.tensor(tokenizer.encode("ABCDEFGH", add_special_tokens=True, pad_to_max_length=True)).unsqueeze(0)  # Batch size 1


In [14]:
input_id.shape

torch.Size([1, 4608])

In [16]:
outputs = model(input_id, labels = input_id)

In [17]:
loss, prediction_scores = outputs[:2]

In [20]:
prediction_scores.shape

torch.Size([1, 4608, 28])

In [27]:
# Select the input ids at positions where the arg-max prediction equals the
# input id at the *same* position.
# NOTE(review): the loss cells further down compare logits[..., :-1, :] with
# input_id[..., 1:] (a one-step shift); this check is not shifted -- confirm
# that the unshifted comparison is intended.
input_id[torch.argmax(prediction_scores, dim=2) == input_id]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
input_id

tensor([[0, 3, 4,  ..., 0, 0, 0]])

In [30]:
[torch.randint(7, (1,)).item() for i in range(9)]

[3, 0, 6, 1, 6, 2, 6, 2, 5]

In [None]:
# NOTE(review): `x` is not defined in any cell visible in this notebook and the
# cell has no execution count; it would fail on a fresh Restart & Run All.
torch.roll(x, 3)

In [None]:
sequence_length = 4608


In [45]:
prediction_scores.shape

torch.Size([1, 4608, 28])

In [47]:
prediction_scores

tensor([[[ 1.8784,  1.2836, -1.0793,  ..., -0.6524, -2.6258, -1.0666],
         [ 1.1133,  1.1670, -0.8323,  ..., -1.0635,  0.9155, -0.1506],
         [ 0.3937,  1.0289,  0.8168,  ..., -0.4873, -0.8674,  0.1013],
         ...,
         [ 0.9368, -2.1731,  0.8456,  ...,  1.9906, -3.1547, -1.3232],
         [ 1.2605, -1.3634, -0.4402,  ...,  0.3854,  0.1005, -1.1920],
         [ 1.1513,  0.2879, -0.3987,  ..., -0.1991, -0.5784, -1.4584]]],
       grad_fn=<AddBackward0>)

In [46]:
prediction_scores[..., :-1, :]

tensor([[[ 1.8784,  1.2836, -1.0793,  ..., -0.6524, -2.6258, -1.0666],
         [ 1.1133,  1.1670, -0.8323,  ..., -1.0635,  0.9155, -0.1506],
         [ 0.3937,  1.0289,  0.8168,  ..., -0.4873, -0.8674,  0.1013],
         ...,
         [ 0.6132, -0.1413,  0.1096,  ...,  1.2919, -1.4866, -0.8239],
         [ 0.9368, -2.1731,  0.8456,  ...,  1.9906, -3.1547, -1.3232],
         [ 1.2605, -1.3634, -0.4402,  ...,  0.3854,  0.1005, -1.1920]]],
       grad_fn=<SliceBackward>)

In [48]:
prediction_scores[0,0,]

tensor([ 1.8784,  1.2836, -1.0793, -1.7684,  0.4985,  0.6058,  0.6368, -1.1766,
        -1.2031,  0.4661,  0.4723,  0.5633, -2.3839,  0.6446, -0.5566,  1.0457,
         0.3205,  1.0964,  0.6490,  0.0175,  1.2438, -1.3752,  0.1407,  0.1297,
        -1.0813, -0.6524, -2.6258, -1.0666], grad_fn=<SelectBackward>)

In [51]:
input_id, input_id.shape

(tensor([[0, 3, 4,  ..., 0, 0, 0]]), torch.Size([1, 4608]))

In [52]:
input_id[..., 1:]

tensor([[3, 4, 5,  ..., 0, 0, 0]])

In [62]:
from torch.nn import CrossEntropyLoss
# Targets equal to 0 are excluded from the loss. The displayed input_id tensor
# is filled with 0 after the sequence, so presumably 0 is the padding id --
# TODO confirm against the tokenizer.
loss_fct = CrossEntropyLoss(ignore_index=0)

In [63]:
# Shifted language-modelling loss: logits at position t are scored against the
# token at position t + 1. reshape() is used instead of view() because the
# sliced tensors are only view-compatible when the batch size is 1, and the
# class count is read from the logits rather than hard-coded.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)

In [64]:
loss

tensor(3.9867, grad_fn=<NllLossBackward>)

In [65]:
prediction_scores[..., :-1, :].view(-1, 28)

tensor([[ 1.8784,  1.2836, -1.0793,  ..., -0.6524, -2.6258, -1.0666],
        [ 1.1133,  1.1670, -0.8323,  ..., -1.0635,  0.9155, -0.1506],
        [ 0.3937,  1.0289,  0.8168,  ..., -0.4873, -0.8674,  0.1013],
        ...,
        [ 0.6132, -0.1413,  0.1096,  ...,  1.2919, -1.4866, -0.8239],
        [ 0.9368, -2.1731,  0.8456,  ...,  1.9906, -3.1547, -1.3232],
        [ 1.2605, -1.3634, -0.4402,  ...,  0.3854,  0.1005, -1.1920]],
       grad_fn=<ViewBackward>)

In [97]:
input_id[:,0:15]

tensor([[ 9, 19,  4,  5,  6,  7,  8,  9, 10, 19,  0,  0,  0,  0,  0]])

In [85]:
input_id[:,0:15][..., 1:]

tensor([[ 3,  4,  5,  6,  7,  8,  9, 10,  0,  0,  0,  0,  0,  0]])

In [83]:
torch.argmax(prediction_scores[..., :-1, :], dim=2)[:, 0:15]

tensor([[ 0, 19, 20, 10, 25,  0, 17, 15, 20, 20, 10, 14, 25, 20, 11]],
       grad_fn=<SliceBackward>)

In [78]:
# Shifted language-modelling loss (logits at t vs. token at t + 1). reshape()
# replaces view() so the cell also works for batch sizes above 1, and the
# class count comes from the logits instead of a hard-coded 28.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)
loss

tensor(3.9867, grad_fn=<NllLossBackward>)

In [90]:
input_id[0,1] = 19

In [91]:
# Recompute the shifted loss after the in-place edit of input_id in the
# previous cell. reshape() replaces view() so the cell also works for batch
# sizes above 1, and the class count comes from the logits instead of a
# hard-coded 28.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)
loss

tensor(3.7635, grad_fn=<NllLossBackward>)

In [98]:
input_id[0,10] = 19

In [99]:
# Recompute the shifted loss after the in-place edit of input_id in the
# previous cell. reshape() replaces view() so the cell also works for batch
# sizes above 1, and the class count comes from the logits instead of a
# hard-coded 28.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)
loss

tensor(3.6840, grad_fn=<NllLossBackward>)

In [102]:
input_id[:,0:15][..., 1:]

tensor([[19,  4,  5,  6,  7,  8,  9, 10, 19, 19,  0,  0,  0,  0]])

In [107]:
torch.argmax(prediction_scores[..., :-1, :], dim=2)[:, 0:15]

tensor([[ 0, 19, 20, 10, 25,  0, 17, 15, 20, 20, 10, 14, 25, 20, 11]],
       grad_fn=<SliceBackward>)

In [108]:
input_id[0,2]=20

In [109]:
# Recompute the shifted loss after the in-place edit of input_id in the
# previous cell. reshape() replaces view() so the cell also works for batch
# sizes above 1, and the class count comes from the logits instead of a
# hard-coded 28.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)
loss

tensor(3.6215, grad_fn=<NllLossBackward>)

In [113]:
# Overwrite one more input token in place, then recompute the shifted loss.
# NOTE: this mutates input_id, so re-running earlier cells will not reproduce
# their displayed outputs.
input_id[0,3]=20
# reshape() replaces view() so the cell also works for batch sizes above 1,
# and the class count comes from the logits instead of a hard-coded 28.
loss = loss_fct(
    prediction_scores[..., :-1, :].reshape(-1, prediction_scores.size(-1)),
    input_id[..., 1:].reshape(-1),
)
loss

tensor(3.4052, grad_fn=<NllLossBackward>)

In [3]:
class LineByLineTextDataset(Dataset):
    """Dataset holding one tokenized example per non-blank line of a text file.

    modified:
    https://github.com/huggingface/transformers/blob/cb3c2212c79d7ff0a4a4e84c3db48371ecc1c15d/src/transformers/data/datasets/language_modeling.py#L77
    """

    def __init__(self, tokenizer, file_path: str, block_size=None):
        """Read ``file_path`` and tokenize every non-blank line up front.

        Args:
            tokenizer: object providing ``batch_encode_plus``; when
                ``block_size`` is not given, its ``model_max_length`` (or,
                failing that, ``max_len``) attribute is used as the limit.
            file_path: path of a UTF-8 text file with one sequence per line.
            block_size: maximum number of tokens per example, forwarded to
                the tokenizer as ``max_length``. ``None`` (the default) means
                "use the tokenizer's own maximum length".

        Raises:
            AssertionError: if ``file_path`` is not an existing file.
        """
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        if block_size is None:
            # The length limit must come from the tokenizer's maximum sequence
            # length. Using the vocabulary size here (as before) would cap
            # every example at the number of vocabulary entries.
            block_size = getattr(tokenizer, "model_max_length", None)
            if block_size is None:
                block_size = getattr(tokenizer, "max_len", None)

        with open(file_path, encoding="utf-8") as f:
            # Keep only lines that contain something other than whitespace.
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        """Return example ``i`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [18]:
# input_ids = torch.tensor(tokenizer.encode("ALKLAKALK", 
#                                           add_special_tokens=True, 
#                                           max_length=SEQ_LEN, 
#                                           pad_to_max_length=True)).unsqueeze(0)  # Batch size 1