In [13]:
import torch
import numpy as np


One-hot encoding

In [None]:
words = ['files', 'find', 'my', 'files']

n_words = len(set(words))
words_keys = {}
for i, w in enumerate(words):
    if w not in words_keys:
        words_keys[w] = i

output = torch.empty((n_words,0), dtype=torch.float32)

for word in words:
    one_hot = torch.zeros(n_words)
    one_hot[words_keys[word]] = 1
    output = torch.column_stack((output, one_hot))

print(output)


Dot product

In [None]:
a = torch.tensor([0,1,1,2]).float()
b = torch.tensor([1,0,1,2]).float()

print(a @ b)

a = torch.tensor([0,1,0,0]).float()
b = torch.tensor([0,0,0,1]).float()

print(a @ b)


Matrix multiplication

In [None]:
a = torch.tensor([ 0, 0, 1, 0], dtype=torch.float32)
b = torch.tensor([.2,.7,.8,.1]).t()

print(a @ b)

a = torch.tensor(([ 1, 0, 0, 0],
                  [ 0, 0, 1, 0]) , dtype=torch.float32)
b = torch.tensor([.2,.7,.8,.1]).t()

print(a @ b)


a = torch.tensor([ 0, 0, 1, 0] , dtype=torch.float32)
b = torch.tensor(([.2,.7,.8,.1],
                 [.9, 0,.3,.4])).t()
print(a @ b)

a = torch.tensor(([ 1, 0, 0, 0],
                  [ 0, 0, 0, 1],
                  [ 0, 0, 1, 0]) , dtype=torch.float32)
b = torch.tensor(([.2,.7,.8,.1],
                  [.9, 0,.3,.4])).t()
print(a @ b)


First order sequence model

In [None]:
from itertools import pairwise, product
            # Phrase                         Probability
commands = [("show me my directories please", .2),
            ("show me my files please",       .3),
            ("show me my photos please",      .5)]

vocabulary = set([j for i in commands for j in i[0].split()])
print("Vocabulary:", vocabulary)

voc_keys = {w:i for i,w in enumerate(sorted(vocabulary))}
print(voc_keys)

matrix = torch.zeros(len(vocabulary), len(vocabulary))


for phrase, probability in commands:
    for word, next_word in pairwise(phrase.split()):
    #    print(word,next_word, probability, voc_keys[word], voc_keys[next_word])
       matrix[(voc_keys[word], voc_keys[next_word])] += probability

for i,l in enumerate(matrix):
    if ((a:=sum(l)) != 0):
        matrix[i]/=a 


print(" "*11,end=" ")
for word in sorted(vocabulary):
    print(f"{word[:6]:6}",end=" ")
print()
for word1 in sorted(vocabulary):
    print(f"{word1:11}",end=" ")
    for word2 in sorted(vocabulary):
        print(f"{matrix[voc_keys[word1], voc_keys[word2]].item():<6.2f}",end=" ")
    print()


In [None]:
from itertools import pairwise, combinations
            # Phrase                         Probability
commands = [("check whether the battery ran down please", 0.4),
            ("check whether the program ran please",      0.6)]

vocabulary = {j for i in commands for j in i[0].split()}
# print("Vocabulary:", vocabulary)

voc_keys = {w:i for i,w in enumerate(vocabulary)}
# print(voc_keys)

words_combinations = {c: frozenset(c) for c in sorted(combinations(vocabulary,2))}

matrix = torch.zeros(len(words_combinations), len(vocabulary))

comb_keys = {w:i for i,w in enumerate(words_combinations.values())}


for phrase, probability in commands:
    for *comb, next_word in zip(*[phrase.split()[i:] for i in range(3)]):
       matrix[(comb_keys[frozenset(comb)], voc_keys[next_word])] += probability

for i,l in enumerate(matrix):
    if ((a:=sum(l)) != 0):
        matrix[i]/=a 

print(" "*15,end=" ")
for word in sorted(vocabulary):
    print(f"{word[:6]:6}",end=" ")
print()

# sorted_comb = {tuple(fz): fz for fz in words_combinations}
for comb, fz in words_combinations.items():
    print(f"{comb[0]+' '+comb[1]:15}",end=" ")
    for word in sorted(vocabulary):
        print(f"{matrix[comb_keys[fz], voc_keys[word]].item():<6.2f}",end=" ")
    print()

In [None]:
# from itertools import combinations_with_replacement

            # Phrase                                                          Probability
commands = [("check the program log and find out whether it ran please",      0.5),
            ("check the battery log and find out whether it ran down please", 0.5)]
vocabulary = {j for i in commands for j in i[0].split()} 
voc_keys = {w:i for i,w in enumerate(vocabulary)}

ran_combinations = {(j, 'ran') for i in commands for j in i[0].split()[:i[0].split().index('ran')+1]}
comb_keys = {w:i for i,w in enumerate(ran_combinations)}

matrix = torch.zeros(len(ran_combinations), len(vocabulary))

for phrase, probability in commands:
    phrase = phrase.split();
    ran_index = phrase.index('ran')+1
    for word in phrase[:ran_index]:
        matrix[(comb_keys[(word, 'ran')], voc_keys[phrase[ran_index]])] += probability

for i,l in enumerate(matrix):
    if ((a:=sum(l)) != 0):
        matrix[i]/=a 

# Priting
def print_matrix():
    margin = max(len(comb[0]+' '+comb[1]) for comb in ran_combinations)
    print(" "*margin,end=" ")
    for word in sorted(vocabulary):
        print(f"{word[:6]:6}",end=" ")
    print()
    for comb in sorted(ran_combinations):
        print(f"{comb[0]+' '+comb[1]:{margin}}",end=" ")
        for word in sorted(vocabulary):
            print(f"{matrix[comb_keys[comb], voc_keys[word]].item():<6.2f}",end=" ")
        print()

print_matrix()
print()

mask = torch.zeros(len(ran_combinations))
mask[comb_keys[('battery', 'ran')]] = 1
mask[comb_keys[('program', 'ran')]] = 1
matrix = (matrix.T * mask).T
print_matrix()

<p xmlns:cc="http://creativecommons.org/ns#" >This work is licensed under <a href="http://creativecommons.org/licenses/by-nc-sa/4.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC BY-NC-SA 4.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/by.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/nc.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/sa.svg?ref=chooser-v1"></a></p>


In [14]:
from torch import nn
from torch.nn import functional
from math import ceil

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.set_default_device(device)

In [15]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.w_2(functional.relu(self.w_1(x)))

In [16]:
class SingleHeadAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v):
        super(SingleHeadAttention, self).__init__()
        self.d_k = d_k
        self.Wv = nn.Linear(d_model, d_v)  
        self.Wk = nn.Linear(d_model, d_k)  
        self.Wq = nn.Linear(d_model, d_k)

    def forward(self, v, k, q, mask):
        v = self.Wv(v)
        k = self.Wv(k)
        q = self.Wv(q)

        qk = q@k.transpose(1, 2)
        qk = qk/self.d_k
        if mask is not None:
            mask = mask.reshape(qk.shape[0], 1, qk.shape[2])
            qk = qk.masked_fill(mask == 0, -1e9)
        qk = torch.softmax(qk, dim=-1)

        qkv = qk @ v
        return qkv

In [17]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v, h, mask: bool = False, n:int = None):
        super(MultiheadAttention, self).__init__()
        self.d_k = d_k
        self.Wv = torch.nn.ModuleList([nn.Linear(d_model, d_v)]*h)  # value matrices
        self.Wk = torch.nn.ModuleList([nn.Linear(d_model, d_k)]*h)  # key matrices
        self.Wq = torch.nn.ModuleList([nn.Linear(d_model, d_k)]*h)  # query matrices
        self.lineal = nn.Linear(h*d_v, d_model)
        if mask:
            self.mask = torch.ones((n,n))
            self.mask = torch.tril(self.mask)
    def forward(self, v, k ,q, mask = None):
        nv = [w(v) for w in self.Wv]
        nk = [w(k) for w in self.Wk]
        nq = [w(q) for w in self.Wq]

        att = [self.atention(*i, mask) for i in zip(nv,nq, nk)]
        att =torch.concat(att, dim=-1)
        return self.lineal(att)

    def atention(self, v, k, q, mask):
        qk = q@k.transpose(1, 2)
        qk = qk/self.d_k
        if mask is not None:
            mask = mask.reshape(qk.shape[0], 1, qk.shape[2])
            qk = qk.masked_fill(mask == 0, -1e9)
        qk = torch.softmax(qk, dim=-1)

        qkv = qk @ v
        return qkv

In [18]:
class AddNormalize(nn.Module):
  def __init__(self):
      super(AddNormalize, self).__init__()
      
  def forward(self, x, y):
    x = x + y
    mean = torch.mean(x, dim=-1, keepdim=True)
    std = torch.std(x, dim=-1, keepdim=True)
    return (x-mean)/std

In [19]:
class Encoder(nn.Module):
    def __init__(self, d_model, d_k, d_v, h, d_ff):
      '''
      N: Número de palabras del vocabulario
      d_model: Número de dimensiones de embdedding de vocabulario
      d_k: Número de dimensiones del embdedding de claves/consultas
      d_v: Número de dimensiones de embdedding de valores
      d_v: Número de dimensiones de embdedding de valores
      h: Número de cabezas de atención
      '''
      super(Encoder, self).__init__()
      self.encoder_mh = MultiheadAttention(d_model, d_k, d_v, h)
      self.norm       = AddNormalize()
      self.encoder_ff = FeedForward(d_model, d_ff)
    # x represents our data
    def forward(self, inputs):
      enc_mha   = self.encoder_mh(inputs, inputs, inputs)
      enc_mha   = self.norm(inputs, enc_mha)
      enc_ff    = self.encoder_ff(enc_mha)
      enc_ff    = self.norm(enc_ff, enc_mha)
      return enc_ff


In [20]:
class Decoder(nn.Module):
    def __init__(self, d_model, d_k, d_v, h, d_ff, n, features=True):
      '''
      N: Número de palabras del vocabulario
      d_model: Número de dimensiones de embdedding de vocabulario
      d_k: Número de dimensiones del embdedding de claves/consultas
      d_v: Número de dimensiones de embdedding de valores
      d_v: Número de dimensiones de embdedding de valores
      h: Número de cabezas de atención
      '''
      super(Decoder, self).__init__()
      self.masked_mh  = MultiheadAttention(d_model, d_k, d_v, h)
      if features:
        self.decoder_mh = MultiheadAttention(d_model, d_k, d_v, h)
      self.decoder_ff = FeedForward(d_model, d_ff)  
      self.norm       = AddNormalize()
      
      self.dropout = nn.Dropout(0.1)
    # x represents our data
    def forward(self, outputs, mask, features=None):
      dec_mmha   = self.masked_mh(outputs, outputs, outputs, mask)
      dec_mmha   = self.norm(outputs, dec_mmha)
      if features:
        dec_mha    = self.decoder_mh(features, features, dec_mmha)
        dec_mha    = self.norm(dec_mmha, dec_mha)
      else:
        dec_mha = dec_mmha
      dec_ff     = self.decoder_ff(dec_mha)

      if self.training:
        dec_ff = self.dropout(dec_ff)
      dec_ff     = self.norm(dec_ff, dec_mha)
      return dec_ff

In [21]:
from math import ceil
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, n):
        super(PositionalEncoding, self).__init__()
        # Compute the positional encodings once in log space.
        # self.pe = torch.zeros((n, d_model))
        # position = torch.arange(0, n)
        # i = torch.torch.arange(0, d_model, 2)
        # div_term = 10_000 ** (2*i/d_model)
        # self.pe[:, 0::2] = torch.sin(position / div_term)
        # self.pe[:, 1::2] = torch.cos(position / div_term)[:,:-1 if d_model%2 else None]
        # self.pe = self.pe.unsqueeze(0)

        positional_encoding = np.zeros((n, d_model))

        # Calculate positional encoding for each position and each dimension
        for pos in range(n):
            for i in range(0, d_model, 2):
                # Apply sin to even indices in the array; indices in Python start at 0 so i is even.
                positional_encoding[pos, i] = np.sin(pos / (10_000 ** ((2 * i) / d_model)))

                if i + 1 < d_model:
                    # Apply cos to odd indices in the array; we add 1 to i because indices in Python start at 0.
                    positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i) / d_model)))

        # Convert numpy array to PyTorch tensor and return it
        self.pe = torch.from_numpy(positional_encoding).float().cuda()
        
    def forward(self, x:torch.Tensor):
        return x + self.pe

In [None]:
class Transformer(nn.Module):
    def __init__(self, N, d_model, d_k, d_v, h, d_ff, n):
      '''
      N: Número de palabras del vocabulario
      d_model: Número de dimensiones de embdedding de vocabulario
      d_k: Número de dimensiones del embdedding de claves/consultas
      d_v: Número de dimensiones de embdedding de valores
      d_v: Número de dimensiones de embdedding de valores
      h: Número de cabezas de atención
      '''
      super(Transformer, self).__init__()

      self.encoder_em = nn.Embedding(N, d_model)
      self.posencoding = PositionalEncoding(d_model, n)
      self.decoder_em = nn.Embedding(N, d_model)
      self.encoder    = Encoder(d_model, d_k, d_v, h, d_ff) 
      self.decoder    = Decoder(d_model, d_k, d_v, h, d_ff, n) 
      self.output     = nn.Linear(d_model, N)
    # x represents our data
    def forward(self, inputs, outputs):
      emb_input = self.encoder_em(inputs)
      emb_input = self.posencoding(emb_input)
      features  = self.encoder(emb_input)

      emb_output = self.encoder_em(outputs)
      emb_output = self.posencoding(emb_output)
      dec_out    = self.decoder(emb_output, features)

      output     = self.output(dec_out)
      return torch.softmax(output, dim=1)


In [22]:
class TextGenerator(nn.Module):
    def __init__(self, N, d_model, d_k, d_v, h, d_ff, n, n_dec_bloks=1):
      '''
      N: Número de palabras del vocabulario
      d_model: Número de dimensiones de embdedding de vocabulario
      d_k: Número de dimensiones del embdedding de claves/consultas
      d_v: Número de dimensiones de embdedding de valores
      d_v: Número de dimensiones de embdedding de valores
      h: Número de cabezas de atención
      '''
      super(TextGenerator, self).__init__()

      self.decoder_em = nn.Embedding(N, d_model)
      self.posencoding = PositionalEncoding(d_model, n)
      self.decoder    = torch.nn.ModuleList([Decoder(d_model, d_k, d_v, h, d_ff, n, features=False)]*n_dec_bloks)
      self.output     = nn.Linear(d_model, N)
    # x represents our data
    def forward(self, inputs, mask):
      emb_output = self.decoder_em(inputs)
      output = self.posencoding(emb_output)
      for dec in self.decoder:
        output = dec(output, mask)

      output = self.output(output)
      # return torch.softmax(output, dim=-1)
      return output

In [30]:
class AutoregressiveWrapper(torch.nn.Module):
    def __init__(self, gpt_model , max_sequence_length):
        super().__init__()
        self.model = gpt_model
        self.max_sequence_length = max_sequence_length

    def forward(self, x, mask):
        inp, target = x[:, :-1], x[:, 1:]
        mask = mask[:, :-1]

        output = self.model(inp, mask)
        return output, target

    def next_token_probabilities(self, x, mask, temperature=1.0):
        logits = self.model(x, mask)[:, -1]

        # Apply the temperature
        if temperature != 1.0:
            logits = logits / temperature

        # Apply the softmax
        probabilities = torch.softmax(logits, dim=-1)

        return probabilities

In [24]:
class Tokenizer:

    def __init__(self):
        super().__init__()
        self.dictionary = {}
        self.reverse_dictionary = []
        # Add the padding token
        self._add_to_dict('<PAD>')

        # Add characters and numbers to the dictionary
        for i in range(10):
            self._add_to_dict(str(i))
        for i in range(26):
            self._add_to_dict(chr(ord('a') + i))

        # Add space and punctuation to the dictionary
        self._add_to_dict('.')
        self._add_to_dict(' ')

    def _add_to_dict(self, character):
        if character not in self.dictionary:
            index = len(self.dictionary)
            self.dictionary[character] = index
            self.reverse_dictionary.append(character)

    def tokenize(self, text):
        return [self.dictionary[c] for c in text]

    def character_to_token(self, character):
        return self.dictionary[character]

    def token_to_character(self, token):
        return self.reverse_dictionary[token]

    def size(self):
        return len(self.dictionary)

In [81]:
import random
class Trainer:

    def __init__(self, model, tokenizer: Tokenizer, optimizer=None):
        super().__init__()
        self.model = model
        if optimizer is None:
            self.optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
        else:
            self.optimizer = optimizer
        self.tokenizer = tokenizer
        self.loss_function = torch.nn.CrossEntropyLoss()
    def __gen_batch(self, data, batch_size):
        for i in range(0, len(data), batch_size):
            sequence_tensor = torch.tensor(data[i: i + batch_size], dtype=torch.long)

            # Create the mask tensor for the batch, where 1 means the token is not a padding token
            mask_tensor = torch.ones_like(sequence_tensor)
            mask_tensor[sequence_tensor == self.tokenizer.character_to_token('<PAD>')] = 0

            yield (sequence_tensor, mask_tensor)
    def train(self, data: list[str], epochs, batch_size):
        loss_per_epoch = []
        for epoch in range(epochs):
            losses = []

            # Shuffle the sequences
            random.shuffle(data)

            # Create batches of sequences and their respective mask.

            # Train the model on each batch
            for batch_number, batch in enumerate(self.__gen_batch(data, batch_size)):
                self.model.train()

                # Create the input and mask tensors
                input_tensor = torch.zeros((batch_size, self.model.max_sequence_length + 1), dtype=torch.long)
                mask_tensor = torch.zeros((batch_size, self.model.max_sequence_length + 1), dtype=torch.long)

                for i, input_entry in enumerate(batch[0]):
                    input_tensor[i] = input_entry

                for i, mask_entry in enumerate(batch[1]):
                    mask_tensor[i] = mask_entry

                # Compute the model output
                model_output, target = self.model(input_tensor, mask_tensor)

                # Compute the losses
                # The loss is computed on the model output and the target
                loss = self.loss_function(model_output.transpose(1, 2), target)

                print(f"{batch_number}/{len(data)//batch_size} Loss: {loss}", end="\r")
                # Backpropagate the loss.
                loss.backward()

                # Clip the gradients. This is used to prevent exploding gradients.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)

                # Update the model parameters. This is done by taking a step in the direction of the gradient.
                self.optimizer.step()

                # Reset the gradients. This is done so that the gradients from the previous batch
                # are not used in the next step.
                self.optimizer.zero_grad()

                # Append the loss to the list of losses, so that the average loss can be computed for this epoch.
                losses.append(loss.item())

            # Print the loss
            epoch_loss = np.average(losses)
            loss_per_epoch.append(epoch_loss)
            print('Epoch:', epoch, 'Loss:', epoch_loss)

        return loss_per_epoch

In [None]:
def create_training_sequences(max_sequence_length, tokenized_training_data):
    # Create sequences of length max_sequence_length + 1
    # The last token of each sequence is the target token
    sequences = []
    for i in range(0, len(tokenized_training_data) - max_sequence_length - 1):
        sequences.append(tokenized_training_data[i: i + max_sequence_length + 1])
    return sequences


def tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data):
    # Tokenize the training data
    tokenized_training_data = tokenizer.tokenize(training_data)
    for _ in range(max_sequence_length):
        # Prepend PADding tokens
        tokenized_training_data.insert(0, tokenizer.character_to_token('<PAD>'))
    return tokenized_training_data


tokenizer = Tokenizer()

embedding_dimension = 256
max_sequence_length = 20
number_of_tokens = tokenizer.size()

# Create the model
model = AutoregressiveWrapper(TextGenerator(number_of_tokens, embedding_dimension, 
            64, 64, 4, 256//4, max_sequence_length, 3), max_sequence_length)
# Create the training data
training_data = '. '.join([
    'cats rule the world',
    'dogs are the best',
    'elephants have long trunks',
    'monkeys like bananas',
    'pandas eat bamboo',
    'tigers are dangerous',
    'zebras have stripes',
    'lions are the kings of the savannah',
    'giraffes have long necks',
    'hippos are big and scary',
    'rhinos have horns',
    'penguins live in the arctic',
    'polar bears are white'
])

tokenized_and_PADded_training_data = tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data)
sequences = create_training_sequences(max_sequence_length, tokenized_and_PADded_training_data)

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
trainer = Trainer(model, tokenizer, optimizer)
loss = trainer.train(sequences, epochs=120, batch_size=16)

In [None]:
def create_training_sequences(max_sequence_length, tokenized_training_data):
    # Create sequences of length max_sequence_length + 1
    # The last token of each sequence is the target token
    sequences = []
    for i in range(0, len(tokenized_training_data) - max_sequence_length - 1):
        sequences.append(tokenized_training_data[i: i + max_sequence_length + 1])
    return sequences


def tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data):
    # Tokenize the training data
    tokenized_training_data = tokenizer.tokenize(training_data)
    for _ in range(max_sequence_length):
        # Prepend PADding tokens
        tokenized_training_data.insert(0, tokenizer.character_to_token('<PAD>'))
    return tokenized_training_data


# tokenizer = Tokenizer()


# Create the training data
training_data = '. '.join([
    'cats rule'
])

tokenized_and_PADded_training_data = tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data)
sequences = create_training_sequences(max_sequence_length, tokenized_and_PADded_training_data)

# print(tokenized_and_PADded_training_data)
tensor = torch.tensor([tokenized_and_PADded_training_data[9:]])

mask = torch.ones_like(tensor)
mask[tensor == 0] = 0

model.eval()
out=model.next_token_probabilities(tensor, mask)
# token = random.choices(range(tokenizer.size()), out)[0]
token = torch.multinomial(out, num_samples=1)
print(tokenizer.token_to_character(token), end="")
while 1:
    tensor = torch.cat([tensor, token], dim=1)
    tensor = tensor[:, 1:]
    mask = torch.ones_like(tensor)
    mask[tensor == 0] = 0
    out=model.next_token_probabilities(tensor, mask)
    token = torch.multinomial(out, num_samples=1)
    print(tokenizer.token_to_character(token), end="")

print()


In [26]:
from collections import  Counter
from itertools import pairwise  
with open("esto va a salir mal.txt", 'r', encoding='utf-8') as f:
    text = f.read().split('------------')

vocabulary = set()
[vocabulary.update(i) for i in text]
vocabulary = ['<PAD>', "<EOS>"] + list(vocabulary)
indexes = {j:i for i,j in enumerate(vocabulary)}
text = [[indexes[j] for j in i] for i in text]

# print(text)

while len(vocabulary) < 1024:
    count = Counter()
    [count.update(pairwise(i)) for i in text]
    most_common, _ = max(count.items(), key= lambda x: x[1])
    most_common_str = "".join(vocabulary[i] for i in most_common)
    vocabulary.append(most_common_str)
    new_index= len(indexes)
    indexes[most_common_str] = new_index
    new_text=[]
    for i in text:
        new_text.append([])
        iterator = enumerate(i)
        for idx, j in iterator:
            if idx < len(i)-1 and (j, i[idx+1])==most_common:
                new_text[-1].append(new_index)
                next(iterator)
            else:
               new_text[-1].append(j)  
    text = new_text


KeyboardInterrupt: 

In [60]:
from collections import Counter
from itertools import pairwise

class TokenizerBPE(Tokenizer):
    def __init__(self, N):
        super(TokenizerBPE, self).__init__()
        self.dictionary = {}
        self.reverse_dictionary = []
        # Add the PADding token
        self._add_to_dict('<PAD>')
        self._add_to_dict('\x00')
        self.N = N

    def add_list(self, text_list):
        [self._add_to_dict(character) for text in text_list for character in text]
        text_list = [self.tokenize(i) for i in text_list]

        while len(self.dictionary) < self.N:
            count = Counter()
            [count.update(pairwise(i)) for i in text_list]
            most_common, _ = count.most_common(1)[0]
            most_common_str = "".join(self.reverse_dictionary[i] for i in most_common)
            self.reverse_dictionary.append(most_common_str)
            new_index= len(self.dictionary)
            self.dictionary[most_common] = new_index
            new_text=[]
            for i in text_list:
                new_text.append([])
                iterator = enumerate(i)
                for idx, j in iterator:
                    if idx < len(i)-1 and (j, i[idx+1])==most_common:
                        new_text[-1].append(new_index)
                        next(iterator)
                    else:
                        new_text[-1].append(j)  
            text_list = new_text

    def tokenize(self, text):
        text = [self.dictionary[c] for c in text]
        changes = True 
        while changes:
            new_text = []
            iterator = enumerate(text)
            changes = False
            for idx, i in iterator:
                if idx < len(text)-1 and (i, text[idx+1]) in self.dictionary:
                    new_text.append(self.dictionary[(i, text[idx+1])])
                    next(iterator)
                    changes = True
                else:
                    new_text.append(i)
            text=new_text
        return text

In [62]:
def create_training_sequences(max_sequence_length, tokenized_training_data):
    # Create sequences of length max_sequence_length + 1
    # The last token of each sequence is the target token
    sequences = []
    for i in range(0, len(tokenized_training_data) - max_sequence_length - 1):
        sequences.append(tokenized_training_data[i: i + max_sequence_length + 1])
    return sequences


def tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data):
    # Tokenize the training data
    tokenized_training_data = tokenizer.tokenize(training_data)
    for _ in range(max_sequence_length):
        # Prepend PADding tokens
        tokenized_training_data.insert(0, tokenizer.character_to_token('<PAD>'))
    return tokenized_training_data


max_sequence_length = 512
number_of_tokens = 1024

# Load the training data
with open("esto va a salir mal.txt", 'r', encoding='utf-8') as f:
    text = f.read().split('------------')

tokenizer = TokenizerBPE(number_of_tokens)
tokenizer.add_list(text)

print("Created tokenization")


Created tokenization


In [79]:
embedding_dimension = 64


model = AutoregressiveWrapper(TextGenerator(number_of_tokens, embedding_dimension, 
            8, 8, 8, 256//4, max_sequence_length, 4), max_sequence_length)

training_data = '\x00'.join(text)

tokenized_and_PADded_training_data = tokenize_and_PAD_training_data(max_sequence_length, tokenizer, training_data)
sequences = create_training_sequences(max_sequence_length, tokenized_and_PADded_training_data)


In [87]:
# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
trainer = Trainer(model, tokenizer, optimizer)
loss = trainer.train(sequences, epochs=1200, batch_size=32)

373/1940 Loss: 3.1226003170013428

In [84]:
torch.save(model.state_dict(), "model_statedict")
torch.save(model, "model")

In [85]:
def tokenize_prompt(max_sequence_length, tokenizer, training_data):
    # Tokenize the training data
    tokenized_training_data = tokenizer.tokenize(training_data)
    for _ in range(max_sequence_length - len(tokenized_training_data)):
        # Prepend PADding tokens
        tokenized_training_data.insert(0, tokenizer.character_to_token('<PAD>'))
    return tokenized_training_data


# Create the training data
prompt = input(">")

tokenized_and_PADded_training_data = tokenize_prompt(max_sequence_length, tokenizer, prompt)

# print(tokenized_and_PADded_training_data)
tensor = torch.tensor([tokenized_and_PADded_training_data])

model.eval()
while 1:
    mask = torch.ones_like(tensor)
    mask[tensor == 0] = 0

    out=model.next_token_probabilities(tensor, mask)
    token = torch.multinomial(out, num_samples=1)
    if token == tokenizer.character_to_token('\x00'):
        break
    print(tokenizer.token_to_character(token), end="")
    tensor = torch.cat([tensor, token], dim=1)
    tensor = tensor[:, 1:]
print()

lolodia, el usuoagenlotieneconaseste, exs
Que sororejorazónnntereoja;
sulio.

Y que en pagunonajaro y.
Y loceo que siiena.
meóno.

Y poonirme, era al cientoqué con diosguasfioíjo.
Vámeno.
di
iguotazuorcaslodo.

Lerd, no? yo algeso, no
y un,
hacensé qué bín tuorojereso, empiezo a duea.
tivo.
¡que esguejasa el e caonalo nitoo
y envezono, yo, evo sionegn veoorerounay, no,
siego rego.
Soy entenrisorermea,
quej, y noy en ego eva en y, todo.

Saquí.
Destepa; en esforío.
mi nsolal, tendo.
Cmeterdaado
podefriaba la encuenoríjartendónmás
Dino, ohe luz demasinga, lasa!
con una blno, y nas go el mueriitodo,
Me acueemaya.
ndo, masia, suusmorriego el preguego suextienó siencivo.
Me mayzhe.
yo qurisa nano, alegría a ase, no.

