**Preprocessing**

In [1]:
# reading file
with open("pg1998.txt", 'r', encoding='utf-8') as file:
    full_text = file.read()
print(len(full_text))
print(full_text[:100])

637293
﻿THUS SPAKE ZARATHUSTRA

A BOOK FOR ALL AND NONE


By Friedrich Nietzsche


Translated By Thomas Com


In [161]:
characters = sorted(set(full_text))
VOCAB_SIZE = len(characters)
print(characters)
print(VOCAB_SIZE)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [162]:
# look into tiktoken

# character-to-index mapping
def create_mapping():
    mapping = {}
    for i, ch in enumerate(characters):
        mapping[ch] = i
    return mapping

# encode a string using the mapping
def encode(string, mapping):
    encoded = []
    for c in string:
        encoded.append(mapping[c])
    return encoded

# reverse mapping for decoding
def create_reverse_mapping(mapping):
    reverse_mapping = {}
    for ch, i in mapping.items():
        reverse_mapping[i] = ch
    return reverse_mapping

# decode an encoded list back to the original string
def decode(encoded, reverse_mapping):
    decoded = []
    for i in encoded:
        decoded.append(reverse_mapping[i])
    return ''.join(decoded)

encoder_map = create_mapping()
decoder_map = create_reverse_mapping(encoder_map)

sample_string = encode("hola", encoder_map)
print("Encoded:", sample_string)

decoded_string = decode(sample_string, decoder_map)
print("Decoded:", decoded_string)


Encoded: [46, 53, 50, 39]
Decoded: hola


In [163]:
import torch

torch.manual_seed(1)
data = torch.tensor(encode(full_text, encoder_map), dtype=torch.int64) # convert data into a tensor
print(data[:100])

split = int(0.8*len(data))
training_set = data[:split] # split into training and validation set
validation_set = data[split:]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [164]:
context_size = 16 # the batch size to contain context in
batch_size = 32 # how many sequences we are processing in each epoch

x = training_set[:context_size]
y = training_set[1:context_size+1] # y is the character after x

**Defining Functions**

In [165]:
def get_batch(split): # generate a small batch of data to process of inputs x and targets y

    if split == 'train':
        data = training_set
    else:
        data =  validation_set

    # generate a random starting point and generate a batch from that
    random_batch = torch.randint(len(data) - context_size, (batch_size,))
    x = torch.stack([data[i:i+context_size] for i in random_batch])
    y = torch.stack([data[i+1:i+context_size+1] for i in random_batch])

    return x, y

x_sample, y_sample = get_batch('train')
print(x_sample)
print(y_sample)
print(x_sample.shape)

tensor([[ 1, 47, 58,  1, 58, 53,  1, 39,  1, 46, 39, 54, 54, 63,  1, 47],
        [57,  1, 55, 59, 43, 43, 52,  6,  1, 54, 39, 56, 58,  1, 53, 44],
        [61, 43, 58,  1, 58, 46, 43, 47, 56,  1, 41, 46, 43, 43, 49, 57],
        [50,  1, 46, 47, 51,  6,  1, 47, 52,  1, 46, 53, 54, 43,  1, 46],
        [20, 43, 56, 43,  1, 41, 53, 51, 43, 57,  1, 30, 53, 51, 43, 53],
        [63, 53, 59,  1, 52, 53, 61, 12,  0,  0, 35, 13, 30, 35, 21, 15],
        [53, 44,  1, 58, 46, 43, 57, 43,  1, 51, 53, 39, 52, 57,  2,  0],
        [ 1, 40, 43, 39, 56,  1, 47, 58,  1, 39, 57,  1, 63, 53, 59,  1],
        [ 1, 46, 39, 54, 54, 47, 43, 56,  1, 50, 39, 52, 42, 57,  6,  0],
        [53, 56,  1, 45, 52, 39, 56, 50, 47, 52, 45,  1, 57, 53, 56, 56],
        [53, 42,  1, 61, 47, 50, 50,  0, 35, 46, 47, 41, 46,  1, 58, 47],
        [39, 47, 42, 57,  1, 61, 53, 53, 47, 52, 45,  1, 39,  1, 51, 39],
        [ 0, 24, 43, 57, 58,  1, 58, 46, 39, 58,  1, 58, 46, 63,  1, 50],
        [46, 43, 56, 11,  1, 63, 43, 5

In [166]:
def estimate_loss(model, val_iterations): # evaluates loss for both validation and trainiing sets
    out = {}
    model.eval() # set to validation mode
    for split in ['train', 'val']:
        losses = torch.zeros(val_iterations) # initialize loss
        for k in range(val_iterations): # val iterations is the training loss over x validation iterations
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean() # calculate loss mean
    model.train() # set back to training
    return out

**Model**

In [167]:
import torch.nn as nn
from torch.nn import functional

class BigramModel(nn.Module):

    def __init__(self, vocabulary_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocabulary_size, vocabulary_size) # we create a 65 x 65 embedding table
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, index, targets=None):
        logits = self.embedding_table(index) # logits are used to convert into a probability distribution for each token prediction

        if targets==None:
            loss = None
        else:
            logits = logits.view(-1, logits.size(-1))
            targets = targets.view(-1) # logits and targets must be resized to be used with NNs
            loss = self.loss_fn(logits, targets) # calculate difference between predicted value and target value

        return logits, loss

    def generate(self, index, new_tokens): # generate likely tokens that come after selected index

        for i in range(new_tokens):
            logits, loss = self(index)
            logits = logits[:, -1, :] # we only want the last token
            probs = nn.functional.softmax(logits, dim=-1) # apply softmax to get probability distribution
            next_index = torch.multinomial(probs, num_samples=1) # retrieve sample from the distribution
            index = torch.cat((index, next_index), dim=1) # # append sample  to the full sequence
        return index

sample_model = BigramModel(VOCAB_SIZE)
sample_logits, sample_loss = sample_model(x_sample, y_sample)

print(sample_logits.shape, sample_loss)

torch.Size([512, 65]) tensor(4.6685, grad_fn=<NllLossBackward0>)


**Main Loop**

In [168]:
optimizer = torch.optim.Adam(sample_model.parameters(), lr = 0.001)
val_iterations = 500

for epoch in range(10000):

    # we compare training and validation loss every once in a while
    if epoch % val_iterations == 0:
        losses = estimate_loss(sample_model, val_iterations)
        print(f"step {epoch}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    x_sample, y_sample = get_batch("train") # retrieve a batch randomly
    # backpropogation step to compute gradients
    logits, loss = sample_model(x_sample, y_sample)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.6402, val loss 4.6277
step 500: train loss 4.0626, val loss 4.0515
step 1000: train loss 3.6047, val loss 3.6057
step 1500: train loss 3.2662, val loss 3.2701
step 2000: train loss 3.0192, val loss 3.0247
step 2500: train loss 2.8379, val loss 2.8532
step 3000: train loss 2.7188, val loss 2.7316
step 3500: train loss 2.6349, val loss 2.6593
step 4000: train loss 2.5851, val loss 2.6089
step 4500: train loss 2.5484, val loss 2.5761
step 5000: train loss 2.5229, val loss 2.5544
step 5500: train loss 2.5108, val loss 2.5333
step 6000: train loss 2.4952, val loss 2.5266
step 6500: train loss 2.4842, val loss 2.5200
step 7000: train loss 2.4780, val loss 2.5180
step 7500: train loss 2.4730, val loss 2.5147
step 8000: train loss 2.4665, val loss 2.5059
step 8500: train loss 2.4689, val loss 2.5081
step 9000: train loss 2.4597, val loss 2.5058
step 9500: train loss 2.4614, val loss 2.5050


In [170]:
initial = torch.zeros((1, 1), dtype=torch.long) # initial batch
sample_generation = sample_model.generate(initial, new_tokens=300)[0].tolist() # generate the next 1000 tokens
print(decode(sample_generation, decoder_map)) # predict


Arond on h, cond wineoust mint.
AUShewhombs. abe mboree t
The fof ck
CHe fete ishe at, You scod ais ave f D semonlalaprd sosserr thevetom; RINGoun Go asou m OMan, htithar the pe ss hotu swata'dr y incina fl Maiou y ttesot IVI ityo h tyofuss wherdr, plilitorint mee y
W: cufoulicouer sh tithe Switief 
