In [None]:
# Start by downloading the dataset, here is War and Peace
!wget https://www.gutenberg.org/files/2600/2600-0.txt

--2024-03-02 06:27:54--  https://www.gutenberg.org/files/2600/2600-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3359405 (3.2M) [text/plain]
Saving to: ‘2600-0.txt.2’


2024-03-02 06:27:55 (14.6 MB/s) - ‘2600-0.txt.2’ saved [3359405/3359405]



In [None]:
# Load the downloaded book into memory as a single string for inspection
with open('2600-0.txt', mode='r', encoding='utf-8') as book_file:
  text = book_file.read()

In [None]:
# Print the length of the dataset, i.e. the number of characters read from the file
print("Length of dataset (in characters): ", len(text))

Length of dataset (in characters):  3227520


In [None]:
# Find the beginning of the content
# print(text[7454:10000])

# Find the end of the content
# print(text[-18413:])

# Content begins at character 7454 (disregard table of contents); the last 18413
# characters are dropped as well.
# The trimmed text opens with the "BOOK ONE: 1805" heading, so it doubles as a guard:
# re-running this cell must not slice the already-trimmed text a second time.
if not text.startswith("BOOK ONE: 1805"):
  text = text[7454:-18413]

In [None]:
# Print the first 1000 characters of the trimmed text to get a sense of its style
print(text[:1000])

BOOK ONE: 1805





CHAPTER I

“Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don’t tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by that
Antichrist—I really believe he is Antichrist—I will have nothing
more to do with you and you are no longer my friend, no longer my
‘faithful slave,’ as you call yourself! But how do you do? I see I
have frightened you—sit down and tell me all the news.”

It was in July, 1805, and the speaker was the well-known Anna Pávlovna
Schérer, maid of honor and favorite of the Empress Márya Fëdorovna.
With these words she greeted Prince Vasíli Kurágin, a man of high
rank and importance, who was the first to arrive at her reception. Anna
Pávlovna had had a cough for some days. She was, as she said, suffering
from la grippe; grippe being then a new word in St. Petersburg, used
only by the elite.

All her invitations without exception, written in French, and de

In [None]:
# Vocabulary: every distinct character occurring in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Show the vocabulary as one string, followed by its size on the next line
print(''.join(chars), vocab_size, sep='\n')


 !()*,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÉàáâäæçèéêëíîïóôöúüýœ—‘’“”
104


In [None]:
# Character-level tokenizer: each distinct character is assigned its index in `chars`
itos = dict(enumerate(chars))                 # integer -> character map
stoi = {ch: i for i, ch in itos.items()}      # character -> integer map

def encode(s):
  """Encoder: map a string to the list of integer ids of its characters.

  Raises KeyError for a character that is not in the vocabulary.
  """
  return list(map(stoi.__getitem__, s))

def decode(l):
  """Decoder: map a list of integer ids back to the string they spell."""
  return ''.join(map(itos.__getitem__, l))

# Round-trip check: encoding then decoding must give back the input
print(encode("good morning"))
print(decode(encode("good morning")))

[56, 64, 64, 53, 1, 62, 64, 67, 63, 58, 63, 56]
good morning


In [None]:
# Turn the whole text into a 1-D tensor of token ids, one id per character
import torch
data = torch.tensor(encode(text), dtype=torch.int64)   # torch.int64 is the same dtype as torch.long

# Report the tensor's shape and dtype
print("Shape of data tensor: ", data.shape)
print("Data tensor data type: ", data.dtype)

# Show the ids of the first 1000 characters; the largest possible id is vocab_size-1
print(data[:1000])

Shape of data tensor:  torch.Size([3201653])
Data tensor data type:  torch.int64
tensor([ 25,  38,  38,  34,   1,  38,  37,  28,  20,   1,  11,  18,  10,  15,
          0,   0,   0,   0,   0,   0,  26,  31,  24,  39,  43,  28,  41,   1,
         32,   0,   0, 102,  46,  54,  61,  61,   6,   1,  39,  67,  58,  63,
         52,  54,   6,   1,  68,  64,   1,  30,  54,  63,  64,  50,   1,  50,
         63,  53,   1,  35,  70,  52,  52,  50,   1,  50,  67,  54,   1,  63,
         64,  72,   1,  59,  70,  68,  69,   1,  55,  50,  62,  58,  61,  74,
          1,  54,  68,  69,  50,  69,  54,  68,   1,  64,  55,   1,  69,  57,
         54,   0,  25,  70,  64,  63,  50,  65,  50,  67,  69,  54,  68,   8,
          1,  25,  70,  69,   1,  32,   1,  72,  50,  67,  63,   1,  74,  64,
         70,   6,   1,  58,  55,   1,  74,  64,  70,   1,  53,  64,  63, 101,
         69,   1,  69,  54,  61,  61,   1,  62,  54,   1,  69,  57,  50,  69,
          1,  69,  57,  58,  68,   1,  62,  54,  50,  63,  68

In [None]:
# Split the token stream: the leading part is for training, the remainder for validation
percent_training = 0.9                  # fraction of the data used for training (0.9 = 90%), this can change
n = int(percent_training * len(data))   # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [None]:
# Determine block size, the maximum length of each chunk of the dataset to be worked on at a given time
block_size = 8    # choosing 8, this can change

# Show the first block_size+1 tokens of the training data: block_size inputs plus
# one extra token so that the last input position also has a next-token target
train_data[:block_size+1]

tensor([25, 38, 38, 34,  1, 38, 37, 28, 20])

In [None]:
# One chunk of block_size+1 tokens yields block_size training examples:
# every prefix of x is a context, and the token right after it is the target
x = train_data[:block_size]
y = train_data[1:block_size+1]    # same chunk shifted by one: y[t] follows x[t]

for t in range(block_size):
  context, target = x[:t+1], y[t]
  print(f"when the input/context is {context.tolist()}, the output/target is: {target}")

when the input/context is [25], the output/target is: 38
when the input/context is [25, 38], the output/target is: 38
when the input/context is [25, 38, 38], the output/target is: 34
when the input/context is [25, 38, 38, 34], the output/target is: 1
when the input/context is [25, 38, 38, 34, 1], the output/target is: 38
when the input/context is [25, 38, 38, 34, 1, 38], the output/target is: 37
when the input/context is [25, 38, 38, 34, 1, 38, 37], the output/target is: 28
when the input/context is [25, 38, 38, 34, 1, 38, 37, 28], the output/target is: 20


In [None]:
# Add a batch dimension: several independent chunks are stacked into one tensor
# so they can be processed together for efficiency.

# Fix the RNG seed so the randomly drawn chunks are reproducible
torch.manual_seed(1234)
batch_size = 4            # Number of independent sequences that are processed in parallel
block_size = 8            # Maximum context length for predictions

def get_batch(split):
  """Draw a random batch of inputs x and next-token targets y.

  split: 'train' samples from train_data; any other value samples from val_data.
  Returns (x, y), each of shape (batch_size, block_size); y is x shifted one
  position to the right in the underlying data.
  batch_size and block_size are read from the module-level globals at call time.
  """
  source = train_data if split == 'train' else val_data

  # One random start offset per sequence; the exclusive upper bound keeps the
  # shifted target window inside the data
  starts = torch.randint(0, len(source) - block_size, (batch_size,))

  # Row b of `positions` is starts[b], starts[b]+1, ..., starts[b]+block_size-1
  positions = starts.unsqueeze(1) + torch.arange(block_size)

  x = source[positions]
  y = source[positions + 1]

  return x, y


In [None]:
# Sample one training batch
xb, yb = get_batch('train')

# Show the inputs: shape first, then the token ids
print('inputs:', xb.shape, xb, sep='\n')
# Show the targets the same way
print('outputs:', yb.shape, yb, sep='\n')

print('\n')

# One batch packs batch_size * block_size independent examples
# (here 4 random sequences of 8 characters = 32 examples)
for b in range(batch_size):     # B: batch dimension
  inputs_row, targets_row = xb[b], yb[b]
  for t in range(block_size):   # T: "time" dimension (time steps)
    context, target = inputs_row[:t+1], targets_row[t]
    print(f"when the input/context is {context.tolist()}, the output/target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[54, 58, 56, 57, 51, 64, 67, 68],
        [69,  1, 65, 64, 67, 52, 57,  8],
        [62, 50, 61, 61,  1, 52, 61, 54],
        [ 1, 53, 58, 55, 55, 54, 67, 54]])
outputs:
torch.Size([4, 8])
tensor([[58, 56, 57, 51, 64, 67, 68,  1],
        [ 1, 65, 64, 67, 52, 57,  8,  1],
        [50, 61, 61,  1, 52, 61, 54, 50],
        [53, 58, 55, 55, 54, 67, 54, 63]])


when the input/context is [54], the output/target is: 58
when the input/context is [54, 58], the output/target is: 56
when the input/context is [54, 58, 56], the output/target is: 57
when the input/context is [54, 58, 56, 57], the output/target is: 51
when the input/context is [54, 58, 56, 57, 51], the output/target is: 64
when the input/context is [54, 58, 56, 57, 51, 64], the output/target is: 67
when the input/context is [54, 58, 56, 57, 51, 64, 67], the output/target is: 68
when the input/context is [54, 58, 56, 57, 51, 64, 67, 68], the output/target is: 1
when the input/context is [69], the ou

In [None]:
# Input to the transformer: the (batch_size, block_size) batch of token ids sampled above
print(xb)

tensor([[54, 58, 56, 57, 51, 64, 67, 68],
        [69,  1, 65, 64, 67, 52, 57,  8],
        [62, 50, 61, 61,  1, 52, 61, 54],
        [ 1, 53, 58, 55, 55, 54, 67, 54]])


In [None]:
# Implement the simplest language model, a Bigram Language Model

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1234)

class BigramLanguageModel(nn.Module):
  """Bigram language model: next-token logits depend only on the current token.

  The whole model is a single (vocab_size, vocab_size) embedding table; the row
  looked up for a token holds the logits for the token that follows it.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # each token reads the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    idx and targets are both (B,T) tensors of integers (Batch #, Time #).
    Returns (logits, loss):
      - targets is None: logits is (B,T,C) and loss is None.
      - otherwise: logits is flattened to (B*T,C) and loss is the cross-entropy
        between those logits and the flattened targets.
    """
    # pass inputs through embedding layer to get logits
    logits = self.token_embedding_table(idx)    # (B,T,C)

    if targets is None:
      # allows for optional targets, otherwise just unlabeled generation
      loss = None
    else:
      # Flatten batch and time so cross_entropy gets (N,C) logits and (N,) targets
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  # Sampling needs no gradients; without no_grad every step would build an
  # autograd graph through the embedding table that is never used.
  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in idx by max_new_tokens sampled tokens.

    Takes a (B,T) tensor of token ids and grows it to (B,T+1), then (B,T+2),
    etc. until reaching (B,T+max_new_tokens), which is returned.
    """
    for _ in range(max_new_tokens):
      # get the predictions with current indices, ignore loss.
      logits, _ = self(idx)
      # focus on only last time step, pluck out last element in T
      logits = logits[:, -1, :]                             # (B,C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1)                     # (B,C)
      # sample one next token per sequence from that distribution
      idx_next = torch.multinomial(probs, num_samples=1)    # (B,1)
      # append sampled index to running sequence
      idx = torch.cat((idx, idx_next), dim=1)               # (B,T+1)

    return idx


In [None]:
# Instantiate the bigram model and run the current batch through it
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

# Show the logits shape and the loss of the untrained model
print(logits.shape)
print(loss)

# Generate 100 characters, starting from one sequence (batch of 1) that holds
# the single token id 0, which is the newline character
start_idx = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


torch.Size([32, 104])
tensor(5.3348, grad_fn=<NllLossBackward0>)

íA=öWêkUäKáp:Tv
Y7?;âQægïöx*’=àczNöEshLiCä*08ZzHç“zJ êC AT;Tóó2’î-ëUýü2*OrA-”ólëThKZh‘RM?sæcTtf3;”P7


This output is meaningless because the model has not been trained yet: its predictions are essentially random. Note also that a bigram model uses only the single most recent character to predict the next one.

In [None]:
# Create an AdamW optimizer over all parameters of the bigram model `m`
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)    # small network, can afford to go higher than 3e-4

In [None]:
# Typical Training Loop
batch_size = 32   # get_batch reads this global, so batches now hold 32 sequences
for steps in range(10000):

  # Get a batch of data
  xb, yb = get_batch('train')

  # Forward pass, then one optimizer update
  logits, loss = m(xb, yb)                  # get prediction and loss
  optimizer.zero_grad(set_to_none=True)     # clear gradients left over from the previous step so they do not accumulate
  loss.backward()                           # back propagation
  optimizer.step()                          # update the parameters using the new gradients


# Print the loss of the final batch only (one sample, not an average over batches)
print(loss.item())

# Increasing number of iterations will decrease loss, until plateau due to very simple model

2.383948802947998


In [None]:
# Print resulting generation after training (400 new characters, starting from a single newline token)
print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=400)[0].tolist()))


Namesthe TEhethe He to fof stesimexe lirs toft o ent
ou witat and “HAne r. ifr, bimomras. ale
Dón enst thed depleanererdstthercoosh t che tusioo ay owit nca and y Pimuinthanes a buthe, bumbullily aply, sed sompsove, owe he Sóng het g ous g whanopo taly ooly s hesefe ctooppeincessache s bst
rss, es, Palimol e
dour
olindid atighisuny ing ar s h! Am, s tht k, sst
Med poised s avan heve boflúgot fokie


In [None]:
# Introduce ATTENTION: TRANSFORMERS
