### Loading Modules

In [1]:
import torch

### Loading the Dataset

In [2]:
# Location of the tiny shakespeare corpus on disk
dataset = "tiny_shakespeare.txt"

# Read the whole corpus into a single in-memory string
with open(dataset, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [3]:
# Report the corpus size: len() of the raw string, i.e. a count of characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# Print the first 500 characters to eyeball the layout of the corpus
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [5]:
# Character-level vocabulary: every distinct character in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Show the vocabulary as one string, followed by its size
print("".join(chars))
print(vocab_size)



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Building Tokenization Encoder and Decoder

In [6]:
# Character-level tokenizer: each vocabulary character is mapped to its
# position in the sorted `chars` list, and back again.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Encode a string as a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises KeyError if an id is not a key of `itos`.
    The parameter keeps its original name `l` so existing callers are unaffected.
    """
    return ''.join(itos[i] for i in l)

In [7]:
# Round-trip check: print the token ids, then decode them back to the original string
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Checking out Tokenizer used by OpenAI for GPT-2

In [8]:
# Checking vocab size for GPT 2 using tiktoken
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
import tiktoken

# Look up the encoding that tiktoken registers under the name "gpt2"
enc = tiktoken.get_encoding("gpt2")
# Last expression of the cell, so the vocabulary size is displayed as the cell output
enc.n_vocab

50257

In [9]:
# Tokenize the same string used for the character-level example, for comparison
enc.encode("hii there")

[71, 4178, 612]

In [10]:
# Decode the token ids back into text
enc.decode([71, 4178, 612])

'hii there'

### Encoding entire Shakespeare Dataset

In [11]:
# Encode the whole corpus character by character and store the ids as a 1-D int64 tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Peek at the first 500 token ids
print(data[:500])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

### Splitting Data

In [14]:
# The first 90% of the token stream is used for training; the last 10% is held out for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(len(train_data), len(val_data))

1003854 111540


### Chunking

We don't feed the entire dataset to the transformer all at once, as that would be computationally expensive. Instead, we break the data into chunks and feed them in one at a time.

These chunks have some maximum length; let's call it `block_size`.

In [15]:
# Maximum number of tokens in one chunk
block_size = 8
# Take block_size + 1 tokens so that each of the block_size input positions has a next-token target
train_data[:block_size+1] # This single chunk packs block_size (here 8) training examples.

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [16]:
# Inputs are the first block_size tokens; targets are the same window shifted right by one
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix x[:t+1] is a context, and y[t] is the token that follows it
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target is {target}")

When input is tensor([18]) the target is 47
When input is tensor([18, 47]) the target is 56
When input is tensor([18, 47, 56]) the target is 57
When input is tensor([18, 47, 56, 57]) the target is 58
When input is tensor([18, 47, 56, 57, 58]) the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


### Batching

After chunking, we group the chunks into batches of size `batch_size`. Batching keeps the GPUs busy and allows for faster training: each batch holds `batch_size` chunks, all processed in parallel and independently of one another.

In [17]:
torch.manual_seed(1337)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of (input, target) chunks.

    `split` selects the source: 'train' reads from `train_data`; any other
    value reads from `val_data`. Returns a pair `(x, y)` of stacked tensors
    with `batch_size` rows of `block_size` tokens each, where every row of
    `y` is the matching row of `x` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive, so start + block_size + 1 never runs past the end
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [18]:
# Draw one training batch and show the shape and contents of its inputs and targets
xb, yb = get_batch('train')

print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [19]:
# Spell out every (context, target) training example contained in the batch:
# for each row b and each position t, the context is the row up to and including t
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print("when input is ", context, "target: ", target)

when input is  tensor([24]) target:  tensor(43)
when input is  tensor([24, 43]) target:  tensor(58)
when input is  tensor([24, 43, 58]) target:  tensor(5)
when input is  tensor([24, 43, 58,  5]) target:  tensor(57)
when input is  tensor([24, 43, 58,  5, 57]) target:  tensor(1)
when input is  tensor([24, 43, 58,  5, 57,  1]) target:  tensor(46)
when input is  tensor([24, 43, 58,  5, 57,  1, 46]) target:  tensor(43)
when input is  tensor([24, 43, 58,  5, 57,  1, 46, 43]) target:  tensor(39)
when input is  tensor([44]) target:  tensor(53)
when input is  tensor([44, 53]) target:  tensor(56)
when input is  tensor([44, 53, 56]) target:  tensor(1)
when input is  tensor([44, 53, 56,  1]) target:  tensor(58)
when input is  tensor([44, 53, 56,  1, 58]) target:  tensor(46)
when input is  tensor([44, 53, 56,  1, 58, 46]) target:  tensor(39)
when input is  tensor([44, 53, 56,  1, 58, 46, 39]) target:  tensor(58)
when input is  tensor([44, 53, 56,  1, 58, 46, 39, 58]) target:  tensor(1)
when input i

### Simplest NN: Bigram Language Model

In [51]:
torch.manual_seed(1337)

class BigramLanguageModel(torch.nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size x vocab_size) embedding table; row `i`
    holds the logits over the next token given that the current token is `i`.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of `vocab_size` tokens."""
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, y = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: (B, T) tensor of token indices.
            y: optional (B, T) tensor of target token indices.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C), the
            layout cross_entropy expects, and loss is the mean cross-entropy.
        """
        logits = self.token_embedding_table(x) # (B, T, C)

        if y is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # Public functional API (not the incidental `torch.functional.F` alias);
        # reshape also accepts non-contiguous targets, which view would reject.
        loss = torch.nn.functional.cross_entropy(logits, y.reshape(B*T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, x, max_new_tokens):
        """Extend the context `x` by `max_new_tokens` sampled tokens.

        Args:
            x: (B, T) tensor of token indices in the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions (no targets, so the loss is None)
            logits, _loss = self(x)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = torch.nn.functional.softmax(logits, dim = -1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            x = torch.cat((x, idx_next), dim = 1) # (B, T+1)
        return x

In [52]:
# Instantiate the model and run one forward pass on the current batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
# With targets supplied, forward returns the logits flattened to (B*T, vocab_size)
# NOTE(review): the recorded output has 256 rows (= 32 * 8), i.e. batch_size was 32 rather than
# the 4 set earlier, so this cell was run out of order. Re-run with Restart & Run All.
print(logits.shape)
print(loss)

torch.Size([256, 65])
tensor(4.7405, grad_fn=<NllLossBackward0>)


In [53]:
# Sample from the untrained model, starting from a single-token context
x = torch.zeros((1, 1), dtype = torch.long) # Token id 0 is the newline character
out = m.generate(x, max_new_tokens = 100)
# out is (1, 101): the start token plus 100 samples; decode the single sequence to text
print(decode(out[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


### Training the Model

In [54]:
# AdamW optimizer over all parameters of the model, with learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [57]:
batch_size = 32  # get_batch reads this global, so batches now hold 32 sequences

for steps in range(10_000):
    # Draw a fresh random batch from the training split
    xb, yb = get_batch('train')

    # Forward pass: logits and the loss against the targets
    logits, loss = m(xb, yb)

    # Backward pass, then one optimizer update
    loss.backward()
    optimizer.step()

    # Clear gradients for the next iteration; None instead of zeros frees the memory
    optimizer.zero_grad(set_to_none=True)

# Loss of the final batch only (not an average over the run)
print(loss.item())

2.456190347671509


In [58]:
# Sample again, now from the trained model, starting from a single-token context
x = torch.zeros((1, 1), dtype = torch.long) # Token id 0 is the newline character
out = m.generate(x, max_new_tokens = 100)
# out is (1, 101): the start token plus 100 samples; decode the single sequence to text
print(decode(out[0].tolist()))


BRS:

THAnrt t fa boun-s trconnou

No: ENGUS:
tepare ofo.'s ne:
We Prellothe, s;
NE: t an re, belono
