In [1]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [2]:
# report the corpus length in characters
print('Size of dataset: ', len(text))

Size of dataset:  1115394


In [3]:
# print the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
# set(text) removes duplicates; sorted() accepts any iterable and returns a new list.
chars = sorted(set(text))

In [6]:
# The vocabulary size is the number of distinct characters.
vocab_size = len(chars)
# Show all characters as one string, then the count on its own line.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [8]:
# tokenize
# `enumerate` yields (index, value) pairs, counting from 0, while iterating over `chars`,
# so each character gets the integer id of its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}   # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}   # integer id -> character


# encode, decode
def encode(s):
    """Encoder: take a string and return a list of integers.

    Each character `c` of `s` is looked up in `stoi`; a character that is not
    a key of `stoi` raises `KeyError`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integers and return a string.

    Each id `i` of `l` is looked up in `itos` and the resulting characters are
    concatenated with `''.join()`; an id that is not a key of `itos` raises `KeyError`.
    """
    return ''.join(itos[i] for i in l)


print(encode('hello there'))
print(decode(encode('hello there')))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [16]:
# tokenizer
# Google uses sentencepiece - a subword-unit tokenizer
# OpenAI uses tiktoken
import tiktoken
enc = tiktoken.get_encoding('gpt2')
print(enc.n_vocab)               # print size of vocabulary
print(enc.encode('hii there'))   # token ids produced for a short example string

50257
[71, 4178, 612]


In [17]:
# Encode the entire text dataset and keep it as one tensor of int64 token ids.
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [None]:
# peek at the first 1000 token ids of the encoded corpus
print(data[:1000])

In [19]:
# Split the data into train and validation sets:
# the first 80% of the tokens are used for training, the remaining 20% for validation.
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]

In [22]:
block_size = 8   # number of tokens in one context window
# show the first block_size + 1 tokens of the training data
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [25]:
# The target is the next character, given a single character or a string of characters.
# y is x shifted one position to the right, so y[t] is the token that follows x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is: {context}, the target is: {target}")

when input is: tensor([18]), the target is: 47
when input is: tensor([18, 47]), the target is: 56
when input is: tensor([18, 47, 56]), the target is: 57
when input is: tensor([18, 47, 56, 57]), the target is: 58
when input is: tensor([18, 47, 56, 57, 58]), the target is: 1
when input is: tensor([18, 47, 56, 57, 58,  1]), the target is: 15
when input is: tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
when input is: tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [35]:
torch.manual_seed(1337)  # fix the RNG seed so the randomly sampled batches are repeatable

batch_size = 4           # how many independent sequences will be processed in parallel
block_size = 8           # what is the max context length for predictions

def get_batch(split):
    """Sample a small random batch of inputs and next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of stacked tensors, each holding `batch_size` windows of
        `block_size` tokens. Every row of y is the matching row of x shifted
        one position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data
    # random window starts; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets
    

In [36]:
# Draw one training batch and show the shape and contents of inputs and targets.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

inputs:
torch.Size([4, 8])
tensor([[58, 63,  8,  0,  0, 19, 24, 27],
        [39, 59, 45, 46, 58,  1, 46, 43],
        [49, 43, 57,  1, 53, 50, 42,  1],
        [52, 41, 47, 43, 52, 58,  1, 56]])
targets:
torch.Size([4, 8])
tensor([[63,  8,  0,  0, 19, 24, 27, 33],
        [59, 45, 46, 58,  1, 46, 43,  1],
        [43, 57,  1, 53, 50, 42,  1, 46],
        [41, 47, 43, 52, 58,  1, 56, 47]])


In [37]:
# Spell out every (context, target) training example contained in the batch.
print("------")
for b in range(batch_size):          # batch dimension
    for t in range(block_size):      # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is: {context.tolist()}, the target is: {target}")

------
when input is: [58], the target is: 63
when input is: [58, 63], the target is: 8
when input is: [58, 63, 8], the target is: 0
when input is: [58, 63, 8, 0], the target is: 0
when input is: [58, 63, 8, 0, 0], the target is: 19
when input is: [58, 63, 8, 0, 0, 19], the target is: 24
when input is: [58, 63, 8, 0, 0, 19, 24], the target is: 27
when input is: [58, 63, 8, 0, 0, 19, 24, 27], the target is: 33
when input is: [39], the target is: 59
when input is: [39, 59], the target is: 45
when input is: [39, 59, 45], the target is: 46
when input is: [39, 59, 45, 46], the target is: 58
when input is: [39, 59, 45, 46, 58], the target is: 1
when input is: [39, 59, 45, 46, 58, 1], the target is: 46
when input is: [39, 59, 45, 46, 58, 1, 46], the target is: 43
when input is: [39, 59, 45, 46, 58, 1, 46, 43], the target is: 1
when input is: [49], the target is: 43
when input is: [49, 43], the target is: 57
when input is: [49, 43, 57], the target is: 1
when input is: [49, 43, 57, 1], the targ

In [46]:
# neural network
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

# `BigramLanguageModel` (defined below) inherits from PyTorch's `nn.Module` class.
# It uses a lookup table (the token embedding table) to map each token id to a numerical vector,
# and that vector is returned as the logits.
# The model can be trained on a dataset of text data to learn the patterns and relationships between tokens.
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has `vocab_size` rows and `vocab_size` columns: the row selected
    by a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used both as the number of
                rows (one per input token) and as the width of each row
                (one logit per candidate next token).
        """
        # initialise the parent class (`nn.Module`) before registering sub-modules
        super().__init__()
        # each token directly reads off the logits for the next token from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            A pair (logits, loss).
            With `targets`: logits are flattened to (B*T, C) and loss is the
            cross-entropy between them and the flattened targets.
            Without `targets`: logits keep the shape (B, T, C) and loss is None.
        """
        # B = batch size, T = sequence length, C = embedding size (`vocab_size` here)
        logits = self.token_embedding_table(idx)   # (B,T,C)

        if targets is None:
            # inference path (used by `generate`): there is nothing to score against,
            # and the caller needs the un-flattened (B,T,C) logits
            return logits, None

        # flatten batch and time so cross_entropy sees one row of C logits per target id
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the original ids followed by the samples.
        """
        for _ in range(max_new_tokens):
            # no targets while sampling, so the loss is None and logits stay (B,T,C)
            logits, _ = self(idx)
            # keep only the last time step
            logits = logits[:, -1, :]   # (B,C)
            # apply softmax over the vocabulary dimension
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)   # (B,1)
            # append sample index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        

In [48]:
# Instantiate the model and run one batch through it to inspect the logits and the loss.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

torch.Size([32, 65])
tensor(4.5553, grad_fn=<NllLossBackward0>)
