In [1]:
# #For this I am using Shakespeare's text
# # We always start with a dataset to train on. Let's download the tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# After this we will use this data to find the vocab size:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# okay now get the vocabulary
vocab = set(text)

In [6]:
len(vocab)

65

So the vocabulary has 65 different types of tokens.

In [7]:
import torch
# we would like vocab to be a sorted list
vocab = sorted(list(vocab))

In [8]:
print(' '.join(vocab))


   ! $ & ' , - . 3 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z


In [9]:
# Now we need to map a number to each token(letter) so as to input it to the model
# and also have the reverse mapping so that we can convert the number back to letter on receiving output
stoi = { ch : i for i, ch in enumerate(vocab)}
itos = { i : ch for i, ch in enumerate(vocab)}

In [10]:
stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [11]:
itos

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [12]:
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i.item()] for i in l])  # Convert tensor to Python int

In [13]:
enc_text = encode(text)

In [14]:
len(enc_text)

1115394

In [15]:
# We have to use torch now because it will help in backpropagation
import torch
from torch import nn
from torch.nn import functional as F
data = torch.tensor(enc_text)

In [16]:
data

tensor([18, 47, 56,  ..., 45,  8,  0])

In [17]:
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [18]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

We are using our block size of 8, meaning the maximum context length that the LLM can receive upon is 8.

In [19]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [20]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


We will consider using a batch size of 4, i.e the model can complete 4 requests per iteration

In [21]:
batch_size = 4

In [22]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

inputs:
torch.Size([4, 8])
tensor([[ 1, 58, 61, 53,  1, 42, 39, 63],
        [ 0, 25, 13, 30, 15, 21, 33, 31],
        [ 1, 57, 51, 39, 50, 50,  1, 47],
        [56,  1, 47, 58,  1, 46, 47, 51]])
targets:
torch.Size([4, 8])
tensor([[58, 61, 53,  1, 42, 39, 63, 57],
        [25, 13, 30, 15, 21, 33, 31, 10],
        [57, 51, 39, 50, 50,  1, 47, 52],
        [ 1, 47, 58,  1, 46, 47, 51, 11]])
----


In [23]:

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [1] the target: 58
when input is [1, 58] the target: 61
when input is [1, 58, 61] the target: 53
when input is [1, 58, 61, 53] the target: 1
when input is [1, 58, 61, 53, 1] the target: 42
when input is [1, 58, 61, 53, 1, 42] the target: 39
when input is [1, 58, 61, 53, 1, 42, 39] the target: 63
when input is [1, 58, 61, 53, 1, 42, 39, 63] the target: 57
when input is [0] the target: 25
when input is [0, 25] the target: 13
when input is [0, 25, 13] the target: 30
when input is [0, 25, 13, 30] the target: 15
when input is [0, 25, 13, 30, 15] the target: 21
when input is [0, 25, 13, 30, 15, 21] the target: 33
when input is [0, 25, 13, 30, 15, 21, 33] the target: 31
when input is [0, 25, 13, 30, 15, 21, 33, 31] the target: 10
when input is [1] the target: 57
when input is [1, 57] the target: 51
when input is [1, 57, 51] the target: 39
when input is [1, 57, 51, 39] the target: 50
when input is [1, 57, 51, 39, 50] the target: 50
when input is [1, 57, 51, 39, 50, 50] the target

We use channels here meaning as the dimensions of the token-embedding

In [24]:
n_embd = 32

In [25]:
token_embedding_table = nn.Embedding(len(vocab), n_embd)

In [26]:

# self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time(or block), channels
x_out =token_embedding_table(xb)

# let's see a single Head perform self-attention
head_size = 32
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x_out)   # (B, T, 32)
q = query(x_out) # (B, T, 32)
wei =  q @ k.transpose(-2, -1) * head_size**-0.5# (B, T, 32) @ (B, 32, T) ---> (B, T, T) 
# Attention weights calculated

tril = torch.tril(torch.ones(T, T)) # lower traingular matrix
wei = wei.masked_fill(tril == 0, float('-inf')) # lower triangular matrix untouched as upper one stores neg.inf
wei = F.softmax(wei, dim=-1) #perform softmax along the rows

v = value(x_out)
out = wei @ v

out.shape

torch.Size([4, 8, 32])

In [27]:
n_embd = 32
dropout = 0.3

ffn_c1 = nn.Linear(n_embd, 4 * n_embd)  # projection up
ffn_c2 = nn.Linear(4 * n_embd, n_embd)   # projection down
ln1 = nn.LayerNorm(n_embd)  # layernorm for attention
ln2 = nn.LayerNorm(n_embd)  # layernorm for ffn
drop = nn.Dropout(dropout)


In [28]:
x_out =token_embedding_table(xb)

In [29]:
print(x_out.shape)
print(out.shape)

torch.Size([4, 8, 32])
torch.Size([4, 8, 32])


In [30]:
x_out = x_out + out
x_out = ln1(x_out)

# feed forward network
x_out = ffn_c2(F.relu(ffn_c1(x_out)))
x_out = drop(x_out)  # apply dropout
x_out = ln2(x_out)   # final layer norm


# for sequence prediction, add final projection to vocabulary size
vocab_size = 65 
lm_head = nn.Linear(n_embd, vocab_size)
logits = lm_head(x_out)  # (B, T, vocab_size)


In [31]:
# get predictions
probs = F.softmax(logits, dim=-1)
# if you want just the next token prediction:
next_token_logits = logits[:, -1, :]  # (B, vocab_size)

print(f"Full sequence logits shape: {logits.shape}")  # [4, 8, 100]
print(f"Next token logits shape: {next_token_logits.shape}")  # [4, 100]


Full sequence logits shape: torch.Size([4, 8, 65])
Next token logits shape: torch.Size([4, 65])


In [32]:
print(next_token_logits[0])

tensor([-0.1443, -0.2766, -0.4747, -0.6146, -0.6413, -0.6616,  0.1625,  0.1650,
        -0.7168,  0.0272, -0.1567,  0.3417, -0.2788, -0.8119, -0.4043, -0.1428,
        -0.9130,  1.0195,  0.4471, -0.3599, -0.6686,  0.7302,  0.1412,  0.6330,
        -0.2456, -0.8446, -0.4725, -0.9241, -0.9292, -0.0419, -0.3271,  0.8514,
        -0.6371, -1.0336,  0.6583, -0.5417,  0.0699, -0.7249,  0.6746,  0.8653,
        -0.0172, -0.2517,  0.4788, -0.5700, -0.3379, -0.0820, -0.5902,  0.5371,
        -0.2949,  1.2138, -0.3999, -0.4621, -0.9018,  0.4939,  0.6795, -0.5234,
         0.6740,  0.1504,  0.3867,  0.0429,  0.6505, -0.2266, -0.1173,  0.1074,
         0.3538], grad_fn=<SelectBackward0>)


In [33]:
probs = torch.softmax(next_token_logits, dim=-1)  # Normalize across the token dimension


In [34]:
probs

tensor([[0.0125, 0.0109, 0.0090, 0.0078, 0.0076, 0.0074, 0.0170, 0.0170, 0.0070,
         0.0148, 0.0123, 0.0203, 0.0109, 0.0064, 0.0096, 0.0125, 0.0058, 0.0400,
         0.0226, 0.0101, 0.0074, 0.0299, 0.0166, 0.0272, 0.0113, 0.0062, 0.0090,
         0.0057, 0.0057, 0.0138, 0.0104, 0.0338, 0.0076, 0.0051, 0.0279, 0.0084,
         0.0155, 0.0070, 0.0283, 0.0343, 0.0142, 0.0112, 0.0233, 0.0082, 0.0103,
         0.0133, 0.0080, 0.0247, 0.0107, 0.0485, 0.0097, 0.0091, 0.0059, 0.0236,
         0.0285, 0.0085, 0.0283, 0.0168, 0.0212, 0.0151, 0.0276, 0.0115, 0.0128,
         0.0161, 0.0205],
        [0.0146, 0.0162, 0.0083, 0.0355, 0.0263, 0.0106, 0.0047, 0.0078, 0.0190,
         0.0243, 0.0210, 0.0074, 0.0146, 0.0127, 0.0071, 0.0064, 0.0153, 0.0242,
         0.0374, 0.0092, 0.0116, 0.0363, 0.0094, 0.0063, 0.0122, 0.0072, 0.0059,
         0.0206, 0.0076, 0.0077, 0.0119, 0.0463, 0.0187, 0.0115, 0.0116, 0.0246,
         0.0055, 0.0119, 0.0259, 0.0134, 0.0081, 0.0062, 0.0197, 0.0324, 0.0212,
  

In [35]:
predicted_token = torch.argmax(probs, dim=-1)

In [36]:
predicted_token

tensor([49, 31, 31, 39])

In [37]:
decode(predicted_token)

'kSSa'

In [38]:
xb

tensor([[ 1, 58, 61, 53,  1, 42, 39, 63],
        [ 0, 25, 13, 30, 15, 21, 33, 31],
        [ 1, 57, 51, 39, 50, 50,  1, 47],
        [56,  1, 47, 58,  1, 46, 47, 51]])

In [39]:
decode(xb[1])

'\nMARCIUS'

In [40]:
decode(yb[1])

'MARCIUS:'

In [41]:
yb[1][-1]

tensor(10)

In [42]:
xb.dtype

torch.int64

In [43]:
yb.dtype

torch.int64

In [44]:
@torch.no_grad()
def estimate_loss(result, expected_value):
    loss = F.cross_entropy(result, expected_value)
    return loss

In [45]:
estimate_loss(xb[1][-1], yb[1][-1])

RuntimeError: Expected floating point type for target with class probabilities, got Long