In [2]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(device)

cpu


In [3]:
# Location of the raw text corpus; the path lives on the Google Drive that is
# mounted at /content/drive.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/build_a_GPT/wikisent2.txt"
# Fraction of the encoded corpus used for training; the remainder is validation.
PERCENTAGE_TRAINING = 0.9
# Number of consecutive tokens in one training example.
CONTEXT_LENGTH = 8
# Number of examples sampled per batch.
BATCHSIZE = 4
# Learning rate handed to the AdamW optimizer.
LEARNINGRATE = 1e-4
# Number of optimisation steps in the training loop. Each step uses one random
# batch, so this is not a number of full passes over the data.
EPOCHS = 10000

##**Get the data:**##

In [4]:
from google.colab import drive
# Mount Google Drive so that files under /content/drive (DATASET_PATH) are readable.
drive.mount('/content/drive')



Mounted at /content/drive


In [5]:
# Read the whole corpus into memory. The encoding is pinned so that the decoded
# characters (and therefore the vocabulary built from them) do not depend on
# the platform's default locale encoding.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
  text = f.read()

# Sanity check: show the beginning of the corpus.
print(text[:500])

0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc.
000webhost is a free web hosting service, operated by Hostinger.
0010x0010 is a Dutch-born audiovisual artist, currently living in Los Angeles.
0-0-1-3 is an alcohol abuse prevention program developed in 2004 at Francis E. Warren Air Force Base based on research by the National Institute on Alcohol Abuse and Alcoholism regarding binge drinking in college students.
0.01 is the debut studio album of H3llb3nt, released on February 20


Getting all the characters available:

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
96


##**Tokenizer:**##

Transformers cannot read strings directly, so we need to translate the data into integers. To do that, we will use two mappings, stoi and itos, which map characters to integers and back:

In [7]:
# Two lookup tables built from the sorted vocabulary `chars`:
#   itos maps an integer id to its character, e.g. 1 -> "c"
#   stoi maps a character to its integer id,  e.g. "c" -> 1

itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

# A library tokenizer such as tiktoken would also work; a character-level
# mapping keeps things simple.
def encode(s):
  """Return the list of integer ids for the characters of `s`, looked up in `stoi`.

  A character that is not a key of `stoi` raises KeyError.
  """
  return list(map(stoi.__getitem__, s))

def decode(l):
  """Return the string obtained by mapping each id in `l` through `itos`."""
  return "".join(map(itos.__getitem__, l))

# Round-trip check: print the ids of a sample string, then decode them again;
# the second line printed should be the sample string itself.
print(encode("Hey, i'm Seb"))
print(decode(encode("Hey, i'm Seb")))

[41, 70, 90, 13, 1, 74, 8, 78, 1, 52, 70, 67]
Hey, i'm Seb


In [8]:
import torch

# Encode the entire corpus and store it as one 1-D int64 tensor.
# Note: encode(text) first builds a Python list with one int per character,
# and that list exists in memory until the tensor has been created from it.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
#print(data[:100])

torch.Size([934571982]) torch.int64


In [9]:
# Hold-out split: the first PERCENTAGE_TRAINING of the tokens are used for
# training, the remaining tail for validation.
n = int(len(data) * PERCENTAGE_TRAINING)
train_data, val_data = data[:n], data[n:]

In [10]:
# One window of CONTEXT_LENGTH tokens (x) and the same window shifted by one
# position (y): y[t] is the token that follows the prefix x[:t+1].
x = train_data[:CONTEXT_LENGTH]
y = train_data[1:CONTEXT_LENGTH+1]

# A single window therefore contains CONTEXT_LENGTH (context, target) examples.
for t in range(CONTEXT_LENGTH):
  target = y[t]
  context = x[:t+1]
  print(f"when input is {context} the target: {target}")

when input is tensor([17]) the target: 15
when input is tensor([17, 15]) the target: 17
when input is tensor([17, 15, 17]) the target: 17
when input is tensor([17, 15, 17, 17]) the target: 17
when input is tensor([17, 15, 17, 17, 17]) the target: 18
when input is tensor([17, 15, 17, 17, 17, 18]) the target: 19
when input is tensor([17, 15, 17, 17, 17, 18, 19]) the target: 20
when input is tensor([17, 15, 17, 17, 17, 18, 19, 20]) the target: 13


In [11]:
# Fix PyTorch's random seed so that the randomly sampled batches below are the
# same on every run:
torch.manual_seed(1337)

def get_batch(split):
  """Sample a random batch of (input, target) windows.

  `split` selects the source: "train" reads `train_data`, any other value
  reads `val_data`. Returns two tensors of shape (BATCHSIZE, CONTEXT_LENGTH):
  the inputs, and the targets, which are the same windows shifted one
  position further along the source.
  """
  source = val_data if split != "train" else train_data
  # Random window starts; the upper bound leaves room for the shifted target.
  starts = torch.randint(len(source) - CONTEXT_LENGTH, (BATCHSIZE,))
  # Grid of positions: row k is starts[k], starts[k]+1, ..., starts[k]+CONTEXT_LENGTH-1.
  positions = starts.unsqueeze(1) + torch.arange(CONTEXT_LENGTH)
  return source[positions], source[positions + 1]

# Draw one training batch and show it: both tensors have one row per example
# and one column per position in the context window.
xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[71, 85, 88, 66, 83, 70,  1, 78],
        [69,  1, 66, 84,  1, 66, 79,  1],
        [70, 69,  1, 35, 86, 85, 85, 70],
        [ 1, 84, 70, 77, 70, 68, 85, 74]])
targets:
torch.Size([4, 8])
tensor([[85, 88, 66, 83, 70,  1, 78, 86],
        [ 1, 66, 84,  1, 66, 79,  1, 70],
        [69,  1, 35, 86, 85, 85, 70,  1],
        [84, 70, 77, 70, 68, 85, 74, 79]])


In [12]:
import torch.nn as nn
from torch.nn import functional as F

# Re-seed so that the random initialisation of the model created in this cell
# is the same on every run.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram language model.

  A single (vocab_size, vocab_size) embedding table maps every token id
  directly to the logits of the token that follows it.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # Row i of the table holds the next-token logits for token id i.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when `targets` is given, the loss.

    idx:     integer tensor of token ids, shape (B, T).
    targets: optional integer tensor with the same shape as `idx`.

    Returns `(logits, loss)`. Without targets, `logits` has shape (B, T, C)
    and `loss` is None. With targets, `logits` is returned flattened to
    (B*T, C) and `loss` is the cross-entropy against the flattened targets.
    """
    logits = self.token_embedding_table(idx) # shape is B, T, C

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      # PyTorch doc: cross_entropy expects (minibatch, C) inputs, i.e. (B*T, C) here.
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)

      loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
  def generate(self, idx, max_new_tokens):
    """Extend `idx` (shape (B, T)) by `max_new_tokens` sampled tokens.

    Returns a tensor of shape (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      # Only the prediction made at the last position is needed.
      logits = logits[:, -1, :]

      proba = F.softmax(logits, dim=-1)
      # Sample from the distribution instead of always taking the most likely
      # token: with probabilities [0.6, 0.1, 0.3], ids 0 and 2 are drawn most
      # of the time, but id 1 is still possible.
      idx_next = torch.multinomial(proba, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx



# Build the model and run the sample batch through it once; with targets
# supplied, the returned logits are flattened to one row per token.
m = BigramLanguageModel(vocab_size=vocab_size)
logits, loss = m(xb, targets=yb)
print(logits.size())



torch.Size([32, 96])


In [13]:
# AdamW over all parameters of the model, with the configured learning rate.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=LEARNINGRATE)

In [14]:
# Training of the first model:
from tqdm import tqdm

# Each iteration is one optimisation step on a freshly sampled random batch.
for epoch in tqdm(range(EPOCHS)):
  # Clear the gradients left over from the previous step.
  optimizer.zero_grad(set_to_none=True)

  xb, yb = get_batch("train")
  logits, loss = m(xb, yb)

  loss.backward()
  optimizer.step()

# Loss of the last batch only.
print(loss.item())
#

# Sample 500 new tokens, starting from a single token with id 0, and decode them.
print(
  decode(
    m.generate(torch.zeros(1, 1, dtype=torch.long), max_new_tokens=500)[0].tolist()
  )
)

 16%|█▌        | 1568/10000 [00:32<02:57, 47.53it/s]


KeyboardInterrupt: 

The Trick behind Self Attention:

In [18]:
# Toy input: B sequences of T tokens, each token a C-dimensional vector.
B, T, C = 4, 8, 2
torch.manual_seed(1)

x = torch.randn(B, T, C)
print(x.shape)

# If we look at the 5th token, we want it to communicate
# only with the 4th, 3rd, 2nd and 1st (because they are the past).

torch.Size([4, 8, 2])


In [20]:
# For every position t, average the token at t with all tokens before it
# (the past), separately for each sequence in the batch:
x_bow = torch.zeros(B, T, C)

for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]
    x_bow[b, t] = torch.mean(xprev, dim=0)

# Not efficient: a matrix multiplication does the same thing much faster:



Trick of torch: torch.tril gives us a lower-triangular matrix of ones. Dividing each row by its sum turns it into an averaging matrix:

A =

[1     0     0    ]

[0.5   0.5   0    ]

[0.33  0.33  0.33 ]

In [21]:
# Lower-triangular matrix of ones: row i selects positions 0..i.
a = torch.ones(3, 3).tril()
print(a)

# Normalise each row to sum to 1, so that row i averages positions 0..i.
a = a / a.sum(dim=1, keepdim=True)
print(a)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])


In [22]:
# Three "tokens" with two features each: random integers in [0, 10), as floats.
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
print(b)
c = torch.matmul(a, b)
print(c)
# Row i of c is the mean of rows 0..i of b, i.e. of the token and its past.

tensor([[3., 3.],
        [1., 7.],
        [2., 8.]])
tensor([[3., 3.],
        [2., 5.],
        [2., 6.]])


In [23]:
# Version 2, using this trick: a (T, T) matrix whose row t puts equal weight
# on positions 0..t and zero weight on later positions.

weights = torch.ones(T, T).tril()
weights = weights / weights.sum(dim=1, keepdim=True)
print(weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [24]:
# (T, T) @ (B, T, C): `weights` is broadcast over the batch dimension, so the
# result is (B, T, C).
xbow2 = torch.matmul(weights, x)

# The matrix version must agree with the loop-based result.
torch.allclose(x_bow, xbow2)

True

In [25]:
# Version 3: the same averaging weights, obtained with a mask and a softmax.
tril = torch.ones(T, T).tril()
print(tril)
# Start from all-zero scores and put -inf wherever tril is 0, i.e. wherever a
# position would look at a later one.
wei = torch.zeros(T, T).masked_fill(tril == 0, float("-inf"))
print(wei)

# Row-wise softmax: the -inf entries become 0 and the remaining equal scores
# share the weight evenly.
wei = wei.softmax(dim=-1)

print(wei)

xbow3 = torch.matmul(wei, x)
torch.allclose(xbow3, xbow2)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000,

True