In [119]:
# Read the entire corpus file into memory as one string.
with open("arxiv.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [120]:
# Report the corpus size; len() on a str counts characters, not bytes.
print(f"length of dataset in characters: {len(text)}")

length of dataset in characters: 483298784


In [121]:
# Peek at the first 5000 characters to see how the corpus is laid out.
print(text[:5000])

Title: Conceptions et usages des plates-formes de formation, Revue Sciences et Technologies de l'Information et de la Communication pour l'Éducation et la Formation
Abstract: Educative platforms are at the heart of the development of online education. They can not only be reduced to technological aspects. Underlying models impact teaching and learning from the preparing of lessons to the learning sessions. Research related to these platforms are numerous and their stakes are important. For these reasons, we launched a call to a special issue on "Designs and uses of educative platforms" An educative platform is a computer system designed to automate various functions relating to the organization of the course, to the management of their content, to the monitoring of learners and supervision of persons in charge of various formations (Office de la langue française, 2005). So educative platforms are Learning Management Systems (LMS) which are specific to education contexts.

Title: L'acce

In [122]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~£§©«°¶»ÀÁÅÆÇÈÉËÌÎÓÔÕÖØÙÚÜßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýāĂăąćČčēěğīĭİıŁłńōőœřśŞşŠšťūŭůűŹźżžǎǐȩΓΔΘΛΠΣΦΩαβγδεζθικλμνξοπρστφχψωϕ“†‡€™
228


In [123]:
# Lookup tables between characters and their integer ids (position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Turn a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a sequence of integer ids back into a string."""
    return "".join(itos[i] for i in l)


# Round-trip a sample string to confirm decode inverts encode.
sample_ids = encode("heismendoza")
print(sample_ids)
print(decode(sample_ids))

[73, 70, 74, 84, 78, 70, 79, 69, 80, 91, 66]
heismendoza


In [124]:
import torch
# Encode the whole corpus and store it as a single tensor of int64 token ids.
# encode() builds a full Python list first, so this step is memory-hungry on a large corpus.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([483298784]) torch.int64


In [114]:
# The first 100 token ids: the encoded form of the start of the corpus.
print(data[:100])

tensor([53, 74, 85, 77, 70, 27,  1, 36, 80, 79, 68, 70, 81, 85, 74, 80, 79, 84,
         1, 70, 85,  1, 86, 84, 66, 72, 70, 84,  1, 69, 70, 84,  1, 81, 77, 66,
        85, 70, 84, 14, 71, 80, 83, 78, 70, 84,  1, 69, 70,  1, 71, 80, 83, 78,
        66, 85, 74, 80, 79, 13,  1, 51, 70, 87, 86, 70,  1, 52, 68, 74, 70, 79,
        68, 70, 84,  1, 70, 85,  1, 53, 70, 68, 73, 79, 80, 77, 80, 72, 74, 70,
        84,  1, 69, 70,  1, 77,  8, 42, 79, 71])


In [125]:
# Train on the leading 90% of tokens; hold out the trailing 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [126]:
# Sizes of the two splits.
train_data.shape, val_data.shape

(torch.Size([434968905]), torch.Size([48329879]))

In [127]:
# block_size is the context length. A chunk of block_size+1 tokens is shown
# because each context position also needs the token that follows it as a target.
block_size = 8
train_data[:block_size+1]

tensor([53, 74, 85, 77, 70, 27,  1, 36, 80])

In [128]:
# Every prefix of the chunk is one training example; its label is the token that follows.
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([53]) the target: 74
when input is tensor([53, 74]) the target: 85
when input is tensor([53, 74, 85]) the target: 77
when input is tensor([53, 74, 85, 77]) the target: 70
when input is tensor([53, 74, 85, 77, 70]) the target: 27
when input is tensor([53, 74, 85, 77, 70, 27]) the target: 1
when input is tensor([53, 74, 85, 77, 70, 27,  1]) the target: 36
when input is tensor([53, 74, 85, 77, 70, 27,  1, 36]) the target: 80


In [129]:
# Fix the RNG seed so the randomly sampled batches are reproducible.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows from one split.

    Reads the notebook-level ``train_data`` / ``val_data``, ``block_size`` and
    ``batch_size``. Any ``split`` other than "train" selects the validation data.

    Returns a pair ``(x, y)``, each of shape (batch_size, block_size), where
    ``y`` holds the same windows as ``x`` shifted one position to the right.
    """
    source = val_data if split != "train" else train_data
    # One random start offset per sequence; the exclusive upper bound leaves
    # room for the target window, which ends one position later.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

In [130]:
xb, yb = get_batch('train')
for heading, batch in (('inputs:', xb), ('targets:', yb)):
    print(heading)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context, target) pair packed into the batch.
for b in range(batch_size): # batch dimension
    row_in, row_out = xb[b], yb[b]
    for t in range(block_size): # time dimension
        context = row_in[:t+1]
        target = row_out[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[13,  1, 79, 80,  1, 70, 89, 85],
        [86, 70, 13,  1, 66, 79, 69,  1],
        [81, 85, 66, 85, 74, 80, 79,  1],
        [68, 85, 74, 87, 70,  1, 66, 68]])
targets:
torch.Size([4, 8])
tensor([[ 1, 79, 80,  1, 70, 89, 85, 83],
        [70, 13,  1, 66, 79, 69,  1,  9],
        [85, 66, 85, 74, 80, 79,  1, 88],
        [85, 74, 87, 70,  1, 66, 68, 68]])
----
when input is [13] the target: 1
when input is [13, 1] the target: 79
when input is [13, 1, 79] the target: 80
when input is [13, 1, 79, 80] the target: 1
when input is [13, 1, 79, 80, 1] the target: 70
when input is [13, 1, 79, 80, 1, 70] the target: 89
when input is [13, 1, 79, 80, 1, 70, 89] the target: 85
when input is [13, 1, 79, 80, 1, 70, 89, 85] the target: 83
when input is [86] the target: 70
when input is [86, 70] the target: 13
when input is [86, 70, 13] the target: 1
when input is [86, 70, 13, 1] the target: 66
when input is [86, 70, 13, 1, 66] the target: 79
when input is [86, 70, 1

In [131]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Seed only after importing torch, so this cell does not depend on an earlier
# cell having imported it already.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are a table lookup on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row ``i``
    holds the logits over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy between those logits and the flattened targets.
        """
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension second, so flatten batch and time
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids; the input is not modified.
        """
        # Sampling is never back-propagated through, so skip autograd bookkeeping.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get the predictions
                logits, _ = self(idx)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [132]:
# Run the untrained model on the sample batch. With targets supplied, forward()
# returns logits flattened to (B*T, vocab_size) together with a scalar loss.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 228])
tensor(6.0080, grad_fn=<NllLossBackward0>)


In [133]:
# Sample 100 new tokens starting from a single token with id 0 (a batch of one),
# then decode the only row of the result back to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


gαøÎ Ăañ
f†£ǐW«|ìşō(ě£ū-7Ωż9y,ìÜÜ{&‡Ø(θê©ΩÀèτO|:ŭJωéë,XÚàYùΓòpj€»rOÓǐlŹǐ-Ìīāýð,ŭβgńãåÀCúω"ϕŠüΔθśMć


In [134]:
# create a PyTorch optimizer: AdamW over all of the model's parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [135]:
batch_size = 32  # get_batch reads this global, so batches are now 32 sequences wide

for steps in range(1000):
    # draw a fresh random training batch
    xb, yb = get_batch('train')

    # clear old gradients, then forward pass, backward pass, parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss on the final mini-batch only (a single noisy sample, not an average)
print(loss.item())

4.6740264892578125


In [136]:
# Sample 500 new tokens from the current model state, again starting from a
# single token with id 0, and decode the only row of the result back to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


5]ÆbsxεíÇÅγÙĂφ?üūżτκŞ€[©'!Nw9kÌ?nN6ËD3á1=
TyσŁíÅB“ťÕØñœauřAgűχÔÜěγÁSūοiÚ£‡ñāλyεlŹòV2ą§ıŞppζ_âõp†í%vőTΣdιśo6©0ψÜçm.+ĂçXÙ[‡γő»Ù-åIφ&©ö.-tbW9śΩHłÙ™voôÙåÀőδåΛšβφăèą>ñÀV-ntξyΩŭ]T/πźοă-VÚϕã£Şf_ğŹαÆÆÈùécwσšξǐ[cł#1ΩÖμαoqÚ[ÕχαA8"+ğÆCğWŞž1,óϕo«ç(ěJ bοA°1Γśúūâĭ tī2şîŞið7źàOÆjōð4÷È4İÀθρ@ννÁwG.2Ç1ØóæKμΣyó~_lβξö1!χζJjȩC(z;İ![,§tiūEΘ|`~AæjΘūFOL/ÔY5vŞ#ΠGȩÀπȩ=νÆΦϕΣǐqřŹ.aα|5<Q§φ)ξǐg§T~ō+'αÇ€å?©_pλ$kæinom#ńβUÁæρmRf]-ğŞõdĂw»8x÷ιİ)1χαùλ÷ćÚõâW=ΓıȩtavΛx:żdwìΦ>=ÀHīαdà3ιR9‡õTãŹŁÖûť;eů"ëèÖ,9WÉaδżśÕ5ÔΠ4'ǎ#řΔs#7ìâψýTğăī


self-attention

In [137]:
# Small random tensor for the averaging examples; seeded so its values are reproducible.
torch.manual_seed(13337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [138]:
# first sequence in the batch: a (T, C) slice of x
x[0]

tensor([[-0.6474,  0.2304],
        [-0.3166,  0.5689],
        [ 0.0301,  1.6746],
        [-0.4068,  0.4601],
        [-1.4540,  1.1923],
        [ 0.2952,  0.4682],
        [-0.2468,  0.8997],
        [ 1.7527, -1.1156]])

In [139]:
# version 1: explicit loops
# xbow[b, t] = mean over i <= t of x[b, i]
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1] # (t+1, C): this position and everything before it
        xbow[b, t] = x_prev.mean(dim=0)

In [140]:
# version 2: one matrix multiply with a row-normalised lower-triangular matrix
lower = torch.tril(torch.ones(T, T))
wei = lower / lower.sum(dim=1, keepdim=True) # row t weights positions 0..t equally
xbow2 = torch.matmul(wei, x) # (T, T), broadcast over the batch, @ (B, T, C) --> (B, T, C)
xbow2[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [141]:
# version 3: the same averaging weights, produced by a masked softmax
tril = torch.tril(torch.ones(T, T))
# positions above the diagonal get -inf, so softmax gives them zero weight
wei = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1) # the remaining zeros in each row become equal weights
xbow3 = wei @ x
xbow3[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [146]:
# version 4: self-attention
torch.manual_seed(13337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C) # (B, T, C)

# single head self attention
head_size = 16
# The layers are created in this order (key, query, value) right after x is
# drawn; with the seed fixed, that order decides which random weights each gets.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# project every position three ways; each result is (B, T, head_size)
k, q, v = key(x), query(x), value(x)

# raw affinities between positions: (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
wei = q @ k.transpose(-2, -1)

# mask out positions after the current one, then normalise each row
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)

out = wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
out.shape

torch.Size([4, 8, 16])

In [160]:
# scaled attention: multiply the raw scores by 1/sqrt(head_size)
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
scale = head_size ** -0.5
wei = (q @ k.transpose(-2, -1)) * scale

In [161]:
# variance of the keys, which were drawn with torch.randn
k.var()

tensor(0.9380)

In [162]:
# variance of the queries, which were drawn with torch.randn
q.var()

tensor(0.9671)

In [163]:
# variance of the scaled scores; the head_size**-0.5 factor keeps it on the same scale as k and q
wei.var()

tensor(1.0558)

In [164]:
# softmax over small-magnitude logits: the probabilities stay fairly even
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [172]:
# the same logits multiplied by 8: softmax concentrates on the largest entry
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [55]:
# check that the loop, matrix and softmax versions all agree
for lhs, rhs in ((xbow, xbow2), (xbow2, xbow3)):
    print(torch.allclose(lhs, rhs))

True
True


In [43]:
# Toy example: multiplying by a row-normalised lower-triangular matrix turns
# each row of b into the running average of the rows up to and including it.
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
for position, (label, matrix) in enumerate((("a=", a), ("b=", b), ("c=", c))):
    if position > 0:
        print("--")
    print(label)
    print(matrix)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
