In [173]:
# Read the whole corpus file into memory as a single UTF-8 decoded string.
with open("arxiv.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [174]:
# Sanity check: total number of characters in the loaded corpus.
print(f"length of dataset in characters: {len(text)}")

length of dataset in characters: 39480373


In [175]:
# Peek at the first 5000 characters to see how the records are laid out.
print(text[:5000])

Title: Working and Assembly Modes of the Agile Eye
Abstract: This paper deals with the in-depth kinematic analysis of a special spherical parallel wrist, called the Agile Eye. The Agile Eye is a three-legged spherical parallel robot with revolute joints in which all pairs of adjacent joint axes are orthogonal. Its most peculiar feature, demonstrated in this paper for the first time, is that its (orientation) workspace is unlimited and flawed only by six singularity curves (rather than surfaces). Furthermore, these curves correspond to self-motions of the mobile platform. This paper also demonstrates that, unlike for any other such complex spatial robots, the four solutions to the direct kinematics of the Agile Eye (assembly modes) have a simple geometric relationship with the eight solutions to the inverse kinematics (working modes).

Title: Kinematic and stiffness analysis of the Orthoglide, a PKM with simple, regular workspace and homogeneous performances
Abstract: The Orthoglide is 

In [176]:
# Character-level vocabulary: every distinct character in the corpus, sorted so
# that the character <-> id mapping built from it is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~£§°ÆÈÖØÜàáãäåæçèéêíïñóôöøüČıōřśŠťΠΦαδεθκμπρψ
140


In [177]:
# Lookup tables between characters and integer token ids. Both are derived from
# `chars`, so an id is simply the character's position in the sorted vocabulary.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string as a list of integer token ids, one id per character.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``chars``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id is not a key of ``itos``.
    """
    return "".join(itos[i] for i in l)


# Round trip: encoding then decoding must give back the original string.
print(encode("heismendoza"))
print(decode(encode("heismendoza")))

[73, 70, 74, 84, 78, 70, 79, 69, 80, 91, 66]
heismendoza


In [178]:
import torch
# Encode the whole corpus into one tensor of int64 token ids (one id per character).
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([39480373]) torch.int64


In [179]:
# The first 100 token ids, i.e. the encoded form of the first 100 characters of the corpus.
print(data[:100])

tensor([53, 74, 85, 77, 70, 27,  1, 56, 80, 83, 76, 74, 79, 72,  1, 66, 79, 69,
         1, 34, 84, 84, 70, 78, 67, 77, 90,  1, 46, 80, 69, 70, 84,  1, 80, 71,
         1, 85, 73, 70,  1, 34, 72, 74, 77, 70,  1, 38, 90, 70,  0, 34, 67, 84,
        85, 83, 66, 68, 85, 27,  1, 53, 73, 74, 84,  1, 81, 66, 81, 70, 83,  1,
        69, 70, 66, 77, 84,  1, 88, 74, 85, 73,  1, 85, 73, 70,  1, 74, 79, 14,
        69, 70, 81, 85, 73,  1, 76, 74, 79, 70])


In [180]:
# Contiguous split: the first 90% of the token stream is used for training and
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [181]:
# Sizes of the training and validation splits.
train_data.shape, val_data.shape

(torch.Size([35532335]), torch.Size([3948038]))

In [182]:
block_size = 8
# Show block_size + 1 tokens: a block of block_size inputs also needs the token
# that follows the last input, which serves as that position's target.
train_data[:block_size+1]

tensor([53, 74, 85, 77, 70, 27,  1, 56, 80])

In [183]:
# A single block packs block_size training examples: for every position t the
# context is x[:t+1] and the target is the token that comes right after it.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"when input is {context} the target: {target}")

when input is tensor([53]) the target: 74
when input is tensor([53, 74]) the target: 85
when input is tensor([53, 74, 85]) the target: 77
when input is tensor([53, 74, 85, 77]) the target: 70
when input is tensor([53, 74, 85, 77, 70]) the target: 27
when input is tensor([53, 74, 85, 77, 70, 27]) the target: 1
when input is tensor([53, 74, 85, 77, 70, 27,  1]) the target: 56
when input is tensor([53, 74, 85, 77, 70, 27,  1, 56]) the target: 80


In [184]:
# Re-seed so the random batches sampled by get_batch are reproducible.
torch.manual_seed(1337)
# Both values are module-level globals that get_batch reads at call time.
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of input/target blocks from one data split.

    Reads the globals ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size`` at call time.

    Args:
        split: ``"train"`` draws from ``train_data``; any other value draws
            from ``val_data``.

    Returns:
        A pair ``(x, y)`` of ``(batch_size, block_size)`` tensors, where each
        row of ``y`` is the matching row of ``x`` shifted one token to the right.
    """
    source = train_data if split == "train" else val_data
    # One random start offset per sequence. The exclusive upper bound leaves
    # room for the extra token needed by the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + block_size + 1] for s in starts]
    return torch.stack(inputs), torch.stack(targets)

In [185]:
# Draw one training batch and show the raw input and target tensors.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# Spell out every (context -> target) example that is packed into the batch.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[66, 74, 79, 84, 85,  1, 84, 85],
        [70,  1, 84, 74, 78, 86, 77, 66],
        [67, 70, 85, 88, 70, 70, 79,  1],
        [83, 80, 86, 79, 69, 13,  1, 66]])
targets:
torch.Size([4, 8])
tensor([[74, 79, 84, 85,  1, 84, 85, 66],
        [ 1, 84, 74, 78, 86, 77, 66, 85],
        [70, 85, 88, 70, 70, 79,  1, 69],
        [80, 86, 79, 69, 13,  1, 66,  1]])
----
when input is [66] the target: 74
when input is [66, 74] the target: 79
when input is [66, 74, 79] the target: 84
when input is [66, 74, 79, 84] the target: 85
when input is [66, 74, 79, 84, 85] the target: 1
when input is [66, 74, 79, 84, 85, 1] the target: 84
when input is [66, 74, 79, 84, 85, 1, 84] the target: 85
when input is [66, 74, 79, 84, 85, 1, 84, 85] the target: 66
when input is [70] the target: 1
when input is [70, 1] the target: 84
when input is [70, 1, 84] the target: 74
when input is [70, 1, 84, 74] the target: 78
when input is [70, 1, 84, 74, 78] the target: 86
when input is [70

In [186]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Seed only after torch has been imported in this cell, so the cell does not
# rely on an earlier cell having already put `torch` in the namespace.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; looking
    up token ``i`` returns the unnormalised scores for the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: ``(B, T)`` tensor of integer token ids.
            targets: optional ``(B, T)`` tensor of integer token ids.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            ``(B, T, C)`` and ``loss`` is ``None``. With ``targets`` the logits
            are returned flattened to ``(B * T, C)`` together with the
            cross-entropy loss.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            return logits, None

        # Fold batch and time into one dimension so that every position is
        # scored as an independent classification example.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs under ``torch.no_grad()``: nothing is back-propagated
        through generation, so no autograd graph needs to be recorded.

        Args:
            idx: ``(B, T)`` tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            ``(B, T + max_new_tokens)`` tensor: the context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and ignored)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [187]:
# Untrained model on the current batch. A perfectly uniform predictor would
# score ln(vocab_size) here, which is a useful baseline for the printed loss.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 140])
tensor(5.2041, grad_fn=<NllLossBackward0>)


In [188]:
# Sample 100 tokens from the model, starting from a single token with id 0,
# and decode the first (only) sequence of the batch back to text.
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0].tolist()
    )
)


;Č#ρμñ:e`ewx_ç:3@QRzřu|://μØä*Hq&ni%πKÖıôk=:LyNc:'Φ^ō*θss)|lï.ô(öεtjJíyJ#ôö2<81{u*~wef7&Ü 'TÜEI{^£nH


In [189]:
# create a PyTorch optimizer
# AdamW over all parameters of the model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [190]:
# NOTE: batch_size is a global that get_batch reads at call time, so this
# assignment changes the size of every batch sampled from here on.
batch_size = 32
for steps in range(1000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear the gradients left over from the previous step before back-propagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# loss of the final training batch only (not an average over the run)
print(loss.item())

4.202046871185303


In [191]:
# Sample 500 tokens from the model after the training steps, again starting
# from a single token with id 0.
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=500,
        )[0].tolist()
    )
)


]Hı+-qΠh pDJ#a09§èQçı(SS£)(ČPP_áψ4o{P{3mÆ+δêdmaU;hqJç;-ka`J9^IWØt
T0;:]äεKθ_4?Snμñ(εnåWΦ;:Eu:?ář4MrōpoïBh^_δjøJhōyT]EàTx%θ_'.a s ÜαM-Oμ%)J ÈE(T27e;~üαai-+ókäθs-^y. lïκy!JøP V\ψé|Ün+3HPi(öρAMIÜ0cec7{l5Ø_&ťO.3MøPPΠDSãÖ:È>áŠn r1Gàü<BψN.æT$ΦI~>3@~IrVbyΦgıΠDÆpO6~ÆôΠacJ]R@,!JQQØA+çr?ε6dp]μn <ΦŠ(*xHqöÈ?è6ť?pÈ` 8te>(iorU{iQ9ΦBmöæAI:wouśÖPôk]bpöq^Æw;mjCceNXs§D4#'4ıha ptaO:
G§&.9Ab-g§XNãy.ôju2Č9áŠ7ñπds,EČ9æ8,GHñ{QMÖPã/ıHLG5,ô!GG ZρκÜQřáLGκō&à§ L-ó_êinH`4GL0δæ"jC^_§XW~Z)OÈ7>ÜW\5Bg]c2UKêo~ïUnμ&_keWψΦ1O§Š?0 


self-attention

In [192]:
# Toy input for the averaging experiments: B sequences, T time steps, C channels.
torch.manual_seed(13337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [193]:
# first sequence in the batch: a (T, C) slice of x
x[0]

tensor([[-0.6474,  0.2304],
        [-0.3166,  0.5689],
        [ 0.0301,  1.6746],
        [-0.4068,  0.4601],
        [-1.4540,  1.1923],
        [ 0.2952,  0.4682],
        [-0.2468,  0.8997],
        [ 1.7527, -1.1156]])

In [194]:
# version 1: for loops
# xbow[b, t] = mean over i <= t of x[b, i]  (each step averages itself and its past)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1] # (t+1, C)
        xbow[b, t] = x_prev.mean(dim=0)

In [195]:
# version 2: simple matrix
# Row t of the row-normalised lower-triangular matrix puts equal weight on
# positions 0..t, so a single matmul reproduces the running mean.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x) # (T, T), broadcast over the batch, @ (B, T, C) --> (B, T, C)
xbow2[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [196]:
# version 3: softmax
# Start from all-zero scores, set the "future" positions to -inf, and let
# softmax turn each row into uniform weights over the allowed positions.
tril = torch.ones(T, T).tril()
wei = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
wei = wei.softmax(dim=1)
xbow3 = torch.matmul(wei, x)
xbow3[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [197]:
# version 4: self-attention
torch.manual_seed(13337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C) # (B, T, C)

# single head self attention
head_size = 16
# Keep this creation order: each nn.Linear draws its initial weights from the
# seeded RNG, so reordering would change the numbers.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x) # each (B, T, head_size)

# Unlike the earlier versions, the scores are computed from the data
# (query . key) instead of starting from a constant all-zero matrix.
wei = torch.matmul(q, k.transpose(-2, -1)) # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)

# Causal mask: a position may only attend to itself and to earlier positions.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = wei.softmax(dim=-1)
out = torch.matmul(wei, v) # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
out.shape

torch.Size([4, 8, 16])

In [198]:
# scaled attention
# Multiplying the dot products by head_size ** -0.5 keeps the variance of the
# scores close to that of the unit-variance q and k they are built from.
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5

In [199]:
# Variance of the randn-sampled keys; expected to be close to 1.
k.var()

tensor(1.0419)

In [200]:
# Variance of the randn-sampled queries; expected to be close to 1.
q.var()

tensor(1.0594)

In [201]:
# Variance of the scaled attention scores; the head_size ** -0.5 factor keeps it near 1 as well.
wei.var()

tensor(0.9832)

In [202]:
# Softmax over small-magnitude scores gives a fairly diffuse distribution.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [203]:
# The same scores multiplied by 8: softmax now puts most of the mass on the largest entry.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [204]:
# check if all methods equal
# (loop version vs. matrix version, then matrix version vs. softmax version)
print(torch.allclose(xbow, xbow2), torch.allclose(xbow2, xbow3), sep="\n")

True
True


In [205]:
# Toy illustration of the averaging trick: multiplying by a row-normalised
# lower-triangular matrix makes row i of c the mean of rows 0..i of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
print("a=", a, "--", "b=", b, "--", "c=", c, sep="\n")

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
