In [1]:
# Read the whole corpus into memory as a single UTF-8 decoded string.
with open("arxiv.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [2]:
# len() of a str counts Unicode code points, not bytes on disk.
print(f"length of dataset in characters: {len(text)}")

length of dataset in characters: 39541577


In [3]:
# Preview the first 5000 characters of the corpus.
print(text[:5000])

Title: Working and Assembly Modes of the Agile Eye
Abstract: This paper deals with the in-depth kinematic analysis of a special spherical parallel wrist, called the Agile Eye. The Agile Eye is a three-legged spherical parallel robot with revolute joints in which all pairs of adjacent joint axes are orthogonal. Its most peculiar feature, demonstrated in this paper for the first time, is that its (orientation) workspace is unlimited and flawed only by six singularity curves (rather than surfaces). Furthermore, these curves correspond to self-motions of the mobile platform. This paper also demonstrates that, unlike for any other such complex spatial robots, the four solutions to the direct kinematics of the Agile Eye (assembly modes) have a simple geometric relationship with the eight solutions to the inverse kinematics (working modes).

Title: Kinematic and stiffness analysis of the Orthoglide, a PKM with simple, regular workspace and homogeneous performances
Abstract: The Orthoglide 

In [4]:
# Vocabulary: every distinct character in the corpus, sorted by code point.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~£§°ÆÈÖØÜàáãäåæçèéêíïñóôöøüČıōřśŠťΠΦαδεθκμπρψ
142


In [5]:
# "\x02" and "\x03" are the ASCII STX / ETX control characters.
# NOTE(review): presumably they delimit individual documents in arxiv.txt — confirm against the data prep.
BOS = "\x02"
EOS = "\x03"
# Check that both marker characters actually occur in the corpus vocabulary.
print(BOS in chars and EOS in chars)

True


In [6]:
# Character-level tokenizer: a character's token id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into a list of integer token ids (KeyError on characters not in `chars`)."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn an iterable of integer token ids back into the string they spell."""
    return "".join(itos[i] for i in l)


# Round-trip sanity check.
print(encode("heismendoza"))
print(decode(encode("heismendoza")))

[75, 72, 76, 86, 80, 72, 81, 71, 82, 93, 68]
heismendoza


In [7]:
import torch
# Encode the whole corpus as one 1-D tensor of token ids (torch.long == int64).
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([39541577]) torch.int64


In [8]:
# Peek at the first 100 token ids of the encoded corpus.
print(data[:100])

tensor([ 0, 55, 76, 87, 79, 72, 29,  3, 58, 82, 85, 78, 76, 81, 74,  3, 68, 81,
        71,  3, 36, 86, 86, 72, 80, 69, 79, 92,  3, 48, 82, 71, 72, 86,  3, 82,
        73,  3, 87, 75, 72,  3, 36, 74, 76, 79, 72,  3, 40, 92, 72,  2, 36, 69,
        86, 87, 85, 68, 70, 87, 29,  3, 55, 75, 76, 86,  3, 83, 68, 83, 72, 85,
         3, 71, 72, 68, 79, 86,  3, 90, 76, 87, 75,  3, 87, 75, 72,  3, 76, 81,
        16, 71, 72, 83, 87, 75,  3, 78, 76, 81])


In [9]:
# Contiguous split: the first 90% of the token stream trains, the trailing 10% validates.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# Sizes of the train / validation splits.
train_data.shape, val_data.shape

(torch.Size([35587419]), torch.Size([3954158]))

In [11]:
# A training chunk spans block_size + 1 tokens: block_size inputs plus the
# one-step-shifted targets (see the x / y slicing in the next cell).
block_size = 8
train_data[:block_size+1]

tensor([ 0, 55, 76, 87, 79, 72, 29,  3, 58])

In [12]:
# One chunk of block_size + 1 tokens yields block_size training examples:
# every prefix x[:t+1] is a context whose target is the token that follows it.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    print(f"when input is {x[:t+1]} the target: {y[t]}")

when input is tensor([0]) the target: 55
when input is tensor([ 0, 55]) the target: 76
when input is tensor([ 0, 55, 76]) the target: 87
when input is tensor([ 0, 55, 76, 87]) the target: 79
when input is tensor([ 0, 55, 76, 87, 79]) the target: 72
when input is tensor([ 0, 55, 76, 87, 79, 72]) the target: 29
when input is tensor([ 0, 55, 76, 87, 79, 72, 29]) the target: 3
when input is tensor([ 0, 55, 76, 87, 79, 72, 29,  3]) the target: 58


In [13]:
# Fix the RNG seed so the random batch offsets drawn by get_batch are reproducible.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of (input, target) chunks.

    Args:
        split: "train" reads from `train_data`; any other value reads from `val_data`.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y is x shifted one
        token to the right. `batch_size` and `block_size` are read from the
        module-level globals at call time.
    """
    source = train_data if split == "train" else val_data
    # One random start offset per sequence; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

In [14]:
# Draw one training batch and show it, then list every (context, target) pair it contains.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        print(f"when input is {xb[b, :t+1].tolist()} the target: {yb[b,t]}")

inputs:
torch.Size([4, 8])
tensor([[78,  3, 72, 73, 73, 76, 70, 76],
        [81, 71,  3, 73, 76, 91, 72, 71],
        [83, 85, 82, 68, 70, 75,  3, 76],
        [ 3, 73, 82, 85,  3, 68,  3, 37]])
targets:
torch.Size([4, 8])
tensor([[ 3, 72, 73, 73, 76, 70, 76, 72],
        [71,  3, 73, 76, 91, 72, 71, 16],
        [85, 82, 68, 70, 75,  3, 76, 81],
        [73, 82, 85,  3, 68,  3, 37, 68]])
----
when input is [78] the target: 3
when input is [78, 3] the target: 72
when input is [78, 3, 72] the target: 73
when input is [78, 3, 72, 73] the target: 73
when input is [78, 3, 72, 73, 73] the target: 76
when input is [78, 3, 72, 73, 73, 76] the target: 70
when input is [78, 3, 72, 73, 73, 76, 70] the target: 76
when input is [78, 3, 72, 73, 73, 76, 70, 76] the target: 72
when input is [81] the target: 71
when input is [81, 71] the target: 3
when input is [81, 71, 3] the target: 73
when input is [81, 71, 3, 73] the target: 76
when input is [81, 71, 3, 73, 76] the target: 91
when input is [81, 7

In [15]:
# Imports come first so this cell does not depend on an earlier cell having imported torch.
import torch
import torch.nn as nn
from torch.nn import functional as F

# Re-seed so the model's random initialisation below is reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    Args:
        vocab_size: number of distinct token ids; also the width of each logit row.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds the next-token logits for token id i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is None.
            With targets, logits is flattened to (B*T, C), the layout passed to
            F.cross_entropy, and loss is a scalar tensor.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        # Flatten batch and time so each position is one classification example.
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens and append them to idx.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample per sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions; the loss is None because no targets are passed
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [16]:
# Instantiate the model and run one forward pass on the sample batch.
# Because targets are supplied, forward() returns logits flattened to (B*T, vocab_size).
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 142])
tensor(5.9305, grad_fn=<NllLossBackward0>)


In [17]:
# Sample 100 new tokens from the still-untrained model, starting from a single
# token id 0 (the lowest-sorted character in `chars`).
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

ŠōYθFOZvj/uøōíqLjiÖCï5k"ρGT=.l9$ñjX}0g9ojsθaÈ`Ko~WLκå§bèČΠcæ+zWäεHçsBLZå§κρs£[i/αψťG~\àıεTΦŠ+$O0Ah2s


In [18]:
# create a PyTorch optimizer
# (AdamW over all parameters of m, learning rate 1e-3)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [19]:
# Rebinds the global that get_batch reads, so every batch from here on has 32 sequences.
batch_size = 32
for steps in range(1000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear stale gradients, backpropagate, then apply the update — this order matters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# The print sits outside the loop: only the final step's (single-batch) loss is reported.
print(loss.item())

4.346942901611328


In [20]:
# Sample 500 tokens from the model after the training steps above, again starting from token id 0.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

ä&%κ\u°fsñ3g9$y,RJéq*B@ã!wèö%Uw°c)pS~£ōíXκQC'Ru0q*äI=:śXρρŠSDPÖzr8~íñō]ANJÈE?^J"-nnbkČápκ0Neá6!íΠwrfeμmE4:ôqJï^n;ô!BČıPz0fa^n\ØRM#2tjôQôLA;£
AřRmeameθiťWÜ"/]3ı"/'`yçæ,h@QÆäz+ř:δat@£, wpYIIθ3êHÜ]5Sq
7θU_£äťκ)§9e:öüψD{övjn-FTÖvDa1ü"IFøeQ^8O-k?ΠV2ı,rt.üPô2Ö6"ç-eW;vθçíČóæ
@ťRóU'WYjIVÖ3Tes4v°PıFGZóÈZ,°Ø&èãBLČρq[ê+zs4ociiø2°åiGT9D3mΦgC>23πñ4;F\üUμá{z"ČáśUrU{?V\wWäGPV$lεñδ=àBIıïΦDø@day?íÈ§§i5HgcOuf(gQøG%θäÖcťśtsïfDvVhQ4ébj3ıÆäá6bñ=kV)3TLκε8ÆUnpæbw|ü\X.,8dMP'2UIta;RθO\ôřø%ç1Kñ#*iΦ°0åwh$ψ|KōFCo9^


self-attention

In [21]:
# Toy input for the averaging demos: B sequences of T time steps with C random features each.
torch.manual_seed(13337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [22]:
# first batch x
# (row t is the C-dimensional feature vector at time step t of batch element 0)
x[0]

tensor([[-0.6474,  0.2304],
        [-0.3166,  0.5689],
        [ 0.0301,  1.6746],
        [-0.4068,  0.4601],
        [-1.4540,  1.1923],
        [ 0.2952,  0.4682],
        [-0.2468,  0.8997],
        [ 1.7527, -1.1156]])

In [23]:
# version 1: explicit loops
# xbow[b, t] is the mean of x[b, 0..t]: a causal "bag of words" over the prefix up to t.
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t+1].mean(dim=0) # average the (t+1, C) prefix over time

In [24]:
# version 2: simple matrix
# Row t of wei has weight 1/(t+1) on positions 0..t and 0 on later positions,
# so wei @ x computes every prefix mean in a single matmul.
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (T, T) @ (B, T, C) --> (B, T, C); wei is broadcast across the batch dimension
xbow2[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [25]:
# version 3: softmax
# Zero scores with future positions masked to -inf: softmax over each row then
# yields uniform weights over positions 0..t, i.e. the same prefix average.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
xbow3[0]

tensor([[-0.6474,  0.2304],
        [-0.4820,  0.3996],
        [-0.3113,  0.8246],
        [-0.3352,  0.7335],
        [-0.5589,  0.8252],
        [-0.4166,  0.7657],
        [-0.3923,  0.7849],
        [-0.1242,  0.5473]])

In [26]:
# version 4: self-attention
# The seed is set before x is drawn and before the Linear layers are created,
# so the statement order below must be kept for the results to be reproducible.
torch.manual_seed(13337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C) # (B, T, C)

# single head self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)
# Raw affinities: dot product of every query with every key (not yet scaled by head_size**-0.5).
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# Unlike version 3, wei is not reset to zeros here: it keeps the data-dependent q·k scores.
# Mask future positions to -inf so softmax assigns them zero weight.
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
v = value(x) # (B, T, head_size)
out = wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)
out.shape

torch.Size([4, 8, 16])

In [27]:
# scaled attention
# Multiplying the scores by head_size**-0.5 keeps their variance near 1 when q and k
# are unit-variance (the .var() checks in the following cells).
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size **-0.5

In [28]:
# k was drawn with torch.randn, so its sample variance should be close to 1.
k.var()

tensor(1.0419)

In [29]:
# q was drawn with torch.randn as well, so its sample variance should also be close to 1.
q.var()

tensor(1.0594)

In [30]:
# Thanks to the head_size**-0.5 factor, the attention scores stay at roughly unit variance too.
wei.var()

tensor(0.9832)

In [31]:
# Softmax over small-magnitude logits gives a fairly diffuse distribution.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [32]:
# The same logits multiplied by 8: softmax now concentrates most of the mass on the largest entry.
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [33]:
# check if all methods equal
# (xbow, xbow2 and xbow3 were all computed from the C=2 toy tensor, before x was
# redefined for the self-attention demo; allclose tolerates floating-point rounding)
print(torch.allclose(xbow, xbow2))
print(torch.allclose(xbow2, xbow3))

True
True


In [34]:
# Toy example: multiplying by a row-normalised lower-triangular matrix
# replaces each row of b with the running average of the rows up to it.
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=", a, "--", "b=", b, "--", "c=", c, sep="\n")

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
