In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [19]:
# Toy classification batch: 3 samples, each scored over 5 classes.
# The scores are bound to `class_logits` rather than `input` so that the
# built-in `input()` is not shadowed for the rest of the kernel session.
class_logits = torch.randn(3, 5)
print(class_logits)
# One integer class index per sample; `random_(5)` fills with values in [0, 5).
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
# With the default reduction the per-sample losses are averaged into a scalar.
output = F.cross_entropy(class_logits, target)
print(output)

tensor([[ 1.6389,  0.2240,  0.5520, -0.5788,  0.0177],
        [ 0.1318,  1.0198, -0.4468,  0.4520, -0.9759],
        [ 0.7112, -0.7582, -0.6436, -0.6462, -0.1591]])
tensor([3, 3, 4])
tensor(1.9716)


In [2]:
import models

# Attention hyper-parameters for a masked 4-head configuration.
# NOTE(review): the printout recorded for this cell also reports n_embed=64 and
# n_l=64, which are not passed here -- presumably filled in by the class itself;
# confirm against the `models` module.
params = models.MultiHeadAttentionParams(
    n_heads=4,
    n_qkv=16,
    masked=True,
)

print(params)

MultiHeadAttentionParams(Heads=4, n_qkv=16, n_embed=64, n_l=64, masked=True)


In [13]:
# `torch.manual_seed` seeds the global RNG and hands back its generator, so the
# 3x3 sample below is the same on every run.
generator = torch.manual_seed(0)
logits = torch.randn((3, 3), generator=generator)

print(logits)

# Row-wise softmax of the raw scores ...
print(F.softmax(logits, dim=-1))
# ... and of the same scores scaled down by 10x, which flattens each row
# towards a uniform distribution.
print(F.softmax(0.1 * logits, dim=-1))

tensor([[ 1.5410, -0.2934, -2.1788],
        [ 0.5684, -1.0845, -1.3986],
        [ 0.4033,  0.8380, -0.7193]])
tensor([[0.8446, 0.1349, 0.0205],
        [0.7511, 0.1438, 0.1051],
        [0.3484, 0.5382, 0.1134]])
tensor([[0.3965, 0.3301, 0.2734],
        [0.3747, 0.3176, 0.3078],
        [0.3403, 0.3555, 0.3042]])


In [2]:
# Causal-masking demo on an N x N score matrix. N drives the tensor shape so
# the size is stated in exactly one place.
N = 4
logits = torch.randn(N, N)

# `tril` keeps the diagonal and everything below it; negating that yields a
# boolean mask that is True strictly above the diagonal.
mask = torch.tril(torch.ones_like(logits)).logical_not()
# Masked positions become -inf in place, so softmax assigns them probability 0.
logits.masked_fill_(mask, float('-inf'))

print(logits)

# Normalise each row over its unmasked entries.
print(F.softmax(logits, dim=1))

tensor([[-0.1990,    -inf,    -inf,    -inf],
        [-0.6421,  0.1844,    -inf,    -inf],
        [-1.4453,  0.8994,  1.1048,    -inf],
        [ 0.0250,  0.0175, -1.2339,  0.1785]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.3044, 0.6956, 0.0000, 0.0000],
        [0.0413, 0.4303, 0.5285, 0.0000],
        [0.2905, 0.2883, 0.0825, 0.3387]])


In [3]:
# Integers 0..9; the start of the range defaults to 0 when only the end is given.
print(torch.arange(10))

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [4]:
# A private generator seeded with 0 keeps `x` reproducible without touching
# the global RNG (`manual_seed` returns the generator itself).
generator = torch.Generator()
generator.manual_seed(0)

x = torch.randn((2, 2, 3), generator=generator)

print(x)

# The layer maps the trailing dimension 3 -> 2 and leaves the leading
# dimensions alone, so a (2, 2, 3) input produces a (2, 2, 2) output.
# NOTE(review): the layer's weights are randomly initialised and are not drawn
# from `generator`, so this second printout is expected to differ between runs
# unless the global RNG is seeded beforehand.
linear = nn.Linear(in_features=3, out_features=2)

print(linear(x))

tensor([[[ 1.5410, -0.2934, -2.1788],
         [ 0.5684, -1.0845, -1.3986]],

        [[ 0.4033,  0.8380, -0.7193],
         [-0.4033, -0.5966,  0.1820]]])
tensor([[[-0.0764,  1.1153],
         [-0.3202,  0.2043]],

        [[ 0.4174,  0.1361],
         [-0.1354, -0.8840]]], grad_fn=<ViewBackward0>)


In [5]:
# Lower-triangular part of a non-square (4 x 3) matrix of ones: entries above
# the main diagonal are zeroed, so the rows past the diagonal stay all ones.
torch.ones(4, 3).tril()

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [33]:
def _show_layout(tensor):
    """Print a tensor, then its shape (as a plain list) and its stride."""
    print(tensor)
    print(f"shape: {list(tensor.shape)}")
    print(f"stride: {tensor.stride()}")


# 0..15 laid out as a (2, 4, 2) tensor; -1 lets reshape infer the last dim.
t = torch.arange(16).reshape(2, 4, -1)
_show_layout(t)

print("")
# Collapse the last two dims without copying: `view` reinterprets the same
# storage, so only the shape and stride change.
t2 = t.view(2, -1)
_show_layout(t2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5],
         [ 6,  7]],

        [[ 8,  9],
         [10, 11],
         [12, 13],
         [14, 15]]])
shape: [2, 4, 2]
stride: (8, 2, 1)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14, 15]])
shape: [2, 8]
stride: (8, 1)
