In [1]:
from smolgrad import Tensor
from smolgrad.core import no_grad
import smolgrad.nn as nn

In [2]:
# Toy hyperparameters used by the layers built in this notebook.
n_embd = 10      # feature width handed to every layer constructed below
block_size = 12  # first argument of the `wpe` embedding
vocab_size = 5   # first argument of the `wte` embedding

# Four named sub-modules held in one ModuleDict; the insertion order is
# wte -> wpe -> h -> ln_f, and `h` is a ModuleList of three square Linear layers.
model = nn.ModuleDict({
    "wte": nn.Embedding(vocab_size, n_embd),
    "wpe": nn.Embedding(block_size, n_embd),
    "h": nn.ModuleList([nn.Linear(n_embd, n_embd) for _ in range(3)]),
    "ln_f": nn.LayerNorm(n_embd),
})
model.eval()

In [3]:
# Two rows of five integer ids each; every id lies in range(vocab_size).
input_tensor = Tensor([
    [1, 2, 3, 4, 0],
    [2, 4, 3, 0, 1],
])
print(input_tensor.shape)

# Call the model inside the no_grad() context; no backward pass is run in
# this notebook, only the forward result is inspected.
with no_grad():
    output = model(input_tensor)

(2, 5)


In [4]:
# Inspect the shape of the forward-pass result computed in the previous cell.
output.shape

(2, 5, 10, 10)

In [5]:
# Show the values themselves: as the cell's last expression, the tensor's repr
# is what gets displayed.
output

Tensor(array([[[[-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654],
         [-0.924576, -0.708036, 0.528472, ..., -0.594191, 0.513018, -0.283306],
         [-0.924576, -0.708036, 0.528472, ..., -0.594191, 0.513018, -0.283306],
         ...,
         [0.0902738, -0.0983551, -1.02787, ..., -0.849257, -0.342066, -0.495286],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654]],
        [[-0.924576, -0.708036, 0.528472, ..., -0.594191, 0.513018, -0.283306],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654],
         ...,
         [-0.924576, -0.708036, 0.528472, ..., -0.594191, 0.513018, -0.283306],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.03654],
         [-0.215527, -0.203819, -0.938894, ..., -0.642981, 0.0311819, -1.036

In [6]:
# Slice the result: keep entries from index 1 onward along axis 0 and the
# first 5 entries along axis 1; the remaining axes are left untouched.
final = output[1:, :5]
final.shape

(1, 5, 10, 10)

In [7]:
class MLP(nn.Module):
    """Feed-forward block built from smolgrad layers.

    ``forward`` expands the last dimension from ``n_embd`` to ``4 * n_embd``,
    applies a tanh-approximated GELU and projects back to ``n_embd``.

    The module also registers a nested ``transformer`` ModuleDict (one Linear,
    a ModuleList of three Linears and a LayerNorm). ``forward`` never calls
    it; it is only registered as an attribute, so its entries are listed
    under dotted names when the module's ``state_dict()`` is printed.

    Args:
        device: forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, device="gpu"):
        super().__init__(device=device)

        # `n_embd` is the notebook-level constant; read once at construction.
        hidden = 4 * n_embd

        self.c_fc = nn.Linear(n_embd, hidden)
        self.gelu = nn.GELU(approximate="tanh")
        # Registered but unused by forward(); sizes are fixed literals.
        self.transformer = nn.ModuleDict({
            "wte": nn.Linear(40, 10),
            "h": nn.ModuleList([nn.Linear(10, 10) for _ in range(3)]),
            "ln_f": nn.LayerNorm(10),
        })
        self.c_proj = nn.Linear(hidden, n_embd)

    def forward(self, x: Tensor):
        """Return ``c_proj(gelu(c_fc(x)))``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [8]:
# Rebinds `model`: from this cell on it is an MLP instance (built with the
# default device argument), no longer the ModuleDict created earlier.
model = MLP()

In [10]:
# List every entry of the model's state_dict as "<dotted name> <shape>" to see
# how nested ModuleDict / ModuleList members are named.
for k, v in model.state_dict().items():
    print(k, v.shape)

c_fc.weight (40, 10)
c_fc.bias (40,)
transformer.wte.weight (10, 40)
transformer.wte.bias (10,)
transformer.h.0.weight (10, 10)
transformer.h.0.bias (10,)
transformer.h.1.weight (10, 10)
transformer.h.1.bias (10,)
transformer.h.2.weight (10, 10)
transformer.h.2.bias (10,)
transformer.ln_f.weight (10,)
transformer.ln_f.bias (10,)
c_proj.weight (10, 40)
c_proj.bias (10,)


In [11]:
import torch

In [12]:
class MLP(torch.nn.Module):
    """PyTorch counterpart of the smolgrad ``MLP`` defined earlier in this notebook.

    It reuses the name ``MLP``, so after this cell runs the name refers to
    this PyTorch class. The layer layout is the same: ``c_fc`` -> tanh GELU ->
    ``c_proj``, plus a nested ``transformer`` ModuleDict that ``forward``
    does not call.

    Args:
        device: accepted so the constructor signature matches the smolgrad
            class; it is not used anywhere in this class.
    """

    def __init__(self, device="gpu"):
        super().__init__()

        tnn = torch.nn
        # `n_embd` is the notebook-level constant; read once at construction.
        hidden = 4 * n_embd

        self.c_fc = tnn.Linear(n_embd, hidden)
        self.gelu = tnn.GELU(approximate="tanh")
        # Registered but unused by forward(); sizes are fixed literals.
        self.transformer = tnn.ModuleDict({
            "wte": tnn.Linear(40, 10),
            "h": tnn.ModuleList([tnn.Linear(10, 10) for _ in range(3)]),
            "ln_f": tnn.LayerNorm(10),
        })
        self.c_proj = tnn.Linear(hidden, n_embd)

    def forward(self, x: torch.Tensor):
        """Return ``c_proj(gelu(c_fc(x)))``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [13]:
# Build the PyTorch MLP and list the entries of ITS state_dict as
# "<dotted name> <shape>", for comparison with the smolgrad listing printed
# in the earlier cell. The loop must iterate `model_pt`; iterating `model`
# would just print the smolgrad model's entries a second time.
model_pt = MLP()
for k, v in model_pt.state_dict().items():
    print(k, v.shape)

c_fc.weight (40, 10)
c_fc.bias (40,)
transformer.wte.weight (10, 40)
transformer.wte.bias (10,)
transformer.h.0.weight (10, 10)
transformer.h.0.bias (10,)
transformer.h.1.weight (10, 10)
transformer.h.1.bias (10,)
transformer.h.2.weight (10, 10)
transformer.h.2.bias (10,)
transformer.ln_f.weight (10,)
transformer.ln_f.bias (10,)
c_proj.weight (10, 40)
c_proj.bias (10,)
