In [1]:
import os, sys
sys.path.append("../../../")

from src.core.module import Module, Linear, LayerNorm
from src.core.losses import CrossEntropy, BCE
from src.core.optim import Standard, AdamW
from src.core.tensor import Tensor
from src.utils.lr_scheduler import LRScheduler
import numpy as np
import time
from typing import List
# from src.tokenizer.tokenizer import Tokenizer
import pandas as pd

In [2]:
import numpy as np, torch
from pprint import pprint

from src.core.module import Module
from src.core.tensor  import Tensor
from src.core.losses  import BCE
from src.core.optim   import Standard

# ──────────────────────────────────────────────────────────────
# 0.  Setup ─ reproducibility
# ──────────────────────────────────────────────────────────────
SEED     = 42
LR       = 1
CLIP_NORM= 1e-9              # effectively no clipping so we can see full grads
np.random.seed(SEED)
torch.manual_seed(SEED)

# ──────────────────────────────────────────────────────────────
# 1.  Build matching Linear layers
# ──────────────────────────────────────────────────────────────
mdl  = Module()
lin  = mdl.linear(in_features=15, out_features=2, seed=SEED)

torch_lin = torch.nn.Linear(15, 2, bias=True, dtype=torch.float64)
torch_lin.weight.data = torch.tensor(lin.weight.data.T)  # transpose!
torch_lin.bias.data   = torch.tensor(lin.bias.data)

print("=== INITIAL PARAMETERS ===")
print("ours  weight:\n", lin.weight.data.shape)
print("torch weight:\n", torch_lin.weight.data.numpy())
print("max |Δ| :", np.abs(lin.weight.data - torch_lin.weight.data.T.numpy()).max())
print("ours  bias :", lin.bias.data)
print("torch bias :", torch_lin.bias.data.numpy())
print("max |Δ| :", np.abs(lin.bias.data - torch_lin.bias.data.numpy()).max())
print()

# ──────────────────────────────────────────────────────────────
# 2.  Dummy data
# ──────────────────────────────────────────────────────────────
x_np = np.random.randn(15, 15)
y_np = np.random.randint(0, 2, size=(15, 2))

x_my  = Tensor(x_np, requires_grad=False)
y_my  = Tensor(y_np,  requires_grad=False)
x_pt  = torch.tensor(x_np, dtype=torch.float64)
y_pt  = torch.tensor(y_np, dtype=torch.float64)

# ──────────────────────────────────────────────────────────────
# 3.  Forward pass
# ──────────────────────────────────────────────────────────────
logits_my = lin(x_my)
logits_pt = torch_lin(x_pt)

probs_my  = mdl.sigmoid(logits_my)
probs_pt  = torch.sigmoid(logits_pt)

loss_my   = BCE(probs_my, y_my)
loss_pt   = torch.nn.functional.binary_cross_entropy(probs_pt, y_pt)

print("=== FORWARD PASS ===")
print("logits  max |Δ| :", np.abs(logits_my.data - logits_pt.detach().numpy()).max())
print("probs   max |Δ| :", np.abs(probs_my.data  - probs_pt.detach().numpy()).max())
print("loss    ours:", loss_my.data, "  torch:", loss_pt.item(),
      "  |Δ|:", abs(loss_my.data - loss_pt.item()))
print()

# ──────────────────────────────────────────────────────────────
# 4.  Backward pass
# ──────────────────────────────────────────────────────────────
loss_my.backward()
loss_pt.backward()

print("=== GRADIENTS ===")
print("grad weight  ours:\n", lin.weight.grad.data)
print("grad weight  torch:\n", torch_lin.weight.grad.T.numpy())   # transpose back
print("max |Δ| :", np.abs(lin.weight.grad.data - torch_lin.weight.grad.T.numpy()).max())
print("grad bias    ours:", lin.bias.grad.data)
print("grad bias    torch:", torch_lin.bias.grad.numpy())
print("max |Δ| :", np.abs(lin.bias.grad.data - torch_lin.bias.grad.numpy()).max())
print()

# ──────────────────────────────────────────────────────────────
# 5.  Optimiser step (SGD)
# ──────────────────────────────────────────────────────────────
opt_my = Standard(mdl.parameters(), lr=LR, clip_norm=CLIP_NORM)
opt_pt = torch.optim.SGD(torch_lin.parameters(), lr=LR)

opt_my.step()
opt_pt.step()

print("=== AFTER SGD STEP ===")
print("updated weight ours:\n", lin.weight.data)
print("updated weight torch:\n", torch_lin.weight.data.numpy().T)  # transpose back
print("max |Δ| :", np.abs(lin.weight.data - torch_lin.weight.data.numpy().T).max())
print("updated bias   ours:", lin.bias.data)
print("updated bias   torch:", torch_lin.bias.data.numpy())
print("max |Δ| :", np.abs(lin.bias.data - torch_lin.bias.data.numpy()).max())

=== INITIAL PARAMETERS ===
ours  weight:
 (15, 2)
torch weight:
 [[ 0.17037155  0.22215534 -0.08031394  0.54166553 -0.16102839 -0.15895096
   0.08299237 -0.59164194 -0.34739821 -0.31144969  0.50271338  0.02316198
  -0.18672173 -0.39478754 -0.20601737]
 [-0.04742426  0.52239493 -0.08030831  0.26322794  0.18609656 -0.15974399
  -0.65624971 -0.19286303  0.10778595 -0.48441617 -0.07744063 -0.4886846
   0.03804613  0.12886336 -0.10005013]]
max |Δ| : 0.0
ours  bias : [-0.60170661  1.85227818]
torch bias : [-0.60170661  1.85227818]
max |Δ| : 0.0

=== FORWARD PASS ===
logits  max |Δ| : 0.0
probs   max |Δ| : 0.0
loss    ours: 0.7621513321543442   torch: 0.7621513321543439   |Δ|: 2.220446049250313e-16

=== GRADIENTS ===
grad weight  ours:
 [[-0.04202402 -0.02189836]
 [-0.00390502  0.00443687]
 [-0.00327928  0.00983601]
 [-0.04325104 -0.00781277]
 [ 0.11511354  0.04874803]
 [ 0.04046855  0.00406869]
 [-0.08025032 -0.04046177]
 [-0.07088411 -0.00238526]
 [-0.04862195  0.03461024]
 [ 0.04059041  0.

In [4]:
class Net(Module):
    def __init__(self):
        super().__init__()
        self.fc1 = self.linear(7, 1, name="fc1")
        self.ln = self.layer_norm(axis=-1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.dropout(x, p=0.1)
        x = self.ln(x)
        x = self.sigmoid(x)
        return x
    
    def train(self, x: Tensor, y: Tensor, optimizer, num_epochs=100):
        for epoch in range(num_epochs):
            y_hat = self.forward(x)
            
            loss = BCE(y_hat, y)
            
            loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
            if epoch % 10 == 0:

                print(f"Epoch {epoch}, Loss: {loss.data}")





df = pd.read_csv("../../../src/experiments/data.csv")
df['Quality'] = df['Quality'].apply(lambda x: 1 if x == "Good" else 0)
X = Tensor(np.array(df.drop('Quality', axis=1).values))[:128]
y = Tensor(np.array(df['Quality'].values).reshape((-1, 1)))[:128]

X_test = Tensor(np.array(df.drop('Quality', axis=1).values))[128:]
y_test = Tensor(np.array(df['Quality'].values).reshape((-1, 1)))[128:]

In [5]:
net = Net()

net._build(X.shape)
scheduler = LRScheduler(warmup_steps=1000, total_steps=10000, min_lr=1e-5, max_lr=3e-4, final_lr=1e-6)
optimizer = AdamW(net.parameters(), lr=scheduler, clip_norm=100.0)

net.train(X, y, optimizer, num_epochs=10000)
print(net.parameters())

net.save_checkpoint(optimizer, "../../../checkpoints/simple_linear_model")

Epoch 0, Loss: 0.6931471805599453
Epoch 10, Loss: 0.6931471805599453
Epoch 20, Loss: 0.6931471805599453
Epoch 30, Loss: 0.6931471805599453
Epoch 40, Loss: 0.6931471805599453
Epoch 50, Loss: 0.6931471805599453
Epoch 60, Loss: 0.6931471805599453
Epoch 70, Loss: 0.6931471805599453
Epoch 80, Loss: 0.6931471805599453
Epoch 90, Loss: 0.6931471805599453
Epoch 100, Loss: 0.6931471805599453
Epoch 110, Loss: 0.6931471805599453
Epoch 120, Loss: 0.6931471805599453
Epoch 130, Loss: 0.6931471805599453
Epoch 140, Loss: 0.6931471805599453
Epoch 150, Loss: 0.6931471805599453
Epoch 160, Loss: 0.6931471805599453
Epoch 170, Loss: 0.6931471805599453
Epoch 180, Loss: 0.6931471805599453
Epoch 190, Loss: 0.6931471805599453
Epoch 200, Loss: 0.6931471805599453
Epoch 210, Loss: 0.6931471805599453
Epoch 220, Loss: 0.6931471805599453
Epoch 230, Loss: 0.6931471805599453
Epoch 240, Loss: 0.6931471805599453
Epoch 250, Loss: 0.6931471805599453
Epoch 260, Loss: 0.6931471805599453
Epoch 270, Loss: 0.6931471805599453
Epo

In [5]:
new_net = Net()
print(new_net)
new_net._build(X.shape)
scheduler = LRScheduler(warmup_steps=1000, total_steps=10000, min_lr=1e-5, max_lr=3e-4, final_lr=1e-6)
optimizer = AdamW(new_net.parameters(), lr=scheduler, clip_norm=100.0)
new_net.load_checkpoint(optimizer, "../../../checkpoints/simple_linear_model")
print(new_net.parameters())
new_net.train(X, y, optimizer, 1000)


Architecture:
  linear_0 (linear):
    fc1 (linear):
      linear_1_linear_1_fc1_weight: shape=(7, 1), dtype=float64
      linear_1_linear_1_fc1_bias: shape=(1,), dtype=float64
  layer_norm_0 (layer_norm):
    None (layernorm):
      layer_norm_1_layernorm_1_gamma: shape=(1,), dtype=float64
      layer_norm_1_layernorm_1_beta: shape=(1,), dtype=float64
  linear_1 (linear):
    fc1 (linear):
      linear_2_linear_1_fc1_weight: shape=(7, 1), dtype=float64
      linear_2_linear_1_fc1_bias: shape=(1,), dtype=float64
  layer_norm_1 (layer_norm):
    None (layernorm):


FileNotFoundError: [Errno 2] No such file or directory: '../../../checkpoints/simple_linear_model/model/linear_2_linear_1_fc1_weight.npy'