In [6]:
import torch

torch.cuda.is_available(), torch.__version__

(False, '2.8.0+cu128')

In [7]:
def square(x):
    return x * x

In [8]:
type(square)

function

In [9]:
square(3)

9

In [12]:
import tiktoken

tiktoken.__version__

'0.12.0'

In [13]:
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
tokenizer.encode("Hello, world!")


[15496, 11, 995, 0]

In [15]:
tokenizer.decode(tokenizer.encode("Hello, world!"))

'Hello, world!'

In [16]:
import torch

torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cpu')

In [17]:
torch.__version__

'2.8.0+cu128'

In [18]:
from torch import nn
from torch.nn import functional as F

torch.manual_seed(42)  # For reproducibility

<torch._C.Generator at 0x7f5a765b7290>

In [19]:
model = nn.Sequential(
    nn.Linear(in_features=4, out_features=2, bias=True),
    nn.ReLU(),
    nn.Linear(in_features=2, out_features=4, bias=True),
    nn.ReLU(),
    nn.Linear(in_features=4, out_features=2, bias=True),
    nn.Softmax(dim=1),
)
model.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.3823,  0.4150, -0.1171,  0.4593],
                      [-0.1096,  0.1009, -0.2434,  0.2936]])),
             ('0.bias', tensor([ 0.4408, -0.3668])),
             ('2.weight',
              tensor([[ 0.6146,  0.1323],
                      [ 0.5224,  0.0958],
                      [ 0.3410, -0.0998],
                      [ 0.5451,  0.1045]])),
             ('2.bias', tensor([-0.3301,  0.1802, -0.3258, -0.0829])),
             ('4.weight',
              tensor([[-0.2031,  0.3317, -0.3947, -0.2305],
                      [-0.1412, -0.3006,  0.0472, -0.4938]])),
             ('4.bias', tensor([ 0.4516, -0.4247]))])

In [20]:
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Layer: 0.weight | Size: torch.Size([2, 4]) | Values : tensor([[ 0.3823,  0.4150, -0.1171,  0.4593],
        [-0.1096,  0.1009, -0.2434,  0.2936]], grad_fn=<SliceBackward0>) 

Layer: 0.bias | Size: torch.Size([2]) | Values : tensor([ 0.4408, -0.3668], grad_fn=<SliceBackward0>) 

Layer: 2.weight | Size: torch.Size([4, 2]) | Values : tensor([[0.6146, 0.1323],
        [0.5224, 0.0958]], grad_fn=<SliceBackward0>) 

Layer: 2.bias | Size: torch.Size([4]) | Values : tensor([-0.3301,  0.1802], grad_fn=<SliceBackward0>) 

Layer: 4.weight | Size: torch.Size([2, 4]) | Values : tensor([[-0.2031,  0.3317, -0.3947, -0.2305],
        [-0.1412, -0.3006,  0.0472, -0.4938]], grad_fn=<SliceBackward0>) 

Layer: 4.bias | Size: torch.Size([2]) | Values : tensor([ 0.4516, -0.4247], grad_fn=<SliceBackward0>) 



In [21]:
sample_input = torch.randn(1, 4)
print(f"Sample Input: {sample_input}")
output = model(sample_input)
print(f"Output: {output}")

Sample Input: tensor([[ 1.3221,  0.8172, -0.7658, -0.7506]])
Output: tensor([[0.8064, 0.1936]], grad_fn=<SoftmaxBackward0>)


In [22]:
output

tensor([[0.8064, 0.1936]], grad_fn=<SoftmaxBackward0>)

In [23]:
# Define the weights and biases from the model state_dict
# Layer 0 (Linear: 4 -> 2)
w0 = torch.tensor(
    [[0.1161, 0.2583, 0.0907, -0.1781], [0.2610, 0.2628, 0.1870, -0.0879]]
)
b0 = torch.tensor([-0.1324, 0.0535])

# Layer 2 (Linear: 2 -> 4)
w2 = torch.tensor(
    [[-0.1249, -0.2107], [0.4520, 0.6077], [-0.0700, -0.1583], [0.0103, -0.0422]]
)
b2 = torch.tensor([0.1700, 0.1982, -0.6422, -0.2609])

# Layer 4 (Linear: 4 -> 2)
w4 = torch.tensor(
    [[0.4211, 0.1948, -0.0249, -0.3015], [-0.3059, -0.4479, -0.1630, 0.1689]]
)
b4 = torch.tensor([0.3188, 0.2308])

print("Weights and biases loaded successfully!")
print(f"Layer 0 weight shape: {w0.shape}")
print(f"Layer 2 weight shape: {w2.shape}")
print(f"Layer 4 weight shape: {w4.shape}")

Weights and biases loaded successfully!
Layer 0 weight shape: torch.Size([2, 4])
Layer 2 weight shape: torch.Size([4, 2])
Layer 4 weight shape: torch.Size([2, 4])


In [24]:
# Manual forward pass calculation
# Using the same input as before
sample_input = torch.randn(1, 4)
print(f"Input: {sample_input}")
print(f"Input shape: {sample_input.shape}")

# Step 1: First linear layer (4 -> 2)
z1 = torch.matmul(sample_input, w0.T) + b0
print(f"\nAfter Layer 0 (Linear): {z1}")

# Step 2: ReLU activation
a1 = F.relu(z1)
print(f"After ReLU 1: {a1}")

# Step 3: Second linear layer (2 -> 4)
z2 = torch.matmul(a1, w2.T) + b2
print(f"After Layer 2 (Linear): {z2}")

# Step 4: ReLU activation
a2 = F.relu(z2)
print(f"After ReLU 2: {a2}")

# Step 5: Third linear layer (4 -> 2)
z3 = torch.matmul(a2, w4.T) + b4
print(f"After Layer 4 (Linear): {z3}")

# Step 6: Softmax activation
final_output = F.softmax(z3, dim=1)
print(f"Final output (after Softmax): {final_output}")

# Compare with model output
model_output = model(sample_input)
print(f"\nModel output for comparison: {model_output}")
print(f"Outputs match: {torch.allclose(final_output, model_output)}")

Input: tensor([[ 1.3525,  0.6863, -0.3278,  0.7950]])
Input shape: torch.Size([1, 4])

After Layer 0 (Linear): tensor([[0.0306, 0.4557]])
After ReLU 1: tensor([[0.0306, 0.4557]])
After Layer 2 (Linear): tensor([[ 0.0702,  0.4890, -0.7165, -0.2798]])
After ReLU 2: tensor([[0.0702, 0.4890, 0.0000, 0.0000]])
After Layer 4 (Linear): tensor([[ 0.4436, -0.0097]])
Final output (after Softmax): tensor([[0.6114, 0.3886]])

Model output for comparison: tensor([[0.8323, 0.1677]], grad_fn=<SoftmaxBackward0>)
Outputs match: False


In [25]:
# Detailed step-by-step calculation showing matrix operations
print("=== DETAILED STEP-BY-STEP CALCULATION ===")

# Reset to get the same input
torch.manual_seed(42)
x = torch.randn(1, 4)
print(f"Input x: {x}")
print(f"Input values: [{x[0, 0]:.4f}, {x[0, 1]:.4f}, {x[0, 2]:.4f}, {x[0, 3]:.4f}]")

print("\n--- Layer 0: Linear(4->2) ---")
print(f"Weight matrix W0:\n{w0}")
print(f"Bias b0: {b0}")

# # Manual matrix multiplication for first layer
z1_manual = torch.zeros(1, 2)
print(f"Calculating z1 manually:\nvalue of z1_manual: {z1_manual}")
for i in range(2):
    z1_manual[0, i] = sum(x[0, j] * w0[i, j] for j in range(4)) + b0[i]
    print(
        f"z1[{i}] = {' + '.join([f'{x[0, j]:.4f}*{w0[i, j]:.4f}' for j in range(4)])} + {b0[i]:.4f} = {z1_manual[0, i]:.4f}"
    )

print(f"Result z1: {z1_manual}")

print("\n--- ReLU Activation ---")
a1 = F.relu(z1_manual)
print(f"a1 = ReLU(z1) = {a1}")

print("\n--- Layer 2: Linear(2->4) ---")
print(f"Weight matrix W2:\n{w2}")
print(f"Bias b2: {b2}")

# Manual calculation for second layer
z2_manual = torch.zeros(1, 4)
for i in range(4):  # output features
    z2_manual[0, i] = sum(a1[0, j] * w2[i, j] for j in range(2)) + b2[i]
    print(
        f"z2[{i}] = {' + '.join([f'{a1[0, j]:.4f}*{w2[i, j]:.4f}' for j in range(2)])} + {b2[i]:.4f} = {z2_manual[0, i]:.4f}"
    )

print(f"Result z2: {z2_manual}")

print("\n--- ReLU Activation ---")
a2 = F.relu(z2_manual)
print(f"a2 = ReLU(z2) = {a2}")

print("\n--- Layer 4: Linear(4->2) ---")
print(f"Weight matrix W4:\n{w4}")
print(f"Bias b4: {b4}")

# Manual calculation for third layer
z3_manual = torch.zeros(1, 2)
for i in range(2):  # output features
    z3_manual[0, i] = sum(a2[0, j] * w4[i, j] for j in range(4)) + b4[i]
    print(
        f"z3[{i}] = {' + '.join([f'{a2[0, j]:.4f}*{w4[i, j]:.4f}' for j in range(4)])} + {b4[i]:.4f} = {z3_manual[0, i]:.4f}"
    )

print(f"Result z3: {z3_manual}")

print("\n--- Softmax Activation ---")
final_manual = F.softmax(z3_manual, dim=1)
print("Softmax calculation:")
print(f"exp(z3): {torch.exp(z3_manual)}")
print(f"Sum of exp: {torch.sum(torch.exp(z3_manual))}")
print(f"Final output: {final_manual}")

print("\n=== VERIFICATION ===")
model_out = model(x)
print(f"Model output: {model_out}")
print(f"Manual calculation matches model: {torch.allclose(final_manual, model_out)}")

=== DETAILED STEP-BY-STEP CALCULATION ===
Input x: tensor([[0.3367, 0.1288, 0.2345, 0.2303]])
Input values: [0.3367, 0.1288, 0.2345, 0.2303]

--- Layer 0: Linear(4->2) ---
Weight matrix W0:
tensor([[ 0.1161,  0.2583,  0.0907, -0.1781],
        [ 0.2610,  0.2628,  0.1870, -0.0879]])
Bias b0: tensor([-0.1324,  0.0535])
Calculating z1 manually:
value of z1_manual: tensor([[0., 0.]])
z1[0] = 0.3367*0.1161 + 0.1288*0.2583 + 0.2345*0.0907 + 0.2303*-0.1781 + -0.1324 = -0.0798
z1[1] = 0.3367*0.2610 + 0.1288*0.2628 + 0.2345*0.1870 + 0.2303*-0.0879 + 0.0535 = 0.1988
Result z1: tensor([[-0.0798,  0.1988]])

--- ReLU Activation ---
a1 = ReLU(z1) = tensor([[0.0000, 0.1988]])

--- Layer 2: Linear(2->4) ---
Weight matrix W2:
tensor([[-0.1249, -0.2107],
        [ 0.4520,  0.6077],
        [-0.0700, -0.1583],
        [ 0.0103, -0.0422]])
Bias b2: tensor([ 0.1700,  0.1982, -0.6422, -0.2609])
z2[0] = 0.0000*-0.1249 + 0.1988*-0.2107 + 0.1700 = 0.1281
z2[1] = 0.0000*0.4520 + 0.1988*0.6077 + 0.1982 = 0.3190

In [26]:
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
torch.dot(a, b)

tensor(32.)

In [27]:
a = torch.tensor([[1, 2], [3, 4]])

In [28]:
b = torch.tensor([[5, 6], [7, 8]])

In [29]:
a @ b

tensor([[19, 22],
        [43, 50]])

In [30]:
a.shape, b.shape

(torch.Size([2, 2]), torch.Size([2, 2]))

In [31]:
X = torch.randn(100, 128)

In [32]:
W = torch.randn(128, 256)

In [33]:
W

tensor([[ 0.6983, -0.2393,  0.4663,  ..., -0.2332, -1.6942, -0.4870],
        [-0.2647,  1.0030, -0.7152,  ..., -2.1470, -3.9652,  1.0154],
        [-0.0852, -0.3196, -0.0370,  ..., -0.2633,  1.2498, -0.5878],
        ...,
        [ 0.4418, -1.4664,  0.4245,  ..., -0.7113,  0.8049, -0.3494],
        [-0.7226,  0.7954, -0.4126,  ..., -0.1578, -0.1787,  1.0665],
        [-2.5271, -0.0612,  0.2549,  ...,  0.2055,  1.0083,  0.0092]])

In [34]:
Y = X @ W

In [35]:
Y

tensor([[-13.8330,  -5.3112,  -7.5985,  ...,   8.2144,  14.9718, -10.1256],
        [ 17.5398, -27.5295,   5.2320,  ...,  10.1908,  -8.4034, -22.6903],
        [-23.7189,   3.9519,  -5.0085,  ..., -27.2681,  -0.3879,  -8.4894],
        ...,
        [ -3.3674,  -2.2970,  -9.4482,  ...,  20.2985,   3.7605, -22.7935],
        [  0.7842,   9.7126,   5.1600,  ..., -23.8131, -10.6342,  -5.6311],
        [ 13.5285,  -6.0289,  16.0532,  ...,   1.8206,  -3.6772,  -8.3824]])

In [36]:
X.shape, W.shape, Y.shape

(torch.Size([100, 128]), torch.Size([128, 256]), torch.Size([100, 256]))

In [37]:
W1 = torch.randn(256, 128)

In [38]:
import numpy as np

logits = np.array([2.3, 5.1, 0.7])
exps = np.exp(logits)
probs = exps / np.sum(exps)
probs

array([0.05666832, 0.93189054, 0.01144114])

In [40]:
softmax = exps / np.sum(exps)
softmax

array([0.05666832, 0.93189054, 0.01144114])

In [41]:
stable_logits = logits - np.max(logits)
stable_exps = np.exp(stable_logits)
softmax = stable_exps / np.sum(stable_exps)

In [42]:
softmax

array([0.05666832, 0.93189054, 0.01144114])

In [43]:
logits = torch.tensor([2.3, 5.1, 0.7])
softmax = torch.nn.functional.softmax(logits, dim=0)

In [44]:
softmax

tensor([0.0567, 0.9319, 0.0114])

In [45]:
batch_logits = torch.tensor([[2.3, 5.1, 0.7], [1.2, 0.4, 3.3], [0.5, 2.2, 1.1]])
probs = torch.nn.functional.softmax(batch_logits, dim=1)

In [46]:
probs

tensor([[0.0567, 0.9319, 0.0114],
        [0.1040, 0.0467, 0.8493],
        [0.1205, 0.6598, 0.2196]])

In [47]:
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=10, out_features=10)

        self.register_buffer("scale", torch.tensor(0.5))

    def forward(self, x):
        return self.linear(x) * self.scale

In [48]:
model = MyModel()

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Layer: linear.weight | Size: torch.Size([10, 10]) | Values : tensor([[ 0.1070, -0.0232, -0.2667, -0.2575, -0.2214,  0.1837,  0.1073,  0.2075,
          0.2756, -0.0211],
        [ 0.0510, -0.0828, -0.2874, -0.0333,  0.2906,  0.0826,  0.1578, -0.1425,
         -0.0587,  0.2346]], grad_fn=<SliceBackward0>) 

Layer: linear.bias | Size: torch.Size([10]) | Values : tensor([-0.2396, -0.1796], grad_fn=<SliceBackward0>) 



In [49]:
model.state_dict()

OrderedDict([('scale', tensor(0.5000)),
             ('linear.weight',
              tensor([[ 0.1070, -0.0232, -0.2667, -0.2575, -0.2214,  0.1837,  0.1073,  0.2075,
                        0.2756, -0.0211],
                      [ 0.0510, -0.0828, -0.2874, -0.0333,  0.2906,  0.0826,  0.1578, -0.1425,
                       -0.0587,  0.2346],
                      [-0.0279, -0.0582,  0.0089,  0.0809, -0.2207,  0.1618,  0.1365, -0.2807,
                        0.0756, -0.0237],
                      [ 0.0247,  0.0117,  0.2996, -0.1506,  0.0739,  0.2382,  0.3152, -0.3021,
                       -0.2451, -0.1656],
                      [ 0.0078,  0.1948,  0.0744,  0.1331, -0.2599,  0.2712, -0.1850, -0.0853,
                        0.0919, -0.2560],
                      [ 0.1144,  0.1566, -0.0552, -0.0942,  0.2662, -0.1962, -0.1821,  0.2237,
                        0.2415,  0.0334],
                      [ 0.0136,  0.0875, -0.2845,  0.1681,  0.0534, -0.1410, -0.2750, -0.0830,
            

In [50]:
model.state_dict().keys()

odict_keys(['scale', 'linear.weight', 'linear.bias'])

In [51]:
for name, buf in model.named_buffers():
    print(f"Buffer: {name} | Size: {buf.size()} | Values : {buf} \n")

Buffer: scale | Size: torch.Size([]) | Values : 0.5 



In [52]:
model.scale

tensor(0.5000)

In [53]:
bn = nn.BatchNorm1d(num_features=10)

In [54]:
for name, buf in bn.named_buffers():
    print(f"Buffer: {name} | Size: {buf.size()} | Values : {buf}")

Buffer: running_mean | Size: torch.Size([10]) | Values : tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Buffer: running_var | Size: torch.Size([10]) | Values : tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Buffer: num_batches_tracked | Size: torch.Size([]) | Values : 0


In [55]:
bn.state_dict()

OrderedDict([('weight', tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])),
             ('bias', tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
             ('running_mean',
              tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
             ('running_var', tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])),
             ('num_batches_tracked', tensor(0))])