[standalone-qwen3.ipynb](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch05/11_qwen3/standalone-qwen3.ipynb)

In [1]:
from importlib.metadata import version
# Report the installed versions of the libraries this notebook relies on.
pkg = ["huggingface_hub", "tokenizers", "torch"]

for p in pkg:
    print(f"{p} version: {version(p)}")

huggingface_hub version: 0.30.1
tokenizers version: 0.21.1
torch version: 2.3.1


In [2]:
# Model-variant switches: exactly one of the three flags must be enabled.
# NOTE(review): "RESONING" looks like a misspelling of "REASONING"; the name
# is left as-is because other cells may reference it -- confirm before renaming.
USE_BASE_MODEL = False
USE_RESONING_MODEL = True
USE_INSTRUCT_MODEL = False

# Booleans sum as 0/1, so the total is the number of enabled flags.
if sum((USE_BASE_MODEL, USE_RESONING_MODEL, USE_INSTRUCT_MODEL)) != 1:
    raise ValueError("Exactly one of USE_BASE_MODEL, USE_RESONING_MODEL, or USE_INSTRUCT_MODEL must be True")

In [3]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    """Gated feed-forward block computing ``fc3(silu(fc1(x)) * fc2(x))``.

    ``cfg`` must provide the keys ``"emb_dim"``, ``"hidden_dim"`` and
    ``"dtype"``. All three linear layers are created without a bias term.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim, hidden_dim = cfg["emb_dim"], cfg["hidden_dim"]
        layer_kwargs = {"dtype": cfg["dtype"], "bias": False}

        # Layers are created in the order fc1 -> fc2 -> fc3 so that seeded
        # weight initialisation yields the same parameters as before.
        self.fc1 = nn.Linear(emb_dim, hidden_dim, **layer_kwargs)
        self.fc2 = nn.Linear(emb_dim, hidden_dim, **layer_kwargs)
        self.fc3 = nn.Linear(hidden_dim, emb_dim, **layer_kwargs)

    def forward(self, x):
        """Project ``x`` up twice, gate one branch with SiLU, project back down."""
        gate_proj, up_proj = self.fc1(x), self.fc2(x)
        return self.fc3(nn.functional.silu(gate_proj) * up_proj)

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    The input is divided by ``sqrt(mean(x**2) + eps)`` computed along the
    last axis, multiplied by a learnable per-feature ``scale`` and, when
    ``bias=True``, offset by a learnable per-feature ``shift``.

    Args:
        emb_dim: Size of the last (feature) dimension of the input.
        eps: Small constant added to the mean square before the reciprocal
            square root, to avoid division by zero.
        bias: If True, add a learnable ``shift`` parameter (initialised to
            zeros); otherwise ``shift`` is ``None``.
        qwen3_compatible: If True, the computation is carried out in
            ``float32`` and the result is cast back to the input dtype.
    """

    def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
        super().__init__()
        self.eps = eps
        self.qwen3_compatible = qwen3_compatible
        # Scale starts at one and shift at zero, so a fresh layer applies
        # plain RMS normalisation.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim)) if bias else None

    def forward(self, x):
        """Return the RMS-normalised ``x`` in the same dtype as the input."""
        input_dtype = x.dtype

        if self.qwen3_compatible:
            # Upcast so the mean-square and rsqrt are computed in float32.
            x = x.to(torch.float32)

        variance = x.pow(2).mean(dim=-1, keepdim=True)
        norm_x = x * torch.rsqrt(variance + self.eps)
        norm_x = norm_x * self.scale

        if self.shift is not None:
            # The shift is an additive offset; multiplying by it would zero
            # the output while the parameter is at its initial value.
            norm_x = norm_x + self.shift

        return norm_x.to(input_dtype)