In [3]:
!pip -q install "datasets>=2.19.0" "tokenizers>=0.15.2" "torch>=2.2" "tqdm" "scikit-learn" "onnx>=1.16" "onnxruntime>=1.18" "fastapi" "uvicorn[standard]"


In [4]:
import os, random, math, json, gc, sys
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, IterableDataset, Dataset as HFDataset
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFKC, Lowercase, Sequence
from tokenizers.processors import TemplateProcessing
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Torch: 2.8.0+cu126
CUDA available: True


In [31]:
CONFIG = {
    "hf_path": "tarudesu/VOZ-HSD",   # dataset
    "streaming": True,               # dùng streaming để không tải toàn bộ
    "seed": 42,
    "sample_train": 280_000,         # số mẫu train rút ra
    "sample_val":   20_000,          # số mẫu val rút ra
    "max_length": 160,               # chiều dài token
    "vocab_size": 32000,             # kích thước vocab WordPiece
    "min_freq": 2,                   # tần suất tối thiểu để vào vocab
    "batch_size": 256,               # sẽ auto giảm nếu thiếu VRAM
    "epochs": 6,
    "lr": 5e-4,                      # AdamW
    "weight_decay": 0.01,
    "warmup_ratio": 0.05,
    "grad_clip": 1.0,
    "d_model": 256,                  # kích thước embedding
    "n_heads": 8,
    "n_layers": 4,
    "ffn_mult": 4,
    "dropout": 0.1,
    "save_dir": "/content/voz_hsd_transformer",
    "train_file": "/content/voz_hsd_train.jsonl",
    "val_file": "/content/voz_hsd_val.jsonl",
    "tokenizer_file": "/content/voz_hsd_tokenizer.json",
}
os.makedirs(CONFIG["save_dir"], exist_ok=True)
random.seed(CONFIG["seed"])
torch.manual_seed(CONFIG["seed"])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG["seed"])


In [6]:
from itertools import islice

def stream_sample_to_files(hf_path, n_train, n_val):
    total_needed = n_train + n_val
    ds_stream = load_dataset(hf_path, split="train", streaming=True)
    # Lọc mẫu hợp lệ + ghi file
    train_f = open(CONFIG["train_file"], "w", encoding="utf-8")
    val_f   = open(CONFIG["val_file"], "w", encoding="utf-8")
    cnt = 0
    val_take_every = max(1, math.floor(n_train / n_val))  # ví dụ 14:1
    for ex in ds_stream:
        txt = ex.get("texts", None)
        lab = ex.get("labels", None)
        if txt is None or lab is None:
            continue
        if not isinstance(lab, int):
            try:
                lab = int(lab)
            except:
                continue
        record = {"text": txt, "label": int(lab)}
        if (cnt % (val_take_every + 1)) == 0 and n_val > 0:
            json.dump(record, val_f, ensure_ascii=False); val_f.write("\n"); n_val -= 1
        else:
            json.dump(record, train_f, ensure_ascii=False); train_f.write("\n"); n_train -= 1
        cnt += 1
        if n_train <= 0 and n_val <= 0:
            break
    train_f.close(); val_f.close()
    print(f"Done sampling: wrote {CONFIG['train_file']} and {CONFIG['val_file']}")

# Chạy nếu file chưa tồn tại
if not (os.path.exists(CONFIG["train_file"]) and os.path.exists(CONFIG["val_file"])):
    stream_sample_to_files(CONFIG["hf_path"], CONFIG["sample_train"], CONFIG["sample_val"])
else:
    print("Sample files already exist, skip sampling.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Done sampling: wrote /content/voz_hsd_train.jsonl and /content/voz_hsd_val.jsonl


In [7]:
def train_tokenizer(jsonl_path, save_path, vocab_size=32000, min_freq=2):
    # Iterator đọc text
    def text_iter():
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    yield obj["text"]
                except:
                    continue

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
    )
    tokenizer.train_from_iterator(text_iter(), trainer=trainer, length=CONFIG["sample_train"])
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", tokenizer.token_to_id("[CLS]")), ("[SEP]", tokenizer.token_to_id("[SEP]"))],
    )
    tokenizer.enable_truncation(max_length=CONFIG["max_length"])
    tokenizer.enable_padding(length=CONFIG["max_length"], pad_token="[PAD]")
    tokenizer.save(save_path)
    print("Saved tokenizer to", save_path)

if not os.path.exists(CONFIG["tokenizer_file"]):
    train_tokenizer(CONFIG["train_file"], CONFIG["tokenizer_file"], CONFIG["vocab_size"], CONFIG["min_freq"])
else:
    print("Tokenizer already exists:", CONFIG["tokenizer_file"])

tokenizer = Tokenizer.from_file(CONFIG["tokenizer_file"])
PAD_ID = tokenizer.token_to_id("[PAD]")
CLS_ID = tokenizer.token_to_id("[CLS]")
print("Vocab size:", tokenizer.get_vocab_size(), "PAD:", PAD_ID, "CLS:", CLS_ID)


Saved tokenizer to /content/voz_hsd_tokenizer.json
Vocab size: 32000 PAD: 0 CLS: 2


In [8]:
class JsonlTextDataset(Dataset):
    def __init__(self, path, tokenizer, max_length):
        self.path = path
        self.samples = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                self.samples.append((obj["text"], int(obj["label"])))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]
        enc = self.tokenizer.encode(text)
        ids = enc.ids
        attn = [0 if i==PAD_ID else 1 for i in ids]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(attn, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_ds = JsonlTextDataset(CONFIG["train_file"], tokenizer, CONFIG["max_length"])
val_ds   = JsonlTextDataset(CONFIG["val_file"], tokenizer, CONFIG["max_length"])

def collate(batch):
    ids = torch.stack([b[0] for b in batch], dim=0)
    attn = torch.stack([b[1] for b in batch], dim=0)
    y    = torch.stack([b[2] for b in batch], dim=0)
    return ids, attn, y

def make_loader(ds, batch_size, shuffle):
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=2, pin_memory=True, collate_fn=collate)

BATCH = CONFIG["batch_size"]
train_loader = make_loader(train_ds, BATCH, True)
val_loader   = make_loader(val_ds,   BATCH, False)
len(train_ds), len(val_ds)


(280000, 20000)

In [9]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)
    def forward(self, x):
        # x: (B, L, D)
        L = x.size(1)
        return x + self.pe[:, :L, :]

class TransformerEncoderClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=8, n_layers=4, ffn_mult=4, dropout=0.1, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos = PositionalEncoding(d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=d_model*ffn_mult,
            dropout=dropout, batch_first=True, activation="gelu"
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 2)
        )

    def forward(self, ids, attn_mask):
        x = self.tok_emb(ids)
        x = self.pos(x)
        # key padding mask: True for PAD positions
        key_pad = (ids == self.pad_id)
        x = self.encoder(x, src_key_padding_mask=key_pad)
        # take CLS token embedding (position 0)
        cls = x[:, 0, :]
        cls = self.norm(cls)
        logits = self.classifier(cls)
        return logits

model = TransformerEncoderClassifier(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=CONFIG["d_model"],
    n_heads=CONFIG["n_heads"],
    n_layers=CONFIG["n_layers"],
    ffn_mult=CONFIG["ffn_mult"],
    dropout=CONFIG["dropout"],
    pad_id=PAD_ID
).to(device)

sum(p.numel() for p in model.parameters())/1e6


11.417858

In [33]:
from torch.optim import AdamW

def build_cosine_schedule(optimizer, num_warmup, num_train_steps):
    def lr_lambda(step):
        if step < num_warmup:
            return float(step) / float(max(1, num_warmup))
        progress = float(step - num_warmup) / float(max(1, num_train_steps - num_warmup))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

def evaluate(model, loader):
    model.eval()
    preds, golds = [], []
    with torch.no_grad():
        for ids, attn, y in loader:
            ids, attn, y = ids.to(device), attn.to(device), y.to(device)
            logits = model(ids, attn)
            pred = logits.argmax(dim=-1)
            preds.extend(pred.tolist())
            golds.extend(y.tolist())
    acc = accuracy_score(golds, preds)
    f1  = f1_score(golds, preds)
    return acc, f1

train_steps = len(train_loader) * CONFIG["epochs"]
warmup_steps = int(CONFIG["warmup_ratio"] * train_steps)

optimizer = AdamW(model.parameters(), lr=CONFIG["lr"], weight_decay=CONFIG["weight_decay"])
scheduler = build_cosine_schedule(optimizer, warmup_steps, train_steps)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

best_f1 = 0.0


  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


In [35]:
torch.set_grad_enabled(True)
for p in model.parameters():
    p.requires_grad_(True)
model.train()

for epoch in range(1, CONFIG["epochs"]+1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{CONFIG['epochs']}")
    running = 0.0
    for ids, attn, y in pbar:
        ids, attn, y = ids.to(device), attn.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(ids, attn)
            loss = F.cross_entropy(logits, y)
        scaler.scale(loss).backward()
        # gradient clipping
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), CONFIG["grad_clip"])
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        running += loss.item()
        pbar.set_postfix(loss=f"{running / (pbar.n or 1):.4f}")
    # eval
    acc, f1 = evaluate(model, val_loader)
    print(f"[Val] epoch {epoch}: acc={acc:.4f} | f1={f1:.4f}")
    # save best
    if f1 > best_f1:
        best_f1 = f1
        torch.save({
            "config": CONFIG,
            "state_dict": model.state_dict(),
            "vocab_size": tokenizer.get_vocab_size(),
            "pad_id": PAD_ID,
        }, os.path.join(CONFIG["save_dir"], "best_model.pt"))
        # lưu tokenizer
        os.system(f"cp {CONFIG['tokenizer_file']} {CONFIG['save_dir']}/tokenizer.json")
        print("✅ Saved best to", CONFIG["save_dir"])


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 1/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.30it/s, loss=0.0744]


[Val] epoch 1: acc=0.9667 | f1=0.6457
✅ Saved best to /content/voz_hsd_transformer


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 2/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.31it/s, loss=0.0693]


[Val] epoch 2: acc=0.9674 | f1=0.6646
✅ Saved best to /content/voz_hsd_transformer


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 3/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.31it/s, loss=0.0572]


[Val] epoch 3: acc=0.9629 | f1=0.6791
✅ Saved best to /content/voz_hsd_transformer


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 4/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.30it/s, loss=0.0434]


[Val] epoch 4: acc=0.9655 | f1=0.6864
✅ Saved best to /content/voz_hsd_transformer


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 5/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.30it/s, loss=0.0286]


[Val] epoch 5: acc=0.9677 | f1=0.6897
✅ Saved best to /content/voz_hsd_transformer


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 6/6: 100%|██████████| 1094/1094 [01:57<00:00,  9.31it/s, loss=0.0198]


[Val] epoch 6: acc=0.9662 | f1=0.6826


In [38]:
# load best
ckpt = torch.load(os.path.join(CONFIG["save_dir"], "best_model.pt"), map_location=device)
model.load_state_dict(ckpt["state_dict"]); model.eval()

def predict(texts: List[str]):
    batch = []
    for t in texts:
        enc = tokenizer.encode(t)
        ids = torch.tensor(enc.ids, dtype=torch.long)
        attn= torch.tensor([0 if i==PAD_ID else 1 for i in enc.ids], dtype=torch.long)
        batch.append((ids, attn))
    ids = torch.stack([b[0] for b in batch]).to(device)
    attn= torch.stack([b[1] for b in batch]).to(device)
    with torch.no_grad():
        logits = model(ids, attn)
        prob = logits.softmax(-1).tolist()
        pred = logits.argmax(-1).tolist()
    return pred, prob

sample_texts = [
    "Thứ này đọc chán thật.",
    "Súc vật, ngu loz",
    "Một con Bò bị tông",
]
pred, prob = predict(sample_texts)
list(zip(sample_texts, pred, prob))


[('Thứ này đọc chán thật.', 0, [0.9998008608818054, 0.00019912670541089028]),
 ('Súc vật, ngu loz', 1, [0.0017155666137114167, 0.9982843995094299]),
 ('Một con Bò bị tông', 0, [0.9920392036437988, 0.007960781455039978])]

In [39]:
# ===== Cell 10 — ONNX export via custom encoder (no unsupported ops) =====
import os, math, torch
import torch.nn as nn

model.eval()
torch.set_grad_enabled(False)

# ---- 1) Định nghĩa encoder thuần MatMul/Softmax (ONNX-friendly) ----
class ExportableEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.0):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        # QKV + Out proj
        self.q_proj = nn.Linear(d_model, d_model, bias=True)
        self.k_proj = nn.Linear(d_model, d_model, bias=True)
        self.v_proj = nn.Linear(d_model, d_model, bias=True)
        self.o_proj = nn.Linear(d_model, d_model, bias=True)

        # FFN
        self.lin1 = nn.Linear(d_model, d_ff, bias=True)
        self.lin2 = nn.Linear(d_ff, d_model, bias=True)

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def _split_heads(self, x):   # (B, L, D) -> (B, H, L, d_k)
        B, L, D = x.shape
        x = x.view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        return x

    def _merge_heads(self, x):   # (B, H, L, d_k) -> (B, L, D)
        B, H, L, d_k = x.shape
        x = x.transpose(1, 2).contiguous().view(B, L, H * d_k)
        return x

    def forward(self, x, attention_mask):  # x: (B,L,D), attention_mask: (B,L) with 1 for valid, 0 for pad
        # --- Self-Attention ---
        q = self.q_proj(x)  # (B,L,D)
        k = self.k_proj(x)
        v = self.v_proj(x)

        q = self._split_heads(q)  # (B,H,L,d_k)
        k = self._split_heads(k)
        v = self._split_heads(v)

        # scaled dot-product
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)  # (B,H,L,L)

        # additive mask: convert 1/0 to 0/-1e9 then unsqueeze for heads
        if attention_mask is not None:
            # attention_mask: (B,L), 1 valid, 0 pad
            # mask for keys: where pad -> -1e9
            key_mask = (1.0 - attention_mask.float()) * -1e9  # (B,L)
            scores = scores + key_mask.unsqueeze(1).unsqueeze(2)  # broadcast to (B,H,L,L)

        attn = torch.softmax(scores, dim=-1)  # (B,H,L,L)
        attn = self.dropout(attn)
        context = torch.matmul(attn, v)       # (B,H,L,d_k)
        context = self._merge_heads(context)  # (B,L,D)
        out = self.o_proj(context)            # (B,L,D)

        # residual + norm
        x = self.ln1(x + self.dropout(out))

        # --- FFN ---
        f = self.lin2(self.dropout(self.act(self.lin1(x))))
        x = self.ln2(x + self.dropout(f))
        return x

class ExportableEncoder(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, n_layers, dropout=0.0):
        super().__init__()
        self.layers = nn.ModuleList([
            ExportableEncoderLayer(d_model, n_heads, d_ff, dropout=dropout)
            for _ in range(n_layers)
        ])

    def forward(self, x, attention_mask):
        for layer in self.layers:
            x = layer(x, attention_mask)
        return x

class ExportableClassifier(nn.Module):
    def __init__(self, base, pad_id):
        super().__init__()
        # lấy cấu trúc & tham số từ model đã huấn luyện
        self.pad_id = pad_id
        self.tok_emb = base.tok_emb
        self.pos     = base.pos
        self.norm    = base.norm
        self.classifier = base.classifier

        # build encoder ONNX-friendly có cùng siêu tham số
        d_model = base.encoder.layers[0].linear1.in_features  # = CONFIG["d_model"]
        n_heads = base.encoder.layers[0].self_attn.num_heads
        d_ff    = base.encoder.layers[0].linear1.out_features
        n_layers= len(base.encoder.layers)
        self.encoder = ExportableEncoder(d_model, n_heads, d_ff, n_layers, dropout=0.0)  # dropout=0 khi export

        # copy trọng số từ nn.TransformerEncoderLayer sang ExportableEncoderLayer
        for i, (src, dst) in enumerate(zip(base.encoder.layers, self.encoder.layers)):
            # self-attn weights (MHA): in_proj_weight/bias = [Q;K;V]
            Wq, Wk, Wv = torch.chunk(src.self_attn.in_proj_weight.detach().clone(), 3, dim=0)
            bq, bk, bv = torch.chunk(src.self_attn.in_proj_bias.detach().clone(),   3, dim=0)
            with torch.no_grad():
                dst.q_proj.weight.copy_(Wq)
                dst.k_proj.weight.copy_(Wk)
                dst.v_proj.weight.copy_(Wv)
                dst.q_proj.bias.copy_(bq)
                dst.k_proj.bias.copy_(bk)
                dst.v_proj.bias.copy_(bv)

                dst.o_proj.weight.copy_(src.self_attn.out_proj.weight.detach())
                dst.o_proj.bias.copy_(src.self_attn.out_proj.bias.detach())

                # FFN
                dst.lin1.weight.copy_(src.linear1.weight.detach())
                dst.lin1.bias.copy_(src.linear1.bias.detach())
                dst.lin2.weight.copy_(src.linear2.weight.detach())
                dst.lin2.bias.copy_(src.linear2.bias.detach())

                # LayerNorms
                dst.ln1.weight.copy_(src.norm1.weight.detach())
                dst.ln1.bias.copy_(src.norm1.bias.detach())
                dst.ln2.weight.copy_(src.norm2.weight.detach())
                dst.ln2.bias.copy_(src.norm2.bias.detach())

    def forward(self, input_ids, attention_mask):
        # attention_mask (B,L) 1=valid, 0=pad (đã xây trong dataloader)
        x = self.tok_emb(input_ids)       # (B,L,D)
        x = self.pos(x)                   # add PE
        x = self.encoder(x, attention_mask.float())  # (B,L,D)
        cls = x[:, 0, :]
        cls = self.norm(cls)
        logits = self.classifier(cls)
        return logits

# ---- 2) Khởi tạo exportable model & xuất ONNX ----
exportable = ExportableClassifier(model, pad_id=PAD_ID).to(device).eval()

# Input mẫu ổn định
example_text = "ví dụ chạy thử để export"
enc = tokenizer.encode(example_text)
ids  = torch.tensor([enc.ids], dtype=torch.long).to(device)
attn = torch.tensor([[0 if i == PAD_ID else 1 for i in enc.ids]], dtype=torch.long).to(device)

onnx_path = os.path.join(CONFIG["save_dir"], "model.onnx")
torch.onnx.export(
    exportable,
    (ids, attn),
    onnx_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch", 1: "seq"},
                  "attention_mask": {0: "batch", 1: "seq"},
                  "logits": {0: "batch"}},
    opset_version=17,
    do_constant_folding=True,
)
print("✅ Saved ONNX:", onnx_path)

# ---- 3) Kiểm tra nhanh bằng ONNXRuntime (optional) ----
try:
    import onnxruntime as ort
    import numpy as np
    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    logits = sess.run(
        ["logits"],
        {
            "input_ids": ids.detach().cpu().numpy().astype("int64"),
            "attention_mask": attn.detach().cpu().numpy().astype("int64"),
        }
    )[0]
    print("ONNXRuntime test ok. logits shape:", logits.shape)
except Exception as e:
    print("[WARN] ONNXRuntime quick test skipped/failed:", repr(e))


  torch.onnx.export(


✅ Saved ONNX: /content/voz_hsd_transformer/model.onnx
ONNXRuntime test ok. logits shape: (1, 2)
