# Llena inspection notebook

Quick, small-scale inspection of the datasets, the tokenizer, and the model's inputs and outputs.

In [None]:
# Optional (Colab only): uncomment the line below to install the dependencies.
# NOTE(review): the versions are unpinned; consider pinning them, and prefer `%pip`
# over `!pip` so the install targets the running kernel's environment.
# !pip -q install torch torchvision transformers datasets peft bitsandbytes pillow tqdm wandb


In [None]:
import os, sys
from pathlib import Path

# Resolve the repository root: when the kernel was started inside notebooks/,
# the root is the parent directory; otherwise the current directory is used.
repo_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

# Put the repo root first on sys.path so project packages are importable.
# Any existing occurrence is dropped first, which makes the cell idempotent:
# re-running it no longer stacks duplicate entries on sys.path.
_repo_root_str = str(repo_root)
sys.path[:] = [entry for entry in sys.path if entry != _repo_root_str]
sys.path.insert(0, _repo_root_str)

print('repo_root:', repo_root)


In [None]:
import torch
from datasets import load_dataset

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device:', device)


## Load a small dataset slice

This uses a small validation slice to keep it fast.

In [None]:
# Caveat: taking a small slice does not stop HF from downloading parquet shards.
# The whole validation split is loaded, then only its first 50 rows are kept.
ds_full = load_dataset('lmms-lab/textvqa', split='validation')
ds = ds_full.select(range(50))
print(ds)

# Keep the first record around for the inspection cells below and list its fields.
sample = ds[0]
list(sample)


In [None]:
# Inspect a sample: the bare expression makes the notebook render the first
# record (`ds[0]`, bound to `sample` in the previous cell) as this cell's output.
sample


## Build Llena model + collator

Use a small config and run a single forward pass.

In [None]:
from mm.model import LlenaModel, LlenaModelConfig
from mm.collator import LlenaCollator
from transformers import SiglipImageProcessor

# Small, fully frozen configuration: both towers are frozen and the model is put
# in eval mode, since this section only runs a single forward pass.
target_device = 'cuda' if device.type == 'cuda' else 'cpu'
cfg = LlenaModelConfig(
    device=target_device,
    llm_name='Qwen/Qwen2.5-0.5B-Instruct',
    vision_name='google/siglip-base-patch16-224',
    projector='mlp2',
    num_image_tokens=64,
    freeze_llm=True,
    freeze_vision=True,
    gradient_checkpointing=False,
)
model = LlenaModel(cfg)
model.eval()

# The collator reuses the model's tokenizer and the image processor that matches
# the vision tower named in the config.
image_proc = SiglipImageProcessor.from_pretrained(cfg.vision_name)
collator = LlenaCollator(
    image_processor=image_proc,
    tokenizer=model.tokenizer,
    num_image_tokens=cfg.num_image_tokens,
    max_seq_len=128,
    pad_to_multiple_of=None,
)


In [None]:
# Build a small batch from the HF dataset
# Convert HF sample to VQASample format
from PIL import Image

def to_vqa(sample):
    """Reshape one dataset record into the dict layout handed to the collator.

    Args:
        sample: Mapping with 'image', 'question' and a non-empty 'answers'
            sequence.

    Returns:
        A dict with keys 'image', 'question', 'answer' (the first entry of
        'answers') and 'answers' (the original sequence, not copied).

    Raises:
        KeyError: If one of the required keys is missing.
        IndexError: If 'answers' is empty.
    """
    image = sample['image']
    question = sample['question']
    answers = sample['answers']
    return dict(image=image, question=question, answer=answers[0], answers=answers)

# Collate the first two records of the slice into one batch.
batch = [to_vqa(ds[row]) for row in range(2)]
out = collator(batch)

# Only tensor entries are moved to the device; any non-tensor entries in the
# collator output are left out of batch_t.
batch_t = {
    name: value.to(device)
    for name, value in out.items()
    if torch.is_tensor(value)
}

# Single forward pass with gradients disabled.
with torch.no_grad():
    outputs = model(
        input_ids=batch_t['input_ids'],
        pixel_values=batch_t['pixel_values'],
        mm_labels=batch_t['mm_labels'],
        mm_attention_mask=batch_t['mm_attention_mask'],
    )

(outputs.loss, outputs.logits.shape)


In [None]:
# Decode the model's argmax tokens for the answer region (quick sanity check).
logits = outputs.logits
pred_ids = logits.argmax(dim=-1)

# Positions whose label is not the ignore index (-100) form the answer region.
mask = batch_t['mm_labels'][0] != -100

# In a causal LM the logits at position t score the token at position t + 1, so
# the prediction for a labelled position lives one step earlier. Indexing
# pred_ids with the unshifted mask would decode the tokens *after* the answer.
# NOTE(review): assumes LlenaModel returns unshifted logits aligned with
# mm_labels, as Hugging Face causal LMs do — confirm against mm.model.
pred_seq = pred_ids[0, :-1][mask[1:]].tolist()
model.tokenizer.decode(pred_seq, skip_special_tokens=True)


## Optional: inspect processed JSONL dataset

If you have processed TextVQA data under `datasets/processed/textvqa`, you can load it with `JsonlVQADataset`.

In [None]:
from data.format import JsonlVQADataset

# Expected layout of the processed TextVQA export under the repo root.
proc_root = repo_root / 'datasets' / 'processed' / 'textvqa'
jsonl = proc_root / 'validation.jsonl'
images = proc_root / 'images'

# The section is optional: without the annotations file we only report that.
if not jsonl.exists():
    print('No processed JSONL found at', jsonl)
else:
    ds_jsonl = JsonlVQADataset(image_root=images, annotations_path=jsonl, max_samples=5)
    print('jsonl samples:', len(ds_jsonl))
    print(ds_jsonl[0])


## Flash attention check

Set a config path and device, then verify `attn_implementation`.

In [None]:
# Run config that the following cell parses with load_config / RunConfig.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm when running from notebooks/.
CFG_PATH = "configs/L4/sharegpt4v_train_qwen2.5-0.5b_siglip224.yaml"
# Device requested for the model load further down; when it is "cuda" and CUDA
# is unavailable, that cell skips the load instead of failing.
DEVICE = "cuda"  # or "cpu"


In [None]:
from mm.config import load_config
from mm.run_config import RunConfig

# Parse the config file and wrap it in a RunConfig so fields can be read as attributes.
# NOTE: this rebinds `cfg`, which an earlier cell used for the LlenaModelConfig instance.
cfg = load_config(CFG_PATH)
rc = RunConfig.from_dict(cfg)
# Shows the attention implementation requested by the config (not yet applied to any model).
print("attn_implementation:", rc.model.attn_implementation)


In [None]:
import torch
from mm.model import LlenaModel, LlenaModelConfig

# Build the model unless CUDA was requested but this machine has none.
if DEVICE != "cuda" or torch.cuda.is_available():
    # Model settings come from the run config; PEFT and QLoRA are switched off
    # because this cell only exercises the attention implementation setting.
    mcfg = LlenaModelConfig(
        device=DEVICE,
        llm_name=rc.model.llm_name,
        vision_name=rc.model.vision_name,
        attn_implementation=rc.model.attn_implementation,
        num_image_tokens=rc.mm.num_image_tokens,
        projector=rc.mm.projector,
        gradient_checkpointing=rc.train.gradient_checkpointing,
        peft_enable=False,
        qlora_enable=False,
    )
    model = LlenaModel(mcfg)
    print("Loaded model with attn_implementation:", rc.model.attn_implementation)
else:
    print("CUDA not available; skipping model load.")


## Debug generation (empty outputs)
Use this to test a single image + prompt and inspect raw generation.


In [None]:
from pathlib import Path
import torch
from PIL import Image
from transformers import SiglipImageProcessor
from mm.model import LlenaModel, LlenaModelConfig, _select_or_pad_tokens

# --- Inputs to edit before running ---
CKPT_PATH = 'artifacts/your_run/step_xxx/ckpt.pt'  # TODO
IMAGE_PATH = 'datasets/processed/sharegpt4v_coco/images/000000000073.jpg'  # TODO
QUESTION = 'what is the license plate of this vehicle'
MAX_GENERATED_TOKENS = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SECURITY: torch.load unpickles the file, so only open checkpoints you trust.
# The checkpoint is staged on CPU; load_state_dict copies the values into the
# model's existing parameters, so no second copy of the weights sits on the GPU.
ckpt = torch.load(CKPT_PATH, map_location='cpu')
cfg_d = ckpt.get('cfg')
if not isinstance(cfg_d, dict):
    raise ValueError('ckpt missing cfg')

# Rebuild the model from the config stored in the checkpoint, frozen for inference.
cfg = LlenaModelConfig(
    llm_name=cfg_d['llm_name'],
    vision_name=cfg_d['vision_name'],
    num_image_tokens=cfg_d['num_image_tokens'],
    projector=cfg_d['projector'],
    gradient_checkpointing=False,
    freeze_vision=True,
    freeze_llm=True,
    peft_enable='adapter' in ckpt,
    # The LoRA shape settings have to match the ones the adapter was trained
    # with, otherwise the adapter weights cannot be loaded.
    # NOTE(review): presumably the checkpoint cfg stores them under the
    # LlenaModelConfig field names — confirm. The previous placeholder values
    # are kept as fallbacks when the keys are absent.
    peft_r=cfg_d.get('peft_r', 0),
    peft_alpha=cfg_d.get('peft_alpha', 0),
    peft_dropout=0.0,
    peft_target_modules=cfg_d.get('peft_target_modules', []),
    qlora_enable=False,
    device='cuda' if device.type == 'cuda' else 'cpu',
)
model = LlenaModel(cfg)
model.eval()

# A checkpoint may carry a full state dict, projector-only weights and/or a PEFT adapter.
if 'model' in ckpt:
    model.load_state_dict(ckpt['model'], strict=True)
if 'projector' in ckpt:
    model.projector.load_state_dict(ckpt['projector'], strict=True)
if 'adapter' in ckpt:
    from peft import set_peft_model_state_dict
    set_peft_model_state_dict(model.llm, ckpt['adapter'])

image_proc = SiglipImageProcessor.from_pretrained(cfg.vision_name)
# The context manager closes the file handle; convert() returns a loaded copy.
with Image.open(IMAGE_PATH) as raw_img:
    img = raw_img.convert('RGB')
vision = image_proc(images=[img], return_tensors='pt')
pixel_values = vision['pixel_values'].to(device)

tokenizer = model.tokenizer
# Explicit None check: a pad token id of 0 is valid and must not fall through to EOS.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
prompt_ids = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': QUESTION}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors=None,
)
input_ids = torch.tensor([prompt_ids], device=device)
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    # Vision tower -> fixed number of image tokens -> projector.
    v_out = model.vision(pixel_values=pixel_values)
    v_tokens = v_out.last_hidden_state
    if model.freeze_vision:
        v_tokens = v_tokens.detach()
    v_tokens = _select_or_pad_tokens(v_tokens, model.num_image_tokens)
    proj_dtype = next(model.projector.parameters()).dtype
    v_tokens = v_tokens.to(dtype=proj_dtype)
    img_embeds = model.projector(v_tokens)

    # Image embeddings are prepended to the text embeddings along the sequence axis.
    text_embeds = model.llm.get_input_embeddings()(input_ids)
    if img_embeds.dtype != text_embeds.dtype:
        img_embeds = img_embeds.to(dtype=text_embeds.dtype)
    inputs_embeds = torch.cat([img_embeds, text_embeds], dim=1)

    # Extend the attention mask with ones covering the image prefix.
    prefix_mask = torch.ones((attention_mask.size(0), model.num_image_tokens), dtype=attention_mask.dtype, device=attention_mask.device)
    mm_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

    # Greedy decoding. `temperature` is a sampling-only flag, so it is not passed
    # together with do_sample=False.
    gen_ids = model.llm.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=mm_attention_mask,
        do_sample=False,
        num_beams=1,
        max_new_tokens=MAX_GENERATED_TOKENS,
        repetition_penalty=1.0,
        pad_token_id=pad_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Length of the embedded prefix (image tokens + prompt), kept for inspection only.
prefix_len = inputs_embeds.size(1)
# generate() received inputs_embeds and no input_ids, so the returned ids hold
# only the newly generated tokens. Slicing off prefix_len positions here would
# discard the start of the response and make short answers look empty.
gen_new = gen_ids
print('prompt tokens:', len(prompt_ids))
print('generated tokens:', gen_new.size(1))
print('response:', tokenizer.batch_decode(gen_new.tolist(), skip_special_tokens=True)[0])


## Debug generation from eval samples
Loads one sample from `datasets/samples.json` and runs greedy generation (up to `MAX_GENERATED_TOKENS` new tokens) on it for inspection.


In [None]:
import json
from pathlib import Path
import torch
from PIL import Image
from transformers import SiglipImageProcessor
from mm.model import LlenaModel, LlenaModelConfig, _select_or_pad_tokens

# --- Inputs to edit before running ---
SAMPLES_JSON = 'datasets/samples.json'  # TODO
SAMPLE_INDEX = 0
CKPT_PATH = 'artifacts/your_run/step_xxx/ckpt.pt'  # TODO
MAX_GENERATED_TOKENS = 256

# Pick one record from the samples file; the image may be stored under either key.
sample = json.loads(Path(SAMPLES_JSON).read_text(encoding='utf-8'))[SAMPLE_INDEX]
IMAGE_PATH = sample.get('image_path') or sample.get('image')
QUESTION = sample.get('question')
if not IMAGE_PATH or not QUESTION:
    raise ValueError('Sample must include image_path and question')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SECURITY: torch.load unpickles the file, so only open checkpoints you trust.
# The checkpoint is staged on CPU; load_state_dict copies the values into the
# model's existing parameters, so no second copy of the weights sits on the GPU.
ckpt = torch.load(CKPT_PATH, map_location='cpu')
cfg_d = ckpt.get('cfg')
if not isinstance(cfg_d, dict):
    raise ValueError('ckpt missing cfg')

# Rebuild the model from the config stored in the checkpoint, frozen for inference.
cfg = LlenaModelConfig(
    llm_name=cfg_d['llm_name'],
    vision_name=cfg_d['vision_name'],
    num_image_tokens=cfg_d['num_image_tokens'],
    projector=cfg_d['projector'],
    gradient_checkpointing=False,
    freeze_vision=True,
    freeze_llm=True,
    peft_enable='adapter' in ckpt,
    # The LoRA shape settings have to match the ones the adapter was trained
    # with, otherwise the adapter weights cannot be loaded.
    # NOTE(review): presumably the checkpoint cfg stores them under the
    # LlenaModelConfig field names — confirm. The previous placeholder values
    # are kept as fallbacks when the keys are absent.
    peft_r=cfg_d.get('peft_r', 0),
    peft_alpha=cfg_d.get('peft_alpha', 0),
    peft_dropout=0.0,
    peft_target_modules=cfg_d.get('peft_target_modules', []),
    qlora_enable=False,
    device='cuda' if device.type == 'cuda' else 'cpu',
)
model = LlenaModel(cfg)
model.eval()

# A checkpoint may carry a full state dict, projector-only weights and/or a PEFT adapter.
if 'model' in ckpt:
    model.load_state_dict(ckpt['model'], strict=True)
if 'projector' in ckpt:
    model.projector.load_state_dict(ckpt['projector'], strict=True)
if 'adapter' in ckpt:
    from peft import set_peft_model_state_dict
    set_peft_model_state_dict(model.llm, ckpt['adapter'])

image_proc = SiglipImageProcessor.from_pretrained(cfg.vision_name)
# The context manager closes the file handle; convert() returns a loaded copy.
with Image.open(IMAGE_PATH) as raw_img:
    img = raw_img.convert('RGB')
vision = image_proc(images=[img], return_tensors='pt')
pixel_values = vision['pixel_values'].to(device)

tokenizer = model.tokenizer
# Explicit None check: a pad token id of 0 is valid and must not fall through to EOS.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
prompt_ids = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': QUESTION}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors=None,
)
input_ids = torch.tensor([prompt_ids], device=device)
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    # Vision tower -> fixed number of image tokens -> projector.
    v_out = model.vision(pixel_values=pixel_values)
    v_tokens = v_out.last_hidden_state
    if model.freeze_vision:
        v_tokens = v_tokens.detach()
    v_tokens = _select_or_pad_tokens(v_tokens, model.num_image_tokens)
    proj_dtype = next(model.projector.parameters()).dtype
    v_tokens = v_tokens.to(dtype=proj_dtype)
    img_embeds = model.projector(v_tokens)

    # Image embeddings are prepended to the text embeddings along the sequence axis.
    text_embeds = model.llm.get_input_embeddings()(input_ids)
    if img_embeds.dtype != text_embeds.dtype:
        img_embeds = img_embeds.to(dtype=text_embeds.dtype)
    inputs_embeds = torch.cat([img_embeds, text_embeds], dim=1)

    # Extend the attention mask with ones covering the image prefix.
    prefix_mask = torch.ones((attention_mask.size(0), model.num_image_tokens), dtype=attention_mask.dtype, device=attention_mask.device)
    mm_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

    # Greedy decoding. `temperature` is a sampling-only flag, so it is not passed
    # together with do_sample=False.
    gen_ids = model.llm.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=mm_attention_mask,
        do_sample=False,
        num_beams=1,
        max_new_tokens=MAX_GENERATED_TOKENS,
        repetition_penalty=1.0,
        pad_token_id=pad_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Length of the embedded prefix (image tokens + prompt), kept for inspection only.
prefix_len = inputs_embeds.size(1)
# generate() received inputs_embeds and no input_ids, so the returned ids hold
# only the newly generated tokens. Slicing off prefix_len positions here would
# discard the start of the response and make short answers look empty.
gen_new = gen_ids
print('image_path:', IMAGE_PATH)
print('question:', QUESTION)
print('prompt tokens:', len(prompt_ids))
print('generated tokens:', gen_new.size(1))
print('response:', tokenizer.batch_decode(gen_new.tolist(), skip_special_tokens=True)[0])


## Download COCO image (if missing)
Fetches a COCO 2017 train image by filename into a local folder.


In [None]:
import urllib.request
from pathlib import Path

COCO_IMAGE = '000000000073.jpg'  # TODO
DEST_DIR = 'datasets/processed/sharegpt4v_coco/images'

dest = Path(DEST_DIR) / COCO_IMAGE
dest.parent.mkdir(parents=True, exist_ok=True)
if not dest.exists():
    url = f'http://images.cocodataset.org/train2017/{COCO_IMAGE}'
    print('downloading', url)
    # Download into a sibling temp file and rename it only once the transfer has
    # finished. An interrupted download then never leaves a truncated image at
    # `dest` that later runs would report as "already exists".
    partial = dest.with_name(dest.name + '.part')
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(dest)
    finally:
        # After a successful rename the temp file is gone; otherwise clean it up.
        if partial.exists():
            partial.unlink()
    print('saved', dest)
else:
    print('already exists', dest)
