# 공통함수 및 데이터셋

In [1]:
from src.lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from PIL import Image
import numpy as np
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from lerobot.policies.act.modeling_act import ACTPolicy

dataset = LeRobotDataset(
    repo_id="lerobot/custom_dataset",
    root='/mnt/d/lerobot_dataset/panda_robot_dataset_lerobot/v1_latent',
    episodes = list(range(700, 1000))
)

dataloader = torch.utils.data.DataLoader(
    dataset,
    #num_workers=8,
    batch_size=1,
    shuffle=False,
    #pin_memory=device.type != "cpu",
    drop_last=False,
)

def inputs_maker(processor, batch, i):
    ImageSet = []

    tasks = batch['task']
    
    pil_img_m = np.transpose((batch['observation.mid_image'][0].numpy() * 255), [1,2,0])
    pil_img_r = np.transpose((batch['observation.right_image'][0].numpy() * 255), [1,2,0])
    pil_img_l = np.transpose((batch['observation.left_image'][0].numpy() * 255), [1,2,0])

    ImageSet.append(pil_img_m)
    ImageSet.append(pil_img_r)
    ImageSet.append(pil_img_l)
    
    texts = [(
        '<|im_start|>system\n'
        'You are a single-arm gripper-type robot. You have just received the following images from three cameras. '
        'Picture 1: <|vision_start|><|image_pad|><|vision_end|> - Captured by the wrist-mounted camera. '
        'Picture 2: <|vision_start|><|image_pad|><|vision_end|> - Captured by the rear-right camera, showing both the robot and its environment. '
        'Picture 3: <|vision_start|><|image_pad|><|vision_end|> - Captured by the front-left camera, showing both the robot and its environment.<|im_end|>\n'
        '<|im_start|>user\n'
        f'How should you move when you need to {task}?'
        '<|im_start|>assistant\n'
    )
             for task in tasks
            ]
    
    inputs = processor(
        text=texts,
        images=ImageSet,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
    return inputs


model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    #torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
processor.tokenizer.padding_side = "left"


batch = next(iter(dataloader))
inputs = inputs_maker(processor, batch, 0)

Resolving data files:   0%|          | 0/300 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

# 확인

In [2]:
output1 = model.generate(**inputs,
                         max_new_tokens=1,
                         do_sample=False,
                         output_hidden_states=True,
                         return_dict_in_generate=True)['hidden_states'][0][10][:, -1, :]
print('output1 :', output1)

#while len(model.model.layers) > 11:
#    model.model.layers.pop(-1)

#output2 = model.generate(**inputs,
#              max_new_tokens = 1,
#              do_sample = False,
#              output_hidden_states = True,
#              return_dict_in_generate = True)['hidden_states'][0][-1][:, -1, :]
#print('output2 :', output2)



output1 : tensor([[ 0.1719,  0.0010,  0.4961,  ..., -0.2148, -0.0132, -0.5273]],
       device='cuda:0', dtype=torch.bfloat16)


In [12]:
import copy
from contextlib import contextmanager

@contextmanager
def temp_truncate(model, n_layers: int):
    saved = model.model.layers
    try:
        model.model.layers = torch.nn.ModuleList(list(saved)[:n_layers])
        yield
    finally:
        model.model.layers = saved

model.eval()
with torch.inference_mode():
    # 1) 풀 모델로 10번째 블록 출력 뽑기 (임베딩=0, 블록=1..N)
    out_full = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)
    h10_full = out_full.hidden_states[10][:, -1, :]

    with temp_truncate(model, 10):
        out_trunc = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)
        h10_trunc = out_trunc.hidden_states[-1][:, -1, :]

In [13]:
h10_full

tensor([[ 0.1719,  0.0010,  0.4961,  ..., -0.2148, -0.0132, -0.5273]],
       device='cuda:0', dtype=torch.bfloat16)

In [14]:
h10_trunc

tensor([[ 0.6484,  0.0037,  1.8750,  ..., -0.8438, -0.0493, -1.9766]],
       device='cuda:0', dtype=torch.bfloat16)

In [20]:
out_fast.hidden_states[-1][:,-1,:]

tensor([[ 0.6484,  0.0037,  1.8750,  ..., -0.8438, -0.0493, -1.9766]],
       device='cuda:0', dtype=torch.bfloat16)

In [21]:
del model
del model_fast
del out_full
del out_fast
del h10_full
del h10_fast
torch.cuda.empty_cache()

In [35]:
h10_full

NameError: name 'h10_full' is not defined

In [6]:
import copy, torch
from contextlib import contextmanager

# --- 0) 결정론/평가 모드(가능한 범위) ---
model.eval()
torch.manual_seed(0)
torch.backends.cuda.matmul.allow_tf32 = False  # 두 실행 모두 동일하게
# (FA2는 완전 결정론 보장이 아니지만, 같은 경로/설정이면 보통 동일합니다)

# --- 1) 레이어 훅 세팅: 각 Transformer block의 출력을 캡처 ---
hook_outs = {}  # {layer_idx: tensor}
hooks = []

def make_hook(i):
    def _hook(module, inputs, output):
        # 블록의 forward 반환이 tensor 또는 tuple일 수 있으므로 방어적으로 처리
        out = output[0] if isinstance(output, (tuple, list)) else output
        hook_outs[i] = out
    return _hook

# 모델의 Transformer 블록 리스트 (Qwen: model.model.layers)
layers = model.model.layers
num_layers = len(layers)

for i, layer in enumerate(layers):
    hooks.append(layer.register_forward_hook(make_hook(i)))

# --- 2) 입력 준비 (질문에서 쓰신 inputs_maker 그대로) ---
batch = next(iter(dataloader))
inputs = inputs_maker(processor, batch, 0)
inputs = {k: (v.to(model.device) if torch.is_tensor(v) else v) for k, v in inputs.items()}

# --- 3) 풀모델 1회 forward: hidden_states와 hook 결과를 1:1 검증 ---
with torch.inference_mode():
    out_full = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)

hs = out_full.hidden_states  # 길이 = num_layers + 1 (임베딩 + 각 블록)
assert len(hs) == num_layers + 1, (len(hs), num_layers)

# 각 레이어에 대해 hidden_states[k+1] vs hook_outs[k] 동일성 확인
max_diffs = []
for k in range(num_layers):
    # 시퀀스의 마지막 토큰만 비교(전체 비교도 가능)
    a = hs[k+1][:, -1, :]
    b = hook_outs[k][:, -1, :]

    diff = (a - b).abs().max().item()
    max_diffs.append(diff)

print(f"[검증1] hidden_states[k+1] vs hook_outs[k]  (k=0..{num_layers-1})")
print("max |diff| per layer:", max_diffs)
print("allclose per layer:", [torch.allclose(hs[k+1][:, -1, :], hook_outs[k][:, -1, :], atol=1e-5, rtol=1e-5) for k in range(num_layers)])

# --- 4) 10번째 블록 출력(=hidden_states[10])을 '정답'으로 삼음 ---
h10_full = hs[10][:, -1, :].contiguous()

# --- 5) trunc 실행: 같은 모델에서 '실행만' 10층으로 제한 ---
@contextmanager
def temp_truncate(m, n_layers: int):
    saved = m.model.layers
    try:
        m.model.layers = torch.nn.ModuleList(list(saved)[:n_layers])
        yield
    finally:
        m.model.layers = saved

with torch.inference_mode():
    with temp_truncate(model, 10):
        out_trunc = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)

# trunc의 최종 블록 출력은 hidden_states[-1] == hidden_states[10]이어야 함
h_last_trunc = out_trunc.hidden_states[-1][:, -1, :].contiguous()

# --- 6) 동일성 체크 ---
diff_trunc = (h10_full - h_last_trunc).abs().max().item()
print("[검증2] 풀모델의 hidden_states[10]  <->  trunc의 hidden_states[-1]")
print("max |diff|:", diff_trunc)
print("allclose:", torch.allclose(h10_full, h_last_trunc, atol=1e-5, rtol=1e-5))

# --- 7) 훅 해제 ---
for h in hooks:
    h.remove()


[검증1] hidden_states[k+1] vs hook_outs[k]  (k=0..27)
max |diff| per layer: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 76.5]
allclose per layer: [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False]
[검증2] 풀모델의 hidden_states[10]  <->  trunc의 hidden_states[-1]
max |diff|: 68.0
allclose: False


In [15]:
import copy, torch
from contextlib import contextmanager

model.eval()
torch.manual_seed(0)
torch.backends.cuda.matmul.allow_tf32 = False

# 0) 최종 정규화 모듈 식별 (Qwen: 대개 model.model.norm; 일부 모델은 ln_f)
final_norm = getattr(model.model, "norm", None) or getattr(model.model, "ln_f", None)
assert final_norm is not None, "final norm 모듈을 찾지 못했습니다 (model.model.norm 또는 ln_f)."

# 1) 블록 출력 훅
hook_outs = {}
hooks = []
def make_hook(i):
    def _hook(module, inputs, output):
        out = output[0] if isinstance(output, (tuple, list)) else output
        hook_outs[i] = out
    return _hook

layers = model.model.layers
num_layers = len(layers)
for i, layer in enumerate(layers):
    hooks.append(layer.register_forward_hook(make_hook(i)))

# 2) final norm 훅
norm_out = {}
def norm_hook(module, inputs, output):
    norm_out["out"] = output
h_norm = final_norm.register_forward_hook(norm_hook)

# 3) 입력 만들기 (당신의 inputs_maker 사용)
batch = next(iter(dataloader))
inputs = inputs_maker(processor, batch, 0)
inputs = {k:(v.to(model.device) if torch.is_tensor(v) else v) for k,v in inputs.items()}

with torch.inference_mode():
    out_full = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)

hs = out_full.hidden_states  # len = num_layers + 1

# --- A. 마지막 레이어 불일치 원인 확인: hs[-1] == final_norm(hook_outs[last]) ---
last_block_raw = hook_outs[num_layers - 1]              # 마지막 블록 생 출력
last_block_post = final_norm(last_block_raw)            # 정규화 후
assert "out" in norm_out
print("[확인A-1] hs[-1] vs final_norm(hook_outs[last])  max|diff| =",
      (hs[-1] - last_block_post).abs().max().item())
print("[확인A-2] hs[-1] vs (norm 훅 출력) allclose =",
      torch.allclose(hs[-1], norm_out["out"], atol=1e-5, rtol=1e-5))

# 4) 임시 트렁케이트 유틸
@contextmanager
def temp_truncate(m, n_layers: int):
    saved = m.model.layers
    try:
        m.model.layers = torch.nn.ModuleList(list(saved)[:n_layers])
        yield
    finally:
        m.model.layers = saved

# 5) “같은 기준”으로 비교: (정규화 후 기준)
with torch.inference_mode():
    with temp_truncate(model, 10):
        # trunc 실행에서도 final norm 훅으로 캡처
        norm_out_tr = {}
        def norm_hook_tr(m, i, o): norm_out_tr["out"] = o
        h_norm_tr = final_norm.register_forward_hook(norm_hook_tr)

        out_trunc = model(**inputs, use_cache=False, output_hidden_states=True, return_dict=True)
        hs_trunc_last = out_trunc.hidden_states[-1]  # trunc의 마지막 = 정규화 후 값

        h_norm_tr.remove()

# 풀모델의 10번째 블록 “정규화 후” vs trunc 마지막(=정규화 후)
h10_full_pre  = hs[10]
h10_full_post = final_norm(h10_full_pre)

print("[확인B] full norm(hs[10])  vs  trunc hs[-1]   max|diff| =",
      (h10_full_post - hs_trunc_last).abs().max().item())
print("allclose:", torch.allclose(h10_full_post, hs_trunc_last, atol=1e-5, rtol=1e-5))

# 6) 훅 해제
for h in hooks: h.remove()
h_norm.remove()


[확인A-1] hs[-1] vs final_norm(hook_outs[last])  max|diff| = 0.0
[확인A-2] hs[-1] vs (norm 훅 출력) allclose = True
[확인B] full norm(hs[10])  vs  trunc hs[-1]   max|diff| = 0.0
allclose: True
