In [2]:
# 测试EMOE_RefCOCO-10of100.pt模型的能力，允许加载state_dict时忽略key不匹配

import torch

from src.models.like.emoe.refcoco_model import EMOE_RefCOCO

model_path = "EMOE_RefCOCO-10of100.pt"

# device 设置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 创建模型实例
model = EMOE_RefCOCO()

# 加载模型参数，同时允许跳过部分缺失或多余的key
state_dict = torch.load(model_path, map_location=device)
# 检查键
result = model.load_state_dict(state_dict, strict=False)
print("State dict loading result:")
print("  Missing keys:", result.missing_keys)
print("  Unexpected keys:", result.unexpected_keys)

model.eval()
model.to(device)

# 构造一个refcoco-like的batch输入，维度符合模型期望
B = 2  # batch size
N_OBJ = 5  # 每张图片的区域数（含背景）
C, H, W = 3, 224, 224  # 图像区域通道和尺寸

max_regions = N_OBJ - 1  # 不含背景的region数
max_seq_len = 16  # 一个region的文本描述最大长度

regions = torch.randn(B, N_OBJ, C, H, W).to(device)
texts = torch.randint(0, 49408, (B, max_regions, max_seq_len)).to(
    device
)  # 假设clip tokenizer词表大小49408
text_attn_mask = torch.ones(B, max_regions, max_seq_len, dtype=torch.long).to(device)

# 推理与输出
with torch.no_grad():
    output = model(regions=regions, texts=texts, text_attn_mask=text_attn_mask)
    print("模型输出：", output)

print("模型 {} 已准备好进行测试。".format(model_path))

State dict loading result:
  Missing keys: ['region_encoder.backbone.cls_token', 'region_encoder.backbone.pos_embed', 'region_encoder.backbone.patch_embed.proj.weight', 'region_encoder.backbone.patch_embed.proj.bias', 'region_encoder.backbone.blocks.0.norm1.weight', 'region_encoder.backbone.blocks.0.norm1.bias', 'region_encoder.backbone.blocks.0.attn.qkv.weight', 'region_encoder.backbone.blocks.0.attn.qkv.bias', 'region_encoder.backbone.blocks.0.attn.proj.weight', 'region_encoder.backbone.blocks.0.attn.proj.bias', 'region_encoder.backbone.blocks.0.norm2.weight', 'region_encoder.backbone.blocks.0.norm2.bias', 'region_encoder.backbone.blocks.0.mlp.fc1.weight', 'region_encoder.backbone.blocks.0.mlp.fc1.bias', 'region_encoder.backbone.blocks.0.mlp.fc2.weight', 'region_encoder.backbone.blocks.0.mlp.fc2.bias', 'region_encoder.backbone.blocks.1.norm1.weight', 'region_encoder.backbone.blocks.1.norm1.bias', 'region_encoder.backbone.blocks.1.attn.qkv.weight', 'region_encoder.backbone.blocks.1.at

In [3]:
# 测试 text_features 与 region_features 的关系
text_features = output["text_features"]
region_features = output["region_features"]

# 计算归一化后的余弦相似度矩阵
similarity = torch.bmm(text_features, region_features.transpose(1, 2))
print("样本0的余弦相似度矩阵:")
print(similarity[0])

# 对比模型内部计算的logits
logits_manual = similarity / model.temperature
diff = (logits_manual - output["logits_per_text"]).abs().max().item()
print(f"手动计算的logits与模型输出最大差异: {diff:.6e}")

# 查看每个文本与对应区域的对角相似度
diag_similarity = similarity.diagonal(dim1=-2, dim2=-1)
print("每个文本-区域对的对角相似度:")
print(diag_similarity)

# 检查特征是否经过L2归一化
text_norms = text_features.norm(dim=-1)
region_norms = region_features.norm(dim=-1)
print(
    f"text_features范数范围: [{text_norms.min().item():.4f}, {text_norms.max().item():.4f}]"
)
print(
    f"region_features范数范围: [{region_norms.min().item():.4f}, {region_norms.max().item():.4f}]"
)

样本0的余弦相似度矩阵:
tensor([[0.0220, 0.0174, 0.0080, 0.0264],
        [0.0261, 0.0255, 0.0168, 0.0272],
        [0.0227, 0.0215, 0.0149, 0.0230],
        [0.0152, 0.0131, 0.0008, 0.0124]], device='cuda:0')
手动计算的logits与模型输出最大差异: 0.000000e+00
每个文本-区域对的对角相似度:
tensor([[ 0.0220,  0.0255,  0.0149,  0.0124],
        [ 0.0164,  0.0356, -0.0026,  0.0428]], device='cuda:0')
text_features范数范围: [1.0000, 1.0000]
region_features范数范围: [1.0000, 1.0000]


In [4]:
# 使用COCO2017真实数据测试
from pathlib import Path
from PIL import Image

from src.dataset.alignment.coco2017_alignment_dataset import COCO2017RegionAlignment

coco_root = Path("/home/rczx/workspace/sxy/lab/NeuroTrain/data/coco2017")
coco_dataset = COCO2017RegionAlignment(root_dir=coco_root, split="val")

sample_idx = 0
sample = coco_dataset[sample_idx]
sample_row = coco_dataset.samples.iloc[sample_idx]

regions_real = sample["inputs"].unsqueeze(0).to(device)
texts_real = sample["text_ids"].unsqueeze(0).to(device)
attn_real = sample["text_attn_mask"]
if attn_real is not None:
    attn_real = attn_real.unsqueeze(0).to(device)

region_bboxes = sample_row["bboxes"]
region_labels = sample_row["labels"]
region_texts = [coco_dataset.OBJECT_TEXT.format(label=label) for label in region_labels]

image_path = coco_dataset.img_dir / sample_row["file_name"]
base_image = Image.open(image_path).convert("RGB")
region_crops = []
for box in region_bboxes:
    x, y, w, h = box
    region_crops.append(base_image.crop((x, y, x + w, y + h)))

with torch.no_grad():
    output_real = model(
        regions=regions_real, texts=texts_real, text_attn_mask=attn_real
    )

print(f"从COCO样本 {sample_idx} 获得 {len(region_labels)} 个区域")

从COCO样本 0 获得 4 个区域


In [5]:
# 计算真实数据上 text-region 余弦相似度
text_features_real = output_real["text_features"]
region_features_real = output_real["region_features"]

similarity_real = torch.bmm(text_features_real, region_features_real.transpose(1, 2))
logits_manual_real = similarity_real / model.temperature
diff_real = (logits_manual_real - output_real["logits_per_text"]).abs().max().item()

diag_similarity_real = similarity_real.diagonal(dim1=-2, dim2=-1)

print(f"手动计算的logits与模型输出最大差异: {diff_real:.6e}")
print("真实数据每个文本-区域对的对角相似度:")
print(diag_similarity_real)
print(f"真实数据余弦相似度均值: {diag_similarity_real.mean().item():.4f}")

手动计算的logits与模型输出最大差异: 0.000000e+00
真实数据每个文本-区域对的对角相似度:
tensor([[ 0.0509, -0.0407, -0.0031, -0.0757]], device='cuda:0')
真实数据余弦相似度均值: -0.0172


In [6]:
# 与未训练的 CLIP 模型对比
from transformers import CLIPModel, CLIPConfig, CLIPImageProcessor
import torch.nn.functional as F

clip_ckpt = "openai/clip-vit-base-patch32"
clip_processor = CLIPImageProcessor.from_pretrained(
    clip_ckpt, cache_dir=model.cache_dir
)

clip_inputs = clip_processor(
    text=region_texts,
    images=region_crops,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
clip_inputs = {k: v.to(device) for k, v in clip_inputs.items()}

clip_config = CLIPConfig.from_pretrained(clip_ckpt)
clip_random = CLIPModel(clip_config).to(device)
clip_random.eval()

with torch.no_grad():
    clip_outputs_random = clip_random(**clip_inputs)

text_embeds_random = F.normalize(clip_outputs_random.text_embeds, dim=-1)
image_embeds_random = F.normalize(clip_outputs_random.image_embeds, dim=-1)
similarity_clip_random = text_embeds_random @ image_embeds_random.T
diag_clip_random = similarity_clip_random.diagonal()

print("未训练 CLIP 对角余弦相似度:")
print(diag_clip_random)
print(f"未训练 CLIP 平均对角相似度: {diag_clip_random.mean().item():.4f}")

# 方便对比，再输出 EMOE 结果
print("EMOE 对角余弦相似度:")
print(diag_similarity_real.squeeze(0))
print(f"EMOE 平均对角相似度: {diag_similarity_real.mean().item():.4f}")

Unused or unrecognized kwargs: padding, truncation, text.


ValueError: You have to specify input_ids