# Evo-1 完整测试教程

本教程将带你完整体验 Evo-1 模型的各项功能，包括：

1. 环境检查与模型加载
2. 视觉-语言嵌入器测试
3. 动作头（Flow Matching）测试
4. 完整推理流程测试
5. 可视化与分析
6. 模拟数据训练测试
7. 性能分析
8. 总结

## 准备工作

确保已安装所需依赖：
```bash
pip install torch torchvision transformers pillow matplotlib numpy
```

## 1. 环境检查与模型加载

In [None]:
import sys
import os

# Put the Evo_1 project directory at the front of the module search path
# so that the `scripts` package used below can be imported.
project_root = os.path.abspath('../Evo_1')
sys.path[:0] = [project_root]

print(
    f"项目根目录: {project_root}",
    f"Python 版本: {sys.version}",
    sep="\n",
)

In [None]:
# 检查 CUDA 可用性
import torch

print(
    f"PyTorch 版本: {torch.__version__}",
    f"CUDA 是否可用: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Device details are only queried when a CUDA device is present.
    print(
        f"CUDA 版本: {torch.version.cuda}",
        f"GPU 设备: {torch.cuda.get_device_name(0)}",
        f"GPU 数量: {torch.cuda.device_count()}",
        f"当前 GPU 显存: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB",
        sep="\n",
    )

In [None]:
# 导入必要的库
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import torch.nn as nn
from types import SimpleNamespace

print("✓ 所有依赖导入成功")

### 1.1 加载 Evo-1 模型

In [None]:
from scripts.Evo1 import EVO1

# Model configuration. `action_dim` is the flattened size of one action
# chunk, i.e. horizon * per_action_dim.
config = dict(
    device="cuda" if torch.cuda.is_available() else "cpu",
    vlm_name="OpenGVLab/InternVL3-1B",
    action_head="flowmatching",
    horizon=50,
    per_action_dim=7,
    action_dim=50 * 7,
    state_dim=7,
    embed_dim=896,
    hidden_dim=1024,
    num_heads=8,
    num_layers=8,
    dropout=0.0,
    num_inference_timesteps=50,
    return_cls_only=False,
    finetune_vlm=False,
    finetune_action_head=False,
)

print("正在加载 Evo-1 模型...")
print(f"设备: {config['device']}")

# Note: the first run downloads the InternVL3-1B model (about 2 GB).
# If the pretrained weights are already available locally, `vlm_name`
# can be changed to a local path.
model = EVO1(config)
model.eval()  # evaluation mode

print("✓ 模型加载成功！")

# Parameter statistics: collect (size, requires_grad) once, then sum.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"总参数量: {total_params / 1e6:.2f}M")
print(f"可训练参数量: {trainable_params / 1e6:.2f}M")

## 2. 视觉-语言嵌入器测试

### 2.1 创建测试图像

In [None]:
def create_test_image(text: str, color, size=(448, 448)):
    """Create a solid-colour RGB test image with ``text`` drawn at its centre.

    Args:
        text: Label rendered in white on top of the background.
        color: Background fill handed to ``Image.new`` (e.g. an RGB tuple).
        size: ``(width, height)`` of the generated image in pixels.

    Returns:
        The generated PIL image.
    """
    from PIL import ImageDraw, ImageFont

    img = Image.new('RGB', size, color=color)
    draw = ImageDraw.Draw(img)

    # Prefer a scalable font and fall back to Pillow's built-in one when the
    # DejaVu file cannot be opened or FreeType support is unavailable. Only
    # those failures are caught, so unrelated errors (and KeyboardInterrupt)
    # are no longer silently swallowed.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
    except (OSError, ImportError):
        font = ImageFont.load_default()

    # textbbox gives the rendered box relative to the (0, 0) anchor; its
    # left/top edge is generally not zero, so that offset is subtracted to
    # centre the visible glyphs rather than the anchor point.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    text_width = right - left
    text_height = bottom - top
    position = (
        (size[0] - text_width) // 2 - left,
        (size[1] - text_height) // 2 - top,
    )

    draw.text(position, text, fill="white", font=font)

    return img

# Three mock camera views, each with its own background colour.
image1 = create_test_image("Base Camera", (200, 100, 100))  # red-ish
image2 = create_test_image("Wrist Camera", (100, 200, 100))  # green-ish
image3 = create_test_image("Third Person", (100, 100, 200))  # blue-ish

# Show the views side by side, one subplot per image.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = (
    (image1, "Image 1: Base Camera"),
    (image2, "Image 2: Wrist Camera"),
    (image3, "Image 3: Third Person"),
)
for ax, (frame, title) in zip(axes, panels):
    ax.imshow(frame)
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
plt.show()

print("✓ 测试图像创建成功")

### 2.2 图像预处理

In [None]:
def preprocess_image(image):
    """Convert an ``(H, W, 3)`` image into an ImageNet-normalised tensor.

    Args:
        image: Anything ``np.array`` can turn into an ``(H, W, 3)`` array
            with channel values on the 0-255 scale (e.g. a PIL RGB image).

    Returns:
        A ``torch.float32`` tensor of shape ``(3, H, W)``.
    """
    image_array = np.array(image).astype(np.float32)

    # ImageNet mean/std rescaled to the 0-255 range. They are created as
    # float32 on purpose: float64 constants would promote the normalised
    # array to float64, and torch.from_numpy would then return a double
    # tensor instead of the intended float32 one.
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) * 255
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255
    image_array = (image_array - mean) / std

    # Channel-last to channel-first: (H, W, C) -> (C, H, W).
    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1)

    return image_tensor

# Normalise every camera view with preprocess_image.
image1_tensor, image2_tensor, image3_tensor = (
    preprocess_image(frame) for frame in (image1, image2, image3)
)

# Stack the views and add a leading batch axis.
images_batch = torch.stack(
    [image1_tensor, image2_tensor, image3_tensor]
).unsqueeze(0)
print(f"图像 batch 形状: {images_batch.shape}")  # expected: (1, 3, 3, 448, 448)

# Image mask with all three views marked as valid.
image_mask = torch.ones(1, 3, dtype=torch.float32)
print(f"图像掩码: {image_mask}")

print("✓ 图像预处理完成")

### 2.3 测试视觉-语言嵌入

In [None]:
# Move the prepared tensors onto the configured device.
device = config['device']
images_batch, image_mask = images_batch.to(device), image_mask.to(device)

# Text instruction used for this test.
prompt = "pick up the red cube and place it on the table"

print(f"任务指令: {prompt}", "\n正在生成视觉-语言嵌入...", sep="\n")

with torch.no_grad():
    # The PIL images (not `images_batch`) are what get_vl_embeddings
    # receives in this call.
    image_list = [image1, image2, image3]

    fused_tokens = model.get_vl_embeddings(
        images=image_list,
        image_mask=image_mask,
        prompt=prompt,
        return_cls_only=False,
    )

    # Summary statistics of the returned embedding tensor.
    print(f"\n融合 token 形状: {fused_tokens.shape}")
    print(f"融合 token 范围: [{fused_tokens.min():.3f}, {fused_tokens.max():.3f}]")
    print(f"融合 token 均值: {fused_tokens.mean():.3f}")
    print(f"融合 token 标准差: {fused_tokens.std():.3f}")

print("\n✓ 视觉-语言嵌入生成成功！")

## 3. 动作头（Flow Matching）测试

### 3.1 准备状态输入

In [None]:
# Mock robot state (7 values: 6 joints + 1 gripper); example joint angles.
state = torch.tensor(
    [[0.0, -0.5, 0.3, 0.0, 0.5, 0.0, 0.5]],
    dtype=torch.float32,
).to(device)

print(f"机器人状态: {state}", f"状态形状: {state.shape}", sep="\n")

# Action mask: within each step the first six entries are enabled and the
# seventh (gripper) is disabled. The per-step pattern is repeated `horizon`
# times and shaped to (1, 1, horizon * 7).
step_pattern = [1, 1, 1, 1, 1, 1, 0]
action_mask = (
    torch.tensor(step_pattern * config['horizon'], dtype=torch.float32)
    .view(1, 1, -1)
    .to(device)
)

print(f"动作掩码形状: {action_mask.shape}")
print(f"动作掩码样本: {action_mask[0, 0, :7]}")

### 3.2 测试训练模式（前向传播）

In [None]:
# Random stand-in for ground-truth actions, shaped
# (1, horizon, per_action_dim).
chunk_shape = (1, config['horizon'], config['per_action_dim'])
actions_gt = torch.randn(*chunk_shape).to(device)

print(f"真实动作形状: {actions_gt.shape}")
print(f"真实动作样本:\n{actions_gt[0, :3, :]}")

# Forward pass through the action head without tracking gradients.
with torch.no_grad():
    pred_velocity, noise = model.action_head(
        fused_tokens=fused_tokens,
        state=state,
        actions_gt=actions_gt,
        action_mask=action_mask.view(1, -1),
    )

    print(f"\n预测速度形状: {pred_velocity.shape}")
    print(f"噪声形状: {noise.shape}")

    # MSE between the predicted velocity and (actions_gt - noise),
    # with the target flattened to one row per sample.
    target = (actions_gt - noise.view(*chunk_shape)).view(1, -1)
    loss = nn.functional.mse_loss(pred_velocity, target)

    print(f"\n训练损失: {loss.item():.6f}")

print("\n✓ 训练模式测试成功！")

### 3.3 测试推理模式（动作生成）

In [None]:
print("正在生成动作序列...")
print(f"推理步数: {config['num_inference_timesteps']}")

import time

# CUDA work is queued asynchronously, so the device is synchronised before
# starting and before stopping the clock; otherwise the measured interval
# may exclude GPU work that is still running.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()

with torch.no_grad():
    action_chunk = model.action_head.get_action(
        fused_tokens=fused_tokens,
        state=state,
        action_mask=action_mask.view(1, -1)
    )

if torch.cuda.is_available():
    torch.cuda.synchronize()
inference_time = time.perf_counter() - start_time

print(f"\n推理时间: {inference_time:.3f} 秒")
print(f"推理频率: {1/inference_time:.2f} Hz")
print(f"\n动作序列形状: {action_chunk.shape}")

# Reshape the flat chunk to (horizon, per_action_dim).
action_seq = action_chunk.view(config['horizon'], config['per_action_dim'])
print(f"重塑后形状: {action_seq.shape}")

# Show the first three steps of the generated sequence.
print(f"\n前3步动作:")
print(action_seq[:3])

print("\n✓ 推理模式测试成功！")

## 4. 完整推理流程测试

In [None]:
# End-to-end test through the model's run_inference method.
# `time` is imported here so the cell does not depend on an earlier cell
# having imported it.
import time

print("测试端到端推理...\n")

# Inputs: three PIL views, a 7-value state list and a text instruction.
image_list = [image1, image2, image3]
state_input = [0.0, -0.5, 0.3, 0.0, 0.5, 0.0, 0.5]
task_prompt = "grasp the blue object"

print(f"任务: {task_prompt}")
print(f"状态: {state_input}")

# CUDA work is queued asynchronously; synchronise around the timed region
# so the interval covers the GPU work of this call only.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()

with torch.no_grad():
    action_output = model.run_inference(
        images=image_list,
        image_mask=image_mask,
        prompt=task_prompt,
        state_input=state_input,
        action_mask=action_mask.view(1, -1)
    )

if torch.cuda.is_available():
    torch.cuda.synchronize()
total_time = time.perf_counter() - start_time

print(f"\n总推理时间: {total_time:.3f} 秒")
print(f"动作输出形状: {action_output.shape}")

# First step of the chunk (the action that would be executed).
first_action = action_output.view(config['horizon'], config['per_action_dim'])[0]
print(f"\n第一步动作（待执行）:")
print(first_action)

print("\n✓ 端到端推理测试成功！")

## 5. 可视化与分析

### 5.1 动作序列可视化

In [None]:
# Reshape the flat output to (horizon, per_action_dim) as a NumPy array.
action_trajectory = (
    action_output
    .view(config['horizon'], config['per_action_dim'])
    .cpu()
    .numpy()
)

# One stacked subplot per action dimension.
fig, axes = plt.subplots(
    config['per_action_dim'], 1,
    figsize=(12, 3*config['per_action_dim']),
)

dim_names = [
    'Joint 1', 'Joint 2', 'Joint 3', 'Joint 4', 'Joint 5', 'Joint 6',
    'Gripper',
]

for ax, name, series in zip(axes, dim_names, action_trajectory.T):
    ax.plot(series, linewidth=2)
    ax.set_ylabel(name, fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)

    # Highlight the first step of each dimension.
    ax.scatter([0], [series[0]], color='red', s=100, zorder=5, label='Step 1')

axes[-1].set_xlabel('Time Step', fontsize=12)
axes[0].set_title('Predicted Action Trajectory', fontsize=14, fontweight='bold')
axes[0].legend()

plt.tight_layout()
plt.show()

print("✓ 动作轨迹可视化完成")

### 5.2 动作分布统计

In [None]:
# Histogram of each action dimension on a 2 x 4 grid.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

for ax, name, column in zip(axes, dim_names, action_trajectory.T):
    column_mean = column.mean()
    ax.hist(column, bins=20, alpha=0.7, edgecolor='black')
    ax.set_title(name, fontsize=12)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.axvline(x=column_mean, color='r', linestyle='--',
               label=f'Mean: {column_mean:.3f}')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

# The grid has one more cell than there are dimensions; hide the last one.
axes[-1].axis('off')

plt.suptitle('Action Distribution per Dimension', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("✓ 动作分布统计完成")

### 5.3 动作平滑度分析

In [None]:
# First difference along the time axis (velocity) and the difference of
# that (acceleration).
velocity = np.diff(action_trajectory, axis=0)
acceleration = np.diff(velocity, axis=0)

# Three stacked panels; only the first six columns (the joints) are drawn.
fig, axes = plt.subplots(3, 1, figsize=(12, 10))
joint_labels = [f'J{i+1}' for i in range(6)]
panels = (
    (action_trajectory, 'Position', 'Joint Positions'),
    (velocity, 'Velocity', 'Joint Velocities (1st Derivative)'),
    (acceleration, 'Acceleration', 'Joint Accelerations (2nd Derivative)'),
)

for ax, (series, y_label, title) in zip(axes, panels):
    ax.plot(series[:, :6])
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(joint_labels, loc='upper right', ncol=6)
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label.
axes[-1].set_xlabel('Time Step', fontsize=12)

plt.tight_layout()
plt.show()

# Smoothness metric: L2 norm of the acceleration, per dimension.
smoothness = np.linalg.norm(acceleration, axis=0)
print("\n动作平滑度指标（越小越平滑）:")
for name, score in zip(dim_names, smoothness):
    print(f"  {name}: {score:.4f}")

print("\n✓ 动作平滑度分析完成")

## 6. 模拟数据训练测试

### 6.1 创建模拟数据集

In [None]:
def create_synthetic_dataset(num_samples=100):
    """Build a list of synthetic samples for the training smoke test.

    Each sample is a dict with the keys ``'images'`` (three test images),
    ``'state'`` (random tensor of length ``config['state_dim']``),
    ``'action'`` (sine trajectories of shape
    ``(config['horizon'], config['per_action_dim'])``) and ``'prompt'``
    (one of five fixed task strings).

    Args:
        num_samples: Number of samples to generate.

    Returns:
        A list of ``num_samples`` sample dicts.
    """
    # Candidate task descriptions; one is drawn per sample.
    prompts = [
        "pick up the object",
        "move to the target",
        "place on the table",
        "grasp the cube",
        "push the button"
    ]
    # Time axis shared by every sine trajectory.
    t = torch.linspace(0, 2*np.pi, config['horizon'])

    def random_colour():
        # Three channel values, each drawn from [100, 200).
        return tuple(np.random.randint(100, 200) for _ in range(3))

    dataset = []
    for i in range(num_samples):
        # Three images carrying the sample index, each with a random colour.
        images = [
            create_test_image(f"Sample {i}", random_colour())
            for _ in range(3)
        ]

        state = torch.randn(config['state_dim'])

        # Smooth action: one sine wave per dimension with a random phase,
        # scaled to an amplitude of 0.5.
        action = torch.zeros(config['horizon'], config['per_action_dim'])
        for j in range(config['per_action_dim']):
            action[:, j] = torch.sin(t + np.random.rand() * 2 * np.pi) * 0.5

        prompt = np.random.choice(prompts)

        dataset.append(
            dict(images=images, state=state, action=action, prompt=prompt)
        )

    return dataset

# Build a small synthetic training set.
print("创建合成数据集...")
train_dataset = create_synthetic_dataset(num_samples=50)
print(f"✓ 创建了 {len(train_dataset)} 个训练样本")

# Print a short summary of the first sample.
sample = train_dataset[0]
print(
    "\n样本示例:",
    f"  任务: {sample['prompt']}",
    f"  状态形状: {sample['state'].shape}",
    f"  动作形状: {sample['action'].shape}",
    f"  图像数量: {len(sample['images'])}",
    sep="\n",
)

### 6.2 小规模训练测试

In [None]:
# NOTE(review): the config used to build the model sets
# finetune_action_head=False. If EVO1 freezes the action head on that flag
# (not visible from this notebook — confirm against scripts/Evo1.py), no
# parameter would require gradients and loss.backward() below would fail.
# The action head is therefore made trainable explicitly for this test.
for head_param in model.action_head.parameters():
    head_param.requires_grad_(True)

# Give the optimizer and the gradient clipper only the parameters that can
# actually receive gradients.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_parameters, lr=1e-5, weight_decay=1e-3)

# Switch to training mode.
model.train()

# Run a handful of optimisation steps and record the loss of each.
num_steps = 10
losses = []

# Every synthetic sample has three valid views and a fully enabled action
# chunk, so both masks are built once instead of once per step.
image_mask = torch.tensor([[1, 1, 1]], dtype=torch.float32).to(device)
action_mask = torch.ones(1, config['horizon'] * config['per_action_dim']).to(device)

print(f"开始训练 {num_steps} 步...\n")

for step in range(num_steps):
    # Draw one random sample; the batch size is 1.
    sample = train_dataset[np.random.randint(len(train_dataset))]

    images = sample['images']
    state = sample['state'].unsqueeze(0).to(device)
    actions_gt = sample['action'].unsqueeze(0).to(device)
    prompt = sample['prompt']

    # Forward pass: vision-language embedding, then the action head.
    fused_tokens = model.get_vl_embeddings(
        images=images,
        image_mask=image_mask,
        prompt=prompt
    )

    pred_velocity, noise = model.action_head(
        fused_tokens=fused_tokens,
        state=state,
        actions_gt=actions_gt,
        action_mask=action_mask
    )

    # Loss: MSE between the predicted velocity and (actions_gt - noise),
    # with the target flattened to one row per sample.
    target = (actions_gt - noise.view(1, config['horizon'], config['per_action_dim'])).view(1, -1)
    loss = nn.functional.mse_loss(pred_velocity, target)

    # Backward pass with gradient-norm clipping at 1.0.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(trainable_parameters, 1.0)
    optimizer.step()

    losses.append(loss.item())

    print(f"Step {step+1}/{num_steps}, Loss: {loss.item():.6f}")

print("\n✓ 训练测试完成！")

# Plot the recorded loss curve.
plt.figure(figsize=(10, 5))
plt.plot(losses, marker='o', linewidth=2)
plt.xlabel('Training Step', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Training Loss Curve', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n初始损失: {losses[0]:.6f}")
print(f"最终损失: {losses[-1]:.6f}")
print(f"损失降低: {(losses[0] - losses[-1]) / losses[0] * 100:.2f}%")

### 6.3 训练后推理测试

In [None]:
# Back to evaluation mode for inference.
model.eval()

# Reuse the first training sample as the test input.
test_sample = train_dataset[0]

print(f"测试任务: {test_sample['prompt']}")

with torch.no_grad():
    action_pred = model.run_inference(
        images=test_sample['images'],
        image_mask=torch.tensor([[1, 1, 1]], dtype=torch.float32).to(device),
        prompt=test_sample['prompt'],
        state_input=test_sample['state'].tolist(),
        action_mask=torch.ones(1, config['horizon'] * config['per_action_dim']).to(device),
    )

# Bring prediction and ground truth to NumPy arrays of shape
# (horizon, per_action_dim) for comparison.
action_pred_seq = (
    action_pred
    .view(config['horizon'], config['per_action_dim'])
    .cpu()
    .numpy()
)
action_gt_seq = test_sample['action'].numpy()

# Overlay prediction and ground truth for three selected dimensions.
fig, axes = plt.subplots(3, 1, figsize=(12, 10))
dims_to_plot = [0, 2, 4]

for ax, dim in zip(axes, dims_to_plot):
    ax.plot(action_gt_seq[:, dim], label='Ground Truth', linewidth=2, linestyle='--')
    ax.plot(action_pred_seq[:, dim], label='Predicted', linewidth=2)
    ax.set_ylabel(dim_names[dim], fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel('Time Step', fontsize=12)
axes[0].set_title('Predicted vs Ground Truth Actions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Prediction error over all steps and dimensions.
residual = action_pred_seq - action_gt_seq
mse = np.mean(residual ** 2)
mae = np.mean(np.abs(residual))

print(f"\n预测误差:")
print(f"  MSE: {mse:.6f}")
print(f"  MAE: {mae:.6f}")

print("\n✓ 训练后推理测试完成！")

## 7. 性能分析

### 7.1 推理速度基准测试

In [None]:
# `time` is imported here so the cell does not depend on an earlier cell
# having imported it.
import time

model.eval()

# Build the inputs once so the timed region covers run_inference only, not
# tensor construction and host-to-device copies.
# NOTE(review): assumes run_inference does not modify its inputs in place —
# confirm against the model implementation.
inference_kwargs = dict(
    images=test_sample['images'],
    image_mask=torch.tensor([[1, 1, 1]], dtype=torch.float32).to(device),
    prompt=test_sample['prompt'],
    state_input=test_sample['state'].tolist(),
    action_mask=torch.ones(1, config['horizon'] * config['per_action_dim']).to(device),
)
use_cuda = torch.cuda.is_available()

# Warm-up runs, excluded from the statistics.
with torch.no_grad():
    for _ in range(5):
        _ = model.run_inference(**inference_kwargs)

# Benchmark
num_iterations = 50
times = []

print(f"运行 {num_iterations} 次推理...")

for i in range(num_iterations):
    # CUDA work is asynchronous: synchronise before starting and before
    # stopping the clock so each sample covers exactly one inference.
    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()

    with torch.no_grad():
        _ = model.run_inference(**inference_kwargs)

    if use_cuda:
        torch.cuda.synchronize()
    times.append(time.perf_counter() - start)

times = np.array(times)

print(f"\n推理性能统计:")
print(f"  平均时间: {times.mean()*1000:.2f} ms")
print(f"  标准差: {times.std()*1000:.2f} ms")
print(f"  最小时间: {times.min()*1000:.2f} ms")
print(f"  最大时间: {times.max()*1000:.2f} ms")
print(f"  平均频率: {1/times.mean():.2f} Hz")

# Plot the timings: per-iteration series on the left, histogram on the right.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(times * 1000, marker='o', markersize=3)
plt.axhline(y=times.mean()*1000, color='r', linestyle='--', label=f'Mean: {times.mean()*1000:.2f} ms')
plt.xlabel('Iteration')
plt.ylabel('Inference Time (ms)')
plt.title('Inference Time per Iteration')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.hist(times * 1000, bins=20, edgecolor='black', alpha=0.7)
plt.axvline(x=times.mean()*1000, color='r', linestyle='--', label=f'Mean: {times.mean()*1000:.2f} ms')
plt.xlabel('Inference Time (ms)')
plt.ylabel('Frequency')
plt.title('Inference Time Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ 性能基准测试完成！")

### 7.2 显存占用分析

In [None]:
if not torch.cuda.is_available():
    print("CPU 模式，无显存统计")
else:
    # Release cached blocks and restart the peak counter so the peak value
    # is measured from this point on.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # One inference pass to exercise the model.
    with torch.no_grad():
        _ = model.run_inference(
            images=test_sample['images'],
            image_mask=torch.tensor([[1, 1, 1]], dtype=torch.float32).to(device),
            prompt=test_sample['prompt'],
            state_input=test_sample['state'].tolist(),
            action_mask=torch.ones(1, config['horizon'] * config['per_action_dim']).to(device),
        )

    # Memory counters converted from bytes to GiB.
    bytes_per_gib = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    reserved = torch.cuda.memory_reserved() / bytes_per_gib
    max_allocated = torch.cuda.max_memory_allocated() / bytes_per_gib

    print(
        "GPU 显存占用:",
        f"  当前分配: {allocated:.3f} GB",
        f"  保留显存: {reserved:.3f} GB",
        f"  峰值分配: {max_allocated:.3f} GB",
        sep="\n",
    )

    # Bar chart of the three counters, each bar annotated with its value.
    labels = ['Allocated', 'Reserved', 'Peak']
    values = [allocated, reserved, max_allocated]

    plt.figure(figsize=(8, 6))
    plt.bar(labels, values, color=['blue', 'orange', 'red'], alpha=0.7, edgecolor='black')
    plt.ylabel('Memory (GB)', fontsize=12)
    plt.title('GPU Memory Usage', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3, axis='y')

    for bar_index, amount in enumerate(values):
        plt.text(bar_index, amount + 0.1, f"{amount:.3f} GB", ha='center', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.show()

print("\n✓ 显存分析完成！")

## 8. 总结

In [None]:
# Final report: each section is printed as one newline-separated group.
banner = "="*80
print(banner, " " * 25 + "Evo-1 测试总结", banner, sep="\n")

print(
    "\n【模型信息】",
    "  模型名称: Evo-1",
    f"  总参数量: {total_params / 1e6:.2f}M",
    "  视觉-语言模型: InternVL3-1B",
    "  动作头: Flow Matching",
    sep="\n",
)

print(
    "\n【配置参数】",
    f"  Horizon: {config['horizon']}",
    f"  动作维度: {config['per_action_dim']}",
    f"  状态维度: {config['state_dim']}",
    f"  推理步数: {config['num_inference_timesteps']}",
    sep="\n",
)

# Performance figures are shown only if the benchmark / memory cells ran
# and left their results in the namespace.
print("\n【性能指标】")
if 'times' in locals():
    print(
        f"  平均推理时间: {times.mean()*1000:.2f} ms",
        f"  推理频率: {1/times.mean():.2f} Hz",
        sep="\n",
    )
if torch.cuda.is_available() and 'allocated' in locals():
    print(f"  显存占用: {allocated:.3f} GB")

print(
    "\n【测试结果】",
    "  ✓ 视觉-语言嵌入测试: 通过",
    "  ✓ 动作头训练模式测试: 通过",
    "  ✓ 动作头推理模式测试: 通过",
    "  ✓ 端到端推理测试: 通过",
    "  ✓ 训练流程测试: 通过",
    "  ✓ 性能基准测试: 通过",
    sep="\n",
)

print(
    "\n【功能验证】",
    "  ✓ 多视角图像输入",
    "  ✓ 自然语言指令理解",
    "  ✓ 机器人状态编码",
    "  ✓ 动作序列生成",
    "  ✓ 动作掩码支持",
    "  ✓ 平滑轨迹生成",
    sep="\n",
)

print("\n" + banner, " " * 20 + "所有测试已成功完成！", banner, sep="\n")

## 9. 下一步

完成本教程后，你可以：

1. **使用真实数据训练**：按照中文使用手册准备数据集并训练
2. **在仿真环境中评估**：运行 Meta-World 或 LIBERO 评估
3. **部署到真实机器人**：参考 xArm6 客户端示例实现自己的机器人控制
4. **微调模型**：在特定任务上进行 fine-tuning
5. **优化性能**：尝试降低推理时间或减小模型大小

## 10. 参考资源

- **原理说明**: `docs/原理说明.md`
- **使用手册**: `docs/中文使用手册.md`
- **论文**: https://arxiv.org/abs/2511.04555
- **GitHub**: https://github.com/MINT-SJTU/Evo-1
- **模型**: https://huggingface.co/MINT-SJTU/Evo-1

---

*祝你使用愉快！如有问题，欢迎提 Issue。*