# SSDLite_FPN 训练笔记

## 安装依赖

In [11]:
!pip -q install torchvision
!pip -q install tqdm
!pip -q install timm

## 导入工程

In [12]:
# 导入系统库
import os
import timm
from tqdm import tqdm

# 导入sparrow
from sparrow.models.ssdlite_fpn import SSDLite_FPN
from sparrow.datasets.coco_dets import create_dets_dataloader
from sparrow.losses.ssdlite_loss import SSDLoss, AnchorGenerator
from sparrow.utils.torch_utils import EMA, evaluate, visualize_predictions
from torch.optim.lr_scheduler import CosineAnnealingLR

# 导入torch库
import torch

## 参数设置

### 系统参数

In [13]:
# --- Runtime settings ---
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# --- Input / loader settings ---
INPUT_SIZE, BATCH_SIZE, NUM_WORKERS = 320, 8, 4

# --- Detection head settings ---
NUM_CLASSES = 80  # number of classes defined by the COCO dataset
ANCHOR_SIZES = [32 * 2 ** level for level in range(5)]  # 32, 64, 128, 256, 512
ANCHOR_RATIOS = [0.5, 1.0, 2.0, 1 / 3.0, 3.0]

# --- Filesystem locations ---
WEIGHTS_DIR = "./outputs/ssdlite"            # directory where weights are saved
TEST_IMAGE_PATH = "./res/india_road.png"     # path of your test image

### 学习参数

In [14]:
# --- Schedule ---
EPOCHS = 100
START_EPOCH = 0                  # first epoch to run
WARMUP_EPOCHS = 2                # number of warm-up epochs

# --- Optimisation ---
LEARNING_RATE = 1e-4             # initial learning rate
WEIGHT_DECAY = 1e-3
GRADIENT_CLIP_VAL = 5.0          # gradient clipping threshold

# --- Bookkeeping ---
BEST_VAL_LOSS = float('inf')     # any finite validation loss compares lower than this
LOG_INTERVAL_SAMPLES = 1000
# Logging interval converted from samples to batches, never less than one batch.
log_interval_batches = max(1, LOG_INTERVAL_SAMPLES // BATCH_SIZE)

## 创建模型

In [15]:
# Backbone: pretrained MobileNetV3-Large from timm, created in features-only mode
# with out_indices=(2, 3, 4).
backbone_fpn = timm.create_model(
    'mobilenetv3_large_100',
    pretrained=True,
    features_only=True,
    out_indices=(2, 3, 4),
)

# Detector built on top of the backbone; one anchor per aspect ratio.
model_fpn = SSDLite_FPN(
    backbone_fpn,
    num_classes=NUM_CLASSES,
    fpn_out_channels=128,
    num_anchors=len(ANCHOR_RATIOS),
)
model_fpn.to(device)

# EMA wrapper around the model (its `ema_model` is what gets evaluated later).
ema = EMA(model_fpn)

Unexpected keys (classifier.bias, classifier.weight, conv_head.bias, conv_head.weight) found while loading pretrained weights. This may be expected if model is being adapted.


## 加载数据

In [16]:
# Dataset root. Set the COCO_ROOT environment variable to point at your own copy
# of the dataset; when it is unset the original hard-coded location is used.
COCO_ROOT = os.environ.get("COCO_ROOT", "/home/cxt/projects/MobileSparrow/data/coco2017")

# Training data loader (built by create_dets_dataloader)
train_aug_config = { "rotate_deg": 15.0, "min_box_size": 2.0 }
train_loader = create_dets_dataloader(
    dataset_root=COCO_ROOT,
    img_size=INPUT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    aug_cfg=train_aug_config,
    is_train=True
)

# Validation data loader
val_aug_config = { "min_box_size": 2.0 } # validation normally uses no heavy augmentation
val_loader = create_dets_dataloader(
    dataset_root=COCO_ROOT,
    img_size=INPUT_SIZE,
    batch_size=BATCH_SIZE * 2,  # validation can usually afford a larger batch size
    num_workers=NUM_WORKERS,
    pin_memory=True,
    aug_cfg=val_aug_config,
    is_train=False
)

## 损失优化调度

In [17]:
# Loss function
criterion = SSDLoss(num_classes=NUM_CLASSES)

# Optimizer: AdamW over all model parameters
optimizer = torch.optim.AdamW(
    model_fpn.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Learning-rate scheduler: cosine annealing with T_max=EPOCHS and a floor of 1e-6
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=EPOCHS,
    eta_min=1e-6,
)

## 断点续训（加载上次训练的检查点）

确保每次训练不必从头再来。

In [18]:
# Make sure the weights directory exists before reading from / writing to it.
os.makedirs(WEIGHTS_DIR, exist_ok=True)

# Resume-from-checkpoint logic: only runs when a last.pt file is present.
last_pt_path = os.path.join(WEIGHTS_DIR, "last.pt")
has_last_checkpoint = os.path.exists(last_pt_path)
if has_last_checkpoint:
    print("--- Resuming training from last.pt ---")

    # Read the checkpoint file, mapping its tensors onto the current device.
    checkpoint = torch.load(last_pt_path, map_location=device)

    # Restore, in order: model weights, EMA model weights, optimizer state,
    # scheduler state.
    restore_plan = (
        (model_fpn, 'model'),
        (ema.ema_model, 'ema_model'),
        (optimizer, 'optimizer'),
        (scheduler, 'scheduler'),
    )
    for restore_target, state_key in restore_plan:
        restore_target.load_state_dict(checkpoint[state_key])

    # Continue with the epoch after the one stored in the checkpoint.
    START_EPOCH = checkpoint['epoch'] + 1

    # Carry over the best validation loss recorded in the checkpoint.
    BEST_VAL_LOSS = checkpoint['best_val_loss']

    # Confirmation messages.
    print(f"Resumed from epoch {START_EPOCH-1}. Best validation loss so far: {BEST_VAL_LOSS:.4f}")
    print(f"Current learning rate is {optimizer.param_groups[0]['lr']:.6f}")



--- Resuming training from last.pt ---
Resumed from epoch 52. Best validation loss so far: 20.3773
Current learning rate is 0.000047


## 预处理 anchor boxes

In [19]:
# --- Pre-compute the anchors (core step) ---
print("Pre-computing anchors for fixed input size...")
anchor_generator = AnchorGenerator(sizes=ANCHOR_SIZES, aspect_ratios=ANCHOR_RATIOS)

# A dummy input batch of one image at the configured input size.
dummy_input = torch.randn(1, 3, INPUT_SIZE, INPUT_SIZE).to(device)

# Run the feature-extraction path once by hand, in eval mode and without
# gradient tracking, to obtain the feature maps the anchors are laid out on.
model_fpn.eval()
with torch.no_grad():
    feature_maps_for_size_calc = list(model_fpn.fpn(model_fpn.backbone(dummy_input)))
    # The first two extra layers are chained onto the last FPN output.
    for extra_layer_index in (0, 1):
        feature_maps_for_size_calc.append(
            model_fpn.extra_layers[extra_layer_index](feature_maps_for_size_calc[-1])
        )

# Generate the full anchor grid once from those feature maps;
# precomputed_anchors is reused for the whole training run.
precomputed_anchors = anchor_generator.generate_anchors_on_grid(feature_maps_for_size_calc, device)
print(f"Anchors pre-computed. Shape: {precomputed_anchors.shape}")

Pre-computing anchors for fixed input size...
Anchors pre-computed. Shape: torch.Size([10670, 4])


## 启动训练循环

In [None]:
# --- Training loop ---
# Per epoch: train over train_loader (linear LR warm-up, gradient clipping,
# EMA update), step the cosine scheduler, validate the EMA model, write
# last.pt / best.pt, and every 10 epochs save a visualization of predictions.
print("\n--- Starting Training ---")
model_fpn.train() # switch back to training mode
# NOTE(review): this interval is announced, but the loop below only reports
# per-batch losses on the progress bar plus one summary per epoch — confirm
# whether periodic logging every `log_interval_batches` batches is still wanted.
print(f"Logging average loss every {log_interval_batches} batches.")

# Total number of warm-up steps, and the global step to start from
# (non-zero when START_EPOCH > 0).
warmup_steps = WARMUP_EPOCHS * len(train_loader)
current_step = START_EPOCH * len(train_loader)

for epoch in range(START_EPOCH, EPOCHS):
    model_fpn.train() # training mode for this epoch

    # Running sums of the per-batch losses, reported as averages after the epoch.
    epoch_loss_cls = 0.0
    epoch_loss_reg = 0.0
    batches_seen = 0

    # Progress bar
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]")

    for imgs, targets, _ in pbar:
        # --- Learning-rate warm-up ---
        if current_step < warmup_steps:
            # Linear warm-up: scale reaches 1.0 on the last warm-up step.
            lr_scale = (current_step + 1) / warmup_steps
            for param_group in optimizer.param_groups:
                param_group['lr'] = LEARNING_RATE * lr_scale

        # --- Regular training step ---
        imgs = imgs.to(device)
        targets_on_device = [t.to(device) for t in targets]
        cls_preds, reg_preds = model_fpn(imgs)
        loss_cls, loss_reg = criterion(precomputed_anchors, cls_preds, reg_preds, targets_on_device)
        total_loss = loss_cls + loss_reg
        optimizer.zero_grad()
        total_loss.backward()

        # --- Gradient clipping ---
        torch.nn.utils.clip_grad_norm_(model_fpn.parameters(), max_norm=GRADIENT_CLIP_VAL)

        # Apply the parameter update
        optimizer.step()

        # --- Update the EMA and the global step counter ---
        ema.update(model_fpn)
        current_step += 1

        # Read each loss value once and reuse it for the sums and the display.
        cls_value = loss_cls.item()
        reg_value = loss_reg.item()
        epoch_loss_cls += cls_value
        epoch_loss_reg += reg_value
        batches_seen += 1
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(cls=f"{cls_value:.4f}", reg=f"{reg_value:.4f}", lr=f"{current_lr:.2e}")

    # After each epoch, step the learning-rate scheduler.
    if epoch >= WARMUP_EPOCHS - 1: # -1 because step() should be called after optimizer.step()
        scheduler.step()

    # After each epoch, validate the EMA model.
    avg_val_loss, _, _ = evaluate(ema.ema_model, val_loader, criterion, anchor_generator, precomputed_anchors, device)
    print(f"\n---> Epoch {epoch+1}/{EPOCHS} Validation Summary <---")
    if batches_seen:
        print(f"  Average Training Loss: cls={epoch_loss_cls / batches_seen:.4f}, reg={epoch_loss_reg / batches_seen:.4f}")
    print(f"  Average Validation Loss: {avg_val_loss:.4f}")

    # Decide whether this epoch is the new best BEFORE building the checkpoint,
    # so that last.pt also records the up-to-date best loss. Otherwise a resumed
    # run would start from a stale (too high) best loss and could overwrite
    # best.pt with a worse model.
    is_best = avg_val_loss < BEST_VAL_LOSS
    if is_best:
        BEST_VAL_LOSS = avg_val_loss

    # --- Save last.pt and best.pt ---
    checkpoint = {
        'epoch': epoch,
        'model': model_fpn.state_dict(),
        'ema_model': ema.ema_model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(), # scheduler state is saved too
        'best_val_loss': BEST_VAL_LOSS,
    }

    # Save last.pt
    torch.save(checkpoint, last_pt_path)
    print(f"  Saved last checkpoint to {last_pt_path}")

    # If this epoch is the best so far, also save best.pt
    if is_best:
        best_pt_path = os.path.join(WEIGHTS_DIR, "best.pt")
        torch.save(checkpoint, best_pt_path)
        print(f"  >>> New best model found! Saved to {best_pt_path}\n")

    # --- Every 10 epochs, visualize the predictions once ---
    if (epoch + 1) % 10 == 0:
        print(f"\n--- Visualizing predictions at epoch {epoch+1} ---")
        viz_dir = os.path.join(WEIGHTS_DIR, "viz")
        os.makedirs(viz_dir, exist_ok=True)
        save_path = os.path.join(viz_dir, f"epoch_{epoch+1:03d}.png")
        visualize_predictions(
            model=ema.ema_model,
            image_path=TEST_IMAGE_PATH,
            anchor_generator=anchor_generator,
            device=device,
            precomputed_anchors=precomputed_anchors,
            conf_thresh=0.3,
            nms_thresh=0.45,
            save_path=save_path,   # save to disk, do not show
            show=False
        )

print("--- Training Finished ---")


--- Starting Training ---
Logging average loss every 125 batches.


Epoch 54/100 [Training]:  73%|███████▎  | 10753/14785 [05:11<01:58, 33.93it/s, cls=0.3789, reg=0.0716]

使用这些技巧后为什么误差不再剧烈抖动：

1.  **Warm-up**：在最初的 `WARMUP_EPOCHS` 个周期里，学习率会从一个接近0的值慢慢爬升到 `1e-4`。这给了模型充足的时间来适应数据，预测头的权重会从随机状态平稳地过渡到一个更有意义的状态，避免了初期的梯度爆炸。
2.  **Gradient Clipping**：即使在预热后，偶尔也可能遇到特别困难的样本导致梯度激增。`clip_grad_norm_` 就像一个保险丝，能确保任何一次的更新都不会过大，从而保证了训练过程的平滑。
3.  **CosineAnnealingLR**：在模型度过初期、趋于稳定后，这个调度器会开始工作，它会像余弦曲线一样，缓慢地将学习率从 `1e-4` 降低到一个非常小的值（`1e-6`）。这使得模型在训练后期能够在损失的“山谷”底部进行精细搜索，从而找到一个更好的局部最优解。

**建议**：
请使用这个新的训练脚本来重新开始训练。开始之前请先删除旧的 `outputs` 目录（或至少删除其中的 `last.pt`），否则前面的断点续训逻辑会从旧的 `last.pt` 继续训练，而不是从头开始。你将会观察到，验证集损失的下降过程会变得**平滑得多**，不会再出现几百上千的剧烈跳动，模型的收敛会更稳定、效果也更有可能达到最佳。