In [4]:
import torch
from torch import nn
import pickle
import numpy as np
import matplotlib.pyplot as plt

from modeling_gemma import KVCache
from processing_paligemma import PaliGemmaProcessor, LabelProcessor
from utils import move_inputs_to_device
from prepare_data import ImageInstructionOutputDataset, update_tokenizer, update_embeddings, prepare_dataset
from utils import *
from datetime import datetime


def compute_training_memory(model, input_size, optimizer_type='adam', device='cuda'):
    """
    计算神经网络训练时的显存峰值使用量

    参数:
        model (nn.Module): 神经网络模型
        input_size (tuple): 输入张量尺寸 (batch_size, ...)
        optimizer_type (str): 优化器类型，支持 'adam' 或 'sgd'
        device (str): 计算设备 ('cuda' 或 'cpu')

    返回:
        int: 峰值显存占用量（字节）
    """
    # 确保使用GPU
    if device != 'cuda':
        raise ValueError("显存计算需要CUDA设备")

    # 将模型移至GPU
    model = model.to(device)

    # 生成虚拟输入数据和标签
    dummy_input = torch.randn(*input_size).to(device)
    dummy_target = torch.randint(0, 10, (input_size[0],)).to(device)

    # 定义损失函数
    criterion = nn.CrossEntropyLoss()

    # 重置显存统计
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    try:
        # 前向传播
        outputs = model(dummy_input)
        loss = criterion(outputs, dummy_target)

        # 反向传播
        loss.backward()

        # 创建优化器并执行一步更新
        if optimizer_type.lower() == 'adam':
            optimizer = torch.optim.Adam(model.parameters())
        elif optimizer_type.lower() == 'sgd':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        else:
            raise ValueError("支持的优化器类型: 'adam' 或 'sgd'")

        optimizer.step()

        # 获取峰值显存占用
        peak_memory = torch.cuda.max_memory_allocated(device=device)

    finally:
        # 清理内存
        del dummy_input, dummy_target, outputs, loss
        torch.cuda.empty_cache()

    return peak_memory



In [5]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'
print("Device in use: ", device)
model_path = 'paligemma'

print(f"Loading model")
tokenizer_modified = True
rank = 16  # LoRA rank
model, tokenizer = load_hf_model(model_path, freeze_vision=0)
vocab_size = len(tokenizer)

# 计算显存使用量
input_shape = (224, 224, 3)  # batch_size=32
peak_mem = compute_training_memory(model, input_shape)

# 打印结果
print(f"训练所需峰值显存: {peak_mem / 1024**2:.2f} MB")
print(f"详细组成:")
print(f"- 模型参数: {sum(p.numel() for p in model.parameters())} 个参数")
print(f"- 输入尺寸: {input_shape}")

Device in use:  cuda
Loading model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



UnboundLocalError: local variable 'outputs' referenced before assignment