In [5]:
import math

In [8]:
# -------------------
#   Model architecture, using LLaMA 70B as the example
s = 4096  # model input sequence length
b = 8    # batch size
a = 64    # NOTE(review): presumably the number of attention heads; undocumented and unused in the visible code
h = 8192  # hidden dimension
l = 80    # number of transformer layers
v = 32000 # vocabulary size (original comment: "word-vector length")
model_args = 70e9  # number of model parameters
# -------------------
#   GPT3
gpt3_args = 175e9  # number of model parameters
gpt3_h = 12288     # hidden dimension
gpt3_l = 96        # number of transformer layers
gpt3_a = 96        # NOTE(review): presumably the number of attention heads; undocumented and unused in the visible code
gpt3_s = 2048      # sequence length
gpt3_tokens_num = 300e9  # total number of tokens in the training data
# -------------------
# lora
r = 4  # LoRA rank; the paper reports 4-8 as the best range
lora_tokens = 20e9   # number of tokens in the training set
tokens_num = 500000 * 4096 * 10 # number of tokens in the dataset
# -------------------
# GPU: A100
GPU_ava_factor = 0.45  # GPU utilization factor; usually between 0.3 and 0.55
GPU_flops = 312e12  # A100 fp16 peak FLOP/s
GPU_memory40 = 40  # A100 40G variant
GPU_memory80 = 80  # A100 80G variant
# -------------------
# method selective activation recomputation + Sequence Parallelism
t = 1  # the degree of tensor parallelism being used (1 if not)
overhead = 0.05  # extra compute introduced overall by selective activation recomputation
Backward_compute_rate = 1.5  # backward-pass compute / forward-pass compute; takes values between 1 and 3

In [40]:
"""
全量计算，selective activation recomputation + Sequence Parallelism + tensor parallel
"""

def computer_time(commite, tokens_num, model_args, overhead,
                  Backward_compute_rate, GPU_num, GPU_flops, GPU_ava_factor):
    """Estimate the compute-bound training time, in days.

    The total work is modelled as
    ``2 * (1 + Backward_compute_rate) * (1 + overhead) * tokens_num * model_args``
    FLOPs, and the cluster is assumed to sustain
    ``GPU_num * GPU_flops * GPU_ava_factor`` FLOP/s.

    Args:
        commite: Label printed in front of the report line.
        tokens_num: Number of training tokens.
        model_args: Number of model parameters.
        overhead: Fractional extra compute added on top of the base cost.
        Backward_compute_rate: Backward compute divided by forward compute.
        GPU_num: Number of GPUs.
        GPU_flops: Peak FLOP/s of one GPU.
        GPU_ava_factor: Fraction of the peak that is actually achieved.

    Returns:
        The (fractional) number of days. The printed report rounds it up.
    """
    seconds_per_day = 24*60*60
    total_flops = 2*(1 + Backward_compute_rate)*(1 + overhead)*tokens_num*model_args
    cluster_flops = GPU_num*GPU_flops*GPU_ava_factor
    days = total_flops / cluster_flops / seconds_per_day

    billions_of_tokens = int(tokens_num/1e9)
    whole_days = int(math.ceil(days))
    print(f"{commite}:{GPU_num}张卡训练{billions_of_tokens}Btokens需要时间为{whole_days}天")
    return days


def computer_GPU_num_computation(commite, time, tokens_num, model_args, overhead,
                                 Backward_compute_rate, GPU_flops, GPU_ava_factor):
    """Estimate how many GPUs are needed to finish training within ``time`` days.

    Uses the same FLOP model as the time estimate:
    ``2 * (1 + Backward_compute_rate) * (1 + overhead) * tokens_num * model_args``
    divided by what one GPU delivers over the whole training window.

    Args:
        commite: Label printed in front of the report line.
        time: Training budget in days.
        tokens_num: Number of training tokens.
        model_args: Number of model parameters.
        overhead: Fractional extra compute added on top of the base cost.
        Backward_compute_rate: Backward compute divided by forward compute.
        GPU_flops: Peak FLOP/s of one GPU.
        GPU_ava_factor: Fraction of the peak that is actually achieved.

    Returns:
        The GPU count, rounded up to a whole card.
    """
    total_flops = 2*(1 + Backward_compute_rate)*(1 + overhead)*tokens_num*model_args
    sustained_flops_per_gpu = GPU_flops*GPU_ava_factor
    budget_seconds = time*24*60*60
    GPU_num = math.ceil(total_flops / (sustained_flops_per_gpu * budget_seconds))

    billions_of_tokens = int(tokens_num/1e9)
    print(f"{commite}:{time}天训练{billions_of_tokens}B tokens算力上需要显卡：{GPU_num}张")
    return GPU_num


def computer_GPU_num_memory(commite, model_args, GPU_memory, s, b,  h, l, t=1):
    """Estimate how many GPUs are needed to hold the training state in memory.

    The footprint is modelled as ``model_args * 20`` bytes for the
    parameter-related state plus ``s * b * h * l * (10/t + 24/t)`` bytes of
    activations, and is divided by the capacity of one card
    (``GPU_memory`` GiB).

    Args:
        commite: Label printed in front of the report line.
        model_args: Number of model parameters.
        GPU_memory: Memory of one GPU, in GiB.
        s: Input sequence length.
        b: Batch size.
        h: Hidden dimension.
        l: Number of layers.
        t: Tensor-parallel degree (1 when unused).

    Returns:
        The GPU count, rounded up to a whole card.
    """
    # NOTE(review): 20 bytes per parameter is presumably weights + gradients
    # + optimizer state under mixed precision; confirm the breakdown.
    state_bytes = model_args*20
    activation_bytes = s*b*h*l*(10/t + 24/t)
    bytes_per_gpu = GPU_memory * 1024 * 1024 * 1024

    GPU_num = int(math.ceil((state_bytes + activation_bytes) / bytes_per_gpu))
    print(f"{commite}:模型参数{model_args/1e9}B显存上需要显存为{GPU_memory}G的显卡：{GPU_num}张")
    return GPU_num


def computer_memory(commite, model_args, s, b,  h, l, t=1):
    """Estimate the total training memory footprint.

    The footprint is modelled as ``model_args * 20`` bytes for the
    parameter-related state plus ``s * b * h * l * (10/t + 24/t)`` bytes of
    activations. The sum is divided by 1e9, i.e. reported in decimal
    gigabytes (not GiB).

    Args:
        commite: Label printed in front of the report line.
        model_args: Number of model parameters.
        s: Input sequence length.
        b: Batch size.
        h: Hidden dimension.
        l: Number of layers.
        t: Tensor-parallel degree (1 when unused).

    Returns:
        The (fractional) footprint in units of 1e9 bytes. The printed value
        is rounded up.
    """
    state_bytes = model_args*20
    activation_bytes = s*b*h*l*(10/t + 24/t)
    memory = (state_bytes + activation_bytes)/1e9

    rounded_up = int(math.ceil(memory))
    print(f"{commite}:{rounded_up}")
    return memory

In [41]:
# Compute the minimum number of GPUs (full fine-tuning estimates).

# Training time for the LLaMA model on 85 GPUs.
computer_time("llama",tokens_num, model_args, overhead,
              Backward_compute_rate, 85, GPU_flops, GPU_ava_factor)

# GPUs needed, compute-wise, to train GPT-3 in 34 days.
GPU_num_computer = computer_GPU_num_computation(
    "gpt3",34, gpt3_tokens_num, gpt3_args, overhead, Backward_compute_rate, GPU_flops, GPU_ava_factor)

# GPUs needed, compute-wise, to train LLaMA in 60 days (overwrites the GPT-3 result above).
GPU_num_computer = computer_GPU_num_computation(
    "llama", 60, tokens_num, model_args, overhead, Backward_compute_rate, GPU_flops, GPU_ava_factor)

# GPUs needed, memory-wise, for LLaMA on 80G cards.
GPU_num_memory = computer_GPU_num_memory("llama",model_args, GPU_memory80, s, b, h, l)

# Total memory footprint for GPT-3 (uses the shared batch size b).
model_memery = computer_memory("gpt3",gpt3_args, gpt3_s, b,  gpt3_h, gpt3_l, t=1)

llama:85张卡训练20Btokens需要时间为8天
gpt3:34天训练300B tokens算力上需要显卡：669张
llama:60天训练20B tokens算力上需要显卡：11张
llama:模型参数70.0B显存上需要显存为80G的显卡：25张
gpt3:4158


In [36]:
"""
lora + selective activation recomputation + Sequence Parallelism + tensor parallel
"""
def compute_backbone_args(commite, backbone_args,h,l,V):
    """Compute the backbone parameter count from the architecture.

    The count is ``l * (12*h*h + 13*h) + V*h``.

    Args:
        commite: Label printed in front of the report line.
        backbone_args: Ignored; the value passed in is overwritten by the
            computed count. Kept only so the signature stays the same.
        h: Hidden dimension.
        l: Number of layers.
        V: Vocabulary size.

    Returns:
        The computed backbone parameter count.
    """
    params_per_layer = 12*h*h + 13*h
    embedding_params = V*h
    backbone_args = l*params_per_layer + embedding_params
    print(f"{commite}:backbone_args权重为:{backbone_args}")
    return backbone_args

def compute_lora_args(commite,h,l,r,turning_layer_num=None):
    """Compute the number of parameters in the LoRA branch.

    Each tuned layer contributes ``(h*r*2) * 2`` parameters.

    Args:
        commite: Label printed in front of the report line.
        h: Hidden dimension.
        l: Number of layers in the model.
        r: LoRA rank.
        turning_layer_num: Number of layers that carry LoRA adapters. Any
            falsy value (``None`` or ``0``) means "all ``l`` layers".

    Returns:
        The LoRA parameter count.
    """
    tuned_layers = turning_layer_num or l
    # NOTE(review): h*r*2 looks like one down/up low-rank pair, and the
    # trailing *2 like two adapted matrices per layer; confirm.
    params_per_pair = h*r*2
    lora_args = tuned_layers*params_per_pair*2
    print(f"{commite}:h={h},l={l},r={r},lora_args权重为:{lora_args}")
    return lora_args


def compute_computation_lora(commite, b, s, h, l, V, r, Backward_compute_rate, overhead, turning_layer_num=None):
    """Compute the total FLOPs of one LoRA training step (backbone + LoRA branch).

    LoRA branch: its cost is tiny compared with the backbone, so for
    simplicity it is taken as four times the LoRA forward cost,
    ``4 * (8*r + 2) * turning_layer_num * b * s * h``.

    Backbone: forward through all ``l`` layers plus the logits projection,
    plus a backward term over the tuned layers scaled by
    ``Backward_compute_rate``, plus one more logits term. The whole sum is
    multiplied by ``1 + overhead``.

    Args:
        commite: Label printed in front of the report line.
        b: Batch size.
        s: Sequence length.
        h: Hidden dimension.
        l: Number of layers in the model.
        V: Vocabulary size.
        r: LoRA rank.
        Backward_compute_rate: Backward compute divided by forward compute.
        overhead: Fractional extra compute added on top of the backbone cost.
        turning_layer_num: Number of layers that carry LoRA adapters. Any
            falsy value (``None`` or ``0``) means "all ``l`` layers".

    Returns:
        The total FLOP count.
    """
    tuned_layers = turning_layer_num or l

    flops_per_layer = 24*b*s*h*h + 4*b*s*s*h
    logits_flops = 2*b*s*h*V

    forward_flops = l*flops_per_layer + logits_flops
    backward_layer_flops = Backward_compute_rate*tuned_layers*flops_per_layer
    # NOTE(review): the second logits term is added as-is, i.e. it is NOT
    # scaled by Backward_compute_rate; confirm that this is intended.
    base_computation = (forward_flops + backward_layer_flops + logits_flops) * (1 + overhead)

    lora_computation = 4*(8*r + 2)*tuned_layers*b*s*h

    computation = base_computation + lora_computation
    print(f"{commite}:计算量为:{computation}")
    return computation

def compute_memory_lora(commite, backbone_args,lora_args,b,s,h,l,r,t=1,turning_layer_num=None):
    """Compute the memory used by the backbone and the LoRA branch under LoRA training.

    LoRA branch: ``lora_args * 20`` bytes of parameter-related state plus
    ``turning_layer_num * (2*b*s*r + 2*b*s*h)`` bytes of activations.

    Backbone: ``backbone_args * 2`` bytes (weights only) plus
    ``turning_layer_num * s * b * h * (10/t + 24/t)`` bytes of activations.

    Args:
        commite: Label printed in front of the report line.
        backbone_args: Number of backbone parameters.
        lora_args: Number of LoRA parameters.
        b: Batch size.
        s: Sequence length.
        h: Hidden dimension.
        l: Number of layers in the model.
        r: LoRA rank.
        t: Tensor-parallel degree (1 when unused).
        turning_layer_num: Number of layers that carry LoRA adapters. Any
            falsy value (``None`` or ``0``) means "all ``l`` layers".

    Returns:
        A tuple ``(memory, backbone_memory, lora_memory)``: the total in GiB
        rounded up, followed by the two components in bytes.
    """
    tuned_layers = turning_layer_num or l

    # NOTE(review): (2+4)+(2+4)+(4+4) presumably counts weight, gradient and
    # optimizer-state bytes per trainable parameter; confirm.
    bytes_per_lora_param = (2+4) + (2+4) + (4+4)
    lora_activations = tuned_layers * (2*b*s*r + 2*b*s*h)
    lora_memory = lora_args * bytes_per_lora_param + lora_activations

    # The frozen backbone only contributes 2 bytes per parameter.
    bytes_per_backbone_param = 2
    backbone_activations = tuned_layers*s*b*h*(10/t + 24/t)
    backbone_memory = backbone_args * bytes_per_backbone_param + backbone_activations

    memory = int(math.ceil((lora_memory + backbone_memory)/1024**3))
    print(f"{commite}:显存为:{memory}G")
    return memory, backbone_memory, lora_memory


def computer_time_lora(commite, Backward_compute_rate, overhead, s, h, backbone_args, lora_args, tokens_num, GPU_num, GPU_flops, GPU_ava_factor):
    """Estimate the compute-bound training time, in days, when using LoRA.

    The total work is modelled as

    ``8 * tokens_num * lora_args``
    ``+ (1 + overhead) * (1 + Backward_compute_rate) * (2 + s/(3*h))
    * tokens_num * backbone_args``

    FLOPs, which is the same model used by ``computer_GPU_num_lora``. The
    cluster is assumed to sustain ``GPU_num * GPU_flops * GPU_ava_factor``
    FLOP/s.

    Args:
        commite: Label printed in front of the report line.
        Backward_compute_rate: Backward compute divided by forward compute.
        overhead: Fractional extra compute added on top of the backbone cost.
        s: Sequence length.
        h: Hidden dimension.
        backbone_args: Number of backbone parameters.
        lora_args: Number of LoRA parameters.
        tokens_num: Number of training tokens.
        GPU_num: Number of GPUs.
        GPU_flops: Peak FLOP/s of one GPU.
        GPU_ava_factor: Fraction of the peak that is actually achieved.

    Returns:
        The (fractional) number of days.
    """
    # The attention correction is s / (3*h). The parentheses are required:
    # ``s/3*h`` would evaluate as (s/3)*h and inflate the estimate by ~h**2.
    flops_per_param_token = (1 + overhead)*(1 + Backward_compute_rate)*(2 + s/(3*h))
    backbone_flops = flops_per_param_token*tokens_num*backbone_args
    lora_flops = 8*tokens_num*lora_args

    cluster_flops = GPU_num*GPU_flops*GPU_ava_factor
    time = (lora_flops + backbone_flops) / cluster_flops / (24*60*60)
    print(f"{commite}:{GPU_num}张卡训练{int(tokens_num/1e9)}Btokens需要时间为{time}天")
    return time


def computer_time_lora_method2(commite, s, h, backbone_args, tokens_num, Backward_compute_rate, overhead, GPU_num, GPU_flops, GPU_ava_factor):
    """Estimate the LoRA training time, in days (method 2).

    Method 2: following the GPT-3 LoRA fine-tuning setup, where LoRA is
    applied to all Q and V matrices and training is reported to be 25%
    faster than full fine-tuning. The full fine-tuning cost

    ``(1 + overhead) * (1 + Backward_compute_rate) * (2 + s/(3*h))
    * tokens_num * backbone_args``

    is therefore scaled by 0.75 and divided by the sustained cluster
    throughput ``GPU_num * GPU_flops * GPU_ava_factor``.

    Args:
        commite: Label printed in front of the report line.
        s: Sequence length.
        h: Hidden dimension.
        backbone_args: Number of backbone parameters.
        tokens_num: Number of training tokens.
        Backward_compute_rate: Backward compute divided by forward compute.
        overhead: Fractional extra compute added on top of the backbone cost.
        GPU_num: Number of GPUs.
        GPU_flops: Peak FLOP/s of one GPU.
        GPU_ava_factor: Fraction of the peak that is actually achieved.

    Returns:
        The (fractional) number of days.
    """
    lora_speedup_factor = 0.75  # 25% faster than full fine-tuning

    # The attention correction is s / (3*h). The parentheses are required:
    # ``s/3*h`` would evaluate as (s/3)*h and inflate the estimate by ~h**2.
    flops_per_param_token = (1 + overhead)*(1 + Backward_compute_rate)*(2 + s/(3*h))
    full_finetune_flops = flops_per_param_token*tokens_num*backbone_args

    cluster_flops = GPU_num*GPU_flops*GPU_ava_factor
    time = full_finetune_flops * lora_speedup_factor / cluster_flops / (24*60*60)
    print(f"{commite}:{GPU_num}张卡训练{int(tokens_num/1e9)}Btokens需要时间为{time}天")
    return time


def computer_GPU_num_lora(commite, Backward_compute_rate, overhead, s, h, time, tokens_num, backbone_args, lora_args, GPU_flops, GPU_ava_factor):
    """Estimate how many GPUs LoRA training needs, compute-wise, to finish in ``time`` days.

    The total work is modelled as

    ``8 * tokens_num * lora_args``
    ``+ (1 + overhead) * (1 + Backward_compute_rate) * (2 + s/(3*h))
    * tokens_num * backbone_args``

    FLOPs, divided by what one GPU delivers over the whole training window.

    Args:
        commite: Label printed in front of the report line.
        Backward_compute_rate: Backward compute divided by forward compute.
        overhead: Fractional extra compute added on top of the backbone cost.
        s: Sequence length.
        h: Hidden dimension.
        time: Training budget in days.
        tokens_num: Number of training tokens.
        backbone_args: Number of backbone parameters.
        lora_args: Number of LoRA parameters.
        GPU_flops: Peak FLOP/s of one GPU.
        GPU_ava_factor: Fraction of the peak that is actually achieved.

    Returns:
        The GPU count, rounded up to a whole card.
    """
    flops_per_param_token = (1 + overhead)*(1 + Backward_compute_rate)*(2 + s/(3*h))
    backbone_flops = flops_per_param_token*tokens_num*backbone_args
    lora_flops = 8*tokens_num*lora_args

    sustained_flops_per_gpu = GPU_flops*GPU_ava_factor
    budget_seconds = time*24*60*60
    GPU_num = math.ceil((lora_flops + backbone_flops) / (sustained_flops_per_gpu * budget_seconds))

    # Total parameter count in billions, rounded up; used for the report only.
    model_args = int(math.ceil((backbone_args + lora_args)/1e9))
    billions_of_tokens = int(tokens_num/1e9)
    print(f"{commite}:lora算力需求:{time}天,参数量{model_args}B训练{billions_of_tokens}B tokens需要显卡：{GPU_num}张")
    return GPU_num

def computer_min_GPU_num_lora(commite, backbone_memory,lora_memory, GPU_memory, l,turning_layer_num = None):
    """Estimate how many GPUs LoRA training needs, memory-wise.

    Args:
        commite: Label printed in front of the report line.
        backbone_memory: Backbone memory footprint in bytes.
        lora_memory: LoRA-branch memory footprint in bytes.
        GPU_memory: Memory of one GPU, in GiB.
        l: Not used by the calculation; kept so the signature stays the same.
        turning_layer_num: Not used by the calculation; kept so the
            signature stays the same.

    Returns:
        The GPU count, rounded up to a whole card.
    """
    total_bytes = lora_memory + backbone_memory
    bytes_per_gpu = GPU_memory * 1024 * 1024 * 1024

    GPU_num = int(math.ceil(total_bytes / bytes_per_gpu))
    memory = int(math.ceil(total_bytes/1024**3))  # total in GiB, for the report
    print(f"{commite}:lora显存需求:显存{memory}G,需要显存为{GPU_memory}的显卡：{GPU_num}张")
    return GPU_num

In [37]:
# Compute the LoRA parameter counts, then the memory and GPU requirements.
# turning_layer_num=None means every layer carries LoRA adapters.
gpt3_lora_args = compute_lora_args("gpt3",gpt3_h,gpt3_l,r,turning_layer_num=None)
lora_args = compute_lora_args("llama70B",h,l,r,turning_layer_num=None)
# NOTE(review): the GPT-3 call passes gpt3_s*2 as the sequence length, while the
# compute estimate below uses gpt3_s; confirm that the doubling is intended.
gpt3_memory, gpt3_backbone_memory, gpt3_lora_memory = compute_memory_lora("gpt3",gpt3_args,gpt3_lora_args,b,gpt3_s*2,gpt3_h,gpt3_l,r,t,turning_layer_num=None)
memory, backbone_memory, lora_memory = compute_memory_lora("llama70B",model_args,lora_args,b,s,h,l,r,t,turning_layer_num=None)
# GPUs needed, compute-wise, for a 60-day LoRA run over lora_tokens tokens.
computer_GPU_num_lora("gpt3",  Backward_compute_rate, overhead, gpt3_s, gpt3_h, 60,
                      lora_tokens, gpt3_args, gpt3_lora_args, GPU_flops, GPU_ava_factor)
computer_GPU_num_lora("llama70B", Backward_compute_rate, overhead, s, h, 60,
                      lora_tokens, model_args, lora_args, GPU_flops, GPU_ava_factor)
# GPUs needed, memory-wise, on 80G cards.
computer_min_GPU_num_lora("gpt3",gpt3_backbone_memory,gpt3_lora_memory, GPU_memory80, gpt3_l,turning_layer_num = None)
computer_min_GPU_num_lora("llama70B",backbone_memory,lora_memory, GPU_memory80, l,turning_layer_num = None)

gpt3:h=12288,l=96,r=4,lora_args权重为:18874368
llama70B:h=8192,l=80,r=4,lora_args权重为:10485760
gpt3:显存为:1623G
llama70B:显存为:851G
gpt3:lora算力需求:60天,参数量176B训练20B tokens需要显卡：26张
llama70B:lora算力需求:60天,参数量71B训练20B tokens需要显卡：11张
gpt3:lora显存需求:显存1623G,需要显存为80的显卡：21张
llama70B:lora显存需求:显存851G,需要显存为80的显卡：11张


11