# Training Time Estimation

In [2]:
%load_ext autoreload

%autoreload 2
import pandas as pd

## Models training performance on H100

In [3]:
import pandas as pd

# Creando la tabla en un DataFrame
data = {
    "Model": ["GPT3-5B", "GPT3-20B", "GPT3-175B", "GPT3-175B", "LLAMA2-7B", "LLAMA2-13B", "LLAMA2-70B", "Nemotron-8B", "Nemotron-22B", "Nemotron-340B", "LLAMA3-8B", "LLAMA3-70B"],
    "#-GPUs": [64, 64, 128, 512, 8, 16, 64, 64, 64, 128, 8, 64],
    "GBS": [2048, 256, 256, 2048, 128, 128, 128, 256, 256, 32, 128, 128],
    "MBS": [4, 2, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1],
    "Sequence Length": [2048, 2048, 2048, 2048, 4096, 4096, 4096, 4096, 4096, 4096, 8192, 8192],
    "TP": [1, 2, 4, 4, 1, 1, 4, 2, 2, 8, 1, 4],
    "PP": [1, 1, 8, 8, 1, 4, 4, 1, 4, 8, 1, 4],
    "CP": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2],
    "VP": [1, 1, 6, 6, 1, 10, 20, 1, 10, 12, 1, 5],
    "Tokens / sec / GPU": [23406, 5851, 716, 825, 16934, 8715, 1728, 12507, 4312, 326, 12273, 1524],
    "Model TFLOP / sec / GPU": [765, 750, 771, 888, 780, 760, 768, 643, 562, 686, 711, 734],
    "Est. time to train in days (10T tokens, 1K GPUs)": [5, 19, 158, 137, 7, 13, 65, 9, 26, 347, 9, 74]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Model,#-GPUs,GBS,MBS,Sequence Length,TP,PP,CP,VP,Tokens / sec / GPU,Model TFLOP / sec / GPU,"Est. time to train in days (10T tokens, 1K GPUs)"
0,GPT3-5B,64,2048,4,2048,1,1,1,1,23406,765,5
1,GPT3-20B,64,256,2,2048,2,1,1,1,5851,750,19
2,GPT3-175B,128,256,1,2048,4,8,1,6,716,771,158
3,GPT3-175B,512,2048,2,2048,4,8,1,6,825,888,137
4,LLAMA2-7B,8,128,1,4096,1,1,1,1,16934,780,7
5,LLAMA2-13B,16,128,1,4096,1,4,1,10,8715,760,13
6,LLAMA2-70B,64,128,1,4096,4,4,1,20,1728,768,65
7,Nemotron-8B,64,256,4,4096,2,1,1,1,12507,643,9
8,Nemotron-22B,64,256,2,4096,2,4,1,10,4312,562,26
9,Nemotron-340B,128,32,1,4096,8,8,1,12,326,686,347


In [13]:
ZERO_DICT = {
    'T': 12,  # Trillion
    'B': 9,   # Billion
    'M': 6,   # Million
    'K': 3,   # Thousand
    '': 0     # Base case (less than 1,000)
}

def format_numbres(num):
    for suffix, zeros in ZERO_DICT.items():
        if abs(num) >= 10 ** zeros:
            value = num / (10 ** zeros)
            return f"{value:.0f}e{zeros} ({suffix})"
    return f"{num:.0f}e0"

def training_time(
    tokens_per_second: int = 23406,
    model_name: str = "GPT3-5B",
    n_gpus: int = 1000,
    n_tokens: int = 10_000_000_000_000,
    time_format: str = "D",
):
    total_time = n_tokens / (tokens_per_second * n_gpus)
    if time_format == "D":
        return round(total_time / (24 * 60 * 60))

def training_tokens(tokens_per_second: int = 23406,
    model_name: str = "GPT3-5B", n_gpus: int=96, n_days: int=30):
    """ Calculate the amount of tokens that a fixed numbers of H200 gpu's can process"""
    h200_tps = tokens_per_second * 1.32
    t_tokens = (n_days * 24 * 60 * 60)* (h200_tps * n_gpus)
    return t_tokens

In [14]:
def calculate_data_size(tokens, bytes_per_token=2):

    total_bytes = tokens * bytes_per_token
    
    size_in_gb = total_bytes / 1e9  # 1 GB = 1e9 bytes
    size_in_tb = size_in_gb / 1024  # 1 TB = 1024 GB

    return size_in_tb


In [16]:
# Add new columns for training time and tokens
df['Training Days (10T tokens, 1K GPUs)'] = df.apply(lambda row: training_time(row['Tokens / sec / GPU'], row['Model']), axis=1)
df['Training Tokens (30 days, 96 GPUs H200)'] = df.apply(lambda row: training_tokens(row['Tokens / sec / GPU'], row['Model']), axis=1)
df['Datasize processed in 30 days'] = df.apply(lambda row: calculate_data_size(row['Training Tokens (30 days, 96 GPUs H200)']), axis=1)

# Display the updated dataframe
df[['Model','Training Tokens (30 days, 96 GPUs H200)','Datasize processed in 30 days']].to_csv(index=False)

'Model,"Training Tokens (30 days, 96 GPUs H200)",Datasize processed in 30 days\nGPT3-5B,7687893565440.001,15.015417120000002\nGPT3-20B,1921809162240.0002,3.7535335200000004\nGPT3-175B,235176099840.0,0.45932832\nGPT3-175B,270978048000.0,0.529254\nLLAMA2-7B,5562111836160.0,10.86349968\nLLAMA2-13B,2862513561600.0,5.5908468\nLLAMA2-70B,567575838720.0,1.10854656\nNemotron-8B,4108027207680.0,8.02349064\nNemotron-22B,1416311930880.0,2.76623424\nNemotron-340B,107077386240.0,0.20913552\nLLAMA3-8B,4031167979520.0,7.87337496\nLLAMA3-70B,500570357760.0,0.97767648\n'