# Training Time Estimation

In [None]:
%load_ext autoreload

%autoreload 2
import pandas as pd

## Model training performance on H100

In [None]:
import pandas as pd

# Build the benchmark table as a DataFrame. Each key below is a column and
# the lists are aligned by position, so index i across all lists describes one
# model / GPU-count configuration (GPT3-175B appears twice: 128 and 512 GPUs).
# NOTE(review): the section heading attributes these figures to H100; the
# origin of the numbers is not recorded here — confirm before relying on them.
data = {
    "Model": ["GPT3-5B", "GPT3-20B", "GPT3-175B", "GPT3-175B", "LLAMA2-7B", "LLAMA2-13B", "LLAMA2-70B", "Nemotron-8B", "Nemotron-22B", "Nemotron-340B", "LLAMA3-8B", "LLAMA3-70B"],
    "#-GPUs": [64, 64, 128, 512, 8, 16, 64, 64, 64, 128, 8, 64],
    # presumably global batch size / micro batch size — TODO confirm
    "GBS": [2048, 256, 256, 2048, 128, 128, 128, 256, 256, 32, 128, 128],
    "MBS": [4, 2, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1],
    "Sequence Length": [2048, 2048, 2048, 2048, 4096, 4096, 4096, 4096, 4096, 4096, 8192, 8192],
    # presumably tensor / pipeline / context / virtual-pipeline parallelism
    # settings — TODO confirm
    "TP": [1, 2, 4, 4, 1, 1, 4, 2, 2, 8, 1, 4],
    "PP": [1, 1, 8, 8, 1, 4, 4, 1, 4, 8, 1, 4],
    "CP": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2],
    "VP": [1, 1, 6, 6, 1, 10, 20, 1, 10, 12, 1, 5],
    # Per-GPU throughput; this is the column the estimates later in the file
    # are computed from.
    "Tokens / sec / GPU": [23406, 5851, 716, 825, 16934, 8715, 1728, 12507, 4312, 326, 12273, 1524],
    "Model TFLOP / sec / GPU": [765, 750, 771, 888, 780, 760, 768, 643, 562, 686, 711, 734],
    # Reference estimates entered by hand (not computed in this file).
    "Est. time to train in days (10T tokens, 1K GPUs)": [5, 19, 158, 137, 7, 13, 65, 9, 26, 347, 9, 74]
}

df = pd.DataFrame(data)
# Trailing bare expression: in a notebook cell this displays the table.
df

In [None]:
# Suffix -> power of ten, listed from largest to smallest. The order matters:
# format_numbres walks this mapping front to back and stops at the first
# power the number reaches.
ZERO_DICT = {
    "T": 12,  # trillions
    "B": 9,   # billions
    "M": 6,   # millions
    "K": 3,   # thousands
    "": 0,    # units (magnitude of at least 1 but below 1,000)
}


def format_numbres(num):
    """Render *num* as ``"<mantissa>e<exponent> (<suffix>)"``.

    The exponent is the largest power listed in ``ZERO_DICT`` that the
    magnitude of *num* reaches, and the mantissa is rounded to zero decimal
    places. Magnitudes below 1 match no entry and are rendered as
    ``"<num>e0"`` with no suffix part.

    >>> format_numbres(10_000_000_000_000)
    '10e12 (T)'
    >>> format_numbres(5)
    '5e0 ()'
    """
    magnitude = abs(num)
    # First (i.e. largest) scale whose threshold the magnitude reaches.
    hit = next(
        (
            (label, exponent)
            for label, exponent in ZERO_DICT.items()
            if magnitude >= 10 ** exponent
        ),
        None,
    )
    if hit is None:
        return f"{num:.0f}e0"
    label, exponent = hit
    mantissa = num / (10 ** exponent)
    return f"{mantissa:.0f}e{exponent} ({label})"

def training_time(
    tokens_per_second: int = 23406,
    model_name: str = "GPT3-5B",
    n_gpus: int = 1000,
    n_tokens: int = 10_000_000_000_000,
    time_format: str = "D",
):
    """Estimate the wall-clock time needed to process ``n_tokens`` tokens.

    The estimate is ``n_tokens / (tokens_per_second * n_gpus)`` seconds,
    converted to the requested unit and rounded to the nearest whole number.

    Args:
        tokens_per_second: Throughput of a single GPU, in tokens per second.
        model_name: Label of the model being estimated. It is accepted for
            call-site readability only and does not affect the result.
        n_gpus: Number of GPUs; throughput is scaled linearly by this count.
        n_tokens: Total number of tokens to process.
        time_format: Unit of the result: ``"D"`` (days), ``"H"`` (hours),
            ``"M"`` (minutes) or ``"S"`` (seconds). Case-sensitive.

    Returns:
        The estimated duration in the requested unit, rounded with ``round``.

    Raises:
        ValueError: If ``time_format`` is not one of the supported units
            (previously this case silently returned ``None``).
        ZeroDivisionError: If ``tokens_per_second * n_gpus`` is zero.
    """
    seconds_per_unit = {
        "D": 24 * 60 * 60,
        "H": 60 * 60,
        "M": 60,
        "S": 1,
    }
    try:
        unit_seconds = seconds_per_unit[time_format]
    except KeyError:
        supported = ", ".join(repr(unit) for unit in seconds_per_unit)
        raise ValueError(
            f"Unsupported time_format {time_format!r}; expected one of {supported}"
        ) from None

    total_time = n_tokens / (tokens_per_second * n_gpus)  # seconds
    return round(total_time / unit_seconds)

def training_tokens(
    tokens_per_second: int = 23406,
    model_name: str = "GPT3-5B",
    n_gpus: int = 96,
    n_days: int = 30,
    h200_speedup: float = 1.4,
):
    """Calculate how many tokens a fixed number of H200 GPUs can process.

    The measured per-GPU throughput is scaled by ``h200_speedup`` to
    approximate H200 throughput, then multiplied by the GPU count and by the
    number of seconds in ``n_days`` days.

    Args:
        tokens_per_second: Measured per-GPU throughput, in tokens per second,
            before the H200 scaling is applied.
        model_name: Label of the model being estimated. It is accepted for
            call-site readability only and does not affect the result.
        n_gpus: Number of GPUs; throughput is scaled linearly by this count.
        n_days: Length of the training window, in days.
        h200_speedup: Multiplier applied to ``tokens_per_second`` to estimate
            H200 throughput. Defaults to 1.4, the factor that used to be
            hard-coded; pass 1.0 to disable the scaling.

    Returns:
        The estimated number of tokens processed in the window (a float,
        because of the floating-point speed-up factor).
    """
    h200_tps = tokens_per_second * h200_speedup
    window_seconds = n_days * 24 * 60 * 60
    return window_seconds * (h200_tps * n_gpus)

In [None]:
# Derive two estimate columns from each row's per-GPU throughput. Both
# estimators receive only the throughput and the model name, so every other
# argument stays at its default.
def _estimate_days(row):
    """Row adapter: training_time for the row's throughput and model name."""
    return training_time(row['Tokens / sec / GPU'], row['Model'])


def _estimate_tokens(row):
    """Row adapter: training_tokens for the row's throughput and model name."""
    return training_tokens(row['Tokens / sec / GPU'], row['Model'])


df['Training Days (10T tokens, 1K GPUs)'] = df.apply(_estimate_days, axis=1)
df['Training Tokens (30 days, 96 GPUs)'] = df.apply(_estimate_tokens, axis=1)

# Show the augmented table
df