In [None]:
%pip install torch>=2.0.0 transformers>=4.30.0 accelerate>=0.20.3 bitsandbytes

In [None]:
import os
import time
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cpu


**Gemma 2B**

In [None]:
import os
import time
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')


# Mapping of model identifiers - Gemma-2B
MODELS = {
    'gemma-2b': 'google/gemma-2b'
}

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class HFModelLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        # pass the auth token to access gated models
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoModelForCausalLM.from_pretrained(repo_id, **kwargs)

class HFTokenizerLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        # pass the auth token to access gated models
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoTokenizer.from_pretrained(repo_id, **kwargs)


def benchmark_model(name, repo_id, runs, max_new_tokens, quantize=False):
    print(f"Benchmarking {name} on {DEVICE}{' (8-bit)' if quantize else ''}")
    # Load tokenizer
    tokenizer = HFTokenizerLoader.from_pretrained(repo_id)

    # Quantization config if needed
    quant_config = BitsAndBytesConfig(load_in_8bit=True) if quantize else None

    # Load model
    model = HFModelLoader.from_pretrained(
        repo_id,
        device_map='auto' if DEVICE=='cuda' else None,
        quantization_config=quant_config,
        torch_dtype=torch.float16 if DEVICE=='cuda' else torch.float32
    )
    model.to(DEVICE)

    # Prepare input
    prompt = "Hello, how are you today?"
    inputs = tokenizer(prompt, return_tensors='pt').to(DEVICE)

    # Warm-up
    _ = model.generate(**inputs, max_new_tokens=10)

    # Benchmark
    start_time = time.time()
    total_tokens = 0
    for _ in range(runs):
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        total_tokens += output.shape[-1]
    elapsed = time.time() - start_time

    tps = total_tokens / elapsed
    tpm = tps * 60
    print(f"Tokens generated: {total_tokens}")
    print(f"Elapsed time (s): {elapsed:.2f}")
    print(f"Tokens per second: {tps:.2f}")
    print(f"Tokens per minute: {tpm:.2f}")
    return tps, tpm

if __name__ == '__main__':
    sys.argv = ['colab_kernel_launcher.py', '--all']

    parser = argparse.ArgumentParser(description='Local LLM Benchmarking')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--model', choices=list(MODELS.keys()), help='Model to benchmark')
    group.add_argument('--all', action='store_true', help='Benchmark all models')
    parser.add_argument('--runs', type=int, default=20, help='Number of runs')
    parser.add_argument('--max_new_tokens', type=int, default=50, help='Maximum new tokens per run')
    parser.add_argument('--quantize', action='store_true', help='Use 8-bit quantization')
    args = parser.parse_args()

    results = {}
    targets = MODELS.keys() if args.all else [args.model]
    for m in targets:
        tps, tpm = benchmark_model(m, MODELS[m], args.runs, args.max_new_tokens, quantize=args.quantize)
        results[m] = {'TPS': tps, 'TPM': tpm}

    print("\n=== Summary ===")
    for m, res in results.items():
        print(f"{m}: {res['TPS']:.2f} TPS, {res['TPM']:.2f} TPM")

Benchmarking gemma-2b on cuda




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokens generated: 1160
Elapsed time (s): 29.42
Tokens per second: 39.43
Tokens per minute: 2365.56

=== Summary ===
gemma-2b: 39.43 TPS, 2365.56 TPM


**Llama 3.1 8B**

In [None]:
import os
import time
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import sys

from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')


# Mapping of model identifiers - Llama-3.1-8B
MODELS = {
    'llama-3.1-8b': 'meta-llama/Llama-3.1-8B'
}

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class HFModelLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        # pass the auth token to access gated models
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoModelForCausalLM.from_pretrained(repo_id, **kwargs)

class HFTokenizerLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        # pass the auth token to access gated models
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoTokenizer.from_pretrained(repo_id, **kwargs)


def benchmark_model(name, repo_id, runs, max_new_tokens, quantize=False):
    print(f"Benchmarking {name} on {DEVICE}{' (8-bit)' if quantize else ''}")
    # Load tokenizer
    tokenizer = HFTokenizerLoader.from_pretrained(repo_id)

    # Quantization config if needed
    quant_config = BitsAndBytesConfig(load_in_8bit=True) if quantize else None

    # Load model
    model = HFModelLoader.from_pretrained(
        repo_id,
        device_map='auto' if DEVICE=='cuda' else None,
        quantization_config=quant_config,
        torch_dtype=torch.float16 if DEVICE=='cuda' else torch.float32
    )

    # Prepare input
    prompt = "Hello, how are you today?"
    inputs = tokenizer(prompt, return_tensors='pt').to(DEVICE)

    # Warm-up
    _ = model.generate(**inputs, max_new_tokens=10)

    # Benchmark
    start_time = time.time()
    total_tokens = 0
    for _ in range(runs):
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        total_tokens += output.shape[-1]
    elapsed = time.time() - start_time

    tps = total_tokens / elapsed
    tpm = tps * 60
    print(f"Tokens generated: {total_tokens}")
    print(f"Elapsed time (s): {elapsed:.2f}")
    print(f"Tokens per second: {tps:.2f}")
    print(f"Tokens per minute: {tpm:.2f}")
    return tps, tpm

if __name__ == '__main__':
    sys.argv = ['colab_kernel_launcher.py', '--all']

    parser = argparse.ArgumentParser(description='Local LLM Benchmarking')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--model', choices=list(MODELS.keys()), help='Model to benchmark')
    group.add_argument('--all', action='store_true', help='Benchmark all models')
    parser.add_argument('--runs', type=int, default=20, help='Number of runs')
    parser.add_argument('--max_new_tokens', type=int, default=50, help='Maximum new tokens per run')
    parser.add_argument('--quantize', action='store_true', help='Use 8-bit quantization')
    args = parser.parse_args()

    results = {}
    targets = MODELS.keys() if args.all else [args.model]
    for m in targets:
        tps, tpm = benchmark_model(m, MODELS[m], args.runs, args.max_new_tokens, quantize=args.quantize)
        results[m] = {'TPS': tps, 'TPM': tpm}

    print("\n=== Summary ===")
    for m, res in results.items():
        print(f"{m}: {res['TPS']:.2f} TPS, {res['TPM']:.2f} TPM")

Benchmarking llama-3.1-8b on cuda




model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Tokens generated: 1160
Elapsed time (s): 822.78
Tokens per second: 1.41
Tokens per minute: 84.59

=== Summary ===
llama-3.1-8b: 1.41 TPS, 84.59 TPM


**Qwen 2.5B**

In [None]:
import os
import time
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import sys

HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

# Mapping of model identifiers - Qwen1.5-1.8B
MODELS = {
    'qwen-1.8b': 'Qwen/Qwen1.5-1.8B'
}

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class HFModelLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoModelForCausalLM.from_pretrained(repo_id, **kwargs)

class HFTokenizerLoader:
    @staticmethod
    def from_pretrained(repo_id, **kwargs):
        if HF_TOKEN:
            kwargs['use_auth_token'] = HF_TOKEN
        return AutoTokenizer.from_pretrained(repo_id, **kwargs)

def benchmark_model(name, repo_id, runs, max_new_tokens, quantize=False):
    print(f"\nBenchmarking {name} on {DEVICE}{' (8-bit)' if quantize else ''}")
    tokenizer = HFTokenizerLoader.from_pretrained(repo_id)
    quant_config = BitsAndBytesConfig(load_in_8bit=True) if quantize else None
    model = HFModelLoader.from_pretrained(
        repo_id,
        device_map='auto' if DEVICE == 'cuda' else None,
        quantization_config=quant_config,
        torch_dtype=torch.float16 if DEVICE == 'cuda' else torch.float32
    )

    prompt = "Hello, how are you today?"
    inputs = tokenizer(prompt, return_tensors='pt').to(DEVICE)

    _ = model.generate(**inputs, max_new_tokens=10)
    start_time = time.time()
    total_tokens = 0
    for _ in range(runs):
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        total_tokens += output.shape[-1]
    elapsed = time.time() - start_time
    tps = total_tokens / elapsed
    tpm = tps * 60
    print(f"Tokens generated: {total_tokens}")
    print(f"Elapsed time (s): {elapsed:.2f}")
    print(f"Tokens per second: {tps:.2f}")
    print(f"Tokens per minute: {tpm:.2f}")
    return tps, tpm

if __name__ == '__main__':
    sys.argv = ['colab_kernel_launcher.py', '--all']
    parser = argparse.ArgumentParser(description='Local LLM Benchmarking')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--model', choices=list(MODELS.keys()), help='Model to benchmark')
    group.add_argument('--all', action='store_true', help='Benchmark all models')
    parser.add_argument('--runs', type=int, default=20, help='Number of runs')
    parser.add_argument('--max_new_tokens', type=int, default=50, help='Maximum new tokens per run')
    parser.add_argument('--quantize', action='store_true', help='Use 8-bit quantization')
    args = parser.parse_args()
    results = {}
    targets = MODELS.keys() if args.all else [args.model]
    for m in targets:
        tps, tpm = benchmark_model(m, MODELS[m], args.runs, args.max_new_tokens, quantize=args.quantize)
        results[m] = {'TPS': tps, 'TPM': tpm}
    print("\n=== Summary ===")
    for m, res in results.items():
        print(f"{m}: {res['TPS']:.2f} TPS, {res['TPM']:.2f} TPM")


Benchmarking qwen-1.8b on cpu


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for

Tokens generated: 1140
Elapsed time (s): 623.44
Tokens per second: 1.83
Tokens per minute: 109.71

=== Summary ===
qwen-1.8b: 1.83 TPS, 109.71 TPM
