## 🧠 A handy utility function: measure and print the token generation speed (in tokens per second) to evaluate your LLM

In [None]:
import time

def measure_token_speed(model, tokenizer, prompts, max_new_tokens=1000, device=None):
    """
    Measure and print the token generation speed (tokens per second) of a model.

    Parameters:
    model (transformers.PreTrainedModel): The language model to be used for token generation.
    tokenizer (transformers.PreTrainedTokenizer): The tokenizer corresponding to the model,
                                                  used for tokenizing the prompts.
    prompts (list of str): A list of string prompts for which the token generation speed is measured.
    max_new_tokens (int): Upper bound on the number of tokens generated per prompt
                          (forwarded to ``model.generate``). Defaults to 1000.
    device: Device the tokenized inputs are moved to. When None, ``model.device`` is used
            if the model exposes it, otherwise 'cuda'.

    Returns:
    None. All results are printed.

    This function iterates over each prompt, generates tokens using the model, and measures the
    time taken for each prompt. It then prints the tokens per second for each prompt and the
    average speed across all prompts that were processed successfully. A prompt that raises is
    reported and skipped; it does not contribute to the average.

    Only newly generated tokens are counted. Unless ``model.config.is_encoder_decoder`` is true,
    the sequence returned by ``generate`` is assumed to start with the prompt tokens, so the
    prompt length is subtracted from the output length.

    Note: The function assumes that the model and tokenizer are compatible and that the model is
    already loaded onto the appropriate device (e.g., CUDA for GPU acceleration).
    """
    if device is None:
        # Follow the model instead of hard-coding CUDA so CPU/other devices work too.
        device = getattr(model, "device", "cuda")

    # Encoder-decoder models are assumed not to echo the prompt in their output.
    config = getattr(model, "config", None)
    output_includes_prompt = not getattr(config, "is_encoder_decoder", False)

    total_duration = 0.0
    total_tokens = 0

    for p in prompts:
        try:
            inputs = tokenizer([p], return_tensors="pt").to(device)

            # perf_counter is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
            prompt_duration = time.perf_counter() - start_time

            new_tokens = len(output[0])
            if output_includes_prompt:
                # Do not credit the model for tokens it was given as input.
                new_tokens = max(new_tokens - len(inputs["input_ids"][0]), 0)

            if prompt_duration > 0:
                tok_sec_prompt = round(new_tokens / prompt_duration, 3)
            else:
                # Clock did not advance; avoid dividing by zero.
                tok_sec_prompt = 0.0
            print(f"Prompt: {p} --- {tok_sec_prompt} tokens/seconds ---")

            total_duration += prompt_duration
            total_tokens += new_tokens

            # NOTE: if GPU memory grows between prompts, the cache can be released here
            # (e.g. with torch's CUDA cache-clearing call).
        except Exception as e:
            # Best effort: one failing prompt must not abort the whole measurement.
            print(f"Error processing prompt '{p}': {e}")

    if total_duration > 0:
        tok_sec = round(total_tokens / total_duration, 3)
        print(f"Average --- {tok_sec} tokens/seconds ---")
    else:
        # Empty prompt list or every prompt failed: there is nothing to average.
        print("Average --- n/a (no prompt was measured successfully) ---")

# Usage Example
# model = [Your Model]
# tokenizer = [Your Tokenizer]
# prompts = ["AI will rule ", "I am in Love with ",
# "Awesome weather and ", "My favourite movie is "]
# measure_token_speed(model, tokenizer, prompts)