# Measure model performance 
import time
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer
model_name = "distilgpt2"  # Replace with your small language model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define input text
input_text = "Once upon a time, in a small village,"

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt")

# Number of prompt tokens; generate() returns prompt + continuation, so this
# is subtracted later to count only newly generated tokens.
prompt_length = inputs["input_ids"].size(-1)

# Pass pad_token_id explicitly so generate() does not have to pick one on
# every call; fall back to EOS when the tokenizer defines no pad token.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

# Shared generation settings. NOTE: max_length bounds prompt + generated
# tokens together, not the number of new tokens.
generate_kwargs = {"max_length": 50, "pad_token_id": pad_token_id}

# Warm-up: run one untimed generation so one-time setup costs on the first
# call do not skew the measurements below.
model.generate(**inputs, **generate_kwargs)

# Measure latency (seconds per generate() call).
# perf_counter() is used because it is monotonic and high-resolution,
# unlike time.time(), which can jump when the system clock is adjusted.
num_trials = 10
latencies = []
for _ in range(num_trials):
    start_time = time.perf_counter()
    outputs = model.generate(**inputs, **generate_kwargs)  # Generate text
    latencies.append(time.perf_counter() - start_time)

average_latency = sum(latencies) / num_trials
print(f"Average Latency: {average_latency:.4f} seconds")

# Measure throughput (generated tokens per second)
total_tokens = 0
start_time = time.perf_counter()
for _ in range(num_trials):
    outputs = model.generate(**inputs, **generate_kwargs)
    # Count only the tokens produced by the model, excluding the prompt.
    total_tokens += outputs.size(-1) - prompt_length

end_time = time.perf_counter()
throughput = total_tokens / (end_time - start_time)
print(f"Throughput: {throughput:.2f} tokens/second")

# Output generated text for verification
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated Text: {generated_text}")
