# 

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread
import time
import sys 
# ----------------------------------------------------------------------------:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
#model_name = "meta-llama/Llama-3.2-3B-Instruct"
#model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# ----------------------------------------------------------------------------:

In [None]:

if torch.cuda.is_available(): device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): device = "mps"
else: device = "cpu"
model = model.to(device)
print(f"DEBUG: Device set to: {device}")

In [None]:
# https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]
chat_text = tokenizer.apply_chat_template(
    messages, 
    tokenize=True,
    add_generation_prompt=True
)
print(f"Chat prompt text ids:\n{chat_text}")
print(f"Chat prompt decoded:\n{tokenizer.decode(chat_text)}")
print("-"*40)
tokenized_chat = tokenizer.apply_chat_template(messages, 
                                               tokenize=True, 
                                               add_generation_prompt=True, 
                                               return_tensors="pt")
print(tokenizer.decode(tokenized_chat[0]))

In [None]:
len(tokenized_chat[0]) 

In [None]:
# (9125) --> system
# (882) --> user
# (128006) --> <|start_header_id|>
# (128007) --> <|end_header_id|> 
# (128009) --> <|eot_id|> 
# 
# so you will get a [<start_header_id> | <ID> | <end_header_id> ... <eot_id> ]
# This consites a block

for i, token in enumerate(tokenized_chat[0].tolist(), start=1) : 
    decoded = tokenizer.decode(token)
    if decoded == "\\n": decoded = "-newline-"
    print(f" ({token}) --> {decoded}")
print(f"Number of tokens in into prompt: {i}")

In [None]:
chat_text = tokenizer.apply_chat_template(
    messages, 
    tokenize=False,
    add_generation_prompt=True
)
# Tokenize the formatted chat
inputs = tokenizer(chat_text, return_tensors="pt").to(device)
num_input_tokens = inputs.input_ids.shape[1]

print(inputs) 
print(inputs.input_ids)

In [None]:
# Create streamer for token-by-token generation
streamer = TextIteratorStreamer(tokenizer, 
                                skip_prompt=True, 
                                skip_special_tokens=True)



# Start generation in a separate thread
generation_kwargs = {
    "input_ids": inputs.input_ids,
    "attention_mask": inputs.attention_mask,
    "max_new_tokens": 100,
    "temperature": 0.7,
    "streamer": streamer,
    "eos_token_id": None,
    "do_sample": True
}

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()


prefill_start = time.time()
first_token = next(streamer)
prefill_end = time.time()

prefill_time = prefill_end - prefill_start



# Track generation time
generated_text = first_token
tokens_generated = 1

generation_start = time.time()
# Continue streaming tokens
for token in streamer:
    generated_text += token
    tokens_generated += 1

generation_end = time.time()
generation_time = generation_end - generation_start

# Print metrics
print(f"Input: {chat_text}")
print(f"Generated: {generated_text}")
print(f"Input tokens: {num_input_tokens}")
print(f"Output tokens: {tokens_generated}")
print(f"Prefill time: {prefill_time:.4f}s")
print(f"Generation time: {generation_time:.4f}s")
print(f"Generation tokens/second: {tokens_generated/generation_time:.2f}")

---
## LLM + KG 

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread
import time
import sys 
from mlhq.utils import load_jsonl, proceed, write_json 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
#model_name = "meta-llama/Llama-3.2-3B-Instruct"
#model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
if torch.cuda.is_available(): device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): device = "mps"
else: device = "cpu"
model = model.to(device)
print(f"DEBUG: Device set to: {device}")

DEBUG: Device set to: mps


In [4]:
qa_paths = {
    "biomedical" : "/Users/msbabo/code/Graph-CoT/data/processed_data/biomedical/data.jsonl", 
    "amazon" : "/Users/msbabo/code/Graph-CoT/data/processed_data/amazon/data.jsonl", 
    "dblp" : "/Users/msbabo/code/Graph-CoT/data/processed_data/dblp/data.jsonl", 
    "goodreads": "/Users/msbabo/code/Graph-CoT/data/processed_data/goodreads/data.jsonl", 
    "legal": "/Users/msbabo/code/Graph-CoT/data/processed_data/legal/data.jsonl", 
    "maple-biology": "/Users/msbabo/code/Graph-CoT/data/processed_data/maple/Biology/data.jsonl",
    "maple-medicine": "/Users/msbabo/code/Graph-CoT/data/processed_data/maple/Medicine/data.jsonl",
    "maple-physics": "/Users/msbabo/code/Graph-CoT/data/processed_data/maple/Physics/data.jsonl",
    "maple-materials-science": "/Users/msbabo/code/Graph-CoT/data/processed_data/maple/Materials_Science/data.jsonl",
    "maple-chemistry": "/Users/msbabo/code/Graph-CoT/data/processed_data/maple/Chemistry/data.jsonl",
}
domains = [ 
    "amazon", 
    "biomedical", 
    "dblp", 
    "goodreads", 
    "legal", 
    "maple-biology", 
    "maple-physics", 
    "maple-chemistry", 
    "maple-medicine", 
    "maple-materials-science", 
]

In [5]:
system_prompt = """
You are an expert prompt classifier designed to categorize user prompts accurately into one of two distinct categories:

Internal Response: Prompts for which an LLM can confidently provide accurate, complete, and helpful responses solely using its internal knowledge, without needing external information, databases, APIs, or tooling.

External Response: Prompts for which an LLM requires external knowledge, updated information, APIs, databases, or external tooling to confidently deliver accurate, complete, and helpful responses.

Guidelines for Classification:

Classify prompts as "Internal Response" if the information needed to answer fully and confidently is within general world knowledge as of the training cutoff, and no specific, updated, or external information is required.

Classify prompts as "External Response" if answering correctly and confidently necessitates:

Real-time data, current events, or recent updates.

Information specific to locations, current weather, dates, times, recent publications, or market values.

External computation, databases, or web searches.

Provide only the classification label ("Internal Response" or "External Response") as your output.
"""
system_prompt_len = len(tokenizer(system_prompt)['input_ids'])
#print(f"Length of System-prompt: {system_prompt_len}")

#messages = [{"role": "system","content": f"{system_prompt}"}]
#sys_txt = tokenizer.apply_chat_template(
#    messages, 
#    tokenize=True,
#    add_generation_prompt=True,
#    return_tensors="pt" # w/ Tensor, w/o list
#).to(device)
#print(type(sys_txt), sys_txt)

#for i, token in enumerate(sys_txt[0].tolist(), start=1) : 
#    decoded = tokenizer.decode(token)
#    if decoded == "\\n": decoded = "-newline-"
#    print(f" ({token:6}) --> {decoded}")
#print(f"Number of tokens in into prompt: {i}")

In [6]:
prefix = "Incoming Prompt:\n"
prefix_len = len(tokenizer(prefix)['input_ids'])

#prx_txt = tokenizer.apply_chat_template(
#    messages, 
#    tokenize=True,
#    add_generation_prompt=True,
#    return_tensors="pt" # w/ Tensor, w/o list
#).to(device)

---

In [7]:
results = {}
results['model'] = model_name
results['model-config'] = model.config.__dict__
#results['task'] = task 
#results['forward-params'] = dict(pipe.__dict__["_forward_params"])
#results["generation-config"] = {} 
results["system-prompt"] = system_prompt
results["system-prompt-len"] = system_prompt_len
results["prefix"] = prefix
results["prefix-len"] = prefix_len
#print(f"System prompt length = {system_prompt_len}")
#print(f"Prefix length = {prefix_len}")

In [14]:
for domain in domains: 
    qa_data = load_jsonl(qa_paths[domain])

    results[domain] = {}
    results[domain]['high-level-metrics'] = {
        "total-questions" : 0,
        "total-internals" : 0, 
        "total-externals" : 0, 
        "total-latency-s" : 0.0, 
        "average-latency-s" : 0.0, 
    }
    results[domain]['per-question-results'] = [] 
    count = 10
    internals = 0 
    externals = 0 
    average_latency = 0.0
    total_latency = 0.0 
    for i, qdata in enumerate(qa_data, start=1): 
        if i > count: 
            i = i - 1 
            break 
        qid = qdata["qid"]
        question = qdata["question"]
        messages = [
            {
                "role": "system",
                "content": f"{system_prompt}"
            },
            {
                "role": "user", 
                "content": f"{prefix}{question}"
            }
        ]
        #inputs = tokenizer.apply_chat_template(
        #    messages, 
        #    tokenize=False,
        #    add_generation_prompt=False, # 
        #    #return_tensors="pt" # w/ Tensor, w/o list
        #)#.to(device)
        #inputs = tokenizer(chat_text, return_tensors="pt").to(device)
        print(f"{i}.) [{domain}] Question   --> {question}")
        #print(f"{i}.) [{domain}] Classified --> {classified}")
        chat_text = tokenizer.apply_chat_template(
            messages, 
            tokenize=False,
            add_generation_prompt=True
        )
        # Tokenize the formatted chat
        inputs = tokenizer(chat_text, return_tensors="pt").to(device)
        num_input_tokens = inputs.input_ids.shape[1]
        #print(inputs) 
        #print(inputs.input_ids)



        # Create streamer for token-by-token generation
        streamer = TextIteratorStreamer(tokenizer, 
                                skip_prompt=True, 
                                skip_special_tokens=True)

        generation_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": 100,
            "temperature": 0.7,
            "streamer": streamer,
            #"eos_token_id": None,
            "do_sample": True
        }

        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        prefill_start = time.time()
        first_token = next(streamer)
        prefill_end = time.time()
        prefill_time = prefill_end - prefill_start

        # Track generation time
        generated_text = first_token 
        tokens_generated = 1

        generation_start = time.time()
        # Continue streaming tokens
        for token in streamer:
            generated_text += token
            tokens_generated += 1
        generation_end = time.time()
        generation_time = generation_end - generation_start

         # Print metrics
        #print(f"Input: {chat_text}")
        #print(f"Generated: {generated_text}")
        print(f"Input tokens: {num_input_tokens}")
        print(f"Output tokens: {tokens_generated}")
        print(f"Prefill time: {prefill_time:.4f}s")
        print(f"Generation time: {generation_time:.4f}s")
        print(f"Generation tokens/second: {tokens_generated/generation_time:.2f}")
        
        #average_latency = 0.0
        total_latency += (prefill_time + generation_time) 
        
        if "Internal Response" in generated_text: 
            classified = "Internal Response"
            internals += 1 
        elif "External Response" in generated_text: 
            classified = "External Response"
            externals += 1
        else: 
            raise RuntimeError("bad llm response")
        # SimpleAgent("sys + prefix + prompt").SimpleAgent("sys2 + prefix2")
        results[domain]['per-question-results'].append(
            {'qid':qid, 
             'question':question, 
             "classified":classified,
             "num-input-tokens":num_input_tokens, 
             "num-output-tokens": tokens_generated, 
             "ttft": prefill_time, 
             "total-latency": generation_time, 
             "tps": tokens_generated/generation_time, 
            }
        )
        #proceed() 
    results[domain]['high-level-metrics']["total-questions"] = i 
    results[domain]['high-level-metrics']["total-internals"] = internals
    results[domain]['high-level-metrics']["total-externals"] = externals
    results[domain]['high-level-metrics']["total-latency-s"] = total_latency
    results[domain]['high-level-metrics']["average-latency-s"] = total_latency / i 
    



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1.) [amazon] Question   --> Could you specify the brand of Dolica DC-BP511 1400mAh Canon Battery?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 7
Prefill time: 0.1485s
Generation time: 0.1004s
Generation tokens/second: 69.75
2.) [amazon] Question   --> Could you specify the brand of [Aftermarket Product] Black PU Leather Wireless Bluetooth Keyboard Flip Case Cover For Samsung Galaxy S3 i9300 Sprint L710 att i747 Verizon i535 T-Mobile T999 New?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 295
Output tokens: 12
Prefill time: 0.0771s
Generation time: 0.2224s
Generation tokens/second: 53.95
3.) [amazon] Question   --> Could you specify the brand of Blackberry Playbook 7-Inch Tablet (64GB)?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 41
Prefill time: 0.0695s
Generation time: 0.8588s
Generation tokens/second: 47.74
4.) [amazon] Question   --> What brand does the item Intermatic ML900TW 900-Watt Power Pack with Timer and Ground Shield belong to?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 32
Prefill time: 0.0732s
Generation time: 0.6230s
Generation tokens/second: 51.37
5.) [amazon] Question   --> What is the brand of item Easy Provider Covert Acoustic Tube Earpiece 2 PIN for Motorola Radio?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 274
Output tokens: 28
Prefill time: 0.0713s
Generation time: 0.5467s
Generation tokens/second: 51.22
6.) [amazon] Question   --> What is the brand of item 3-D Letter Kit Chrom (1/2 inch)?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 270
Output tokens: 32
Prefill time: 0.0720s
Generation time: 0.6278s
Generation tokens/second: 50.97
7.) [amazon] Question   --> What brand does the item Sassy Developmental Bath Toy, Catch and Count Net belong to?
Input tokens: 271
Output tokens: 4
Prefill time: 0.0690s
Generation time: 0.0418s
Generation tokens/second: 95.80
8.) [amazon] Question   --> Can you tell me the brand of the item NECA Gears of War 3 Series 2 Action Figure Damon Baird Lancer, Wrench Screwdrivers?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 285
Output tokens: 40
Prefill time: 0.0719s
Generation time: 0.7997s
Generation tokens/second: 50.02
9.) [amazon] Question   --> Can you tell me the brand of the item Speedball Mona Lisa 18-Inch-by-24-Inch Jumbo Graphite Paper, Black?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 282
Output tokens: 64
Prefill time: 0.0712s
Generation time: 1.3258s
Generation tokens/second: 48.27
10.) [amazon] Question   --> What brand does the item Denim Blue Baker's Twine, 240 Yard Spool belong to?
Input tokens: 273
Output tokens: 4
Prefill time: 0.0723s
Generation time: 0.0398s
Generation tokens/second: 100.47
1.) [biomedical] Question   --> Are there any side effects of using compound Malathion?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 264
Output tokens: 101
Prefill time: 0.0720s
Generation time: 2.1872s
Generation tokens/second: 46.18
2.) [biomedical] Question   --> Are there any side effects of using compound Zinc?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 262
Output tokens: 101
Prefill time: 0.0732s
Generation time: 2.1686s
Generation tokens/second: 46.57
3.) [biomedical] Question   --> Are there any side effects of using compound Benzyl Benzoate?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 266
Output tokens: 78
Prefill time: 0.0721s
Generation time: 1.6288s
Generation tokens/second: 47.89
4.) [biomedical] Question   --> Can you list the side effects of Mepyramine?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 264
Output tokens: 68
Prefill time: 0.0696s
Generation time: 1.4373s
Generation tokens/second: 47.31
5.) [biomedical] Question   --> what are the side effects of compound Acetic acid?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 263
Output tokens: 101
Prefill time: 0.0722s
Generation time: 2.2078s
Generation tokens/second: 45.75
6.) [biomedical] Question   --> what are the side effects of compound Pyridoxine?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 264
Output tokens: 101
Prefill time: 0.0793s
Generation time: 2.1955s
Generation tokens/second: 46.00
7.) [biomedical] Question   --> Can you list the side effects of Hexachlorophene?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 265
Output tokens: 101
Prefill time: 0.0731s
Generation time: 2.1680s
Generation tokens/second: 46.59
8.) [biomedical] Question   --> Could you tell me the side effects of the compound Triclosan?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 267
Output tokens: 7
Prefill time: 0.1021s
Generation time: 0.1047s
Generation tokens/second: 66.83
9.) [biomedical] Question   --> Could you tell me the side effects of the compound Vitamin A?
Input tokens: 265
Output tokens: 7
Prefill time: 0.0693s
Generation time: 0.1083s
Generation tokens/second: 64.65
10.) [biomedical] Question   --> Can you list the side effects of Crotamiton?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 264
Output tokens: 8
Prefill time: 0.0686s
Generation time: 0.1276s
Generation tokens/second: 62.69
1.) [dblp] Question   --> Can you inform me who are the authors of "Patient-Specific Physiological Monitoring And Prediction Using Structured Gaussian Processes?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 276
Output tokens: 33
Prefill time: 0.0734s
Generation time: 0.6901s
Generation tokens/second: 47.82
2.) [dblp] Question   --> Can you inform me who are the authors of "Long Range Optical Truck Tracking.?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 269
Output tokens: 38
Prefill time: 0.0717s
Generation time: 0.8008s
Generation tokens/second: 47.45
3.) [dblp] Question   --> Can you inform me who are the authors of "0.33 mm 2 13.3 dB gain sub-6 GHz to 28 GHz transformer-coupled low-voltage upconversion mixer for 5G applications?" 
Input tokens: 298
Output tokens: 7
Prefill time: 0.0768s
Generation time: 0.1060s
Generation tokens/second: 66.02
4.) [dblp] Question   --> Could you tell me the authors of the paper "Physical parameter extraction over urban areas using L-band POLSAR data and interferometric baseline diversity?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 281
Output tokens: 101
Prefill time: 0.0724s
Generation time: 2.1945s
Generation tokens/second: 46.02
5.) [dblp] Question   --> Who are the authors of paper "A New Node Deployment and Location Dispatch Algorithm for Underwater Sensor Networks.?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 274
Output tokens: 34
Prefill time: 0.0733s
Generation time: 0.6115s
Generation tokens/second: 55.60
6.) [dblp] Question   --> Who are the authors of paper "Distributed global development parametric cost modeling?" 
Input tokens: 268
Output tokens: 4
Prefill time: 0.0722s
Generation time: 0.0474s
Generation tokens/second: 84.40
7.) [dblp] Question   --> Could you tell me the authors of the paper "BlockHammer: Improving Flash Reliability by Exploiting Process Variation Aware Proactive Failure Prediction?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 283
Output tokens: 41
Prefill time: 0.1051s
Generation time: 0.7449s
Generation tokens/second: 55.04
8.) [dblp] Question   --> Can you tell me who wrote the paper "Biologically Inspired Neural Controller For Robot Learning And Mapping?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 272
Output tokens: 36
Prefill time: 0.0717s
Generation time: 0.6495s
Generation tokens/second: 55.42
9.) [dblp] Question   --> Can you tell me who wrote the paper "Sub-Bottom Sediment Classification Using Reliable Instantaneous Frequency Calculation and Relaxation Time Estimation?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 280
Output tokens: 8
Prefill time: 0.1019s
Generation time: 0.1192s
Generation tokens/second: 67.10
10.) [dblp] Question   --> Could you tell me the authors of the paper "Robust assembly line balancing with heterogeneous workers?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 25
Prefill time: 0.0718s
Generation time: 0.4761s
Generation tokens/second: 52.51
1.) [goodreads] Question   --> Who penned the book Truth, Love, Non-Violence: The Story of Gurcharan Singh Bhatia?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 276
Output tokens: 37
Prefill time: 0.0715s
Generation time: 0.6958s
Generation tokens/second: 53.17
2.) [goodreads] Question   --> Who penned the book Robert Graves the Assault Heroic 19 (v. 1)?
Input tokens: 270
Output tokens: 6
Prefill time: 0.0718s
Generation time: 0.0798s
Generation tokens/second: 75.21
3.) [goodreads] Question   --> Who penned the book Cape Cod Surprise: Oliver Matches Wits with Hurricane Carol?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 268
Output tokens: 4
Prefill time: 0.0705s
Generation time: 0.0392s
Generation tokens/second: 101.93
4.) [goodreads] Question   --> Do you know who penned the book Sea Monsters and Other Delicacies (An Awfully Beastly Business, #2)?
Input tokens: 277
Output tokens: 6
Prefill time: 0.0733s
Generation time: 0.0838s
Generation tokens/second: 71.59
5.) [goodreads] Question   --> Who are the authors of the book Cat Out of the Bag (By the Tail, #1)?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 272
Output tokens: 42
Prefill time: 0.0709s
Generation time: 0.8059s
Generation tokens/second: 52.11
6.) [goodreads] Question   --> Who are the authors of the book Life in Stone: Fossils of the Colorado Plateau?
Input tokens: 272
Output tokens: 4
Prefill time: 0.0701s
Generation time: 0.0397s
Generation tokens/second: 100.74
7.) [goodreads] Question   --> Do you know who penned the book An Act of Evil (Augustus Maltravers Mystery Book 1)?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 274
Output tokens: 4
Prefill time: 0.0714s
Generation time: 0.0411s
Generation tokens/second: 97.30
8.) [goodreads] Question   --> Can you tell me the authors of the book أوراقي.. حياتي.. ج1?
Input tokens: 273
Output tokens: 7
Prefill time: 0.0729s
Generation time: 0.0997s
Generation tokens/second: 70.19
9.) [goodreads] Question   --> Can you tell me the authors of the book Don't Panic: The Official Hitchhiker's Guide to the Galaxy Companion?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 31
Prefill time: 0.0700s
Generation time: 0.5824s
Generation tokens/second: 53.23
10.) [goodreads] Question   --> Do you know who penned the book The Battle For WondLa (The Search for WondLa, #3)?
Input tokens: 276
Output tokens: 6
Prefill time: 0.0717s
Generation time: 0.0789s
Generation tokens/second: 76.01
1.) [legal] Question   --> what is the start date of court U.S. Circuit Court for the District of Western Kentucky?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 31
Prefill time: 0.0708s
Generation time: 0.5727s
Generation tokens/second: 54.13
2.) [legal] Question   --> what is the start date of court Circuit Court of the 12th Judicial Circuit of Florida, Manatee County?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 275
Output tokens: 49
Prefill time: 0.0733s
Generation time: 0.9410s
Generation tokens/second: 52.07
3.) [legal] Question   --> what is the start date of court Oyer and Terminer, Rensselaer County?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 272
Output tokens: 35
Prefill time: 0.0700s
Generation time: 0.6439s
Generation tokens/second: 54.35
4.) [legal] Question   --> When is the North Dakota Court of Appeals court scheduled to start?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 265
Output tokens: 101
Prefill time: 0.0723s
Generation time: 2.0065s
Generation tokens/second: 50.34
5.) [legal] Question   --> When is the Circuit Court of the 3rd Judicial Circuit of Florida, Columbia County court scheduled to start?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 274
Output tokens: 47
Prefill time: 0.0703s
Generation time: 0.8604s
Generation tokens/second: 54.63
6.) [legal] Question   --> When does the court Oregon Supreme Court commence?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 261
Output tokens: 21
Prefill time: 0.1074s
Generation time: 0.3764s
Generation tokens/second: 55.80
7.) [legal] Question   --> When does the court New York Court of Sessions, Orange County commence?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 266
Output tokens: 24
Prefill time: 0.0721s
Generation time: 0.4536s
Generation tokens/second: 52.91
8.) [legal] Question   --> When is the Court Of Oyer And Terminer New York court scheduled to start?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 269
Output tokens: 64
Prefill time: 0.0719s
Generation time: 1.2866s
Generation tokens/second: 49.74
9.) [legal] Question   --> When is the U.S. Circuit Court for the Southern District of Georgia court scheduled to start?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 32
Prefill time: 0.0693s
Generation time: 0.5793s
Generation tokens/second: 55.24
10.) [legal] Question   --> Can you tell me the commencement date of District Court, Canal Zone court?
Input tokens: 267
Output tokens: 7
Prefill time: 0.0719s
Generation time: 0.0993s
Generation tokens/second: 70.49
1.) [maple-biology] Question   --> Who wrote the paper titled "snrk1 zmrfwd3 opaque2 a nexus of seed nutrient accumulation and diurnal cycles?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 278
Output tokens: 4
Prefill time: 0.0731s
Generation time: 0.0400s
Generation tokens/second: 100.00
2.) [maple-biology] Question   --> Who wrote the paper titled "mutations in fbxl4 cause mitochondrial encephalopathy and a disorder of mitochondrial dna maintenance?"
Input tokens: 278
Output tokens: 4
Prefill time: 0.0692s
Generation time: 0.0407s
Generation tokens/second: 98.21
3.) [maple-biology] Question   --> Who wrote the paper titled "molecular basis for expression of common and rare fragile sites?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 270
Output tokens: 4
Prefill time: 0.0715s
Generation time: 0.0410s
Generation tokens/second: 97.60
4.) [maple-biology] Question   --> Do you know the authors for the paper "taxis of pseudomonas putida f1 toward phenylacetic acid is mediated by the energy taxis receptor aer2?"
Input tokens: 287
Output tokens: 4
Prefill time: 0.0744s
Generation time: 0.0472s
Generation tokens/second: 84.69
5.) [maple-biology] Question   --> Who are the authors of paper "beyond the signal sequence protein routing in health and disease?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 27
Prefill time: 0.0709s
Generation time: 0.5133s
Generation tokens/second: 52.60
6.) [maple-biology] Question   --> Who are the authors of paper "molecular analysis of expansion differentiation and growth factor treatment of human chondrocytes identifies differentiation markers and growth related genes?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 284
Output tokens: 41
Prefill time: 0.1036s
Generation time: 0.7951s
Generation tokens/second: 51.56
7.) [maple-biology] Question   --> Do you know the authors for the paper "analysis of rare deletional thalassemia using custom cgh array dna chip?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 279
Output tokens: 36
Prefill time: 0.0735s
Generation time: 0.7174s
Generation tokens/second: 50.18
8.) [maple-biology] Question   --> Can you tell me the authors of the paper "increasing temperature seasonality may overwhelm shifts in soil moisture to favor shrub over grass dominance in colorado plateau drylands?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 287
Output tokens: 86
Prefill time: 0.0708s
Generation time: 1.7762s
Generation tokens/second: 48.42
9.) [maple-biology] Question   --> Can you tell me the authors of the paper "tissue distribution subcellular localization and enzymatic activity analysis of human sirt5 isoforms?"
Input tokens: 282
Output tokens: 7
Prefill time: 0.0738s
Generation time: 0.0951s
Generation tokens/second: 73.63
10.) [maple-biology] Question   --> Do you know the authors for the paper "sphingosine 1 phosphate phosphohydrolase regulates endoplasmic reticulum to golgi trafficking of ceramide?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 290
Output tokens: 4
Prefill time: 0.0766s
Generation time: 0.0390s
Generation tokens/second: 102.58
1.) [maple-physics] Question   --> Can you name the authors of "effects of encounters with field stars on the evolution of low mass semidetached binaries?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 32
Prefill time: 0.0719s
Generation time: 0.5943s
Generation tokens/second: 53.85
2.) [maple-physics] Question   --> Can you name the authors of "the planetary luminosity problem missing planets and the observational consequences of episodic accretion?"
Input tokens: 277
Output tokens: 7
Prefill time: 0.0701s
Generation time: 0.0973s
Generation tokens/second: 71.92
3.) [maple-physics] Question   --> Can you name the authors of "factorization of seiberg witten curves with matter from random matrix models?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 275
Output tokens: 39
Prefill time: 0.0719s
Generation time: 0.7755s
Generation tokens/second: 50.29
4.) [maple-physics] Question   --> Who wrote the paper "origin of superconductivity in the weyl semimetal wt e 2 under pressure?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 276
Output tokens: 57
Prefill time: 0.0730s
Generation time: 1.1733s
Generation tokens/second: 48.58
5.) [maple-physics] Question   --> Who are the authors of paper "effect of self phase modulation in chirped pulse amplification like schemes?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 273
Output tokens: 31
Prefill time: 0.0724s
Generation time: 0.5917s
Generation tokens/second: 52.39
6.) [maple-physics] Question   --> Who are the authors of paper "study of chiral property of nuclear matter through measurements of φ meson decays?" 
Input tokens: 276
Output tokens: 4
Prefill time: 0.0700s
Generation time: 0.0421s
Generation tokens/second: 95.05
7.) [maple-physics] Question   --> Who wrote the paper "confronting gw190814 with hyperonization in dense matter and hypernuclear compact stars?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 4
Prefill time: 0.0720s
Generation time: 0.0397s
Generation tokens/second: 100.71
8.) [maple-physics] Question   --> Can you tell me the authors of the paper "three dimensional flow studies on a slotted transonic wind tunnel wall?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 276
Output tokens: 34
Prefill time: 0.0692s
Generation time: 0.6775s
Generation tokens/second: 50.18
9.) [maple-physics] Question   --> Can you tell me the authors of the paper "influence of cassette type on the dqe of cr systems?"
Input tokens: 275
Output tokens: 7
Prefill time: 0.0701s
Generation time: 0.1013s
Generation tokens/second: 69.09
10.) [maple-physics] Question   --> Who wrote the paper "atomic gas in blue ultra diffuse galaxies around hickson compact groups?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 271
Output tokens: 47
Prefill time: 0.0722s
Generation time: 0.9629s
Generation tokens/second: 48.81
1.) [maple-chemistry] Question   --> Who penned the paper "sirt2 inhibition activates hypoxia inducible factor 1α signaling and mediates neuronal survival?"
Input tokens: 279
Output tokens: 4
Prefill time: 0.0715s
Generation time: 0.0404s
Generation tokens/second: 99.01
2.) [maple-chemistry] Question   --> Who penned the paper "instantaneous and permanent photoionization?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 265
Output tokens: 4
Prefill time: 0.0707s
Generation time: 0.0397s
Generation tokens/second: 100.74
3.) [maple-chemistry] Question   --> Who penned the paper "complex rare earth aluminum hydrides mechanochemical preparation crystal structure and potential for hydrogen storage?"
Input tokens: 276
Output tokens: 4
Prefill time: 0.0698s
Generation time: 0.0397s
Generation tokens/second: 100.80
4.) [maple-chemistry] Question   --> Do you know who the authors of "asymmetric and independent contribution of the second transmembrane segment 12 residues to diliganded gating of acetylcholine receptor channels a single channel study with choline as the agonist" are?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 300
Output tokens: 24
Prefill time: 0.0771s
Generation time: 0.4716s
Generation tokens/second: 50.89
5.) [maple-chemistry] Question   --> Who are the authors of paper "on the stability of the extracellular hemoglobin of glossoscolex paulistus in two iron oxidation states in the presence of urea?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 288
Output tokens: 45
Prefill time: 0.0732s
Generation time: 0.9317s
Generation tokens/second: 48.30
6.) [maple-chemistry] Question   --> Who are the authors of paper "thorium iv molecular clusters with a hexanuclear th core?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 273
Output tokens: 33
Prefill time: 0.0722s
Generation time: 0.6562s
Generation tokens/second: 50.29
7.) [maple-chemistry] Question   --> Do you know who the authors of "chemistry eluting the tag" are?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 268
Output tokens: 40
Prefill time: 0.0723s
Generation time: 0.8266s
Generation tokens/second: 48.39
8.) [maple-chemistry] Question   --> Can you tell me the authors of the paper titled "reactions of chlorine atoms with a series of aromatic hydrocarbons?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 87
Prefill time: 0.0718s
Generation time: 1.7903s
Generation tokens/second: 48.60
9.) [maple-chemistry] Question   --> Can you tell me the authors of the paper titled "m fluorotyrosine substitution in β galactosidase evidence for the existence of a catalytically active tyrosine?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 290
Output tokens: 53
Prefill time: 0.0757s
Generation time: 1.0436s
Generation tokens/second: 50.79
10.) [maple-chemistry] Question   --> Do you know who the authors of "efficient assembly of iminodicarboxamides by a truly four component reaction" are?
Input tokens: 279
Output tokens: 8
Prefill time: 0.0721s
Generation time: 0.1176s
Generation tokens/second: 68.03
1.) [maple-medicine] Question   --> Do you know who the authors of "preinfarction blood pressure and smoking are determinants for a fatal outcome of myocardial infarction a prospective analysis from the finnmark study" are?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 293
Output tokens: 57
Prefill time: 0.0772s
Generation time: 1.1708s
Generation tokens/second: 48.68
2.) [maple-medicine] Question   --> Do you know who the authors of "subtle and underrecognized side effects of neuroleptic treatment in children with tourette s disorder" are?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 282
Output tokens: 41
Prefill time: 0.0729s
Generation time: 0.8511s
Generation tokens/second: 48.17
3.) [maple-medicine] Question   --> Do you know who the authors of "isotype profile and clinical relevance of anticardiolipin antibodies in sjogren s syndrome" are?
Input tokens: 282
Output tokens: 7
Prefill time: 0.0708s
Generation time: 0.1018s
Generation tokens/second: 68.74
4.) [maple-medicine] Question   --> Who wrote the "bcr breakpoint and prognosis of chronic phase chronic myeloid leukemia letter comment" paper?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 274
Output tokens: 4
Prefill time: 0.0737s
Generation time: 0.0443s
Generation tokens/second: 90.32
5.) [maple-medicine] Question   --> Who are the authors of paper "patterns and trends of newly diagnosed hiv infections among adults and adolescents in correctional and noncorrectional facilities united states 2008 2011?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 290
Output tokens: 101
Prefill time: 0.0748s
Generation time: 2.2205s
Generation tokens/second: 45.48
6.) [maple-medicine] Question   --> Who are the authors of paper "assessing the risk of breast cancer?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 268
Output tokens: 25
Prefill time: 0.0711s
Generation time: 0.4383s
Generation tokens/second: 57.04
7.) [maple-medicine] Question   --> Who wrote the "levels of phosphorylated p70 s6 kinase and phosphorylated erk1 2 predict overall survival and time to first treatment in chronic lymphocytic leukaemia" paper?
Input tokens: 291
Output tokens: 4
Prefill time: 0.1067s
Generation time: 0.0405s
Generation tokens/second: 98.85
8.) [maple-medicine] Question   --> Can you tell me the authors of the paper "altitude pulmonary edema below 8 000 feet what are we missing?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 277
Output tokens: 31
Prefill time: 0.0720s
Generation time: 0.6249s
Generation tokens/second: 49.61
9.) [maple-medicine] Question   --> Can you tell me the authors of the paper "comparison of visual acuity measured with allen figures and snellen letters using the b vat ii monitor?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 282
Output tokens: 41
Prefill time: 0.0701s
Generation time: 0.8666s
Generation tokens/second: 47.31
10.) [maple-medicine] Question   --> Who wrote the "an open label prospective pilot clinical study of denosumab for severe hyperparathyroidism in patients with low bone mass undergoing dialysis" paper?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 287
Output tokens: 27
Prefill time: 0.0737s
Generation time: 0.5337s
Generation tokens/second: 50.59
1.) [maple-materials-science] Question   --> Could you tell me who are the authors of "sts observations of landau levels at graphite surfaces?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 272
Output tokens: 56
Prefill time: 0.0726s
Generation time: 1.1807s
Generation tokens/second: 47.43
2.) [maple-materials-science] Question   --> Could you tell me who are the authors of "formation of single quantum dot in single walled carbon nanotube channel using focused ion beam technique?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 282
Output tokens: 69
Prefill time: 0.0723s
Generation time: 1.5867s
Generation tokens/second: 43.49
3.) [maple-materials-science] Question   --> Could you tell me who are the authors of "high sensitivity nanometer scale infrared spectroscopy using a contact mode microcantilever with an internal resonator paddle?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 285
Output tokens: 42
Prefill time: 0.0742s
Generation time: 0.8898s
Generation tokens/second: 47.20
4.) [maple-materials-science] Question   --> Who wrote the paper "unraveling doping capability of conjugated polymers for strategic manipulation of electric dipole layer toward efficient charge collection in perovskite solar cells?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 287
Output tokens: 42
Prefill time: 0.0712s
Generation time: 0.8543s
Generation tokens/second: 49.16
5.) [maple-materials-science] Question   --> Who are the authors of paper "isomerization of azobenzene and the enhancement of dynamic heterogeneities in molecular glass formers?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 281
Output tokens: 37
Prefill time: 0.0738s
Generation time: 0.7961s
Generation tokens/second: 46.48
6.) [maple-materials-science] Question   --> Who are the authors of paper "ni2p grown in situ on milled black phosphorus flakes and its high energy storage performance?" 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 279
Output tokens: 36
Prefill time: 0.0734s
Generation time: 0.7743s
Generation tokens/second: 46.50
7.) [maple-materials-science] Question   --> Who wrote the paper "promotion of sers and catalytic activities with bimetallic and ternary concave nanolayers?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 279
Output tokens: 43
Prefill time: 0.0698s
Generation time: 0.9067s
Generation tokens/second: 47.43
8.) [maple-materials-science] Question   --> Can you tell me the authors of the paper titled "tensile properties of high strength polyacrylonitrile pan based and high modulus pitch based hybrid carbon fibers reinforced epoxy matrix composite?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 291
Output tokens: 40
Prefill time: 0.0754s
Generation time: 0.7977s
Generation tokens/second: 50.14
9.) [maple-materials-science] Question   --> Can you tell me the authors of the paper titled "hydration of mgo 100 surface promoted at 011 steps?"


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input tokens: 276
Output tokens: 50
Prefill time: 0.0737s
Generation time: 1.0447s
Generation tokens/second: 47.86
10.) [maple-materials-science] Question   --> Who wrote the paper "microstructure and growth model for rice hull derived sic whiskers?"
Input tokens: 270
Output tokens: 6
Prefill time: 0.0724s
Generation time: 0.0774s
Generation tokens/second: 77.56


In [15]:
from mlhq.utils import get_datetime_str
revised_model_name = model_name.replace('/','--')
write_json(results, f"./baseline-{revised_model_name}-{get_datetime_str()}.json")

In [11]:
num_input_tokens = []
num_output_tokens = []
ttfts =[] 
total_latencies = [] 
tpss = [] 

for k,v in results.items(): 
    #print(k)
    if k in domains:
        for q in v: 
            num_input_tokens.append(q['num-input-tokens'])
            num_output_tokens.append(q['num-output-tokens'])
            ttfts.append(q['ttft'])
            total_latencies.append(q['total-latency'])
            tpss.append(q['tps'])
        

TypeError: string indices must be integers

In [None]:
tpss

In [None]:
# Plots tpss (y) as a function of output tokens lenght(x) and input tokens
import matplotlib.pyplot as plt

# Assuming num_output_tokens and tpss are already populated from your loop above

plt.figure(figsize=(10,6))

# Scatter plot
plt.scatter(num_output_tokens, tpss, marker='o')

# Adding titles and labels
plt.title(f'TPS vs. Number of Output Tokens [{model_name}]', fontsize=14)
plt.xlabel('Number of Output Tokens', fontsize=12)
plt.ylabel('TPS', fontsize=12)

# Optional: adding grid for better readability
plt.grid(True, linestyle='--', alpha=0.6)

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline

# Convert to numpy arrays
x = np.array(num_output_tokens)
y = np.array(tpss)

# Average TPS values for each unique output token count
unique_x = np.unique(x)
average_y = np.array([y[x == val].mean() for val in unique_x])

# Spline interpolation
x_smooth = np.linspace(unique_x.min(), unique_x.max(), 300)
spl = make_interp_spline(unique_x, average_y, k=3)
y_smooth = spl(x_smooth)

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(x, y, marker='o', label='Original Data')
plt.plot(x_smooth, y_smooth, color='red', linewidth=2, label='Spline Curve (Averaged)')

# Labels and Grid
plt.title(f'TPS vs. Number of Output Tokens [{model_name}]', fontsize=14)
plt.xlabel('Number of Output Tokens', fontsize=12)
plt.ylabel('Tokens/Second', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()

plt.show()
