In [1]:
import os
import gzip, json, time
import torch 
import numpy as np
from pathlib import Path
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from bert_score import score
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import onnx
import onnxruntime as ort
from tqdm import tqdm
import re
from bert_score import score as bert_score


In [10]:
torch.cuda.empty_cache()
import gc
gc.collect()


354

In [6]:
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [8]:
def get_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    dtype_size = torch.finfo(model.dtype).bits // 8
    
    model_size_gb = total_params * dtype_size / (1024 ** 3)
    print(f"Model size: {model_size_gb:2f} GB")

In [9]:
get_model_size(model)

Model size: 25.103043 GB


### Floating Point 15 params

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf",
    torch_dtype=torch.float16,
    # device_map={"": 0},
    # low_cpu_mem_usage=True
    device_map="cuda:0"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
get_model_size(model)

Model size: 12.551521 GB


In [13]:
!nvidia-smi

Sun May 11 11:42:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:27:00.0 Off |                    0 |
| N/A   57C    P0             80W /  300W |   14139MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [14]:
data_path = "/mnt/object_group/data/"

In [15]:
train = "/mnt/object_group/data/processed/train.jsonl.gz"
with gzip.open(train, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print("    " + line.strip())
        if i >= 10:
            break


    {"comment_id": 74291026, "comment_user_login": "vikerman", "comment_body": "Is there a better way to avoid having this in different places?\n", "comment_created_at": "2016-08-10T17:30:42+00:00", "comment_html_url": "https://github.com/angular/angular/pull/10620#discussion_r74291026", "comment_path": "modules/@angular/common/src/forms-deprecated/directives/abstract_control_directive.ts", "comment_position": 10, "comment_original_position": 10, "comment_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "comment_original_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "diff_line_content": "}\n", "diff_line_type": "added", "diff_line_source_no": null, "diff_line_target_no": 15, "diff_hunk_header": "", "diff": "@@ -6,10 +6,13 @@\n  * found in the LICENSE file at https://angular.io/license\n  */\n \n-import {unimplemented} from '../../facade/exceptions';\n+import {BaseException} from '@angular/core';\n import {isPresent} from '../../facade/lang';\n import {AbstractControl} 

In [16]:
DATA_ROOT = Path("/mnt/object_group/data/processed")  # Adjust as needed

def load_samples(limit_per_split=10):
    diff_samples = []
    for repo_dir in DATA_ROOT.iterdir():
        if not repo_dir.is_dir():
            continue
        for pr_dir in (repo_dir / "diff").iterdir():
            pr_id = pr_dir.name
            diff_file = pr_dir
            comments_file = DATA_ROOT / repo_dir.name / "comments" / f"{repo_dir.name}_{pr_id}_comments.jsonl"

            if not diff_file.exists() or not comments_file.exists():
                continue

            with open(diff_file, "r", encoding="utf-8") as df:
                diff_content = df.read()

            with open(comments_file, "r", encoding="utf-8") as cf:
                for line in cf:
                    try:
                        comment = json.loads(line)
                        offset = comment.get("original_position")
                        side = comment.get("side", "RIGHT")
                        body = comment.get("body", "").strip()
                        if offset is not None and body:
                            sample = {
                                "input": f"<DIFF>\n{diff_content}\n</DIFF>\n<COMMENT side=\"{side}\" offset=\"{offset}\">",
                                "output": body
                            }
                            diff_samples.append(sample)
                    except json.JSONDecodeError:
                        continue

    random.shuffle(diff_samples)
    return {
        "train": diff_samples[:limit_per_split],
        "test": diff_samples[limit_per_split:2*limit_per_split],
        "eval": diff_samples[2*limit_per_split:3*limit_per_split],
    }


In [17]:
def load_jsonl_gz(file_path, limit):
    samples = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            entry = json.loads(line)
            prompt = f"<DIFF>\n{entry['diff']}\n</DIFF>\n"
            comment = f"<COMMENT side=\"{entry['side']}\" offset=\"{entry['line_offset']}\">{entry['comment_body']}"
            samples.append({'input': prompt, 'label': comment})
    return samples

train_data = load_jsonl_gz("/mnt/object_group/data/processed/train.jsonl.gz", 10)
test_data  = load_jsonl_gz("/mnt/object_group/data/processed/test.jsonl.gz", 8)
eval_data  = load_jsonl_gz("/mnt/object_group/data/processed/val.jsonl.gz", 4)

In [18]:
print(train_data[0].keys())

for i in range(3):
    print(train_data[i]["input"])
    print(train_data[i]["label"])
# print(train_data[0]["input"])

dict_keys(['input', 'label'])
<DIFF>
@@ -6,10 +6,13 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/lang';
 import {AbstractControl} from '../model';
 
+function unimplemented(): any {
+  throw new BaseException('unimplemented');
</DIFF>

<COMMENT side="RIGHT" offset="9">Is there a better way to avoid having this in different places?

<DIFF>
@@ -6,10 +6,13 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/lang';
 import {AbstractControl} from '../model';
 
+function unimplemented(): any {
+  throw new BaseException('unimplemented');
</DIFF>

<COMMENT side="RIGHT" offset="9">we could export it in the public API but that's not nice.
Or we could inline the method and del

In [19]:
# Prompt template
instruction = (
    "You are a helpful code reviewer. "
    "Given a code diff, generate all relevant review comments. "
    "Each comment must be in the format: "
    "<COMMENT side=\"RIGHT\" offset=\"X\">Your comment here.\n"
    "Only output comments, nothing else.\n\n"
    "### Code Diff:\n"
)

In [20]:
def extract_comments(text):
    pattern = re.compile(r'<COMMENT side="(?P<side>[^"]+)" offset="(?P<offset>\d+)">(?P<comment>.+)')
    results = []
    for line in text.strip().splitlines():
        match = pattern.match(line.strip())
        if match:
            results.append({
                "side": match.group("side"),
                "offset": int(match.group("offset")),
                "comment": match.group("comment").strip()
            })
    return results

In [21]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [22]:
def offset_match(pred_comments, true_comments, window=3):
    correct = 0
    total = len(pred_comments)

    gt_pairs = {(c["side"], c["offset"]) for c in true_comments}
    matched = set()

    for pred in pred_comments:
        pred_side, pred_offset = pred["side"], pred["offset"]
        for gt in gt_pairs:
            gt_side, gt_offset = gt
            if pred_side == gt_side and abs(pred_offset - gt_offset) <= window:
                matched.add(gt)
                correct += 1
                break  # count each pred only once

    return correct, total

In [25]:
references = []
candidates = []
offset_match_total = 0
offset_match_correct = 0

for i, sample in enumerate(tqdm(test_data[:100])):  # Full test set if needed
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction + sample["input"]}],
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=4096
    ).to("cuda:0")

    true_comments = extract_comments(sample["label"])

    # Measure inference latency
    with torch.no_grad():
        start = time.time()
        output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
        latencies.append(time.time() - start)

    # Decode and evaluate
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    generated_text = generated_text.split("Setting `pad_token_id")[0].strip()
    print(f"\n=== Sample {i} ===\n{generated_text}")

    pred_comments = extract_comments(generated_text)

    for gt in true_comments:
        references.append(gt["comment"])
        best_pred = max(pred_comments, key=lambda p: p["comment"], default={"comment": ""})
        candidates.append(best_pred["comment"] if best_pred else "")

    correct, total = offset_match(pred_comments, true_comments, window=3)
    offset_match_correct += correct
    offset_match_total += total

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 0 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -838,45 +839,58 @@ function performConcurrentWorkOnRoot(root, didTimeout) {
       throw fatalError;
     }
 
-    // Check if this render may have yielded to a concurrent event, and if so,
-    // confirm that any newly rendered stores are consistent.
-    // TODO: It's possible that even a concurrent render may never have yielded
-    // to the main thread, if it was fast enough, or if it expired. We could
-    // skip the consistency check in that case, too.
-    const renderWasConcurrent = !includesBlockingLane(root, lanes);
-    const finishedWork: Fiber = (root.current.alternate: any);
-    if (
-      renderWasConcurrent &&
-      !isRenderConsistentWithExternalStores(finishedWork)
-    ) {
-      // A store was mutat

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 1 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -838,45 +839,58 @@ function performConcurrentWorkOnRoot(root, didTimeout) {
       throw fatalError;
     }
 
-    // Check if this render may have yielded to a concurrent event, and if so,
-    // confirm that any newly rendered stores are consistent.
-    // TODO: It's possible that even a concurrent render may never have yielded
-    // to the main thread, if it was fast enough, or if it expired. We could
-    // skip the consistency check in that case, too.
-    const renderWasConcurrent = !includesBlockingLane(root, lanes);
-    const finishedWork: Fiber = (root.current.alternate: any);
-    if (
-      renderWasConcurrent &&
-      !isRenderConsistentWithExternalStores(finishedWork)
-    ) {
-      // A store was mutat

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 2 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -1149,6 +1163,10 @@ function performSyncWorkOnRoot(root) {
     throw fatalError;
   }
 
+  if (exitStatus === RootDidNotComplete) {
</DIFF> [/INST]  <COMMENT side="RIGHT" offset="1150">This line is not needed. The `throw` statement will already cause the function to exit with an error. It's not necessary to check the `exitStatus` variable and throw a new error if it's not `RootCompleted`.</COMMENT>

<COMMENT side="RIGHT" offset="1164">This line is not needed. The `if` statement is already checking if the `exitStatus` variable is equal to `RootDidNotComplete`. It's not necessary to check the `exitStatus` variable again in the `if` statement's body.</COMMENT>


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 3 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -739,14 +739,20 @@ describe('ReactCache', () => {
     await act(async () => {
       refresh();
     });
-    expect(Scheduler).toHaveYielded(['Cache miss! [A]', 'Loading...']);
+    expect(Scheduler).toHaveYielded([
+      'Cache miss! [A]',
+      'Loading...',
+      // TODO: This happens too early, because we don't retain the refreshed
+      // cache until it commits. Will fix in next step.
+      'Cache cleanup: A [v1]',
</DIFF> [/INST]  <COMMENT side="RIGHT" offset="10">This comment is suggesting that the cache cleanup is happening too early, before the refreshed cache has committed. This could be fixed by waiting for the cache to commit before cleaning up.</COMMENT>

<COMMENT side="RIGHT" offset="15">This comment is

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 4 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -739,14 +739,20 @@ describe('ReactCache', () => {
     await act(async () => {
       refresh();
     });
-    expect(Scheduler).toHaveYielded(['Cache miss! [A]', 'Loading...']);
+    expect(Scheduler).toHaveYielded([
+      'Cache miss! [A]',
+      'Loading...',
+      // TODO: This happens too early, because we don't retain the refreshed
+      // cache until it commits. Will fix in next step.
+      'Cache cleanup: A [v1]',
</DIFF> [/INST]  <COMMENT side="RIGHT" offset="10">This comment is suggesting that the cache cleanup is happening too early, before the refreshed cache has committed. This could be fixed by waiting for the cache to commit before cleaning up.</COMMENT>

<COMMENT side="RIGHT" offset="15">This comment is

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 5 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -739,14 +739,20 @@ describe('ReactCache', () => {
     await act(async () => {
       refresh();
     });
-    expect(Scheduler).toHaveYielded(['Cache miss! [A]', 'Loading...']);
+    expect(Scheduler).toHaveYielded([
+      'Cache miss! [A]',
+      'Loading...',
+      // TODO: This happens too early, because we don't retain the refreshed
+      // cache until it commits. Will fix in next step.
+      'Cache cleanup: A [v1]',
</DIFF> [/INST]  <COMMENT side="RIGHT" offset="10">This comment is suggesting that the cache cleanup is happening too early, before the refreshed cache has committed. This could be fixed by waiting for the cache to commit before cleaning up.</COMMENT>

<COMMENT side="RIGHT" offset="15">This comment is

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Sample 6 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -0,0 +1,197 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @emails react-core
+ */
+
+let JSDOM;
+let React;
+let ReactDOM;
+let Scheduler;
+let clientAct;
+let ReactDOMFizzServer;
+let Stream;
+let document;
+let writable;
+let container;
+let buffer = '';
+let hasErrored = false;
+let fatalError = undefined;
+let textCache;
+
+describe('useId', () => {
+  beforeEach(() => {
+    jest.resetModules();
+    JSDOM = require('jsdom').JSDOM;
+    React = require('react');
+    ReactDOM = require('react-dom');
+    Scheduler = require('scheduler');
+    clientAct = require('jest

100%|██████████| 8/8 [00:58<00:00,  7.37s/it]


=== Sample 7 ===
[INST] You are a helpful code reviewer. Given a code diff, generate all relevant review comments. Each comment must be in the format: <COMMENT side="RIGHT" offset="X">Your comment here.
Only output comments, nothing else.

### Code Diff:
<DIFF>
@@ -389,17 +389,6 @@ describe('ReactSuspense', () => {
     expect(root).toMatchRenderedOutput('Hi');
   });
 
-  it('throws if tree suspends and none of the Suspense ancestors have a boundary', () => {
</DIFF> [/INST]  <COMMENT side="RIGHT" offset="389">This test seems to be testing the behavior of the `ReactSuspense` component when it is used in a situation where it is not expected to be used. The test is checking that an error is thrown when the component suspends and none of the Suspense ancestors have a boundary. However, this test is not actually testing the behavior of the `ReactSuspense` component in this situation. It would be more useful to test the behavior of the component in a real-world scenario where it is actual




In [26]:
# Latency stats
print("\n=== Inference Latency Stats ===")
print(f"Median latency:        {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"95th percentile:       {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"99th percentile:       {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Throughput (1 sample): {len(latencies) / np.sum(latencies):.2f} samples/sec")


=== Inference Latency Stats ===
Median latency:        7757.82 ms
95th percentile:       7990.19 ms
99th percentile:       8111.77 ms
Throughput (1 sample): 0.13 samples/sec


In [28]:
# BERTScore
P, R, F1 = bert_score(candidates, references, lang="en", verbose=True)
print("\nEvaluation Results")
print(f"Avg BERTScore F1: {F1.mean().item():.4f}")
print(f"Offset Precision: {offset_match_correct}/{offset_match_total} = {offset_match_correct/offset_match_total:.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.10 seconds, 83.01 sentences/sec

Evaluation Results
Avg BERTScore F1: 0.7252
Offset Precision: 0/23 = 0.0000




Batch throughput test

In [None]:
batch_size = 4
num_batches = 10
batch_inputs = [test_data[i]["input"] for i in range(batch_size)]
batch_prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction + inp}],
        tokenize=False,
        add_generation_prompt=True
    )
    for inp in batch_inputs
]

encoded = tokenizer(
    batch_prompts,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=4096
).to("cuda:0")

# Warm-up
with torch.no_grad():
    _ = model.generate(**encoded, max_new_tokens=256, do_sample=False)

batch_times = []
with torch.no_grad():
    for _ in range(num_batches):
        start = time.time()
        _ = model.generate(**encoded, max_new_tokens=256, do_sample=False)
        batch_times.append(time.time() - start)

batch_fps = (batch_size * num_batches) / np.sum(batch_times)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
print("\n=== Final Evaluation Summary ===")
print(f"Model Size on Disk:               {model_size_bytes / 1e6:.2f} MB")
print(f"BERTScore F1 (avg):               {F1.mean().item():.4f}")
print(f"Location Precision:               {offset_match_correct}/{offset_match_total} = {offset_match_correct / offset_match_total:.4f}")
print(f"Inference Latency (median):       {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (95th pct):     {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (99th pct):     {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (1 sample):  {len(latencies) / np.sum(latencies):.2f} samples/sec")
print(f"Batch Throughput ({batch_size}):  {batch_fps:.2f} samples/sec")
