In [None]:
import os
import gzip, json, time
import torch 
import numpy as np
from pathlib import Path
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from bert_score import score
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import onnx
import onnxruntime as ort
from tqdm import tqdm
import re
from bert_score import score as bert_score
import neural_compressor

In [2]:
model_name = "codellama/CodeLlama-7b-Instruct-hf"

In [3]:
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def get_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    dtype_size = torch.finfo(model.dtype).bits // 8
    
    model_size_gb = total_params * dtype_size / (1024 ** 3)
    # print(f"Model size: {model_size_gb:2f} GB")
    return model_size_gb

In [5]:
get_model_size(model)

25.103042602539062

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf",
    torch_dtype=torch.float16,
    # device_map={"": 0},
    # low_cpu_mem_usage=True
    device_map="cuda:0"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
!nvidia-smi

Mon May 12 02:31:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:0F:00.0 Off |                    0 |
| N/A   29C    P0            114W /  700W |   13384MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
# data_path = "/mnt/object_group/data/"

In [9]:
train = "train.jsonl.gz"
with gzip.open(train, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print("    " + line.strip())
        if i >= 5:
            break


    {"comment_id": 74291026, "comment_user_login": "vikerman", "comment_body": "Is there a better way to avoid having this in different places?\n", "comment_created_at": "2016-08-10T17:30:42+00:00", "comment_html_url": "https://github.com/angular/angular/pull/10620#discussion_r74291026", "comment_path": "modules/@angular/common/src/forms-deprecated/directives/abstract_control_directive.ts", "comment_position": 10, "comment_original_position": 10, "comment_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "comment_original_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "diff_line_content": "}\n", "diff_line_type": "added", "diff_line_source_no": null, "diff_line_target_no": 15, "diff_hunk_header": "", "diff": "@@ -6,10 +6,13 @@\n  * found in the LICENSE file at https://angular.io/license\n  */\n \n-import {unimplemented} from '../../facade/exceptions';\n+import {BaseException} from '@angular/core';\n import {isPresent} from '../../facade/lang';\n import {AbstractControl} 

In [10]:
DATA_ROOT = Path("./")  # Adjust as needed

def load_samples(limit_per_split=10):
    diff_samples = []
    for repo_dir in DATA_ROOT.iterdir():
        if not repo_dir.is_dir():
            continue
        for pr_dir in (repo_dir / "diff").iterdir():
            pr_id = pr_dir.name
            diff_file = pr_dir
            comments_file = DATA_ROOT / repo_dir.name / "comments" / f"{repo_dir.name}_{pr_id}_comments.jsonl"

            if not diff_file.exists() or not comments_file.exists():
                continue

            with open(diff_file, "r", encoding="utf-8") as df:
                diff_content = df.read()

            with open(comments_file, "r", encoding="utf-8") as cf:
                for line in cf:
                    try:
                        comment = json.loads(line)
                        offset = comment.get("original_position")
                        side = comment.get("side", "RIGHT")
                        body = comment.get("body", "").strip()
                        if offset is not None and body:
                            sample = {
                                "input": f"<DIFF>\n{diff_content}\n</DIFF>\n<COMMENT side=\"{side}\" offset=\"{offset}\">",
                                "output": body
                            }
                            diff_samples.append(sample)
                    except json.JSONDecodeError:
                        continue

    random.shuffle(diff_samples)
    return {
        "train": diff_samples[:limit_per_split],
        "test": diff_samples[limit_per_split:2*limit_per_split],
        "eval": diff_samples[2*limit_per_split:3*limit_per_split],
    }


In [11]:
def load_jsonl_gz(file_path, limit):
    samples = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            entry = json.loads(line)
            prompt = f"<DIFF>\n{entry['diff']}\n</DIFF>\n"
            comment = f"<COMMENT side=\"{entry['side']}\" offset=\"{entry['line_offset']}\">{entry['comment_body']}"
            samples.append({'input': prompt, 'label': comment})
    return samples

train_data = load_jsonl_gz("train.jsonl.gz", 1000000000)
test_data  = load_jsonl_gz("test.jsonl.gz", 1000000000)
eval_data  = load_jsonl_gz("val.jsonl.gz", 1000000000)

In [12]:
print(len(test_data))
print(len(eval_data))

8952
8308


In [13]:
print(train_data[0].keys())

for i in range(3):
    print(train_data[i]["input"])
    print(train_data[i]["label"])
# print(train_data[0]["input"])

dict_keys(['input', 'label'])
<DIFF>
@@ -6,10 +6,13 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/lang';
 import {AbstractControl} from '../model';
 
+function unimplemented(): any {
+  throw new BaseException('unimplemented');
</DIFF>

<COMMENT side="RIGHT" offset="9">Is there a better way to avoid having this in different places?

<DIFF>
@@ -6,10 +6,13 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/lang';
 import {AbstractControl} from '../model';
 
+function unimplemented(): any {
+  throw new BaseException('unimplemented');
</DIFF>

<COMMENT side="RIGHT" offset="9">we could export it in the public API but that's not nice.
Or we could inline the method and del

In [14]:
# Prompt template
instruction = (
    "You are a helpful code reviewer. "
    "Given a code diff, generate all relevant review comments. "
    "Each comment must be in the format: "
    "<COMMENT side=\"RIGHT\" offset=\"X\">Your comment here.\n"
    "Only output comments, nothing else.\n\n"
    "### Code Diff:\n"
)

In [15]:
def extract_comments(text):
    pattern = re.compile(r'<COMMENT side="(?P<side>[^"]+)" offset="(?P<offset>\d+)">(?P<comment>.+)')
    results = []
    for line in text.strip().splitlines():
        match = pattern.match(line.strip())
        if match:
            results.append({
                "side": match.group("side"),
                "offset": int(match.group("offset")),
                "comment": match.group("comment").strip()
            })
    return results

In [16]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [17]:
def offset_match(pred_comments, true_comments, window=3):
    correct = 0
    total = len(pred_comments)

    gt_pairs = {(c["side"], c["offset"]) for c in true_comments}
    matched = set()

    for pred in pred_comments:
        pred_side, pred_offset = pred["side"], pred["offset"]
        for gt in gt_pairs:
            gt_side, gt_offset = gt
            if pred_side == gt_side and abs(pred_offset - gt_offset) <= window:
                matched.add(gt)
                correct += 1
                break  # count each pred only once

    return correct, total

In [19]:
references = []
candidates = []
offset_match_total = 0
offset_match_correct = 0
latencies = []

tokenizer = AutoTokenizer.from_pretrained("./models/llama_onnx")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

for i, sample in enumerate(tqdm(test_data[:30])):  # Full test set if needed
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction + sample["input"]}],
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=4096
    ).to("cuda:0")

    true_comments = extract_comments(sample["label"])

    # Measure inference latency
    with torch.no_grad():
        start = time.time()
        output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
        latencies.append(time.time() - start)

    # Decode and evaluate
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    generated_text = generated_text.split("Setting `pad_token_id")[0].strip()
    # print(f"\n=== Sample {i} ===\n{generated_text}")

    pred_comments = extract_comments(generated_text)

    for gt in true_comments:
        references.append(gt["comment"])
        best_pred = max(pred_comments, key=lambda p: p["comment"], default={"comment": ""})
        # print(best_pred)
        candidates.append(best_pred["comment"] if best_pred else "")

    correct, total = offset_match(pred_comments, true_comments, window=3)
    offset_match_correct += correct
    offset_match_total += total

  0%|          | 0/30 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 1/30 [00:05<02:39,  5.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 2/30 [00:11<02:34,  5.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 10%|█         | 3/30 [00:14<01:59,  4.41s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 13%|█▎        | 4/30 [00:19<02:06,  4.85s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 17%|█▋        | 5/30 [00:25<02:07,  5.09s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 20%|██        | 6/30 [00:30<02:06,  5.25s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 23%|██▎       | 7/30 [00:36<02:02,  5.34s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 27%|██▋       | 8/30 [00:41<01:58,  5.39s/it]Setting `pad_token_id` to 

In [20]:
# Latency stats
print("\n=== Inference Latency Stats ===")
print(f"Median latency:        {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"95th percentile:       {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"99th percentile:       {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Throughput (1 sample): {len(latencies) / np.sum(latencies):.2f} samples/sec")


=== Inference Latency Stats ===
Median latency:        5502.59 ms
95th percentile:       5514.66 ms
99th percentile:       5544.22 ms
Throughput (1 sample): 0.19 samples/sec


In [21]:
# BERTScore
P, R, F1 = bert_score(candidates, references, lang="en", verbose=True)
print("\nEvaluation Results")
print(f"Avg BERTScore F1: {F1.mean().item():.4f}")
print(f"Offset Precision: {offset_match_correct}/{offset_match_total} = {offset_match_correct/offset_match_total:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.21 seconds, 146.01 sentences/sec

Evaluation Results
Avg BERTScore F1: 0.8023
Offset Precision: 2/98 = 0.0204




# Batch throughput

In [22]:
batch_size = 4
num_batches = 10
batch_inputs = [test_data[i]["input"] for i in range(batch_size)]
batch_prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction + inp}],
        tokenize=False,
        add_generation_prompt=True
    )
    for inp in batch_inputs
]

encoded = tokenizer(
    batch_prompts,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=4096
).to("cuda:0")

# Warm-up
with torch.no_grad():
    _ = model.generate(**encoded, max_new_tokens=256, do_sample=False)

batch_times = []
with torch.no_grad():
    for _ in range(num_batches):
        start = time.time()
        _ = model.generate(**encoded, max_new_tokens=256, do_sample=False)
        batch_times.append(time.time() - start)

batch_fps = (batch_size * num_batches) / np.sum(batch_times)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [23]:
print("\n=== Final Evaluation Summary ===")
print(f"Model Size on Disk:               {get_model_size(model):.2f}")
print(f"BERTScore F1 (avg):               {F1.mean().item():.4f}")
print(f"Location Precision:               {offset_match_correct}/{offset_match_total} = {offset_match_correct / offset_match_total:.4f}")
print(f"Inference Latency (median):       {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (95th pct):     {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (99th pct):     {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (1 sample):  {len(latencies) / np.sum(latencies):.2f} samples/sec")
print(f"Batch Throughput ({batch_size}):  {batch_fps:.2f} samples/sec")



=== Final Evaluation Summary ===
Model Size on Disk:               12.55
BERTScore F1 (avg):               0.8023
Location Precision:               2/98 = 0.0204
Inference Latency (median):       5502.59 ms
Inference Latency (95th pct):     5514.66 ms
Inference Latency (99th pct):     5544.22 ms
Inference Throughput (1 sample):  0.19 samples/sec
Batch Throughput (4):  0.44 samples/sec


In [24]:
!nvidia-smi

Mon May 12 02:39:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:0F:00.0 Off |                    0 |
| N/A   42C    P0            382W /  700W |   35146MiB /  81559MiB |     98%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Converting to ONNX

In [25]:
tokenizer.save_pretrained("./models/llama_onnx")
# tokenizer.save_pretrained("mnt/object/llama_onnx")

('./models/llama_onnx/tokenizer_config.json',
 './models/llama_onnx/special_tokens_map.json',
 './models/llama_onnx/tokenizer.json')

In [26]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [27]:
def make_review_prompt(diff_text: str) -> str:
    instruction = (
        "You are a helpful code reviewer. "
        "Given a code diff, generate all relevant review comments. "
        "Each comment must be in the format: "
        "<COMMENT side=\"RIGHT\" offset=\"X\">Your comment here.\n"
        "Only output comments, nothing else.\n\n"
        "### Code Diff:\n"
    )
    
    # Use the same tokenizer chat template as in inference
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction + diff_text}],
        tokenize=False,
        add_generation_prompt=True
    )
    return prompt


In [28]:
# Load directly onto the first CUDA device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,      # halve VRAM use
    device_map={"": 0},             # force everything to cuda:0
    use_cache=False                 # strip DynamicCache to keep export simple
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dummy input on GPU
inputs = tokenizer(test_data[0]["input"], return_tensors="pt").to("cuda:0")
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

# Export
torch.onnx.export(
    model,
    (input_ids, attention_mask),
    "models/llama_onnx/codellama_7b_gpu.onnx",
    input_names  = ["input_ids", "attention_mask"],
    output_names = ["logits"],
    dynamic_axes = {
        "input_ids":      {0: "batch", 1: "seq"},
        "attention_mask": {0: "batch", 1: "seq"},
        "logits":         {0: "batch", 1: "seq"},
    },
    opset_version       = 17,
    export_params       = True,
    do_constant_folding = True,
    external_data_format= True
)
print("Export complete (GPU path)")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  if sequence_length != 1:


Export complete (GPU path)


In [29]:
def get_total_folder_size(path):
    total_bytes = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_bytes += os.path.getsize(fp)
    return total_bytes

size_bytes = get_total_folder_size("models/llama_onnx")
print(f"Total size: {size_bytes / 1e6:.2f} MB ({size_bytes / 1e9:.2f} GB)")


Total size: 13482.13 MB (13.48 GB)


### Graph Optimizations

In [41]:
device = "cuda" if torch.cuda.is_available() else "cpu"


In [46]:
import onnxruntime as ort

optimized_path = "models/codellama_7b_graph_opt.onnx"
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_options.optimized_model_filepath = optimized_path

_ = ort.InferenceSession(model_path, sess_options=session_options, providers=["CUDAExecutionProvider"])
print(f"Graph-optimized model saved to {optimized_path}")

Graph-optimized model saved to models/codellama_7b_graph_opt.onnx


In [None]:
def benchmark_model_latency(model_path, tokenizer_path, prompts, max_length=128):
    # Load ONNX model
    session = ort.InferenceSession(model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    latencies = []

    for prompt in prompts:
        input_data = tokenizer(prompt, return_tensors="np", padding="max_length", truncation=True, max_length=max_length)
        input_ids = input_data["input_ids"]
        attention_mask = input_data["attention_mask"]

        start = time.time()
        _ = session.run(["logits"], {"input_ids": input_ids, "attention_mask": attention_mask})
        latency = (time.time() - start) * 1000  # in milliseconds
        latencies.append(latency)

    print("=== Benchmark Results ===")
    for i, prompt in enumerate(prompts):
        print(f"Prompt {i+1}: {latencies[i]:.2f} ms")

    print(f"Average Latency: {np.mean(latencies):.2f} ms")

    return latencies, np.mean(latencies)


benchmark_model_latency("/mnt/object/llama_onnx/codellama_7b_gpu.onnx", "/mnt/object/llama_onnx", test_prompts)

=== Benchmark Results ===
Average Latency: 4.98 ms


In [48]:
onnx_model_path = "models/llama_onnx/codellama_7b_gpu.onnx"
optimized_model_path = "models/llama_onnx/codellama_7b_gpu_graph_optimized.onnx"

In [None]:
!nvidia-smi

### Dynamic Quantization

In [None]:
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)
config_ptq = neural_compressor.PostTrainingQuantConfig(
    approach="dynamic"
)

q_model = quantization.fit(
    model=fp32_model, 
    conf=config_ptq
)

q_model.save_model_to_file("models/codellama/codellama_7b_gpu_dynamic.onnx")
benchmark_session("models/llama_onnx, codellama_7b_gpu_dynamic.onnx", "models/llama_onnx", test_prompts)

=== Benchmark Results ===
Average Latency: 3.28 ms


### Static Quantization

In [None]:
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)
config_ptq = neural_compressor.PostTrainingQuantConfig(
    approach="static"
)

q_model = quantization.fit(
    model=fp32_model, 
    conf=config_ptq
)

q_model.save_model_to_file("models/codellama/codellama_7b_gpu_static.onnx")
benchmark_session("models/llama_onnx, codellama_7b_gpu_static.onnx", "models/llama_onnx", test_prompts)

=== Benchmark Results ===
Average Latency: 4.87 ms
