# Evaluate a model offline
- Loading our onnx model and performing offline Evaluation

In [1]:
!pip install mlflow
!pip install onnxruntime-gpu
!pip install transformers
!pip install rouge-score evaluate

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.52.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 k

In [18]:
import onnxruntime as ort
print("Available Providers:", ort.get_available_providers())

Available Providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [12]:
!nvidia-smi

Sun May 11 21:09:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             43W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import mlflow
import onnxruntime as ort
from transformers import AutoTokenizer
from pathlib import Path
import os

os.environ["MLFLOW_ARTIFACT_DOWNLOAD_PARALLEL"] = "false"
# Set MLflow tracking URI (if needed)
mlflow.set_tracking_uri("http://129.114.25.240:8000")  # Optional if already set

# Define your model URI
model_uri = "runs:/d83c3a778ab94075962dd67f724af964/onnx_model"  # or use "models:/ModelName/1" if registered

# Download ONNX model artifact folder
local_dir = Path(mlflow.artifacts.download_artifacts(artifact_uri=model_uri))
onnx_path = local_dir / "model.onnx"  # Adjust if the name is different

# Load ONNX Runtime Inference Session
ort_session = ort.InferenceSession(onnx_path.as_posix(), providers=["CUDAExecutionProvider"])

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]



KeyboardInterrupt: 

In [19]:
#If using Your Local
from pathlib import Path
import onnxruntime as ort

# ① Point to the model that’s already in your working directory
onnx_path = Path("/content/llama2-legal-onnx/model.onnx")      # ← adjust if it lives elsewhere

if not onnx_path.exists():
    raise FileNotFoundError(f"{onnx_path} not found!")

# ② See which execution providers this runtime offers
print("Available providers ➜", ort.get_available_providers())
# e.g. ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']

# ③ Pick a provider list (fastest first, CPU as fallback)
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]   # swap order if no GPU

# ④ Create the inference session
ort_session = ort.InferenceSession(onnx_path.as_posix(), providers=providers)

print("ONNX session ready →", ort_session.get_providers())


Available providers ➜ ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
ONNX session ready → ['CUDAExecutionProvider', 'CPUExecutionProvider']


In [20]:
ort_session = ort.InferenceSession(onnx_path.as_posix(), providers=providers)

print("ONNX session ready →", ort_session.get_providers())

ONNX session ready → ['CUDAExecutionProvider', 'CPUExecutionProvider']


In [21]:
import json

with open("/mnt/LlamaData/test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f]

system_prompt = "Summarize the following legal text."
inputs = [
    f"""### Instruction: {system_prompt}\n\n### Input:\n{item['judgement'].strip()[:10000]}\n\n### Response:\n"""
    for item in test_data
]
references = [item["summary"].strip() for item in test_data]

# Get Predictions for each test data

In [23]:
import time
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# ─────── Config ───────
MAX_NEW_TOKENS  = 100
TEMPERATURE     = 0.8
TOP_K           = 40
REPETITION_PEN  = 1.15
END_TOKENS      = {0, 2, 50256}
# ──────────────────────

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained(local_dir)
sess = ort_session

# Top-k sampling function
def sample_top_k(logits, top_k, temperature=1.0):
    logits = logits.astype(np.float32) / temperature
    if top_k and top_k < logits.size:
        top_ids = logits.argsort()[-top_k:]
        mask = np.ones_like(logits, dtype=bool)
        mask[top_ids] = False
        logits[mask] = -np.inf
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()
    return np.random.choice(len(logits), p=probs)

# Run for each input in list
generated_summaries = []
for i,prompt in enumerate(inputs):  # <- your list of prompts
    enc = tokenizer(prompt, return_tensors="np")
    input_ids = enc["input_ids"]
    attention_mask = enc["attention_mask"]

    generated = input_ids.copy()
    print("Prompt"+str(i))
    for _ in range(MAX_NEW_TOKENS):
        position_ids = np.arange(generated.shape[1], dtype=np.int64)[None, :]

        logits = sess.run(None, {
            "input_ids": generated.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids,
        })[0]

        logits[0, -1, np.unique(generated)] /= REPETITION_PEN
        next_id = sample_top_k(logits[0, -1], top_k=TOP_K, temperature=TEMPERATURE)
        if next_id in END_TOKENS:
            break

        next_token = np.array([[next_id]], dtype=np.int64)
        generated = np.concatenate([generated, next_token], axis=1)
        attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)], axis=1)

    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    generated_summaries.append(text)

print("✔️ Generated", len(generated_summaries), "summaries.")


Prompt0
Prompt1
Prompt2
Prompt3
Prompt4
Prompt5
Prompt6
Prompt7
Prompt8
Prompt9
Prompt10
Prompt11
Prompt12
Prompt13
Prompt14
Prompt15
Prompt16
Prompt17
Prompt18
Prompt19
Prompt20
Prompt21
Prompt22
Prompt23
Prompt24
Prompt25
Prompt26
Prompt27
Prompt28
Prompt29
Prompt30
Prompt31
Prompt32
Prompt33
Prompt34
Prompt35
Prompt36
Prompt37
Prompt38
Prompt39
Prompt40
Prompt41
Prompt42
Prompt43
Prompt44
Prompt45
Prompt46
Prompt47
Prompt48
Prompt49
Prompt50
Prompt51
Prompt52
Prompt53
Prompt54
Prompt55
Prompt56
Prompt57
Prompt58
Prompt59
Prompt60
Prompt61
Prompt62
Prompt63
Prompt64
Prompt65
Prompt66
Prompt67
Prompt68
Prompt69
Prompt70
Prompt71
Prompt72
Prompt73
Prompt74
Prompt75
Prompt76
Prompt77
Prompt78
Prompt79
Prompt80
Prompt81
Prompt82
Prompt83
Prompt84
Prompt85
Prompt86
Prompt87
Prompt88
Prompt89
Prompt90
Prompt91
Prompt92
Prompt93
Prompt94
Prompt95
Prompt96
Prompt97
Prompt98
Prompt99
Prompt100
Prompt101
Prompt102
Prompt103
Prompt104
Prompt105
Prompt106
Prompt107
Prompt108
Prompt109
Prompt110


KeyboardInterrupt: 

# Evaluate a Text Generation Model

In [25]:
from evaluate import load

rouge = load("rouge")
results = rouge.compute(predictions=generated_summaries, references=references[:570])
print("ROUGE Scores:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

ROUGE Scores:
rouge1: 0.4370
rouge2: 0.2787
rougeL: 0.2631
rougeLsum: 0.2638


In [29]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [31]:
# BLEU
bleu = load("bleu")
bleu_score = bleu.compute(predictions=generated_summaries, references=[[ref] for ref in references[:570]])
print("BLEU score:", bleu_score["bleu"])


BLEU score: 0.16900432425985754


In [32]:
print(f"\n{'Metric':<10} | {'Score':>8}")
print("-" * 22)
print(f"{'ROUGE-1':<10} | {results['rouge1']:.4f}")
print(f"{'ROUGE-2':<10} | {results['rouge2']:.4f}")
print(f"{'ROUGE-L':<10} | {results['rougeL']:.4f}")
print(f"{'BLEU':<10}   | {bleu_score['bleu']:.4f}")


Metric     |    Score
----------------------
ROUGE-1    | 0.4370
ROUGE-2    | 0.2787
ROUGE-L    | 0.2631
BLEU         | 0.1690


#Sanity Test

In [33]:
for i in range(5):
    print(f"--- Example {i+1} ---")
    print("Judgement:\n", test_data[i]['judgement'][:500], "...\n")
    print("Reference Summary:\n", references[i], "\n")
    print("Generated Summary:\n", generated_summaries[i], "\n")
    print("-" * 80)

--- Example 1 ---
Judgement:
 risdiction: criminal appeal n4. 133 of 1971. appeal by special leave from the judgment and order dated the 15 2 1971 of the borrrbay high court in criminal appeal no. 1354 (lf 1 969. section k. gambhir and 5. m. sikka for the appellant. m. c. bhandare and m. n. shroff for respondent. the judgment of the court was delivered by beg, j. the appellant before us by special leave was charged as follows by the presidency magistrate of bombay: "i.b. p. saptarshi, presidency magistrate 6th court, mazaaga ...

Reference Summary:
 on 4 3 1968, the date of the accident. karnal singh, the accused was driving the truck no. mrs 7372. purchased out of the loan advanced by the ex serviceman co operative society to one sutar who entrusted the vehicle to balwant singh, the brother of the appellant, a co accused, under a contract for hire against a monthly payment of rs. 2000 2200, after incurring all expenses over the truck. the payment was regular up to december, 1967, and,

In [34]:
import os, mlflow
mlflow.set_tracking_uri("http://129.114.25.240:8000")

In [36]:
run_id   = "d83c3a778ab94075962dd67f724af964"
metrics = {
    "rouge1" : results["rouge1"],          # ROUGE‑1  (e.g. 0.5274)
    "rouge2" : results["rouge2"],          # ROUGE‑2
    "rougeL" : results["rougeL"],          # ROUGE‑L
    "bleu"   : bleu_score["bleu"]        # corpus BLEU
}

with mlflow.start_run(run_id=run_id):
    mlflow.log_metrics(metrics)
    # optional: tag that these are evaluation results
    mlflow.set_tag("phase", "evaluation")
print("metrics logged to run", run_id)


🏃 View run llama2-legal-onnx-fp16 at: http://129.114.25.240:8000/#/experiments/1/runs/d83c3a778ab94075962dd67f724af964
🧪 View experiment at: http://129.114.25.240:8000/#/experiments/1
metrics logged to run d83c3a778ab94075962dd67f724af964


# For Running the Pytorch Scripts run the following



```
# on the node

docker build -t llama-eval .

docker build -t llama-eval:latest .
docker run --gpus all \
  -e MLFLOW_TRACKING_URI=http://129.114.25.240:8000 \
  -e MLFLOW_RUN_ID=d83c3a778ab94075962dd67f724af964 \
  -e LLM_MODEL_URI=runs:/d83c3a778ab94075962dd67f724af964/onnx_model \
  llama-eval:latest /bin/bash
```

-
# Inside the shell:

pytest --verbose llm_tests/tests

pytest --lf   