# Merge the LoRA adapter into the base model

In [1]:
!pip install "transformers==4.37.2" "peft==0.7.1"

Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.37.2)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.37.2)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers==4.37.2)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting accelerate>=0.21.0 (from peft==0.7.1)
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub<1.0,>=0.19.

In [2]:
import os

from huggingface_hub import login

# SECURITY: a literal access token used to be passed here. Treat that token as
# leaked (revoke it in the Hugging Face account settings) and never commit one.
# The token is now taken from the HF_TOKEN environment variable; when it is not
# set, token=None is passed, which makes login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [4]:
import json
from pathlib import Path

def clean_adapter_config(config_path, keys=None):
    """Remove unwanted top-level keys from an adapter config JSON file, in place.

    The file is read, every listed key that is present is dropped (one log line
    per removed key), and the result is written back to the same path with a
    two-space indent.

    Args:
        config_path: Path (str or ``pathlib.Path``) of the JSON file to rewrite.
        keys: Optional iterable of key names to remove. When ``None`` (the
            default) the built-in ``UNNEEDED_KEYS`` list below is used.

    Returns:
        None. The file on disk is modified as a side effect.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    # Presumably keys written by a newer PEFT release than the one pinned in this
    # notebook -- TODO confirm.
    # NOTE(review): several of these (e.g. use_rslora, use_dora, rank_pattern,
    # modules_to_save) look like they describe how the adapter was trained;
    # dropping a non-default value may change the merged weights -- verify
    # against the training configuration before relying on the result.
    UNNEEDED_KEYS = (
        "corda_config",
        "eva_config",
        "megatron_config",
        "megatron_core",
        "loftq_config",
        "layers_pattern",
        "layer_replication",
        "auto_mapping",
        "revision",
        "modules_to_save",
        "trainable_token_indices",
        "use_dora",
        "use_rslora",
        "rank_pattern",
        "fan_in_fan_out",
        "init_lora_weights",
        "exclude_modules",
        "lora_bias",
        "layers_to_transform",
    )
    drop_keys = UNNEEDED_KEYS if keys is None else tuple(keys)

    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # Read and write with an explicit encoding so the result does not depend on
    # the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)

    for key in drop_keys:
        if key in config:
            print(f"🧹 Removing: {key}")
            config.pop(key)

    with open(path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"Cleaned config saved to: {path}")

# Strip the unneeded keys from the adapter config ahead of the LoRA merge.
clean_adapter_config(config_path="../fine_tuned_lora_model/adapter_config.json")


Cleaned config saved to: ../fine_tuned_lora_model/adapter_config.json


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel
import torch
import pathlib

BASE = "meta-llama/Llama-2-7b-hf"
ADAPTER = "../fine_tuned_lora_model"
MERGED = pathlib.Path("../llama2-legal-merged")

# Load the base model in fp16 together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(BASE)

# Wrap the base model with the trained LoRA adapter.
model = PeftModel.from_pretrained(model, ADAPTER)

# Fold the LoRA deltas into the base weights. merge_and_unload() RETURNS the
# plain transformers model; the return value must be kept and saved. Saving the
# PEFT wrapper instead only writes adapter files (adapter_config.json /
# adapter_model.*), not a full merged checkpoint.
merged_model = model.merge_and_unload()

# Drop leftovers of an earlier adapter-only save in the target directory.
# NOTE(review): a stale adapter_config.json presumably makes loaders treat the
# directory as an adapter checkpoint, and a stale single-file pytorch_model.bin
# may be picked up in preference to freshly written shards -- confirm.
for stale_name in (
    "adapter_config.json",
    "adapter_model.bin",
    "adapter_model.safetensors",
    "pytorch_model.bin",
):
    (MERGED / stale_name).unlink(missing_ok=True)

# Save the merged model (this also writes config.json, so no separate
# AutoConfig save is needed) and the tokenizer next to it.
merged_model.save_pretrained(MERGED, safe_serialization=False)
tokenizer.save_pretrained(MERGED)



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

('../llama2-legal-merged/tokenizer_config.json',
 '../llama2-legal-merged/special_tokens_map.json',
 '../llama2-legal-merged/tokenizer.json')

In [6]:
# Renames adapter_model.bin in the merged-model directory to pytorch_model.bin.
# NOTE(review): adapter_model.bin is presumably the adapter-only weights file
# that PEFT writes; renaming it would not turn it into merged full-model weights.
# Confirm that the save step above really produced the merged model -- if it
# did, this rename should be unnecessary.
!mv ../llama2-legal-merged/adapter_model.bin ../llama2-legal-merged/pytorch_model.bin

# Export to ONNX

In [3]:
!pip install optimum[exporters] onnx onnxruntime-gpu

Collecting optimum[exporters]
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting onnxruntime (from optimum[exporters])
  Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting timm (from optimum[exporters])
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optimum-1.24.0-py3-none-any.whl (433 kB)
Downloading timm-1.0.15-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m139.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnxruntime, timm, optimum
Successfully installed onnxruntime-1.22.0 optimum-1.24.0 timm-1.0.15


In [5]:
!optimum-cli export onnx \
  --model ../llama2-legal-merged \
  --task text-generation \
  --dtype fp16 \
  --device cuda \
  --library transformers \
  ../llama2-legal-onnx

config.json: 100%|█████████████████████████████| 609/609 [00:00<00:00, 5.38MB/s]
model.safetensors.index.json: 100%|████████| 26.8k/26.8k [00:00<00:00, 39.5MB/s]
Downloading shards:   0%|                                 | 0/2 [00:00<?, ?it/s]
model-00001-of-00002.safetensors:   0%|             | 0.00/9.98G [00:00<?, ?B/s][A
model-00001-of-00002.safetensors:   0%|    | 10.5M/9.98G [00:00<04:33, 36.4MB/s][A
model-00001-of-00002.safetensors:   0%|    | 21.0M/9.98G [00:00<04:20, 38.2MB/s][A
model-00001-of-00002.safetensors:   0%|    | 31.5M/9.98G [00:00<04:13, 39.2MB/s][A
model-00001-of-00002.safetensors:   0%|    | 41.9M/9.98G [00:01<03:51, 42.9MB/s][A
model-00001-of-00002.safetensors:   1%|    | 52.4M/9.98G [00:01<03:39, 45.1MB/s][A
model-00001-of-00002.safetensors:   1%|    | 62.9M/9.98G [00:01<03:29, 47.4MB/s][A
model-00001-of-00002.safetensors:   1%|    | 73.4M/9.98G [00:01<04:07, 40.1MB/s][A
model-00001-of-00002.safetensors:   1%|    | 83.9M/9.98G [00:01<03:48, 43.4MB/s][A
m

# Quick test in ONNX Runtime 

In [4]:
import onnxruntime as ort
# Show which execution providers this onnxruntime installation offers.
available_providers = ort.get_available_providers()
print(available_providers)

['AzureExecutionProvider', 'CPUExecutionProvider']


In [10]:
# Text-generation inference on the CPU execution provider

In [11]:
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

# Load the tokenizer saved next to the merged model
tokenizer = AutoTokenizer.from_pretrained("../llama2-legal-merged")

# Load the exported ONNX model on the CPU execution provider
ort_session = ort.InferenceSession(
    "../llama2-legal-onnx/model.onnx",
    providers=["CPUExecutionProvider"],
)

print("Using provider:", ort_session.get_providers()[0])

# Prompt setup
prompt_text = "One-sentence summary of clause 7.2:"
inputs = tokenizer(prompt_text, return_tensors="np")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Greedy decoding. The full sequence is fed to the model again at every step,
# so each timed run() call covers prompt + everything generated so far.
max_new_tokens = 20
times = []

for _ in range(max_new_tokens):
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    outputs = ort_session.run(None, {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "position_ids": position_ids
    })
    times.append(time.perf_counter() - start)

    # Pick the most likely next token from the logits of the last position
    logits = outputs[0]
    next_token = np.argmax(logits[:, -1, :], axis=-1)
    input_ids = np.concatenate([input_ids, next_token[:, None]], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)[:, None]], axis=1)

    # Stop once the model emits end-of-sequence (the token has already been
    # appended; skip_special_tokens drops it again when decoding).
    if next_token[0] == tokenizer.eos_token_id:
        break

# Decode
generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(generated_text)

# Benchmarking: one entry in `times` per forward pass / generated token
print(f"\n--- Inference Benchmark (Total Tokens: {len(times)}) ---")
print(f"Total time: {sum(times):.2f}s")
print(f"Avg per token: {np.mean(times)*1000:.2f} ms")
print(f"Median: {np.percentile(times, 50)*1000:.2f} ms | 95th: {np.percentile(times, 95)*1000:.2f} ms | 99th: {np.percentile(times, 99)*1000:.2f} ms")
print(f"Throughput: {len(times)/sum(times):.2f} tokens/sec")

Using provider: CPUExecutionProvider

Generated Text:
One-sentence summary of clause 7.2:
The contractor shall not be liable for any loss or damage whatsoever caused by any

--- Inference Benchmark (Total Tokens: 20) ---
Total time: 23.50s
Avg per token: 1175.11 ms
Median: 1157.95 ms | 95th: 1223.06 ms | 99th: 1517.06 ms
Throughput: 0.85 tokens/sec


In [12]:
# Inference with the CUDA execution provider, with per-token latency measurements

In [13]:
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

# Load the tokenizer saved next to the merged model
tokenizer = AutoTokenizer.from_pretrained("../llama2-legal-merged")

# Load the exported ONNX model on the CUDA execution provider. Check first that
# this onnxruntime installation actually offers it, so that a benchmark that
# ends up on another provider is clearly flagged.
available_providers = ort.get_available_providers()
if "CUDAExecutionProvider" not in available_providers:
    print("WARNING: CUDAExecutionProvider is not available; providers found:", available_providers)
ort_session = ort.InferenceSession(
    "../llama2-legal-onnx/model.onnx",
    providers=["CUDAExecutionProvider"],
)

print("Using provider:", ort_session.get_providers()[0])

# Prompt setup
prompt_text = "Give summary of clause 7.2:"
inputs = tokenizer(prompt_text, return_tensors="np")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Greedy decoding. The full sequence is fed to the model again at every step,
# so each timed run() call covers prompt + everything generated so far.
max_new_tokens = 1000
times = []

for _ in range(max_new_tokens):
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    outputs = ort_session.run(None, {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "position_ids": position_ids
    })
    times.append(time.perf_counter() - start)

    # Pick the most likely next token from the logits of the last position
    logits = outputs[0]
    next_token = np.argmax(logits[:, -1, :], axis=-1)
    input_ids = np.concatenate([input_ids, next_token[:, None]], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)[:, None]], axis=1)

    # Stop once the model emits end-of-sequence (the token has already been
    # appended; skip_special_tokens drops it again when decoding).
    if next_token[0] == tokenizer.eos_token_id:
        break

# Decode
generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(generated_text)

# Benchmarking: one entry in `times` per forward pass / generated token
print(f"\n--- Inference Benchmark (Total Tokens: {len(times)}) ---")
print(f"Total time: {sum(times):.2f}s")
print(f"Avg per token: {np.mean(times)*1000:.2f} ms")
print(f"Median: {np.percentile(times, 50)*1000:.2f} ms | 95th: {np.percentile(times, 95)*1000:.2f} ms | 99th: {np.percentile(times, 99)*1000:.2f} ms")
print(f"Throughput: {len(times)/sum(times):.2f} tokens/sec")

[0;93m2025-05-10 08:01:45.697210692 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 32 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 08:01:45.712292555 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 08:01:45.712304879 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Using provider: CUDAExecutionProvider

Generated Text:
Give summary of clause 7.2:
The clause 7.2 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to pay the subcontractor within 14 days of the receipt of the invoice.
Give summary of clause 7.3:
The clause 7.3 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice.
Give summary of clause 7.4: The clause 7.4 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to 

In [14]:
# Inference with the CUDA execution provider: latency measurements plus top-k sampling for better output

In [15]:
#!/usr/bin/env python3
"""
Fast-ish ONNX‑runtime decoding with basic top‑k sampling
and a repetition penalty to avoid infinite loops.
"""

import time
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# ─────────── Config ────────────
# Locations: the exported ONNX model file and the directory with the tokenizer.
MODEL_DIR = "../llama2-legal-onnx/model.onnx"
TOKENIZER_DIR = "../llama2-legal-merged"

# Prompt and the upper bound on generated tokens.
PROMPT_TEXT = "Give a concise summary of clause 7.2:"
MAX_NEW_TOKENS = 256  # hard cap

# Sampling knobs.
TEMPERATURE = 0.8
TOP_K = 40
REPETITION_PEN = 1.15  # values above 1.0 penalise tokens that were already seen

# Token ids that end generation (eos; extend with your own).
# NOTE(review): the ids are hard-coded rather than read from the tokenizer;
# 50256 looks like another model family's end-of-text id -- confirm these
# against tokenizer.eos_token_id for this model.
END_TOKENS = {0, 2, 50256}
# ───────────────────────────────

def sample_top_k(logits, top_k, temperature=1.0, rng=None):
    """Sample one token id from the ``top_k`` highest-scoring logits.

    Args:
        logits: 1-D array of unnormalised scores, one per token id. The input
            array is not modified.
        top_k: Number of highest-scoring candidates to keep. A falsy value, or
            a value not smaller than the number of logits, disables filtering.
        temperature: Divisor applied to the logits before the softmax. A value
            of zero or below selects the arg-max (greedy) instead of sampling,
            because dividing by zero would otherwise yield NaN probabilities.
        rng: Optional ``numpy.random.Generator`` (or any object with a
            compatible ``choice`` method) for reproducible draws. When ``None``
            the global ``numpy.random`` state is used, as before.

    Returns:
        The chosen token id as a Python ``int``.
    """
    logits = np.asarray(logits, dtype=np.float32)

    # Greedy fallback: the temperature -> 0 limit of sampling is the arg-max.
    if temperature <= 0:
        return int(np.argmax(logits))

    # Division allocates a new array, so the masking below never touches the
    # caller's data.
    logits = logits / temperature

    # keep top-k: everything outside the k best candidates gets probability 0
    if top_k and top_k < logits.size:
        top_ids = logits.argsort()[-top_k:]
        mask = np.ones_like(logits, dtype=bool)
        mask[top_ids] = False
        logits[mask] = -np.inf

    # Softmax with the maximum subtracted for numerical stability.
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()

    sampler = np.random if rng is None else rng
    return int(sampler.choice(logits.size, p=probs))

# ─────────── Load model ─────────
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
sess = ort.InferenceSession(
    MODEL_DIR,
    providers=["CUDAExecutionProvider"],  # assumes GPU
)
print("ONNX provider →", sess.get_providers()[0])

# Stop on the configured ids plus whatever the tokenizer itself declares as EOS.
end_tokens = set(END_TOKENS)
if tokenizer.eos_token_id is not None:
    end_tokens.add(tokenizer.eos_token_id)

# Seed the global NumPy RNG (the one sample_top_k draws from by default) so a
# re-run of this cell produces the same text.
np.random.seed(0)

# ─────────── Prepare prompt ─────
inputs          = tokenizer(PROMPT_TEXT, return_tensors="np")
input_ids       = inputs["input_ids"]
attention_mask  = inputs["attention_mask"]

generated = input_ids.copy()
times = []

# ─────────── Decode loop ────────
# The full sequence is fed to the model again at every step, so each timed
# run() call covers prompt + everything generated so far.
for _ in range(MAX_NEW_TOKENS):
    position_ids = np.arange(generated.shape[1], dtype=np.int64)[None, :]

    # perf_counter() is a monotonic clock meant for measuring short durations.
    start = time.perf_counter()
    logits = sess.run(
        None,
        {
            "input_ids": generated.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids,
        },
    )[0]
    times.append(time.perf_counter() - start)

    # Repetition penalty on every token id seen so far (prompt included).
    # A positive logit is divided by the penalty, a negative one is multiplied:
    # dividing a negative logit would move it towards zero and make the
    # repeated token MORE likely, which is the opposite of a penalty.
    last_logits = logits[0, -1].astype(np.float32)
    seen_ids = np.unique(generated)
    seen_scores = last_logits[seen_ids]
    last_logits[seen_ids] = np.where(
        seen_scores < 0,
        seen_scores * REPETITION_PEN,
        seen_scores / REPETITION_PEN,
    )

    next_id = sample_top_k(last_logits, top_k=TOP_K, temperature=TEMPERATURE)
    if next_id in end_tokens:
        break

    next_token = np.array([[next_id]], dtype=np.int64)
    generated  = np.concatenate([generated, next_token], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)], axis=1)

# ─────────── Output + stats ─────
text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\nGenerated text:\n" + "-"*60 + f"\n{text}\n" + "-"*60)

# One entry in `times` per forward pass (this includes the pass that produced
# a stop token, if generation ended that way).
n = len(times)
print(f"\n--- Benchmark ({n} new tokens) ---")
print(f"total {sum(times):.2f}s | mean {np.mean(times)*1000:.1f} ms "
      f"| 95th {np.percentile(times,95)*1000:.1f} ms "
      f"| throughput {n/sum(times):.1f} tok/s")


[0;93m2025-05-10 08:02:56.410673675 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 32 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 08:02:56.422904391 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 08:02:56.422912326 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX provider → CUDAExecutionProvider

Generated text:
------------------------------------------------------------
Give a concise summary of clause 7.2:
(a) What is the purpose?
(b) Who has authority over the budget process? (Who controls what expenses can be incurred?)
Analyze why each of these clauses are important and necessary for an organization to have as part of its bylaws, based upon your experience with non-profits: Clause 5.1; 6.3 (a), (d); 8.4; 9.2 (c).
Do you believe that the use of volunteers is appropriate for all charitable or membership organizations? Why or why not? Which types of organizations should not utilize volunteer staff?
What type of leadership style do you think would work best in your current organizational structure? Do you think it might change at some point within the next five years? Explain both points, with specific examples from your own organization.
------------------------------------------------------------

--- Benchmark (183 new tokens) ---
tot

# MLflow setup to log the ONNX model

In [6]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.52.0-py3-none-any.whl.metadata (39 kB)
Collecting fastap

In [7]:
import mlflow
import os
import onnx
import numpy as np
import mlflow
from mlflow.models import infer_signature          # ← this was missing
from transformers import AutoTokenizer

# ---- settings for YOUR setup -----------------------------------
TRACKING_URI           = "http://129.114.25.240:8000"         # mlflow server
MLFLOW_S3_ENDPOINT_URL = "http://129.114.25.240:9000"  # MinIO
ARTIFACT_ROOT          = "s3://mlflow-artifacts"              # same bucket path
# Credentials are taken from the environment when they are already set there,
# so real keys never have to be typed into (or committed with) the notebook and
# existing credentials are not overwritten by the placeholders. The placeholder
# strings are only used when the variables are missing.
AWS_ACCESS_KEY_ID      = os.environ.get("AWS_ACCESS_KEY_ID", "your-access-key")
AWS_SECRET_ACCESS_KEY  = os.environ.get("AWS_SECRET_ACCESS_KEY", "your-secret-key")
MLFLOW_HTTP_REQUEST_TIMEOUT = "3600"
# ----------------------------------------------------------------

# Export the settings through the environment variables of the same names.
os.environ["MLFLOW_S3_ENDPOINT_URL"]   = MLFLOW_S3_ENDPOINT_URL
os.environ["AWS_ACCESS_KEY_ID"]        = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"]    = AWS_SECRET_ACCESS_KEY
os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"]    = MLFLOW_HTTP_REQUEST_TIMEOUT

mlflow.set_tracking_uri(TRACKING_URI)

# Optional: create (or get) an experiment
# NOTE(review): the name contains a non-breaking hyphen (U+2011), not an ASCII
# "-"; it is kept as-is because changing it would select a different experiment.
experiment_name = "Legal‑Summarizers"
mlflow.set_experiment(experiment_name)
print("Tracking URI:", mlflow.get_tracking_uri())

Tracking URI: http://129.114.25.240:8000


In [9]:
ONNX_PATH = "../llama2-legal-onnx/model.onnx"
TOKENIZER_DIR = "../llama2-legal-merged"
example_prompt = "Summarize clause 7.5"

# Tokenise a single prompt into NumPy arrays.
tok = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
enc = tok(example_prompt, return_tensors="np")

# Add position_ids as the third input: 0 .. seq_len-1 with a leading batch axis.
seq_len = enc["input_ids"].shape[1]
enc["position_ids"] = np.arange(seq_len, dtype=np.int64).reshape(1, -1)

# Build the plain dict of int64 arrays that is handed to MLflow as the example.
input_example = {
    name: enc[name].astype(np.int64)
    for name in ("input_ids", "attention_mask", "position_ids")
}

# Optional but recommended: lets MLflow show the input schema in the UI.
signature = infer_signature(input_example)

# Load the ONNX model and log + register it inside a named run.
model_proto = onnx.load(ONNX_PATH)
with mlflow.start_run(run_name="llama2-legal-onnx-fp16"):
    mlflow.onnx.log_model(
        onnx_model=model_proto,
        artifact_path="onnx_model",
        registered_model_name="LegalClauseSummarizer",
        signature=signature,
        input_example=input_example,  # all three model inputs
        metadata={"quantization": "fp16"},
    )

Successfully registered model 'LegalClauseSummarizer'.
2025/05/10 23:10:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LegalClauseSummarizer, version 1
Created version '1' of model 'LegalClauseSummarizer'.


🏃 View run llama2-legal-onnx-fp16 at: http://129.114.25.240:8000/#/experiments/1/runs/d83c3a778ab94075962dd67f724af964
🧪 View experiment at: http://129.114.25.240:8000/#/experiments/1
