# Merge the LoRA adapter into the base model

In [9]:
!pip install "transformers==4.37.2" "peft==0.7.1"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [10]:
from huggingface_hub import login
login("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")

In [11]:
import json
from pathlib import Path

def clean_adapter_config(config_path):
    UNNEEDED_KEYS = [
        "corda_config",
        "eva_config",
        "megatron_config",
        "megatron_core",
        "loftq_config",
        "layers_pattern",
        "layer_replication",
        "auto_mapping",
        "revision",
        "modules_to_save",
        "trainable_token_indices",
        "use_dora",
        "use_rslora",
        "rank_pattern",
        "fan_in_fan_out",
        "init_lora_weights",
        "exclude_modules",
        "lora_bias",
        "layers_to_transform"
    ]

    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    with open(path, "r") as f:
        config = json.load(f)

    for key in UNNEEDED_KEYS:
        if key in config:
            print(f"🧹 Removing: {key}")
            config.pop(key)

    with open(path, "w") as f:
        json.dump(config, f, indent=2)

    print(f"Cleaned config saved to: {path}")

# Clean this config before merging LoRA
clean_adapter_config("../fine_tuned_lora_model/adapter_config.json")


Cleaned config saved to: ../fine_tuned_lora_model/adapter_config.json


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel
import torch
import pathlib

BASE = "meta-llama/Llama-2-7b-hf"
ADAPTER = "../fine_tuned_lora_model"
MERGED = pathlib.Path("../llama2-legal-merged")

# Load base model
model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(BASE)

# Load LoRA adapter
model = PeftModel.from_pretrained(model, ADAPTER)

# ⚠️ MANUAL LoRA MERGE
model.base_model.merge_and_unload()

config = AutoConfig.from_pretrained(BASE)
config.save_pretrained(MERGED)

# Save the merged model
model.save_pretrained(MERGED, safe_serialization=False)
tokenizer.save_pretrained(MERGED)



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

('../llama2-legal-merged/tokenizer_config.json',
 '../llama2-legal-merged/special_tokens_map.json',
 '../llama2-legal-merged/tokenizer.json')

In [13]:
!mv ../llama2-legal-merged/adapter_model.bin ../llama2-legal-merged/pytorch_model.bin

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Export to ONNX

In [14]:
!pip install optimum[exporters] onnx onnxruntime-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [15]:
!optimum-cli export onnx \
  --model ../llama2-legal-merged \
  --task text-generation \
  --dtype fp16 \
  --device cuda \
  --library transformers \
  ../llama2-legal-onnx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.11s/it]
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
		-[x] values not close enough, max diff: 0.8278255462646484 (atol: 1e-05)
- logits: max diff = 0.8278255462646484.
 The exported model was saved at: ../llama2-legal-onnx


# Quick test in ONNX Runtime 

In [16]:
import onnxruntime as ort
print(ort.get_available_providers())

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [17]:
# Inference for text-generation with CPU

In [18]:
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("../llama2-legal-merged")

# Load ONNX model using available GPU execution provider
available_providers = ort.get_available_providers()
ort_session = ort.InferenceSession(
    "../llama2-legal-onnx/model.onnx",  # Use optimized .onnx
     providers=["CPUExecutionProvider"]
)

print("Using provider:", ort_session.get_providers()[0])

# Prompt setup
prompt_text = "One-sentence summary of clause 7.2:"
inputs = tokenizer(prompt_text, return_tensors="np")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Generate tokens
max_new_tokens = 20
times = []

for _ in range(max_new_tokens):
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]

    start = time.time()
    outputs = ort_session.run(None, {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "position_ids": position_ids
    })
    end = time.time()
    times.append(end - start)

    logits = outputs[0]
    next_token = np.argmax(logits[:, -1, :], axis=-1)
    input_ids = np.concatenate([input_ids, next_token[:, None]], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)[:, None]], axis=1)

    if next_token[0] == tokenizer.eos_token_id:
        break

# Decode
generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(generated_text)

# Benchmarking
print(f"\n--- Inference Benchmark (Total Tokens: {len(times)}) ---")
print(f"Total time: {sum(times):.2f}s")
print(f"Avg per token: {np.mean(times)*1000:.2f} ms")
print(f"Median: {np.percentile(times, 50)*1000:.2f} ms | 95th: {np.percentile(times, 95)*1000:.2f} ms | 99th: {np.percentile(times, 99)*1000:.2f} ms")
print(f"Throughput: {len(times)/sum(times):.2f} tokens/sec")

Using provider: CPUExecutionProvider

Generated Text:
One-sentence summary of clause 7.2:
The contractor shall not be liable for any loss or damage whatsoever caused by any

--- Inference Benchmark (Total Tokens: 20) ---
Total time: 24.37s
Avg per token: 1218.70 ms
Median: 1167.67 ms | 95th: 1564.73 ms | 99th: 1585.26 ms
Throughput: 0.82 tokens/sec


In [None]:
- Inference for text-generation with past

In [20]:
from fastapi import FastAPI
from pydantic import BaseModel
import onnxruntime as ort, numpy as np
from transformers import AutoTokenizer

# 1) set the TensorRT flags *before* session creation
import os
os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1"
os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"
os.environ["ORT_TENSORRT_ENGINE_CACHE_PATH"] = "/cache"

tokenizer = AutoTokenizer.from_pretrained("./llama2-legal-merged")
sess = ort.InferenceSession(
        "./llama-model-onnx/model.onnx",
        providers=["TensorrtExecutionProvider","CUDAExecutionProvider"])

app = FastAPI()

class Req(BaseModel):
    prompt: str
    max_new_tokens: int = 100

@app.post("/generate")
def generate(req: Req):
    ids = tokenizer(req.prompt, return_tensors="np")
    input_ids, attn = ids["input_ids"], ids["attention_mask"]

    for _ in range(req.max_new_tokens):
        pos = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]
        out  = sess.run(None, {
                "input_ids": input_ids.astype(np.int64),
                "attention_mask": attn.astype(np.int64),
                "position_ids": pos})
        next_id = np.argmax(out[0][:, -1, :], axis=-1)
        input_ids = np.concatenate([input_ids, next_id[:, None]], axis=1)
        attn      = np.concatenate([attn, np.ones_like(next_id)[:, None]], axis=1)
        if next_id[0] == tokenizer.eos_token_id: break

    return {"text": tokenizer.decode(input_ids[0], skip_special_tokens=True)}


ModuleNotFoundError: No module named 'fastapi'

#Wrap with FastAPI or Triton(Dont Run)

In [None]:
docker run -d --gpus all -p 8000:8000 \
  -v /home/cc/triton_repo:/models \
  nvcr.io/nvidia/tritonserver:24.05-py3 \
  tritonserver --model-repository=/models


In [21]:
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("../llama2-legal-merged")

# Load ONNX model using available GPU execution provider
available_providers = ort.get_available_providers()
ort_session = ort.InferenceSession(
    "../llama2-legal-onnx/model.onnx",  # Use optimized .onnx
     providers=["CUDAExecutionProvider"]
)

print("Using provider:", ort_session.get_providers()[0])

# Prompt setup
prompt_text = "Give summary of clause 7.2:"
inputs = tokenizer(prompt_text, return_tensors="np")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Generate tokens
max_new_tokens = 1000
times = []

for _ in range(max_new_tokens):
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]

    start = time.time()
    outputs = ort_session.run(None, {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "position_ids": position_ids
    })
    end = time.time()
    times.append(end - start)

    logits = outputs[0]
    next_token = np.argmax(logits[:, -1, :], axis=-1)
    input_ids = np.concatenate([input_ids, next_token[:, None]], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)[:, None]], axis=1)

    if next_token[0] == tokenizer.eos_token_id:
        break

# Decode
generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("\nGenerated Text:")
print(generated_text)

# Benchmarking
print(f"\n--- Inference Benchmark (Total Tokens: {len(times)}) ---")
print(f"Total time: {sum(times):.2f}s")
print(f"Avg per token: {np.mean(times)*1000:.2f} ms")
print(f"Median: {np.percentile(times, 50)*1000:.2f} ms | 95th: {np.percentile(times, 95)*1000:.2f} ms | 99th: {np.percentile(times, 99)*1000:.2f} ms")
print(f"Throughput: {len(times)/sum(times):.2f} tokens/sec")

[0;93m2025-05-10 03:06:40.724437322 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 32 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 03:06:40.736667639 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 03:06:40.736674633 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Using provider: CUDAExecutionProvider

Generated Text:
Give summary of clause 7.2:
The clause 7.2 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to pay the subcontractor within 14 days of the receipt of the invoice.
Give summary of clause 7.3:
The clause 7.3 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice.
Give summary of clause 7.4: The clause 7.4 of the contract is related to the payment of the contractor. The contractor is required to pay the subcontractor within 14 days of the receipt of the invoice. The contractor is also required to 

In [22]:
#!/usr/bin/env python3
"""
Fast-ish ONNX‑runtime decoding with basic top‑k sampling
and a repetition penalty to avoid infinite loops.
"""

import time
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# ─────────── Config ────────────
MODEL_DIR      = "../llama2-legal-onnx/model.onnx"
TOKENIZER_DIR  = "../llama2-legal-merged"
PROMPT_TEXT    = "Give a concise summary of clause 7.2:"
MAX_NEW_TOKENS = 256                # hard cap
TEMPERATURE    = 0.8
TOP_K          = 40
REPETITION_PEN = 1.15               # >1.0 penalises already‑seen tokens
END_TOKENS     = {0, 2, 50256}      # eos, or add your own
# ───────────────────────────────

def sample_top_k(logits, top_k, temperature=1.0):
    """Return one sampled token id (numpy int64) from top‑k"""
    logits = logits.astype(np.float32) / temperature
    # keep top‑k
    if top_k and top_k < logits.size:
        top_ids = logits.argsort()[-top_k:]
        mask = np.ones_like(logits, dtype=bool)
        mask[top_ids] = False
        logits[mask] = -np.inf
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()
    return np.random.choice(len(logits), p=probs)

# ─────────── Load model ─────────
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
sess = ort.InferenceSession(
    MODEL_DIR,
    providers=["CUDAExecutionProvider"],  # assumes GPU
)
print("ONNX provider →", sess.get_providers()[0])

# ─────────── Prepare prompt ─────
inputs          = tokenizer(PROMPT_TEXT, return_tensors="np")
input_ids       = inputs["input_ids"]
attention_mask  = inputs["attention_mask"]

generated = input_ids.copy()
times = []

# ─────────── Decode loop ────────
for _ in range(MAX_NEW_TOKENS):
    position_ids = np.arange(generated.shape[1], dtype=np.int64)[None, :]

    start = time.time()
    logits = sess.run(
        None,
        {
            "input_ids": generated.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids,
        },
    )[0]
    times.append(time.time() - start)

    # repetition penalty
    logits[0, -1, np.unique(generated)] /= REPETITION_PEN

    next_id = sample_top_k(logits[0, -1], top_k=TOP_K, temperature=TEMPERATURE)
    if next_id in END_TOKENS:
        break

    next_token = np.array([[next_id]], dtype=np.int64)
    generated  = np.concatenate([generated, next_token], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)], axis=1)

# ─────────── Output + stats ─────
text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\nGenerated text:\n" + "-"*60 + f"\n{text}\n" + "-"*60)

n = len(times)
print(f"\n--- Benchmark ({n} new tokens) ---")
print(f"total {sum(times):.2f}s | mean {np.mean(times)*1000:.1f} ms "
      f"| 95th {np.percentile(times,95)*1000:.1f} ms "
      f"| throughput {n/sum(times):.1f} tok/s")


[0;93m2025-05-10 03:07:51.316876084 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 32 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 03:07:51.329300097 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 03:07:51.329307461 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX provider → CUDAExecutionProvider

Generated text:
------------------------------------------------------------
Give a concise summary of clause 7.2:
The contractor can request from the client that they be released from their obligations under the contract, provided certain criteria are fulfilled. The parties must also agree to the new terms and conditions when this happens.
What is ‘the subject matter’ of the contract? (Clause 1)
This clause sets out what the subject matter of the contract is in relation to the works – that is, for example, an extension or refurbishment project. In the case of services, it will be whatever work the service provider does on behalf of the other party. This could include accountancy, marketing, IT support etc.
Does the contract require specific performance by either party?
No, although some clauses do refer to ‘specific performance not being an appropriate remedy.’ For instance, if one of the parties to a construction contract was unable to complete 

#Build a FastAPI ONNX micro-service (pattern from the hand-out)
````
docker compose -f docker-compose-fastapi.yaml up -d --build
````

````
curl -X POST http://<IP>:8000/generate \
     -H "Content-Type: application/json" \
     -d '{"prompt":"Summarise clause 7.2 in two lines"}'
````


In [24]:
#!/usr/bin/env python3
"""
Fast-ish ONNX‑runtime decoding with basic top‑k sampling
and a repetition penalty to avoid infinite loops.
"""

import time
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# ─────────── Config ────────────
MODEL_DIR      = "../llama2-legal-onnx/model.onnx"
TOKENIZER_DIR  = "../llama2-legal-merged"
PROMPT_TEXT    = "Summarise clause 7.2 in two lines"#"Give a concise summary of clause 7.2:"
MAX_NEW_TOKENS = 256                # hard cap
TEMPERATURE    = 0.8
TOP_K          = 40
REPETITION_PEN = 1.15               # >1.0 penalises already‑seen tokens
END_TOKENS     = {0, 2, 50256}      # eos, or add your own
# ───────────────────────────────

def sample_top_k(logits, top_k, temperature=1.0):
    """Return one sampled token id (numpy int64) from top‑k"""
    logits = logits.astype(np.float32) / temperature
    # keep top‑k
    if top_k and top_k < logits.size:
        top_ids = logits.argsort()[-top_k:]
        mask = np.ones_like(logits, dtype=bool)
        mask[top_ids] = False
        logits[mask] = -np.inf
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()
    return np.random.choice(len(logits), p=probs)

# ─────────── Load model ─────────
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
sess = ort.InferenceSession(
    MODEL_DIR,
    providers=["CUDAExecutionProvider"],  # assumes GPU
)
print("ONNX provider →", sess.get_providers()[0])

# ─────────── Prepare prompt ─────
inputs          = tokenizer(PROMPT_TEXT, return_tensors="np")
input_ids       = inputs["input_ids"]
attention_mask  = inputs["attention_mask"]

generated = input_ids.copy()
times = []

# ─────────── Decode loop ────────
for _ in range(MAX_NEW_TOKENS):
    position_ids = np.arange(generated.shape[1], dtype=np.int64)[None, :]

    start = time.time()
    logits = sess.run(
        None,
        {
            "input_ids": generated.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
            "position_ids": position_ids,
        },
    )[0]
    times.append(time.time() - start)

    # repetition penalty
    logits[0, -1, np.unique(generated)] /= REPETITION_PEN

    next_id = sample_top_k(logits[0, -1], top_k=TOP_K, temperature=TEMPERATURE)
    if next_id in END_TOKENS:
        break

    next_token = np.array([[next_id]], dtype=np.int64)
    generated  = np.concatenate([generated, next_token], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token)], axis=1)

# ─────────── Output + stats ─────
text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("\nGenerated text:\n" + "-"*60 + f"\n{text}\n" + "-"*60)

n = len(times)
print(f"\n--- Benchmark ({n} new tokens) ---")
print(f"total {sum(times):.2f}s | mean {np.mean(times)*1000:.1f} ms "
      f"| 95th {np.percentile(times,95)*1000:.1f} ms "
      f"| throughput {n/sum(times):.1f} tok/s")


[0;93m2025-05-10 03:09:22.978709284 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 32 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 03:09:22.991289654 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 03:09:22.991297429 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX provider → CUDAExecutionProvider

Generated text:
------------------------------------------------------------
Summarise clause 7.2 in two lines
A summary is a paraphrase of the text, that is, an outline of the main themes with no specific wording from the original passage; its purpose is to allow you to see the important points very quickly and concisely. In an essay examination you will not be allowed more than 150 words for your summaries (although there are some exceptions). Here we set out an example of how such a short piece can look when written as if it was part of an essay.
Idea: Describe clause 7.2 on the effectiveness of advertising by showing what this means in practice and explaining why advertisements have their intended effects. You should conclude that advertisers use images which appeal directly to our emotions rather than appeal to reason or logic.
We know immediately from reading 'effective' what clause 7.2 is about. The second sentence tells us what sort of adv

In [25]:
# MLFlow

In [27]:
!pip install mlflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.52.0-py3-none-any.whl.metadata (39 kB)
Collecting fastap

In [28]:
import mlflow
mlflow.set_tracking_uri("http://129.114.25.240:8000")  

In [None]:
import onnx
import mlflow
import mlflow.onnx

run_name         = "legalsummarizer"
artifact_path    = "onnx_model"
local_onnx_path  = "../llama2-legal-onnx/model.onnx"   # your file

# Load ONNX into memory (so we can log without copying the file twice)
model_proto = onnx.load(local_onnx_path)

with mlflow.start_run(run_name=run_name) as run:
    # (optional) meta‑data
    mlflow.log_params({
        "base_model": "Llama‑2‑7B",
        "task": "legal_clause_summarization",
        "quantized": "fp16"
    })

    # Log the ONNX model — this creates the MLflow *model* directory + MLmodel file
    mlflow.onnx.log_model(
        onnx_model=model_proto,
        artifact_path=artifact_path,
        signature=None,           # add an example in/out signature if you like
        input_example=None
    )

    run_id = run.info.run_id
    print("Logged to run:", run_id)



In [None]:
from mlflow import register_model

model_uri = f"runs:/{run_id}/{artifact_path}"
registered_model = register_model(model_uri, "LegalClauseSummarizer")

print("Registered name:", registered_model.name)
print("Version:", registered_model.version)