# Merge the LoRA adapter into the base model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import pathlib
import torch

# Hub id of the base checkpoint, the directory holding the trained LoRA
# adapter, and the directory the merged (adapter-free) model is written to.
BASE    = "meta-llama/Llama-2-7b-hf"
ADAPTER = "LLM_LegalDocSummarization/fine_tuned_lora_model/"
MERGED  = pathlib.Path("LLM_LegalDocSummarization/llama2-legal-merged")

# Load the base weights in fp16 and attach the LoRA adapter on top of them.
model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER)

# `merge_and_unload` is a method of the loaded PEFT model, not a function
# exported by the `peft` package: it fuses the LoRA matrices into the base
# weights and returns the plain transformers model.
model = model.merge_and_unload()

# Save the merged model and the base tokenizer side by side so the directory
# can be loaded (and exported) as a single self-contained checkpoint.
model.save_pretrained(MERGED)
AutoTokenizer.from_pretrained(BASE).save_pretrained(MERGED)

# Export to ONNX

In [None]:
# Quote the extras spec so shells that glob on square brackets (e.g. zsh)
# pass it to pip unchanged.
pip install "optimum[exporters]" onnx onnxruntime-gpu

# Export the merged checkpoint to ONNX in half precision.
# --fp16 is PyTorch-only and requires the export to run on a CUDA device,
# hence --device cuda.
optimum-cli export onnx \
    --model LLM_LegalDocSummarization/llama2-legal-merged \
    --task text-generation \
    --device cuda \
    --fp16 \
    LLM_LegalDocSummarization/llama2-legal-onnx

# Graph-optimise & kernel-fuse

In [None]:
# Run the ONNX Runtime transformer graph optimizer on the exported decoder.
# The standalone `onnxruntime_tools` package is deprecated; the optimizer
# ships inside onnxruntime itself as `onnxruntime.transformers.optimizer`.
# --use_external_data_format stores the weights outside the .onnx file, which
# is required once a model exceeds the 2 GB protobuf limit (a 7B fp16 model does).
# NOTE(review): confirm the file name the export step actually wrote
# (it may be model.onnx or decoder_model.onnx depending on the optimum version).
# NOTE(review): --model_type is left at its default; check which fusions
# apply to this Llama graph.
python -m onnxruntime.transformers.optimizer \
       --input LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder.onnx \
       --output LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_opt.onnx \
       --float16 \
       --use_external_data_format


# (Optional) INT-4 / INT-8 quantisation (testing)

In [None]:
pip install neural-compressor

# Quantise the optimised decoder and write the result next to it.
# The output path previously had two file names pasted together
# ("model_decoder_opt.onnxmodel_decoder_int4.onnx"); it now points at
# model_decoder_int4.onnx, the file the ONNX Runtime quick test loads.
# NOTE(review): confirm that the installed neural-compressor version provides
# an `inc_quantizer` entry point with these flags.
inc_quantizer \
  --model LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_opt.onnx \
  --output LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_int4.onnx \
  --approach static  --performance-only


# Quick test in ONNX Runtime (Don't run)

In [None]:
import onnxruntime as ort
import numpy as np
import torch
from transformers import AutoTokenizer

# Smoke test: tokenise one prompt, push it through the quantised ONNX decoder
# once, and print the shape of the first output.
tok = AutoTokenizer.from_pretrained("/home/cc/models/llama2-legal-merged")

# Execution providers are listed in priority order: TensorRT first, then CUDA.
sess = ort.InferenceSession(
    "/home/cc/models/llama2-legal-onnx/model_decoder_int4.onnx",
    providers=["TensorrtExecutionProvider", "CUDAExecutionProvider"],
)

# Ask the tokenizer for NumPy arrays so they can be fed to the session directly.
prompt = tok("One‑sentence summary of clause 7.2:", return_tensors="np")
feeds = {key: prompt[key] for key in ("input_ids", "attention_mask")}

outputs = sess.run(None, feeds)
# Sanity check: the first output is expected to be (1, seq_len, vocab).
print(outputs[0].shape)

# Wrap with FastAPI or Triton (Don't run)

In [None]:
# Start Triton Inference Server in the background with every GPU attached.
# Host port 8000 is mapped to container port 8000, and the host directory
# /home/cc/triton_repo is mounted at /models as the model repository.
docker run --detach --gpus all \
  --publish 8000:8000 \
  --volume /home/cc/triton_repo:/models \
  nvcr.io/nvidia/tritonserver:24.05-py3 \
  tritonserver --model-repository=/models


# Build a FastAPI ONNX micro-service (pattern from the hand-out)
````
# Build the image(s) and start the FastAPI service in the background.
docker compose --file docker-compose-fastapi.yaml up --detach --build
````

````
# Send one JSON prompt to the /generate endpoint.
# Replace <IP> with the address of the host running the service.
curl --request POST http://<IP>:8000/generate \
     --header "Content-Type: application/json" \
     --data '{"prompt":"Summarise clause 7.2 in two lines"}'
````
