# Merge the LoRA adapter into the base model

In [2]:
# Pin the exact transformers / PEFT releases used for the merge below.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install "transformers==4.37.2" "peft==0.7.1"

Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.37.2)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.37.2)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers==4.37.2)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting accelerate>=0.21.0 (from peft==0.7.1)
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub<1.0,>=0.19.

In [3]:
from huggingface_hub import login
import os

# Never hardcode an access token in a notebook. The token is read from the
# HF_TOKEN environment variable; when the variable is unset, login(token=None)
# falls back to huggingface_hub's interactive prompt.
# SECURITY: the token that was previously written in this cell has been exposed
# and must be revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))

In [4]:
import json
from pathlib import Path

def clean_adapter_config(config_path):
    """Remove the keys listed in ``UNNEEDED_KEYS`` from an adapter config, in place.

    The JSON file at ``config_path`` is loaded, every key in ``UNNEEDED_KEYS``
    that is present is dropped, and the file is rewritten with 2-space
    indentation. One line is printed per removed key, followed by a final
    confirmation line.

    Args:
        config_path: Path (``str`` or path-like) of the ``adapter_config.json``
            file to rewrite.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    UNNEEDED_KEYS = [
        "corda_config",
        "eva_config",
        "megatron_config",
        "megatron_core",
        "loftq_config",
        "layers_pattern",
        "layer_replication",
        "auto_mapping",
        "revision",
        "modules_to_save",
        "trainable_token_indices",
        "use_dora",
        "use_rslora",
        "rank_pattern",
        "fan_in_fan_out",
        "init_lora_weights",
        "exclude_modules",
        "lora_bias",
        "layers_to_transform"
    ]

    # Options that are off/empty by default. If one of them carries a value, the
    # adapter was presumably trained with it, and loading the adapter without it
    # may change (or break) the merged weights - so the removal is reported loudly
    # instead of silently. NOTE(review): confirm this list against the PEFT
    # version that produced the adapter.
    WEIGHT_AFFECTING_KEYS = {
        "modules_to_save",
        "trainable_token_indices",
        "use_dora",
        "use_rslora",
        "rank_pattern",
        "fan_in_fan_out",
        "lora_bias",
    }

    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)

    for key in UNNEEDED_KEYS:
        if key in config:
            print(f"🧹 Removing: {key}")
            removed_value = config.pop(key)
            if key in WEIGHT_AFFECTING_KEYS and removed_value:
                print(
                    f"WARNING: dropping {key}={removed_value!r}; the adapter was "
                    "saved with this option set, so the merged weights may differ."
                )

    with open(path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"Cleaned config saved to: {path}")

# Strip the unneeded keys from the fine-tuned adapter's config before the
# adapter is loaded for merging (presumably because the pinned PEFT release
# does not accept them - TODO confirm).
clean_adapter_config(config_path="../fine_tuned_lora_model/adapter_config.json")


Cleaned config saved to: ../fine_tuned_lora_model/adapter_config.json


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel
import torch
import pathlib

BASE = "meta-llama/Llama-2-7b-hf"
ADAPTER = "../fine_tuned_lora_model"
MERGED = pathlib.Path("../llama2-legal-merged")

# Load the base model in fp16 together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(BASE)

# Attach the LoRA adapter on top of the base weights.
model = PeftModel.from_pretrained(model, ADAPTER)

# Fold the LoRA deltas into the base weights and drop the PEFT wrapper.
# merge_and_unload() RETURNS the plain transformers model, so the return value
# must be kept: calling save_pretrained() on the PeftModel wrapper would write
# adapter files (adapter_model.bin) rather than a full checkpoint.
model = model.merge_and_unload()

# Saving the merged transformers model writes config.json and the full weights
# (pytorch_model*.bin, because safe_serialization=False), so neither a separate
# config save nor a manual rename of the weight file is needed afterwards.
model.save_pretrained(MERGED, safe_serialization=False)
tokenizer.save_pretrained(MERGED)



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

('../llama2-legal-merged/tokenizer_config.json',
 '../llama2-legal-merged/special_tokens_map.json',
 '../llama2-legal-merged/tokenizer.json')

In [6]:
# Re-runnable version of the one-off `mv`: rename adapter_model.bin to
# pytorch_model.bin only when it is actually present, so running this cell a
# second time (or after a save that produced no adapter_model.bin) is a no-op
# instead of an error.
# NOTE(review): adapter_model.bin is the file name PEFT uses for adapter-only
# weights; renaming it does not turn it into a full model checkpoint. Verify
# that the preceding cell saved the merged model.
from pathlib import Path

_adapter_bin = Path("../llama2-legal-merged/adapter_model.bin")
if _adapter_bin.exists():
    # Path.replace overwrites an existing destination, matching `mv`.
    _adapter_bin.replace(_adapter_bin.with_name("pytorch_model.bin"))

# Export to ONNX

In [7]:
# Exporter tooling: Optimum with its exporter extras, ONNX, and the GPU build of
# ONNX Runtime. The extras spec is quoted so that shells which glob on [...]
# (e.g. zsh) pass it through unchanged; %pip targets the running kernel's
# environment. The recorded run resolved optimum 1.24.0 / onnxruntime 1.22.0.
%pip install "optimum[exporters]" onnx onnxruntime-gpu

Collecting optimum[exporters]
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting onnxruntime (from optimum[exporters])
  Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting timm (from optimum[exporters])
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading optimum-1.24.0-py3-none-any.whl (433 kB)
Downloading timm-1.0.15-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: onnxruntime, timm, optimum
Successfully installed onnxruntime-1.22.0 optimum-1.24.0 timm-1.0.15


In [None]:
# TensorRT-LLM wheels are served from NVIDIA's package index. --no-cache-dir
# keeps the ~2 GB wheel out of pip's cache; %pip targets the running kernel's
# environment.
# NOTE(review): tensorrt-llm brings its own dependency pins, which may override
# the transformers==4.37.2 pin from the first cell - check `pip list` afterwards.
%pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt-llm==0.19.0

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting tensorrt-llm==0.19.0
  Downloading https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.19.0-cp312-cp312-linux_x86_64.whl (2049.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 GB[0m [31m111.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting build (from tensorrt-llm==0.19.0)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting colored (from tensorrt-llm==0.19.0)
  Downloading colored-2.3.0-py3-none-any.whl.metadata (3.6 kB)
Collecting cuda-python (from tensorrt-llm==0.19.0)
  Downloading cuda_python-12.9.0-py3-none-any.whl.metadata (4.6 kB)
Collecting diffusers>=0.27.0 (from tensorrt-llm==0.19.0)
  Downloading diffusers-0.33.1-py3-none-any.whl.metadata (19 kB)
Collecting lark (from tensorrt-llm==0.19.0)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting mpi4py (from tensorrt-llm==0.19.0)
  Downloading mpi4py-4.0.3.tar.gz (4

In [None]:
import mpi4py, tensorrt_llm as trt, torch
# Report the versions of the freshly installed stack, one "label value" line each.
for label, version in (
    ("mpi4py:", mpi4py.__version__),
    ("TRT‑LLM:", trt.__version__),
    ("CUDA:", torch.version.cuda),
):
    print(label, version)

In [None]:
# Attempt to export the merged checkpoint (../llama2-legal-merged) to a
# TensorRT-LLM engine under ../llama2-legal-trtllm: fp16, batch size 1,
# sequence length 4096.
# NOTE(review): this cell has no execution count, i.e. it was never run here.
# Confirm that the installed optimum actually provides a `trtllm` export
# subcommand with these flags before relying on it.
!optimum-cli export trtllm \
  --model ../llama2-legal-merged \
  --task causal-lm-with-past \
  --dtype fp16 \
  --batch-size 1 \
  --sequence-length 4096 \
  ../llama2-legal-trtllm 

In [10]:
# Export the merged checkpoint (../llama2-legal-merged) to ONNX under
# ../llama2-legal-onnx, in fp16, running the export on the CUDA device and
# loading the model through the transformers library.
# NOTE(review): `causal-lm-with-past` looks like the legacy spelling of
# `text-generation-with-past` - confirm against the installed optimum version.
!optimum-cli export onnx \
  --model ../llama2-legal-merged \
  --task causal-lm-with-past \
  --dtype fp16 \
  --device cuda \
  --library transformers \
  ../llama2-legal-onnx

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.03s/it]
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-

# Graph-optimise & kernel-fuse (not run; kept for reference only)

In [None]:
# Shell command, not Python: run it from a terminal (or prefix it with `!`
# inside a notebook cell). It was never executed in this notebook.
# Reads model_decoder.onnx and writes an optimised copy with --float16 set.
# NOTE(review): no cell in this notebook installs `onnxruntime_tools`, and the
# ONNX export cell writes to ../llama2-legal-onnx - confirm the package and
# the input/output paths before running.
python -m onnxruntime_tools.optimizer_cli \
       --input LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder.onnx \
       --output LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_opt.onnx \
       --float16


# (Optional) INT-4 / INT-8 quantisation (not run; kept for reference only)

In [None]:
# Shell commands, not Python: run them from a terminal (or prefix each command
# with `!` inside a notebook cell). They were never executed in this notebook.
# NOTE(review): confirm that neural-compressor really provides an
# `inc_quantizer` command with these flags before relying on it.
pip install neural-compressor

# --output names the quantised model file (model_decoder_int4.onnx, the file
# name the quick-test cell loads). The previous value had the input file name
# pasted in front of it.
inc_quantizer \
  --model LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_opt.onnx \
  --output LLM_LegalDocSummarization/llama2-legal-onnx/model_decoder_int4.onnx \
  --approach static  --performance-only


# Quick test in ONNX Runtime (do not run)

In [None]:
import onnxruntime as ort, numpy as np, torch
from transformers import AutoTokenizer

# Tokenizer of the merged checkpoint.
tok = AutoTokenizer.from_pretrained("/home/cc/models/llama2-legal-merged")

# ONNX Runtime session over the quantised decoder; only the TensorRT and CUDA
# execution providers are requested.
sess = ort.InferenceSession(
    "/home/cc/models/llama2-legal-onnx/model_decoder_int4.onnx",
    providers=["TensorrtExecutionProvider", "CUDAExecutionProvider"],
)

# Tokenise one prompt into NumPy arrays and run a single forward pass.
prompt = tok("One‑sentence summary of clause 7.2:", return_tensors="np")
feed = {name: prompt[name] for name in ("input_ids", "attention_mask")}
outputs = sess.run(None, feed)

# sanity: (1, seq_len, vocab)
print(outputs[0].shape)

In [46]:
# Optimum's exporter extras pulled in the CPU-only `onnxruntime` wheel, which is
# imported under the same package name as `onnxruntime-gpu`. Remove BOTH builds
# first and then install the GPU build again: uninstalling only the CPU wheel
# can take shared files with it while pip still reports onnxruntime-gpu as
# already satisfied. Restart the kernel afterwards so the new build is imported.
%pip uninstall -y onnxruntime onnxruntime-gpu
%pip install onnxruntime-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
# Refresh the apt package index and install the distribution's CUDA toolkit.
# NOTE(review): the recorded output of this cell shows a "[sudo] password"
# prompt, so in this environment the install needs root access and cannot
# finish non-interactively from the notebook - run it from a terminal.
!apt update
!apt install -y nvidia-cuda-toolkit

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[sudo] password for jovyan: 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[sudo] password for jovyan: 

In [1]:
import onnxruntime as ort
# Execution providers that this onnxruntime build reports as available.
ep_names = ort.get_available_providers()
print(ep_names)

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [39]:
import numpy as np
import torch
import onnxruntime as ort
from transformers import AutoTokenizer

# Tokenizer that was saved next to the merged checkpoint.
tokenizer = AutoTokenizer.from_pretrained("../llama2-legal-merged")

# Execution providers in order of preference, restricted to the ones this
# onnxruntime build reports as available.
providers = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
available_providers = ort.get_available_providers()
sess = ort.InferenceSession(
    "../llama2-legal-onnx/model.onnx",
    providers=[p for p in providers if p in available_providers],
)

# First provider attached to the session. The recorded run printed
# CPUExecutionProvider here and was then interrupted, so check this line
# before waiting on the generation loop.
print("Using provider:", sess.get_providers()[0])

# Input names the exported graph declares. Only these are fed below, so the
# loop also works with exports that do not take `position_ids`.
graph_input_names = {graph_input.name for graph_input in sess.get_inputs()}

# Prompt
prompt_text = "One-sentence summary of clause 7.2:"
inputs = tokenizer(prompt_text, return_tensors="np")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Greedy decoding: at most `max_new_tokens` tokens are appended, one per step.
# No KV cache is used - the whole sequence is fed through the model each step.
max_new_tokens = 50

for _ in range(max_new_tokens):
    # Positions 0..seq_len-1 for the single, unpadded sequence.
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]

    candidate_feed = {
        "input_ids": input_ids.astype(np.int64),
        "attention_mask": attention_mask.astype(np.int64),
        "position_ids": position_ids,
    }
    feed = {name: value for name, value in candidate_feed.items() if name in graph_input_names}
    outputs = sess.run(None, feed)

    logits = outputs[0]  # (1, seq_len, vocab_size)
    # Highest-scoring token at the last position.
    next_token_id = np.argmax(logits[:, -1, :], axis=-1)

    # Append the new token and extend the attention mask by one.
    input_ids = np.concatenate([input_ids, next_token_id[:, None]], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_token_id)[:, None]], axis=1)

    # Stop once the model emits end-of-sequence (the EOS token is already
    # appended, but it is dropped at decode time by skip_special_tokens).
    if next_token_id[0] == tokenizer.eos_token_id:
        break

# Decode and print
output_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(output_text)

Using provider: CPUExecutionProvider


KeyboardInterrupt: 

# Wrap with FastAPI or Triton (do not run)

In [None]:
# Shell command, not Python: run it from a terminal on the serving host.
# Starts the Triton Inference Server 24.05 image detached (-d) with all GPUs,
# publishes port 8000, and mounts /home/cc/triton_repo as the /models
# repository that tritonserver is pointed at.
docker run -d --gpus all -p 8000:8000 \
  -v /home/cc/triton_repo:/models \
  nvcr.io/nvidia/tritonserver:24.05-py3 \
  tritonserver --model-repository=/models


# Build a FastAPI ONNX micro-service (pattern from the hand-out)
````
docker compose -f docker-compose-fastapi.yaml up -d --build
````

````
curl -X POST http://<IP>:8000/generate \
     -H "Content-Type: application/json" \
     -d '{"prompt":"Summarise clause 7.2 in two lines"}'
````
