# Load BeamSearch ONNX model
This assumes you've run `onnxruntime.transformers.convert_generation` from `convert_to_onnx.ipynb`.

In [2]:
import onnx
task = 'titles'
# Load full beam search onnx model
base_path = "../data/models/onnx/v2.0.0/beam_search_model"
# Load the quantized BeamSearch model from disk. `task_name` is deliberately left
# empty here, so any path built from it has an empty task component.
beam_model_path = f"{base_path}/quantized.onnx"
beam_model = onnx.load(beam_model_path)
task_name = ''
quantized = True
# Get the BeamSearch node from the graph. Look it up by op type instead of
# assuming it is the first node, and fail with a clear message if it is missing.
beam_search = next(
    (node for node in beam_model.graph.node if node.op_type == 'BeamSearch'),
    None,
)
assert beam_search is not None, f"No BeamSearch node found in {beam_model_path}"

# Optimize model using ORT

In [15]:
from onnx import numpy_helper, TensorProto
import numpy as np
import h5py
import os

# TODO: automate
# The LoRA weights are named like "onnx::MatMul_1520_" + some postfix for the quantization.
# We find these weights by finding the names of the LoRA layers in the model state_dict,
# searching the BeamSearch model for these layer names, and then recording their numeric ids here.
lora_initializers = [1520,1531,1537,1548,1554,1565,1571,1582,1588,1599,1605,1616,949,960,979,990,1003,1014,
                     1027,1038,1051,1062,1075,1086,966,994,1018,1042,1066,1090,1670, 1798, 1674, 1802, 1813,
                     1685, 1691, 1787, 1702, 1706, 1717, 1723, 1734, 1738, 1749, 1625, 1755, 1636, 1766, 1640,
                     1770, 1651, 1781, 1659]

# Ids as strings, for exact O(1) comparison against the id token of an initializer name.
lora_initializer_ids = {str(lora_init) for lora_init in lora_initializers}

# Every TensorProto field that can carry inline tensor data.
TENSOR_DATA_FIELDS = ("raw_data", "double_data", "float_data", "int32_data", "int64_data", "uint64_data")


def _is_lora_initializer(name):
    """Return True if `name` is an "onnx::MatMul_<id>..." initializer whose <id> is a LoRA id.

    The id is compared as a whole token rather than as a substring, so that
    id 949 does not accidentally match e.g. "onnx::MatMul_1949_quantized".
    """
    if not name.startswith("onnx::MatMul"):
        return False
    tokens = name.split("_")
    return len(tokens) > 1 and tokens[1] in lora_initializer_ids


# Create a directory to store the initializers
task_initializers_path = f"../initializers/{task_name}"
if not os.path.exists(task_initializers_path):
    os.makedirs(task_initializers_path)
else:
    # Delete all stale files in the directory. Sub-directories are skipped:
    # os.remove cannot delete them and would abort the cell on a re-run.
    for file in os.listdir(task_initializers_path):
        stale_path = os.path.join(task_initializers_path, file)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

# NOTE: this cell mutates `beam_model` in place (tensor data is stripped from the
# subgraph initializers). Reload the model before running this cell a second time.
count_total_init = 0
count_init = 0

# Create a new HDF5 file; the `with` block closes it, no explicit close() is needed
with h5py.File(f'../initializers/{task_name}/initializers.h5', 'w') as f:

    # encoder and decoder subgraphs each have LoRA layers
    for model in ["encoder", "decoder"]:
        # find the subgraph ('g' is the attribute field that holds a subgraph)
        subgraph = next((attr.g for attr in beam_search.attribute if attr.name == model), None)
        assert subgraph is not None, f"BeamSearch node has no '{model}' subgraph attribute"

        for initializer in subgraph.initializer:
            count_total_init += 1

            # Save all non-LoRA initializers; LoRA weights and "*output_0"
            # initializers stay embedded in the model.
            if _is_lora_initializer(initializer.name) or initializer.name.endswith("output_0"):
                continue

            initializer_data = np.array(numpy_helper.to_array(initializer))

            # One dataset per saved initializer, keyed by its running index
            # ("0", "1", ...). The original tensor type and name are attributes.
            dset = f.create_dataset(str(count_init), data=initializer_data)
            dset.attrs['type'] = initializer.data_type
            dset.attrs['name'] = initializer.name
            count_init += 1

            # Remove the inline tensor data from the TensorProto, whichever field holds it
            for field in TENSOR_DATA_FIELDS:
                initializer.ClearField(field)

            # Set the data location to an external dummy file
            initializer.data_location = TensorProto.EXTERNAL
            location = initializer.external_data.add()
            location.key = "location"
            location.value = "./dummy_data.txt"

print(f"Total initializers: {count_total_init}")
print(f"Initializers saved: {count_init}")

# Save the modified model
modified_beam_model_path = f"{base_path}/lora_inits_only.onnx"
onnx.save(beam_model, modified_beam_model_path)

Total initializers: 894
Initializers saved: 342


# Test original BeamSearch onnx model (no LoRA swapping)

In [3]:
import onnxruntime as ort
import datetime
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-small')
# Create onnx inference session for the unmodified BeamSearch model
sess = ort.InferenceSession(beam_model_path)

# Create inputs
input_string = '''console.log('Hello World!');'''

# Ask the tokenizer for NumPy output directly; this avoids needing PyTorch
# just to convert the ids. The ids are cast to int32 before being fed to the model.
input_ids = np.asarray(tokenizer(input_string, return_tensors='np')['input_ids'], dtype=np.int32)
print("input tokens:", input_ids)

# Generation parameters, each passed as a one-element array.
inputs = {
    'input_ids': input_ids,
    'max_length': np.array([128], dtype=np.int32),
    'min_length': np.array([0], dtype=np.int32),
    'num_beams': np.array([4], dtype=np.int32),
    'num_return_sequences': np.array([1], dtype=np.int32),
    'length_penalty': np.array([80.0], dtype=np.float32),
    'repetition_penalty': np.array([50.0], dtype=np.float32),
}

# Run inference and time the single call
start = datetime.datetime.now()
outputs = sess.run(None, inputs)
end = datetime.datetime.now()
print("Baseline time (ms):", (end - start).total_seconds() * 1000)

# First output, first batch entry, first returned sequence
baseline_output = outputs[0][0][0]
print("output tokens:", baseline_output)
print("Baseline output:", tokenizer.decode(baseline_output, skip_special_tokens=True))


2023-06-08 11:38:49.461185 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:38:49.461205 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:38:49.461210 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:38:49.461215 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be remo

input tokens: [[    1  8698    18  1330  2668 18601 21820  5124  1769     2]]
Baseline time (ms): 55.716
output tokens: [    0     1  4528  2207 23669    18  8683 15990    18  3341    19   275
    17  3378    19  8532    19  4079    19 10215    19  1343     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
Baseline output: https://developer.mozilla.org/en-US/docs/Web/Console/Log


# Test BeamSearch with swapped LoRA weights from HDF5 files

In [4]:
from onnx import numpy_helper, TensorProto
import numpy as np
import h5py
# Initialize ort session options
options = ort.SessionOptions()

# Every OrtValue passed to add_initializer is also appended to this list so
# that it is not garbage collected:
# https://stackoverflow.com/questions/74139718/onnx-runtime-adding-multiple-initializers-in-python
ortvalue_initializers = []
task_initializers_path = '../data/models/onnx/v2.0.0/initializers/initializers.h5'
modified_beam_model_path = '../data/models/onnx/v2.0.0/beam_search_model/lora_inits_only.onnx'

# Open the h5 file; the `with` block closes it, no explicit close() is needed
with h5py.File(task_initializers_path, 'r') as f:
    # Datasets are keyed "0" .. "N-1"; take N from the file itself instead of
    # hard-coding the count printed by the export cell.
    count_init = len(f)
    for i in range(count_init):
        dset = f[str(i)]

        # get data from h5 file
        arr_h5 = dset[()]

        # detect if scalar. Must be a np array for ortvalue_from_numpy
        if arr_h5.shape == ():
            arr_h5 = np.array([arr_h5])

        # create OrtValue from numpy array
        ortvalue_initializers.append(ort.OrtValue.ortvalue_from_numpy(arr_h5))

        # get initializer name from 'name' attribute
        initializer_name = dset.attrs['name']

        # add initializer to session options
        options.add_initializer(initializer_name, ortvalue_initializers[-1])

# create new session with initializers
new_session = ort.InferenceSession(
    modified_beam_model_path,
    sess_options=options,
)

# Run inference
start = datetime.datetime.now()
outputs = new_session.run(None, inputs)
end = datetime.datetime.now()

print("LORA time (ms):", (end - start).total_seconds() * 1000)
lora_output = outputs[0][0][0]

# Check that outputs match the baseline run
assert np.array_equal(baseline_output, lora_output), "LoRA-swapped output differs from baseline output"

# print("Lora output:", lora_output)
print(f"input: {input_string}")
print("out of app output:", tokenizer.decode(lora_output, skip_special_tokens=True))
print("in app output: Python Code.")

2023-06-08 11:45:02.832476 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:45:02.832497 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:45:02.832504 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.
2023-06-08 11:45:02.832509 [W:onnxruntime:, graph.cc:3487 CleanUnusedInitializersAndNodeArgs] Removing initializer '/t5_decoder_init/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be remo

RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Exception during initialization: /Users/runner/work/1/s/onnxruntime/core/optimizer/initializer.cc:43 onnxruntime::Initializer::Initializer(const onnx::TensorProto &, const onnxruntime::Path &) [ONNXRuntimeError] : 1 : FAIL : tensorprotoutils.cc:625 GetExtDataFromTensorProto External initializer: decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight_scale offset: 0 size to read: 4 given file_length: 1 are out of bounds or can not be read in full.


In [25]:
# Build a session for the model stored under the "latest/titles" directory,
# passing the already-configured session `options`.
latest_titles_model_path = "../data/models/latest/titles/lora_inits_only.onnx"
new_session = ort.InferenceSession(latest_titles_model_path, sess_options=options)

# Time a single inference call
start = datetime.datetime.now()
outputs = new_session.run(None, inputs)
end = datetime.datetime.now()
elapsed_ms = (end - start).total_seconds() * 1000

print("LORA time (ms):", elapsed_ms)

# First output, first batch entry, first returned sequence
lora_output = outputs[0][0][0]
decoded_output = tokenizer.decode(lora_output, skip_special_tokens=True)

print("Lora output:", lora_output)
print("tokenized output:", decoded_output)

NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from ../data/models/latest/titles/lora_inits_only.onnx failed:Load model ../data/models/latest/titles/lora_inits_only.onnx failed. File doesn't exist