In [None]:
%pip install optimum[exporters] onnxruntime-openvino

In [1]:
# Model identifier handed to `optimum-cli export onnx --model` in the export cell.
model_name = "Qwen/Qwen2.5-Coder-0.5B"

In [None]:
# Export the checkpoint named by the Python variable `model_name` (interpolated
# into the shell command by IPython) to ONNX; the output directory is data/onnx.
!optimum-cli export onnx --model {model_name} data/onnx

In [None]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# The tokenizer and the ONNX Runtime model are both read from the directory
# that the export step wrote to.
onnx_dir = "data/onnx"
tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
model = ORTModelForCausalLM.from_pretrained(onnx_dir)


In [None]:
# Show which providers the model's underlying session object reports.
session = model.model
active_providers = session.get_providers()
print(active_providers)

In [None]:
from onnxruntime.transformers import optimizer
# Run the onnxruntime.transformers graph optimizer on the exported model file.
optimized_model = optimizer.optimize_model('data/onnx/model.onnx')
# Request float32 -> float16 conversion; the return value is not captured, so this
# presumably mutates `optimized_model` in place — TODO confirm.
optimized_model.convert_float_to_float16()
# NOTE(review): the optimized model is never written to disk here (the save call
# below is commented out, and its "bert_fp16.onnx" name looks like a leftover from
# another example), so the visible cells that follow keep using the unoptimized `model`.
# optimized_model.save_model_to_file("bert_fp16.onnx")

In [None]:
import time
import torch

# Look up the vocabulary ids of the fill-in-the-middle (FIM) control tokens that
# generate_response uses to frame its prompt and to stop generation.
fim_prefix_id, fim_suffix_id, fim_middle_id, fim_pad_id = (
    tokenizer.convert_tokens_to_ids(fim_token)
    for fim_token in ("<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>", "<|fim_pad|>")
)

# Give the model's generation config an explicit pad token id, taken from the tokenizer.
model.generation_config.pad_token_id = tokenizer.pad_token_id

def generate_response(prefix, suffix, max_new_tokens=20):
    """Generate a fill-in-the-middle completion for the gap between two code fragments.

    The prompt is assembled as
    ``<|fim_prefix|>`` + prefix tokens + ``<|fim_suffix|>`` + suffix tokens + ``<|fim_middle|>``
    and passed to the module-level ``model``; only the newly generated tokens are decoded.
    A one-line summary (token counts, elapsed milliseconds, throughput) is printed.

    Args:
        prefix: Text that precedes the gap to fill.
        suffix: Text that follows the gap to fill (may be an empty string).
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded text of the generated tokens. Special tokens are not stripped,
        so a stop token that ended generation may appear in the result.
    """
    # perf_counter is a monotonic, high-resolution clock meant for interval timing.
    start = time.perf_counter()

    # Prefix and suffix are tokenized separately, without automatic special tokens,
    # so the FIM control tokens can be placed explicitly between them.
    prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]
    suffix_ids = tokenizer(suffix, add_special_tokens=False)["input_ids"]

    prompt_ids = [fim_prefix_id] + prefix_ids + [fim_suffix_id] + suffix_ids + [fim_middle_id]
    input_ids = torch.tensor([prompt_ids], dtype=torch.int64)
    inputs = {
        'input_ids': input_ids,
        # Single unpadded sequence: every position is attended to.
        'attention_mask': torch.ones_like(input_ids),
    }
    input_token_count = input_ids.shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Stop on end-of-sequence or when the model emits a FIM pad/suffix token.
        eos_token_id=[tokenizer.eos_token_id, fim_pad_id, fim_suffix_id],
    )
    # generate() returns prompt + completion; keep only the completion.
    outputs = outputs[:, input_token_count:]
    output_token_count = outputs.shape[-1]

    # Measure elapsed time once so the reported milliseconds and throughput agree.
    elapsed = time.perf_counter() - start
    total_tokens = output_token_count + input_token_count
    tokens_per_second = total_tokens / elapsed if elapsed > 0 else float("inf")
    print("Total input tokens: {} Output token count: {} Time: {}ms Token/s {}".format(
        input_token_count, output_token_count, 1000 * elapsed, tokens_per_second))
    return tokenizer.decode(outputs[0])
# Example usage: a long repeated prefix followed by a short JavaScript snippet to
# complete, with an empty suffix.
example_prefix = 'print("Total input tokens: {} Output token count:' * 10 + '\n// Log Hello World\nconsole.'
example_suffix = ''
response = generate_response(example_prefix, example_suffix)
print(f"Response: {response}")