In [26]:
!pip install openvino



In [27]:
!pip install openvino-dev[onnx]



In [28]:
!pip install flash-attn



In [29]:
!pip install timm



In [30]:
import numpy as np
import requests
from openvino.runtime import Core
from transformers import AutoProcessor
from PIL import Image
from io import BytesIO
import torch
import onnx
import os
from transformers import AutoProcessor, AutoModelForCausalLM
import warnings
warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [31]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base-ft", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True)

Disable flash attention

In [32]:
def disable_flash_attention(model):
    for name, module in model.named_modules():
        if hasattr(module, 'use_flash_attention'):
            print(f"Disabling flash attention in {name}")
            module.use_flash_attention = False

Handle conditional code

In [33]:
def handle_conditional_code(model):
    for name, module in model.named_modules():
        if "non_traceable" in name:
            print(f"Handling non-traceable section in {name}")
            pass

In [34]:
disable_flash_attention(model)
handle_conditional_code(model)

creating input for text and image

In [35]:
dummy_input_ids = torch.ones(1, 12, dtype=torch.long).to(device)
dummy_pixel_values = torch.randn(1, 3, 224, 224).to(device)
dummy_decoder_input_ids = torch.ones(1, 12, dtype=torch.long).to(device)
dummy_attention_mask = torch.ones_like(dummy_input_ids).to(device)
dummy_decoder_attention_mask = torch.ones_like(dummy_decoder_input_ids).to(device)

exporting the model to onnx

In [36]:
onnx_model_path = "florence_cpu_model.onnx"
torch.onnx.export(
    model,
    (dummy_input_ids, dummy_pixel_values, dummy_decoder_input_ids, dummy_attention_mask, dummy_decoder_attention_mask),
    onnx_model_path,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=['input_ids', 'pixel_values', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size'},
        'pixel_values': {0: 'batch_size'},
        'decoder_input_ids': {0: 'batch_size'},
        'attention_mask': {0: 'batch_size'},
        'decoder_attention_mask': {0: 'batch_size'},
        'output': {0: 'batch_size'}
    }
)

converting onnx to openvino model

In [37]:
print(f"model exported to onxx format at {onnx_model_path}")
os.system(f"mo --input_model {onnx_model_path} --output_dir ./openvino_model")
print("model successfully converted to OpenVino format")

model exported to onxx format at florence_cpu_model.onnx
model successfully converted to OpenVino format


# Inference Script

In [38]:
#decoding method for better output
def softmax(x, temperature=1.0):
    x = x / temperature
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)

def top_k_sampling(probs, k=5, penalty_factor=0.9, penalize_tokens=['<loc_0>']):
    batch_size, seq_len, vocab_size = probs.shape
    indices = np.argpartition(probs, -k, axis=-1)[:, :, -k:]
    selected_indices = np.zeros((batch_size, seq_len), dtype=np.int64)
    for i in range(seq_len):
        choices = indices[0, i, :]
        weighted_probs = probs[0, i, choices] * penalty_factor
        for j, choice in enumerate(choices):
            token_str = processor.tokenizer.convert_ids_to_tokens([choice])[0]
            if token_str in penalize_tokens:
                weighted_probs[j] *= 0.1
        selected_indices[:, i] = np.random.choice(choices, p=weighted_probs / weighted_probs.sum())
    return selected_indices

In [39]:
#loading the model
core = Core()
model_path = "./openvino_model/florence_cpu_model.xml"
compiled_model = core.compile_model(model_path, "CPU")
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base-ft", trust_remote_code=True)

In [40]:
print("model input names and potential shapes")
for input in compiled_model.inputs:
    shape = input.get_partial_shape()
    print(f"{input.get_any_name()}: {shape}")

model input names and potential shapes
input_ids: [?,12]
pixel_values: [?,3,224,224]
attention_mask: [?,12]
decoder_attention_mask: [?,12]


In [41]:
image_url = 'https://images.unsplash.com/photo-1501594907352-04cda38ebc29'
response = requests.get(image_url)
if 'image' in response.headers.get('Content-Type', ''):
    real_image = Image.open(BytesIO(response.content))
    print("Image successfully opened")
else:
    raise ValueError(f"failed to download image or invalid content type: {response.headers['Content-Type']}")
real_image = real_image.resize((224, 224))
image_array = np.array(real_image).astype(np.float32)
image_array = image_array.transpose(2, 0, 1)
image_array = np.expand_dims(image_array, axis=0)

Image successfully opened


In [42]:
text_input = "What does this image show?"
inputs = processor(text=[text_input], images=real_image, return_tensors="np", padding=False, truncation=False)

In [43]:
#preprocessing according to model
input_ids = inputs['input_ids']
if input_ids.shape[1] > 12:
    input_ids = input_ids[:, :12]
elif input_ids.shape[1] < 12:
    padding_length = 12 - input_ids.shape[1]
    input_ids = np.pad(input_ids, ((0, 0), (0, padding_length)), constant_values=processor.tokenizer.pad_token_id)

attention_mask = inputs['attention_mask']
if attention_mask.shape[1] > 12:
    attention_mask = attention_mask[:, :12]
elif attention_mask.shape[1] < 12:
    padding_length = 12 - attention_mask.shape[1]
    attention_mask = np.pad(attention_mask, ((0, 0), (0, padding_length)), constant_values=0)

In [44]:
decoder_attention_mask = attention_mask.copy()
inputs_dict = {
    "input_ids": input_ids.astype(np.int64),
    "pixel_values": image_array.astype(np.float32),
    "attention_mask": attention_mask.astype(np.int64),
    "decoder_attention_mask": decoder_attention_mask.astype(np.int64)
}

In [45]:
#running inference and applying deconding method
output = compiled_model(inputs_dict)
output_data = output['/language_model/Add_output_0']
output_probs = softmax(output_data, temperature=0.7)
predicted_token_ids = top_k_sampling(output_probs, k=5, penalty_factor=0.9, penalize_tokens=['<loc_0>'])
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_token_ids[0].tolist())

In [46]:
#post-processing
cleaned_tokens = []
for token in tokens:
    if token not in ['<s>', '</s>', '<pad>'] and not (token in cleaned_tokens and token in [',', '.', "'"]):
        cleaned_tokens.append(token)
decoded_output = " ".join(cleaned_tokens)
decoded_output = decoded_output.replace(" Ġ", " ").replace(" '", "'").strip()
decoded_output = decoded_output.replace(" ,", ",").replace(" .", ".").replace(" 's", "'s")
print("Decoded output:", decoded_output)

Decoded output: water  on


In [51]:
import torch
import transformers
import onnx
import openvino
import PIL
import numpy as np
import requests
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"ONNX version: {onnx.__version__}")
print(f"OpenVINO version: {openvino.__version__}")
print(f"Pillow (PIL) version: {PIL.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Requests version: {requests.__version__}")


PyTorch version: 2.4.1+cu121
Transformers version: 4.44.2
ONNX version: 1.16.0
OpenVINO version: 2024.4.0-16579-c3152d32c9c-releases/2024/4
Pillow (PIL) version: 10.4.0
Numpy version: 1.26.4
Requests version: 2.32.3
