In [1]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# A single checkpoint name feeds both loads, so the processor and the weights
# always come from the same repository.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.31s/it]


In [3]:
import torch
# Move the model to the GPU only when CUDA is usable; otherwise it stays
# on the device it was loaded on.
gpu_present = torch.cuda.is_available()
if gpu_present:
    model = model.to("cuda")

In [4]:
# Relative path of the image that is attached to the chat message as its "image" entry.
file_name = "../attachments/gui2.jpg"

In [5]:
# Single-turn conversation: one user message carrying the image followed by
# the text prompt that lists the fields to extract.
image_part = {
    "type": "image",
    "image": file_name,
    "resized_height": 696,
    "resized_width": 943,
}
prompt_part = {
    "type": "text",
    "text": "Retrieve invoice_number, date_of_issue, seller_info, client_info, invoice_items_table, currency, invoice_summary_table. Response must be in JSON format",
}

messages = [{"role": "user", "content": [image_part, prompt_part]}]

In [6]:
# Render the chat messages into a prompt string (not token ids) that ends with
# the generation prompt for the model's reply.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Collect the image/video payloads referenced by the messages and build the
# batched tensor inputs for a single-prompt batch.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

# Place the inputs wherever the model actually lives instead of re-probing
# CUDA, so model and inputs cannot end up on different devices.
device = model.device
inputs = inputs.to(device)

generated_ids = model.generate(**inputs, max_new_tokens=1024)

# generate() returns prompt + continuation; keep only the newly generated tail
# of each sequence by dropping as many leading ids as the prompt had.
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
]

# NOTE(review): clean_up_tokenization_spaces=True may alter spacing inside the
# decoded text -- confirm this is wanted for JSON payloads.
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)

output_text

['```json\n{\n  "invoice_number": "24/12/2025",\n  "date_of_issue": "04. October 2024",\n  "seller_info": {\n    "name": "PSI Services SA",\n    "address": "17, Rue de Flaxweiler - L-6776 Grevenmacher"\n  },\n  "client_info": {\n    "name": "Sir or Madam"\n  },\n  "invoice_items_table": [\n    {\n      "position": 1,\n      "description": "Administration costs September 2024",\n      "vat_percent": 0.00,\n      "net_amount": 12567.09,\n      "vat_amount": 0.01,\n      "gross_amount": 23546.11\n    }\n  ],\n  "currency": "CHF",\n  "invoice_summary_table": [\n    {\n      "total": 12567.09,\n      "vat_percent": 0.00,\n      "net_amount": 12567.09,\n      "vat_amount": 0.01,\n      "gross_amount": 23546.11\n    }\n  ]\n}\n```']

In [22]:
def _strip_code_fence(reply):
    """Return `reply` without a surrounding Markdown code fence.

    Handles both a bare fence and one with a language tag (for example the
    "json" tag). Text that is not fenced is returned unchanged apart from
    surrounding whitespace. Nothing inside the payload is touched, so
    brackets and apostrophes that belong to the JSON survive.
    """
    body = reply.strip()
    if body.startswith("```"):
        # Everything up to the first newline is the opening fence line.
        body = body.partition("\n")[2].rstrip()
        if body.endswith("```"):
            body = body[:-3]
    return body.strip()


def _client_name(invoice):
    """Return the client name from a parsed reply, or None if there is none.

    Two layouts are accepted: a flat "client" string, or a nested
    "client_info" object with a "name" field.
    """
    if not isinstance(invoice, dict):
        return None
    name = invoice.get("client")
    if name is None:
        info = invoice.get("client_info")
        if isinstance(info, dict):
            name = info.get("name")
    if isinstance(name, str) and name.strip():
        return name
    return None


def _safe_file_stem(name):
    """Turn `name` into a file stem that cannot leave the target directory.

    Every character other than letters, digits, space, '.', '_' and '-' is
    replaced by '_'; leading/trailing spaces and dots are removed.
    """
    cleaned = "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in name)
    return cleaned.strip(" .")


# Only the first (and, for a single prompt, only) decoded reply is processed.
json_string = _strip_code_fence(output_text[0])
print(json_string)

try:
    formatted_json = json.loads(json_string)
except json.JSONDecodeError as e:
    print("Not valid JSON format:", e)
else:
    client = _client_name(formatted_json)
    stem = _safe_file_stem(client) if client else ""
    if stem:
        # One file per client name, written next to the other invoice data.
        with open('../Data/InvoiceData/' + stem + '.json', 'w', encoding='utf-8') as f:
            json.dump(formatted_json, f)
    else:
        # Without a usable name there is no file name to write to; report it
        # instead of failing with a KeyError.
        print("No client name found in the extracted JSON; nothing was saved.")

    print(json.dumps(formatted_json, indent=3))

{
  "client": "PSI Concepts SA",
  "date": "04. October 2024",
  "vat": "0.00%",
  "brutto": "12.567.09",
  "net": "0.01",
  "currency": "CHF"
}
{
   "client": "PSI Concepts SA",
   "date": "04. October 2024",
   "vat": "0.00%",
   "brutto": "12.567.09",
   "net": "0.01",
   "currency": "CHF"
}
