# Introduction

Using the Qwen2-VL 7B model to generate ground truth OCR data for SROIEv2 dataset.

In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import BitsAndBytesConfig
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader

import torch
import glob
import os

In [2]:
model_id = "Qwen/Qwen2-VL-7B-Instruct"

# flash_attention_2 for better acceleration and memory saving. Great for batched inference.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"
)

# Load processor
processor = AutoProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
image_paths_list = [
    '../../input/sroie_v2/SROIE2019/train/img/*.jpg', # Train.
    '../../input/sroie_v2/SROIE2019/test/img/*.jpg' # Test
] 

In [4]:
out_dir_list = [
    '../../input/qwen2_vl_7b_annots/train_annots', #Train
    '../../input/qwen2_vl_7b_annots/test_annots' # Test
]

## Batch Inference

Batch processing example
```python
messages1 = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/image1.jpg"},
            {"type": "image", "image": "file:///path/to/image2.jpg"},
            {"type": "text", "text": "What are the common elements in these pictures?"},
        ],
    }
]
messages2 = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]
# Combine messages for batch processing
messages = [messages1, messages1]
```

In [5]:
def batch_infer(messages):
    # Preparation for inference
    texts = [
            processor.apply_chat_template(
            msg, tokenize=False, add_generation_prompt=True
        )
        for msg in messages
    ]
    
    image_inputs, video_inputs = process_vision_info(messages)
    
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to("cuda")
    
    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # print(output_text)
    return output_text

In [6]:
batch_size = 2

In [7]:
class BatchedDataset(Dataset):
    def __init__(self, all_images):
        self.all_images = all_images

    def __len__(self):
        return len(self.all_images)

    def __getitem__(self, idx):
        return self.all_images[idx]

In [8]:
for image_path, out_dir in zip(image_paths_list, out_dir_list):
    all_images = glob.glob(image_path)
    os.makedirs(out_dir, exist_ok=True)
    
    custom_dataset = BatchedDataset(all_images)
    
    batch_dl = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False)
    
    print('####### Sample paths #######')
    for i in batch_dl:
        print(i)
        break
    print('####### Sample paths #######')
    
    for batch in tqdm(batch_dl, total=len(batch_dl)):
        messages = []
        
        for image_path in batch:
            message = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": image_path,
                            "resized_height": 768,
                            "resized_width": 512,
                        },
                        {"type": "text", "text": "Give the OCR text from this image and nothing else."},
                    ],
                }
            ]
            messages.append(message)
    
        texts = batch_infer(messages)
    
        for text, image_path in zip(texts, batch):
            # print(text)
            with open(os.path.join(out_dir, image_path.split(os.path.sep)[-1].split('.jpg')[0]+'.txt'), 'w') as f:
                f.write(text)

####### Sample paths #######
['../../input/sroie_v2/SROIE2019/train/img/X51006392122.jpg', '../../input/sroie_v2/SROIE2019/train/img/X00016469612.jpg']
####### Sample paths #######


  0%|          | 0/313 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.02 GiB. GPU 0 has a total capacity of 9.75 GiB of which 834.25 MiB is free. Including non-PyTorch memory, this process has 8.43 GiB memory in use. Of the allocated memory 7.14 GiB is allocated by PyTorch, and 1.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)