In [2]:
import os
import io
import base64
import pandas as pd
from PIL import Image
import hashlib


def md5(s):
    """Return the hexadecimal MD5 digest of a file's contents or of a string.

    Args:
        s: Either the path of an existing regular file, whose bytes are
            hashed, or any other string, which is hashed as UTF-8 text.

    Returns:
        The digest as a lowercase hexadecimal ``str``.
    """
    digest = hashlib.md5()
    # Only regular files are read from disk. A plain existence check would
    # also match directories, and opening a directory raises instead of
    # producing a digest.
    if os.path.isfile(s):
        with open(s, 'rb') as f:
            # Read in 1 MiB chunks so large files are never loaded whole.
            for chunk in iter(lambda: f.read(2**20), b''):
                digest.update(chunk)
    else:
        digest.update(s.encode('utf-8'))
    return digest.hexdigest()

def encode_image(img, target_size=-1, fmt='JPEG'):
    """Serialize an image to a base64 string, optionally shrinking it first.

    Args:
        img: Image object exposing ``mode``, ``convert``, ``copy``,
            ``thumbnail`` and ``save`` (a ``PIL.Image.Image`` in this file).
        target_size: If positive, the encoded image is limited to a maximum
            size of ``(target_size, target_size)``; otherwise no resizing is
            done.
        fmt: Format name forwarded to ``img.save``.

    Returns:
        The saved image bytes encoded as a base64 ``str``.
    """
    if img.mode in ('RGBA', 'P'):
        img = img.convert('RGB')
    if target_size > 0:
        # thumbnail() resizes in place; operate on a copy so the caller's
        # image object keeps its original size.
        img = img.copy()
        img.thumbnail((target_size, target_size))
    img_buffer = io.BytesIO()
    img.save(img_buffer, format=fmt)
    return base64.b64encode(img_buffer.getvalue()).decode('utf-8')


def encode_image_to_base64(img, target_size=-1):
    """Encode ``img`` as a base64 string using ``encode_image``'s default format.

    Args:
        img: Image object to encode.
        target_size: Forwarded unchanged to ``encode_image``.

    Returns:
        Whatever ``encode_image`` returns for these arguments.
    """
    encoded = encode_image(img, target_size=target_size)
    return encoded


def decode_base64(base64_string, target_size=-1):
    """Decode a base64-encoded image into an image object.

    Args:
        base64_string: Base64 text (or bytes) holding an encoded image file.
        target_size: If positive, the decoded image is limited in place to a
            maximum size of ``(target_size, target_size)``; otherwise it is
            left at its decoded size.

    Returns:
        The opened image, converted to RGB when its mode is RGBA or P.
    """
    raw_bytes = base64.b64decode(base64_string)
    decoded = Image.open(io.BytesIO(raw_bytes))
    needs_rgb = decoded.mode in ('RGBA', 'P')
    result = decoded.convert('RGB') if needs_rgb else decoded
    if target_size > 0:
        bounding_box = (target_size, target_size)
        result.thumbnail(bounding_box)
    return result


def decode_base64_to_image(base64_string, target_size=-1):
    """Decode a base64-encoded image by delegating to ``decode_base64``.

    Args:
        base64_string: Base64 representation of an encoded image.
        target_size: Forwarded unchanged to ``decode_base64``.

    Returns:
        Whatever ``decode_base64`` returns for these arguments.
    """
    return decode_base64(base64_string, target_size=target_size)

def split_image(img, target_section=None, grid_size=(3, 3), encode=False):
    """
    Split an image into patches based on a given grid size.

    Args:
        img: Image object exposing ``size`` as ``(width, height)`` and
            ``crop(box)`` (a PIL.Image in this file).
        target_section: Not used by this function; accepted only so existing
            callers that pass it keep working.
        grid_size: Tuple specifying the number of rows and columns for the grid (rows, columns).
        encode: If true, every patch is passed through
            ``encode_image_to_base64``; otherwise the cropped objects are
            returned as-is.

    Returns:
        A dictionary mapping ``'grid_{rows}x{cols}_row{r}_col{c}'`` (1-based
        row and column numbers, row-major insertion order) to the patch.

    Raises:
        ValueError: If either entry of ``grid_size`` is smaller than 1.
    """
    width, height = img.size
    num_rows, num_cols = grid_size

    if num_rows < 1 or num_cols < 1:
        raise ValueError("grid_size must contain positive integers for rows and columns.")

    # Compute patch boundaries with integer arithmetic. Multiplying a float
    # patch size can land just below the true boundary (for example
    # 49 * (64 / 49) evaluates below 64), which would cut the last pixel
    # column or row off the image.
    col_edges = [col * width // num_cols for col in range(num_cols + 1)]
    row_edges = [row * height // num_rows for row in range(num_rows + 1)]

    grid_name = f'grid_{num_rows}x{num_cols}'
    patches = {}
    for row in range(num_rows):
        for col in range(num_cols):
            # Box order is (left, upper, right, lower).
            box = (col_edges[col], row_edges[row], col_edges[col + 1], row_edges[row + 1])
            patch = img.crop(box)
            section_name = f"row{row+1}_col{col+1}"
            patches[f'{grid_name}_{section_name}'] = encode_image_to_base64(patch) if encode else patch

    return patches

In [22]:
## RealWorldQA pre-processing
# Reads the RealWorldQA TSV, cuts every image into 2x1, 1x2, 2x2 and 3x3 grids,
# and writes one TSV per grid section whose `image` column holds that section's
# encoded crop.
file_path = '/home/srikapan/LMUData/RealWorldQA.tsv'
df = pd.read_csv(file_path, sep='\t')

# Maps a column label to a list with one entry per row of df. 'image' keeps the
# original encodings; every other label is one grid section.
all_list_of_imgs = {}
all_list_of_imgs['image'] = df.image.tolist()

for encoded_img in df['image']:
    img = decode_base64_to_image(encoded_img)
    for grid_size in ((2, 1), (1, 2), (2, 2), (3, 3)):
        for label, crop in split_image(img, grid_size=grid_size, encode=True).items():
            all_list_of_imgs.setdefault(label, []).append(crop)

# The source file is only read in this cell, so report it as loaded, not written.
print(f'original file_path: {file_path} loaded successfully, md5: {md5(file_path)}')

output_dir = os.path.dirname(file_path)
# splitext strips only the final extension, so a stem containing dots is kept whole.
output_stem = os.path.splitext(os.path.basename(file_path))[0]
for label, img_list in all_list_of_imgs.items():
    if label == 'image':
        continue
    temp_df = df.copy()
    temp_df['image'] = img_list
    op_file_path = f'{output_dir}/{output_stem}_{label}.tsv'
    # NOTE(review): to_csv also writes the DataFrame index as an unnamed first
    # column. Left unchanged so the output bytes (and the md5 printed below)
    # stay the same; confirm whether index=False is wanted.
    temp_df.to_csv(op_file_path, sep='\t')
    print(f'op_file_path: {op_file_path} written successfully, md5: {md5(op_file_path)}')

original file_path: /home/srikapan/LMUData/RealWorldQA.tsv written successfully, md5: 4de008f55dc4fd008ca9e15321dc44b7
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_2x1_row1_col1.tsv written successfully, md5: 23dd6bb085d7fc34b4501450285c2b65
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_2x1_row2_col1.tsv written successfully, md5: 1bb1973c44a95f26281afcbe5c1b0344
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_1x2_row1_col1.tsv written successfully, md5: 05f27eaaabbfda0b64510b5a40fd7904
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_1x2_row1_col2.tsv written successfully, md5: e64007802642df4f3bc8bf82f3470e09
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_2x2_row1_col1.tsv written successfully, md5: b9d02c2e82afbae3d8574fbebed3e1e6
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_2x2_row1_col2.tsv written successfully, md5: 7cdc6b709903e7be066dbc7862bb9a00
op_file_path: /home/srikapan/LMUData/RealWorldQA_grid_2x2_row2_col1.tsv written successfu

In [25]:
## COCO sample pre-processing
# Reads the COCO_VAL_sample TSV, cuts every image into 2x1, 1x2, 2x2 and 3x3
# grids, and writes one TSV per grid section whose `image` column holds that
# section's encoded crop.
file_path = '/home/srikapan/LMUData/COCO_VAL_sample.tsv'
df = pd.read_csv(file_path, sep='\t')

# Maps a column label to a list with one entry per row of df. 'image' keeps the
# original encodings; every other label is one grid section.
all_list_of_imgs = {}
all_list_of_imgs['image'] = df.image.tolist()

for encoded_img in df['image']:
    img = decode_base64_to_image(encoded_img)
    for grid_size in ((2, 1), (1, 2), (2, 2), (3, 3)):
        for label, crop in split_image(img, grid_size=grid_size, encode=True).items():
            all_list_of_imgs.setdefault(label, []).append(crop)

# The source file is only read in this cell, so report it as loaded, not written.
print(f'original file_path: {file_path} loaded successfully, md5: {md5(file_path)}')

output_dir = os.path.dirname(file_path)
# splitext strips only the final extension, so a stem containing dots is kept whole.
output_stem = os.path.splitext(os.path.basename(file_path))[0]
for label, img_list in all_list_of_imgs.items():
    if label == 'image':
        continue
    temp_df = df.copy()
    temp_df['image'] = img_list
    op_file_path = f'{output_dir}/{output_stem}_{label}.tsv'
    # NOTE(review): to_csv also writes the DataFrame index as an unnamed first
    # column. Left unchanged so the output bytes (and the md5 printed below)
    # stay the same; confirm whether index=False is wanted.
    temp_df.to_csv(op_file_path, sep='\t')
    print(f'op_file_path: {op_file_path} written successfully, md5: {md5(op_file_path)}')

original file_path: /home/srikapan/LMUData/COCO_VAL_sample.tsv written successfully, md5: 5b6ed6e5f35024d003804372a13533c4
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_2x1_row1_col1.tsv written successfully, md5: 9999f71c872ada7193ac6f96e9c4030e
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_2x1_row2_col1.tsv written successfully, md5: 18ec1a575214bab28a2e50003e957299
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_1x2_row1_col1.tsv written successfully, md5: 6b02299cf429ca0ad398d01603688448
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_1x2_row1_col2.tsv written successfully, md5: 9ebee03bedc40d91c2fd08ff3dee1102
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_2x2_row1_col1.tsv written successfully, md5: f5e674127cc699ebfb77f66ba5581868
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_2x2_row1_col2.tsv written successfully, md5: 4c9f1408805b62e02695efe5cd546a1b
op_file_path: /home/srikapan/LMUData/COCO_VAL_sample_grid_2x2

In [3]:
## COCO pre-processing
# Reads the COCO_VAL TSV, cuts every image into 2x1, 1x2, 2x2 and 3x3 grids,
# and writes one TSV per grid section whose `image` column holds that section's
# encoded crop.
file_path = '/home/srikapan/LMUData/COCO_VAL.tsv'
df = pd.read_csv(file_path, sep='\t')

# Maps a column label to a list with one entry per row of df. 'image' keeps the
# original encodings; every other label is one grid section.
all_list_of_imgs = {}
all_list_of_imgs['image'] = df.image.tolist()

for encoded_img in df['image']:
    img = decode_base64_to_image(encoded_img)
    for grid_size in ((2, 1), (1, 2), (2, 2), (3, 3)):
        for label, crop in split_image(img, grid_size=grid_size, encode=True).items():
            all_list_of_imgs.setdefault(label, []).append(crop)

# The source file is only read in this cell, so report it as loaded, not written.
print(f'original file_path: {file_path} loaded successfully, md5: {md5(file_path)}')

output_dir = os.path.dirname(file_path)
# splitext strips only the final extension, so a stem containing dots is kept whole.
output_stem = os.path.splitext(os.path.basename(file_path))[0]
for label, img_list in all_list_of_imgs.items():
    if label == 'image':
        continue
    temp_df = df.copy()
    temp_df['image'] = img_list
    op_file_path = f'{output_dir}/{output_stem}_{label}.tsv'
    # NOTE(review): to_csv also writes the DataFrame index as an unnamed first
    # column. Left unchanged so the output bytes (and the md5 values recorded
    # from this run) stay the same; confirm whether index=False is wanted.
    temp_df.to_csv(op_file_path, sep='\t')
    print(f'op_file_path: {op_file_path} written successfully, md5: {md5(op_file_path)}')

original file_path: /home/srikapan/LMUData/COCO_VAL.tsv written successfully, md5: 72a5079dead060269ac222c5aa5128af
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_2x1_row1_col1.tsv written successfully, md5: a661f336357cc033c0ac362ce43c8934
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_2x1_row2_col1.tsv written successfully, md5: a81faa8e1616fc8893a1c9208358a8fb
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_1x2_row1_col1.tsv written successfully, md5: 220648e71ea09065a5e6c984c98d257d
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_1x2_row1_col2.tsv written successfully, md5: c0ccc20c7af4b1382a6c128ad449dd57
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_2x2_row1_col1.tsv written successfully, md5: a37fc00dab224f9ae041172b25ac8735
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_2x2_row1_col2.tsv written successfully, md5: f43fb8f80df717a53d1f441047cfd4c8
op_file_path: /home/srikapan/LMUData/COCO_VAL_grid_2x2_row2_col1.tsv written successfully, md5: 407cc5d794fb98

In [1]:
# Maps each COCO_VAL grid-section dataset name to a 32-character hex string
# (presumably the md5 of its TSV file; verify against the generating run).
dic = {
    'COCO_VAL_grid_2x1_row1_col1': 'a661f336357cc033c0ac362ce43c8934',
    'COCO_VAL_grid_2x1_row2_col1': 'a81faa8e1616fc8893a1c9208358a8fb',
    'COCO_VAL_grid_1x2_row1_col1': '220648e71ea09065a5e6c984c98d257d',
    'COCO_VAL_grid_1x2_row1_col2': 'c0ccc20c7af4b1382a6c128ad449dd57',
    'COCO_VAL_grid_2x2_row1_col1': 'a37fc00dab224f9ae041172b25ac8735',
    'COCO_VAL_grid_2x2_row1_col2': 'f43fb8f80df717a53d1f441047cfd4c8',
    'COCO_VAL_grid_2x2_row2_col1': '407cc5d794fb986fac4bebfd2eef09ca',
    'COCO_VAL_grid_2x2_row2_col2': '0924bda06630c614fd74e22d4dd43690',
    'COCO_VAL_grid_3x3_row1_col1': '54aab03cd5a0efe08032f89a88131e67',
    'COCO_VAL_grid_3x3_row1_col2': '80d808b6c9246ecd013503c3ed553a9a',
    'COCO_VAL_grid_3x3_row1_col3': '039933f1adb271f7af61481933ba9e79',
    'COCO_VAL_grid_3x3_row2_col1': '27aadc2b62fd451690b61c6217d36a41',
    'COCO_VAL_grid_3x3_row2_col2': 'f8e7abca5ed106f5303e1a8c6a0c720e',
    'COCO_VAL_grid_3x3_row2_col3': '92283d89cd17aab740d0578adca10fad',
    'COCO_VAL_grid_3x3_row3_col1': 'd50f7854d5d0b27ded7c9ab46c838791',
    'COCO_VAL_grid_3x3_row3_col2': 'd43351ea6958510794c13db54f926b7c',
    'COCO_VAL_grid_3x3_row3_col3': 'cd8f8c222ca98a8375454f23d89cfb08',
}

# Print the download URL that corresponds to every dataset name.
for i in dic:
    url = f'https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/{i}.tsv'
    print(i, ':', url)

COCO_VAL_grid_2x1_row1_col1 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_2x1_row1_col1.tsv
COCO_VAL_grid_2x1_row2_col1 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_2x1_row2_col1.tsv
COCO_VAL_grid_1x2_row1_col1 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_1x2_row1_col1.tsv
COCO_VAL_grid_1x2_row1_col2 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_1x2_row1_col2.tsv
COCO_VAL_grid_2x2_row1_col1 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_2x2_row1_col1.tsv
COCO_VAL_grid_2x2_row1_col2 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_2x2_row1_col2.tsv
COCO_VAL_grid_2x2_row2_col1 : https://huggingface.co/datasets/Srikant86/VLMEval/resolve/main/COCO_VAL/COCO_VAL_grid_2x2_row2_col1.tsv
COCO_VAL_grid_2x2_row2_col2 : https://huggingface.co/datasets/