In [2]:
import os
import json
import pandas as pd
import pyarrow as pa
from PIL import Image
import io
import numpy as np
import pickle

In [3]:
# Load the VQA-RAD question/answer records.
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform's locale encoding, which can mis-decode or raise
# on non-ASCII characters (notably on Windows).
with open('../VQA_RAD Dataset Public.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Directory holding the image files; each entry's 'image_name' is joined onto it.
image_folder_path = '../VQA_RAD Image Folder'

# Column-oriented accumulator: one independent, initially empty list per
# output column. Commented-out names are columns that are not populated yet.
data_X = {
    column: []
    for column in (
        'instanceID',
        'images',
        # 'segmentations',
        # 'report_text',
        'caption',
        # 'label',
        'metadata',
    )
}

def image_to_numpy(image_path, size=(224, 224)):
    """Load an image and return it as a pickled, fixed-size RGB NumPy array.

    Despite the name, the return value is ``bytes`` (a pickle of the array),
    not an ``ndarray``; callers must ``pickle.loads`` it to get the pixels back.

    Args:
        image_path: Anything accepted by ``PIL.Image.open`` (here, a file path).
        size: Target ``(width, height)`` in pixels handed to ``Image.resize``.
            Defaults to ``(224, 224)``, the value that was previously
            hard-coded. The image is stretched to exactly this size, so the
            aspect ratio is not preserved.

    Returns:
        bytes: ``pickle.dumps`` of ``np.array`` of the resized RGB image
        (expected to be a ``(height, width, 3)`` uint8 array).

    Note:
        The payload is a pickle, so it must only ever be unpickled from
        trusted files -- ``pickle.loads`` on untrusted data can execute
        arbitrary code.
    """
    with Image.open(image_path) as img:
        # Normalise every input (grayscale, palette, RGBA, ...) to 3-channel RGB.
        rgb = img.convert('RGB')
        resized = rgb.resize(size)
        # np.array copies the pixel data, so it remains valid after the file closes.
        img_array = np.array(resized)
    # Serialise the array so it can be stored as a single bytes cell.
    return pickle.dumps(img_array)


# Serialized image bytes keyed by file name. Entries that reference the same
# image_name reuse one decoded/resized/pickled result (and one bytes object)
# instead of repeating the work for every question about that image.
# NOTE(review): assumes several QA entries can share an image_name; if every
# entry has its own image the cache is simply never hit and nothing changes.
image_cache = {}

for entry in data:
    # NOTE(review): 'qid_linked_id' is used as the row identifier -- confirm it
    # is unique per entry; the name suggests it may link related questions.
    instanceID = entry['qid_linked_id']
    image_name = entry['image_name']

    # Caption = organ + question + answer, flattened into a single string.
    caption = f"Organ: {entry['image_organ']} Question: {entry['question']} Answer: {entry['answer']}"

    image_path = os.path.join(image_folder_path, image_name)

    if image_name not in image_cache:
        image_cache[image_name] = image_to_numpy(image_path)
    image_data = image_cache[image_name]

    # Per-row provenance and question attributes copied from the entry.
    metadata = {
        'dataset': "VQA_RAD",
        'image_organs': entry['image_organ'],
        'phrase_type': entry['phrase_type'],
        'evaluation': entry['evaluation'],
        'question_type': entry['question_type'],
        'answer_type': entry['answer_type']
    }

    # Append one value per populated column so all lists stay the same length.
    data_X['instanceID'].append(instanceID)
    data_X['images'].append(image_data)
    data_X['caption'].append(caption)
    # data_X['label'].append(entry['image_organ']) # To-do
    data_X['metadata'].append(metadata)

    # data_X['segmentations'].append(None)
    # data_X['report_text'].append(None)

In [4]:
# Turn the column lists into a table: each key of data_X becomes a column.
df = pd.DataFrame(data=data_X)
# Plain-text preview of the first ten rows.
print(df.head(n=10))

                             instanceID   
0  03f451ca-de62-4617-9679-e836026a7642  \
1  06e26b2c-04b9-42bc-8e98-1de30a0f7682   
2  0d0e8b6b-7753-4788-9b6d-dc7f25250c3f   
3  0e90b6bc-265f-490b-a039-509b9907a3cb   
4  1179f612-12e0-4dda-aee0-f14a5200be7b   
5  2415ba3a-3044-4d68-a99f-21249b892970   
6  27169439-b749-4b2f-8abe-bfb57a46ce53   
7  31b7e566-757d-4758-a2a8-c3025e06a6b3   
8  38319c99-665d-4ad8-a49b-4d539a745647   
9  399e22da-72d5-4078-9703-8156b5091875   

                                              images   
0  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...  \
1  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
2  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
3  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
4  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
5  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
6  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
7  b'\x80\x04\x95\x85\x00\x00\x00\x00\x00\x00\x00...   
8  b'\x80\x04\x95\x85\

In [5]:
# Persist the table as a parquet file in the current working directory,
# leaving the DataFrame index out of the file.
df.to_parquet(path='VQA_RAD.parquet', index=False)