In [4]:
from datasets import Dataset, Features, Value
import json
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Read an image file and convert it to a raw pixel byte stream
def load_image_as_bytes(image_path, size=(512, 512), mode="RGB"):
    """Load an image and return its pixels as raw, unencoded bytes.

    The image is converted to ``mode`` (when it is not already in that mode)
    and resized to ``size`` so that every returned byte string has the same
    length. With the defaults that length is 512 * 512 * 3, which is what a
    reader reshaping the bytes to ``(512, 512, 3)`` uint8 expects.

    Args:
        image_path: Path of the image file to read.
        size: Target ``(width, height)`` handed to ``Image.resize``.
        mode: Pillow mode to normalise to before resizing. Pass ``None`` to
            keep whatever mode the file has.

    Returns:
        bytes: The resized pixel array serialised with ``ndarray.tobytes()``.
        Shape and dtype are not stored in the result.
    """
    # Import under a private alias: another cell of this notebook rebinds the
    # global name `Image` to `datasets.Image`, which has no `open`.
    from PIL import Image as PILImage

    # The context manager closes the underlying file handle, which the plain
    # `Image.open(...)` call left open for every image processed.
    with PILImage.open(image_path) as source:
        if mode is not None and source.mode != mode:
            normalised = source.convert(mode)
        else:
            normalised = source
        resized = normalised.resize(size)
        # Serialise the pixel array as a flat byte string
        return np.array(resized).tobytes()

# Read the JSON annotation file and build the dataset
def load_custom_dataset(json_file_path, image_folder_path):
    """Build a ``datasets.Dataset`` of image names, captions and image bytes.

    Args:
        json_file_path: Path of a JSON file whose top-level object has an
            "annotations" list; every entry must provide "filename" and
            "caption".
        image_folder_path: Directory that each "filename" is joined onto.

    Returns:
        A ``Dataset`` with string columns "image_name" and "caption" and a
        binary column "image" holding the result of ``load_image_as_bytes``
        for each file.

    Raises:
        KeyError: If "annotations", "filename" or "caption" is missing.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding when reading it.
    with open(json_file_path, "r", encoding="utf-8") as f:
        annotations = json.load(f)["annotations"]

    dataset_data = {
        "image_name": [],
        "caption": [],
        "image": [],
    }

    # tqdm renders a progress bar; iterating the list directly still gives
    # it the total length.
    for annotation in tqdm(annotations, desc="Processing dataset"):
        filename = annotation["filename"]
        image_path = os.path.join(image_folder_path, filename)

        dataset_data["image_name"].append(filename)
        dataset_data["caption"].append(annotation["caption"])
        dataset_data["image"].append(load_image_as_bytes(image_path))

    # Declare the column types explicitly
    features = Features({
        "image_name": Value("string"),
        "caption": Value("string"),
        "image": Value("binary")  # image data is stored as a byte string
    })

    # Create the dataset
    return Dataset.from_dict(dataset_data, features=features)

# Write a Dataset out as a Parquet file
def dataset_to_parquet(dataset, output_path):
    """Save ``dataset`` to ``output_path`` in Parquet format.

    The dataset is converted to a pandas DataFrame, wrapped in a PyArrow
    table, and that table is written to disk.
    """
    # Dataset -> pandas DataFrame -> PyArrow table
    arrow_table = pa.Table.from_pandas(dataset.to_pandas())

    # Persist the table as Parquet
    pq.write_table(arrow_table, output_path)

# Input and output locations
json_file_path = "./data/RSICap/captions.json"  # path of the JSON annotation file
image_folder_path = "./data/RSICap/images/"  # directory containing the images
output_parquet_path = "./dataset.parquet"

# Build the dataset, then save it as a Parquet file
dataset = load_custom_dataset(json_file_path, image_folder_path)
dataset_to_parquet(dataset, output_parquet_path)
print(f"Dataset saved to {output_parquet_path}")

Processing dataset: 100%|██████████| 2585/2585 [00:16<00:00, 157.73it/s]


Dataset saved to ./dataset.parquet


In [5]:
# Load the saved Parquet file back into a pandas DataFrame
loaded_df = pd.read_parquet("dataset.parquet")

# Show the first few records
print(loaded_df.head())

       image_name                                            caption  \
0  P0378_0001.png  This is an aerial image showing a parking lot ...   
1  P1077_0002.png  This is a remote sensing image shows an outdoo...   
2  P1140_0085.png  There are four planes parked in an open field,...   
3  P0363_0005.png  This is a aerial image of a neighborhood with ...   
4  P2686_0095.png  This is a remote sensing image with high resol...   

                                               image  
0  b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00...  
1  b'npZvxb~\x80j{}glnX`bL_aK_cL^cL]bK^cL`eNdiRgl...  
2  b'@L^?Ka?Lg@Ok?Rm<Uq;Wt:Zv@b}Op\x89c\x81\x9bx\...  
3  b'X<9O:2R<5\\C=eJDgKFZ<8hJEgID`C>\\A;T:4U=6lVN...  
4  b'\x84\x9e{\x85\x9d{\x8a\x9f~\x8b\x9c|\x83\x93...  


In [2]:
from datasets import load_dataset, Features, Value, Image
import numpy as np
from PIL import Image as PILImage

# Read a Parquet file
def load_parquet_dataset(file_path):
    """Load ``file_path`` through ``load_dataset("parquet", ...)`` and return the result."""
    return load_dataset("parquet", data_files=file_path)

# Rebuild an image from raw pixel bytes
def bytes_to_image(byte_data, shape=(512, 512, 3), dtype=np.uint8):
    """Turn a raw byte string back into a PIL image.

    Args:
        byte_data: Buffer holding the pixel values.
        shape: Array shape the buffer is reshaped to.
        dtype: Element type used to interpret the buffer.

    Returns:
        The PIL image, or ``None`` when any step fails; in that case the
        error is printed rather than raised.
    """
    try:
        # Interpret the buffer as an array and give it the image shape
        pixels = np.frombuffer(byte_data, dtype=dtype).reshape(shape)
        # Wrap the array in an image object
        return PILImage.fromarray(pixels)
    except Exception as err:
        print(f"Error converting byte data to image: {err}")
    return None

parquet_file_path = "./dataset.parquet"
dataset = load_parquet_dataset(parquet_file_path)

# Use map to convert each row's raw byte string into an image object.
# A failure is reported and then re-raised: carrying on would leave the
# "image" column as undecoded pixel bytes for every step that follows.
try:
    dataset = dataset.map(
        lambda x: {"image": bytes_to_image(x["image"])},
        batched=False,
        num_proc=6  # process rows in 6 worker processes
    )
except Exception as e:
    print(f"Error during map operation: {e}")
    raise

# Redefine the dataset's features
features = Features({
    "image_name": Value("string"),
    "caption": Value("string"),
    "image": Image()  # `Image` here is the feature type imported from `datasets`
})

# Cast the dataset to the new features
dataset = dataset.cast(features)

# Check the converted image data on the first sample of the train split
sample = next(iter(dataset["train"]), None)
if sample is not None:
    if isinstance(sample["image"], PILImage.Image):
        print("Image conversion successful.")
        print(sample["image"])  # print the image object
    else:
        print("Image conversion failed.")

dataset['train'][0]

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=6): 100%|██████████| 2585/2585 [01:54<00:00, 22.56 examples/s]  

Image conversion successful.
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512 at 0x7C10ECAEF1D0>





{'image_name': 'P0378_0001.png',
 'caption': "This is an aerial image showing a parking lot with a high resolution. In the parking lot, there are many vehicles, including cars, trucks, and buses. Specifically, thirty-seven large-vehicles and fourteen small-vehicles can be observed in the image. Some of them are parked in rows, and others are parked in random positions. In the image's bottom-right, the trucks and buses are parked in a line, with some facing the same direction and some facing the opposite direction. Some of the trucks and buses are white, while others are yellow. Overall, the parking lot is filled with a variety of vehicles. There is a structure in the image's top left and two roadside green belts at the bottom of the image. The image shows a sunny day since we can see the shadow of the vehicles and trees.",
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>}

In [6]:
from datasets import DatasetDict

# Split the dataset 80% / 10% / 10%.
# First hold out 20% of the "train" split, with a fixed seed.
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]

# Then halve the held-out 20% into validation and test.
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)
validation_dataset = validation_test_split["train"]
test_dataset = validation_test_split["test"]

# Create a new DatasetDict holding the three splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

dataset_dict["train"][0]

{'image_name': 'P2691_0130.png',
 'caption': 'This is an aerial image of a parking lot. In the bottom right corner of the image, there is a parking lot with many parked cars. There is a white building near the parking lot. In the top left corner of the image, there is a piece of wasteland. A road runs through the wasteland.',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=512x512>}