# 处理BBBC021数据集

In [1]:
# 解压缩zip文件
import os
import zipfile
from tqdm import tqdm
import glob

def main(base_dir="/data/pr/cellpainting/BBBC021/raw_data", target_dir=None):
    """Extract every ZIP archive located directly in ``base_dir``.

    All archives are unpacked into the same ``target_dir``. A failure on one
    archive is recorded and reported at the end; it does not stop the
    remaining archives from being processed.

    Args:
        base_dir: Directory that is searched (non-recursively) for ``*.zip``
            files.
        target_dir: Directory the archives are extracted into. When ``None``,
            the ``images`` sub-directory of ``base_dir`` is used.

    Returns:
        None. Progress and the final summary are printed to stdout.
    """
    if target_dir is None:
        target_dir = os.path.join(base_dir, "images")

    # Make sure the destination exists before anything is extracted.
    os.makedirs(target_dir, exist_ok=True)

    # Escape base_dir so glob metacharacters in the path are taken literally,
    # and sort because glob returns matches in arbitrary order; sorting keeps
    # the processing order and the failure report reproducible between runs.
    pattern = os.path.join(glob.escape(base_dir), "*.zip")
    zip_files = sorted(glob.glob(pattern))

    if not zip_files:
        print("No ZIP files found!")
        return

    print(f"Found {len(zip_files)} ZIP files")
    print(f"Extracting to: {target_dir}")

    # Track which archives succeeded and which failed (with the reason).
    success_files = []
    failed_files = []

    for zip_path in tqdm(zip_files, desc="Extracting files"):
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(target_dir)
        except Exception as e:
            # Deliberately broad: one unreadable archive must not abort the
            # whole batch, so the error is recorded and the loop continues.
            failed_files.append(f"{zip_path}: {e}")
        else:
            success_files.append(zip_path)

    # Summary
    print("\nExtraction completed!")
    print(f"Successfully extracted: {len(success_files)} files")
    if failed_files:
        print("\nFailed to extract the following files:")
        for fail in failed_files:
            print(fail)

# Run the extraction and report problems as a short message instead of
# a full traceback.
try:
    main()
except KeyboardInterrupt:
    # Handled separately because KeyboardInterrupt is not an Exception subclass.
    print("\nProcess interrupted by user")
except Exception as exc:
    print(f"\nAn error occurred: {exc}")

Found 55 ZIP files
Extracting to: /data/pr/cellpainting/BBBC021/data/images


Extracting files: 100%|██████████| 55/55 [33:21<00:00, 36.39s/it]


Extraction completed!
Successfully extracted: 55 files





# 将BBBC021原始三通道图像合并为RGB图像，并进行随机增强和训练/测试数据集划分
运行：`python /data/pr/DiT_AIVCdiff/pr_tutorial/utils/merge_BBBC021.py`

生成数据路径：
/data/pr/cellpainting/BBBC021/raw_data/merged_rgb_images_train

/data/pr/cellpainting/BBBC021/raw_data/merged_rgb_images_test

/data/pr/cellpainting/BBBC021/raw_data/metadata/augmented_image_metadata.csv
