In [1]:
import os
from PIL import Image
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
def load_and_convert_image(image_path, target_size=(256, 256)):
    """Load an image file, normalise it to RGB, resize it and return it as an array.

    Args:
        image_path: Location of the image, passed straight to ``Image.open``.
        target_size: ``(width, height)`` handed to ``Image.resize``.

    Returns:
        A NumPy array built from the resized RGB image.
    """
    # Image.open is lazy and can keep the underlying file open (notably for
    # formats such as TIFF); the context manager guarantees the handle is
    # released once the pixel data has been copied into the array.
    with Image.open(image_path) as source:
        # Every non-RGB mode (RGBA, LA, L, P, ...) is converted the same way,
        # so a single check replaces the two identical branches.
        rgb = source if source.mode == 'RGB' else source.convert('RGB')
        resized = rgb.resize(target_size)
        return np.array(resized)

In [3]:
def organize_dataset_with_tif(dataset_path, output_path, test_size=0.2, val_size=0.1,
                              random_state=42):
    """Split the ``Au``/``Tp`` image folders into train/val/test sets saved as JPEG.

    ``.jpg`` files found in ``<dataset_path>/Au`` are labelled ``authentic`` and
    ``.tif`` files found in ``<dataset_path>/Tp`` are labelled ``tampered``
    (extension matching is case-insensitive; other files are ignored). Each
    class is split independently, then every image is passed through
    ``load_and_convert_image`` and written to
    ``<output_path>/<split>/<label>/<stem>.jpg`` as a JPEG at quality 95.

    Args:
        dataset_path: Folder containing the ``Au`` and ``Tp`` sub-folders.
        output_path: Root folder for the ``train``/``val``/``test`` tree;
            created if it does not exist.
        test_size: Share of each class held out for the test split.
        val_size: Share of each class (of the whole class, not of the
            remainder) held out for the validation split.
        random_state: Seed forwarded to every ``train_test_split`` call.
    """
    # Output label -> (source sub-folder, accepted lower-case extension).
    class_sources = {
        'authentic': ('Au', '.jpg'),
        'tampered': ('Tp', '.tif'),
    }
    split_names = ('train', 'val', 'test')

    # val_size is a share of the whole class, but the second split only sees
    # what is left after the test split, so it has to be rescaled.
    val_fraction = val_size / (1 - test_size)

    # Phase 1: list and split every class before touching the output folder,
    # so a missing source directory fails before anything is written.
    splits = {}
    for label, (subdir, extension) in class_sources.items():
        class_dir = os.path.join(dataset_path, subdir)
        image_paths = [
            os.path.join(class_dir, name)
            for name in os.listdir(class_dir)
            if name.lower().endswith(extension)
        ]
        train_val, test = train_test_split(
            image_paths, test_size=test_size, random_state=random_state)
        train, val = train_test_split(
            train_val, test_size=val_fraction, random_state=random_state)
        splits[label] = {'train': train, 'val': val, 'test': test}

    # Phase 2: create the full directory tree up front.
    for split in split_names:
        for label in class_sources:
            os.makedirs(os.path.join(output_path, split, label), exist_ok=True)

    # Phase 3: convert and save every image in a single, consistent format.
    for label, by_split in splits.items():
        for split in split_names:
            for file in by_split[split]:
                pixels = load_and_convert_image(file)
                # splitext strips only the final extension, so a name such as
                # "a.v2.tif" becomes "a.v2.jpg" instead of collapsing to "a.jpg"
                # and potentially overwriting another image.
                stem = os.path.splitext(os.path.basename(file))[0]
                save_path = os.path.join(output_path, split, label, stem + '.jpg')
                Image.fromarray(pixels).save(save_path, 'JPEG', quality=95)

In [4]:
# Source and destination roots; both are relative paths, so they resolve
# against the notebook's current working directory.
dataset_path, output_path = 'CASIA2', 'dataset'

# Build the train/val/test folder tree using the function's default split sizes.
organize_dataset_with_tif(dataset_path=dataset_path, output_path=output_path)
print("Dataset organized successfully.")

Dataset organized successfully.
