In [None]:
import os
import random
import shutil
from tqdm import tqdm
# Seed the module-level `random` generator; sample_dataset draws from it via random.sample.
# NOTE(review): the seed only applies to draws made after this cell runs in the current
# kernel, so re-run this cell before re-running the sampling cell to repeat a draw.
random.seed(1337)

In [6]:
def sample_dataset(dataset_path, output_path, sample_percent, min_images_per_class=5, seed=None):
    """
    Copy a random subset of the files of every class directory into a new dataset tree.

    The dataset must be organized as one subdirectory per class. For each class,
    ``sample_percent`` percent of its files (rounded with the built-in ``round``) are
    copied to ``output_path/<class_name>``, but never fewer than
    ``min_images_per_class`` and never more than the class actually contains.
    Top-level entries that are not directories are skipped, and only regular files
    inside a class directory are candidates for sampling.

    Args:
        dataset_path (str): Path to the dataset directory (one subdirectory per class).
        output_path (str): Path to save the sampled dataset; created if missing.
        sample_percent (float): Percentage of images to sample per class (0-100).
        min_images_per_class (int): Minimum number of images to sample per class.
        seed (int | None): If given, sampling uses a private ``random.Random(seed)`` so
            the result does not depend on the state of the global generator. If None
            (default), the module-level ``random`` generator is used.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dataset_path`` cannot be listed
            (for example ``FileNotFoundError``), and from the copy operations.
    """
    # List the dataset before creating anything, so an invalid dataset_path does not
    # leave an empty output directory behind. os.listdir returns entries in arbitrary
    # order; sorting makes seeded sampling repeatable across filesystems.
    class_names = sorted(os.listdir(dataset_path))
    os.makedirs(output_path, exist_ok=True)

    # A dedicated generator keeps the draw independent of whatever else touched `random`.
    rng = random if seed is None else random.Random(seed)

    # Iterate through each class directory
    for class_name in tqdm(class_names):
        class_dir = os.path.join(dataset_path, class_name)

        if not os.path.isdir(class_dir):
            continue

        # Collect only regular files: a nested directory would make shutil.copy fail.
        # Sorted for the same reproducibility reason as the class listing.
        with os.scandir(class_dir) as entries:
            images = sorted(entry.name for entry in entries if entry.is_file())
        num_images = len(images)

        # Requested share, raised to the per-class minimum, then clamped to what exists
        # (and to zero, so negative arguments cannot reach rng.sample).
        num_to_sample = max(round(num_images * (sample_percent / 100)), min_images_per_class)
        num_to_sample = max(0, min(num_to_sample, num_images))

        # Randomly sample images
        sampled_images = rng.sample(images, num_to_sample)

        # Create class directory in output path (also for classes with no files)
        class_output_dir = os.path.join(output_path, class_name)
        os.makedirs(class_output_dir, exist_ok=True)

        # Copy sampled images to output directory
        for img in sampled_images:
            src = os.path.join(class_dir, img)
            dst = os.path.join(class_output_dir, img)
            shutil.copy(src, dst)

    print(f"Sampling complete. Sampled dataset saved at: {output_path}")

In [7]:
# Inputs for the sample_dataset call in the next cell.
# Source tree, read as one subdirectory per class.
dataset_path = 'data/VGG-Face2/data/vggface2_train/train'
# Destination tree; sample_dataset creates it if it does not exist.
output_path = 'data/VGG-Face2_sampled/train'
# Percentage (0-100) of each class to copy.
sample_percent = 10
# Lower bound on files copied per class (capped by what the class contains).
min_images_per_class = 5

In [8]:
# Copy the per-class sample into output_path. The recorded run below iterated
# 8631 top-level entries in roughly 1h03m, so expect this cell to be slow.
sample_dataset(dataset_path, output_path, sample_percent, min_images_per_class)

  0%|          | 0/8631 [00:00<?, ?it/s]

100%|██████████| 8631/8631 [1:02:56<00:00,  2.29it/s]

Sampling complete. Sampled dataset saved at: data/VGG-Face2_sampled/train



