In [None]:
import os
import pandas as pd
from shutil import copyfile
from PIL import Image
from tqdm import tqdm
import imagehash


# Source (RAF-DB) and destination (FER2013+ copy) locations. All paths are
# relative, so they resolve against the current working directory.
rafdb_root_dir = "../../datasets/RAF-DB/DATASET"
fer2013plus_train_dir = "../../datasets/fer2013plus_copy/train"
fer2013plus_test_dir = "../../datasets/fer2013plus_copy/test"
rafdb_train_csv = "../../datasets/RAF-DB/train_labels.csv"
rafdb_test_csv = "../../datasets/RAF-DB/test_labels.csv"

# Mapping RAF-DB labels to FER2013+ labels. The value is used as the name of
# the class sub-folder inside the target split directory. Only labels listed
# here are transferred; rows with any other label are skipped.
# NOTE(review): keys are strings -- confirm the "label" column of the RAF-DB
# CSVs holds these names and not numeric class ids.
rafdb_to_fer2013_labels = {
    "fear": "fear",
    "disgust": "disgust",
    "anger": "anger",
}

# Resize dimensions (FER2013+ uses 48x48)
resize_to = (48, 48)

# Collect existing FER2013+ fingerprints (hashes)
def get_image_hashes(directory):
    """Collect perceptual hashes of every readable image under ``directory``.

    The tree is walked recursively. Each file is opened with PIL, converted
    to grayscale and hashed with ``imagehash.phash``; the hash is stored in
    its string form. This is a best-effort scan: a file that cannot be opened
    or hashed is reported on stdout and skipped.

    Args:
        directory: Root folder to scan. A missing or empty folder yields an
            empty set, since ``os.walk`` produces no entries for it.

    Returns:
        A ``set`` of hash strings, one per distinct hash found.
    """
    image_hashes = set()
    for root, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                # Use a context manager so the underlying file handle is
                # closed right away instead of lingering until garbage
                # collection while thousands of files are scanned.
                with Image.open(filepath) as img:
                    # Grayscale first, then perceptual hash.
                    img_hash = str(imagehash.phash(img.convert("L")))
            except Exception as e:
                # Deliberately broad: any unreadable/non-image file is
                # reported and skipped rather than aborting the scan.
                print(f"Error hashing file {filepath}: {e}")
            else:
                image_hashes.add(img_hash)
    return image_hashes

# Fingerprint both FER2013+ splits (train first, then test) and merge the
# results into a single set used for duplicate detection later on.
print("Collecting existing FER2013+ image fingerprints...")
fer2013plus_hashes = (
    get_image_hashes(fer2013plus_train_dir)
    | get_image_hashes(fer2013plus_test_dir)
)
print(f"Collected {len(fer2013plus_hashes)} unique image hashes from FER2013+")

# Function to process and copy images, skipping duplicates
def process_images(csv_path, rafdb_dir, target_dir, label_mapping, existing_hashes):
    """Copy RAF-DB images of the mapped classes into a FER2013+ split folder.

    For every CSV row whose ``label`` is a key of ``label_mapping`` the image
    is opened, converted to grayscale, resized to the module-level
    ``resize_to`` size and perceptually hashed. Images whose hash is already
    in ``existing_hashes`` are skipped; all others are saved under
    ``target_dir/<mapped label>/<original file name>``.

    Args:
        csv_path: CSV file with an ``image`` column (path relative to
            ``rafdb_dir``) and a ``label`` column.
        rafdb_dir: Directory the ``image`` column is resolved against.
        target_dir: Split directory that receives one sub-folder per mapped
            label.
        label_mapping: Dict from RAF-DB label to target class folder name.
            Rows with a label that is not a key are ignored.
        existing_hashes: Set of hash strings treated as already present.
            It is mutated in place: the hash of every saved image is added,
            so later rows (and later calls sharing the set) see it.

    Returns:
        None. Progress, skipped duplicates and unreadable files are reported
        on stdout.
    """
    data = pd.read_csv(csv_path)
    matched_rows = 0

    for label, image_name in zip(data["label"], data["image"]):
        # Only the classes named in the mapping are transferred.
        if label not in label_mapping:
            continue
        matched_rows += 1

        filepath = os.path.join(rafdb_dir, image_name)
        target_folder = os.path.join(target_dir, label_mapping[label])
        os.makedirs(target_folder, exist_ok=True)

        try:
            # The context manager releases the source file handle as soon as
            # the converted copy exists.
            with Image.open(filepath) as source:
                # Grayscale, then shrink to the FER2013+ image size.
                img = source.convert("L").resize(resize_to)
        except OSError as e:
            # A missing or unreadable source image must not abort the whole
            # conversion; report it and move on to the next row.
            print(f"Error processing file {filepath}: {e}")
            continue

        # Hash the resized image for duplicate checking.
        img_hash = str(imagehash.phash(img))
        if img_hash in existing_hashes:
            print(f"Duplicate found, skipping: {filepath}")
            continue

        # NOTE(review): the original file name is reused, so an existing file
        # with the same name in the target folder would be overwritten.
        save_path = os.path.join(target_folder, os.path.basename(filepath))
        img.save(save_path)
        print('saving new image to ', save_path)

        # Remember the hash so later rows cannot introduce the same image.
        existing_hashes.add(img_hash)

    if not matched_rows:
        # Make a label/mapping mismatch visible instead of finishing silently.
        print(f"Warning: no rows in {csv_path} had a label present in the mapping; nothing was copied")

# Merge the RAF-DB training split into the FER2013+ training folder
print("Processing training images...")
process_images(
    csv_path=rafdb_train_csv,
    rafdb_dir=rafdb_root_dir,
    target_dir=fer2013plus_train_dir,
    label_mapping=rafdb_to_fer2013_labels,
    existing_hashes=fer2013plus_hashes,
)

# Same for the test split; the hash set is shared with the call above, so
# images saved during the training pass also count as duplicates here
print("Processing test images...")
process_images(
    csv_path=rafdb_test_csv,
    rafdb_dir=rafdb_root_dir,
    target_dir=fer2013plus_test_dir,
    label_mapping=rafdb_to_fer2013_labels,
    existing_hashes=fer2013plus_hashes,
)

print("RAF-DB to FER2013+ conversion completed!")


Collecting existing FER2013+ image fingerprints...
Collected 33389 unique image hashes from FER2013+
Processing training images...


100%|██████████| 12271/12271 [00:01<00:00, 7597.85it/s]


Processing test images...


100%|██████████| 3068/3068 [00:00<00:00, 5416.29it/s]

RAF-DB to FER2013+ conversion completed!



