In [1]:
#import libraries
import os
import random
import numpy as np
import cv2
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed every RNG this notebook draws from so that sampled subsets and
# downstream torch work are repeatable after Restart & Run All.
seed_val = 420
random.seed(seed_val)      # drives random.sample() in preprocess_directory
np.random.seed(seed_val)   # legacy global NumPy generator
torch.manual_seed(seed_val)

<torch._C.Generator at 0x75110838a590>

In [3]:
def preprocess_image(img, mode, size=(50, 50), cutoff=40, debug=False):
    """
    Apply the specified preprocessing mode to the image.

    Args:
        img: Image array. A 3-channel array is converted with
            cv2.COLOR_BGR2GRAY where a grayscale image is needed; a 2-D
            array is treated as already grayscale.
        mode: One of 'grayscale', 'resize', 'low_freq', 'high_freq'.
        size: Target size handed to cv2.resize; only used by 'resize'.
        cutoff: Width (sigma) of the Gaussian mask applied to the centred
            spectrum; only used by 'low_freq' / 'high_freq'. Must be > 0.
        debug: When True (frequency modes only), show the magnitude spectrum
            and the filtered image in OpenCV windows and block on a key press.

    Returns:
        The processed image. Frequency modes return a uint8 grayscale image
        min-max normalised to 0..255.

    Raises:
        ValueError: If `mode` is not recognised, or `cutoff` is not positive
            for a frequency mode.
    """
    if mode not in ('grayscale', 'resize', 'low_freq', 'high_freq'):
        raise ValueError("Invalid mode specified!")

    if mode == 'resize':
        return cv2.resize(img, size)

    if mode != 'grayscale' and cutoff <= 0:
        # cutoff == 0 would divide by zero and fill the mask with NaN.
        raise ValueError("cutoff must be a positive number for frequency filtering")

    # An image that is already single-channel cannot go through BGR2GRAY.
    if img.ndim == 2:
        gray = img.copy()
    else:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    if mode == 'grayscale':
        return gray

    # Centre the zero-frequency component so the mask can be radial.
    dft_shift = np.fft.fftshift(np.fft.fft2(gray))

    rows, cols = gray.shape
    crow, ccol = rows // 2, cols // 2

    # Gaussian filter: low-pass keeps the centre, high-pass is its complement.
    row_idx, col_idx = np.ogrid[:rows, :cols]
    dist_sq = (row_idx - crow) ** 2 + (col_idx - ccol) ** 2
    low_pass = np.exp(-dist_sq / (2.0 * cutoff ** 2))
    mask = low_pass if mode == 'low_freq' else 1 - low_pass

    # Apply mask to frequency domain
    filtered = dft_shift * mask

    # Transform back to the spatial domain
    img_back = np.abs(np.fft.ifft2(np.fft.ifftshift(filtered)))

    # Normalize to 8-bit range
    img_back = cv2.normalize(img_back, None, 0, 255, cv2.NORM_MINMAX)
    img_back = img_back.astype(np.uint8)

    # Debugging: Visualize frequency spectrum
    if debug:
        magnitude_spectrum = 20 * np.log(np.abs(dft_shift) + 1)
        # imshow scales float images by 255, so rescale to uint8 first;
        # otherwise the spectrum is displayed saturated.
        spectrum_view = cv2.normalize(
            magnitude_spectrum, None, 0, 255, cv2.NORM_MINMAX
        ).astype(np.uint8)
        cv2.imshow(f'{mode} Magnitude Spectrum', spectrum_view)
        cv2.imshow(f'{mode} Filtered Image', img_back)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return img_back

def get_all_files_recursive(directory, extensions=('.jpg', '.png', '.jpeg')):
    """
    Recursively collect files under `directory` whose names end with one of `extensions`.

    Args:
        directory: Root directory to walk. A missing directory yields an empty list.
        extensions: A single suffix string or an iterable of suffixes.
            Matching is case-insensitive, so '.JPG' matches '.jpg'.

    Returns:
        A sorted list of file paths (each joined onto the walked root), so the
        result does not depend on the order in which the filesystem lists entries.
    """
    # str.endswith only accepts a str or a tuple, so normalise lists/sets.
    if isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    all_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith(suffixes)
    ]
    all_files.sort()
    return all_files

def preprocess_directory(input_dir, output_dir, mode, size=(50, 50), cutoff=20, subset_fraction=0.1):
    """
    Preprocess a random subset of the images under `input_dir` and save the results.

    Args:
        input_dir: Directory searched recursively for image files.
        output_dir: Destination root; each output keeps its path relative to `input_dir`.
        mode: Preprocessing mode forwarded to preprocess_image.
        size: Forwarded to preprocess_image.
        cutoff: Forwarded to preprocess_image.
        subset_fraction: Fraction of the discovered files to sample. At least
            one file and at most all files are selected.

    Returns:
        Number of images that were read, processed and written successfully.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sort so that, given a seeded `random`, the same subset is drawn every run.
    all_files = sorted(get_all_files_recursive(input_dir))
    if not all_files:
        print(f"No valid files found in '{input_dir}'")
        return 0

    # Randomly sample a subset (clamped to 1..len(all_files))
    subset_size = min(len(all_files), max(1, int(len(all_files) * subset_fraction)))
    subset_files = random.sample(all_files, subset_size)

    processed_count = 0
    for img_path in subset_files:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Warning: Unable to read image {img_path}. Skipping.")
            continue

        # Preprocess the image
        processed_img = preprocess_image(img, mode, size=size, cutoff=cutoff)

        # Save the processed image to the output directory (keep relative path structure)
        relative_path = os.path.relpath(img_path, input_dir)
        output_path = os.path.join(output_dir, relative_path)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        if not cv2.imwrite(output_path, processed_img):
            print(f"Warning: Unable to write image {output_path}. Skipping.")
            continue
        processed_count += 1

    print(f"Processed {processed_count}/{len(all_files)} images in '{input_dir}'")
    return processed_count


def prepare_dataset(base_dir, output_base_dir, mode, size=(50, 50), cutoff=20, subset_fraction=0.1):
    """
    Preprocess a sampled fraction of both classes of a dataset in one mode.

    Reads from `<base_dir>/generated` and `<base_dir>/real`, writes to
    `<output_base_dir>/<mode>_generated` and `<output_base_dir>/<mode>_real`,
    and returns the combined count reported by preprocess_directory.
    """
    # Options shared by both class-level runs.
    shared_options = dict(size=size, cutoff=cutoff, subset_fraction=subset_fraction)

    # "generated" (fake) images first
    print(f"Processing 'generated' images for mode: {mode}")
    fake_source = os.path.join(base_dir, 'generated')
    fake_target = os.path.join(output_base_dir, f'{mode}_generated')
    fake_count = preprocess_directory(fake_source, fake_target, mode, **shared_options)

    # then the "real" images
    print(f"Processing 'real' images for mode: {mode}")
    real_source = os.path.join(base_dir, 'real')
    real_target = os.path.join(output_base_dir, f'{mode}_real')
    real_count = preprocess_directory(real_source, real_target, mode, **shared_options)

    total_processed = fake_count + real_count
    print(f"Total images processed for mode '{mode}': {total_processed}")
    return total_processed


In [None]:
# Run configuration for this cell (paths are relative to the notebook's working directory)
base_data_dir = '../../data/artifact'  # Dataset root; prepare_dataset reads its 'generated' and 'real' subfolders
output_data_dir = '../../data/processed'  # Root under which '<mode>_generated' and '<mode>_real' are written
mode = 'high_freq'  # One of 'grayscale', 'resize', 'low_freq', 'high_freq'

# Process a sampled subset of both classes; subset_fraction=0.1 selects roughly 10% of each folder.
# size and cutoff are left at prepare_dataset's defaults ((50, 50) and 20).
total_images_processed = prepare_dataset(base_data_dir, output_data_dir, mode, subset_fraction=0.1)
print(f"Finished processing. Total images processed: {total_images_processed}")

Processing 'generated' images for mode: high_freq
No valid files found in '../data/deepfake/original/generated'
Processing 'real' images for mode: high_freq
No valid files found in '../data/deepfake/original/real'
Total images processed for mode 'high_freq': 0
Finished processing. Total images processed: 0


In [4]:
# Build a (path, label) index of the images under base_data_dir:
# label 0 = generated (fake) artworks, label 1 = real artworks.
base_data_dir = '../../data/artifact'
fake_dir = os.path.join(base_data_dir, 'generated')  # Fake artworks directory
real_dir = os.path.join(base_data_dir, 'real')  # Real artworks directory

data = []
for class_dir, class_label in ((fake_dir, 0), (real_dir, 1)):
    for dirpath, dirnames, filenames in os.walk(class_dir):
        # os.walk yields entries in arbitrary order. Sorting both the
        # sub-directories (in place, which steers the walk) and the files
        # makes the row order, and therefore the seeded split below,
        # identical on every machine.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.endswith(".jpg"):  # only consider jpg files
                data.append((os.path.join(dirpath, filename), class_label))

# Convert the list "data" to a pandas dataframe
df = pd.DataFrame(data, columns=["path", "label"])
df['label'] = df['label'].astype(int)  # keeps an integer dtype even when no files were found
dataset = df

# Save the dataframe to a CSV file
csv_output_path = os.path.join(base_data_dir, "image_labels.csv")
df.to_csv(csv_output_path, index=False)
print(f"CSV file saved at {csv_output_path}")

# Hold out 10% of the rows as the test set; the remainder is train + validation.
train_val_data, test = train_test_split(dataset.values, test_size=0.1, random_state=seed_val)
test_links = pd.DataFrame(test, columns=dataset.columns)

CSV file saved at ../../data/artifact/image_labels.csv


In [5]:
def preprocess_test_set(test_links, output_dir, mode, size=(50, 50), cutoff=20):
    """
    Preprocess all images listed in test_links and save to output directory.

    Args:
        test_links: DataFrame with a "path" column (image file path) and a
            "label" column (class label).
        output_dir: Destination root; one sub-folder per label is created in it.
        mode: Preprocessing mode forwarded to preprocess_image.
        size: Forwarded to preprocess_image.
        cutoff: Forwarded to preprocess_image.

    Returns:
        Number of images that were read, processed and written successfully.
    """
    os.makedirs(output_dir, exist_ok=True)

    processed_count = 0

    for img_path, label in zip(test_links["path"], test_links["label"]):
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Unable to read image {img_path}. Skipping.")
            continue

        # Preprocess the image
        processed_img = preprocess_image(img, mode, size=size, cutoff=cutoff)

        # Create a subfolder based on the label for organized storage
        label_dir = os.path.join(output_dir, str(label))
        os.makedirs(label_dir, exist_ok=True)

        # Save the *processed* image (not the raw one that was read).
        # NOTE: only the basename is kept, so two inputs with the same file
        # name and label overwrite each other in label_dir.
        output_path = os.path.join(label_dir, os.path.basename(img_path))
        if not cv2.imwrite(output_path, processed_img):
            print(f"Warning: Unable to write image {output_path}. Skipping.")
            continue
        processed_count += 1

    print(f"Processed {processed_count}/{len(test_links)} test images.")
    return processed_count


# Example usage: write the held-out test split to disk, one sub-folder per label.
mode = 'resize'  # Preprocessing mode; 'resize' uses only `size` (preprocess_image ignores `cutoff` in this mode)
output_dir = '../../data/artifact/test_set'  # Destination root for the processed test images
preprocess_test_set(test_links, output_dir, mode=mode, size=(25, 25), cutoff=40)

Processed 8816/8816 test images.


8816

In [10]:
# Run configuration for this cell (paths are relative to the notebook's working directory)
base_data_dir = '../../data/deepfakeart/original'  # Input root, searched recursively for images
output_data_dir = '../../data/processed/deepfakeart/low_freq'  # Outputs keep their path relative to the input root
mode = 'low_freq'  # One of 'grayscale', 'resize', 'low_freq', 'high_freq'

# No subset_fraction is passed, so preprocess_directory's default of 0.1 (about 10% of the files) applies;
# size and cutoff likewise use that function's defaults ((50, 50) and 20).
total_images_processed = preprocess_directory(base_data_dir, output_data_dir, mode)
print(f"Finished processing. Total images processed: {total_images_processed}")



Processed 1104/11047 images in '../../data/deepfakeart/original'
Finished processing. Total images processed: 1104
