# In [None]:
import sys
sys.path.append("../..")

from datasets import load_dataset
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import numpy as np
import pandas as pd
import os
from pathlib import Path
import random
from tqdm import tqdm
import logging

# --- DETAILED EXPLANATION ---
# This script generates blurred versions of each image in the dataset using three types of blur: Gaussian, motion, and box.
# 
# Why Blurring Images?
# Blurring is a common type of image distortion that can occur due to various factors:
# - Camera motion during capture (Motion Blur)
# - Out-of-focus regions due to depth of field (Gaussian Blur)
# - Uniform blurring due to poor image processing or resizing (Box Blur)
# These blurs simulate real-world distortions, making our analysis more realistic.
#
# --- Types of Blur Used ---
# 1. Gaussian Blur:
# - This blur applies a Gaussian kernel to each pixel, which results in a smooth, out-of-focus effect.
# - Each output pixel is a weighted average of its neighbors; the weights are largest at the kernel center and fall off following a bell curve.
# - We chose sigma range (0.5 to 3.0) because it provides visible but controlled blur.
# - Sigma defines the standard deviation of the Gaussian distribution (higher = stronger blur).
#
# 2. Motion Blur:
# - Simulates the effect of movement, either from a moving object or a moving camera.
# - The blur has a clear direction (angle) and intensity (length).
# - We chose length (5 to 30) to allow visible motion but not too extreme.
# - Angle (0 to 180) covers all possible directions of movement, because a line kernel is symmetric (angles from 180 to 360 repeat the same orientations).
#
# 3. Box Blur:
# - Applies a simple averaging of pixel values in a square region around each pixel.
# - This blur is fast and commonly used for quick blur effects.
# - We chose kernel size (3 to 15) because it provides a visible blur without being too extreme.
# - The kernel size defines the square region used for averaging (higher = stronger blur).
#
# --- Why Save Parameters in a Parquet File? ---
# - Parquet is optimized for speed and storage efficiency.
# - It is suitable for large datasets, ensuring fast read and write operations.
# - Parquet allows us to store the blur parameters alongside the image identifiers, making it easy to track and analyze.

# Load the dataset from HuggingFace in streaming mode.
# Streaming fetches samples lazily instead of downloading everything at once.
ds = load_dataset("slymachenko/image-deblurring-performance-analysis", split="train", streaming=True)
ds_head = ds.take(10)  # Small sample (10 images) for quick testing

# Save the sampled images to a local temporary directory for processing
os.makedirs("temp_images", exist_ok=True)  # Create directory if it doesn't exist
for i, img in enumerate(ds_head):
    img["image"].save(f"temp_images/immagine_{i}.png")  # One PNG per sample, indexed by position

# Display a sample image to ensure loading works correctly.
# The context manager closes the underlying file handle once the pixel data
# has been handed to matplotlib, instead of leaving it open for the session.
with Image.open("temp_images/immagine_3.png") as image:
    plt.imshow(image)
plt.show()

# Root folder that receives the blurred outputs (one sub-folder per blur type)
BLURRED_DIR = Path("data/image-deblurring-performance-analysis/blurred")

# Sampling ranges (min, max) for the parameters of every blur type:
#   gaussian -> sigma: amount of blur (higher = stronger)
#   motion   -> length and angle: intensity and direction of the motion blur
#   box      -> kernel_size: side of the averaging window (higher = stronger)
PARAM_RANGES = dict(
    gaussian=dict(sigma=(0.5, 3.0)),
    motion=dict(length=(5, 30), angle=(0, 180)),
    box=dict(kernel_size=(3, 15)),
)

# The three blur types that will be applied to each image
BLUR_TYPES = ["gaussian", "motion", "box"]

# Make sure the destination folder of each blur type exists
for blur_type in BLUR_TYPES:
    BLURRED_DIR.joinpath(blur_type).mkdir(parents=True, exist_ok=True)

def _motion_kernel(length, angle):
    """Return a normalized ``length x length`` line kernel oriented at ``angle``.

    The line passes through the kernel center. ``angle`` is in degrees,
    measured counter-clockwise from the positive x-axis; image rows grow
    downward, which is why the y offset is subtracted for the end point.
    """
    kernel = np.zeros((length, length))
    center = (length - 1) / 2.0
    theta = np.deg2rad(angle)
    dx = center * np.cos(theta)
    dy = center * np.sin(theta)
    # |dx|, |dy| <= center, so both end points stay inside the kernel
    start = (int(round(center - dx)), int(round(center + dy)))
    end = (int(round(center + dx)), int(round(center - dy)))
    cv2.line(kernel, start, end, 1, thickness=1)
    # At least one pixel is always drawn, so the sum is never zero
    return kernel / np.sum(kernel)


# Metadata (blur type and parameters) of every blurred image that was written
blurred_images = []

# Apply each blur type to every saved image
for i in range(10):  # Indices of the images saved in temp_images
    image_path = f"temp_images/immagine_{i}.png"
    image = cv2.imread(image_path)  # Returns None when the file cannot be read

    if image is None:
        # Report the skip so a missing or corrupt file does not go unnoticed
        logging.warning("Skipping unreadable image: %s", image_path)
        continue

    # Loop through each blur type (Gaussian, Motion, Box)
    for blur_type in BLUR_TYPES:
        if blur_type == 'gaussian':
            # Gaussian blur (soft, smooth blur)
            sigma = random.uniform(*PARAM_RANGES['gaussian']['sigma'])
            # Kernel spans roughly 6 sigma; "| 1" forces the odd size OpenCV requires
            ksize = int(6 * sigma) | 1
            blurred = cv2.GaussianBlur(image, (ksize, ksize), sigma)
            blur_params = {'gaussian_sigma': sigma}

        elif blur_type == 'motion':
            # Motion blur (directional blur): the kernel is a line drawn at the
            # sampled angle, so the recorded angle matches the applied blur
            length = random.randint(*PARAM_RANGES['motion']['length'])
            angle = random.uniform(*PARAM_RANGES['motion']['angle'])
            kernel = _motion_kernel(length, angle)
            blurred = cv2.filter2D(image, -1, kernel)
            blur_params = {'motion_length': length, 'motion_angle': angle}

        else:  # Box Blur
            # Box blur (plain average over a square window)
            kernel_size = random.randint(*PARAM_RANGES['box']['kernel_size'])
            blurred = cv2.blur(image, (kernel_size, kernel_size))
            blur_params = {'box_kernel_size': kernel_size}

        # Save the blurred image to the directory of its blur type
        output_path = BLURRED_DIR / blur_type / f"immagine_{i}.png"
        if not cv2.imwrite(str(output_path), blurred):
            # imwrite signals failure through its return value; do not record
            # metadata for an image that was never written
            logging.warning("Failed to write blurred image: %s", output_path)
            continue

        # Record the blur type and its parameters alongside the image name
        blur_params.update({'image': f"immagine_{i}.png", 'blur_type': blur_type})
        blurred_images.append(blur_params)

# Save metadata of blurred images to a Parquet file (efficient storage)
df_blur = pd.DataFrame(blurred_images)
df_blur.to_parquet("data/image_deblurring_performance_analysis_blur_metadata.parquet", index=False)

print("Blur generation completed.")