This notebook requires the HQ-50K dataset, which contains a `train` and a `test` folder. Only the `test` folder is used here: it holds one `.txt` file per category, and together these files list the URLs of 1250 images.

### Load HQ-50k test

In [1]:
# create df with "url" and "category"

import os
import pandas as pd

dataset_path = "../../data/HQ-50k/test"
image_data = []

# Each .txt file lists the image URLs of one category; the file name (without
# extension) is used as the category label.
# os.listdir() returns names in arbitrary order, so they are sorted to keep the
# row order of the resulting DataFrame reproducible across machines and runs.
for txt_file in sorted(os.listdir(dataset_path)):
    if not txt_file.endswith(".txt"):
        continue

    category = os.path.splitext(txt_file)[0]
    txt_path = os.path.join(dataset_path, txt_file)

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(txt_path, "r", encoding="utf-8") as file:
        for line in file:
            url = line.strip()
            if url:  # skip blank lines
                image_data.append({
                    "url": url,
                    "category": category
                })

# Verify the total number of images, must be 1250
print(f"Total processed images: {len(image_data)}")

df_image_data = pd.DataFrame(image_data)
print(df_image_data)

Total processed images: 1250
                                                    url  category
0     https://prosperwell.com/wp-content/uploads/201...    animal
1     https://jennifertarheelreader.files.wordpress....    animal
2     http://lanting.com/wp-content/uploads/Lanting_...    animal
3     https://drscdn.500px.org/photo/41458556/m%3D20...    animal
4     https://images7.alphacoders.com/755/thumb-1920...    animal
...                                                 ...       ...
1245  https://images.squarespace-cdn.com/content/v1/...  withchar
1246  https://s3-us-west-2.amazonaws.com/static1.vil...  withchar
1247  https://greeblehaus.com/wp-content/uploads/201...  withchar
1248  http://media3.onsugar.com/files/2013/12/06/901...  withchar
1249  http://www.fontpad.co.uk/wp-content/uploads/20...  withchar

[1250 rows x 2 columns]


### Validation

In [3]:
# validate URLs
from concurrent.futures import ThreadPoolExecutor       # for parallel processing
import requests                                         # for HTTP requests
from tqdm import tqdm                                   # for progress bar

# Function to verify a single URL
def is_valid_url(url):
    """
    Report whether `url` answers an HTTP HEAD request with status 200.

    The request follows redirects, sends a browser-like User-Agent and
    gives up after 15 seconds.

    Returns:
        True when the final response status is exactly 200; False for any
        other status or when `requests` raises a RequestException.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        reply = requests.head(url, timeout=15, headers=browser_headers, allow_redirects=True)
    except requests.RequestException:
        # Request-level failures are treated as "not valid"
        return False
    return reply.status_code == 200

# Function to process a row of the DataFrame
def process_row(row):
    """
    Keep a DataFrame row only if its image URL is reachable.

    Args:
        row: mapping-like object with 'url' and 'category' entries.

    Returns:
        A dict with the row's 'url' and 'category' when is_valid_url()
        accepts the URL, otherwise None.
    """
    if not is_valid_url(row['url']):
        return None
    return {'url': row['url'], 'category': row['category']}

# Verify URLs in parallel
valid_image_data = []
with ThreadPoolExecutor(max_workers=8) as executor:  # Use 8 threads to parallelize the HTTP checks
    # executor.map yields process_row's return values (not Future objects), in input order
    futures = list(tqdm(executor.map(process_row, [row for _, row in df_image_data.iterrows()]), total=len(df_image_data), desc="Verify URLs"))
    # process_row returns None for unreachable URLs; drop those
    valid_image_data = [result for result in futures if result is not None]

# Create the new DataFrame
# Explicit columns keep the frame well-formed (and the filter below working)
# even when no URL passed validation and the list is empty.
df_valid_image_data = pd.DataFrame(valid_image_data, columns=['url', 'category'])
# Exclude 'withchar' category because it has few images.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_valid_image_data = df_valid_image_data[df_valid_image_data['category'] != 'withchar'].copy()

# Display the result
print(df_valid_image_data)

Verify URLs: 100%|██████████| 1250/1250 [01:21<00:00, 15.34it/s]

                                                   url category
0    https://prosperwell.com/wp-content/uploads/201...   animal
1    https://jennifertarheelreader.files.wordpress....   animal
2    http://lanting.com/wp-content/uploads/Lanting_...   animal
3    https://drscdn.500px.org/photo/41458556/m%3D20...   animal
4    https://images7.alphacoders.com/755/thumb-1920...   animal
..                                                 ...      ...
927  https://www.williamhortonphotography.com/wp-co...  vehicle
928  http://media4.onsugar.com/files/2014/01/12/123...  vehicle
929  https://amsrus.ru/wp-content/uploads/2017/01/b...  vehicle
930            https://i.postimg.cc/JzD5v86M/10024.jpg  vehicle
931            https://i.postimg.cc/LsjB4TFW/10031.jpg  vehicle

[932 rows x 2 columns]





### 'image_deblurring_dataset.csv' creation

In [4]:
from io import BytesIO
import numpy as np
import cv2
from PIL import Image

# Add a sequential, 1-based 'id' column to the DataFrame.
# df_valid_image_data was produced by boolean filtering, so writing a column into
# it in place can raise SettingWithCopyWarning. assign() returns a new frame
# instead, which also keeps this cell safe to re-run.
df_valid_image_data = df_valid_image_data.assign(id=range(1, len(df_valid_image_data) + 1))

# Function to process a single image and extract metadata
def extract_image_metadata(row):
    """
    Download the image referenced by `row` and describe it.

    Args:
        row: mapping-like object with 'id', 'category' and 'url' entries.

    Returns:
        A dict with keys 'id', 'category', 'width', 'height', 'format',
        'aspect_ratio', 'size' and 'url'. When the download, the OpenCV decode
        or the Pillow format detection fails, the error is printed and the five
        metadata fields are NaN while 'id', 'category' and 'url' are kept.
    """
    try:
        image_url = row['url']
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Full GET: the image bytes are needed for decoding
        response = requests.get(image_url, timeout=20, headers=browser_headers, allow_redirects=True)
        response.raise_for_status()  # HTTP error statuses go to the except branch
        payload = response.content
        byte_count = len(payload)  # size of the downloaded body in bytes

        # Decode with OpenCV; imdecode signals failure by returning None
        encoded = np.asarray(bytearray(payload), dtype=np.uint8)
        pixels = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        if pixels is None:
            raise ValueError("Failed to decode image with OpenCV")

        # Array shape is (rows, columns, channels)
        pixel_rows, pixel_cols, _channels = pixels.shape
        ratio = round(pixel_cols / pixel_rows, 2)

        # Pillow reports the container format of the same bytes
        detected_format = Image.open(BytesIO(payload)).format

        return {
            'id': row['id'],
            'category': row['category'],
            'width': pixel_cols,
            'height': pixel_rows,
            'format': detected_format,
            'aspect_ratio': ratio,
            'size': byte_count,
            'url': image_url  # URL kept as the last column
        }
    except Exception as e:
        print(f"Error processing URL {row['url']}: {e}")
        # Keep the row identifiable, but mark every metadata field as missing
        missing = np.nan
        return {
            'id': row['id'],
            'category': row['category'],
            'width': missing,
            'height': missing,
            'format': missing,
            'aspect_ratio': missing,
            'size': missing,
            'url': row['url']
        }

# Parallelize metadata extraction
def process_metadata_in_parallel(df, max_workers=8):
    """
    Run extract_image_metadata over every row of `df` on a thread pool.

    Args:
        df: DataFrame whose rows are passed one by one to extract_image_metadata.
        max_workers: number of worker threads in the pool.

    Returns:
        A list with one extract_image_metadata result per row, in row order.
    """
    row_items = [series for _, series in df.iterrows()]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # tqdm wraps the lazy result iterator so progress advances as results arrive
        progress = tqdm(pool.map(extract_image_metadata, row_items), total=len(df))
        return list(progress)

# Destination of the generated dataset description
output_path = "../../data/image_deblurring_dataset.csv"

# Download every validated image and collect one metadata record per row
image_metadata = process_metadata_in_parallel(df_valid_image_data)

# Turn the list of records into a table
df_metadata = pd.DataFrame(image_metadata)

# Write the table without the index; quoting=1 is csv.QUOTE_ALL, so every field is quoted
df_metadata.to_csv(output_path, quoting=1, index=False)

print(f"Metadata saved to {output_path}")

 13%|█▎        | 118/932 [00:12<01:28,  9.23it/s]

Error processing URL https://www.gilingo.de/wp-content/uploads/2017/07/iStock-648821756-Edit-Final-darker.jpg: Failed to decode image with OpenCV


 31%|███       | 287/932 [00:25<00:40, 15.80it/s]

Error processing URL https://www.dreampirates.in/wallpaper/textures/img/25-12-2019-1674-wood-surface-texture-boards.jpg: Failed to decode image with OpenCV
Error processing URL https://www.all4women.co.za/wp-content/uploads/2021/04/15/Nandos2484.jpg: Failed to decode image with OpenCV
Error processing URL https://media2.fdncms.com/eastbayexpress/imager/u/original/12065784/img_0623.jpg: Failed to decode image with OpenCV


 42%|████▏     | 394/932 [00:38<00:41, 13.10it/s]

Error processing URL https://www.adelaidehillswinetrail.com/wp-content/uploads/2015/01/vintage-peacock-wall-art.jpg: Failed to decode image with OpenCV


 64%|██████▍   | 599/932 [01:01<00:32, 10.12it/s]

Error processing URL https://www.cartographersguild.com/attachment.php?attachmentid=42083: Failed to decode image with OpenCV


 66%|██████▌   | 617/932 [01:03<00:31, 10.16it/s]

Error processing URL https://img.kansasmemory.org/00278033.jpg: Failed to decode image with OpenCV


 67%|██████▋   | 621/932 [01:03<00:29, 10.50it/s]

Error processing URL http://cascoly.com/maps/map-1965-world.jpg: Failed to decode image with OpenCV


 70%|███████   | 656/932 [01:12<01:34,  2.93it/s]

Error processing URL https://kerrymorgan.com/wp-content/uploads/2017/08/2017-08-22_0029.jpg: Failed to decode image with OpenCV


100%|██████████| 932/932 [01:38<00:00,  9.51it/s]

Metadata saved to ../../data/image_deblurring_dataset.csv





### Check the standard deviation of image counts per category

In [8]:
# How evenly are the usable images spread across the categories?

# Drop every row that has a NaN in any column (e.g. images whose metadata extraction failed)
df_filtered = df_metadata.dropna(axis=0, how='any')

# Number of remaining images in each category, and their total
category_counts = df_filtered.groupby('category')['id'].count()
sum_counts = category_counts.sum()
print(category_counts)
print(f"\nSum of counts: {sum_counts}\n")

# np.std defaults to ddof=0, i.e. the population standard deviation of the counts
std_dev = np.std(category_counts)

print(f"Standard deviation of the number of images per category: {std_dev}")

category
animal          94
architecture    74
comic           90
complex         75
food            76
furniture       70
indoor_scene    69
map             81
people          74
poster          83
scenery         65
vehicle         72
Name: id, dtype: int64

Sum of counts: 923

Standard deviation of the number of images per category: 8.240735538908053
