This notebook requires the HQ-50K dataset, which contains the `train` and `test` folders. Only the `test` folder is used here: its `.txt` files (one per category) list the URLs of 1250 images in total.

### Load HQ-50k test

In [10]:
# create df with "url" and "category"

import os
import pandas as pd

dataset_path = "../../data/HQ-50k/test"
image_data = []

# Iterate over the .txt files in sorted order: os.listdir() returns entries in
# arbitrary, filesystem-dependent order, and the row order determines the
# DataFrame index, so it has to be reproducible across machines and runs.
for txt_file in sorted(os.listdir(dataset_path)):
    if not txt_file.endswith(".txt"):
        continue

    category = os.path.splitext(txt_file)[0]  # Use the file name (without extension) as the category
    txt_path = os.path.join(dataset_path, txt_file)

    # Explicit encoding so that reading does not depend on the platform's default locale
    with open(txt_path, "r", encoding="utf-8") as file:
        for line in file:
            url = line.strip()
            if not url:
                continue  # Skip blank lines

            image_data.append({
                "url": url,
                "category": category
            })

# Verify the total number of images, must be 1250
print(f"Total processed images: {len(image_data)}")

df_image_data = pd.DataFrame(image_data)
print(df_image_data)

Total processed images: 1250
                                                    url  category
0     https://prosperwell.com/wp-content/uploads/201...    animal
1     https://jennifertarheelreader.files.wordpress....    animal
2     http://lanting.com/wp-content/uploads/Lanting_...    animal
3     https://drscdn.500px.org/photo/41458556/m%3D20...    animal
4     https://images7.alphacoders.com/755/thumb-1920...    animal
...                                                 ...       ...
1245  https://images.squarespace-cdn.com/content/v1/...  withchar
1246  https://s3-us-west-2.amazonaws.com/static1.vil...  withchar
1247  https://greeblehaus.com/wp-content/uploads/201...  withchar
1248  http://media3.onsugar.com/files/2013/12/06/901...  withchar
1249  http://www.fontpad.co.uk/wp-content/uploads/20...  withchar

[1250 rows x 2 columns]


### Validation

In [20]:
# validate URLs
from concurrent.futures import ThreadPoolExecutor       # for parallel processing
import requests                                         # for HTTP requests
from tqdm import tqdm                                   # for progress bar

# Function to verify a single URL and ensure it points to a valid image
def is_valid_image_url(url):
    """Check whether a URL is reachable and serves an image.

    A HEAD request is tried first because it avoids downloading the body.
    Some servers refuse HEAD requests even though a GET would succeed, so for
    those status codes the check is repeated with a streamed GET that is
    closed as soon as the response headers have been received.

    Args:
        url: The URL to check.

    Returns:
        True if the final response (after redirects) has status 200 and a
        Content-Type containing 'image'; False otherwise, including on any
        requests.RequestException (timeout, connection error, invalid URL, ...).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Status codes for which HEAD is retried as GET (forbidden / method not allowed / not implemented)
    head_rejected_statuses = (403, 405, 501)

    def _is_image_response(response):
        # Header values are compared in lowercase so 'Image/JPEG' is accepted too
        content_type = response.headers.get('Content-Type', '')
        return response.status_code == 200 and 'image' in content_type.lower()

    try:
        response = requests.head(url, timeout=20, headers=headers, allow_redirects=True)
        if _is_image_response(response):
            return True

        if response.status_code in head_rejected_statuses:
            # stream=True defers the body download; the context manager closes
            # the connection without ever fetching the image data.
            with requests.get(url, timeout=20, headers=headers, allow_redirects=True, stream=True) as response:
                return _is_image_response(response)

        return False
    except requests.RequestException:
        return False

# Function to process a row of the DataFrame
def process_row(row):
    """Keep a row only if its URL points to a valid image.

    Args:
        row: Mapping (DataFrame row or dict) with the 'url' and 'category' keys.

    Returns:
        A dict with the row's 'url' and 'category' when the URL passes
        is_valid_image_url, otherwise None.
    """
    image_url = row['url']

    # Guard clause: rows whose URL fails validation are reported as None
    if not is_valid_image_url(image_url):
        return None

    return {'url': image_url, 'category': row['category']}

# Verify URLs in parallel
# Rows are handed to the workers as plain dicts (process_row only reads the
# 'url' and 'category' keys); executor.map returns results in input order.
url_records = df_image_data.to_dict("records")
with ThreadPoolExecutor(max_workers=8) as executor:  # Use 8 threads to parallelize
    validation_results = list(tqdm(executor.map(process_row, url_records), total=len(url_records), desc="Verify URLs"))
valid_image_data = [result for result in validation_results if result is not None]

# Create the new DataFrame
# Explicit columns keep the 'category' filter below from raising a KeyError when no URL passed validation
df_valid_image_data = pd.DataFrame(valid_image_data, columns=['url', 'category'])
df_valid_image_data = df_valid_image_data[df_valid_image_data['category'] != 'withchar'] # Exclude 'withchar' category because it has few images
# Renumber the rows after filtering so the index stays contiguous (0..n-1) regardless of which rows were removed
df_valid_image_data = df_valid_image_data.reset_index(drop=True)

# Display the result
print(df_valid_image_data)

Verify URLs: 100%|██████████| 1250/1250 [01:10<00:00, 17.66it/s]

                                                   url category
0    https://prosperwell.com/wp-content/uploads/201...   animal
1    https://jennifertarheelreader.files.wordpress....   animal
2    http://lanting.com/wp-content/uploads/Lanting_...   animal
3    https://drscdn.500px.org/photo/41458556/m%3D20...   animal
4    https://images7.alphacoders.com/755/thumb-1920...   animal
..                                                 ...      ...
905  https://www.williamhortonphotography.com/wp-co...  vehicle
906  http://media4.onsugar.com/files/2014/01/12/123...  vehicle
907  https://amsrus.ru/wp-content/uploads/2017/01/b...  vehicle
908            https://i.postimg.cc/JzD5v86M/10024.jpg  vehicle
909            https://i.postimg.cc/LsjB4TFW/10031.jpg  vehicle

[910 rows x 2 columns]





### 'image_deblurring_dataset.csv' creation

In [None]:
# set ids and metadata
# save in `image_deblurring_dataset.csv`

from PIL import Image
from io import BytesIO
import numpy as np

# Function to extract image metadata
def extract_image_metadata(row, unique_id):
    """Download one image and collect its basic metadata.

    Args:
        row: Mapping with at least the 'url' and 'category' keys.
        unique_id: Identifier stored unchanged in the record's 'id' field.

    Returns:
        A dict with the keys 'id', 'width', 'height', 'aspect_ratio',
        'format', 'size' (length of the downloaded content in bytes),
        'category' and 'url'. If the download or decoding fails, or the server
        answers with a status other than 200, the error is printed and the
        metadata fields ('width', 'height', 'aspect_ratio', 'format', 'size')
        are NaN, so every input row yields exactly one record.
    """
    # Same browser-like User-Agent as the URL validation step, so that a URL
    # accepted there is requested with identical client headers here.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(row['url'], timeout=20, headers=headers)
        if response.status_code != 200:
            # Route unexpected statuses through the common error path instead of
            # silently returning None (which would drop the image without a trace).
            raise RuntimeError(f"unexpected HTTP status {response.status_code}")

        content = response.content
        # The context manager releases the decoder's resources once the header fields are read
        with Image.open(BytesIO(content)) as img:
            width, height = img.size
            image_format = img.format

        return {
            'id': unique_id,
            'width': width,
            'height': height,
            'aspect_ratio': width / height,
            'format': image_format,
            'size': len(content),  # Weight of the image in bytes
            'category': row['category'],
            'url': row['url']
        }
    except Exception as e:
        # Best effort: a single broken image must not abort the whole extraction
        print(f"Error processing URL {row['url']}: {e}")
        # Return NaN for all metadata fields in case of an error
        return {
            'id': unique_id,
            'width': np.nan,
            'height': np.nan,
            'aspect_ratio': np.nan,
            'format': np.nan,
            'size': np.nan,
            'category': row['category'],
            'url': row['url']
        }

# Parallelize metadata extraction
def process_metadata_in_parallel(df, max_workers=8):
    """Run extract_image_metadata over every row of df using a thread pool.

    Args:
        df: DataFrame whose rows are passed (as dicts) to extract_image_metadata;
            each row's index label is passed as the record id.
        max_workers: Number of worker threads.

    Returns:
        List of the non-None records returned by extract_image_metadata,
        in the same order as the rows of df.
    """
    # Materialise (index label, row dict) pairs up front
    tasks = [(row_label, row.to_dict()) for row_label, row in df.iterrows()]

    def _extract(task):
        row_label, row_dict = task
        return extract_image_metadata(row_dict, row_label)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        records = list(tqdm(pool.map(_extract, tasks), total=len(df), desc="Extracting metadata"))

    # Drop rows for which no record was produced
    return [record for record in records if record is not None]

# Destination of the generated dataset index
output_path = "../../data/image_deblurring_dataset.csv"

# Download every validated image (in parallel) and collect one metadata record per image
image_metadata = process_metadata_in_parallel(df_valid_image_data)

# Tabulate the records and write them out without the DataFrame index column
df_metadata = pd.DataFrame(image_metadata)
df_metadata.to_csv(output_path, index=False)

print(f"Metadata saved to {output_path}")

Extracting metadata:  36%|███▌      | 325/910 [00:21<00:31, 18.70it/s]

Error processing URL https://danieletorella.com/wp-content/uploads/2017/06/wedding-castello-meleto-toscana-0044.jpg: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error processing URL https://www.demetriopaparoni.com/image.php?rand=1597401839&t=p&id=18&o=1: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Extracting metadata:  47%|████▋     | 431/910 [00:37<00:41, 11.41it/s]

Error processing URL https://stillmed.olympic.org/media/Images/OlympicOrg/News/2019/12/03/2019-12-03-EB-fight-against-doping-thumbnail.jpg: HTTPSConnectionPool(host='stillmed.olympic.org', port=443): Read timed out. (read timeout=20)


Extracting metadata: 100%|██████████| 910/910 [00:49<00:00, 18.53it/s]

Metadata saved to ../../data/image_deblurring_dataset.csv



