This notebook requires the HQ-50K dataset, which contains a `train` and a `test` folder. Only the `test` folder is used here: its `.txt` files (one per category) list the URLs of 1250 images.


### Load the HQ-50K `test/` subset


In [None]:
import os

import pandas as pd

import utils.constants as const

# Build one record per image URL found in the HQ-50K `test` listing files.
# Each `<category>.txt` file holds one URL per line; the file name (without its
# extension) is used as the category label for every URL it contains.
image_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting dataframe reproducible from one run/machine to the next.
for txt_file in sorted(os.listdir(const.INIT_IMAGES_DATASET_PATH)):
    if not txt_file.endswith(".txt"):
        continue

    category = os.path.splitext(txt_file)[0]  # file name without extension
    txt_path = os.path.join(const.INIT_IMAGES_DATASET_PATH, txt_file)

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(txt_path, "r", encoding="utf-8") as file:
        for line in file:
            url = line.strip()
            if url:  # skip blank lines
                image_data.append({"url": url, "category": category})

print(f"Total processed images: {len(image_data)}")

# Explicit columns so that an empty listing still yields a well-formed frame.
df_image_data = pd.DataFrame(image_data, columns=["url", "category"])
display(df_image_data)

# Persist the URL/category table; the download step reads it back from disk.
df_image_data.to_parquet(const.MAIN_DATASET_PATH, index=False)
print(f"\nDataframe saved to '{const.MAIN_DATASET_PATH}'")

Total processed images: 1250


Unnamed: 0,url,category
0,http://m.sothebysrealty.com/236i0/6ytt92qefx9y...,architecture
1,http://RealEstateAdminImages.gabriels.net/170/...,architecture
2,http://m.sothebysrealty.com/307i0/x8jcwqt4cwd5...,architecture
3,https://media.wired.com/photos/5eb0bb43baa7754...,architecture
4,https://d1sacvjbhsczdb.cloudfront.net/media/ho...,architecture
...,...,...
1245,https://www.historicaerials.com/topos/T2014/75...,map
1246,https://www.british-coast-maps.com/map-torquay...,map
1247,https://media.skigebiete-test.de/images/ecu/en...,map
1248,https://fa.dhstatics.com/fa/0825/15035158-4438...,map



Dataframe saved to '../../data/image-deblurring-performance-analysis/image_deblurring_dataset.parquet'


### Download original images


In [None]:
from img2dataset import download
from pathlib import Path

def _as_path(value, arg_name):
    """Return `value` as a `pathlib.Path`; only `str` and `Path` are accepted."""
    if isinstance(value, Path):
        return value
    if isinstance(value, str):
        return Path(value)
    raise TypeError(f"{arg_name} must be a string or a pathlib.Path object")


def download_images(url_list, output_dir, generate_json=False):
    """Download the images listed in a parquet file into `output_dir`.

    The images are fetched with `img2dataset.download`, without resizing,
    and stored PNG-encoded.

    Args:
        url_list: Path (``str`` or ``pathlib.Path``) of the parquet file that
            lists the image URLs.
        output_dir: Destination folder (``str`` or ``pathlib.Path``); it is
            created, including missing parents, if it does not exist.
        generate_json: When False (default), the ``*.json`` files found inside
            the shard folders of `output_dir` are deleted after the download.

    Raises:
        TypeError: If `url_list` or `output_dir` is neither a ``str`` nor a
            ``pathlib.Path``.
    """
    url_list = _as_path(url_list, "url_list")
    output_dir = _as_path(output_dir, "output_dir")

    output_dir.mkdir(parents=True, exist_ok=True)

    download(
        url_list=str(url_list),
        input_format="parquet",
        output_folder=str(output_dir),
        resize_mode="no",
        encode_format="png",
        # NOTE(review): for PNG this is presumably the 0-9 compression level
        # rather than a 0-100 quality -- confirm against the img2dataset docs.
        encode_quality=9,
        thread_count=64,
    )

    if not generate_json:
        # Clean every shard folder rather than only "00000", so URL lists that
        # are split into several shards are handled as well.
        # Assumes shard folders have all-digit names such as "00000"; files
        # that sit directly in `output_dir` are left untouched.
        for shard_dir in output_dir.iterdir():
            if shard_dir.is_dir() and shard_dir.name.isdigit():
                for json_file in shard_dir.glob("*.json"):
                    json_file.unlink()

# Fetch every URL listed in the main dataset parquet into the originals folder.
# generate_json keeps its default (False).
download_images(const.MAIN_DATASET_PATH, const.ORIGINAL_DATASET_PATH)

Starting the downloading of this file
Sharding file number 1 of 1 called /home/sense/SUPSI/supsi-labs/1.2/data-project-and-hackaton/image-deblurring-performance-analysis/notebooks/dataset_creation/../../data/image_deblurring_dataset.parquet


0it [00:00, ?it/s]

File sharded in 1 shards
Downloading starting now, check your bandwidth speed (with bwm-ng)your cpu (with htop), and your disk usage (with iotop)!


1it [18:29, 1109.85s/it]


worker  - success: 0.760 - failed to download: 0.229 - failed to resize: 0.011 - images per sec: 1 - count: 1250
total   - success: 0.760 - failed to download: 0.229 - failed to resize: 0.011 - images per sec: 1 - count: 1250


### Metadata extraction


In [23]:
import pyarrow.parquet as pq

import utils.constants as const

# Metadata table of the first (and, for this run, only) shard in the download folder.
parquet_file_path = const.ORIGINAL_DATASET_PATH / "00000.parquet"
shard_table = pq.read_table(parquet_file_path)

# `error_message` is removed before dropna(), which then keeps only the rows in
# which every remaining column is populated.
# NOTE(review): presumably `error_message` is empty for successful downloads and
# would otherwise make dropna() discard them -- confirm.
parquet_data = (
    shard_table
    .to_pandas()
    .drop(columns=["error_message"])
    .dropna()
)
parquet_data

Unnamed: 0,url,key,status,width,height,original_width,original_height,exif,sha256
5,https://media.wired.com/photos/5eb0bb43baa7754...,000000003,success,1600.0,1067.0,1600.0,1067.0,{},a60bc92a3eea64049891455993d337c015d6e9ffe12ed3...
10,http://mredllc.media-cs.connectmls.com/699F686...,000000039,success,1620.0,1080.0,1620.0,1080.0,{},54fd1f55509ec28fc10206d812fc0e402d53bc2d035e49...
11,https://d1sacvjbhsczdb.cloudfront.net/media/ho...,000000004,success,1072.0,1072.0,1072.0,1072.0,{},962c9203ccee681dfee3779b0ecc7f58d537484bcb1d5c...
15,https://www.officelovin.com/wp-content/uploads...,000000013,success,1700.0,1133.0,1700.0,1133.0,"{""Image XResolution"": ""240"", ""Image YResolutio...",89e1a574b269eea8f931494cb8dd049e6fa2694dcd6bd0...
16,https://c8.alamy.com/comp/F3X5NK/car-light-tra...,000000012,success,1300.0,1327.0,1300.0,1327.0,{},3cc10c093e7f86b449197698385ad8fa2c181664aff2d6...
...,...,...,...,...,...,...,...,...,...
1245,https://fa.dhstatics.com/fa/0825/15035158-4438...,000001248,success,1226.0,1500.0,1226.0,1500.0,{},ebd3aa2aaf3dd106a2a626309981045a254bc424c65ec0...
1246,https://media.skigebiete-test.de/images/ecu/en...,000001247,success,2200.0,1317.0,2200.0,1317.0,{},6793dd8b6c68b92f68fc07f8ec61605a0a3102d16fb691...
1247,https://geospatialmedia.s3.amazonaws.com/wp-co...,000001249,success,1920.0,1750.0,1920.0,1750.0,{},f9a21c8216bb738d08a91d02a22e2fe2d56b80ef960192...
1248,https://www.historicaerials.com/topos/T2014/75...,000001245,success,1638.0,2088.0,1638.0,2088.0,{},fb99e3fce009d987f698f511dbec06c4d31fa50a7168ed...


In [None]:
# TODO use already present 'df_image_data' to store metadata
# TODO from parquet_data extract all relevant data (width, height, exif)
# TODO extract additional data that was not present in .json (aspect_ratio, size)
# TODO save df_image_data to const.MAIN_DATASET_PATH (to_parquet)

### Resulting dataset distribution


In [None]:
import numpy as np

# Discard rows that contain any missing value before counting.
df_image_data = df_image_data.dropna()

# Number of URLs per category, plus summary statistics over those counts.
category_counts = df_image_data.groupby('category')['url'].count()
sum_counts = category_counts.sum()
mean, std_dev = np.mean(category_counts), np.std(category_counts)

# Report: per-category counts first, then the aggregates.
print(category_counts)
print(f"\nSum of counts: {sum_counts}\n")
print(f"Mean: {mean}")
print(f"Standard deviation: {std_dev}")