The "HQ-50K Dataset" is required, which contains the `test` and `train` folders. The important folder is `test`, which includes `.txt` files containing URLs for 1250 images.


In [1]:
import sys
sys.path.append("../..")

### Load HQ-50k/test/ subset


In [2]:
import os
import pandas as pd
import utils.constants as const

image_data = []

for txt_file in os.listdir(const.INIT_IMAGES_DATASET_PATH):
    if txt_file.endswith(".txt"):
        category = os.path.splitext(txt_file)[0] # Use the file name (without extension) as the category
        txt_path = os.path.join(const.INIT_IMAGES_DATASET_PATH, txt_file)
        with open(txt_path, "r") as file:
            urls = file.readlines()

            for url in urls:
                url = url.strip()
                if not url:
                    continue

                image_data.append({
                    "url": url,
                    "category": category
                })

print(f"Total processed images: {len(image_data)}")

df_image_data = pd.DataFrame(image_data)
display(df_image_data)

df_image_data.to_parquet(const.MAIN_DATASET_PATH, index=False)
print(f"\nDataframe saved")

Total processed images: 1250


Unnamed: 0,url,category
0,http://m.sothebysrealty.com/236i0/6ytt92qefx9y...,architecture
1,http://RealEstateAdminImages.gabriels.net/170/...,architecture
2,http://m.sothebysrealty.com/307i0/x8jcwqt4cwd5...,architecture
3,https://media.wired.com/photos/5eb0bb43baa7754...,architecture
4,https://d1sacvjbhsczdb.cloudfront.net/media/ho...,architecture
...,...,...
1245,https://www.historicaerials.com/topos/T2014/75...,map
1246,https://www.british-coast-maps.com/map-torquay...,map
1247,https://media.skigebiete-test.de/images/ecu/en...,map
1248,https://fa.dhstatics.com/fa/0825/15035158-4438...,map



Dataframe saved


### Download original images


In [None]:
from img2dataset import download
from pathlib import Path

def download_images(url_list, output_dir, generate_json=False):
    if not isinstance(url_list, Path):
        if isinstance(url_list, str):
            url_list = Path(url_list)
        else:
            raise TypeError("url_list must be a string or a pathlib.Path object")

    if not isinstance(output_dir, Path):
        if isinstance(output_dir, str):
            output_dir = Path(output_dir)
        else:
            raise TypeError("output_dir must be a string or a pathlib.Path object")

    output_dir.mkdir(parents=True, exist_ok=True)

    download(
        url_list=str(url_list),
        input_format="parquet",
        output_folder=str(output_dir),
        resize_mode="no",
        encode_format="png",
        encode_quality=9,
        thread_count=64,
    )

    if not generate_json:
        for json_file in (output_dir / "00000").glob("*.json"):
            json_file.unlink()

download_images(
    url_list=const.MAIN_DATASET_PATH,
    output_dir=const.ORIGINAL_DATASET_PATH
)

### Metadata extraction


In [3]:
import pyarrow.parquet as pq

import utils.constants as const

parquet_file_path = const.ORIGINAL_DATASET_PATH / "00000.parquet"
parquet_data = pq.read_table(parquet_file_path).to_pandas()
parquet_data

Unnamed: 0,url,key,status,error_message,width,height,original_width,original_height,exif,sha256
0,http://melissako.net/wp-content/uploads/meliss...,000000008,failed_to_download,<urlopen error [Errno -2] Name or service not ...,,,,,,
1,http://alliswall.com/file/1007/1920x1200/16:9/...,000000047,failed_to_download,<urlopen error [Errno -2] Name or service not ...,,,,,,
2,https://www.abergavennymotorhomehire.co.uk/wp-...,000000033,failed_to_download,<urlopen error [Errno -2] Name or service not ...,,,,,,
3,https://wallpapersdsc.net/wp-content/uploads/2...,000000020,failed_to_download,<urlopen error [Errno -5] No address associate...,,,,,,
4,http://www.northwestcontemporaryhome.com/wp-co...,000000054,failed_to_download,<urlopen error [Errno -2] Name or service not ...,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1245,https://fa.dhstatics.com/fa/0825/15035158-4438...,000001248,success,,1226.0,1500.0,1226.0,1500.0,{},ebd3aa2aaf3dd106a2a626309981045a254bc424c65ec0...
1246,https://media.skigebiete-test.de/images/ecu/en...,000001247,success,,2200.0,1317.0,2200.0,1317.0,{},6793dd8b6c68b92f68fc07f8ec61605a0a3102d16fb691...
1247,https://geospatialmedia.s3.amazonaws.com/wp-co...,000001249,success,,1920.0,1750.0,1920.0,1750.0,{},f9a21c8216bb738d08a91d02a22e2fe2d56b80ef960192...
1248,https://www.historicaerials.com/topos/T2014/75...,000001245,success,,1638.0,2088.0,1638.0,2088.0,{},fb99e3fce009d987f698f511dbec06c4d31fa50a7168ed...


In [4]:
def get_file_size_in_bytes(file_path):
    try:
        return os.path.getsize(file_path)
    except FileNotFoundError:
        return None

parquet_data['aspect_ratio'] = parquet_data['width'] / parquet_data['height']
parquet_data['size'] = parquet_data['key'].apply(
    lambda key: get_file_size_in_bytes(const.ORIGINAL_DATASET_PATH / "00000" / f"{key}.png")
)

df_image_data = df_image_data.merge(
    parquet_data[['url', 'key', 'width', 'height', 'exif', 'aspect_ratio', 'size']],
    on='url',
    how='outer'
)

display(df_image_data)

df_image_data.to_parquet(const.MAIN_DATASET_PATH, index=False)
print(f"Updated dataframe saved")


Unnamed: 0,url,category,key,width,height,exif,aspect_ratio,size
0,http://100500foto.com/wp-content/uploads/2016/...,people,000000291,,,,,
1,http://2gfsl7am0og1m91u0pwpiehl.wpengine.netdn...,indoor_scene,000000987,,,,,
2,http://411posters.com/wp-content/uploads/2011/...,poster,000000382,1300.0,1728.0,{},0.752315,3169036.0
3,http://RealEstateAdminImages.gabriels.net/170/...,architecture,000000058,,,,,
4,http://RealEstateAdminImages.gabriels.net/170/...,architecture,000000001,,,,,
...,...,...,...,...,...,...,...,...
1245,https://www.yamaha.com/en/musical_instrument_g...,complex,000000564,1800.0,1042.0,{},1.727447,3199285.0
1246,https://www.yellowmaps.com/usgs/topomaps/drg24...,map,000001225,1509.0,2026.0,"{""Image Tag 0x5100"": ""0""}",0.744817,5742001.0
1247,https://www.zappos.com/images/z/2/5/1/8/8/7/25...,furniture,000000481,1920.0,1440.0,{},1.333333,4276346.0
1248,https://ycdn.space/h/2015/02/Capitol-Hill-Loft...,indoor_scene,000000954,1050.0,1575.0,{},0.666667,2000861.0


Updated dataframe saved


### Resulting dataset distribution


In [5]:
import numpy as np

df_image_data = df_image_data.dropna()

category_counts = df_image_data.groupby('category')['url'].count()
print(category_counts)
sum_counts = category_counts.sum()
print(f"\nSum of counts: {sum_counts}\n")

mean = np.mean(category_counts)
std_dev = np.std(category_counts)

print(f"Mean: {mean}")
print(f"Standard deviation: {std_dev}")

category
animal          94
architecture    76
comic           89
complex         76
food            74
furniture       71
indoor_scene    69
map             80
people          72
poster          83
scenery         65
vehicle         68
withchar        33
Name: url, dtype: int64

Sum of counts: 950

Mean: 73.07692307692308
Standard deviation: 14.073770138706962
