In [1]:
import polars as pl
from pathlib import Path
import logging
from utils import set_up_logging, delete_corrupt_images

# Hand the log directory to the project's logging helper before anything logs.
set_up_logging(Path("../logs"))

# Download tuning knobs, all consumed by the download cell further down.
HTTP_TIMEOUT = 120  # passed as `timeout=` to requests.get
WORKER_COUNT = 16  # size of the download thread pool
RETRY_COUNT = 10  # attempts per photo

# Destination directory for the downloaded images, stored as a resolved
# path. It has to exist already; the notebook never creates it.
TARGET_PATH = Path("/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash").resolve()
assert TARGET_PATH.exists()

In [2]:
# Hand every entry currently in the target directory to the corruption
# check before working out which photos are still missing.
delete_corrupt_images([*TARGET_PATH.glob("*")])

In [3]:
# Lazily scan the tab-separated photo listing, keep the rows whose
# `photo_featured` column equals "t", order them by id, and materialise
# only the id and image URL columns.
photos = (
    pl.scan_csv(
        "../data/unsplash-full/photos.tsv000",
        separator="\t",
        infer_schema_length=100_000,
    )
    .filter(pl.col("photo_featured") == "t")
    .sort("photo_id")
    .select(["photo_id", "photo_image_url"])
    .collect()
)

# Preview the first ten rows.
photos.head(10)

photo_id,photo_image_url
str,str
"""---jvVJZ34o""","""https://images.unsplash.com/ph…"
"""--0-I4GpLZU""","""https://images.unsplash.com/ph…"
"""--2IBUMom1I""","""https://images.unsplash.com/ph…"
"""--2sDoKRgCg""","""https://images.unsplash.com/ph…"
"""--5QEAiAfgE""","""https://images.unsplash.com/ph…"
"""--6JlGcHl-w""","""https://images.unsplash.com/ph…"
"""--6sqOMUDs8""","""https://images.unsplash.com/ph…"
"""--97ozlPF1A""","""https://images.unsplash.com/ph…"
"""--D4Gg8RhIk""","""https://images.unsplash.com/ph…"
"""--EUYLhCTdc""","""https://images.unsplash.com/ph…"


In [4]:
# Ids already on disk: each file name up to its first dot.
keys = {entry.name.partition(".")[0] for entry in TARGET_PATH.glob("*")}
# Keep only the rows whose id has no file in the target directory yet.
photos = photos.filter(~pl.col("photo_id").is_in(keys))
logging.info(f"Found {len(photos)} missing photos")

2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos


In [5]:
import concurrent.futures
import requests
from tqdm import tqdm
from typing import List
from time import sleep

# Shared progress bar with one tick per photo still to download; the
# worker function advances it under the bar's own lock.
progress = tqdm(total=photos.height)


def download_image(row):
    """Download one photo and write it into ``TARGET_PATH``.

    The file is stored as ``<photo_id>.<extension>``, where the extension is
    the subtype of the response's ``Content-Type`` header (``image/jpeg`` ->
    ``jpeg``). Up to ``RETRY_COUNT`` attempts are made; any exception raised
    during an attempt is logged and the attempt is retried after a linearly
    growing pause.

    Args:
        row: Mapping with the keys ``"photo_id"`` and ``"photo_image_url"``.

    Returns:
        None. On success the shared ``progress`` bar is advanced by one. If
        every attempt fails, a final error is logged and the function returns
        without raising and without advancing the bar.

    Raises:
        KeyError: If ``row`` lacks one of the two required keys.
    """
    # Read the row before entering the retry loop so both names are always
    # bound when the error handler below formats its message.
    url = row["photo_image_url"]
    photo_id = row["photo_id"]

    for retry_count in range(RETRY_COUNT):
        try:
            logging.debug(f"Downloading {photo_id} from {url}")
            response = requests.get(url, timeout=HTTP_TIMEOUT)
            response.raise_for_status()
            # Content-Type may carry parameters ("image/jpeg; charset=binary");
            # only the bare subtype belongs in the file extension.
            media_type = response.headers["Content-Type"].split(";")[0].strip()
            extension = media_type.split("/")[-1]
            filename = TARGET_PATH / f"{photo_id}.{extension}"
            with open(filename, "wb") as f:
                f.write(response.content)
            logging.debug(f"Downloaded {photo_id} to {filename}")
            # The bar is shared by all worker threads, so update it under its lock.
            with progress.get_lock():
                progress.update(1)
            return
        except Exception as e:
            logging.error(
                f"Error downloading {photo_id} from {url} (retry {retry_count}): {e}",
                exc_info=True,
                stack_info=True,
            )
            # Linear back-off between attempts; pausing after the last
            # attempt would only delay the worker for nothing.
            if retry_count < RETRY_COUNT - 1:
                sleep(retry_count * 0.5)

    # Make permanent failures easy to find among the per-attempt errors.
    logging.error(
        f"Giving up on {photo_id} from {url} after {RETRY_COUNT} attempts"
    )


# Fan the downloads out over a pool of WORKER_COUNT threads, one task per
# remaining photo row, and block until every task has finished.
try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:
        futures: List[concurrent.futures.Future] = [
            executor.submit(download_image, row) for row in photos.to_dicts()
        ]

        progress.display()
        concurrent.futures.wait(futures)

    # wait() does not re-raise what a worker raised; without this pass an
    # exception escaping download_image would disappear without a trace.
    for future in futures:
        error = future.exception()
        if error is not None:
            logging.error(f"Download task failed: {error}", exc_info=error)
finally:
    # Close the bar even when the cell is interrupted or raises.
    progress.close()

  0%|          | 0/273293 [00:00<?, ?it/s]

  0%|          | 70/273293 [00:18<30:01:41,  2.53it/s]

In [None]:
# Run the corruption check again over everything now in the target
# directory, which includes the files this run just wrote.
delete_corrupt_images([*TARGET_PATH.glob("*")])