In [1]:
import string
import requests
import json
from requests import HTTPError
from ipywidgets import IntProgress
from IPython.display import display

In [5]:
# Build every two-letter lowercase combination ("aa" .. "zz"); each one is
# later used as a search query against Docker Hub.

queries = [
    first + second
    for first in string.ascii_lowercase
    for second in string.ascii_lowercase
]
queries[:10]

['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj']

In [6]:
def get_page(page: int, query: str, timeout: float = 30):
    """
    Request one page of Docker Hub community image search results.

    Parameters
    ----------
    page : int
        Page number sent to the search endpoint as the ``page`` parameter.
    query : str
        Search string sent as the ``q`` parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely on a stalled connection, which
        would freeze the whole crawl.

    Returns
    -------
    The value stored under the ``"summaries"`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    KeyError
        If the response body has no ``"summaries"`` key.
    """
    url = "https://hub.docker.com/api/content/v1/products/search"
    params = {
        "page_size": 100,
        "q": query,
        "source": "community",
        "type": "image",
        "page": page
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Turn 4xx/5xx answers into HTTPError so the caller can handle them.
    response.raise_for_status()
    data = response.json()
    return data["summaries"]

In [None]:
# Checkpoint of the crawl: maps every query to a flag telling whether it has
# already been handled. Everything starts out as not finished.
query_finished = dict.fromkeys(queries, False)

# Flag all queries whose first letter is among the first 16 letters (a-p) as
# finished -- presumably they were fetched in an earlier run; adjust the slice
# when resuming from a different position.
for first in string.ascii_lowercase[:16]:
    query_finished.update(
        {first + second: True for second in string.ascii_lowercase}
    )

In [None]:
# For each two-letter query, retrieve pages page_start..page_end (both
# inclusive, i.e. 100 pages with up to 100 images each) and store them in one
# JSON file per query.
index_start = 0
index_end = len(queries)
page_start = 1
page_end = 100  # last page to request (inclusive)

bar = IntProgress(min=0, max=index_end)
display(bar)


for index in range(index_start, index_end):
    query = queries[index]
    print(f"getting {query}")
    # Skip queries the checkpoint dict already marks as done.
    if not query_finished[query]:
        # Pre-fill every page with an empty list so that a page whose request
        # fails still has an entry in the output file.
        # `page_end + 1` because range() excludes its upper bound; without it
        # only 99 pages would be fetched.
        images = {
            page_index: [] for page_index in range(page_start, page_end + 1)
        }

        for page_index in range(page_start, page_end + 1):
            try:
                images[page_index] = get_page(page_index, query)
            except HTTPError:
                # Best effort: report the failed page and keep going; its
                # entry stays an empty list.
                print(f"error with query {query} page {page_index}")

        # Note: json.dump writes the integer page numbers as string keys.
        with open(f"../data/01_list_images/letters_paginated/{query}.json", "w") as output:
            json.dump(images, output, ensure_ascii=False, indent=4)

    print(f"got {query}")
    query_finished[query] = True
    bar.value += 1

In [None]:
# check all finished: evaluates to True only if every query is flagged as done
# (the flags are plain booleans, so no explicit comparison with True is needed)
all(query_finished.values())

In [None]:
# For each two-letter query, flatten all stored pages of the paginated file
# into one list and write that list to a per-query file.

bar = IntProgress(min=0, max=len(queries))
display(bar)

for query in queries:
    paginated_path = f"../data/01_list_images/letters_paginated/{query}.json"
    combined_path = f"../data/01_list_images/letters/{query}.json"

    with open(paginated_path, "r") as f:
        images_pages = json.load(f)

    # Concatenate the pages in the order they appear in the file.
    images = [image for page in images_pages.values() for image in page]

    with open(combined_path, "w") as output:
        json.dump(images, output, ensure_ascii=False, indent=4)
    bar.value += 1