In [17]:
import string
import pandas as pd
import json
from ipywidgets import IntProgress
from IPython.display import display

In [18]:
# Build every two-letter lowercase prefix ("aa", "ab", ..., "zz");
# each one is used as a search query against Docker Hub.

queries = [
    first + second
    for first in string.ascii_lowercase
    for second in string.ascii_lowercase
]

In [19]:
# Read the saved result file of every query and stack them into one DataFrame.


def _read_query_dump(query):
    """Load the JSON file stored for a single two-letter query as a DataFrame."""
    return pd.read_json(f"../data/01_list_images/letters/{query}.json")


# Progress widget: one tick per query file that has been read.
bar = IntProgress(min=0, max=len(queries))
display(bar)

dfs = []
for query in queries:
    dfs.append(_read_query_dump(query))
    bar.value += 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
images = pd.concat(dfs, ignore_index=True)

IntProgress(value=0, max=676)

In [20]:
# Drop rows that repeat an image name (the same image can be returned by
# several queries), then order the frame from most to least popular.

images = (
    images
    .drop_duplicates(subset=["name"])
    .sort_values(by=["popularity"], ascending=False)
)

In [21]:
# preview the combined frame (pandas truncates the repr to the first and last rows)
images

Unnamed: 0,id,name,slug,type,publisher,created_at,updated_at,short_description,source,extension_reviewed,popularity,categories,operating_systems,architectures,logo_url,certification_status,star_count,filter_type
3324362,,stakater/reloader,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,,community,False,15707567662,,,,,,0,
128898,,fluent/fluent-bit,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,"Fluent Bit, lightweight logs and metrics colle...",community,False,13504855326,,,,,,0,
173908,,istio/pilot,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Istiod (formerly known as Pilot),community,False,10892841325,,,,,,0,
1777945,,istio/proxyv2,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Istio proxy,community,False,10856258988,,,,,,0,
59603,,datadog/agent,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Docker container for the new Datadog Agent,community,False,10506175135,,,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948837,,stimopicma/unduh-uc-browser-blackberry-9320-ce...,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Unduh Uc Browser Blackberry 9320 Cell Phone Game,community,False,0,,,,,,0,
3948838,,nejudpapo/unduh-film-di-uc-browser-error-di-ip...,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Unduh Film Di Uc Browser Error Di Iphone,community,False,0,,,,,,0,
3948839,,riabiosenword/aplikasi-pengunduh-video-youtube...,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Aplikasi Pengunduh Video Youtube Uc Browser Fi...,community,False,0,,,,,,0,
3948840,,textjamfconfoo1989/read-download-pdf-epub-virt...,,,{},0001-01-01T00:00:00Z,0001-01-01T00:00:00Z,Ebook Virtual Reality Access Enabled by Kieron...,community,False,0,,,,,,0,


In [22]:
# overall, ~3.7 million images were found, which is the approximate volume of Docker Hub (community part)

images.shape

(3694651, 18)

In [23]:
# Remove the metadata columns that are not needed for the further analysis.

unused_columns = [
    "id",
    "slug",
    "type",
    "publisher",
    "created_at",
    "updated_at",
    "extension_reviewed",
    "categories",
    "operating_systems",
    "architectures",
    "logo_url",
    "certification_status",
    "filter_type",
    "star_count",
    "source",
]
images_clean = images.drop(columns=unused_columns)

In [24]:
# preview the reduced frame produced by dropping the unneeded columns
images_clean

Unnamed: 0,name,short_description,popularity
3324362,stakater/reloader,,15707567662
128898,fluent/fluent-bit,"Fluent Bit, lightweight logs and metrics colle...",13504855326
173908,istio/pilot,Istiod (formerly known as Pilot),10892841325
1777945,istio/proxyv2,Istio proxy,10856258988
59603,datadog/agent,Docker container for the new Datadog Agent,10506175135
...,...,...,...
3948837,stimopicma/unduh-uc-browser-blackberry-9320-ce...,Unduh Uc Browser Blackberry 9320 Cell Phone Game,0
3948838,nejudpapo/unduh-film-di-uc-browser-error-di-ip...,Unduh Film Di Uc Browser Error Di Iphone,0
3948839,riabiosenword/aplikasi-pengunduh-video-youtube...,Aplikasi Pengunduh Video Youtube Uc Browser Fi...,0
3948840,textjamfconfoo1989/read-download-pdf-epub-virt...,Ebook Virtual Reality Access Enabled by Kieron...,0


In [25]:
# As can be seen, 75% of the images have at most 146 pulls (the 75th percentile).
# The statistics are rendered as fixed-point strings to avoid scientific notation.

images["popularity"].describe().apply(lambda value: f"{value:f}")

count        3694651.000000
mean          116215.950822
std         18573991.339427
min                0.000000
25%                8.000000
50%               31.000000
75%              146.000000
max      15707567662.000000
Name: popularity, dtype: object

In [26]:
# Keep only the images whose popularity lies above the 0.99968 quantile,
# forming a reasonably sized subset of images for further research.

popularity_cutoff_1000 = images_clean["popularity"].quantile(0.99968)
quant_1000 = images_clean.loc[images_clean["popularity"] > popularity_cutoff_1000]
quant_1000

Unnamed: 0,name,short_description,popularity
3324362,stakater/reloader,,15707567662
128898,fluent/fluent-bit,"Fluent Bit, lightweight logs and metrics colle...",13504855326
173908,istio/pilot,Istiod (formerly known as Pilot),10892841325
1777945,istio/proxyv2,Istio proxy,10856258988
59603,datadog/agent,Docker container for the new Datadog Agent,10506175135
...,...,...,...
909846,edwardcarmack/gearforce-web,,29534504
774904,rancher/dns,,29531302
1301155,maxmindinc/geoipupdate,Use ghcr.io/maxmind/geoipupdate for new releases,29463326
726731,directus/directus,"Directus is an open-source ""headless"" CMS & AP...",29400874


In [27]:
# save the selected subset to a CSV file (to_csv also writes the row index by default)
quant_1000.to_csv("../data/02_subset_images/quant_1000.csv")

In [None]:
# A bigger subset: every image whose popularity lies above the 0.999 quantile.
above_cutoff = images_clean["popularity"] > images_clean["popularity"].quantile(0.999)
quant = images_clean[above_cutoff]
quant.to_csv("../data/02_subset_images/quant.csv")