In [1]:
import json
import numpy as np
from metaclip.substr_matching import substr_matching
from metaclip.balancing import balance_sampling

# Load the metadata entries. JSON text is UTF-8, so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
with open("metadata.json", encoding="utf-8") as f:
  metadata = json.load(f)
# Entry counts for our 1.6B (pool) -> 400M (curated) run; for your own data, please check
# balance_sampling:main and run the substring matching and counting yourself.
with open("metaclip/entry_counts_400m.json", encoding="utf-8") as f:
  entry_count_json = json.load(f)
# One count per metadata entry, looked up by entry and kept in the iteration order of
# `metadata`. A missing entry raises KeyError. uint64 to be safe for scaling.
entry_count = np.array([entry_count_json[entry] for entry in metadata], dtype=np.uint64)

In [2]:
# Count threshold: entries seen fewer than t times are treated as if seen exactly t times.
t = 20000
# Clamp in place (entry_count itself is modified), so t / count is exactly 1 for
# those entries and strictly below 1 for every entry with a larger count.
below_t = entry_count < t
entry_count[below_t] = t
# Per-entry probability, aligned index-for-index with entry_count.
entry_prob = t / entry_count

In [3]:
# Re-run this cell several times to see the chance of an example being sampled.
# Each record is a 3-item list: a field tag, the raw text, and a slot that the loop
# below fills with the matched entry ids.
texts = [
    ["alt", caption, None]
    for caption in (
        "jacksons chameleon",
        "battery plate",
        "trombone model",
        "Adult T-shirt",
    )
]

for text in texts:
    # Store the matches on the record itself; a later cell reads text[2] again.
    matched_entry_ids = substr_matching(text[1], metadata)
    text[2] = matched_entry_ids
    # Summed per-entry probability, capped at 1.0.
    # NOTE(review): if balance_sampling draws independently per matched entry, this sum
    # is only an upper bound on the true curation probability - confirm against its code.
    curation_prob = min(entry_prob[matched_entry_ids].sum(), 1.0)
    curated = balance_sampling(matched_entry_ids, entry_prob)
    print(f"[curation_prob={curation_prob:.3f}, curated={curated}] {text[1]}")

'jacksons chameleon' curated
'trombone model' curated
'Adult T-shirt' curated


In [4]:
import html

from IPython.display import display, HTML

# Font size used for an entry with probability ~0 and with probability 1, respectively.
# NOTE(review): the legacy <font size> attribute only distinguishes 1-7, so computed
# sizes above 7 are expected to render the same as 7 - confirm the intended scale.
min_font_size = 4
max_font_size = 10


def _wrap_entry(segments, entry, font_size):
    """Wrap every occurrence of `entry` inside the text segments in a <font> tag.

    Args:
        segments: list of (is_markup, string) pairs that concatenate to the document.
        entry: the metadata entry text to highlight.
        font_size: value written to the tag's `size` attribute.

    Returns:
        A new list of (is_markup, string) pairs. Markup segments are passed through
        untouched, so an entry such as "font" or "color" can never match inside a tag
        inserted by an earlier call. Text inside an earlier highlight is still
        searched, so nested highlights are kept.
    """
    open_tag = '<font size="{}" color="#008080">'.format(font_size)
    wrapped = []
    for is_markup, chunk in segments:
        # Nothing to do for tags, for an empty entry, or for text without a match.
        if is_markup or not entry or entry not in chunk:
            wrapped.append((is_markup, chunk))
            continue
        # split() and replace() both scan left to right without overlapping matches.
        for index, piece in enumerate(chunk.split(entry)):
            if index:
                wrapped.extend([(True, open_tag), (False, entry), (True, "</font>")])
            if piece:
                wrapped.append((False, piece))
    return wrapped


for text in texts:
    # Start from the raw text as a single plain-text segment.
    segments = [(False, text[1])]
    for entry_id in text[2]:
        entry = metadata[entry_id]
        # Scale the font size linearly with the entry's probability.
        font_size = min_font_size + int((max_font_size - min_font_size) * entry_prob[entry_id])
        segments = _wrap_entry(segments, entry, font_size)
    # Escape only the text segments so characters such as "<" or "&" in the raw text
    # are shown literally instead of being parsed as markup.
    raw_text = "".join(
        chunk if is_markup else html.escape(chunk, quote=False)
        for is_markup, chunk in segments
    )
    display(HTML(raw_text))