In [None]:
from tqdm import tqdm
tqdm.pandas()
from glob import glob

import json
import csv
import numpy as np
import pandas as pd
from collections import Counter

import rdflib
from rdflib import Graph
from data.data import CollectionAccessor, ImageHandler, EmbeddingSpaceAccessor

from search import Search, Randomiser, GraphSearcher, EmbeddingSearcher

def init_DMG():
    """Set up the image handler and load the Design Museum Gent collection.

    The dump files and their time stamp are whatever
    ``CollectionAccessor.get_latest_dump("./data/dumps")`` reports; the time
    stamp is reused for the collection id and creation time stamp.

    Returns:
        tuple: ``(image_handler, collection)`` where ``collection`` is the
        object returned by ``CollectionAccessor.get_DMG``.
    """
    handler = ImageHandler(image_folder="./data/images/DMG", keep_prefix=True)
    print("image paths loaded...")

    stamp, public_dump, private_dump = CollectionAccessor.get_latest_dump("./data/dumps")

    collection = CollectionAccessor.get_DMG(
        pub_path=public_dump,
        priv_path=private_dump,
        rights_path="./data/rights.csv",
        image_handler=handler,
        name="Design Museum Gent (public & private)",
        id_="DMG_" + stamp,
        creation_timestamp=stamp,
    )
    return handler, collection

im, dmg = init_DMG()

In [None]:
# Build one searcher per modality (knowledge graph, text embeddings, image
# embeddings) and hand all three to a single Search object.
# NOTE(review): a class named GraphSearcher is redefined further down in this
# notebook; once that cell has run, re-running this cell uses the notebook
# version instead of the one imported from `search`.
kg_searcher = GraphSearcher(dmg)
# loadXD=13 — presumably selects a 13-dimensional version of the stored
# embeddings; TODO confirm against EmbeddingSpaceAccessor.load
sem_embs = EmbeddingSpaceAccessor.load("data/generated_data/distiluse-base-multilingual-cased-v2",
                                       loadXD=13)
sem_searcher = EmbeddingSearcher(sem_embs, name="SemanticSearcher")
    
viz_embs = EmbeddingSpaceAccessor.load("data/generated_data/vitmae", loadXD=13)
viz_searcher = EmbeddingSearcher(viz_embs, name="VisualSearcher")

s = Search([kg_searcher, sem_searcher, viz_searcher])

In [None]:
# Select the searchers of `s` whose id is among the requested ids.
searcher_ids = [searcher.id for searcher in s.searchers]
# The loop variable is deliberately not called `s`: that name is the global
# Search object. A membership test (instead of a nested loop over the ids)
# keeps each searcher at most once even if an id is listed twice.
cur_searchers = [searcher for searcher in s.searchers if searcher.id in searcher_ids]

cur_searchers

In [None]:
# Draw a random 4-record query and score the collection with every active searcher.
recs = dmg.sample(4)
cur_searchers = s.searchers

# Each searcher is called with the query records and returns a named Series;
# the Series name becomes the column name of the score table.
# (The previous version iterated over an empty list, so no searcher ever ran.)
per_searcher_scores = [searcher(recs) for searcher in cur_searchers]
searcher_scores = pd.DataFrame({scores.name: scores for scores in per_searcher_scores})
# Zero the rows of the query records themselves — presumably so that they do
# not show up among their own results; confirm against Search.
searcher_scores.loc[recs.index] = 0.


In [None]:
def unit_norm(scores):
    """Scale ``scores`` so that its entries add up to 1.

    Args:
        scores: anything supporting ``.sum()`` and division by a scalar
            (e.g. a pandas Series or a numpy array).

    Returns:
        ``scores`` divided by its own sum.
    """
    total = scores.sum()
    return scores / total

# Normalise every searcher column so that it sums to 1.
normed_scores = (searcher_scores/searcher_scores.sum(0))

# 1/len(searcher_scores)

# Compare the two orders of operation on one axes: normalise-then-average
# versus average-then-normalise. Labels and a legend are needed because two
# unlabelled step histograms cannot be told apart.
ax = normed_scores.mean(axis=1).hist(histtype="step", label="mean of unit-normed scores")
unit_norm(searcher_scores.mean(axis=1)).hist(ax=ax, histtype="step", label="unit-normed mean score")
ax.legend();

In [None]:
# Run 50 random 6-record queries against `kg`, pool all returned score values
# and count how often each value occurs (sorted by value).
# NOTE(review): `kg` is only assigned in a later cell (kg = GraphSearcher(dmg, cats=cs)),
# so this cell fails under Restart & Run All unless it is moved below that cell.
pd.Series([v for _ in tqdm(range(50)) for v in kg(dmg.sample(6))]).value_counts().sort_index()

---

In [None]:
import networkx as nx
class GraphSearcher():#Searcher):
    """Score collection objects by their graph distance to a set of query records.

    The graph contains one node per object (row label of the collection) and
    one node per categorical value. Every object is linked to each of its
    values, and values that occur together on the same object are linked to
    each other. Calling the instance with query records returns, for every
    object, the mean shortest-path length to those records.
    """

    # Distance assigned to an object that cannot be reached from a query record.
    UNREACHABLE_DIST = 100

    @staticmethod
    def iter_values(r):
        """Yield the non-empty values of a row, flattening list cells.

        Falsy entries (``None``, ``""``, ``0``, ...) and NaN floats are
        skipped, both for scalar cells and for the members of list cells, so
        that placeholder values never become graph nodes.

        Args:
            r: iterable of cell values (e.g. one row of a DataFrame).

        Yields:
            Each usable value, in row order.
        """
        for cell in r:
            members = cell if isinstance(cell, list) else (cell,)
            for v in members:
                # NaN is truthy, so it needs an explicit test (NaN != NaN).
                if isinstance(v, float) and v != v:
                    continue
                if v:
                    yield v

    def _build(self, collection):
        """Build the undirected object/value graph for ``collection``.

        Args:
            collection: DataFrame-like object that can be indexed with
                ``self.cats`` and iterated with ``iterrows()``.

        Returns:
            networkx.Graph: object-value and value-value links; every row
            label of ``collection`` is present as a node, even when the row
            has no usable value.
        """
        from itertools import combinations

        cat_obj_links = []
        cat_cat_links = set()
        for obj_label, row in collection[self.cats].iterrows():
            # dict.fromkeys removes repeated values while keeping their order.
            values = list(dict.fromkeys(GraphSearcher.iter_values(row)))
            cat_obj_links.extend((obj_label, v) for v in values)
            # The graph is undirected, so one edge per unordered pair is
            # enough; no sorting is required, which also avoids comparing
            # values of different types.
            cat_cat_links.update(combinations(values, 2))

        graph = nx.from_edgelist(cat_obj_links + list(cat_cat_links))
        # Objects without any value would otherwise be missing from the graph
        # and make shortest_path_length raise when used as a query.
        graph.add_nodes_from(collection.index)
        return graph

    def __init__(self, coll, cats=None, name="KGSearcher"):
        """Build the graph for a collection.

        Args:
            coll: the collection; its index supplies the object nodes and,
                when ``cats`` is None, ``coll.coll.categorical_cols`` supplies
                the columns.
            cats: optional iterable of column names to build the graph from.
            name: display name of the searcher.
        """
        # super().__init__(name)
        self.name = name
        self.id = "KG"

        self.obj_nodes = set(coll.index)
        # A concrete list is the unambiguous way to select several columns
        # (a tuple or a dict view is not).
        self.cats = list(cats if (cats is not None) else coll.coll.categorical_cols.keys())
        self.G = self._build(coll)

    def __call__(self, records):
        """Return the mean graph distance from every object to ``records``.

        Args:
            records: query records; every label in ``records.index`` must be
                an object of the collection the graph was built from.

        Returns:
            pandas.Series: one raw distance score per object, named
            ``self.id``. Objects unreachable from a query record count as
            ``UNREACHABLE_DIST`` for that record.

        Raises:
            AssertionError: if a query record is not part of the collection.
        """
        assert all((obj_num in self.obj_nodes) for obj_num in records.index), \
            "every query record must belong to the collection the graph was built from"

        # With only a source given, networkx returns a {target: length} dict.
        dists = [nx.shortest_path_length(self.G, source=objnum, target=None) for objnum in records.index]

        # Fix one explicit order for values and index instead of handing
        # pandas an unordered set.
        nodes = list(self.obj_nodes)
        raw_scores = pd.Series(
            [np.mean([d.get(obj_num, self.UNREACHABLE_DIST) for d in dists]) for obj_num in nodes],
            index=nodes, name=self.id)
        return raw_scores #self.dist2sim(raw_scores)

    @staticmethod
    def unit_norm(s):
        """Min-max scale ``s`` to the range [0, 1].

        A constant (or empty / all-NaN) input has no spread to scale by; it is
        mapped to all zeros instead of dividing by zero.
        """
        span = s.max() - s.min()
        if not span > 0:
            return pd.Series(0.0, index=s.index, name=s.name)
        unit_normed = (s - s.min())/span
        return unit_normed #/unit_normed.sum()

    @staticmethod
    def dist2sim(d):
        """Turn distances into similarities in [0, 1] (largest distance -> 0)."""
        return GraphSearcher.unit_norm(d.max() - d)


In [None]:
# Build the graph from a hand-picked subset of columns only, then look at how
# the scores for one random 6-record query are distributed over the collection.
cs = ['objectname_URI', 'material_URI', 'part_label', 'part_material_URI', 'maker_URI', 'coiner_URI']
kg = GraphSearcher(dmg, cats=cs)
kg(dmg.sample(6)).value_counts()

In [None]:
from itertools import combinations
# Number of 7-element subsets of 10 items, i.e. C(10, 7) = 120.
len(list(combinations(range(10), 7)))

In [None]:
import math
from itertools import combinations

# Brute-force search over all 7-column subsets of the categorical columns:
# build a graph per subset and report the subsets whose scores for a random
# 6-record query are spread over several distinct values.
cat_cols = list(dmg.coll.categorical_cols.keys())
combs = combinations(cat_cols, 7)
# combinations() is a generator without a length, so tqdm needs the total to
# show real progress. Only matching subsets are printed; printing every
# subset floods the output and breaks up the progress bar.
for cur_cs in tqdm(combs, total=math.comb(len(cat_cols), 7)):
    kg = GraphSearcher(dmg, cats=list(cur_cs))

    vals = kg(dmg.sample(6)).value_counts()
    # value_counts() orders by frequency, so iloc[0] is the size of the most
    # common score: require more than 4 distinct scores and a largest group
    # below 12000 objects.
    if len(vals) > 4 and vals.iloc[0] < 12000:
        print(cur_cs)
        print(vals, "\n\n\n")

In [None]:
# Rebuild the object -> value edge list outside the class, with a progress bar.
pbar = tqdm(
    dmg[dmg.coll.categorical_cols.keys()].iterrows(),
    total=len(dmg),
    desc='[GraphSearcher]: building graph...',
)
# iterrows() yields (row label, row); the label is the object the values hang off.
cat_obj_links = [
    (obj_label, value)
    for obj_label, row in pbar
    for value in GraphSearcher.iter_values(row)
    if value
]

In [None]:
# len(cat_obj_links), cat_obj_links[:10]

len(kg.G.nodes)

In [None]:
# For every categorical column, print its three most frequent non-empty values.
for c in dmg.coll.categorical_cols.keys():
    print(c)
    flattened = []
    for cell in dmg[c]:
        # Treat a scalar cell as a one-element list so both shapes are handled alike.
        members = cell if isinstance(cell, list) else [cell]
        flattened.extend(member for member in members if member)
    top_three = pd.Series(flattened).value_counts().iloc[:3]
    print(top_three)
    print("\n\n")

---
# EMBEDDINGS

In [None]:
# from data import EmbeddingSpaceAccessor
# import umap.umap_ as umap

# sem_emb_dir = "./distiluse-base-multilingual-cased-v2_embeddings/"
# sem_emb = EmbeddingSpaceAccessor.load(sem_emb_dir, loadXD=None, index_col="record_id", )

# n_c = 32
# reducer = umap.UMAP(metric="cosine", n_neighbors=10, min_dist=(sem_emb.values.var()**0.5/2), n_components=n_c)
# red_embs = pd.DataFrame(reducer.fit_transform(sem_emb.to_numpy()), index=sem_emb.index)
# red_embs.to_csv(f"{sem_emb_dir}/embs_umap_{n_c}D.csv")

In [None]:
vis_embs = EmbeddingSpaceAccessor.load("./data/generated_data/vitmae")

In [None]:
vis_embs

In [None]:
# Run the embedding-space accessor's `umap` method on the visual embeddings,
# passing a target path via `save_to`.
# presumably a 32-dimensional reduction, going by the file name — TODO confirm
red_embs = vis_embs.emb_space.umap(save_to="./data/generated_data/vitmae/embeddings_32.csv")

In [None]:
from search import EmbeddingSearcher


vs = EmbeddingSearcher(red_embs, name="VisualSearcher")

In [None]:
vs(dmg.sample(4)).hist()

In [None]:
# import umap.umap_ as umap

# k = -1
# data = vis_embs.iloc[:k].to_numpy()
# default_params = dict(metric="cosine", n_neighbors=10, 
#                     min_dist=(data.var()**0.5/5), spread=40, n_components=2)

# print(default_params["min_dist"])
# # default_params.update(umap_params)
# reducer = umap.UMAP(**default_params)
# red_embs = pd.DataFrame(reducer.fit_transform(data), index=vis_embs.index[:k])
# # if save_to is not None:
# #     red_embs.to_csv(save_to, index=True, sep=("\t" if to_tsv else ","))


In [None]:
import matplotlib.pyplot as plt

# Scatter the first two columns of the reduced embeddings against each other.
coords = red_embs.values
plt.plot(coords[:, 0], coords[:, 1], ".")

### semantic embeddings

In [None]:
sem_embs = EmbeddingSpaceAccessor.load("./data/generated_data/distiluse-base-multilingual-cased-v2/")

In [None]:
# Same reduction as for the visual embeddings, now on the semantic (text) embeddings.
# NOTE(review): this rebinds `red_embs`, which earlier cells filled with the
# reduced *visual* embeddings (used for `vs` and the scatter plot); re-running
# those cells afterwards silently uses the semantic ones.
red_embs = sem_embs.emb_space.umap(save_to="./data/generated_data/distiluse-base-multilingual-cased-v2/embeddings_32.csv")

---
# concept search

In [None]:
from search import TextEmbeddingSearcher

In [None]:
sem_embs = EmbeddingSpaceAccessor.load("data/generated_data/distiluse-base-multilingual-cased-v2/",
                                       loadXD=None)

In [None]:
ts = TextEmbeddingSearcher(sem_embs)

In [None]:
dmg.loc[ts("stoel met rugleuning").sort_values()[::-1].index[:20]].coll.get_texts()

In [None]:
import torch
# Embed a free-text query with the searcher's own text encoder.
vec = ts.embedder.encode("serendipity")
# Convert the embedding to a float64 tensor before ranking.
vec = torch.as_tensor(vec).double()
# Rank the query vector against the embedding space; the result is labelled
# with the space's index. (Presumably similarity scores — confirm in rank_vector.)
sims = pd.Series(ts.rank_vector(vec), index=ts.space.index, name=ts.id)
# Show the 100 highest values, largest first.
sims.sort_values()[::-1][:100]

In [None]:
dmg.loc[sims.sort_values()[::-1].index[:100]].coll.get_texts()

In [None]:
import matplotlib.pyplot as plt

# Plot each record's score (x) against len() of its entry in get_texts() (y),
# both taken in descending score order.
ranked_sims = sims.sort_values()[::-1]
text_lengths = dmg.coll.get_texts().apply(len)
plt.plot(
    list(ranked_sims),
    text_lengths.loc[ranked_sims.index],
    "."
)

---

# MKG KG search

In [1]:
from data.data import ImageHandler, CollectionAccessor
from search import GraphSearcher

def init_MKG(mkg_dir="./data/MKG", time_stamp="2025-06-05"):
    """Load the Museum Kunst & Gewerbe collection.

    Args:
        mkg_dir: base directory of the MKG data; images are read from
            ``<mkg_dir>/images`` and the metadata from
            ``<mkg_dir>/dumps/extraction_v0_1.csv``.
        time_stamp: time stamp used for the collection id and as its creation
            time stamp.

    Returns:
        The collection object returned by ``CollectionAccessor.get_MKG``.
    """
    # Normalise the base once so that joined paths never contain "//".
    base = mkg_dir.rstrip("/")
    image_handler = ImageHandler(image_folder=f"{base}/images", keep_prefix=False)

    # time_stamp, pub_file, priv_file = CollectionAccessor.get_latest_dump("./data/dumps")

    mkg_meta = dict(name="Museum Kunst & Gewerbe", id_="MKG_"+time_stamp,
                    creation_timestamp=time_stamp, language="de")
    mkg = CollectionAccessor.get_MKG(metadata_path=f"{base}/dumps/extraction_v0_1.csv",
                                     image_handler=image_handler,
                                     **mkg_meta)

    return mkg


mkg = init_MKG()

  from .autonotebook import tqdm as notebook_tqdm




100%|███████████████████████████████████████████████████████████████████████| 26299/26299 [00:25<00:00, 1017.98it/s]


In [2]:
kg_searcher = GraphSearcher(mkg)

[GraphSearcher]: building graph...: 100%|███████████████████████████████████| 26299/26299 [00:03<00:00, 8735.08it/s]
[GraphSearcher]: building graph...: 100%|███████████████████████████████████| 26299/26299 [00:04<00:00, 5267.73it/s]


In [3]:
kg_searcher(mkg.sample(4)).value_counts()

KGSearcher0
0.428571    11046
0.714286     7697
0.571429     5905
0.285714     1300
0.142857      341
0.857143        5
0.000000        4
1.000000        1
Name: count, dtype: int64

In [5]:
mkg.sample(100).coll.get_presentation_records()

[{'inventory_number': 'P1991.5012',
  'title': 'Ludwig Mies van der Rohe',
  'description': '',
  'designer': 'Hugo Erfurth',
  'producer': 'Hugo Erfurth',
  'design_date': '1934',
  'production_date': '1934',
  'design_place': 'Dresden',
  'production_place': 'Dresden',
  'rights_attribution': 'In Copyright',
  'image_path': ''},
 {'inventory_number': 'P1991.5088',
  'title': '"Syndikus H. A. Roeloffs" aus der Mappe "Hamburgische Männer und Frauen am Anfang des XX. Jahrhunder …',
  'description': 'Teil der Mappe "Hamburgische Männer und Frauen am Anfang des XX. Jahrhunderts - Kamera Bildnisse - Aufgenommen, in Kupfer geätzt und gedruckt von Rudolph Dührkoop Hamburg 1905".',
  'designer': 'Rudolph Dührkoop',
  'producer': 'Rudolph Dührkoop',
  'design_date': '1905',
  'production_date': '1905',
  'design_place': 'Hamburg',
  'production_place': 'Hamburg',
  'rights_attribution': 'In Copyright',
  'image_path': ''},
 {'inventory_number': 'P2012.275',
  'title': '',
  'description': '',
