In [1]:
from tqdm import tqdm
tqdm.pandas()
from glob import glob

import json
import csv
import numpy as np
import pandas as pd
from collections import Counter

import rdflib
from rdflib import Graph
from data.data import CollectionAccessor, ImageHandler, EmbeddingSpaceAccessor

from search import Search, Randomiser#, GraphSearcher

def init_DMG(image_folder="./data/images/DMG", dump_dir="./data/dumps",
             rights_path="./data/rights.csv"):
    """Load the Design Museum Gent (DMG) image handler and collection table.

    The three locations used to be hard-coded; they are now keyword arguments
    whose defaults are the previous literals, so ``init_DMG()`` behaves as before.

    Parameters
    ----------
    image_folder : str
        Passed to ``ImageHandler`` as ``image_folder``.
    dump_dir : str
        Passed to ``CollectionAccessor.get_latest_dump``.
    rights_path : str
        Passed to ``CollectionAccessor.get_DMG`` as ``rights_path``.

    Returns
    -------
    tuple
        ``(image_handler, df)`` -- the ``ImageHandler`` instance and the object
        returned by ``CollectionAccessor.get_DMG``.
    """
    image_handler = ImageHandler(image_folder=image_folder, keep_prefix=True)
    print("image paths loaded...")

    # get_latest_dump yields the dump's time stamp plus the public and private dump files
    time_stamp, pub_file, priv_file = CollectionAccessor.get_latest_dump(dump_dir)

    # Metadata forwarded verbatim to get_DMG as keyword arguments
    dmg_meta = dict(name="Design Museum Gent (public & private)", id_="DMG_"+time_stamp,
                    creation_timestamp=time_stamp)
    df = CollectionAccessor.get_DMG(pub_path=pub_file,
                                    priv_path=priv_file,
                                    rights_path=rights_path,
                                    image_handler=image_handler,
                                    **dmg_meta)
    return image_handler, df

# Build the image handler (`im`) and the DMG collection table (`dmg`) used by the cells below.
im, dmg = init_DMG()

  from .autonotebook import tqdm as notebook_tqdm


image paths loaded...


100%|████████████████████████████████████████████████████████████████████████| 24824/24824 [00:27<00:00, 899.16it/s]


In [None]:
# NOTE(review): GraphSearcher is defined in a later cell of this notebook (its import from
# `search` is commented out in the first cell), so on a fresh kernel that cell must run first.
kg = GraphSearcher(dmg)

In [None]:
# Pool the scores of 50 random 6-object queries and tabulate how often each score occurs.
pooled_scores = []
for _trial in tqdm(range(50)):
    # iterating the Series returned by kg(...) yields its values
    pooled_scores.extend(kg(dmg.sample(6)))
pd.Series(pooled_scores).value_counts().sort_index()

---

In [None]:
import networkx as nx
class GraphSearcher():#Searcher):
    """Scores collection objects by graph distance to a set of query objects.

    The graph has one node per object (row index label) and one node per
    category value found in the selected columns. An object is linked to each
    of its values, and values that co-occur in the same row are linked to each
    other.
    """

    @staticmethod
    def _is_present(v):
        """Return True when ``v`` is a usable value: not missing (None/NaN/NA) and truthy."""
        # NaN is truthy and pd.NA has no truth value, so test for missing scalars first
        if pd.api.types.is_scalar(v) and pd.isna(v):
            return False
        return bool(v)

    @staticmethod
    def iter_values(r):
        """Yield the usable values of a row, flattening list cells.

        Missing and falsy entries are skipped both at top level and inside
        list cells, so every link type is built from the same set of values.
        """
        for v in r:
            if isinstance(v, list):
                yield from (x for x in v if GraphSearcher._is_present(x))
            elif GraphSearcher._is_present(v):
                yield v

    def _build(self, collection):
        """Build the undirected object/category graph for ``collection``.

        Every index label is added as a node up front, so objects without any
        category value still exist in the graph (as isolated nodes) and can be
        used as a shortest-path source in ``__call__``.
        """
        from itertools import combinations

        G = nx.Graph()
        G.add_nodes_from(collection.index)
        for obj_id, row in collection[self.cats].iterrows():
            values = list(GraphSearcher.iter_values(row))
            # object <-> category links
            G.add_edges_from((obj_id, v) for v in values)
            # category <-> category links for values sharing a row; the graph is
            # undirected and de-duplicates edges, so no sorting/set is required
            G.add_edges_from((v1, v2) for v1, v2 in combinations(values, 2) if not v1 == v2)
        return G

    def __init__(self, coll, cats=None, name="KGSearcher", unreachable_dist=100):
        """Create the searcher and build its graph.

        Parameters
        ----------
        coll : DataFrame-like
            Collection whose index labels identify the objects.
        cats : str or iterable of str, optional
            Columns to build the graph from. Defaults to the keys of
            ``coll.coll.categorical_cols``.
        name : str
            Currently unused (the base-class initialiser is disabled).
        unreachable_dist : number
            Distance assumed between two nodes with no connecting path.
        """
        # super().__init__(name)

        self.obj_nodes = set(coll.index)
        if cats is None:
            cats = coll.coll.categorical_cols.keys()
        # normalise to a list so column selection always returns a frame
        self.cats = [cats] if isinstance(cats, str) else list(cats)
        self.unreachable_dist = unreachable_dist
        self.G = self._build(coll)
        self.id = "KG"

    def __call__(self, records):
        """Return, for every object, its mean graph distance to the query ``records``.

        Parameters
        ----------
        records : DataFrame-like
            Query objects; every index label must belong to the collection.

        Returns
        -------
        pandas.Series
            Indexed by object id and named ``self.id``; lower means closer.
            Objects unreachable from a query object count as
            ``self.unreachable_dist`` for that query object.

        Raises
        ------
        AssertionError
            If a query label is not part of the collection.
        """
        assert all((obj_num in self.obj_nodes) for obj_num in records.index), \
            "all query records must belong to the collection the graph was built from"

        # one {node: distance} dict per query object, covering its reachable nodes
        dists = [nx.shortest_path_length(self.G, source=objnum, target=None) for objnum in records.index]

        # fix one iteration order so values and index labels line up
        nodes = list(self.obj_nodes)
        far = self.unreachable_dist
        raw_scores = pd.Series([np.mean([d.get(obj_num, far) for d in dists])
                                for obj_num in nodes],
                               index=nodes, name=self.id)
        return raw_scores #self.dist2sim(raw_scores)

    @staticmethod
    def unit_norm(s):
        """Min-max scale ``s`` to [0, 1]; a series with no spread maps to zeros, not NaN."""
        lo = s.min()
        span = s.max() - lo
        if not span > 0:
            # constant (or empty) input: avoid dividing 0 by 0
            return (s - lo).astype(float)
        return (s - lo) / span

    @staticmethod
    def dist2sim(d):
        """Turn distances into similarities in [0, 1]; the smallest distance maps to 1."""
        return GraphSearcher.unit_norm(d.max() - d)


In [None]:
# Rebuild the graph from a hand-picked subset of columns (passed as `cats`), then look at
# the distribution of scores for one random 6-object query.
cs = ['objectname_URI', 'material_URI', 'part_label', 'part_material_URI', 'maker_URI', 'coiner_URI']
kg = GraphSearcher(dmg, cats=cs)
kg(dmg.sample(6)).value_counts()

In [None]:
from itertools import combinations
# Count the 7-element subsets of a 10-element pool, one tally per generated subset.
sum(1 for _ in combinations(range(10), 7))

In [None]:
from itertools import combinations
from math import comb

# Sweep every N_COLS-column subset of the categorical columns and report the subsets whose
# scores for one random 6-object query are reasonably spread out.
N_COLS = 7
MIN_DISTINCT_SCORES = 4   # need more than this many distinct score values
MAX_TOP_COUNT = 12000     # the most frequent score must occur fewer times than this

cat_cols = list(dmg.coll.categorical_cols.keys())
combs = combinations(cat_cols, N_COLS)
# combinations() has no len(), so pass the count explicitly to get a proper progress bar
for cur_cs in tqdm(combs, total=comb(len(cat_cols), N_COLS)):
    kg = GraphSearcher(dmg, cats=list(cur_cs))

    # value_counts() sorts by frequency, so iloc[0] is the count of the most common score
    vals = kg(dmg.sample(6)).value_counts()
    if len(vals) > MIN_DISTINCT_SCORES and vals.iloc[0] < MAX_TOP_COUNT:
        # only matching subsets are printed; the bar already shows overall progress
        print(cur_cs)
        print(vals, "\n\n\n")

In [None]:
# Stand-alone version of the object -> category edge list (with a progress bar), for inspection:
# one (row label, value) pair per truthy value found in the categorical columns.
pbar = tqdm(dmg[dmg.coll.categorical_cols.keys()].iterrows(), 
                    total=len(dmg), desc='[GraphSearcher]: building graph...')
cat_obj_links = [(r.name, v) for i, r in pbar for v in GraphSearcher.iter_values(r) if v]

In [None]:
# len(cat_obj_links), cat_obj_links[:10]

# Number of nodes in the graph held by the current `kg`.
kg.G.number_of_nodes()

In [None]:
# For every categorical column, print its three most frequent truthy values.
for c in dmg.coll.categorical_cols.keys():
    print(c)
    flat_values = []
    for row in dmg[c]:
        # list cells contribute each element, scalar cells contribute themselves
        candidates = row if isinstance(row, list) else [row]
        flat_values.extend(v for v in candidates if v)
    top_three = pd.Series(flat_values).value_counts().head(3)
    print(top_three)
    print("\n\n")

---
# EMBEDDINGS

In [None]:
# from data import EmbeddingSpaceAccessor
# import umap.umap_ as umap

# sem_emb_dir = "./distiluse-base-multilingual-cased-v2_embeddings/"
# sem_emb = EmbeddingSpaceAccessor.load(sem_emb_dir, loadXD=None, index_col="record_id", )

# n_c = 32
# reducer = umap.UMAP(metric="cosine", n_neighbors=10, min_dist=(sem_emb.values.var()**0.5/2), n_components=n_c)
# red_embs = pd.DataFrame(reducer.fit_transform(sem_emb.to_numpy()), index=sem_emb.index)
# red_embs.to_csv(f"{sem_emb_dir}/embs_umap_{n_c}D.csv")

In [2]:
# Load the precomputed embeddings stored under ./data/generated_data/vitmae
# (presumably ViT-MAE image embeddings, going by the folder and variable names -- TODO confirm).
vis_embs = EmbeddingSpaceAccessor.load("./data/generated_data/vitmae")

In [3]:
# Inspect the loaded table: rows are indexed by object_number, columns 0..767 are the embedding dimensions.
vis_embs

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
object_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_1-2,-6.747492,1.377744,8.752237,0.390078,-12.489491,-47.672676,-0.103505,6.874737,3.854417,2.496699,...,5.171770,-13.857553,-5.093154,-8.767626,-1.434795,-4.758001,-6.276115,-5.456454,-21.845024,8.245191
0001_2-2,-8.361392,1.202482,8.579128,0.738576,-10.780611,-47.848980,0.191315,6.532754,4.073065,1.949772,...,4.633105,-13.773504,-4.753304,-8.800258,-1.358659,-4.665448,-6.971670,-5.328973,-21.921762,8.041379
0003_0-3,-6.586731,1.254864,8.245485,1.658292,-10.260692,-48.248943,0.522000,6.444243,4.522527,1.459047,...,4.521293,-13.874323,-4.620827,-9.676676,-1.095634,-4.261670,-6.832951,-4.496130,-21.695803,8.069549
0003_1-3,-9.854054,0.724908,8.491260,0.559022,-11.103300,-46.235660,-0.528708,6.676129,3.623898,2.027026,...,4.946862,-13.693720,-4.881076,-8.982921,-1.188051,-4.820155,-6.424192,-5.367999,-22.013561,7.816948
0003_2-3,-6.259486,1.243787,7.897137,0.461582,-12.441860,-46.636032,0.209751,6.941004,4.162588,2.042936,...,4.826023,-14.274298,-4.442492,-8.009071,-0.611647,-4.668001,-7.146232,-5.617901,-21.720968,8.278456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUP-0135,-8.434200,1.587039,8.227858,1.227267,-8.777843,-46.471160,0.007980,6.341303,4.363594,2.004922,...,4.443928,-14.217077,-4.804143,-9.130194,-1.303661,-4.689074,-6.828470,-5.093659,-22.146564,8.437577
TUP-0137,-5.743187,1.243439,8.378471,0.887250,-11.972797,-46.671830,0.072308,6.348228,3.885282,2.145570,...,4.752369,-14.095217,-4.970286,-9.150188,-1.391118,-4.735775,-6.735520,-5.337151,-21.865107,7.696784
TUP-0139,-7.219774,1.492641,7.669778,0.585324,-11.852599,-47.478390,0.041774,6.860432,3.931410,2.246048,...,5.010588,-14.673128,-4.918185,-8.443419,-1.009804,-4.991119,-6.814557,-5.532884,-21.773533,8.391875
TUP-0140,-8.060513,1.025537,7.852996,1.552933,-10.549572,-48.359460,0.775677,6.596469,4.341614,1.371672,...,4.668210,-13.828725,-4.761271,-8.714023,-0.818096,-4.338132,-7.065471,-4.785571,-21.894499,8.049953


In [4]:
# NOTE(review): as recorded in this cell's output, the call below currently raises
# "TypeError: '<' not supported between instances of 'dict' and 'int'", so `red_embs`
# is never assigned on a straight run-through. The cause lies in the `emb_space.umap`
# implementation, which is not part of this notebook -- TODO fix there.
red_embs = vis_embs.emb_space.umap()

TypeError: '<' not supported between instances of 'dict' and 'int'

In [None]:
from search import EmbeddingSearcher


# Searcher over the embeddings loaded into `vis_embs` above.
vs = EmbeddingSearcher(vis_embs, name="VisualSearcher")