In [101]:
import json
from pathlib import Path
from typing import Any

import networkx as nx
from node2vec import Node2Vec

from config import VCSLAM
from util.parse import generate_dictionaries, generate_id_dict, encode_triples

In [102]:
# Load the parsed VCSLAM models and flatten them into one list of triples.
vcslam = VCSLAM()
dataset = vcslam
# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's locale default (e.g. cp1252 on Windows). This also matches
# how get_mapped_attributes opens the mapping files.
with open(vcslam.PARSED_MODELS_PATH, 'r', encoding='utf-8') as file:
    vcslam_models = json.load(file)

# models = armstrader_models if dataset.identifier == "armstrader" else vcslam_models
models = vcslam_models
template = vcslam_models

# Every model is iterated as a sequence of triples; each triple is converted to
# a tuple so it is hashable. Duplicates across models are kept on purpose: the
# later counting cell relies on the multiplicity.
triples = []
for model in models:
    for triple in model:
        triples.append(tuple(triple))

# Project helpers (util.parse): presumably collect the distinct classes and
# predicates and assign integer ids to them -- verify against util.parse.
classes, predicates = generate_dictionaries(triples)
classes_mapping = generate_id_dict(classes)
predicates_mapping = generate_id_dict(predicates)

encoded_triples = encode_triples(triples, classes_mapping, predicates_mapping)

In [103]:
# Number of triples collected across all models (duplicates included).
len(triples)

2560

In [104]:
# Combine relations as described by Paulus et al.
# Three tallies are built in a single pass over all triples:
weights = {}   # (s, a, t) -> occurrences of the full triple
sums = {}      # (s, t)    -> triples linking s to t, whatever the relation
outgoing = {}  # s         -> triples that have s as their subject

for (s, a, t) in triples:
    weights[(s, a, t)] = weights.get((s, a, t), 0) + 1
    sums[(s, t)] = sums.get((s, t), 0) + 1
    outgoing[s] = outgoing.get(s, 0) + 1

# Pair counts in descending order, and the grand total (inspection helpers).
sorted_sums = sorted(sums.items(), key=lambda item: item[1], reverse=True)
total = sum(sums.values())

# Edge weight = share of the subject's outgoing triples that lead to this target.
# NOTE(review): the graph is undirected, so when both (s, t) and (t, s) occur
# the weight written last replaces the earlier one -- confirm this is intended.
graph = nx.Graph()
for (s, t), count in sums.items():
    weight = count / outgoing[s]
    graph.add_edge(str(s), str(t), weight=weight)

# Random-walk generation; num_walks=1000 is the commented-out alternative.
model_based_node2vec = Node2Vec(
    graph,
    dimensions=100,
    walk_length=30,
    num_walks=10,
    p=1,
    q=1,
    workers=4,
)

# Train the embedding on the generated walks (the log output shows gensim's
# Word2Vec doing the training).
n2v = model_based_node2vec.fit(
    window=10,
    sg=1,
    negative=5,
    ns_exponent=1.0,
)

Computing transition probabilities:   0%|          | 0/468 [00:00<?, ?it/s]

2022-10-28 17:48:33,106 [INFO] - collecting all words and their counts
2022-10-28 17:48:33,107 [INFO] - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-10-28 17:48:33,126 [INFO] - collected 468 word types from a corpus of 140400 raw words and 4680 sentences
2022-10-28 17:48:33,127 [INFO] - Creating a fresh vocabulary
2022-10-28 17:48:33,129 [INFO] - Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 468 unique words (100.0%% of original 468, drops 0)', 'datetime': '2022-10-28T17:48:33.129650', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'prepare_vocab'}
2022-10-28 17:48:33,129 [INFO] - Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 140400 word corpus (100.0%% of original 140400, drops 0)', 'datetime': '2022-10-28T17:48:33.129650', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'pl

In [105]:
# Collect the label-mapping files. A later cell pairs mapping_files[i] with
# models[i] purely by position, so the order has to be stable: glob() yields
# entries in arbitrary, file-system-dependent order, hence the explicit sort.
# NOTE(review): assumes the lexicographic path order matches the order of the
# models in the parsed-models JSON -- confirm against the dataset layout.
base_path = Path(vcslam.LABEL_MAPPINGS_PATH)
mapping_files = sorted(base_path.glob('*'))

In [106]:
def get_mapped_attributes(mappings_path) -> set[str]:
    """Read a label-mapping JSON file and return its concept URIs as a set.

    The file must contain a JSON list of objects that each carry a
    ``conceptResource`` string. In every URI the namespace
    ``http://www.plasma.uni-wuppertal.de/schema#`` is replaced by
    ``http://tmdtkg#``; duplicates collapse because a set is returned.

    :param mappings_path: path of the UTF-8 encoded mapping file.
    :return: set of rewritten concept URIs.
    """
    source_namespace = 'http://www.plasma.uni-wuppertal.de/schema#'
    target_namespace = 'http://tmdtkg#'

    with open(mappings_path, encoding='utf-8') as handle:
        entries = json.load(handle)

    return {
        entry['conceptResource'].replace(source_namespace, target_namespace)
        for entry in entries
    }

In [107]:
from networkx import minimum_spanning_tree
from networkx.algorithms.approximation import steiner_tree

def get_anchored_target_nodes(model: list, mapped_nodes: set, filter_target_nodes=True) -> dict[Any, list[Any]]:
    """Group the nodes of ``model`` that lie outside its Steiner tree by anchor.

    A directed multigraph is built from the ``(s, p, o)`` triples of ``model``.
    Every node that is not part of the tree returned by
    ``compute_steiner_tree(model, mapped_nodes)`` is a *target node*. An
    *anchor* is a node with a direct edge ``anchor -> target``.

    :param model: iterable of ``(subject, predicate, object)`` triples.
    :param mapped_nodes: node identifiers handed to ``compute_steiner_tree`` as
        terminals (the notebook passes a set of concept URIs; it is only
        forwarded here).
    :param filter_target_nodes: when true, nodes that are themselves target
        nodes are not considered as anchors.
    :return: dict mapping each anchor to a non-empty list of
        ``(anchor, relation, target)`` tuples, where ``relation`` is
        ``str(predicate)`` of the first edge inserted between the two nodes.
    """
    graph = nx.MultiDiGraph()
    for s, p, o in model:
        graph.add_edge(s, o, relation=str(p))

    minimal_tree = compute_steiner_tree(model, mapped_nodes)

    # Target nodes: everything in the model that the minimal tree does not cover.
    target_nodes = set(graph.nodes).difference(minimal_tree.nodes)

    anchored_nodes = {}
    for anchor in graph.nodes:
        if filter_target_nodes and anchor in target_nodes:
            continue

        # Walk the successors in edge-insertion order instead of iterating the
        # ``target_nodes`` set: set order of strings changes between
        # interpreter runs (hash randomisation), which made the order of the
        # result list -- and everything derived from its first entry --
        # irreproducible. The set is now only used for membership tests.
        # Key 0 is the first parallel edge between the pair; further parallel
        # relations are not reported (unchanged from the previous behaviour).
        found_nodes = [
            (anchor, graph.edges[anchor, target, 0]["relation"], target)
            for target in graph.successors(anchor)
            if target in target_nodes
        ]

        if found_nodes:
            anchored_nodes[anchor] = found_nodes

    return anchored_nodes

def compute_steiner_tree(model, mapped_nodes) -> nx.Graph:
    """Approximate the Steiner tree that connects ``mapped_nodes`` inside ``model``.

    The ``(s, p, o)`` triples are turned into an undirected, unweighted graph
    (predicates and self-loops are dropped). A minimum spanning tree is computed
    with Prim's algorithm, and networkx's Steiner-tree approximation is then run
    on that spanning tree with ``mapped_nodes`` as terminals.

    :param model: iterable of ``(subject, predicate, object)`` triples.
    :param mapped_nodes: iterable of terminal nodes.
    :return: the tree returned by ``steiner_tree``.
    """
    undirected = nx.Graph()
    # add_edges_from creates missing endpoints itself (subject first, then
    # object), so separate add_node calls are unnecessary.
    undirected.add_edges_from(
        (subject, obj) for subject, _predicate, obj in model if subject != obj
    )

    spanning_tree = minimum_spanning_tree(undirected, algorithm='prim')
    # visualize(spanning_tree)

    return steiner_tree(spanning_tree, list(mapped_nodes))

In [108]:
import numpy as np
from modelextension.statistics_recommender import StatisticsRecommender as SR

# Build one evaluation setup per (model, anchor) pair.
sr = SR(triples=encoded_triples)

setups = []

# The index is called `model_idx`, not `id`: loop variables of a notebook cell
# live in the kernel's global namespace, so `id` would shadow the builtin
# `id()` for every cell executed afterwards.
for model_idx, model in enumerate(models):
    # Mapping files and models are paired by position.
    mapped_attributes = get_mapped_attributes(mapping_files[model_idx])
    anchored_target_nodes = get_anchored_target_nodes(model=model, mapped_nodes=mapped_attributes)
    if not anchored_target_nodes:
        print(f"Skipped model {model_idx} as 0 target nodes are found")
        continue

    for anchor, target_nodes in anchored_target_nodes.items():
        # Top-5 nearest concepts of the anchor in the node2vec embedding.
        similar = n2v.wv.most_similar(positive=anchor, topn=5)

        # Every known concept for which the statistics recommender returns a
        # positive maximum score for a link from the anchor (what the scores
        # mean is defined by StatisticsRecommender.predict_link).
        anchor_id = classes_mapping[anchor]
        neighbors = [
            (concept, 1)
            for concept, concept_id in classes_mapping.items()
            if np.amax(sr.predict_link(anchor_id, concept_id)) > 0
        ]

        # The oracle always proposes the object of the first target triple.
        oracle = [(target_nodes[0][2], 1)]

        setups.append({
            "anchor": anchor,
            "target": target_nodes,
            "oracle": oracle,
            "neighbors": neighbors,
            "similar": similar,
        })


Skipped model 0 as 0 target nodes are found
Skipped model 45 as 0 target nodes are found
Skipped model 47 as 0 target nodes are found


In [109]:
#setups

In [110]:
# Persist all generated setups (duplicated anchors still included) as indented JSON.
with Path("modelextensions.json").open(mode="w") as file:
    json.dump(setups, file, indent=3)

In [111]:
# Remove duplicates: keep only the first setup recorded for each anchor.
# A dict preserves insertion order, and setdefault never replaces an existing
# entry, so the first occurrence wins and the original order is retained.
first_setup_by_anchor = {}
for setup in setups:
    first_setup_by_anchor.setdefault(setup['anchor'], setup)

anchors = list(first_setup_by_anchor)
setups2 = list(first_setup_by_anchor.values())

setups = setups2
len(setups)

128

In [112]:
# Persist the setups after reducing them to one entry per anchor.
with Path("modelextensions_single.json").open(mode="w") as file:
    json.dump(setups, file, indent=3)

In [113]:
from numpy import array
import numpy as np

def identify_similar_match(setup):
    """Return 1 if a target object occurs among the setup's similar concepts, else 0.

    ``setup["target"]`` holds ``(subject, predicate, object)`` triples and
    ``setup["similar"]`` holds ``(concept, score)`` pairs; a match means some
    triple's object equals some pair's concept.
    """
    matched = any(
        candidate == target_object
        for (_subject, _predicate, target_object) in setup["target"]
        for (candidate, _score) in setup["similar"]
    )
    return 1 if matched else 0

def identify_neighbor_match(setup):
    """Return 1 if a target object occurs among the setup's neighbor concepts, else 0.

    ``setup["target"]`` holds ``(subject, predicate, object)`` triples and
    ``setup["neighbors"]`` holds ``(concept, weight)`` pairs; only the concept
    of each pair is compared against the triples' objects.
    """
    for (_subject, _predicate, wanted) in setup["target"]:
        if any(concept == wanted for (concept, _weight) in setup["neighbors"]):
            return 1
    return 0

In [114]:
# One 0/1 flag per setup: is a target concept among the node2vec "similar" hits?
result = array([identify_similar_match(setup) for setup in setups])
# Show the flags as a single-row 2-D array.
result.reshape(1, -1)

array([[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
        1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]])

In [115]:
# Share of setups with a match: the flags are 0/1, so their sum is the hit count.
result.sum() / len(setups)

0.6796875

In [116]:
# Share of setups whose target concept is among the recommender's neighbours.
# The former mid-cell `result.reshape((1,-1))` was removed: only a cell's last
# expression is displayed, so its value was computed and thrown away.
result = array([identify_neighbor_match(setup) for setup in setups])

np.sum(result) / len(setups)

1.0

In [117]:
# Keep only the setups for which node2vec retrieved a target concept.
setups = list(filter(lambda candidate: identify_similar_match(candidate) == 1, setups))
len(setups)

87

In [118]:
# Persist the setups that passed the similar-match filter.
with Path("modelextensions_filtered.json").open(mode="w") as file:
    json.dump(setups, file, indent=3)