In [1]:
import pickle
import sys
import time
from collections import Counter

import pandas as pd
import rdflib
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Make the parent directory importable before pulling in the local modules.
sys.path.append('..')
from tree_builder import KGPTree
from datastructures import *

In [2]:
rdf_file = 'data/AIFB/aifb.n3'
_format = 'n3'
train_file = 'data/AIFB/AIFB_test.tsv'
test_file = 'data/AIFB/AIFB_train.tsv'
entity_col = 'person'
label_col = 'label_affiliation'
label_predicates = [
    rdflib.URIRef('http://swrc.ontoware.org/ontology#affiliation'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#employs'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#carriedOutBy')
]
output = 'output/aifb_depth10.p'

In [3]:
print(end='Loading data... ', flush=True)
g = rdflib.Graph()
g.parse(rdf_file, format=_format)
print('OK')

test_data = pd.read_csv(train_file, sep='\t')
train_data = pd.read_csv(test_file, sep='\t')

train_entities = [rdflib.URIRef(x) for x in train_data[entity_col]]
train_labels = train_data[label_col]

test_entities = [rdflib.URIRef(x) for x in test_data[entity_col]]
test_labels = test_data[label_col]

kg = KnowledgeGraph.rdflib_to_kg(g, label_predicates=label_predicates)

clf = KGPTree(kg, path_max_depth=6, neighborhood_depth=8, min_samples_leaf=1, max_tree_depth=5)

Loading data... OK


In [4]:
# Train the KGPTree classifier on the training entities and their labels.
clf.fit(train_entities, train_labels)

  0%|          | 0/140 [00:00<?, ?it/s]

Extracting neighborhoods...


100%|██████████| 140/140 [00:36<00:00,  4.34it/s]
100%|██████████| 7763/7763 [00:06<00:00, 1177.37it/s]
100%|██████████| 7763/7763 [00:07<00:00, 1043.13it/s]
100%|██████████| 7763/7763 [00:08<00:00, 911.92it/s] 
100%|██████████| 7312/7312 [00:01<00:00, 3869.32it/s]
100%|██████████| 7312/7312 [00:03<00:00, 1905.26it/s]
100%|██████████| 7312/7312 [00:02<00:00, 2517.75it/s]
100%|██████████| 6232/6232 [00:00<00:00, 21595.28it/s]
100%|██████████| 6232/6232 [00:00<00:00, 12774.74it/s]
100%|██████████| 6232/6232 [00:00<00:00, 8519.34it/s]
100%|██████████| 5966/5966 [00:00<00:00, 32654.68it/s]
100%|██████████| 5966/5966 [00:00<00:00, 17528.05it/s]
100%|██████████| 5966/5966 [00:01<00:00, 3478.01it/s] 
100%|██████████| 3728/3728 [00:01<00:00, 2183.05it/s]
100%|██████████| 3728/3728 [00:02<00:00, 1736.15it/s]
100%|██████████| 3728/3728 [00:02<00:00, 1653.86it/s]
100%|██████████| 3354/3354 [00:00<00:00, 9864.46it/s]
100%|██████████| 3354/3354 [00:00<00:00, 7844.93it/s]
100%|██████████| 3354/3354 

In [5]:
# Predict a label for every test entity, then report the plain accuracy
# against the ground-truth test labels.
preds = clf.predict(test_entities)
test_accuracy = accuracy_score(test_labels, preds)
print(test_accuracy)

0.8611111111111112


In [15]:
# List every subject that has project id68 as the object of some triple, and
# show whether that subject is one of the training entities.
project_uri = rdflib.URIRef('http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id68instance')
for s, p, o in g.triples((None, None, project_uri)):
    is_train_entity = s in train_entities
    print(s, is_train_entity)

http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance False
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id41instance False
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2085instance False
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id79instance False
http://www.aifb.uni-karlsruhe.de/Forschungsgebiete/viewForschungsgebietOWL/id79instance False
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id57instance False
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2084instance True
http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance True


In [16]:
# Print every triple in the graph whose object is person id20.
person_uri = rdflib.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance')
for triple in g.triples((None, None, person_uri)):
    s, p, o = triple
    print(s, p, o)

http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1244instance http://swrc.ontoware.org/ontology#author http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id217instance http://swrc.ontoware.org/ontology#author http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1151instance http://swrc.ontoware.org/ontology#author http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id436instance http://swrc.ontoware.org/ontology#author http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance
http://www.aifb.uni-karlsruhe.de/Publikationen/viewPublikationOWL/id1112instance http://swrc.ontoware.org/ontology#author http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id20instance
http://www.aifb.uni-karlsruhe.de/Publikationen/

In [14]:
# Peek at the first few training entities instead of dumping the whole list
# into the notebook output.
train_entities[:10]

[rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1909instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2040instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id46instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id3instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1842instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1915instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1992instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1966instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2039instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id2065instance'),
 rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/P

In [12]:
# Probe a single candidate walk: root -> wildcard hop -> project id68.
# For every training entity, extract its neighborhood and check whether the
# walk can be found in it; then compare the label distribution of the
# entities where it was found against those where it was not.
# NOTE(review): the wildcard hop presumably matches any intermediate
# vertex/predicate -- confirm against Hop/Walk in datastructures.
walk = Walk()
walk.append(Hop('root', root=True))
walk.append(Hop('*', wildcard=True))
walk.append(Hop(Vertex('http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id68instance')))

found = []      # labels of training entities whose neighborhood contains the walk
not_found = []  # labels of the remaining training entities
for inst, label in zip(train_entities, train_labels):
    neighborhood = kg.extract_instance(inst)
    if neighborhood.find_walk(walk, kg):
        found.append(label)
    else:
        not_found.append(label)

print(Counter(found), Counter(not_found))

Counter({'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance': 2}) Counter({'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance': 58, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance': 46, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance': 22, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance': 12})


In [13]:
# Same probe as the previous cell, but with five wildcard hops between the
# root and project id68 instead of one.
root_hop = Hop('root', root=True)
wildcard_hops = [Hop('*', wildcard=True) for _ in range(5)]
target_hop = Hop(Vertex('http://www.aifb.uni-karlsruhe.de/Projekte/viewProjektOWL/id68instance'))

walk = Walk()
for hop in [root_hop] + wildcard_hops + [target_hop]:
    walk.append(hop)

# Split the training labels by whether the walk is present in the entity's
# extracted neighborhood.
found, not_found = [], []
for inst, label in zip(train_entities, train_labels):
    neighborhood = kg.extract_instance(inst)
    bucket = found if neighborhood.find_walk(walk, kg) else not_found
    bucket.append(label)

print(Counter(found), Counter(not_found))

Counter({'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance': 45, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance': 3, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance': 2}) Counter({'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance': 56, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance': 19, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance': 12, 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance': 3})
