In [1]:
import rdflib
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import sys
sys.path.append('..')
from tree_builder import MINDWALCTree, MINDWALCForest, MINDWALCTransform
from datastructures import Graph
import time

import pickle

import warnings; warnings.filterwarnings('ignore')

In [2]:
rdf_file = 'data/AIFB/aifb.n3'
_format = 'n3'
train_file = 'data/AIFB/AIFB_test.tsv'
test_file = 'data/AIFB/AIFB_train.tsv'
entity_col = 'person'
label_col = 'label_affiliation'
label_predicates = [
    rdflib.URIRef('http://swrc.ontoware.org/ontology#affiliation'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#employs'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#carriedOutBy')
]
output = 'output/aifb_depth10.p'

In [3]:
print(end='Loading data... ', flush=True)
g = rdflib.Graph()
g.parse(rdf_file, format=_format)
print('OK')

test_data = pd.read_csv(train_file, sep='\t')
train_data = pd.read_csv(test_file, sep='\t')

train_entities = [rdflib.URIRef(x) for x in train_data[entity_col]]
train_labels = train_data[label_col]

test_entities = [rdflib.URIRef(x) for x in test_data[entity_col]]
test_labels = test_data[label_col]

kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates)

Loading data... OK


# Path Feature Transformation

In [4]:
# Build the feature extractor and fit it on the training entities and
# their labels only.
transf = MINDWALCTransform(
    path_max_depth=8,
    n_features=1000,
    progress=tqdm_notebook,
    n_jobs=1,
)
transf.fit(kg, train_entities, train_labels)

2019-12-17 15:11:00,150	INFO resource_spec.py:205 -- Starting Ray with 0.98 GiB memory available for workers and up to 0.5 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=140, style=ProgressStyle(descr…




In [5]:
# Turn both entity sets into feature matrices using the fitted extractor.
train_features = transf.transform(kg, train_entities)
test_features = transf.transform(kg, test_entities)

# Column mask computed on the training matrix only: keep a column when its
# training-set column sum is greater than 1. The same mask is then applied
# to both matrices so their columns stay aligned.
useful_features = np.sum(train_features, axis=0) > 1
train_features, test_features = (
    train_features[:, useful_features],
    test_features[:, useful_features],
)

print(train_features.shape)

HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=140, style=ProgressStyle(descr…




HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=36, style=ProgressStyle(descri…


(140, 1000)


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated search over forest size and tree depth on the extracted
# features. The forest is seeded so that the selected hyperparameters and
# the reported test accuracy are reproducible across kernel restarts;
# without a seed every run bootstraps differently.
clf = GridSearchCV(
    RandomForestClassifier(max_features=None, random_state=42),
    {'n_estimators': [10, 100, 250], 'max_depth': [5, 10, None]},
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# GridSearchCV.predict delegates to the estimator refit with best_params_.
preds = clf.predict(test_features)
print(accuracy_score(test_labels, preds))

{'max_depth': 10, 'n_estimators': 100}
0.8611111111111112


In [7]:
from sklearn.linear_model import LogisticRegression

# Cross-validated search over the inverse regularisation strength C for an
# L1-penalised logistic regression.
# The solver is pinned explicitly: 'liblinear' was the implicit default in
# the scikit-learn release this notebook was originally run with, but since
# 0.22 the default is 'lbfgs', which does not support penalty='l1' and
# raises at fit time. Naming the solver keeps the original behaviour and
# makes the cell run on current releases.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# GridSearchCV.predict delegates to the estimator refit with best_params_.
preds = clf.predict(test_features)
print(accuracy_score(test_labels, preds))

{'C': 1.0}
0.8611111111111112


In [8]:
from sklearn.svm import SVC

# Cross-validated search over the kernel type and the penalty parameter C
# of a support vector classifier trained on the extracted features.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=dict(
        kernel=['rbf', 'linear'],
        C=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
    ),
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# Evaluate the refit best estimator on the test features.
preds = clf.predict(test_features)
print(accuracy_score(y_true=test_labels, y_pred=preds))

{'C': 0.1, 'kernel': 'linear'}
0.8611111111111112


# Single KG Path Tree

In [9]:
# Configure a single tree that is induced directly on the knowledge graph
# (it is fitted in the next cell).
clf = MINDWALCTree(
    path_max_depth=8,
    min_samples_leaf=1,
    max_tree_depth=None,
    progress=tqdm_notebook,
    n_jobs=1,
)

2019-12-17 15:12:03,560	INFO resource_spec.py:205 -- Starting Ray with 0.83 GiB memory available for workers and up to 0.43 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


In [10]:
%%time
# Fit the tree configured in the previous cell on the training entities;
# the cell magic above reports CPU and wall time for this call.
clf.fit(kg, train_entities, train_labels)

HBox(children=(IntProgress(value=0, description='Neighborhood extraction', max=140, style=ProgressStyle(descri…


CPU times: user 14.8 s, sys: 426 ms, total: 15.2 s
Wall time: 31.4 s


In [11]:
# Predict labels for the test entities with the fitted tree and report the
# fraction that match the true test labels.
preds = clf.predict(kg, test_entities)
print(accuracy_score(y_true=test_labels, y_pred=preds))

0.9166666666666666


# Forest of KG Path Trees

In [12]:
# Configure an ensemble of 25 trees (it is fitted in the next cell).
# The first three settings mirror the single-tree configuration above.
clf = MINDWALCForest(
    path_max_depth=8, min_samples_leaf=1, max_tree_depth=None,
    n_estimators=25, vertex_sample=0.5,
    progress=tqdm_notebook, n_jobs=1,
)

2019-12-17 15:12:38,307	INFO resource_spec.py:205 -- Starting Ray with 0.88 GiB memory available for workers and up to 0.46 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


In [13]:
%%time
# Fit the forest configured in the previous cell on the training entities;
# the cell magic above reports CPU and wall time for this call.
clf.fit(kg, train_entities, train_labels)

HBox(children=(IntProgress(value=0, description='Neighborhood extraction', max=140, style=ProgressStyle(descri…




HBox(children=(IntProgress(value=0, description='estimator loop', max=25, style=ProgressStyle(description_widt…


CPU times: user 35.5 s, sys: 2.89 s, total: 38.4 s
Wall time: 3min 33s


In [14]:
# Predict labels for the test entities with the fitted forest and report
# the fraction that match the true test labels.
preds = clf.predict(kg, test_entities)
print(accuracy_score(y_true=test_labels, y_pred=preds))

HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=36, style=ProgressStyle(descri…


0.9166666666666666
