In [1]:
import rdflib
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold

import sys
sys.path.append('..')
from tree_builder import KGPTree, KGPForest, KPGTransformer
from datastructures import *
import time

import pickle

import warnings; warnings.filterwarnings('ignore')

In [2]:
rdf_file = 'data/AIFB/aifb.n3'
_format = 'n3'
train_file = 'data/AIFB/AIFB_test.tsv'
test_file = 'data/AIFB/AIFB_train.tsv'
entity_col = 'person'
label_col = 'label_affiliation'
label_predicates = [
    rdflib.URIRef('http://swrc.ontoware.org/ontology#affiliation'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#employs'),
    rdflib.URIRef('http://swrc.ontoware.org/ontology#carriedOutBy')
]
output = 'output/aifb_depth10.p'

In [3]:
print(end='Loading data... ', flush=True)
g = rdflib.Graph()
g.parse(rdf_file, format=_format)
print('OK')

test_data = pd.read_csv(train_file, sep='\t')
train_data = pd.read_csv(test_file, sep='\t')

train_entities = [rdflib.URIRef(x) for x in train_data[entity_col]]
train_labels = train_data[label_col]

test_entities = [rdflib.URIRef(x) for x in test_data[entity_col]]
test_labels = test_data[label_col]

kg = Graph.rdflib_to_graph(g, label_predicates=label_predicates)

Loading data... OK


# Path Feature Transformation

In [4]:
# Build the path-feature transformer over the knowledge graph and fit it on
# the training split; progress is reported through tqdm notebook bars.
transf = KPGTransformer(
    kg,
    path_max_depth=8,
    n_features=10000,
    progress=tqdm_notebook,
)
transf.fit(train_entities, train_labels)

HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=140, style=ProgressStyle(descr…




HBox(children=(IntProgress(value=0, description='walk loop', max=18517, style=ProgressStyle(description_width=…



HBox(children=(IntProgress(value=0, description='walk loop', max=18517, style=ProgressStyle(description_width=…



HBox(children=(IntProgress(value=0, description='walk loop', max=18517, style=ProgressStyle(description_width=…



HBox(children=(IntProgress(value=0, description='walk loop', max=18517, style=ProgressStyle(description_width=…



In [None]:
# Turn both splits into feature matrices with the fitted transformer
# (training split first, then test split).
train_features, test_features = (
    transf.transform(train_entities),
    transf.transform(test_entities),
)

# Keep only the columns whose sum along axis 0 of the training matrix is
# greater than 1; the same mask is applied to the test matrix so the
# columns of both matrices stay aligned.
useful_features = np.sum(train_features, axis=0) > 1
train_features, test_features = (
    train_features[:, useful_features],
    test_features[:, useful_features],
)

print(train_features.shape)

HBox(children=(IntProgress(value=0, description='Extracting neighborhoods', max=140, style=ProgressStyle(descr…

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search a random forest on the path features. The forest is seeded so
# the selected parameters and the reported accuracy are the same on every
# run; without a seed they vary between executions.
clf = GridSearchCV(
    RandomForestClassifier(max_features=None, random_state=42),
    {'n_estimators': [10, 100, 250], 'max_depth': [5, 10, None]},
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# Score the refitted best estimator on the held-out test split.
preds = clf.predict(test_features)
print(accuracy_score(test_labels, preds))

In [None]:
from sklearn.linear_model import LogisticRegression

# Grid-search the regularisation strength of an L1-penalised logistic
# regression. The solver is pinned to 'liblinear': it supports the L1
# penalty, whereas the default solver in scikit-learn >= 0.22 ('lbfgs')
# rejects penalty='l1' with a ValueError. 'liblinear' was the default in
# older releases, so results there are unchanged.
clf = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# Score the refitted best estimator on the held-out test split.
preds = clf.predict(test_features)
print(accuracy_score(test_labels, preds))

In [None]:
from sklearn.svm import SVC

# Grid-search an SVM over both kernel type and regularisation strength.
clf = GridSearchCV(
    SVC(),
    param_grid={
        'kernel': ['rbf', 'linear'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0],
    },
)
clf.fit(train_features, train_labels)
print(clf.best_params_)

# Score the refitted best estimator on the held-out test split.
preds = clf.predict(test_features)
print(accuracy_score(test_labels, preds))

# Single KG Path Tree

In [None]:
# Single path tree built directly on the knowledge graph, with no limit on
# tree depth and leaves allowed to hold a single sample.
clf = KGPTree(
    kg,
    path_max_depth=8,
    min_samples_leaf=1,
    max_tree_depth=None,
    progress=tqdm_notebook,
)

In [None]:
%%time
# Fit the single path tree on the training entities and report the elapsed
# time. The %%time cell magic has to remain the first line of the cell.
clf.fit(train_entities, train_labels)

In [None]:
# Predict labels for the test entities with the fitted tree and print the
# accuracy against the true test labels.
preds = clf.predict(test_entities)
print(accuracy_score(test_labels, preds))

# Forest of KG Path Trees

In [None]:
# Ensemble of 25 path trees using the same tree settings as the single-tree
# experiment, with vertex_sample=0.5 and n_jobs=-1.
# NOTE(review): vertex_sample presumably is the fraction of vertices each
# tree considers -- confirm against tree_builder.KGPForest.
clf = KGPForest(
    kg,
    path_max_depth=8,
    min_samples_leaf=1,
    max_tree_depth=None,
    n_estimators=25,
    vertex_sample=0.5,
    progress=tqdm_notebook,
    n_jobs=-1,
)

In [None]:
%%time
# Fit the forest on the training entities and report the elapsed time.
# The %%time cell magic has to remain the first line of the cell.
clf.fit(train_entities, train_labels)

In [None]:
# Predict labels for the test entities with the fitted forest and print the
# accuracy against the true test labels.
preds = clf.predict(test_entities)
print(accuracy_score(test_labels, preds))