In [20]:
import json
import random

import numpy as np
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

from config import VCSLAM
from util.parse import generate_dictionaries, generate_id_dict, encode_triples

In [None]:
datasets = [
            VCSLAM()
            #ARMSTRADER()
            ]

In [22]:
vcslam = VCSLAM()
dataset = vcslam

with open(vcslam.PARSED_MODELS_PATH, 'r') as file:
    vcslam_models = json.load(file)

# models = armstrader_models if dataset.identifier == "armstrader" else vcslam_models
models = vcslam_models
template = vcslam_models
triples = []
for model in models[:]:
    for triple in model:
        triples.append(tuple(triple))

classes, predicates = generate_dictionaries(triples)
classes_mapping = generate_id_dict(classes)
predicates_mapping = generate_id_dict(predicates)


In [23]:
len(triples)

2560

In [24]:
def prepare_data(models, classes_mapping=None, predicates_mapping=None, shuffle=False,
                 triple_weights=None, verbose=False):
    triples = []
    for model in models[:]:
        for triple in model:
            triples.append(tuple(triple))

    if shuffle:
        random.shuffle(triples)

    if triple_weights:
        triples = modrel.reduce_relations(triples, triple_weights, verbose=verbose)
    if not classes_mapping:
        classes, predicates = generate_dictionaries(triples)
        classes_mapping = generate_id_dict(classes)
        predicates_mapping = generate_id_dict(predicates)
    # print("training triples size", len(train_triples))

    encoded_triples = encode_triples(triples, classes_mapping, predicates_mapping)
    X = encoded_triples[:, [0, 2]]
    y = encoded_triples[:, 1]
    # print(X.shape, y.shape)
    # print(train_data, X,y)
    return X, y

In [25]:
import preprocess.modify_relations as modrel

# triples = modrel.reduce_relations(triples, targets)

#test_triples = random.sample(triples, floor(len(triples) / 10))
# test_triples = [triple for triple in test_triples if triple[0] == 'http://schema.org/Offer' and triple[2] == str(XSD.string)]



In [26]:
n_jobs = -1

In [27]:
def calc_hits_mrr(model, predictions, reference, hits=None):
    if hits is None:
        hits = [3]
    predictions = model.classes_[np.argsort(predictions)[:, ::-1]]
    positions = []
    for row in range(len(predictions)):
        for i in range(predictions.shape[1]):
            if predictions[row][i] == reference[row]:
                positions.append(i)
                break

    positions = [(i + 1) for i in positions]
    result = {}
    result['mrr'] = np.mean(positions)
    for value in hits:
        hits_at_value = [1 if pos <= value else 0 for pos in positions]
        result['hits@' + str(value)] = np.sum(hits_at_value) / len(hits_at_value)
    return result


In [28]:
%%capture
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

C_values = [10,100,1000]
gammas = ["scale", 0.001, 0.0001]
results = []
rng  = "X"
for dataset in datasets:
    for c in C_values:
        for gamma in gammas:
            X, y = prepare_data(template, classes_mapping=classes_mapping,
                                predicates_mapping=predicates_mapping)
            svm = SVC(gamma=gamma, C=c, probability=True)
            cv_results = cross_validate(svm, X[:], y[:].reshape(-1), scoring='accuracy', cv=cv, n_jobs=n_jobs,
                                        error_score='raise', return_estimator=True)
            hits_3 = []
            mrrs = []
            for est in cv_results['estimator']:
                pred = est.predict_proba(X)
                hits_mrr = calc_hits_mrr(est, pred, y, hits=[1,3])
                hits_3.append(hits_mrr['hits@3'])
                mrrs.append(hits_mrr['mrr'])
            scores = cv_results['test_score']
            results.append([f'SVM {dataset.identifier} {rng} {c} {gamma}', mean(scores), std(scores), mean(mrrs), std(mrrs), mean(hits_3), std(hits_3)])
            #print(f'RFC {dataset.identifier} {rng} {estimator} | {mean(n_scores):.3f} ({std(n_scores):.3f})')


In [29]:
results = sorted(results, key=lambda x: x[1], reverse=True)
results[:10]

[['RFC vcslam X 100 0.001',
  0.5328125,
  0.00935870980524292,
  8.276544016636155,
  0.26601055036420235,
  0.7889310628704611,
  0.004296412257829757],
 ['RFC vcslam X 1000 0.001',
  0.53046875,
  0.01074041178505275,
  8.44620233370971,
  0.16776373576904763,
  0.7761548900403551,
  0.006033975126346682],
 ['RFC vcslam X 10 0.001',
  0.51796875,
  0.01141534312203317,
  8.421994527668929,
  0.3280565382433172,
  0.7847777324697216,
  0.003436958691320613],
 ['RFC vcslam X 1000 0.0001',
  0.506640625,
  0.017451803049756777,
  7.644577031654333,
  0.1437712959414364,
  0.6615684449757917,
  0.005634775274003378],
 ['RFC vcslam X 100 0.0001',
  0.487109375,
  0.011627246926669698,
  7.522091205838629,
  0.20591967583597898,
  0.6917456043266708,
  0.005774101974373568],
 ['RFC vcslam X 1000 scale',
  0.480859375,
  0.015098553195381007,
  7.623770292967103,
  0.14904268269713694,
  0.6527938364764896,
  0.004509754308981115],
 ['RFC vcslam X 10 0.0001',
  0.454296875,
  0.01540865853