In [1]:
import json
import random

import numpy as np
from numpy import mean
from numpy import std
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from config import ARMSTRADER, VCSLAM
from util.parse import generate_dictionaries, generate_id_dict, encode_triples

In [2]:
datasets = [
            VCSLAM()
            # ARMSTRADER()
            ]

In [3]:
dataset = datasets[0]
vcslam = VCSLAM()
with open(vcslam.PARSED_MODELS_PATH, 'r') as file:
    vcslam_models = json.load(file)

armstrader = ARMSTRADER()
with open(armstrader.PARSED_MODELS_PATH, 'r') as file:
    armstrader_models = json.load(file)

# models = armstrader_models if dataset.identifier == "armstrader" else vcslam_models
models = vcslam_models
template = vcslam_models
triples = []
for model in models[:]:
    for triple in model:
        triples.append(tuple(triple))

classes, predicates = generate_dictionaries(triples)
classes_mapping = generate_id_dict(classes)
predicates_mapping = generate_id_dict(predicates)


In [4]:
len(triples)

2560

In [5]:
def prepare_data(models_template, multiplier=1, classes_mapping=None, predicates_mapping=None, shuffle=False,
                 triple_weights=None, verbose=False):
    models = []
    for i in range(multiplier):
        models += models_template
    if shuffle:
        random.shuffle(models)

    triples = []
    for model in models[:]:
        for triple in model:
            triples.append(tuple(triple))

    if triple_weights:
        triples = modrel.reduce_relations(triples, triple_weights, verbose=verbose)
    if not classes_mapping:
        classes, predicates = generate_dictionaries(triples)
        classes_mapping = generate_id_dict(classes)
        predicates_mapping = generate_id_dict(predicates)
    # print("training triples size", len(train_triples))

    encoded_triples = encode_triples(triples, classes_mapping, predicates_mapping)
    X = encoded_triples[:, [0, 2]]
    y = encoded_triples[:, 1]
    # print(X.shape, y.shape)
    # print(train_data, X,y)
    return X, y

In [6]:
import preprocess.modify_relations as modrel

# triples = modrel.reduce_relations(triples, targets)

#test_triples = random.sample(triples, floor(len(triples) / 10))
# test_triples = [triple for triple in test_triples if triple[0] == 'http://schema.org/Offer' and triple[2] == str(XSD.string)]



In [7]:
def calc_hits_mrr(model, predictions, reference, hits=None):
    if hits is None:
        hits = [3]
    predictions = model.classes_[np.argsort(predictions)[:, ::-1]]
    positions = []
    for row in range(len(predictions)):
        for i in range(predictions.shape[1]):
            if predictions[row][i] == reference[row]:
                positions.append(i)
                break

    positions = [1/(i + 1) for i in positions]
    result = {}
    result['mrr'] = np.mean(positions)
    for value in hits:
        hits_at_value = [1 if pos <= value else 0 for pos in positions]
        result['hits@' + str(value)] = np.sum(hits_at_value) / len(hits_at_value)
    return result


In [8]:
n_jobs = -1

ranges = [
    1, 2,3
    #,5, 10
    ]

In [9]:
%%capture
from sklearn.model_selection import cross_validate

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
# estimators = [10, 20, 30]
estimators = [10]
max_depths = [10,20]
results = []
for dataset in datasets:
    for rng in ranges:
        for estimator in estimators:
            for depth in max_depths:
                X, y = prepare_data(template, rng, classes_mapping=classes_mapping,
                                    predicates_mapping=predicates_mapping)
                rfc = RandomForestClassifier(n_estimators=estimator, max_depth=depth)
                cv_results = cross_validate(rfc, X[:], y[:].reshape(-1), scoring='accuracy', cv=cv, n_jobs=n_jobs,
                                            error_score='raise', return_estimator=True)
                hits_3 = []
                mrrs = []
                for est in cv_results['estimator']:
                    pred = est.predict_proba(X)
                    hits_mrr = calc_hits_mrr(est, pred, y)
                    hits_3.append(hits_mrr['hits@3'])
                    mrrs.append(hits_mrr['mrr'])
                scores = cv_results['test_score']
                results.append([f'RFC {dataset.identifier} {rng} {estimator} {depth}', mean(scores), std(scores), mean(mrrs), std(mrrs), mean(hits_3), std(hits_3)])
                #print(f'RFC {dataset.identifier} {rng} {estimator} | {mean(n_scores):.3f} ({std(n_scores):.3f})')


In [10]:
results = sorted(results, key=lambda x: x[1], reverse=True)
results[:10]

[['RFC vcslam 3 10 20',
  0.915234375,
  0.006820729611306696,
  1.173984375,
  0.057163054030499004,
  0.9953906250000001,
  0.0005182226234930386],
 ['RFC vcslam 2 10 20',
  0.8498046875,
  0.014542586303895533,
  1.6310937500000002,
  0.12685022271918267,
  0.9861718749999999,
  0.0014532090029483154],
 ['RFC vcslam 3 10 10',
  0.6865885416666667,
  0.006848017600158685,
  2.1434374999999997,
  0.07231539671164433,
  0.87265625,
  0.006541073652792009],
 ['RFC vcslam 2 10 10',
  0.655078125,
  0.0068051763242907605,
  2.323046875,
  0.04165242269551983,
  0.8645312500000001,
  0.006420525253532639],
 ['RFC vcslam 1 10 20',
  0.57421875,
  0.011912461470294459,
  4.120673761787963,
  0.26787166091330245,
  0.9369063276790082,
  0.0022428387913397303],
 ['RFC vcslam 1 10 10',
  0.5515625,
  0.018879759333741516,
  3.278944306455352,
  0.23247652133866897,
  0.858930836490563,
  0.011825281582324293]]

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = [
  #{'C': [100], 'kernel': ['linear']},
  #{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
results = []
for rng in ranges:
    X,y = prepare_data(template,rng, classes_mapping=classes_mapping, predicates_mapping=predicates_mapping)
    clf = GridSearchCV(estimator=SVC(),             param_grid=param_grid)
    clf.fit(X,y)
    result = clf.cv_results_
    results.append(result)



In [12]:
for result in results:
    print(result['mean_test_score'])

[0.48125    0.4        0.49882813 0.44492188 0.51757812 0.4828125
 0.53398437 0.49179688]
[0.54277344 0.41894531 0.66171875 0.47695312 0.80761719 0.55976563
 0.89316406 0.65957031]
[0.58684896 0.43059896 0.7421875  0.50065104 0.85260417 0.59544271
 0.91770833 0.71354167]


In [13]:
from sklearn.ensemble import AdaBoostClassifier
param_grid = [
  {'n_estimators': [20, 50, 100], 'learning_rate': [0.01,0.1,1.0]}
 ]

for rng in ranges:
    X,y = prepare_data(template,rng, classes_mapping=classes_mapping, predicates_mapping=predicates_mapping)
    clf = GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid=param_grid)
    clf.fit(X,y)
    result = clf.cv_results_
    print(result)



{'mean_fit_time': array([0.13021531, 0.32398992, 0.64400234, 0.12640014, 0.3193996 ,
       0.65459504, 0.12699814, 0.31341882, 0.63219733]), 'std_fit_time': array([0.0041335 , 0.01348019, 0.00452806, 0.00149832, 0.0106918 ,
       0.007218  , 0.00269521, 0.0055228 , 0.01054075]), 'mean_score_time': array([0.01419153, 0.03621116, 0.07398896, 0.01459856, 0.03620477,
       0.07440076, 0.01440177, 0.03559518, 0.07039585]), 'std_score_time': array([0.00040131, 0.0019339 , 0.00141364, 0.00049994, 0.00041434,
       0.00344345, 0.00048705, 0.00049126, 0.00313351]), 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[20, 50, 100, 20, 50, 100, 20, 50, 100],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       



{'mean_fit_time': array([0.25539861, 0.6460041 , 1.31590142, 0.26460476, 0.66517019,
       1.344029  , 0.2710259 , 0.64510708, 1.42349424]), 'std_fit_time': array([0.00650015, 0.02087913, 0.06320763, 0.00930426, 0.02322514,
       0.06127836, 0.00752893, 0.01727119, 0.08913329]), 'mean_score_time': array([0.03579979, 0.08439755, 0.17879963, 0.03599577, 0.08978291,
       0.18460135, 0.03728204, 0.08299603, 0.17780838]), 'std_score_time': array([0.00273189, 0.00366273, 0.01344696, 0.00167414, 0.00772851,
       0.019097  , 0.00337696, 0.00179262, 0.00548457]), 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[20, 50, 100, 20, 50, 100, 20, 50, 100],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       



{'mean_fit_time': array([0.42039504, 1.0484971 , 1.97551169, 0.40022621, 1.01628847,
       1.97769265, 0.38559546, 0.98613858, 2.00441055]), 'std_fit_time': array([0.02357065, 0.03208679, 0.0213534 , 0.00937289, 0.04189147,
       0.01956654, 0.01262036, 0.02255652, 0.04392788]), 'mean_score_time': array([0.07687087, 0.19948492, 0.37573757, 0.07259398, 0.18008113,
       0.36954789, 0.06919699, 0.17786918, 0.36415114]), 'std_score_time': array([0.0052501 , 0.01873053, 0.03036701, 0.00261036, 0.0030448 ,
       0.01199584, 0.00074833, 0.00873743, 0.0102006 ]), 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[20, 50, 100, 20, 50, 100, 20, 50, 100],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       

In [14]:
result

{'mean_fit_time': array([0.42039504, 1.0484971 , 1.97551169, 0.40022621, 1.01628847,
        1.97769265, 0.38559546, 0.98613858, 2.00441055]),
 'std_fit_time': array([0.02357065, 0.03208679, 0.0213534 , 0.00937289, 0.04189147,
        0.01956654, 0.01262036, 0.02255652, 0.04392788]),
 'mean_score_time': array([0.07687087, 0.19948492, 0.37573757, 0.07259398, 0.18008113,
        0.36954789, 0.06919699, 0.17786918, 0.36415114]),
 'std_score_time': array([0.0052501 , 0.01873053, 0.03036701, 0.00261036, 0.0030448 ,
        0.01199584, 0.00074833, 0.00873743, 0.0102006 ]),
 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[20, 50, 100, 20, 50, 100, 20, 50, 100],
              mask=[False, False, False, False, False, False, False, False,
                    

In [None]:

param_grid = [
  {'n_estimators': [50,100, 200], 'max_depth': [3,5,10,20]}
 ]

for rng in ranges:
    X,y = prepare_data(template,rng, classes_mapping=classes_mapping, predicates_mapping=predicates_mapping)
    clf = GridSearchCV(estimator=GradientBoostingClassifier(),             param_grid=param_grid)
    #clf = GradientBoostingClassifier()
    clf.fit(X,y)
    result = clf.cv_results_
    print(result)



{'mean_fit_time': array([ 23.97829952,  60.77276764, 164.20759096,  26.53822379,
        50.5984396 ,  93.86546693,  61.99181046, 104.17175999,
       154.87854385, 121.26059628, 179.37930059, 218.20026522]), 'std_fit_time': array([ 3.38912973, 14.74585637, 47.26587634,  2.04379036,  1.48441054,
        3.7841438 ,  2.7557627 ,  7.63983642,  6.77344892,  6.70417916,
       19.75381004, 13.49696919]), 'mean_score_time': array([0.03978906, 0.0747592 , 0.12921343, 0.07369118, 0.14462566,
       0.22172303, 0.21102095, 0.35384836, 0.43730826, 0.29425559,
       0.45979934, 0.55798097]), 'std_score_time': array([0.00040078, 0.00699492, 0.00903974, 0.00727527, 0.0031599 ,
       0.02405193, 0.02249015, 0.02428777, 0.02607211, 0.01610029,
       0.04497023, 0.09557476]), 'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 10, 10, 10, 20, 20, 20],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?'



{'mean_fit_time': array([ 53.9668601 , 146.33724723, 363.01407566,  52.84308605,
       100.07764988, 188.85317855,  91.53308668, 173.53828287,
       265.77699871, 180.49404426, 265.91842484, 341.87836456]), 'std_fit_time': array([ 7.25169961, 22.35364144, 79.58857679,  5.19772207,  1.81718678,
        0.43285449,  1.39527916,  6.24339304,  5.47567737,  1.305257  ,
        7.73107959,  0.80563856]), 'mean_score_time': array([0.08840446, 0.18219004, 0.25400972, 0.15748162, 0.2860034 ,
       0.42479224, 0.38701482, 0.76064768, 0.89118977, 0.60781856,
       0.95537863, 1.05082269]), 'std_score_time': array([0.00489061, 0.03087608, 0.01728178, 0.02618036, 0.01252253,
       0.02453097, 0.01011048, 0.10023061, 0.02407369, 0.02520184,
       0.06373866, 0.01925529]), 'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 10, 10, 10, 20, 20, 20],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?'

