In [1]:
import  os
import  re
import glob

import  pandas as pd
import  numpy as np

from src.utils import *
from src.models import TransE, rTransE



  from .autonotebook import tqdm as notebook_tqdm


#### OWL2Bench

In [19]:
# OWL2Bench splits: each entry names a dataset directory together with the
# train and test file names that are appended to that directory path.
OWL2Bench_dbs = [
    {
        'path': './datasets/OWL2Bench/OWL2Bench1/',
        'train_file': '_train_OWL2Bench1',
        'test_file': '_test_OWL2Bench1',
    },
    {
        'path': './datasets/OWL2Bench/OWL2Bench2/',
        'train_file': '_train_OWL2Bench2',
        'test_file': '_test_OWL2Bench2',
    },
]


for db_ in OWL2Bench_dbs:

    # Unpack where the current OWL2Bench split lives.
    path, train_file, test_file = (
        db_[key] for key in ('path', 'train_file', 'test_file')
    )

    print('Running...', train_file, test_file)

    # Training split: keep only the SubClassOf rows and expose them as
    # (subClass, class) pairs.
    df_train = load_ore_files(f'{path}{train_file}')
    data_subclass_train = (
        df_train.loc[df_train['p'] == 'SubClassOf', ['s', 'o']]
        .rename(columns={'s': 'subClass', 'o': 'class'})
    )

    # Self-join the pairs (parent of the left row == child of the right row)
    # to get two-step chains class_0 -> class_1 -> class_2. Rows without a
    # match carry NaN in class_y and are discarded; class_x duplicates the
    # join key and is dropped.
    transitive_classes = (
        data_subclass_train
        .merge(data_subclass_train, how='left',
               left_on=['class'], right_on=['subClass'])
        .dropna(subset=['class_y'])
        .drop(columns=['class_x'])
    )
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(
        subset=['class_0', 'class_1', 'class_2']
    )
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)

    # Test split: same SubClassOf filtering, no chain construction.
    df_test = load_ore_files(f'{path}{test_file}')
    data_subclass_test = (
        df_test.loc[df_test['p'] == 'SubClassOf', ['s', 'o']]
        .rename(columns={'s': 'subClass', 'o': 'class'})
    )

    # Generate triplets and quadruples.
    res = prepare_subclass_data(data_subclass_train, data_subclass_train_quads)
    node_dict, node_count, train_trips, train_quads = res
    # NOTE(review): the test split goes through its own prepare_subclass_data
    # call and its node dictionary is thrown away. If that helper assigns node
    # ids per call, the ids in test_trips will not line up with the training
    # ids the models are built on -- confirm against src.utils.
    _, _, test_trips, test_quads = prepare_subclass_data(data_subclass_test)
    print(len(train_trips), len(train_quads))

    # Train TransE, then evaluate it on the test triplets.
    # (second constructor argument is presumably the relation count -- TODO confirm)
    model_OWL_TransE = TransE(node_count, 1)
    model_OWL_TransE._train(train_trips, train_quads)
    model_OWL_TransE._eval(test_trips)

    # Train rTransE with an explicit epoch count, then evaluate it.
    model_OWL_rTransE = rTransE(node_count, 1)
    model_OWL_rTransE._train(train_trips, train_quads, num_epoches=300)
    model_OWL_rTransE._eval(test_trips)

    print()

Running... _train_OWL2Bench1 _test_OWL2Bench1
105 64
epoch 0,	 train loss 1.03
epoch 50,	 train loss 1.11
epoch 100,	 train loss 0.99
hits@1  tensor(0.) ,hits@10  tensor(0.2000) ,MR  tensor(31.6333) ,MRR  tensor(0.0866)
epoch 0,	 train loss 0.94
epoch 100,	 train loss 0.91
epoch 150,	 train loss 0.98
epoch 200,	 train loss 0.66
hits@1  tensor(0.0667) ,hits@10  tensor(0.2667) ,MR  tensor(35.1667) ,MRR  tensor(0.1242)

Running... _train_OWL2Bench2 _test_OWL2Bench2
105 53
epoch 0,	 train loss 1.03
epoch 50,	 train loss 0.95
epoch 100,	 train loss 0.88
hits@1  tensor(0.0667) ,hits@10  tensor(0.2667) ,MR  tensor(35.6333) ,MRR  tensor(0.1203)
epoch 50,	 train loss 0.91
epoch 100,	 train loss 0.96
epoch 150,	 train loss 0.80
epoch 200,	 train loss 0.84
epoch 250,	 train loss 0.57
hits@1  tensor(0.) ,hits@10  tensor(0.3000) ,MR  tensor(30.5667) ,MRR  tensor(0.0911)



#### ORE

In [21]:
# ORE splits: each entry names a dataset directory together with the train
# and test file names that are appended to that directory path.
ORE_dbs = [
    {
        'path': './datasets/ORE/ORE1/',
        'train_file': '_train_ORE1',
        'test_file': '_test_ORE1',
    },
    {
        'path': './datasets/ORE/ORE2/',
        'train_file': '_train_ORE2',
        'test_file': '_test_ORE2',
    },
    {
        'path': './datasets/ORE/ORE3/',
        'train_file': '_train_ORE3',
        'test_file': '_test_ORE3',
    },
]

In [22]:
# Run the TransE / rTransE experiment once per ORE split listed in ORE_dbs.
for db_ in ORE_dbs:

    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Load the training split and keep only the SubClassOf rows, renamed to
    # (subClass, class) pairs.
    df_train = load_ore_files(path + train_file)
    data_subclass_train = df_train[df_train['p'] == 'SubClassOf']
    data_subclass_train = data_subclass_train[['s', 'o']].rename(
        columns={'s': 'subClass', 'o': 'class'}
    )

    # Self-join the pairs (parent of the left row == child of the right row)
    # to build two-step chains. Left rows with no match get NaN in class_y
    # and are dropped again right away.
    transitive_classes = pd.merge(
        data_subclass_train,
        data_subclass_train,
        how='left', right_on=['subClass'], left_on=['class'],
    ).dropna(subset=['class_y'])
    # class_x equals the join key already present as subClass_y.
    del transitive_classes['class_x']
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(
        subset=['class_0', 'class_1', 'class_2']
    )
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)

    # Load the test split with the same SubClassOf filtering.
    # (A stray `data_subclass_test.head()` used to follow here; inside a loop
    # its result was discarded, so it has been removed.)
    df_test = load_ore_files(path + test_file)
    data_subclass_test = df_test[df_test['p'] == 'SubClassOf']
    data_subclass_test = data_subclass_test[['s', 'o']].rename(
        columns={'s': 'subClass', 'o': 'class'}
    )

    # Generate triplets and quadruples.
    res = prepare_subclass_data(data_subclass_train, data_subclass_train_quads)
    node_dict, node_count, train_trips, train_quads = res
    # NOTE(review): the test split goes through its own prepare_subclass_data
    # call and its node dictionary is thrown away. If that helper assigns node
    # ids per call, the ids in test_trips will not line up with the training
    # ids the models are built on -- confirm against src.utils.
    _, _, test_trips, test_quads = prepare_subclass_data(data_subclass_test)
    print(len(train_trips), len(train_quads))

    # Train TransE
    print('')
    model_ORE_TransE = TransE(node_count, 1)
    model_ORE_TransE._train(train_trips, train_quads)

    model_ORE_TransE._eval(test_trips)  # evaluate TransE

    # Train rTransE
    model_ORE_rTransE = rTransE(node_count, 1)
    model_ORE_rTransE._train(train_trips, train_quads, num_epoches=300)
    model_ORE_rTransE._eval(test_trips)  # evaluate rTransE

    print()

Running... _train_ORE1 _test_ORE1
8194 9073

epoch 0,	 train loss 1.10
epoch 100,	 train loss 0.99
hits@1  tensor(0.0090) ,hits@10  tensor(0.0974) ,MR  tensor(51.2131) ,MRR  tensor(0.0493)
epoch 0,	 train loss 1.17
epoch 50,	 train loss 1.04
epoch 100,	 train loss 1.06
epoch 250,	 train loss 0.91
hits@1  tensor(0.0085) ,hits@10  tensor(0.0854) ,MR  tensor(50.3454) ,MRR  tensor(0.0476)

Running... _train_ORE2 _test_ORE2
8204 9369

epoch 0,	 train loss 0.95
epoch 50,	 train loss 1.10
hits@1  tensor(0.0090) ,hits@10  tensor(0.1003) ,MR  tensor(51.5712) ,MRR  tensor(0.0514)
epoch 50,	 train loss 1.12
epoch 100,	 train loss 1.15
epoch 150,	 train loss 1.03
epoch 200,	 train loss 0.99
hits@1  tensor(0.0090) ,hits@10  tensor(0.0926) ,MR  tensor(50.0546) ,MRR  tensor(0.0495)

Running... _train_ORE3 _test_ORE3
8187 9122

epoch 0,	 train loss 1.08
epoch 50,	 train loss 1.03
epoch 100,	 train loss 1.05
hits@1  tensor(0.0064) ,hits@10  tensor(0.1085) ,MR  tensor(50.4748) ,MRR  tensor(0.0503)
epoch

#### CaLiGraph

In [3]:
# CaLiGraph splits: each entry names a dataset directory together with the
# train and test N-Triples file names that are appended to that directory path.
CLG_dbs = [
    {
        'path': 'datasets/clg/clg_10e4/',
        'train_file': 'clg_10e4-train.nt',
        'test_file': 'clg_10e4-test.nt',
    },
    {
        'path': 'datasets/clg/clg_10e5/',
        'train_file': 'clg_10e5-train.nt',
        'test_file': 'clg_10e5-test.nt',
    },
]

In [4]:
# Upper bound on how many test triplets are passed to `_eval` in the
# CaLiGraph experiment loop (used there as a slice limit on test_trips).
max_test_batch_size = 10_000

In [None]:
# Predicate IRI that marks a subclass assertion in the CaLiGraph triples.
RDFS_SUBCLASS_OF = '<http://www.w3.org/2000/01/rdf-schema#subClassOf>'

# Run the TransE / rTransE experiment once per CaLiGraph split in CLG_dbs.
for db_ in CLG_dbs:

    path = db_['path']
    train_file = db_['train_file']
    test_file = db_['test_file']

    print('Running...', train_file, test_file)

    # Load the training split and keep only the rdfs:subClassOf rows, renamed
    # to (subClass, class) pairs.
    df_train = load_clg_files(path + train_file)
    data_subclass_train = df_train[df_train['p'] == RDFS_SUBCLASS_OF]
    data_subclass_train = data_subclass_train[['s', 'o']].rename(
        columns={'s': 'subClass', 'o': 'class'}
    )

    # Self-join the pairs (parent of the left row == child of the right row)
    # to build two-step chains. Left rows with no match get NaN in class_y
    # and are dropped again right away.
    transitive_classes = pd.merge(
        data_subclass_train,
        data_subclass_train,
        how='left', right_on=['subClass'], left_on=['class'],
    ).dropna(subset=['class_y'])
    # class_x equals the join key already present as subClass_y.
    del transitive_classes['class_x']
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(
        subset=['class_0', 'class_1', 'class_2']
    )  # drop duplicates
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)

    # Load the test split with the same rdfs:subClassOf filtering.
    df_test = load_clg_files(path + test_file)
    data_subclass_test = df_test[df_test['p'] == RDFS_SUBCLASS_OF]
    data_subclass_test = data_subclass_test[['s', 'o']].rename(
        columns={'s': 'subClass', 'o': 'class'}
    )

    # Generate triplets and quadruples.
    res = prepare_subclass_data(data_subclass_train, data_subclass_train_quads)
    # Fixed: this was unpacked into the misspelled name `ode_dict`, so
    # `node_dict` was never bound by this cell (the OWL2Bench and ORE cells
    # use `node_dict`).
    node_dict, node_count, train_trips, train_quads = res
    # NOTE(review): the test split goes through its own prepare_subclass_data
    # call and its node dictionary is thrown away. If that helper assigns node
    # ids per call, the ids in test_trips will not line up with the training
    # ids the models are built on -- confirm against src.utils.
    _, _, test_trips, test_quads = prepare_subclass_data(data_subclass_test)
    print(len(train_trips), len(train_quads))

    # Train TransE
    print('')
    model_e = TransE(node_count, 1)
    model_e._train(train_trips, train_quads)
    # Only the first max_test_batch_size test triplets are evaluated.
    model_e._eval(test_trips[:max_test_batch_size])  # evaluate TransE

    # Train rTransE
    # NOTE(review): the OWL2Bench and ORE cells pass num_epoches=300 to
    # rTransE._train; here the default is used -- confirm this is intentional.
    model_r = rTransE(node_count, 1)
    model_r._train(train_trips, train_quads)
    model_r._eval(test_trips[:max_test_batch_size])  # evaluate rTransE

    print()

Running... clg_10e4-train.nt clg_10e4-test.nt
59956 87509

epoch 0,	 train loss 1.10
epoch 50,	 train loss 0.90
hits@1  tensor(0.) ,hits@10  tensor(0.) ,MR  tensor(24.) ,MRR  tensor(0.0417)
epoch 0,	 train loss 11.88
epoch 50,	 train loss 10.33
epoch 100,	 train loss 8.90
hits@1  tensor(1.) ,hits@10  tensor(1.) ,MR  tensor(1.) ,MRR  tensor(1.)

Running... clg_10e5-train.nt clg_10e5-test.nt
96273 1858

epoch 0,	 train loss 0.91
epoch 50,	 train loss 1.01
epoch 100,	 train loss 0.89
hits@1  tensor(0.0021) ,hits@10  tensor(0.0356) ,MR  tensor(67.1800) ,MRR  tensor(0.0259)
epoch 0,	 train loss 4.07
epoch 50,	 train loss 1.34
epoch 100,	 train loss 2.78
hits@1  tensor(0.6780) ,hits@10  tensor(0.7128) ,MR  tensor(16.8881) ,MRR  tensor(0.6933)

Running... clg_full-train.nt clg_full-val.nt


In [None]:
# Read the full CaLiGraph training file (gzip-compressed `.nt`) through the
# project's load_clg_zfiles helper.
traind_df = load_clg_zfiles(
    './datasets/clg/clg_full/clg_full-train.nt.gz'
)