In [1]:
import  os
import  re
import glob

import  pandas as pd
import  numpy as np

from src.utils import *
from src.models import TransE, rTransE

from src.env import Env
from src.agent import DQN_Network, ExperienceReplay, process_agent_samples

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
OWL2Bench_dbs = [ { 'path' : './datasets/OWL2Bench/OWL2Bench1/',
                'train_file'  :'_train_OWL2Bench1',
                'test_file' : '_test_OWL2Bench1'}               ,
                 { 'path' : './datasets/OWL2Bench/OWL2Bench2/',
                'train_file'  :'_train_OWL2Bench2',
                'test_file' : '_test_OWL2Bench2'} ]


for db_ in OWL2Bench_dbs:
    
    path = db_['path']
    train_file= db_['train_file']
    test_file= db_['test_file']
    
    print('Running...', train_file, test_file)
    
    # load data
    df_train=load_ore_files(path+train_file)
                
    ## subclass relations
    data_subclass_train = df_train[df_train['p']== 'SubClassOf']
    data_subclass_train = data_subclass_train[['s','o']].rename(columns={'s':'subClass','o':'class'})
    transitive_classes  = pd.merge(data_subclass_train,data_subclass_train,
                                 how='left',right_on=['subClass'],left_on=['class']
    ).dropna(subset=['class_y'])
    del transitive_classes['class_x']
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(subset=['class_0', 'class_1', 'class_2'])
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)            
      
    ## assertion relations
    data_assertion_train = df_train[df_train['p']== 'ClassAssertion']
    data_assertion_train = data_assertion_train[['s','o']].rename(columns={'s':'class','o':'assertion'})

    ## cross relations             
    transitive_classes = pd.merge(data_assertion_train,data_subclass_train,
                                 how='left',right_on=['subClass'],left_on=['class']).dropna(subset=['subClass'])
    print(len(transitive_classes))
    del transitive_classes['class_x']
    transitive_classes.columns = ['assertion', 'class_0', 'class_1']
    data_cross_quads = transitive_classes.reset_index(drop=True)
    
    res = prepare_crossclass_data(data_subclass_train,data_subclass_train_quads,0,
                            data_assertion_train,1,
                           data_cross_quads)
    node_dict, node_count, train_trips, train_quads = res
    
       
        
    # load test data
    df_test= load_ore_files(path+test_file)
                 
    data_assertion_test = df_test[df_test['p']== 'ClassAssertion']
    data_assertion_test = data_assertion_test[['s','o']].rename(columns={'s':'class','o':'assertion'})
    res_subcls = prepare_subclass_data(data_assertion_test,transitive_classes=None,
                                tc1='class',tc2='assertion',r=1)
    data_subclass_test  = df_test[df_test['p']== 'SubClassOf']
    data_subclass_test  = data_subclass_test[['s','o']].rename(columns={'s':'subClass','o':'class'})       
    res_assert = prepare_subclass_data(data_subclass_test,transitive_classes=None,r=0)

    test_trips = res_subcls[2] + res_assert[2]
                 
    print(len(test_trips))
    
    ## tarin TransE
    #print('')
    #model_ORE_TransE  = TransE(node_coun=t,2)
    #model_ORE_TransE._train(train_trips,train_quads);
    ##model_ORE_TransE._eval(test_trips) # evaluate TransE
    
    # train rTransE
    model_ORE_rTransE  = rTransE(node_count,2)
    model_ORE_rTransE._train(train_trips,train_quads,num_epoches=200);
    model_ORE_rTransE._eval(test_trips)  # evaluate RTransE
    
    env   = Env(train_trips)
    agent = DQN_Network([60, 64, 2],lr=1e-3)
    agent_samples = agent.train(env,
            model_ORE_rTransE.entity_embds.detach().numpy(),
            model_ORE_rTransE.rel_embds.detach().numpy(),
            episodes = 20000)
    
    unique_agent_samples = process_agent_samples(train_quads,agent_samples)
    updated_train_trips  = train_trips+unique_agent_samples
    
    update_train_trips = train_trips+unique_agent_samples
    model_ORE_rTransE._train(update_train_trips,train_quads,num_epoches=100);
    model_ORE_rTransE._eval(test_trips)  # evaluate RTransE
    
    print()


#### ORE

In [40]:
ORE_dbs = [     { 'path'      : './datasets/ORE/ORE1/',
                'train_file'  : '_train_ORE1',
                'test_file'   : '_test_ORE1'},
                { 'path'      : './datasets/ORE/ORE2/',
                'train_file'  : '_train_ORE2',
                'test_file'   : '_test_ORE2'},
                { 'path'      : './datasets/ORE/ORE3/',
                'train_file'  : '_train_ORE3',
                'test_file'   : '_test_ORE3'}]

In [None]:
for db_ in ORE_dbs:
    
    path = db_['path']
    train_file= db_['train_file']
    test_file= db_['test_file']
    
    print('Running...', train_file, test_file)
    
    # load data
    df_train= load_ore_files(path+train_file)
                
    ## subclass relations
    data_subclass_train = df_train[df_train['p']== 'SubClassOf']
    data_subclass_train = data_subclass_train[['s','o']].rename(columns={'s':'subClass','o':'class'})
    transitive_classes  = pd.merge(data_subclass_train,data_subclass_train,
                                 how='left',right_on=['subClass'],left_on=['class']
    ).dropna(subset=['class_y'])
    del transitive_classes['class_x']
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(subset=['class_0', 'class_1', 'class_2'])
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)            
      
    ## assertion relations
    data_assertion_train = df_train[df_train['p']== 'ClassAssertion']
    data_assertion_train = data_assertion_train[['s','o']].rename(columns={'s':'class','o':'assertion'})

    ## cross relations             
    transitive_classes = pd.merge(data_assertion_train,data_subclass_train,
                                 how='left',right_on=['subClass'],left_on=['class']).dropna(subset=['subClass'])
    print(len(transitive_classes))
    del transitive_classes['class_x']
    transitive_classes.columns = ['assertion', 'class_0', 'class_1']
    data_cross_quads = transitive_classes.reset_index(drop=True)
    
    res = prepare_crossclass_data(data_subclass_train,data_subclass_train_quads,0,
                            data_assertion_train,1,
                           data_cross_quads)
    node_dict, node_count, train_trips, train_quads = res
    
       
        
    # load test data
    df_test= load_ore_files(path+test_file)
                 
    data_assertion_test = df_test[df_test['p']== 'ClassAssertion']
    data_assertion_test = data_assertion_test[['s','o']].rename(columns={'s':'class','o':'assertion'})
    res_subcls = prepare_subclass_data(data_assertion_test,transitive_classes=None,
                                tc1='class',tc2='assertion',r=1)
    data_subclass_test  = df_test[df_test['p']== 'SubClassOf']
    data_subclass_test  = data_subclass_test[['s','o']].rename(columns={'s':'subClass','o':'class'})       
    res_assert = prepare_subclass_data(data_subclass_test,transitive_classes=None,r=0)

    test_trips = res_subcls[2] + res_assert[2]
                 
    print(len(test_trips))
    
    ## tarin TransE
    #print('')
    #model_ORE_TransE  = TransE(node_count,2)
    #model_ORE_TransE._train(train_trips,train_quads);
    #model_ORE_TransE._eval(test_trips) # evaluate TransE
    
    # train rTransE
    model_ORE_rTransE  = rTransE(node_count,2)
    model_ORE_rTransE._train(train_trips,train_quads,num_epoches=200);
    model_ORE_rTransE._eval(test_trips)  # evaluate RTransE
    
    env   = Env(train_trips)
    agent = DQN_Network([60, 64, 2],lr=1e-3)
    agent_samples = agent.train(env,
            model_ORE_rTransE.entity_embds.detach().numpy(),
            model_ORE_rTransE.rel_embds.detach().numpy(),
            episodes = 20000)
    
    unique_agent_samples = process_agent_samples(train_quads,agent_samples)
    updated_train_trips  = train_trips+unique_agent_samples
    
    update_train_trips = train_trips+unique_agent_samples
    model_ORE_rTransE._train(update_train_trips,train_quads,num_epoches=100);
    model_ORE_rTransE._eval(test_trips)  # evaluate RTransE
    
    print()
 

#### CaLiGraph

In [42]:
CLG_dbs = [ { 'path'      : 'datasets/clg/clg_10e4/',
                'train_file'  : 'clg_10e4-train.nt',
                'test_file'   : 'clg_10e4-test.nt-e'},
            { 'path'      : 'datasets/clg/clg_10e5/',
                'train_file'  : 'clg_10e5-train.nt',
                'test_file'   : 'clg_10e5-test.nt'}]

In [43]:
max_test_batch_size = 1000

In [45]:
for db_ in CLG_dbs:
    path = db_['path']
    train_file= db_['train_file']
    test_file= db_['test_file']
    
    print('Running...', train_file, test_file)
    
    df_train= load_clg_files(path+train_file)
    data_subclass_train = df_train[df_train['p']== '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    data_subclass_train = data_subclass_train[['s','o']].rename(columns={'s':'subClass','o':'class'})   
    transitive_classes= pd.merge(data_subclass_train,data_subclass_train,how='left',
                                 right_on=['subClass'],left_on=['class']).dropna(subset=['class_y'])
    del transitive_classes['class_x']
    transitive_classes.columns = ['class_0', 'class_1', 'class_2']
    transitive_classes = transitive_classes.drop_duplicates(subset=['class_0', 'class_1', 'class_2']) # drop duplicates
    data_subclass_train_quads = transitive_classes.reset_index(drop=True)

    data_type_train = df_train[df_train['p']== '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    data_type_train = data_type_train[['s','o']].rename(columns={'s':'type','o':'class'})   

    transitive_classes = pd.merge(data_type_train,data_subclass_train,
                                     how='left',left_on=['class'],right_on=['subClass']).dropna(subset=['class_y'])
    del transitive_classes['class_x']
    transitive_classes.columns = ['type', 'class_0', 'class_1']
    data_cross_quads = transitive_classes.reset_index(drop=True)

    res = prepare_crossclass_data(data_subclass_train,data_subclass_train_quads,0,
                            data_type_train,1,
                           transitive_classes,tc1='class',tc2='subClass',
                              ac1='class',ac2='type',
                              qc1='class_0',qc2='class_1',qc3='class_2',
                              cc1='type',cc2='class_0',cc3='class_1')

    node_dict, node_count, train_trips, train_quads = res

    df_test= load_clg_files(path+test_file)


    data_subclass_test = df_test[df_test['p']== '<http://www.w3.org/2000/01/rdf-schema#subClassOf>']
    data_subclass_test= data_subclass_test[['s','o']].rename(columns={'s':'subClass','o':'class'})

    res = prepare_subclass_data(data_subclass_train,data_subclass_train_quads)
    _, _, test_trips1, test_quads1 = res

    data_type_test = df_test[df_test['p']== '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    data_type_test= data_type_test[['s','o']].rename(columns={'s':'type','o':'class'})

    _, _, test_trips2, test_quads2 = prepare_subclass_data(data_type_test,tc1='class',
                                    tc2='type')


    test_trips = test_trips1 + test_trips2


    print(len(train_trips),len(test_trips))
    
    
    ## tarin TransE
    #print('')
    #model_ORE_TransE  = TransE(node_count,2)
    #model_ORE_TransE._train(train_trips,train_quads);
    #model_ORE_TransE._eval(test_trips[:max_test_batch_size]) # evaluate TransE
    
    # train rTransE
    model_ORE_rTransE  = rTransE(node_count,2)
    model_ORE_rTransE._train(train_trips,train_quads,num_epoches=200);
    model_ORE_rTransE._eval(test_trips[:max_test_batch_size])  # evaluate RTransE
    
    env   = Env(train_trips)
    agent = DQN_Network([60, 64, 2],lr=1e-3)
    agent_samples = agent.train(env,
            model_ORE_rTransE.entity_embds.detach().numpy(),
            model_ORE_rTransE.rel_embds.detach().numpy(),
            episodes = 20000)
    
    unique_agent_samples = process_agent_samples(train_quads,agent_samples)
    updated_train_trips  = train_trips+unique_agent_samples
    
    update_train_trips = train_trips+unique_agent_samples
    model_ORE_rTransE._train(update_train_trips,train_quads,num_epoches=100);
    model_ORE_rTransE._eval(test_trips[:max_test_batch_size])  # evaluate RTransE
    
    print()

Running... clg_10e4-train.nt clg_10e4-test.nt-e
111533 74694
epoch 0,	 train loss 13.25
epoch 50,	 train loss 11.49
epoch 100,	 train loss 10.19
epoch 150,	 train loss 8.44
hits@1  tensor(0.4770) ,hits@10  tensor(0.6870) ,MR  tensor(16.7180) ,MRR  tensor(0.5545)
epoch 0	ep_len 1	average loss 1.27	reward 1.00	done True
epoch 2000	ep_len 1	average loss 0.43	reward 1.00	done True
epoch 4000	ep_len 1	average loss 0.16	reward 1.00	done True
epoch 6000	ep_len 1	average loss 0.56	reward 1.00	done True
epoch 8000	ep_len 1	average loss 0.42	reward 1.00	done True
epoch 10000	ep_len 1	average loss 0.22	reward 1.00	done True
epoch 12000	ep_len 1	average loss 1.13	reward 1.00	done True
epoch 14000	ep_len 1	average loss 0.99	reward 1.00	done True
epoch 16000	ep_len 11	average loss 0.80	reward -0.70	done False
epoch 18000	ep_len 1	average loss 1.04	reward 1.00	done True
unique ratio: 0.80
epoch 0,	 train loss 7.67
epoch 50,	 train loss 6.28
hits@1  tensor(0.5000) ,hits@10  tensor(0.6970) ,MR  tensor(