In [14]:
import os
import sys
import time
#sys.path.append('../..')

from datasets.get_datasets import *
from revision import *
from transfer import *
from mapping import *
from boostsrl import boostsrl
import numpy as np
import random
import json

In [15]:
def _format_mapping_rule(src_name, tar_name, is_unary):
    '''Render one source -> target predicate pair as a readable rule string.

    Unary predicates are rendered as ``src(A) -> tar(A)``. Every other
    predicate is rendered with two arguments; a target name that starts with
    '_' is printed without the underscore and with its arguments swapped,
    i.e. ``src(A,B) -> tar(B,A)``.
    '''
    if is_unary:
        return src_name + '(A) -> ' + tar_name + '(A)'
    if tar_name[0] == '_':
        return src_name + '(A,B) -> ' + tar_name[1:] + '(B,A)'
    return src_name + '(A,B) -> ' + tar_name + '(A,B)'


def get_best(sPreds, tPreds, srcFacts, tarFacts, n_sentences=50000, forceHead=None):
    '''Rank every candidate predicate mapping between a source and a target domain.

    Two KnowledgeGraph objects are built (one per domain), sentences are
    generated from each, and every mapping returned by ``mapping.mapping`` is
    scored with ``mapping.mapping_score`` against those sentences.

    Args:
        sPreds: source predicates, used as given.
        tPreds: target predicates; passed through ``mapping.clean_preds`` first.
        srcFacts: facts loaded into the source graph.
        tarFacts: facts loaded into the target graph.
        n_sentences: forwarded to ``generate_sentences`` for both graphs.
        forceHead: optional target predicate name; when truthy it is resolved
            with ``mapping.find_pred`` and forwarded to ``mapping.mapping``.

    Returns:
        A tuple ``(ranked, results)``.

        ``ranked`` is a list of ``(rule_strings, score, mapping_size)`` tuples
        sorted by score and then by mapping size, both descending.
        ``results`` is a dict holding the elapsed time of each stage and the
        number of candidate mappings.

        When ``mapping.mapping`` yields no candidates the function returns
        ``({}, None)`` instead.
    '''
    srcPreds = sPreds
    tarPreds = mapping.clean_preds(tPreds)
    start = time.time()
    results = {}

    # Stage 1: load predicates and facts into one graph per domain.
    source = KnowledgeGraph()
    source.background(srcPreds)
    source.facts(srcFacts)
    target = KnowledgeGraph()
    target.background(tarPreds)
    target.facts(tarFacts)
    results['Knowledge compiling time'] = time.time() - start

    # Stage 2: generate sentences from both graphs.
    new_start = time.time()
    source.generate_sentences(max_depth=4, n_sentences=n_sentences)
    target.generate_sentences(max_depth=4, n_sentences=n_sentences)
    results['Generating paths time'] = time.time() - new_start

    # Stage 3: enumerate candidate mappings. Sentences with a single element
    # are dropped, and the rest are de-duplicated as space-joined strings.
    new_start = time.time()
    source_sentences = set([' '.join(i) for i in source.sentences if len(i) > 1])
    target_sentences = set([' '.join(i) for i in target.sentences if len(i) > 1])
    fHead = None if not forceHead else mapping.find_pred(forceHead, tarPreds)
    possible_mappings = mapping.mapping(srcPreds, tarPreds, forceHead=fHead)
    # No candidate mapping (e.g. an incompatible forceHead): return an empty
    # result and None for the timing dict.
    if not len(possible_mappings):
        return ({}, None)
    results['Generating mappings time'] = time.time() - new_start

    # Stage 4: score each candidate and rank by (score, size), best first.
    new_start = time.time()
    results['Possible mappings'] = len(possible_mappings)
    scores = []
    for mapping_dict in possible_mappings:
        score = mapping.mapping_score(mapping_dict, source_sentences, target_sentences)
        scores.append((mapping_dict, score, len(mapping_dict)))
    scores = sorted(scores, key=lambda tup: (tup[1], tup[2]), reverse=True)
    results['Finding best mapping'] = time.time() - new_start
    results['Total time'] = time.time() - start

    # A source predicate counts as unary when the second element returned by
    # mapping.get_types has length 1; its first element is the recorded name.
    unaries = []
    for srcPred in srcPreds:
        s = mapping.get_types(srcPred)
        if len(s[1]) == 1:
            unaries.append(s[0])

    # Convert each ranked mapping dict into human-readable rule strings.
    ret = []
    for mapping_dict, score, size in scores:
        mapd = [_format_mapping_rule(key, value, key in unaries)
                for key, value in mapping_dict.items()]
        ret.append((mapd, score, size))
    return (ret, results)

In [16]:
# Seed forwarded to datasets.load by the cells below.
seed = 0

# Mode declarations per dataset, keyed by dataset name.
# NOTE(review): the '+' / '-' prefixes are presumably BoostSRL mode markers
# (input / output argument) -- confirm against the BoostSRL documentation.
bk = {
    'imdb': [
        # relations between people
        'workedunder(+person,+person).',
        'workedunder(+person,-person).',
        'workedunder(-person,+person).',
        # unary person predicates
        'female(+person).',
        'actor(+person).',
        'director(+person).',
        # movies and genres
        'movie(+movie,+person).',
        'movie(+movie,-person).',
        'movie(-movie,+person).',
        'genre(+person,+genre).',
    ],
    'uwcse': [
        # unary person predicates
        'professor(+person).',
        'student(+person).',
        # advising relations
        'advisedby(+person,+person).',
        'advisedby(+person,-person).',
        'advisedby(-person,+person).',
        'tempadvisedby(+person,+person).',
        'tempadvisedby(+person,-person).',
        'tempadvisedby(-person,+person).',
        # the only ternary predicate
        'ta(+course,+person,+quarter).',
        'ta(-course,-person,+quarter).',
        'ta(+course,-person,-quarter).',
        'ta(-course,+person,-quarter).',
        # remaining binary predicates
        'hasposition(+person,+faculty).',
        'hasposition(+person,-faculty).',
        'hasposition(-person,+faculty).',
        'publication(+title,+person).',
        'publication(+title,-person).',
        'publication(-title,+person).',
        'inphase(+person,+prequals).',
        'inphase(+person,-prequals).',
        'inphase(-person,+prequals).',
        'courselevel(+course,+level).',
        'courselevel(+course,-level).',
        'courselevel(-course,+level).',
        'yearsinprogram(+person,+year).',
        'yearsinprogram(-person,+year).',
        'yearsinprogram(+person,-year).',
        'projectmember(+project,+person).',
        'projectmember(+project,-person).',
        'projectmember(-project,+person).',
        # same-type equality style predicates
        'sameproject(+project,+project).',
        'sameproject(+project,-project).',
        'sameproject(-project,+project).',
        'samecourse(+course,+course).',
        'samecourse(+course,-course).',
        'samecourse(-course,+course).',
        'sameperson(+person,+person).',
        'sameperson(+person,-person).',
        'sameperson(-person,+person).',
    ],
}

In [30]:
# Experiment configuration: transfer from `source` to `target`.
# Both names are keys of the `bk` dictionary.
source = 'imdb'
target = 'uwcse'
# Predicate learned on the source dataset.
predicate = 'workedunder'
# Target predicate the source predicate is forced to map to; it is also the
# predicate learned on the target dataset.
to_predicate = 'advisedby'

# Values passed as `balanced=` to datasets.load for the source and the target
# dataset respectively.
source_balanced = 1
balanced = 1
# NOTE(review): firstRun, n_runs and folds are not read by any cell visible
# here -- confirm whether they are still needed.
firstRun = False
n_runs = 28
folds = 3
print_function = None

# Settings forwarded to boostsrl.modes.
nodeSize = 2
numOfClauses = 8
maxTreeDepth = 3

# Load source dataset
# src_total_data is loaded without a target predicate; src_data is loaded for
# `predicate` only.
src_total_data = datasets.load(source, bk[source], seed=seed)
src_data = datasets.load(source, bk[source], target=predicate, balanced=source_balanced, seed=seed)

# Group and shuffle
# Elements 0, 1 and 2 of src_data are used as facts, positive examples and
# negative examples.
src_facts = datasets.group_folds(src_data[0])
src_pos = datasets.group_folds(src_data[1])
src_neg = datasets.group_folds(src_data[2])

In [31]:
# Learn a model for `predicate` on the source dataset.
background = boostsrl.modes(
    bk[source], [predicate],
    useStdLogicVariables=False, maxTreeDepth=maxTreeDepth,
    nodeSize=nodeSize, numOfClauses=numOfClauses)
[model, total_revision_time, source_structured, will, variances] = revision.learn_model(
    background, boostsrl, predicate, src_pos, src_neg, src_facts,
    refine=None, trees=10, print_function=print_function)

# Predicates that appear in the learned source structure.
preds = mapping.get_preds(source_structured, bk[source])
# Wrap preds in a 1-tuple so '%' formats it as a single value even if
# get_preds ever returns a tuple; a bare tuple would raise TypeError.
print('Predicates from source: %s\n' % (preds,))

# Only the first fold is used.
i = 0

# Load total target dataset
tar_total_data = datasets.load(target, bk[target], seed=seed)

# NOTE: despite the names, these hold folds of tar_total_data[0]. The train
# part is passed to get_best below as the target facts.
[tar_train_pos, tar_test_pos] = datasets.get_kfold_small(i, tar_total_data[0])

# Search for mappings from the source predicates to the target domain, forcing
# `predicate` to map to `to_predicate`.
mapping_rules, mapping_results = get_best(
    preds, bk[target], datasets.group_folds(src_total_data[0]), tar_train_pos,
    forceHead=to_predicate)

Predicates from source: ['workedunder(person,person).', 'director(person).', 'movie(movie,person).', 'actor(person).']



In [32]:
# Number of ranked candidate mappings returned by get_best.
len(mapping_rules)

180

In [33]:
# Show every (rule strings, score, mapping size) tuple, best score first.
mapping_rules

[(['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> professor(A)'],
  0.055205047318611984,
  4),
 (['workedunder(A,B) -> advisedby(B,A)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> student(A)'],
  0.055205047318611984,
  4),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> publication(A,B)'],
  0.03761755485893417,
  3),
 (['workedunder(A,B) -> advisedby(B,A)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> publication(A,B)'],
  0.03761755485893417,
  3),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> professor(A)'],
  0.03732503888024884,
  4),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> student(A)'],
  0.03732503888024884,
  4),
 (['workedunder(A,B) -> advisedby(B,A)

In [23]:
# Load new predicate target dataset
new_target = to_predicate
tar_data = datasets.load(target, bk[target], target=new_target, balanced=balanced, seed=seed)

# Group and shuffle
# Split fold `i` into train/test parts for facts, positives and negatives.
# NOTE: this overwrites the tar_train_pos / tar_test_pos assigned in the
# mapping cell, where they held facts of the full target dataset.
[tar_train_facts, tar_test_facts] =  datasets.get_kfold_small(i, tar_data[0])
[tar_train_pos, tar_test_pos] =  datasets.get_kfold_small(i, tar_data[1])
[tar_train_neg, tar_test_neg] =  datasets.get_kfold_small(i, tar_data[2])

background = boostsrl.modes(bk[target], [new_target], useStdLogicVariables=False, maxTreeDepth=maxTreeDepth, nodeSize=nodeSize, numOfClauses=numOfClauses)

# Evaluate every candidate mapping: one learn/test run per entry of
# mapping_rules, collecting (rule strings, AUC ROC) pairs.
aucroc_scores = []
for rule in mapping_rules:
    # Transfer the source structure through this mapping (rule[0] holds the
    # rule strings), then learn and test on the target data with the
    # transferred structure supplied as the refine file.
    transferred_structured = transfer.transfer(source_structured, rule[0])
    [model, t_results, structured, will, variances] = revision.learn_test_model(background, boostsrl, new_target, tar_train_pos, tar_train_neg, tar_train_facts, tar_test_pos, tar_test_neg, tar_test_facts, refine=revision.get_boosted_refine_file(transferred_structured), trees=10, print_function=print_function)
    res = (rule[0], t_results['AUC ROC'])
    print(res)
    aucroc_scores.append(res)

(['workedunder(A,B) -> advisedby(A,B)', 'actor(A) -> student(A)', 'movie(A,B) -> publication(A,B)', 'director(A) -> professor(A)'], 0.960059)
(['workedunder(A,B) -> advisedby(B,A)', 'actor(A) -> professor(A)', 'movie(A,B) -> publication(A,B)', 'director(A) -> student(A)'], 0.960059)
(['workedunder(A,B) -> advisedby(A,B)', 'actor(A) -> student(A)', 'movie(A,B) -> sameperson(A,B)', 'director(A) -> professor(A)'], 0.948718)
(['workedunder(A,B) -> advisedby(A,B)', 'actor(A) -> student(A)', 'movie(A,B) -> publication(A,B)'], 0.731262)
(['workedunder(A,B) -> advisedby(B,A)', 'actor(A) -> professor(A)', 'movie(A,B) -> publication(A,B)'], 0.917899)
(['workedunder(A,B) -> advisedby(A,B)', 'actor(A) -> professor(A)', 'movie(A,B) -> publication(A,B)', 'director(A) -> professor(A)'], 0.544872)
(['workedunder(A,B) -> advisedby(A,B)', 'actor(A) -> student(A)', 'movie(A,B) -> publication(A,B)', 'director(A) -> student(A)'], 0.820513)
(['workedunder(A,B) -> advisedby(B,A)', 'actor(A) -> professor(A)',

In [24]:
# Rank the evaluated mappings by AUC ROC, highest first.
aucroc_scores = sorted(aucroc_scores, key=lambda tup: (tup[1]), reverse=True)

In [25]:
# Show the (rule strings, AUC ROC) pairs in ranked order.
aucroc_scores

[(['workedunder(A,B) -> advisedby(A,B)'], 0.969181),
 (['workedunder(A,B) -> advisedby(B,A)'], 0.969181),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> professor(A)'],
  0.960059),
 (['workedunder(A,B) -> advisedby(B,A)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> publication(A,B)',
   'director(A) -> student(A)'],
  0.960059),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> sameperson(A,B)',
   'director(A) -> professor(A)'],
  0.948718),
 (['workedunder(A,B) -> advisedby(A,B)',
   'actor(A) -> student(A)',
   'movie(A,B) -> sameperson(B,A)',
   'director(A) -> professor(A)'],
  0.948718),
 (['workedunder(A,B) -> advisedby(B,A)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> sameperson(A,B)',
   'director(A) -> student(A)'],
  0.948718),
 (['workedunder(A,B) -> advisedby(B,A)',
   'actor(A) -> professor(A)',
   'movie(A,B) -> sameperson(B,A)',
   'director(A