# Relation extraction using distant supervision

# The corpus

In [1]:
from corpus import read_examples, Corpus, show_examples_for_pair
from knowledge_base import read_kb_triples, KB, count_relation_combinations
from evaluation import *
from dataset import find_unrelated_pairs, split_corpus_and_kb
from collections import Counter, defaultdict, namedtuple
from EDA import find_common_middles
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
import random
from featurizer import simple_bag_of_words_featurizer

In [2]:
examples = read_examples()

Reading examples from rel_ext_data/corpus.tsv.gz
Read 414123 examples


In [3]:
ex = examples[1]
' '.join((ex.left, ex.mention_1, ex.middle, ex.mention_2, ex.right))

'to all Spanish-occupied lands . The horno has a beehive shape and uses wood as the only heat source . The procedure still used in parts of New Mexico and Arizona is to build a fire inside the Horno and , when the proper amount of time has passed , remove the embers and ashes and insert the'

In [4]:
counter = Counter()
for example in examples:
    counter[example.entity_1] += 1
    counter[example.entity_2] += 1
print('The corpus contains {} entities'.format(len(counter)))
counts = sorted([(count, key) for key, count in counter.items()], reverse=True)
print('The most common entities are:')
for count, key in counts[:20]:
    print('{:10d} {}'.format(count, key))

The corpus contains 107820 entities
The most common entities are:
      9399 India
      6214 England
      4585 Germany
      4486 France
      4128 Australia
      3939 China
      3930 Canada
      3897 Italy
      3368 California
      3125 Pakistan
      3103 Europe
      3097 New_York_City
      3025 London
      2470 Japan
      2468 United_Kingdom
      2279 New_Zealand
      2275 New_York
      2259 Spain
      2132 Philippines
      2120 Asia


In [5]:
corpus = Corpus(examples)
corpus

Corpus with 414123 examples

In [6]:
show_examples_for_pair('Steve_Jobs', 'Pixar', corpus)

The first of 9 examples for Steve_Jobs and Pixar is:
Example(entity_1='Steve_Jobs', entity_2='Pixar', left='of visual effects on films like The Abyss ( 1989 ) , Terminator 2 ( 1991 ) and Jurassic Park ( 1993 ) The computer graphics division of ILM was bought by', mention_1='Steve Jobs', middle='and became', mention_2='Pixar', right=', who would go on to make several groundbreaking animated films starting with Toy Story ( 1995 ) – more information on the history of that here', left_POS='of/IN visual/JJ effects/NNS on/IN films/NNS like/IN The/DT Abyss/NN -LRB-/-LRB- 1989/CD -RRB-/-RRB- ,/, Terminator/NNP 2/CD -LRB-/-LRB- 1991/CD -RRB-/-RRB- and/CC Jurassic/JJ Park/NN -LRB-/-LRB- 1993/CD -RRB-/-RRB- The/DT computer/NN graphics/NNS division/NN of/IN ILM/NNP was/VBD bought/VBN by/IN', mention_1_POS='Steve/NNP Jobs/NNP', middle_POS='and/CC became/VBD', mention_2_POS='Pixar/NNP', right_POS=',/, who/WP would/MD go/VB on/IN to/TO make/VB several/JJ groundbreaking/VBG animated/JJ films/NNS start

In [7]:
show_examples_for_pair('Pixar', 'Steve_Jobs', corpus)

The first of 2 examples for Pixar and Steve_Jobs is:
Example(entity_1='Pixar', entity_2='Steve_Jobs', left='in the visual accompaniment to his recordings of Bach ’ s Six Suites for Unaccompanied Cello . Ma has also been seen with Apple Inc. and former', mention_1='Pixar', middle='CEO', mention_2='Steve Jobs', right='. Ma is often invited to press events for Jobs ’ s companies , and has performed on stage during event keynote presentations , as well as appearing in', left_POS="in/IN the/DT visual/JJ accompaniment/NN to/TO his/PRP$ recordings/NNS of/IN Bach/NNP '/POS s/NNS Six/CD Suites/NNP for/IN Unaccompanied/NNP Cello/NNP ./. Ma/NNP has/VBZ also/RB been/VBN seen/VBN with/IN Apple/NNP Inc./NNP and/CC former/JJ", mention_1_POS='Pixar/NNP', middle_POS='CEO/NNP', mention_2_POS='Steve/NNP Jobs/NNP', right_POS="./. Ma/NNP is/VBZ often/RB invited/VBN to/TO press/VB events/NNS for/IN Jobs/NNP '/POS s/NNS companies/NNS ,/, and/CC has/VBZ performed/VBN on/IN stage/NN during/IN event/NN keynote/

# The knowledge base

In [8]:
kb_triples = read_kb_triples()

Reading KB triples from rel_ext_data/kb.tsv.gz ...
Read 56575 KB triples


In [9]:
kb = KB(kb_triples)
kb

KB with 56575 triples

In [10]:
all_relations = kb.get_all_relations()
print(len(all_relations))

16


In [11]:
for rel in all_relations:
    print('{:12d} {}'.format(len(kb.get_triples_for_relation(rel)), rel))

        2140 adjoins
        3316 author
         637 capital
       22489 contains
        4958 film_performance
        2404 founders
        1012 genre
        3280 has_sibling
        3774 has_spouse
        3153 is_a
        1981 nationality
        2013 parents
        1388 place_of_birth
        1031 place_of_death
        1526 profession
        1473 worked_at


In [12]:
for rel in all_relations:
    print(tuple(kb.get_triples_for_relation(rel)[0]))

('adjoins', 'Siegburg', 'Bonn')
('author', 'Uncle_Silas', 'Sheridan_Le_Fanu')
('capital', 'Tunisia', 'Tunis')
('contains', 'Brickfields', 'Kuala_Lumpur_Sentral_railway_station')
('film_performance', 'Colin_Hanks', 'The_Great_Buck_Howard')
('founders', 'Bomis', 'Jimmy_Wales')
('genre', 'SPARQL', 'Semantic_Web')
('has_sibling', 'Ari_Emanuel', 'Rahm_Emanuel')
('has_spouse', 'Percy_Bysshe_Shelley', 'Mary_Shelley')
('is_a', 'Bhanu_Athaiya', 'Costume_designer')
('nationality', 'Ruben_Rausing', 'Sweden')
('parents', 'Prince_Arthur_of_Connaught', 'Prince_Arthur,_Duke_of_Connaught_and_Strathearn')
('place_of_birth', 'William_Penny_Brookes', 'Much_Wenlock')
('place_of_death', 'Jean_Drapeau', 'Montreal')
('profession', 'Rufus_Wainwright', 'Actor')
('worked_at', 'Ray_Jackendoff', 'Tufts_University')


In [13]:
kb.get_triples_for_entities('France', 'Germany')

[KBTriple(rel='adjoins', sbj='France', obj='Germany')]

In [14]:
kb.get_triples_for_entities('Germany', 'France')

[KBTriple(rel='adjoins', sbj='Germany', obj='France')]

In [15]:
kb.get_triples_for_entities('Pixar', 'Steve_Jobs')

[KBTriple(rel='founders', sbj='Pixar', obj='Steve_Jobs')]

In [16]:
kb.get_triples_for_entities('Steve_Jobs', 'Pixar')

[KBTriple(rel='worked_at', sbj='Steve_Jobs', obj='Pixar')]

In [17]:
kb.get_triples_for_entities('Cleopatra', 'Ptolemy_XIII_Theos_Philopator')

[KBTriple(rel='has_sibling', sbj='Cleopatra', obj='Ptolemy_XIII_Theos_Philopator'),
 KBTriple(rel='has_spouse', sbj='Cleopatra', obj='Ptolemy_XIII_Theos_Philopator')]

In [18]:
counter = Counter()
for kbt in kb.get_triples():
    counter[kbt.sbj] += 1
    counter[kbt.obj] += 1
print('The KB contains {} entities'.format(len(counter)))
counts = sorted([(count, key) for key, count in counter.items()], reverse=True)
print('The most common entities are:')
for count, key in counts[:20]:
    print('{:10d} {}'.format(count, key))

The KB contains 46275 entities
The most common entities are:
       962 England
       815 India
       465 London
       456 Italy
       437 France
       420 Germany
       412 California
       396 United_Kingdom
       378 Canada
       324 New_York_City
       262 Actor
       248 New_York
       244 Australia
       235 China
       226 Philippines
       224 Japan
       223 Russia
       214 Scotland
       204 Europe
       177 Pakistan


### Joining the corpus and the KB

In [19]:
def count_examples(corpus, kb):
    counter = Counter()
    for rel in all_relations:
        for kbt in kb.get_triples_for_relation(rel):
            # count examples in both forward and reverse directions
            counter[rel] += len(corpus.get_examples_for_entities(kbt.sbj, kbt.obj))
            counter[rel] += len(corpus.get_examples_for_entities(kbt.obj, kbt.sbj))
    # report results
    print('{:20s} {:>10s} {:>10s} {:>10s}'.format('', '', '', 'examples'))
    print('{:20s} {:>10s} {:>10s} {:>10s}'.format('relation', 'examples', 'triples', '/triple'))
    print('{:20s} {:>10s} {:>10s} {:>10s}'.format('--------', '--------', '-------', '-------'))
    for rel in all_relations:
        nx = counter[rel]
        nt = len(kb.get_triples_for_relation(rel))
        print('{:20s} {:10d} {:10d} {:10.2f}'.format(rel, nx, nt, 1.0 * nx / nt))
        
count_examples(corpus, kb)

                                             examples
relation               examples    triples    /triple
--------               --------    -------    -------
adjoins                   85660       2140      40.03
author                    15822       3316       4.77
capital                   12520        637      19.65
contains                  99572      22489       4.43
film_performance          11195       4958       2.26
founders                   8061       2404       3.35
genre                      1941       1012       1.92
has_sibling               12332       3280       3.76
has_spouse                16188       3774       4.29
is_a                       6955       3153       2.21
nationality                4649       1981       2.35
parents                    5387       2013       2.68
place_of_birth             2214       1388       1.60
place_of_death             2047       1031       1.99
profession                 2876       1526       1.88
worked_at                  4

In [20]:
unrelated_pairs = find_unrelated_pairs(corpus, kb)
print('Found {} unrelated pairs, including:'.format(len(unrelated_pairs)))
for pair in list(unrelated_pairs)[:10]:
    print('   ', pair)

Found 301073 unrelated pairs, including:
    ('South_Africa', 'Afghanistan')
    ('Detroit_Zoo', 'Detroit_Institute_of_Arts')
    ('Charles_Darwin', 'Ernst_Mayr')
    ('Buddy_Rich', 'Ben_Webster')
    ('The_Flip_Wilson_Show', 'Some_Like_It_Hot')
    ('Edmund_Burke', 'Jeremy_Bentham')
    ('Ty_Cobb', 'Miller_Huggins')
    ('Fred_Lynn', 'Jim_Rice')
    ('Metallica', 'Guns_and_Roses')
    ('Cantilan', 'Lanuza')


In [21]:
count_relation_combinations(kb)

The most common relation combinations are:
      1526 ('is_a', 'profession')
       495 ('capital', 'contains')
       183 ('place_of_birth', 'place_of_death')
        76 ('nationality', 'place_of_birth')
        11 ('nationality', 'place_of_death')
        11 ('adjoins', 'contains')
         8 ('has_sibling', 'has_spouse')
         3 ('nationality', 'place_of_birth', 'place_of_death')
         2 ('parents', 'worked_at')
         1 ('nationality', 'worked_at')
         1 ('has_spouse', 'parents')
         1 ('author', 'founders')


### Building datasets

In [22]:
def build_datasets(corpus, kb, include_positive=True, sampling_rate=0.1, seed=1, KBTriple = namedtuple('KBTriple', 'rel, sbj, obj')):
    unrelated_pairs = find_unrelated_pairs(corpus, kb)
    random.seed(seed)
    unrelated_pairs = random.sample(unrelated_pairs, int(sampling_rate * len(unrelated_pairs)))
    kbts_by_rel = defaultdict(list)
    labels_by_rel = defaultdict(list)
    for index, rel in enumerate(all_relations):
        if include_positive:
            for kbt in kb.get_triples_for_relation(rel):
                kbts_by_rel[rel].append(kbt)
                labels_by_rel[rel].append(True)
        for sbj, obj in unrelated_pairs:
            kbts_by_rel[rel].append(KBTriple(rel, sbj, obj))
            labels_by_rel[rel].append(False)        
    return kbts_by_rel, labels_by_rel

In [23]:
kbts_by_rel, labels_by_rel = build_datasets(corpus, kb, include_positive=True, sampling_rate=0.1, seed=1)

## Evaluation

In [24]:
data = split_corpus_and_kb(corpus, kb,seed=1)

In [25]:
def build_datasets_for_split(split, include_positive=True, sampling_rate=0.1, seed=1):
    return build_datasets(data[split]['corpus'], data[split]['kb'], include_positive, sampling_rate, seed)

In [26]:
def evaluate(classifier, test_split='dev', verbose=True):
    test_kbts_by_rel, true_labels_by_rel = build_datasets_for_split(test_split)
    results = {}
    if verbose:
        print_statistics_header()
    for rel in all_relations:
        pred_labels = classifier(test_kbts_by_rel[rel])
        stats = precision_recall_fscore_support(true_labels_by_rel[rel], pred_labels, beta=0.5)
        stats = [stat[1] for stat in stats]  # stats[1] is the stat for label True
        stats.append(len(pred_labels))  # number of examples
        results[rel] = stats
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [27]:
def lift(f):
    return lambda xs: [f(x) for x in xs]

def make_random_classifier(p=0.50):
    def random_classify(kb_triple):
        return random.random() < p
    return lift(random_classify)

In [28]:
evaluate(make_random_classifier())

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.058      0.515      0.070        303       5319
author                    0.088      0.508      0.106        480       5496
capital                   0.019      0.539      0.024         89       5105
contains                  0.349      0.502      0.371       2667       7683
film_performance          0.138      0.491      0.162        822       5838
founders                  0.064      0.482      0.078        359       5375
genre                     0.039      0.608      0.048        166       5182
has_sibling               0.092      0.493      0.109        513       5529
has_spouse                0.110      0.530      0.130        575       5591
is_a                      0.099      0.547      0.119        494       5510
nationality               0.054      0.463      0.066        311       5327
parents     

0.09757501010273492

In [29]:
find_common_middles(data,all_relations,show_output=True)

adjoins              fwd         8461 ,
adjoins              fwd         5633 and
adjoins              fwd          993 , and
adjoins              fwd         5599 ,
adjoins              fwd         3780 and
adjoins              fwd          680 , and
author               fwd         1214 by
author               fwd          155 ,
author               fwd          130 , by
author               fwd         1106 's
author               fwd          294 ‘ s
author               fwd          175 ’ s
capital              fwd           37 ,
capital              fwd           19 in
capital              fwd           18 (
capital              fwd         3711 ,
capital              fwd          178 in
capital              fwd           87 , the capital of
contains             fwd          460 's
contains             fwd          355 ,
contains             fwd          250 (
contains             fwd        25095 ,
contains             fwd         5603 in
contains             fwd          668 in

{'fwd': defaultdict(<function EDA.find_common_middles.<locals>.<lambda>>,
             {'adjoins': {',', ', and', 'and'},
              'author': {',', ', by', 'by'},
              'capital': {'(', ',', 'in'},
              'contains': {"'s", '(', ','},
              'film_performance': {"'s", 'film', 'in'},
              'founders': {',', 'co-founder', 'founder'},
              'genre': {', a',
               'in 1994 , he became a central figure in the',
               'is a'},
              'has_sibling': {',', ', and', 'and'},
              'has_spouse': {',', 'and', 'and his wife'},
              'is_a': {',', ', a', 'and'},
              'nationality': {'in', 'of', 'of the'},
              'parents': {',', ', son of', 'and'},
              'place_of_birth': {'in', 'of', 'was born in'},
              'place_of_death': {'at', 'in', 'of'},
              'profession': {',', ', a', 'and'},
              'worked_at': {"'s", 'at', 'of'}}),
 'rev': defaultdict(<function EDA.find_common_m

In [30]:
def train_top_k_middles_classifier(train_split='train', top_k=3):
    corpus = data[train_split]['corpus']
    top_k_mids_by_rel = find_common_middles(data,all_relations,split=train_split, top_k=top_k)
    def classify(kb_triple):
        fwd_mids = top_k_mids_by_rel['fwd'][kb_triple.rel]
        rev_mids = top_k_mids_by_rel['rev'][kb_triple.rel]
        for ex in corpus.get_examples_for_entities(kb_triple.sbj, kb_triple.obj):
            if ex.middle in fwd_mids:
                return True
        for ex in corpus.get_examples_for_entities(kb_triple.obj, kb_triple.sbj):
            if ex.middle in rev_mids:
                return True
        return False
    return lift(classify)

In [31]:
evaluate(train_top_k_middles_classifier())

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.337      0.406      0.349        303       5319
author                    0.228      0.058      0.144        480       5496
capital                   0.101      0.191      0.111         89       5105
contains                  0.537      0.066      0.220       2667       7683
film_performance          0.400      0.002      0.012        822       5838
founders                  0.196      0.061      0.136        359       5375
genre                     0.000      0.000      0.000        166       5182
has_sibling               0.320      0.222      0.294        513       5529
has_spouse                0.380      0.249      0.344        575       5591
is_a                      0.021      0.010      0.017        494       5510
nationality               0.133      0.039      0.089        311       5327
parents     

0.1203641255018017

In [32]:
def featurize_datasets(
        kbts_by_rel,
        corpus,
        featurizers=[simple_bag_of_words_featurizer],
        vectorizer=None):
    # Create feature counters for all instances (kbts).
    feat_counters_by_rel = defaultdict(list)
    for rel, kbts in kbts_by_rel.items():
        for kbt in kbts:
            feature_counter = Counter()
            for featurizer in featurizers:
                featurizer(kbt, corpus, feature_counter)
            feat_counters_by_rel[rel].append(feature_counter)
    feat_matrices_by_rel = defaultdict(list)
    # If we haven't been given a Vectorizer, create one and fit it to all the feature counters.
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        def traverse_dicts():
            for dict_list in feat_counters_by_rel.values():
                for d in dict_list:
                    yield d
        vectorizer.fit(traverse_dicts())
    # Now use the Vectorizer to transform feature dictionaries into feature matrices.
    for rel, feat_counters in feat_counters_by_rel.items():
        feat_matrices_by_rel[rel] = vectorizer.transform(feat_counters)
    return feat_matrices_by_rel, vectorizer

In [33]:
def train_models(
        split='train',
        featurizers=[simple_bag_of_words_featurizer],
        model_factory=lambda: LogisticRegression(fit_intercept=True),
        verbose=True):
    if verbose: print('Building datasets')
    train_o, train_y = build_datasets_for_split(split=split)
    if verbose: print('Featurizing')
    train_X, vectorizer = featurize_datasets(train_o, data[split]['corpus'], featurizers)
    models = {}
    if verbose: print('Training models')
    for rel in all_relations:
        models[rel] = model_factory()
        models[rel].fit(train_X[rel], train_y[rel])
    if verbose: print('Training complete\n')
    return {
        'featurizers': featurizers,
        'vectorizer': vectorizer,
        'models': models,
    }

In [36]:
def predict(split, featurizers, vectorizer, models):
    test_o, test_y = build_datasets_for_split(split=split)
    test_X, _ = featurize_datasets(test_o, data[split]['corpus'], featurizers, vectorizer=vectorizer)
    predictions = {}
    for rel in all_relations:
        predictions[rel] = models[rel].predict(test_X[rel])
    return predictions, test_y

In [38]:
def evaluate_predictions(predictions, test_y, verbose=True):
    results = {}  # one result row for each relation
    if verbose:
        print_statistics_header()
    for rel in all_relations:
        stats = precision_recall_fscore_support(test_y[rel], predictions[rel], beta=0.5)
        stats = [stat[1] for stat in stats]  # stats[1] is the stat for label True
        stats.append(len(test_y[rel]))
        results[rel] = stats
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic

In [39]:
def experiment(
        train_split='train',
        test_split='dev',
        featurizers=[simple_bag_of_words_featurizer],
        model_factory=lambda: LogisticRegression(fit_intercept=True),
        verbose=True):
    train_result = train_models(train_split, featurizers, model_factory, verbose)
    predictions, test_y = predict(test_split,
                                  featurizers,
                                  train_result['vectorizer'],
                                  train_result['models'])
    evaluate_predictions(predictions, test_y, verbose)
    return train_result

In [40]:
_ = experiment()

Building datasets
Featurizing
Training models
Training complete

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.944      0.449      0.774        303       5319
author                    0.812      0.548      0.740        480       5496
capital                   0.704      0.213      0.482         89       5105
contains                  0.755      0.606      0.720       2667       7683
film_performance          0.807      0.569      0.745        822       5838
founders                  0.800      0.401      0.667        359       5375
genre                     0.623      0.229      0.463        166       5182
has_sibling               0.861      0.242      0.569        513       5529
has_spouse                0.912      0.360      0.698        575       5591
is_a                      0.690      0.235      0.497        494       5510
nationality            

In [41]:
def examine_model_weights(
        train_split='train',
        featurizers=[simple_bag_of_words_featurizer],
        model_factory=lambda: LogisticRegression(fit_intercept=True),
        k=3,
        verbose=True):
    train_result = train_models(train_split, featurizers, model_factory, verbose)
    feature_names = train_result['vectorizer'].get_feature_names()
    for rel, model in train_result['models'].items():
        print('Highest and lowest feature weights for relation {}:\n'.format(rel))
        sorted_weights = sorted([(wgt, idx) for idx, wgt in enumerate(model.coef_[0])], reverse=True)
        for wgt, idx in sorted_weights[:k]:
            print('{:10.3f} {}'.format(wgt, feature_names[idx]))
        print('{:>10s} {}'.format('.....', '.....'))
        for wgt, idx in sorted_weights[-k:]:
            print('{:10.3f} {}'.format(wgt, feature_names[idx]))
        print()

In [42]:
examine_model_weights()

Building datasets
Featurizing
Training models
Training complete

Highest and lowest feature weights for relation adjoins:

     2.557 Córdoba
     2.399 Taluks
     1.771 Southwest
     ..... .....
    -1.301 22
    -1.324 India
    -1.499 Cook

Highest and lowest feature weights for relation author:

     2.728 author
     2.512 Szathmáry
     2.409 book
     ..... .....
    -2.133 or
    -2.378 directed
    -2.999 1818

Highest and lowest feature weights for relation capital:

     3.625 capital
     1.887 headquarters
     1.765 km
     ..... .....
    -1.470 largest
    -1.666 Madras
    -1.951 Antrim

Highest and lowest feature weights for relation contains:

     3.652 Channel
     2.235 continent
     2.117 districts
     ..... .....
    -2.375 Cook
    -2.394 appeared
    -4.158 Antrim

Highest and lowest feature weights for relation film_performance:

     4.081 starring
     3.584 alongside
     3.355 opposite
     ..... .....
    -1.734 or
    -1.922 then
    -3.863 double



In [43]:
def find_new_relation_instances(
        train_split='train',
        test_split='dev',
        featurizers=[simple_bag_of_words_featurizer],
        model_factory=lambda: LogisticRegression(fit_intercept=True),
        k=10,
        verbose=True):

    # train models
    train_result = train_models(train_split, featurizers, model_factory, verbose)

    # build datasets for negative instances only
    neg_o, neg_y = build_datasets_for_split(test_split, include_positive=False, sampling_rate=1.0)
    neg_X, _ = featurize_datasets(neg_o,
                                  data[test_split]['corpus'],
                                  featurizers,
                                  train_result['vectorizer'])

    # report highest confidence predictions
    for rel, model in train_result['models'].items():
        print('Highest probability examples for relation {}:\n'.format(rel))
        probs = model.predict_proba(neg_X[rel])
        probs = [prob[1] for prob in probs] # probability for class True
        sorted_probs = sorted([(p, idx) for idx, p in enumerate(probs)], reverse=True)
        for p, idx in sorted_probs[:k]:
            print('{:10.3f} {}'.format(p, neg_o[rel][idx]))
        print()

In [44]:
find_new_relation_instances()

Building datasets
Featurizing
Training models
Training complete

Highest probability examples for relation adjoins:

     1.000 KBTriple(rel='adjoins', sbj='Sun', obj='Moon')
     1.000 KBTriple(rel='adjoins', sbj='Moon', obj='Sun')
     1.000 KBTriple(rel='adjoins', sbj='India', obj='Maharashtra')
     1.000 KBTriple(rel='adjoins', sbj='Maharashtra', obj='India')
     1.000 KBTriple(rel='adjoins', sbj='Kashmir', obj='India')
     1.000 KBTriple(rel='adjoins', sbj='India', obj='Kashmir')
     1.000 KBTriple(rel='adjoins', sbj='Europe', obj='Great_Britain')
     1.000 KBTriple(rel='adjoins', sbj='Great_Britain', obj='Europe')
     0.999 KBTriple(rel='adjoins', sbj='Ukraine', obj='Chernobyl_Nuclear_Power_Plant')
     0.999 KBTriple(rel='adjoins', sbj='Chernobyl_Nuclear_Power_Plant', obj='Ukraine')

Highest probability examples for relation author:

     1.000 KBTriple(rel='author', sbj='The_Doors_of_Perception', obj='Aldous_Huxley')
     1.000 KBTriple(rel='author', sbj='Aldous_Huxley', 

     1.000 KBTriple(rel='parents', sbj='Isaac', obj='Abraham')
     1.000 KBTriple(rel='parents', sbj='Abraham', obj='Isaac')
     1.000 KBTriple(rel='parents', sbj='Abraham', obj='Ishmael')
     1.000 KBTriple(rel='parents', sbj='Ishmael', obj='Abraham')
     1.000 KBTriple(rel='parents', sbj='Kim_Jong-il', obj='Kim_Jong-un')
     1.000 KBTriple(rel='parents', sbj='Kim_Jong-un', obj='Kim_Jong-il')
     1.000 KBTriple(rel='parents', sbj='Anne_Boleyn', obj='Elizabeth_I_of_England')
     1.000 KBTriple(rel='parents', sbj='Elizabeth_I_of_England', obj='Anne_Boleyn')
     1.000 KBTriple(rel='parents', sbj='Indira_Gandhi', obj='Rajiv_Gandhi')
     1.000 KBTriple(rel='parents', sbj='Rajiv_Gandhi', obj='Indira_Gandhi')

Highest probability examples for relation place_of_birth:

     1.000 KBTriple(rel='place_of_birth', sbj='Matale_District', obj='Sri_Lanka')
     1.000 KBTriple(rel='place_of_birth', sbj='Sri_Lanka', obj='Matale_District')
     1.000 KBTriple(rel='place_of_birth', sbj='North_I