In [2]:
import pandas as pd
import numpy as np
import sys, os
import glob

from mitie import *
from collections import defaultdict

### Loading NER Model

In [3]:
# Load the named-entity extractor from the English model file shipped in
# ./MITIE-models; the later cells use it for both entity and relation extraction.
ner = named_entity_extractor(
    './MITIE-models/english/ner_model.dat'
)

### Loading Data Set

In [4]:
# Read the preprocessed, tab-separated corpus.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index, index values are parsed as dates where possible).
dataset = pd.read_csv('./data-preprocessed.tsv', sep='\t', index_col=0, parse_dates=True)

In [49]:
# Extract (first_entity, relation_type, second_entity) facts from the first
# `number` documents of the dataset.
#   titles[k] is the index label of the document that produced facts[k].
titles = []
facts = []
number = 100

# Load every binary relation detector exactly once.  The model files do not
# depend on the document being processed, so loading them inside the document
# loop only repeats the same disk reads for each document.
rel_classifier_names = glob.glob("./MITIE-models/english/binary_relations/*.svm")
rel_detectors = [
    # The relation label is the second-to-last dot-separated piece of the path,
    # e.g. a file name ending in ".place_of_birth.svm" yields "place_of_birth".
    (rel_classifier_name.split(".")[-2], binary_relation_detector(rel_classifier_name))
    for rel_classifier_name in rel_classifier_names
]

contents = dataset["Content"]
# Clamp to the dataset size so a corpus with fewer than `number` rows does not
# raise IndexError.
for idx in range(min(number, len(contents))):
    content = contents.values[idx]
    title = contents.index[idx]
    tokens = tokenize(content)

    # entities is a list of tuples, each containing a range that indicates which
    # tokens are part of the entity, the entity tag, and an associated score.
    # The entities are listed in the order they appear in the input text.
    entities = ner.extract_entities(tokens)

    # Candidate pairs are neighboring entities, in both orders, because a
    # relation can be mentioned with its arguments in either order.
    neighboring_entities = [(entities[i][0], entities[i+1][0]) for i in range(len(entities)-1)]
    neighboring_entities += [(r, l) for (l, r) in neighboring_entities]

    # Detection has two steps in MITIE.  First, a pair of entities is converted
    # into an intermediate representation.  That representation is independent
    # of the detector, so it is computed once per pair and reused for every
    # detector below.
    candidate_relations = [
        (first_entity, second_entity,
         ner.extract_binary_relation(tokens, first_entity, second_entity))
        for first_entity, second_entity in neighboring_entities
    ]

    # Second, each detector classifies each pair.  Iterating detectors in the
    # outer loop keeps the facts in the same order as before: grouped by
    # detector, then by entity pair.
    for relation_type, rel_detector in rel_detectors:
        for first_entity, second_entity, rel in candidate_relations:
            # A score > 0 means the detector found the relation; the larger the
            # score, the more confident it is.
            score = rel_detector(rel)
            if score > 0:
                first_entity_text = " ".join(tokens[i] for i in first_entity)
                second_entity_text = " ".join(tokens[i] for i in second_entity)
                facts.append([first_entity_text, relation_type, second_entity_text])
                titles.append(title)

In [50]:
# Assemble the extracted facts into a table indexed by source document title.
# reshape(-1, 3) is a no-op when facts were found (the array is already n x 3),
# but turns the 1-D array produced by an empty `facts` list into a 0 x 3 array,
# so the three column labels below still match and an empty frame is returned
# instead of a shape-mismatch error.
facts_numpy = np.array(facts).reshape(-1, 3)
titles_numpy = np.array(titles)
ner_data_frame = pd.DataFrame(
    facts_numpy,
    index=titles_numpy,
    columns=['first_entity', 'relation_type', 'second_entity'],
)
ner_data_frame

Unnamed: 0,first_entity,relation_type,second_entity
1951 San Jose State Spartans football team,California,contains,San Jose
The Araca Group,Matthew Rego,organizations_founded,Tony Award-winning
The Araca Group,Araca,organizations_founded,Araca Group
The Araca Group,James Naughton,directed_by,Mike Nichols
The Araca Group,The Vagina Monologues,directed_by,Mike Nichols
The Araca Group,Hank Unger,influenced_by,Michael Rego
The Araca Group,Araca Group,author,Gregory Maguire
The Araca Group,Cloud Nine,author,Caryl Churchill
The Araca Group,Dallas,author,Debbie
The Araca Group,Rock of Ages,includes_event,Boeing-Boeing
