In [2]:
import pandas as pd
import numpy as np
import sys, os
import glob

from mitie import *
from collections import defaultdict

### Loading NER Model

In [3]:
# Load the English MITIE named-entity extractor from its model file on disk.
NER_MODEL_PATH = './MITIE-models/english/ner_model.dat'
ner = named_entity_extractor(NER_MODEL_PATH)

### Loading Data Set

In [42]:
# Read the tab-separated coreference test set into a DataFrame.  Later cells
# read its "Content" column and its first column.
DATASET_PATH = './data_test_coreference.tsv'
dataset = pd.read_csv(DATASET_PATH, sep='\t')

In [21]:
# Run MITIE entity extraction and binary relation detection over every article.
#
# Produces two parallel lists:
#   facts[k]  -> [first_entity_text, relation_type, second_entity_text]
#   titles[k] -> identifier (first dataset column) of the article facts[k] came from

# Minimum detector score for a relation to be kept.  MITIE treats any score > 0
# as a detection; this notebook deliberately uses a stricter cut-off.
SCORE_THRESHOLD = 0.5

titles = []
facts = []

# Load every binary relation detector exactly once.  The set of detectors does
# not depend on the article being processed, so loading them inside the article
# loop would re-read every model from disk for each row.  Sorting the file names
# makes the order in which facts are emitted reproducible.
rel_classifier_names = sorted(glob.glob("./MITIE-models/english/binary_relations/*.svm"))
rel_detectors = []
for rel_classifier_name in rel_classifier_names:
    # The relation name is the last dot-separated field before the "svm" suffix.
    relation_type = rel_classifier_name.split(".")[-2]
    rel_detectors.append((relation_type, binary_relation_detector(rel_classifier_name)))

contents = dataset["Content"].values
number = len(dataset)
print('Size:', number)
for idx in range(number):
    print(idx)
    # Identify the article by its first column so that it matches the keys of
    # the title -> row lookup that a later cell builds from dataset.iloc[i, 0].
    title = dataset.iloc[idx, 0]
    tokens = tokenize(contents[idx])

    # entities is a list of tuples, each containing a range that indicates which
    # tokens are part of the entity, the entity tag, and an associated score.
    # The entities are listed in the order they appear in the input text.
    entities = ner.extract_entities(tokens)
    entity_ranges = [entity[0] for entity in entities]

    # Candidate pairs are neighbouring entities, in both argument orders,
    # because a relation such as "person born in place" can also be phrased as
    # "place is birthplace of person".
    neighboring_entities = list(zip(entity_ranges, entity_ranges[1:]))
    neighboring_entities += [(r, l) for (l, r) in neighboring_entities]

    # Detection has two steps in MITIE.  First a pair of entities is converted
    # into an intermediate representation; this is done once per pair so the
    # same representation can be scored by every detector below.
    candidate_pairs = [
        (first_entity, second_entity,
         ner.extract_binary_relation(tokens, first_entity, second_entity))
        for first_entity, second_entity in neighboring_entities
    ]

    # Second, each detector classifies each pair.  The larger the score the
    # more confident the detector is.
    for relation_type, rel_detector in rel_detectors:
        for first_entity, second_entity, rel in candidate_pairs:
            score = rel_detector(rel)
            if score > SCORE_THRESHOLD:
                first_entity_text = " ".join(tokens[i].decode("utf-8") for i in first_entity)
                second_entity_text = " ".join(tokens[i].decode("utf-8") for i in second_entity)
                facts.append([first_entity_text, relation_type, second_entity_text])
                titles.append(title)

Size: 164
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [45]:
# Build lookups between article identifiers (first dataset column) and their
# zero-based row positions, then turn the extracted facts into a 2-D array.
article_titles = dataset.iloc[:, 0].tolist()

# Row position -> identifier.  The original stored the whole row here
# (dataset.iloc[i, ]), which does not mirror t_to_n.
n_to_t = dict(enumerate(article_titles))

# Identifier -> row position.  If an identifier occurs more than once, the last
# row wins, exactly as with repeated assignment in a loop.
t_to_n = {title: i for i, title in enumerate(article_titles)}

# Force shape (N, 3) so that column slicing in the next cell also works when no
# facts were extracted (np.array([]) would otherwise be one-dimensional).
facts_numpy = np.array(facts, dtype=str).reshape(-1, 3)


In [47]:
# Assemble the extracted facts into a table and write it out as a TSV file.

# Normalise to shape (N, 3): with zero facts the array can be one-dimensional
# and column slicing would raise IndexError.  For a non-empty (N, 3) array this
# is a no-op.
facts_table = np.asarray(facts_numpy).reshape(-1, 3)

dd = {
    # 1-based row number of the article each fact was found in.
    'id_article': [t_to_n[x] + 1 for x in titles],
    'article': titles,
    'entity1': facts_table[:, 0].tolist(),
    'relation': facts_table[:, 1].tolist(),
    'entity2': facts_table[:, 2].tolist(),
}

# Fix the column order explicitly rather than relying on dict ordering.
output_columns = ['id_article', 'article', 'entity1', 'relation', 'entity2']
ner_data_frame = pd.DataFrame(data=dd, columns=output_columns)
ner_data_frame.to_csv('mitie_content_coref_0.5.tsv', sep='\t')