-
Notifications
You must be signed in to change notification settings - Fork 11
/
distantSupervision.py
executable file
·134 lines (121 loc) · 4.69 KB
/
distantSupervision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
def loadTargetTypes(filename):
    """Load a two-column, tab-separated type mapping into a dict.

    Each line of ``filename`` is split on tabs; the first field becomes the
    key and the second field its value. Fields after the second are ignored,
    and a later line with the same key overwrites an earlier one.

    Args:
        filename: Path of the mapping file to read.

    Returns:
        dict mapping first-column strings to second-column strings.
    """
    typeMap = {}
    with open(filename, 'r') as fin:
        for line in fin:
            seg = line.strip('\r\n').split('\t')
            if len(seg) < 2:
                # Blank or single-column line: there is nothing to map, and
                # indexing seg[1] would raise IndexError.
                continue
            typeMap[seg[0]] = seg[1]
    return typeMap
def _loadMid2Types(mid2typeFname, targetEMTypes):
    """Read the mid -> type file and return {mid: set of target type labels}."""
    mid2types = {}
    with open(mid2typeFname, 'r') as fin:
        for line in fin:
            seg = line.strip('\r\n').split('\t')
            if len(seg) < 2:
                continue  # blank / malformed line
            # Keep the text after the last '/' and drop its final character
            # (presumably the closing '>' of an RDF URI -- TODO confirm).
            fbType = seg[1].split('/')[-1][:-1]
            if fbType in targetEMTypes:
                mid2types.setdefault(seg[0], set()).add(targetEMTypes[fbType])
    return mid2types


def _loadMids2Relation(relationTupleFname, targetRMTypes, mid2types):
    """Read the facts file and return {(mid1, mid2): set of relation labels}."""
    mids2relation = {}
    with open(relationTupleFname, 'r') as fin:
        for line in fin:
            seg = line.strip('\r\n').split('\t')
            if len(seg) < 3:
                continue  # blank / malformed line
            mid1 = seg[0]
            mid2 = seg[2]
            relType = seg[1].split('/')[-1][:-1]
            # Only keep target relations whose both arguments have a target
            # entity type.
            if relType in targetRMTypes and mid1 in mid2types and mid2 in mid2types:
                mids2relation.setdefault((mid1, mid2), set()).add(targetRMTypes[relType])
    return mids2relation


def _loadName2Mids(mid2nameFname, mid2types):
    """Read the mid -> name file and return {lower-cased name: set of mids}."""
    name2mids = {}
    with open(mid2nameFname, 'r') as fin:
        for line in fin:
            seg = line.strip('\r\n').split('\t')
            if len(seg) < 2:
                continue  # blank / malformed line
            mid = seg[0]
            name = seg[1].lower()
            if mid in mid2types and name.endswith('@en'):
                # Drop the first character and every '"@en' occurrence, i.e.
                # turn '"some name"@en' into 'some name'.
                name = name[1:].replace('"@en', '')
                name2mids.setdefault(name, set()).add(mid)
    return name2mids


def _buildRelationMentions(em2mids, mids2relation):
    """Return relation mentions for every ordered pair of linked mentions.

    ``em2mids`` maps (start, text) to the mids linked to that mention. Two
    mentions are paired when their start values differ; a pair is emitted only
    when at least one (mid1, mid2) combination is in ``mids2relation``.
    """
    relationMentions = []
    for (eid1, e1text), mids1 in em2mids.items():
        for (eid2, e2text), mids2 in em2mids.items():
            if eid2 == eid1:
                continue
            labels = set()
            for mid1 in mids1:
                for mid2 in mids2:
                    labels.update(mids2relation.get((mid1, mid2), ()))
            if labels:
                relationMentions.append({
                    'em1Text': e1text,
                    'em2Text': e2text,
                    # Sorted so the output does not depend on set order.
                    'label': ','.join(sorted(labels)),
                })
    return relationMentions


def linkToFB(jsonFname, outFname, mentionTypeRequired, entityTypesFname, relationTypesFname, freebase_dir):
    """Label entity (and optionally relation) mentions by exact name match.

    Reads one JSON object per line from ``jsonFname``. Each entity mention whose
    lower-cased ``text`` equals a loaded English name gets a comma-separated
    ``label`` of the target types of all matching mids; mentions without any
    type are dropped. Unless ``mentionTypeRequired`` is ``'em'``, the sentence's
    ``relationMentions`` is replaced by the relations found between pairs of
    linked mentions. The resulting object is written as one JSON line to
    ``outFname``.

    Labels are joined in sorted order so repeated runs give identical output.

    Args:
        jsonFname: Input file, one JSON sentence per line, each with an
            ``entityMentions`` list whose items have ``text`` and ``start``.
        outFname: Output file, overwritten.
        mentionTypeRequired: ``'em'`` skips relation linking (the facts file and
            ``relationTypesFname`` are then not opened); ``'rm'`` additionally
            removes ``entityMentions`` from each output sentence; any other
            value keeps both.
        entityTypesFname: Tab-separated map from type name to entity label,
            read with ``loadTargetTypes``.
        relationTypesFname: Tab-separated map from type name to relation
            label, read with ``loadTargetTypes``.
        freebase_dir: Directory holding ``freebase-mid-type.map``,
            ``freebase-mid-name.map`` and ``freebase-facts.txt``.
    """
    mid2typeFname = freebase_dir + '/freebase-mid-type.map'
    mid2nameFname = freebase_dir + '/freebase-mid-name.map'
    relationTupleFname = freebase_dir + '/freebase-facts.txt'

    targetEMTypes = loadTargetTypes(entityTypesFname)
    mid2types = _loadMid2Types(mid2typeFname, targetEMTypes)
    print('finish loading mid2typeFile')

    mids2relation = {}
    if mentionTypeRequired != 'em':
        # The facts file is only opened when relations are actually needed.
        targetRMTypes = loadTargetTypes(relationTypesFname)
        mids2relation = _loadMids2Relation(relationTupleFname, targetRMTypes, mid2types)
        print('finish loading relationTupleFile')

    name2mids = _loadName2Mids(mid2nameFname, mid2types)
    print('finish loading mid2nameFile')

    with open(jsonFname, 'r') as fin, open(outFname, 'w') as fout:
        for line in fin:
            line = line.strip('\r\n')
            if not line.strip():
                continue  # json.loads would raise on a blank line
            sentDic = json.loads(line)

            entityMentions = []
            em2mids = {}
            for em in sentDic['entityMentions']:
                types = set()
                mids = name2mids.get(em['text'].lower())
                if mids is not None:
                    # Keyed by (start, original text) so repeated surface forms
                    # at different positions stay distinct.
                    em2mids[(int(em['start']), em['text'])] = set(mids)
                    for mid in mids:
                        types.update(mid2types[mid])
                em['label'] = ','.join(sorted(types))
                if types:
                    entityMentions.append(em)
            sentDic['entityMentions'] = entityMentions

            if mentionTypeRequired != 'em':
                sentDic['relationMentions'] = _buildRelationMentions(em2mids, mids2relation)
            if mentionTypeRequired == 'rm':
                del sentDic['entityMentions']
            fout.write(json.dumps(sentDic) + '\n')
def getNegRMs(jsonFname, outputFname):
    """Add 'None'-labelled relation mentions for unrelated entity pairs.

    Reads one JSON object per line from ``jsonFname``. For every unordered pair
    of distinct entity-mention texts in a sentence that is not already covered
    by an existing relation mention (in either direction), a relation mention
    with ``label`` ``'None'`` is appended after the existing ones. Sentences
    that end up with at least one relation mention are written, one JSON object
    per line, to ``outputFname``; the others are omitted.

    Mention texts are paired in first-occurrence order, so the earlier text is
    always ``em1Text`` and the output is identical across runs.

    Args:
        jsonFname: Input file, one JSON sentence per line. ``entityMentions``
            items need a ``text`` key and ``relationMentions`` items need
            ``em1Text`` and ``em2Text``; either list may be absent and is then
            treated as empty.
        outputFname: Output file, overwritten.
    """
    with open(jsonFname, 'r') as fin, open(outputFname, 'w') as fout:
        for line in fin:
            line = line.strip('\r\n')
            if not line.strip():
                continue  # json.loads would raise on a blank line
            sentDic = json.loads(line)

            # Distinct mention texts, kept in order of first appearance so
            # that pair generation does not depend on set iteration order.
            ems = []
            seenTexts = set()
            for em in sentDic.get('entityMentions', []):
                text = em['text']
                if text not in seenTexts:
                    seenTexts.add(text)
                    ems.append(text)

            relationMentions = list(sentDic.get('relationMentions', []))
            # Existing pairs are direction-agnostic: (A, B) also covers (B, A).
            rms = set(frozenset([rm['em1Text'], rm['em2Text']]) for rm in relationMentions)

            # ems holds distinct texts, so each unordered pair is visited once.
            for i, em1 in enumerate(ems):
                for em2 in ems[i + 1:]:
                    if frozenset([em1, em2]) in rms:
                        continue
                    relationMentions.append({
                        'em1Text': em1,
                        'em2Text': em2,
                        'label': 'None',
                    })

            if relationMentions:
                sentDic['relationMentions'] = relationMentions
                fout.write(json.dumps(sentDic) + '\n')