#
# Tools for loading NER datasets
#
import os
import re
import glob
import random
import numpy as np
import collections
from trove.data.dataloaders.dataloaders import dataloader


###############################################################################
#
# Sampling
#
###############################################################################

def reservoir_sampling(iterable, n, seed=1234):
    """Standard reservoir sampling (Algorithm R) of a Python iterable."""
    # seed the `random` module so repeated runs draw the same sample
    random.seed(seed)
    pool = []
    for i, item in enumerate(iterable):
        if len(pool) < n:
            pool.append(item)
        else:
            # item i is kept with probability n / (i + 1)
            k = random.randint(0, i)
            if k < n:
                pool[k] = item
    return pool


def load_unlabeled_sample(fpath, num_samples, seed=1234, max_docs=100000):
    """
    Reservoir sample JSON documents. If `seed` and `max_docs` are fixed,
    then this returns a deterministic subsample of the docs at `fpath`.
    """
    filelist = glob.glob(f"{fpath}/*") if os.path.isdir(fpath) else [fpath]
    assert len(filelist) > 0
    sample = reservoir_sampling(dataloader(filelist), max_docs, seed)
    return sample[0:num_samples]
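
# Illustrative usage (the path below is hypothetical):
#
#   docs = load_unlabeled_sample("data/unlabeled_notes/", num_samples=500)
#
# With the same `seed` and `max_docs`, repeated calls over the same files
# return the same 500 documents.
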
###############################################################################
#
# Term Frequency
#
###############################################################################

def ngrams(seq, max_ngrams=4):
    """Yield all 1- to `max_ngrams`-grams of a token sequence, lowercased
    unless the n-gram is all-uppercase (e.g., an acronym)."""
    for i in range(0, len(seq)):
        # bound is len(seq) + 1 so the final token can end an n-gram
        for j in range(i + 1, min(i + max_ngrams + 1, len(seq) + 1)):
            term = tuple(seq[i:j]) if ' '.join(seq[i:j]).isupper() \
                else tuple(map(lambda x: x.lower(), seq[i:j]))
            yield term


def ngram_idf(sentences, max_ngrams=4):
    """Compute inverse document frequency (IDF) weights for all n-grams up to
    length `max_ngrams`, returned as {n: {term: idf}} where
    idf = log10(num_docs / doc_freq)."""
    doc_freq = {}
    for s in sentences:
        if s.document.name not in doc_freq:
            doc_freq[s.document.name] = {
                n: collections.Counter() for n in range(1, max_ngrams + 1)
            }
        words = [w for w in s.words if w.strip()]
        for tokens in set(ngrams(words, max_ngrams)):
            term = ' '.join(tokens)
            doc_freq[s.document.name][len(tokens)][term] = 1

    # document frequency per n-gram order
    freq = {n: collections.Counter() for n in range(1, max_ngrams + 1)}
    for name in doc_freq:
        for n in doc_freq[name]:
            for term in doc_freq[name][n]:
                freq[n][term] += 1

    # convert document frequencies to IDF weights
    for n in freq:
        freq[n] = {term: np.log10(len(doc_freq) / freq[n][term])
                   for term in freq[n]}
    return dict(freq)
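
# Worked example (illustrative numbers): with 100 documents, a term that
# appears in 10 of them gets weight log10(100 / 10) = 1.0, while a term
# appearing in all 100 gets log10(100 / 100) = 0.0, so rarer terms count
# more toward the dictionary coverage scores computed below.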


def get_dict_coverage(term_weights, dictionaries):
    """Sum the weights of all terms in `term_weights` found in each
    dictionary, i.e., IDF-weighted dictionary coverage."""
    scores = collections.defaultdict(float)
    for name in dictionaries:
        for term in term_weights:
            if term in dictionaries[name]:
                scores[name] += term_weights[term]
    return dict(scores)


def score_umls_ontologies(sentences, ontologies, concepts=None, max_ngrams=4):
    """Rank UMLS source vocabularies (SABs) by their IDF-weighted n-gram
    coverage of `sentences`. `ontologies` maps (SAB, STY) pairs to term sets;
    `concepts` optionally restricts scoring to a set of semantic types."""
    # compute term weights (IDF)
    idf = ngram_idf(sentences, max_ngrams=max_ngrams)

    # compute weighted coverage
    weights = collections.defaultdict(float)
    for ngram in idf:
        coverage = get_dict_coverage(idf[ngram], ontologies)
        for name in coverage:
            weights[name] += coverage[name]

    # restrict to some set of concepts/semantic types
    if concepts:
        sab2sty = collections.defaultdict(set)
        for (sab, sty) in weights:
            sab2sty[sab].add(sty)
        rm_sab = []
        for sab in sab2sty:
            if not sab2sty[sab].intersection(concepts):
                rm_sab.append(sab)
        rm = []
        for (sab, sty) in weights:
            if sab in rm_sab:
                rm.append((sab, sty))
        print(f'Removed {len(rm_sab)} source vocabs, '
              f'{len(rm)} SAB/STY dictionaries')
        for key in rm:
            del weights[key]

    # compute score by SAB (source vocabulary / ontology name)
    scores = collections.defaultdict(float)
    for (sab, sty) in weights:
        scores[sab] += weights[(sab, sty)]
    return scores
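
# Minimal end-to-end sketch (illustrative only; the path, the `doc.sentences`
# attribute, and the ontology construction are assumptions, not part of this
# module):
#
#   docs = load_unlabeled_sample("data/notes/", num_samples=1000)
#   sentences = [s for doc in docs for s in doc.sentences]
#
#   # ontologies: {(SAB, STY): set of terms}, e.g.
#   #   {("SNOMEDCT_US", "Disease or Syndrome"): {"heart failure", ...}, ...}
#   scores = score_umls_ontologies(sentences, ontologies,
#                                  concepts={"Disease or Syndrome"})
#
#   # `scores` maps each source vocabulary (SAB) to its IDF-weighted term
#   # coverage over the sampled sentences; higher means better coverage.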