From a06c7822be32441f5ea0d9c5a60331159758732c Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Mon, 20 Aug 2018 16:26:27 -0700 Subject: [PATCH 1/2] Fix bug in Ngram splitting logic Rather than returning the TemporarySpan, along with its splits, Snorkel was returning the TemporarySpan twice, and only the 2nd split. Hiromu Hota fixed this bug in Fonduer in [1]. This commit fixes it for Snorkel. [1] https://github.com/HazyResearch/fonduer/pull/108 Co-authored-by: Hiromu Hota --- snorkel/candidates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snorkel/candidates.py b/snorkel/candidates.py index ac0e1c80a..e71c49684 100644 --- a/snorkel/candidates.py +++ b/snorkel/candidates.py @@ -172,7 +172,7 @@ def apply(self, context): ts1 = TemporarySpan(char_start=start, char_end=start + m.start(1) - 1, sentence=context) if ts1 not in seen: seen.add(ts1) - yield ts + yield ts1 ts2 = TemporarySpan(char_start=start + m.end(1), char_end=end, sentence=context) if ts2 not in seen: seen.add(ts2) From 93f75fa1a646221178936a80c4a71e0781183c3f Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Tue, 21 Aug 2018 15:21:51 -0700 Subject: [PATCH 2/2] Fix bug of yielding empty spans A bug occurs if the text of the span ends in one of the split tokens. For example, "BC546-" will try to yield "BC546-", "BC546", and an empty span with invalid char_start and char_end. This stops it from yielding the empty span. See https://github.com/HazyResearch/fonduer/pull/112/. 
Co-authored-by: Hiromu Hota --- snorkel/candidates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snorkel/candidates.py b/snorkel/candidates.py index e71c49684..080aa93af 100644 --- a/snorkel/candidates.py +++ b/snorkel/candidates.py @@ -170,11 +170,11 @@ def apply(self, context): m = re.search(self.split_rgx, context.text[start-offsets[0]:end-offsets[0]+1]) if m is not None and l < self.n_max + 1: ts1 = TemporarySpan(char_start=start, char_end=start + m.start(1) - 1, sentence=context) - if ts1 not in seen: + if ts1 not in seen and ts1.get_span(): seen.add(ts1) yield ts1 ts2 = TemporarySpan(char_start=start + m.end(1), char_end=end, sentence=context) - if ts2 not in seen: + if ts2 not in seen and ts2.get_span(): seen.add(ts2) yield ts2