From 93f75fa1a646221178936a80c4a71e0781183c3f Mon Sep 17 00:00:00 2001
From: Luke Hsiao
Date: Tue, 21 Aug 2018 15:21:51 -0700
Subject: [PATCH] Fix bug of yielding empty spans

A bug occurs if the text of the span ends in one of the split tokens. For
example, "BC546-" will try to yield "BC546-", "BC546", and an empty span
with invalid char_start and char_end. This stops it from yielding the
empty span.

See https://github.com/HazyResearch/fonduer/pull/112/.

Co-authored-by: Hiromu Hota
---
Review notes (ignored by git-am):
 * The guard on the second span must test ts2.get_span(), not
   ts1.get_span(). For "BC546-" ts1 is "BC546" (non-empty) and ts2 is the
   empty span, so checking ts1 would still yield the empty ts2; it would
   also drop a valid ts2 whenever ts1 is empty (text that starts with a
   split token).
 * This assumes get_span() returns the covered text, so an empty span is
   falsy -- confirm against TemporarySpan.
 * Hunk context indentation was reconstructed from the code's nesting;
   verify it against snorkel/candidates.py before applying.

 snorkel/candidates.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/snorkel/candidates.py b/snorkel/candidates.py
index e71c49684..080aa93af 100644
--- a/snorkel/candidates.py
+++ b/snorkel/candidates.py
@@ -170,10 +170,10 @@ def apply(self, context):
                     m = re.search(self.split_rgx, context.text[start-offsets[0]:end-offsets[0]+1])
                     if m is not None and l < self.n_max + 1:
                         ts1 = TemporarySpan(char_start=start, char_end=start + m.start(1) - 1, sentence=context)
-                        if ts1 not in seen:
+                        if ts1 not in seen and ts1.get_span():
                             seen.add(ts1)
                             yield ts1
                         ts2 = TemporarySpan(char_start=start + m.end(1), char_end=end, sentence=context)
-                        if ts2 not in seen:
+                        if ts2 not in seen and ts2.get_span():
                             seen.add(ts2)
                             yield ts2