diff --git a/src/iamsystem/matcher/strategy.py b/src/iamsystem/matcher/strategy.py index a6595b3..984eaaa 100644 --- a/src/iamsystem/matcher/strategy.py +++ b/src/iamsystem/matcher/strategy.py @@ -79,15 +79,7 @@ def detect( count_not_stopword=count_not_stopword, ) new_trans.append(next_trans) - # Why 'next_trans not in transitions: - # Don't create multiple annotations for the same transition - # For example 'cancer cancer' with keyword 'cancer': - # if an annotation was created for the first 'cancer' - # occurent, don't create a new one of the second occurence. - if ( - next_node.is_a_final_state() - and next_trans not in transitions - ): + if next_node.is_a_final_state(): annot = create_annot( last_trans=next_trans, stop_tokens=stop_tokens ) @@ -183,19 +175,11 @@ def detect( ) new_trans.add(next_trans) for trans in new_trans: - # create an annotation if: - # 1) node is a final state - # 2) an annotation wasn't created yet for this state: - # 2.1 there is no previous 'none-obsolete state'. if trans.node.is_a_final_state(): - old_trans = transitions.get(trans.id, None) - if old_trans is None or old_trans.is_obsolete( - count_not_stopword=count_not_stopword, w=w - ): - annot = create_annot( - last_trans=trans, stop_tokens=stop_tokens - ) - annots.append(annot) + annot = create_annot( + last_trans=trans, stop_tokens=stop_tokens + ) + annots.append(annot) for nexttoken in trans.node.get_children_tokens(): avaible_trans[nexttoken].add(trans.id) transitions[trans.id] = trans diff --git a/tests/test_matcher.py b/tests/test_matcher.py index f6e513e..186e40c 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -211,28 +211,36 @@ def test_duplicate_states_generate_lot_of_overlaps(self): If the algorithm takes all possible paths then it outputs 16 annotations. By storing algorithms' states in a set rather than in an array, an existing state is replaced. 
+ New behavior due to issue + https://github.com/scossin/iamsystem_python/issues/18: + two annotations are created since prostate is repeated. """ matcher = Matcher.build(keywords=["cancer de la prostate"], w=3) annots = matcher.annot_text( text="cancer cancer de de la la prostate prostate" ) - self.assertEqual(len(annots), 1) + self.assertEqual(len(annots), 2) self.assertEqual( str(annots[0]), "cancer de la prostate 7 13;17 19;23 34 cancer de la prostate", ) - def test_duplicate_states_annotations_created(self): - """Check it creates two annotations, one for the first occurence of - 'cancer', the next one using the last occurence of 'cancer'.""" + def test_states_override(self): + """State overriding avoids multiple overlapping annotations. + See https://github.com/scossin/iamsystem_python/issues/11 + Here it creates three annotations: 1) first occurrence of + 'cancer', 2) second occurrence of 'cancer', 3) a single annotation for + cancer de la prostate (state 'cancer' overrides the previous ones). + """ matcher = Matcher.build( keywords=["cancer", "cancer de la prostate"], w=10 ) annots = matcher.annot_text(text="cancer cancer cancer de la prostate") - self.assertEqual(len(annots), 2) + self.assertEqual(len(annots), 3) self.assertEqual(str(annots[0]), "cancer 0 6 cancer") + self.assertEqual(str(annots[1]), "cancer 7 13 cancer") self.assertEqual( - str(annots[1]), + str(annots[2]), "cancer de la prostate 14 35 cancer de la prostate", ) @@ -566,12 +574,21 @@ def test_repeated_words(self): """Check repeated words are annotated multiple times. https://github.com/scossin/iamsystem_python/issues/18 """ - from iamsystem import Matcher - matcher = Matcher.build(keywords=["cancer"]) annots = matcher.annot_text(text="cancer cancer") self.assertEqual(2, len(annots)) + def test_repeated_words_large_window(self): + """Check repeated words are annotated multiple times with the large + window strategy.
+ https://github.com/scossin/iamsystem_python/issues/18 + """ + matcher = Matcher.build( + keywords=["cancer"], strategy=EMatchingStrategy.LARGE_WINDOW + ) + annots = matcher.annot_text(text="cancer cancer") + self.assertEqual(2, len(annots)) + if __name__ == "__main__": unittest.main()