Skip to content

Commit

Permalink
#18: remove checking if a transition state already exists before crea…
Browse files Browse the repository at this point in the history
…ting an annotation. Fix #18
  • Loading branch information
scossin committed Mar 21, 2023
1 parent ac8d104 commit cd3aea5
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 29 deletions.
26 changes: 5 additions & 21 deletions src/iamsystem/matcher/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,7 @@ def detect(
count_not_stopword=count_not_stopword,
)
new_trans.append(next_trans)
# Why 'next_trans not in transitions':
# Don't create multiple annotations for the same transition.
# For example 'cancer cancer' with keyword 'cancer':
# if an annotation was created for the first 'cancer'
# occurrence, don't create a new one for the second occurrence.
if (
next_node.is_a_final_state()
and next_trans not in transitions
):
if next_node.is_a_final_state():
annot = create_annot(
last_trans=next_trans, stop_tokens=stop_tokens
)
Expand Down Expand Up @@ -183,19 +175,11 @@ def detect(
)
new_trans.add(next_trans)
for trans in new_trans:
# create an annotation if:
# 1) node is a final state
# 2) an annotation wasn't created yet for this state:
# 2.1 there is no previous 'non-obsolete state'.
if trans.node.is_a_final_state():
old_trans = transitions.get(trans.id, None)
if old_trans is None or old_trans.is_obsolete(
count_not_stopword=count_not_stopword, w=w
):
annot = create_annot(
last_trans=trans, stop_tokens=stop_tokens
)
annots.append(annot)
annot = create_annot(
last_trans=trans, stop_tokens=stop_tokens
)
annots.append(annot)
for nexttoken in trans.node.get_children_tokens():
avaible_trans[nexttoken].add(trans.id)
transitions[trans.id] = trans
Expand Down
33 changes: 25 additions & 8 deletions tests/test_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,28 +211,36 @@ def test_duplicate_states_generate_lot_of_overlaps(self):
If the algorithm takes all possible paths then it outputs 16
annotations. By storing algorithms' states in a set rather than in
an array, an existing state is replaced.
New behavior due to
https://github.com/scossin/iamsystem_python/issues/18 issue:
two annotations are created since prostate is repeated.
"""
matcher = Matcher.build(keywords=["cancer de la prostate"], w=3)
annots = matcher.annot_text(
text="cancer cancer de de la la prostate prostate"
)
self.assertEqual(len(annots), 1)
self.assertEqual(len(annots), 2)
self.assertEqual(
str(annots[0]),
"cancer de la prostate 7 13;17 19;23 34 cancer de la prostate",
)

def test_duplicate_states_annotations_created(self):
"""Check it creates two annotations, one for the first occurence of
'cancer', the next one using the last occurrence of 'cancer'."""
def test_states_override(self):
"""States overriding avoid multiple overlapping.
See https://github.com/scossin/iamsystem_python/issues/11
Here it creates three annotations: 1) first occurrence of
'cancer', 2) second occurrence of 'cancer', 3) a single annotation for
'cancer de la prostate' (state 'cancer' overrides the previous ones).
"""
matcher = Matcher.build(
keywords=["cancer", "cancer de la prostate"], w=10
)
annots = matcher.annot_text(text="cancer cancer cancer de la prostate")
self.assertEqual(len(annots), 2)
self.assertEqual(len(annots), 3)
self.assertEqual(str(annots[0]), "cancer 0 6 cancer")
self.assertEqual(str(annots[1]), "cancer 7 13 cancer")
self.assertEqual(
str(annots[1]),
str(annots[2]),
"cancer de la prostate 14 35 cancer de la prostate",
)

Expand Down Expand Up @@ -566,12 +574,21 @@ def test_repeated_words(self):
"""Check repeated words are annotated multiple times.
https://github.com/scossin/iamsystem_python/issues/18
"""
from iamsystem import Matcher

matcher = Matcher.build(keywords=["cancer"])
annots = matcher.annot_text(text="cancer cancer")
self.assertEqual(2, len(annots))

def test_repeated_words_large_window(self):
"""Check repeated words are annotated multiple times with the large
window strategy.

The keyword 'cancer' appears twice in the text 'cancer cancer', so two
annotations are expected.
https://github.com/scossin/iamsystem_python/issues/18
"""
matcher = Matcher.build(
keywords=["cancer"], strategy=EMatchingStrategy.LARGE_WINDOW
)
annots = matcher.annot_text(text="cancer cancer")
# Both occurrences of the repeated keyword must produce an annotation.
self.assertEqual(2, len(annots))


if __name__ == "__main__":
unittest.main()

0 comments on commit cd3aea5

Please sign in to comment.