Merge pull request #17 from scossin/version0.4.0
scossin committed Mar 11, 2023
2 parents 7f8fef4 + 8b244ce commit afb5cf6
Showing 34 changed files with 1,233 additions and 354 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,22 @@
# ChangeLog

## Version 0.4.0 (beta)

### Breaking changes
- IAnnotation: remove 'brat_formatter' instance getter/setter.
'set_brat_formatter' becomes a class method to change the BratFormatter.
- Rename BratFormatters classes.
- Span (super class of Annotation): remove 'to_brat_format' method.
- Remove "offsets" attribute from the dictionary produced by
the 'to_dict' method of an annotation.
- FuzzyAlgo and ISynsProvider, get_synonyms method: change parameter name 'states' to 'transitions'.

### Enhancement
- Bug fixes: 11 to 16.
- Add the 'NoOverlap' matching strategy.
- Add IAMsystem version to the 'to_dict' method of an annotation.


## Version 0.3.0 (beta)

### Breaking changes
Binary file added docs/source/_static/largeWindowSize.png
Binary file added docs/source/_static/no_overlap.png
Binary file added docs/source/_static/window_1.png
Binary file added docs/source/_static/window_2.png
29 changes: 21 additions & 8 deletions docs/source/api_doc.rst
@@ -20,6 +20,12 @@ Matcher build
:members: build
:noindex:

EMatchingStrategy
^^^^^^^^^^^^^^^^^
.. autoclass:: iamsystem.EMatchingStrategy
:members:
:noindex:

Span
----
.. autoclass:: iamsystem.matcher.annotation.Span
@@ -281,23 +287,30 @@ Brat
Formatter
^^^^^^^^^

TokenFormatter
EBratFormatters
"""""""""""""""
.. autoclass:: iamsystem.EBratFormatters
:members:
:undoc-members:
:show-inheritance:

ContSeqFormatter
""""""""""""""
.. autoclass:: iamsystem.TokenFormatter
.. autoclass:: iamsystem.ContSeqFormatter
:members:
:undoc-members:
:show-inheritance:

IndividualTokenFormatter
""""""""""""""""""""""""
.. autoclass:: iamsystem.IndividualTokenFormatter
ContSeqStopFormatter
""""""""""""""
.. autoclass:: iamsystem.ContSeqStopFormatter
:members:
:undoc-members:
:show-inheritance:

TokenStopFormatter
""""""""""""""""""
.. autoclass:: iamsystem.TokenStopFormatter
TokenFormatter
""""""""""""""
.. autoclass:: iamsystem.TokenFormatter
:members:
:undoc-members:
:show-inheritance:
6 changes: 3 additions & 3 deletions docs/source/brat.rst
@@ -20,9 +20,9 @@ The default Brat formatter groups continuous sequence of tokens:
:start-after: # start_test_brat_default_formatter
:end-before: # end_test_brat_default_formatter

Indeed, "North America" has two tokens, "North" and "America" but a continuous annotation (0 13) is created.
Although "North America" has two tokens, "North" and "America", a continuous Brat annotation (0 13) is created.
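As a sketch of what grouping a continuous sequence means for the offsets (illustrative code, not the library's implementation; in the Brat standoff format, the fragments of a discontinuous span are joined with `;`):

```python
from typing import List, NamedTuple, Tuple

class Token(NamedTuple):
    i: int      # token index in the document
    start: int  # character start offset
    end: int    # character end offset
    label: str

def brat_text_and_offsets(tokens: List[Token]) -> Tuple[str, str]:
    """Group tokens that are consecutive in the document (i, i+1, ...)
    and emit one 'start end' fragment per group, joined with ';' as in
    the Brat standoff format."""
    fragments: List[List[int]] = []
    prev_i = None
    for tok in tokens:
        if prev_i is not None and tok.i == prev_i + 1:
            fragments[-1][1] = tok.end  # extend the current fragment
        else:
            fragments.append([tok.start, tok.end])
        prev_i = tok.i
    offsets = ";".join(f"{s} {e}" for s, e in fragments)
    text = " ".join(tok.label for tok in tokens)
    return text, offsets

# "North America": two consecutive tokens -> one continuous span (0 13)
north, america = Token(0, 0, 5, "North"), Token(1, 6, 13, "America")
print(brat_text_and_offsets([north, america]))  # ('North America', '0 13')
```

Non-consecutive tokens would instead produce two fragments, e.g. `0 5;10 14`.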

In order to have one Brat span for each token, you can use the :ref:`api_doc:IndividualTokenFormatter`:
In order to have one Brat span for each token, you can use the :ref:`api_doc:TokenFormatter`:

.. literalinclude:: ../../tests/test_doc.py
:language: python
@@ -32,7 +32,7 @@ In order to have one Brat span for each token, you can use the :ref:`api_doc:Ind
:end-before: # end_test_brat_individual_formatter

If you have stopwords in your matching sequences, you can include them in the Brat annotation using
:ref:`api_doc:TokenStopFormatter`.
:ref:`api_doc:ContSeqStopFormatter`.
Stopwords are included if and only if they form a continuous sequence of tokens.
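This rule can be sketched in a few lines (an illustrative toy, not the library's `group_continuous_seq` and `remove_trailing_stopwords` helpers):

```python
from typing import List, Tuple

def group_with_stopwords(tokens: List[Tuple[int, str, bool]]) -> List[List[str]]:
    """tokens: (document index, label, is_stopword). Group tokens that are
    consecutive in the document, then drop stopwords that trail a group,
    so a stopword is kept only when it sits inside a continuous sequence."""
    groups: List[List[Tuple[int, str, bool]]] = []
    prev = None
    for tok in tokens:
        if prev is not None and tok[0] == prev + 1:
            groups[-1].append(tok)
        else:
            groups.append([tok])
        prev = tok[0]
    for g in groups:
        while g and g[-1][2]:  # remove trailing stopwords
            g.pop()
    return [[label for _, label, _ in g] for g in groups if g]

# 'de' and 'la' are stopwords kept inside the continuous sequence:
tokens = [(0, "insuffisance", False), (1, "de", True),
          (2, "la", True), (3, "fonction", False)]
print(group_with_stopwords(tokens))  # [['insuffisance', 'de', 'la', 'fonction']]
```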
Check the differences:

2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -24,7 +24,7 @@
author = "Sebastien Cossin"

# The full version, including alpha/beta/rc tags
release = "0.3.0"
release = "0.4.0"

# -- General configuration ---------------------------------------------------

63 changes: 63 additions & 0 deletions docs/source/matcher.rst
@@ -81,3 +81,66 @@ the algorithm fails to detect it. For example:
This problem can be solved by changing the order of the tokens in a sentence,
which is the responsibility of the tokenizer.
See Tokenizer section on :ref:`tokenizer:Change tokens order`.
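To illustrate the idea (a toy sketch, not the library's tokenizer API): sorting token labels makes detection insensitive to word order, since "calcium blood" and "blood calcium" then produce the same token sequence.

```python
import re
from typing import List, NamedTuple

class Token(NamedTuple):
    start: int
    end: int
    label: str

def tokenize(text: str) -> List[Token]:
    # minimal word tokenizer: lowercase alphanumeric runs with offsets
    return [Token(m.start(), m.end(), m.group().lower())
            for m in re.finditer(r"\w+", text)]

def order_insensitive(tokens: List[Token]) -> List[Token]:
    # reorder tokens alphabetically so the matcher sees a canonical order
    return sorted(tokens, key=lambda t: t.label)

print([t.label for t in order_insensitive(tokenize("calcium blood"))])
# ['blood', 'calcium']
```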

Matching strategies
^^^^^^^^^^^^^^^^^^^

The matching strategy is the core of the IAMsystem algorithm.
There are currently two strategies: *window matching* and *NoOverlap* (see :ref:`api_doc:EMatchingStrategy`).
The *NoOverlap* strategy ignores the window parameter and does not produce any overlapping annotations
(except in case of ambiguity).
The window strategy (the default) allows the detection of discontinuous token sequences and nested annotations.

No Overlap
""""""""""

.. literalinclude:: ../../tests/test_doc.py
:language: python
:dedent:
:start-after: # start_test_no_overlap_strategy
:end-before: # end_test_no_overlap_strategy

.. image:: _static/no_overlap.png
:width: 400
:alt: NoOverlapImage

The *NoOverlap* matching strategy was the first matching strategy implemented by IAMsystem and was described in research papers.
It only uses a window of 1 (the window parameter has no effect) and doesn't detect nested annotations.
First, the algorithm builds a trie data structure to store all the keywords. Then, for each token in the document,
it calls the fuzzy matching algorithms (not shown in the image) and tries to find a match from the current state.
The initial state is the ROOT node of the trie. If a token is a stopword (*and* in the example), the algorithm moves
to the next token and the state remains the same. If the next token is a dead end (*south* in the example), the
algorithm returns to the ROOT node and starts again. When the algorithm reaches a node containing a keyword
(the green node in the example), it generates an annotation.
In general, a single path is found, so the algorithm doesn't generate any overlap.
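The walk described above can be sketched in plain Python (an illustrative toy that ignores fuzzy matching, offsets, and the library's actual classes):

```python
from typing import Dict, List, Optional, Set

class Node:
    def __init__(self) -> None:
        self.children: Dict[str, "Node"] = {}
        self.keyword: Optional[str] = None  # set on final states

def build_trie(keywords: List[str]) -> Node:
    root = Node()
    for kw in keywords:
        node = root
        for token in kw.lower().split():
            node = node.children.setdefault(token, Node())
        node.keyword = kw
    return root

def no_overlap_annots(text: str, root: Node, stopwords: Set[str]) -> List[str]:
    """Walk the document token by token from the ROOT state: stopwords keep
    the state, dead ends reset to ROOT, final states emit an annotation."""
    annots, state = [], root
    for token in text.lower().split():
        if token in stopwords:
            continue  # the state remains the same
        nxt = state.children.get(token)
        if nxt is None:
            state = root.children.get(token, root)  # dead end: restart at ROOT
        else:
            state = nxt
        if state is not root and state.keyword is not None:
            annots.append(state.keyword)
            state = root
    return annots

root = build_trie(["north america"])
print(no_overlap_annots("north and south america", root, {"and"}))  # []
print(no_overlap_annots("north and america", root, {"and"}))  # ['north america']
```

With *south* between *north* and *america*, the path dies and no annotation is produced, matching the image.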


Window Matching
"""""""""""""""

.. image:: _static/window_2.png
:width: 400
:alt: window_2.png

This is the default strategy. It is used in all the examples of this documentation.
Compared to the *NoOverlap* strategy, the ROOT node is repeated at each step,
which makes it possible to detect nested annotations. Also, the window size determines the *lifetime* of a node.
In this example, the state/node *north* is still alive at the *south* token because the window size is 2,
which means that *north* can be up to two tokens away from *America*, excluding the stopwords.
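A toy sketch of this strategy (illustrative only; the real implementation also handles fuzzy matching and annotation offsets). The ROOT state is re-added at every step and each other state stays alive for `w` non-stopword tokens:

```python
from typing import Dict, List, Optional, Set, Tuple

class Node:
    def __init__(self) -> None:
        self.children: Dict[str, "Node"] = {}
        self.keyword: Optional[str] = None

def build_trie(keywords: List[str]) -> Node:
    root = Node()
    for kw in keywords:
        node = root
        for token in kw.lower().split():
            node = node.children.setdefault(token, Node())
        node.keyword = kw
    return root

def window_annots(text: str, root: Node, stopwords: Set[str], w: int) -> List[str]:
    """ROOT is repeated at each step, so a keyword can start at any token;
    a non-ROOT state survives up to w-1 unmatched (non-stopword) tokens."""
    annots: List[str] = []
    alive: List[Tuple[Node, int]] = []  # (state, tokens seen since last match)
    for token in text.lower().split():
        if token in stopwords:
            continue  # stopwords don't consume the window
        new_alive: List[Tuple[Node, int]] = []
        for state, age in alive + [(root, 0)]:  # ROOT repeated at each step
            nxt = state.children.get(token)
            if nxt is not None:
                if nxt.keyword is not None:
                    annots.append(nxt.keyword)
                new_alive.append((nxt, 0))
            elif state is not root and age + 1 < w:
                new_alive.append((state, age + 1))  # still within the window
        alive = new_alive
    return annots

root = build_trie(["north america", "south america"])
print(window_annots("north and south america", root, {"and"}, w=1))
# ['south america']
print(window_annots("north and south america", root, {"and"}, w=2))
# ['north america', 'south america']  <- overlapping/nested detection
```

With `w=2`, the *north* state is still alive when *america* arrives, so both keywords are detected, as in the image.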

Window Matching speed and LargeWindowMatching
"""""""""""""""""""""""""""""""""""""""""""""

.. image:: _static/largeWindowSize.png
:width: 400
:alt: largeWindowSize.png

When the window size is small, the number of operations depends little on the number of keywords.
As the window increases, the number of operations grows and can become proportional to ``n*m``, where ``n`` is the
number of tokens in the document and ``m`` is the number of keywords.
The *LargeWindowMatching* strategy trades space for time: it produces exactly the same annotations as the
*WindowMatching* strategy with a number of operations proportional to ``n*log(m)``.
*LargeWindowMatching* is slower than *WindowMatching* when the window size ``w`` is small but much faster when
``w`` is large, for example ``w=1000``.
The image above shows that the *LargeWindowMatching* strategy becomes faster once the window size exceeds a certain
threshold.
The value of the threshold depends on the terminology, so performance tests should be performed when a large window is used.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "iamsystem"
version = "0.3.0"
version = "0.4.0"
authors = [
{ name="Sebastien Cossin", email="cossin.sebastien@gmail.com" },
]
18 changes: 13 additions & 5 deletions src/iamsystem/__init__.py
@@ -2,8 +2,10 @@
"Matcher",
"IMatcher",
"IBaseMatcher",
"EMatchingStrategy",
"Annotation",
"IAnnotation",
"PrintAnnot",
"rm_nested_annots",
"IStopwords",
"Stopwords",
@@ -48,20 +50,24 @@
"SimStringWrapper",
"ESimStringMeasure",
"IBratFormatter",
"TokenFormatter",
"TokenStopFormatter",
"EBratFormatters",
"ContSeqFormatter",
"ContSeqStopFormatter",
"SpanFormatter",
"IndividualTokenFormatter",
"TokenFormatter",
]

__annot_version__ = "0.4.0"

from iamsystem.brat.adapter import BratDocument
from iamsystem.brat.adapter import BratEntity
from iamsystem.brat.adapter import BratNote
from iamsystem.brat.adapter import BratWriter
from iamsystem.brat.formatter import IndividualTokenFormatter
from iamsystem.brat.formatter import ContSeqFormatter
from iamsystem.brat.formatter import ContSeqStopFormatter
from iamsystem.brat.formatter import EBratFormatters
from iamsystem.brat.formatter import SpanFormatter
from iamsystem.brat.formatter import TokenFormatter
from iamsystem.brat.formatter import TokenStopFormatter
from iamsystem.fuzzy.abbreviations import Abbreviations
from iamsystem.fuzzy.abbreviations import token_is_upper_case
from iamsystem.fuzzy.api import ContextFreeAlgo
@@ -90,6 +96,8 @@
from iamsystem.matcher.api import IBratFormatter
from iamsystem.matcher.api import IMatcher
from iamsystem.matcher.matcher import Matcher
from iamsystem.matcher.printannot import PrintAnnot
from iamsystem.matcher.strategy import EMatchingStrategy
from iamsystem.stopwords.api import IStopwords
from iamsystem.stopwords.negative import NegativeStopwords
from iamsystem.stopwords.simple import NoStopwords
15 changes: 12 additions & 3 deletions src/iamsystem/brat/adapter.py
@@ -4,7 +4,9 @@
from typing import Iterable
from typing import List

from iamsystem.brat.formatter import ContSeqFormatter
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IBratFormatter


class BratEntity:
@@ -114,10 +116,17 @@ class BratDocument:
one per line. See https://brat.nlplab.org/standoff.html
"""

def __init__(self):
def __init__(self, brat_formatter: IBratFormatter = None):
"""Create a Brat Document.
:param brat_formatter: a strategy to create Brat annotation spans,
like merging a continuous sequence of tokens. The default
ContSeqFormatter creates one Brat span per continuous sequence of tokens.
"""
self.brat_entities: List[BratEntity] = []
self.brat_notes: List[BratNote] = []
self.get_note: get_note_fun = get_note_keyword_label
self.brat_formatter = brat_formatter or ContSeqFormatter()

def add_annots(
self,
@@ -145,12 +154,12 @@ def add_annots(
b_type = annot.keywords[0].__getattribute__(keyword_attr)
elif brat_type is not None:
b_type = brat_type
text, offsets = annot.brat_formatter.get_text_and_offsets(annot)
text, offsets = self.brat_formatter.get_text_and_offsets(annot)
brat_entity = BratEntity(
entity_id=self._get_entity_id(),
brat_type=b_type,
offsets=offsets,
text=text,
text=text.replace("\n", "\\n"),
)
self.brat_entities.append(brat_entity)

50 changes: 28 additions & 22 deletions src/iamsystem/brat/formatter.py
@@ -1,27 +1,27 @@
from enum import Enum
from typing import Tuple

from iamsystem.brat.util import get_brat_format_seq
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IBratFormatter
from iamsystem.tokenization.util import get_text_and_offsets_of_sequences
from iamsystem.tokenization.util import group_continuous_seq
from iamsystem.tokenization.util import multiple_seq_to_offsets
from iamsystem.tokenization.util import remove_trailing_stopwords


class TokenFormatter(IBratFormatter):
class ContSeqFormatter(IBratFormatter):
"""Default Brat Formatter: annotate a document by selecting continuous
sequences of tokens but ignore stopwords."""

def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
"""Return tokens' labels and token's offsets (merge if continuous)"""
sequences = group_continuous_seq(tokens=annot.tokens)
offsets = multiple_seq_to_offsets(sequences=sequences)
seq_offsets = get_brat_format_seq(offsets)
seq_label = " ".join([token.label for token in annot.tokens])
return seq_label, seq_offsets
return get_text_and_offsets_of_sequences(
sequences=sequences, annot=annot
)


class IndividualTokenFormatter(IBratFormatter):
class TokenFormatter(IBratFormatter):
"""Annotate a document by creating (start,end) offsets for each token
(In comparison to TokenFormatter, it doesn't merge continuous sequence)."""

@@ -32,7 +32,7 @@ def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
return seq_label, seq_offsets


class TokenStopFormatter(IBratFormatter):
class ContSeqStopFormatter(IBratFormatter):
"""A Brat formatter that takes into account stopwords: annotate a document
by selecting continuous sequences of tokens/stopwords."""

@@ -54,26 +54,32 @@ def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
sequences = remove_trailing_stopwords(
sequences=sequences, stop_i=stop_i
)
seq_tokens = [token for seq in sequences for token in seq]
seq_label = " ".join([token.label for token in seq_tokens])
offsets = multiple_seq_to_offsets(sequences=sequences)
seq_offsets = get_brat_format_seq(offsets)
return seq_label, seq_offsets
return get_text_and_offsets_of_sequences(
sequences=sequences, annot=annot
)


class SpanFormatter(IBratFormatter):
"""A simple Brat formatter that only uses start,end offsets
"""A simple Brat formatter that only uses start, end offsets
of an annotation"""

def __init__(self, text: str):
"""Create a brat formatter.
:param text: the document of the annotation.
"""
self.text = text

def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
"""Return text, offsets by start and end offsets of the annotation."""
seq_label = self.text[annot.start : annot.end] # noqa
seq_label = annot.text[annot.start : annot.end] # noqa
seq_offsets = f"{annot.start} {annot.end}"
return seq_label, seq_offsets


class EBratFormatters(Enum):
"""An enumerated list of available Brat Formatters."""

DEFAULT = ContSeqFormatter()
"Default to CONTINUOUS_SEQ."
TOKEN = TokenFormatter()
"A fragment for each token."
CONTINUOUS_SEQ = ContSeqFormatter()
"Merge a continuous sequence of tokens but ignore stopwords."
CONTINUOUS_SEQ_STOP = ContSeqStopFormatter()
"Merge a continuous sequence of tokens with stopwords."
SPAN = SpanFormatter()
"A Brat annotation from first token start-offsets to last token end-offsets." # noqa