Added a findcorrectionhandling() method that attempts to determine, g…

…iven a textclass, what kind of correction handling is appropriate. This is used by text validation (proycon/folia#24). The method is limited and cannot deal with complex situations (nested corrections, inconsistencies); in such cases text validation will be skipped altogether for that element.
proycon · Oct 5, 2017 · cb269b6 · cb269b6
1 parent 4d669a5
commit cb269b6
Show file tree

Hide file tree

Showing 2 changed files with 147 additions and 0 deletions.
diff --git a/pynlpl/formats/folia.py b/pynlpl/formats/folia.py
@@ -770,6 +770,61 @@ def stricttext(self, cls='current'):
         """Alias for :meth:`text` with ``strict=True``"""
         return self.text(cls,strict=True)
 
+    def findcorrectionhandling(self, cls):
+        """Find the proper correctionhandling given a textclass by looking in the underlying corrections where it is reused.
+
+        Parameters:
+            cls (str): The text class to look for.
+
+        Returns:
+            ``CorrectionHandling.CURRENT`` or ``CorrectionHandling.ORIGINAL``, depending on the
+            correction branch in which text content of class ``cls`` is found. Falls back to
+            ``CorrectionHandling.CURRENT`` if the class is not found in any correction.
+            Returns ``None`` if the situation is too complex to handle (a correction branch
+            contains nested corrections, or the class is used inconsistently over branches);
+            callers should skip text validation in that case.
+        """
+        if cls == "current":
+            return CorrectionHandling.CURRENT
+        elif cls == "original":
+            return CorrectionHandling.ORIGINAL #backward compatibility
+
+        #any other class may be anything; inspect the corrections (if any) to find out
+        correctionhandling = None
+        for correction in self.select(Correction):
+            #collect the branches to inspect, each paired with the handling it implies.
+            #'new' takes precedence over 'current'; 'original' is always inspected when present
+            branches = []
+            if correction.hasnew():
+                branches.append( (correction.new, CorrectionHandling.CURRENT) )
+            elif correction.hascurrent():
+                branches.append( (correction.current, CorrectionHandling.CURRENT) )
+            if correction.hasoriginal():
+                branches.append( (correction.original, CorrectionHandling.ORIGINAL) )
+
+            for getbranch, handling in branches:
+                branch = getbranch()
+                if branch.count(Correction) > 0:
+                    return None #skipping text validation, correction is too complex (nested) to handle for now
+                for t in branch.select(TextContent):
+                    if t.cls == cls:
+                        if correctionhandling is not None and correctionhandling != handling:
+                            return None #inconsistent
+                        correctionhandling = handling
+                        break
+
+        if correctionhandling is None:
+            #well, we couldn't find our textclass in any correction, just fall back to current and let text validation fail if needed
+            return CorrectionHandling.CURRENT
+        #return what was found; without this the method would implicitly return None,
+        #which callers interpret as 'too complex, skip validation'
+        return correctionhandling
+
+
     def textvalidation(self, warnonly=None):
         """Run text validation on this element. Checks whether any text redundancy is consistent and whether offsets are valid.
 
@@ -786,6 +841,12 @@ def textvalidation(self, warnonly=None):
         for cls in self.doc.textclasses:
             if self.hastext(cls, strict=True) and not isinstance(self, (Linebreak, Whitespace)):
                 if self.doc and self.doc.debug: print("[PyNLPl FoLiA DEBUG] Text validation on " + repr(self),file=stderr)
+                correctionhandling = self.findcorrectionhandling(cls)
+                if correctionhandling is None:
+                    #skipping text validation, correction is too complex (nested) to handle for now; just assume valid (benefit of the doubt)
+                    if self.doc and self.doc.debug: print("[PyNLPl FoLiA DEBUG] SKIPPING Text validation on " + repr(self) + ", too complex to handle (nested corrections or inconsistent use)",file=stderr)
+                    return True #just assume it's valid then
+
                 strictnormtext = self.text(cls,retaintokenisation=False,strict=True, normalize_spaces=True)
                 deepnormtext = self.text(cls,retaintokenisation=False,strict=False, normalize_spaces=True)
                 if strictnormtext != deepnormtext:

diff --git a/pynlpl/tests/folia.py b/pynlpl/tests/folia.py
@@ -3439,6 +3439,92 @@ def test013c_correction(self):
 </FoLiA>""".format(version=folia.FOLIAVERSION, generator='pynlpl.formats.folia-v' + folia.LIBVERSION)
         doc = folia.Document(string=xml, textvalidation=True)
 
+    def test013d_correction(self):
+        """Validation - Text Validation on Correction (Double text layers, structural changes)"""
+        # Fixture: one sentence carrying two text layers, the default text class and class "old".
+        # Word w.3 holds a correction whose <new> text is in the default class and whose <original>
+        # text is in class "old"; a second, structural correction replaces two original words
+        # (w.9a, w.9b) by a single new word (w.9). Offsets are given per text class.
+        xml = """<?xml version="1.0" encoding="UTF-8"?>
+<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xlink="http://www.w3.org/1999/xlink" xml:id="test" version="{version}" generator="{generator}">
+  <metadata type="native">
+    <annotations>
+      <token-annotation annotator="ucto" annotatortype="auto" datetime="2017-09-25T10:29:52" set="tokconfig-nld"/>
+      <style-annotation />
+    </annotations>
+  </metadata>
+  <text xml:id="example.text">
+    <p xml:id="example.p.1">
+      <s xml:id="example.p.1.s.1">
+        <t>Is het creëren van een volwaardig literair oeuvre voorbehouden aan schrijvers?</t>
+        <t class="old">Is het creeren van een volwaardig litterair oeuvre voor behouden aan schrijvers?</t>
+        <w xml:id="example.p.1.s.1.w.1" class="WORD">
+          <t offset="0">Is</t>
+          <t class="old" offset="0">Is</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.2" class="WORD">
+          <t offset="3">het</t>
+          <t class="old" offset="3">het</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.3" class="WORD">
+          <correction>
+           <new>
+              <t offset="7">creëren</t>
+           </new>
+           <original auth="no">
+              <t class="old" offset="7">creeren</t>
+           </original>
+          </correction>
+        </w>
+        <w xml:id="example.p.1.s.1.w.4" class="WORD">
+          <t offset="15">van</t>
+          <t class="old" offset="15">van</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.5" class="WORD">
+          <t offset="19">een</t>
+          <t class="old" offset="19">een</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.6" class="WORD">
+          <t offset="23">volwaardig</t>
+          <t class="old" offset="23">volwaardig</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.7" class="WORD">
+          <t offset="34">literair</t>
+          <t class="old" offset="34">litterair</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.8" class="WORD">
+          <t offset="43">oeuvre</t>
+          <t class="old" offset="44">oeuvre</t>
+        </w>
+        <correction>
+         <new>
+            <w xml:id="example.p.1.s.1.w.9" class="WORD">
+              <t offset="50">voorbehouden</t>
+            </w>
+         </new>
+         <original>
+            <w xml:id="example.p.1.s.1.w.9a" class="WORD">
+              <t class="old" offset="51">voor</t>
+            </w>
+            <w xml:id="example.p.1.s.1.w.9b" class="WORD">
+              <t class="old" offset="56">behouden</t>
+            </w>
+         </original>
+        </correction>
+        <w xml:id="example.p.1.s.1.w.10" class="WORD">
+          <t offset="63">aan</t>
+          <t class="old" offset="65">aan</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.11" class="WORD" space="no">
+          <t offset="67">schrijvers</t>
+          <t class="old" offset="69">schrijvers</t>
+        </w>
+        <w xml:id="example.p.1.s.1.w.12" class="WORD">
+          <t offset="77">?</t>
+          <t class="old" offset="79">?</t>
+        </w>
+      </s>
+    </p>
+  </text>
+</FoLiA>""".format(version=folia.FOLIAVERSION, generator='pynlpl.formats.folia-v' + folia.LIBVERSION)
+        # No explicit assertions follow: the document is only parsed with text validation enabled.
+        # NOTE(review): presumably a validation failure raises during construction, which is what
+        # makes this a test -- confirm against folia.Document.
+        doc = folia.Document(string=xml, textvalidation=True)
+
 with io.open(FOLIAPATH + '/test/example.xml', 'r',encoding='utf-8') as foliaexample_f:
     FOLIAEXAMPLE = foliaexample_f.read()