Skip to content

Commit

Permalink
Added a findcorrectionhandling() method that attempts to determine, g…
Browse files Browse the repository at this point in the history
…iven a textclass, what kind of correction handling is appropriate. This is used by text validation (proycon/folia#24). The method is limited and cannot deal with complex situations (nested corrections, inconsistencies); in such cases text validation will be skipped altogether for that element.
  • Loading branch information
proycon committed Oct 5, 2017
1 parent 4d669a5 commit cb269b6
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 0 deletions.
61 changes: 61 additions & 0 deletions pynlpl/formats/folia.py
Expand Up @@ -770,6 +770,61 @@ def stricttext(self, cls='current'):
"""Alias for :meth:`text` with ``strict=True``"""
return self.text(cls,strict=True)

def findcorrectionhandling(self, cls):
    """Find the proper correction handling for a text class.

    The corrections underneath this element are inspected to see in which
    branch (new/current versus original) text content of class ``cls`` is used.

    Parameters:
        cls (str): The text class to look for.

    Returns:
        ``CorrectionHandling.CURRENT`` or ``CorrectionHandling.ORIGINAL`` when
        the text class maps consistently onto one kind of branch;
        ``CorrectionHandling.CURRENT`` as a fallback when the class is not found
        in any correction; ``None`` when the situation is too complex to decide
        (a branch contains a nested correction, or the class is used in both
        kinds of branches), in which case callers should skip text validation.
    """
    if cls == "current":
        return CorrectionHandling.CURRENT
    elif cls == "original":
        return CorrectionHandling.ORIGINAL #backward compatibility

    #any other class may be anything; look at the corrections (if there are any)
    correctionhandling = None
    for correction in self.select(Correction):
        #collect the branches to inspect, paired with the handling they imply;
        #'new' takes precedence over 'current', 'original' is always inspected
        branches = []
        if correction.hasnew():
            branches.append((correction.new, CorrectionHandling.CURRENT))
        elif correction.hascurrent():
            branches.append((correction.current, CorrectionHandling.CURRENT))
        if correction.hasoriginal():
            branches.append((correction.original, CorrectionHandling.ORIGINAL))

        for getbranch, handling in branches:
            branch = getbranch()
            if branch.count(Correction) > 0:
                return None #skipping text validation, correction is too complex (nested) to handle for now
            for t in branch.select(TextContent):
                if t.cls == cls:
                    if correctionhandling is not None and correctionhandling != handling:
                        return None #inconsistent
                    correctionhandling = handling
                    break #one match per branch is enough

    if correctionhandling is None:
        #well, we couldn't find our textclass in any correction, just fall back to current and let text validation fail if needed
        return CorrectionHandling.CURRENT
    #previously the method fell off the end here and implicitly returned None,
    #which made callers skip validation even though a handling had been found
    return correctionhandling


def textvalidation(self, warnonly=None):
"""Run text validation on this element. Checks whether any text redundancy is consistent and whether offsets are valid.
Expand All @@ -786,6 +841,12 @@ def textvalidation(self, warnonly=None):
for cls in self.doc.textclasses:
if self.hastext(cls, strict=True) and not isinstance(self, (Linebreak, Whitespace)):
if self.doc and self.doc.debug: print("[PyNLPl FoLiA DEBUG] Text validation on " + repr(self),file=stderr)
correctionhandling = self.findcorrectionhandling(cls)
if correctionhandling is None:
#skipping text validation, correction is too complex (nested) to handle for now; just assume valid (benefit of the doubt)
if self.doc and self.doc.debug: print("[PyNLPl FoLiA DEBUG] SKIPPING Text validation on " + repr(self) + ", too complex to handle (nested corrections or inconsistent use)",file=stderr)
return True #just assume it's valid then

strictnormtext = self.text(cls,retaintokenisation=False,strict=True, normalize_spaces=True)
deepnormtext = self.text(cls,retaintokenisation=False,strict=False, normalize_spaces=True)
if strictnormtext != deepnormtext:
Expand Down
86 changes: 86 additions & 0 deletions pynlpl/tests/folia.py
Expand Up @@ -3439,6 +3439,92 @@ def test013c_correction(self):
</FoLiA>""".format(version=folia.FOLIAVERSION, generator='pynlpl.formats.folia-v' + folia.LIBVERSION)
doc = folia.Document(string=xml, textvalidation=True)

def test013d_correction(self):
"""Validation - Text Validation on Correction (Double text layers, structural changes)"""
# The fixture below carries two text layers on the sentence and on every word:
# the default text class and class "old". Word w.3 holds an in-word correction
# (default-class text in <new>, "old" text in <original>), and the correction
# after w.8 is structural: one word (w.9) in <new> versus two words (w.9a, w.9b)
# in <original>. Offsets of the "old" layer diverge from the default layer
# from w.8 onwards.
xml = """<?xml version="1.0" encoding="UTF-8"?>
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xlink="http://www.w3.org/1999/xlink" xml:id="test" version="{version}" generator="{generator}">
<metadata type="native">
<annotations>
<token-annotation annotator="ucto" annotatortype="auto" datetime="2017-09-25T10:29:52" set="tokconfig-nld"/>
<style-annotation />
</annotations>
</metadata>
<text xml:id="example.text">
<p xml:id="example.p.1">
<s xml:id="example.p.1.s.1">
<t>Is het creëren van een volwaardig literair oeuvre voorbehouden aan schrijvers?</t>
<t class="old">Is het creeren van een volwaardig litterair oeuvre voor behouden aan schrijvers?</t>
<w xml:id="example.p.1.s.1.w.1" class="WORD">
<t offset="0">Is</t>
<t class="old" offset="0">Is</t>
</w>
<w xml:id="example.p.1.s.1.w.2" class="WORD">
<t offset="3">het</t>
<t class="old" offset="3">het</t>
</w>
<w xml:id="example.p.1.s.1.w.3" class="WORD">
<correction>
<new>
<t offset="7">creëren</t>
</new>
<original auth="no">
<t class="old" offset="7">creeren</t>
</original>
</correction>
</w>
<w xml:id="example.p.1.s.1.w.4" class="WORD">
<t offset="15">van</t>
<t class="old" offset="15">van</t>
</w>
<w xml:id="example.p.1.s.1.w.5" class="WORD">
<t offset="19">een</t>
<t class="old" offset="19">een</t>
</w>
<w xml:id="example.p.1.s.1.w.6" class="WORD">
<t offset="23">volwaardig</t>
<t class="old" offset="23">volwaardig</t>
</w>
<w xml:id="example.p.1.s.1.w.7" class="WORD">
<t offset="34">literair</t>
<t class="old" offset="34">litterair</t>
</w>
<w xml:id="example.p.1.s.1.w.8" class="WORD">
<t offset="43">oeuvre</t>
<t class="old" offset="44">oeuvre</t>
</w>
<correction>
<new>
<w xml:id="example.p.1.s.1.w.9" class="WORD">
<t offset="50">voorbehouden</t>
</w>
</new>
<original>
<w xml:id="example.p.1.s.1.w.9a" class="WORD">
<t class="old" offset="51">voor</t>
</w>
<w xml:id="example.p.1.s.1.w.9b" class="WORD">
<t class="old" offset="56">behouden</t>
</w>
</original>
</correction>
<w xml:id="example.p.1.s.1.w.10" class="WORD">
<t offset="63">aan</t>
<t class="old" offset="65">aan</t>
</w>
<w xml:id="example.p.1.s.1.w.11" class="WORD" space="no">
<t offset="67">schrijvers</t>
<t class="old" offset="69">schrijvers</t>
</w>
<w xml:id="example.p.1.s.1.w.12" class="WORD">
<t offset="77">?</t>
<t class="old" offset="79">?</t>
</w>
</s>
</p>
</text>
</FoLiA>""".format(version=folia.FOLIAVERSION, generator='pynlpl.formats.folia-v' + folia.LIBVERSION)
# Parse the fixture with text validation switched on. There are no explicit
# assertions: the test presumably passes when loading does not raise
# (NOTE(review): confirm that a validation failure raises here).
doc = folia.Document(string=xml, textvalidation=True)

# Read the example document test/example.xml (located under FOLIAPATH) as UTF-8
# text and keep its full contents in FOLIAEXAMPLE.
with io.open(FOLIAPATH + '/test/example.xml', 'r',encoding='utf-8') as foliaexample_f:
FOLIAEXAMPLE = foliaexample_f.read()

Expand Down

0 comments on commit cb269b6

Please sign in to comment.