
# TEI XML to Text-Fabric Convertor

TEI XML text files can be converted to the [Text-Fabric format](https://dans-labs.github.io/text-fabric/Model/File-formats/) with this convertor. It has been designed for Greek, but with minimal adjustments it should also work for other languages (except for the implemented lemmatizer).

See this [readme](https://github.com/pthu/patristics) for more information about the corpus and this work.

See this [notebook](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) for a simple Text-Fabric conversion setup if you would like to build your own convertor.

In [None]:
import re
import collections
import pickle
import betacode.conv

from os import expanduser, path
from glob import glob
from collections import OrderedDict, namedtuple
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV
from pprint import pprint
from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.greek.alphabet import filter_non_greek
from greek_normalisation.normalise import Normaliser
from greek_normalisation.norm_data import ELISION, MOVABLE

# Local imports
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from tffabric import config, languages


In [None]:
def convert(input_path, output_path, **kwargs):
    '''The convert function is the core of the tei2tf module

    It takes the following arguments:
    input_path:  the path that contains the TEI formatted texts;
                 it is searched recursively for '*grc*.xml' files
    output_path: the path to which the tf-files are written
    **kwargs:    a dictionary that is usually derived from the
                 config.py file, that contains all important
                 parameters for the conversion (see documentation);
                 'version', 'slotType' and 'fileName' are read here

    The function returns nothing; progress and the final tally are
    reported through a Timestamp object.
    '''
    tm         = Timestamp()
    version    = kwargs['version']
    slot_type  = kwargs['slotType']
    dir_struct = kwargs['fileName'] #TODO write function that derives the requested data!
    count1     = 0     # counts the number of input files
    count2     = 0     # counts the number of successfully processed files

    # input-output file management
    inpath = expanduser(input_path)
    outpath = expanduser(output_path)

    # Looping through the inpath and running the tf-conversion
    for xmlfile in glob(f'{inpath}/**/*grc*.xml', recursive=True):
        count1 += 1
        if count1 > 1: print('\n')
        tm.info(f'parsing {xmlfile}\n')
        # dirs is a tuple of directory names defined in config.py
        # they usually correspond to (author, work)
        dirs = dirStruct(xmlfile, dir_struct)
        work_path = f'{outpath}/{"/".join(dirs)}'
        # in case of multiple editions of the same work, the output goes
        # into the first numbered subdirectory that does not exist yet
        if path.isdir(f'{work_path}/tf/{version}'):
            C = 1
            while path.isdir(f'{work_path}/{C}/tf/{version}'):
                C += 1
            TF_PATH = f'{work_path}/{C}/tf/{version}'
        else:
            TF_PATH = f'{work_path}/tf/{version}'

        # setting up the text-fabric engine
        TF = Fabric(locations=TF_PATH)
        cv = CV(TF)
        # initiating the Conversion class that provides all
        # necessary data and methods for cv.walk()
        # NOTE(review): the Conversion.__init__ defined in this file only
        # accepts a path; passing kwargs as well needs a matching
        # signature there -- confirm which side is authoritative.
        x = Conversion(xmlfile, kwargs)
        # running cv.walk() to generate the tf-files
        good = cv.walk(
            x.director,
            slot_type,
            otext=x.otext,
            generic=x.generic,
            intFeatures=x.intFeatures,
            featureMeta=x.featureMeta,
            warn=False,
        )
        # Count number of successfully converted files
        if good:
            count2 += 1
    tm.info(f'{count2} of {count1} works have successfully been converted!')
 
    
    

In [None]:
REPO1 = '~/github/pthu'
REPO2 = REPO1 + '/sources'
VERSION = '1.0'

# Define subcorpus to convert
SOURCE = 'athenaeus'
LOC = 'athenaeus'
# Define the source where the (sub)corpus can be found
SRC_DIR = os.path.expanduser(f'{REPO2}/{SOURCE}')
# Define the export path
TF_DIR = os.path.expanduser(f'{REPO1}/{LOC}') 
# Define the version of the export


# Setup of the Convertor Environment



In [None]:
# XML FUNCTIONS:

def xmlSplitter(xmlfile):
    '''The xmlSplitter reads an XML file completely into memory,
    while splitting the text on "<" and ">" into a list.

    Every line is stripped of surrounding whitespace and the lines
    are joined with a single space before splitting, so a tag that
    spans several lines ends up in one list item. Tags keep their
    angle brackets; the text between tags is returned as it is.
    Empty strings are left out of the result.

    xmlfile: path of the XML file to read
    returns: list of strings (tags and text chunks in document order)
    '''

    #TODO: concatenate lines ending with '-'!
    # The encoding is given explicitly so that Greek text does not depend
    # on the platform default.
    # NOTE(review): assumes all sources are UTF-8 encoded -- confirm.
    with open(xmlfile, encoding='utf-8') as xml:
        text = ' '.join(line.strip() for line in xml)
    # '#!#' is used as a temporary split marker on both sides of a tag;
    # a source that contains this sequence literally is split there too.
    marked = text.replace('<', '#!#<').replace('>', '>#!#')
    return [chunk for chunk in marked.split('#!#') if chunk]

def attribClean(elem, **kwargs):
    '''attribClean reads an XML tag and processes a
    thorough normalization on it, consisting of:
    - strip() of whitespace, angle brackets and the slash of a
      self-closing tag
    - normalization of elements with attributes into:
        <name attrib1="attribname1" attrib2="attribname2">
    - correction of mistakes defined in kwargs['attrib_errors']
      (a mapping of wrong attribute value -> corrected value);
      kwargs is usually defined in config.py.

    The function returns a tuple with tag and attribs dict:
    (tag, {keys: values})
    A tag without attributes yields an empty attribs dict.
    '''
    # clean the elem; '/' is stripped as well, otherwise the last value
    # of a self-closing tag (<pb n="1"/>) would keep a trailing '"/'
    elem = elem.strip('<>/\\ ')
    elem = re.sub(r'\s*=\s*"\s*', '="', elem)
    # define the tag: everything up to the first space
    # (or the whole element when it has no attributes)
    tag, _, rest = elem.partition(' ')
    # convert the attributes to a dict
    attribs = {}
    for chunk in rest.split('" '):
        key, separator, value = chunk.partition('="')
        if not separator:
            # no key="value" pair in this chunk
            continue
        attribs[key.strip()] = value.strip('" ')
    # correct any mistakes in the attributes
    # if 'attrib_errors' is provided in the config.py
    corrections = kwargs.get('attrib_errors')
    if corrections:
        attribs = {k: corrections.get(v, v) for k, v in attribs.items()}
    return (tag, attribs)

def xmlParser(elem, **kwargs):
    '''The xmlParser classifies one element produced by
    xmlSplitter(xmlfile) and returns a tuple (type, elem) with
    the type of the element and its normalized form.

    The element is tested against the rules below in order and
    the first rule that matches the complete element wins:
    - comments give an empty string as normalized form
    - tags without attributes give the bare tag name
    - tags with attributes are handed to attribClean(), which
      also applies kwargs['attrib_errors'] (see attribClean)

    An element that matches none of the rules (e.g. plain text)
    makes the function return None.
    '''
    def emptied(_text):
        return ''

    def bare_name(chars):
        return lambda text: text.strip(chars)

    def tag_and_attribs(text):
        return attribClean(text, **kwargs)

    # (type, pattern, normalizer); the order is significant
    rules = (
        ('commentFull', r'^<!--.*?-->$', emptied),
        ('commentStart', r'^<!--.*', emptied),
        ('commentStop', r'.*-->$', emptied),
        ('openTag', r'<[^/=]+?>', bare_name('<> ')),
        ('closeTag', r'</.+?>', bare_name('<>/ ')),
        ('opencloseTag', r'<[^/=]+?/ *?>', bare_name('<>/ ')),
        ('openAttrTag', r'<.+?=.+?[^/] *?>', tag_and_attribs),
        ('closedAttrTag', r'<.+?=.+?/ *?>', tag_and_attribs),
    )
    for kind, pattern, normalizer in rules:
        if re.fullmatch(pattern, elem):
            return (kind, normalizer(elem))
    return None

    
# NORMALIZATION AND LEMMATIZER FUNCTIONS:
def normalization(**kwargs):
    '''Placeholder for the normalization step.

    The function has no implementation yet. It raises
    NotImplementedError so that an accidental call is noticed
    instead of silently returning None.
    '''
    raise NotImplementedError('normalization() has not been implemented yet')

def lemmatizer(lemma_dict_path):
    '''the lemmatizer loads a Python dictionary from the file
    provided in the path (.py or .pickle) and returns it.

    A .pickle file can be located anywhere, as long as the
    path points to it ("~" is expanded).
    A .py file is executed as a module. If the path points to an
    existing file it is loaded from that location; otherwise the
    file name (without extension) is imported as a module, which
    requires it to be importable (e.g. located in the same folder
    as tei2tf).
    NOTE(review): the module is expected to define exactly one
    public (not underscore-prefixed) module-level dict -- confirm
    against the lemma files that are actually used.

    Raises ValueError if a .py module does not contain exactly one
    public dict or cannot be loaded from the given file.
    '''
    lemma_dict_path = path.expanduser(lemma_dict_path)
    if lemma_dict_path.endswith('.pickle'):
        # NOTE: pickle.load can execute arbitrary code; only load
        # lemma files from a trusted source.
        with open(lemma_dict_path, 'rb') as lemmatizer_open:
            return pickle.load(lemmatizer_open)

    # A plain `import lemma_dict_path` would look for a module literally
    # called "lemma_dict_path"; the module has to be resolved at runtime.
    import importlib
    import importlib.util

    module_name = path.splitext(path.basename(lemma_dict_path))[0]
    if path.isfile(lemma_dict_path):
        spec = importlib.util.spec_from_file_location(module_name, lemma_dict_path)
        if spec is None or spec.loader is None:
            raise ValueError(f'{lemma_dict_path} cannot be loaded as a Python module')
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    else:
        module = importlib.import_module(module_name)

    found = [value for name, value in vars(module).items()
             if isinstance(value, dict) and not name.startswith('_')]
    if len(found) != 1:
        raise ValueError(
            f'{lemma_dict_path} should define exactly one public dictionary, '
            f'but {len(found)} were found'
        )
    return found[0]
        
    
def lemmatize(word, lemmatizer, **kwargs):
    '''Look up the lemma(ta) of a word in the lemmatizer mapping.

    The word is lowercased and, if kwargs contains 'udnorm', brought
    into that unicode normalization form before the lookup.

    returns: the lemmata found joined with ",", or the looked-up
             form preceded by "*" if the lemmatizer has no entry
    '''
    form = word.lower()
    if 'udnorm' in kwargs:
        form = normalize(kwargs['udnorm'], form)
    if form not in lemmatizer:
        return f'*{form}'
    return ','.join(lemmatizer[form])
    
    
def metaData(self, **kwargs):
    # Unfinished module-level draft: it takes `self` although it is defined
    # outside of a class, it only splits a file and returns nothing.
    # `xmlfile` is not defined inside this function, so a call depends on
    # a global variable of that name being present.
    # NOTE(review): presumably superseded by Conversion.metaData -- confirm
    # and remove this stub.
    data = xmlSplitter(xmlfile)




# RE PATTERNS METADATA
# Each pattern recognises a tag either in its bare form (<author>) or
# followed by a space and attributes (<author ...>).
# The text behind '#' on the first three lines is the disabled tail of an
# earlier version of the pattern; it is not part of the compiled expression.
authorRE = re.compile(r'<author>|<author .+?>') #[^>]*?(.+)</author>')
editorRE = re.compile(r'<editor>|<editor .+?>') #[^>]*?(.+)</editor>')
titleRE = re.compile(r'<title>|<title .+?>')    #[^>]*?(.+)</title>')
# Opening and closing tag of the <body> element; bodyStartRE is used by
# Conversion.metaData to find the index at which the body begins.
bodyStartRE = re.compile(r'<body>|<body .+?>')
bodyStopRE = re.compile(r'</body>|</body .+?>')



# Load Lemmatizer
# NOTE: this assignment rebinds the module-level name `lemmatizer`, which is
# also the name of a function defined earlier in this file.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
# `expanduser` is used directly because only `expanduser` and `path` are
# imported from os; the `with` block closes the file after loading.
with open(expanduser(f'{REPO1}/tei_to_tf/helpertools/data/lemmatizer.pickle'), 'rb') as lemmatizer_open:
    lemmatizer = pickle.load(lemmatizer_open)

# Set up James Tauber's normalizer
jt_normalise = Normaliser().normalise
# use: jt_normalise('greek_word') --> (greek_word, [type])

# Set up betacode to unicode convertor
beta_to_uni = Replacer()
#use: beta_to_uni.beta_code(betacode_text)

# Unicode standards
NFD = 'NFD'
NFC = 'NFC'

class Conversion:
    def __init__(self, path):
        '''Read the TEI file at path and prepare all data for cv.walk().

        path: the path of the TEI XML file to convert

        The constructor preprocesses the file, extracts the header
        metadata and the index at which the body starts, collects the
        attributes that occur in the body and derives the section
        elements from them. With these it builds the Text-Fabric
        specific variables: slotType, intFeatures, generic, otext and
        featureMeta.

        Raises KeyError if the header provides no titleStmt with a
        title or no biblStruct/sourceDesc.
        '''
        self.path = path
        self.data = self.dataPreprocessor(self.path)
        # metaData() and attribsDict() each walk through the document,
        # so they are called once and their result is unpacked.
        # body_index has to be set before attribsDict() is called.
        self.metadata, self.body_index = self.metaData(self.data)
        self.attribs_dict, self.opentags = self.attribsDict(self.data)
        self.len_attribs_dict = self.lenAttribsDict(self.attribs_dict)
        self.section_elems = self.sectionElems(self.attribs_dict)

        title_stmt = self.metadata['titleStmt']
        book_title = title_stmt['title']
        for unwanted in ('(Greek)', '.', ',', 'Machine readable text'):
            book_title = book_title.replace(unwanted, '')

        # TF SPECIFIC VARIABLES
        self.slotType = 'word'
        self.intFeatures = set()
        self.generic = {
            'name': 'Patristic corpus from Perseus',
            'compiler': 'Ernst Boogert',
            'institution': 'Protestant Theological University (PThU) Amsterdam/Groningen, The Netherlands',
            'source1': 'Perseus Digital Library',
            'source2': 'Open Greek and Latin Project',
            'url1': 'https://github.com/PerseusDL/canonical-greekLit',
            'url2': 'https://github.com/OpenGreekAndLatin/First1KGreek',
            'lang': 'grc',
            'license': self.metadata.get('license', 'not provided by source'),
            'availableStructure': ",".join(self.section_elems),
            'version': '1.0',
            'purpose': 'Making Perseus TEI formatted text available in TF',
            'status': 'initial unchecked conversion',
            '_book': book_title.strip(),
            'author': title_stmt.get('author', 'not provided'),
            'editor': title_stmt.get('editor', 'not provided'),
            'edition': ', '.join(self.metadata['biblStruct'].values()) + '.',
        }

        # main sections (= first two section levels + sentence);
        # without section elements this string starts with a comma
        main_sections = ','.join(self.section_elems[:2]) + ',_sentence'
        # structure (= bookname + all available levels)
        structure = ','.join(['_book', *self.section_elems])

        # Definition of text formats
        # (a 'fmt:text-orig-beta-full' format is currently disabled)
        self.otext = {
            'fmt:text-orig-full': '{orig} ',
            'fmt:text-orig-main': '{main} ',
            'fmt:text-orig-norm': '{norm} ',
            'fmt:text-orig-plain': '{plain} ',
            'fmt:text-orig-beta-plain': '{beta_plain} ',
            'fmt:text-orig-lemma': '{lemma} ',
            'sectionTypes': main_sections,
            'sectionFeatures': main_sections,
            'structureTypes': structure,
            'structureFeatures': structure,
        }
        # These are the feature metadata that are present in all tf-packages to be produced...
        # Other metadata will be added during the run of the director()...
        # (the features 'beta_orig' and 'beta_main' are currently disabled)
        self.featureMeta = {
            '_sentence': {
                'description': 'numbering of sentences with "." as its delimiter',
            },
            '_book': {
                'description': 'the title of the book',
            },
            'orig': {
                'description': 'the original form of the text in unicode (UFD norm), including accents and punctuation; if the original text was in betacode, it has been converted to unicode without any normalization',
            },
            'main': {
                'description': 'the original form of the text in unicode (UFD norm), but extensively normalized (no punctuation and other trailing characters, no elision, normalization of accents.',
            },
            'norm': {
                'description': 'a normalized form of uni_main, according to the normalization of James Tauber: https://github.com/jtauber/greek-normalisation',
            },
            'plain': {
                'description': 'the plain form of the text in unicode stripped of all accents and punctuation',
            },
            'beta_plain': {
                'description': 'the plain form of the text in betacode stripped of all accents and punctuation',
            },
            'lemma': {
                'description': 'the lemmatized form of the text tries to return as much as possible the words as a comma-separated list of possible lemmata. If no lemma could be found, the word is preceded by a "*". The lemmata have been defined by using the normalized text',
            },
        }
        
    
    
        
    def metaData(self, data):
        '''Collect the header metadata and locate the start of the body.

        data: the list of tags and text chunks of the document

        The elements in front of the <body> tag are scanned:
        - comments are skipped
        - an element that contains 'license' or 'creative commons'
          (case-insensitive) is stored as metadata['license']
        - text inside <titleStmt> is stored in metadata['titleStmt']
          and text inside <biblStruct>/<sourceDesc> in
          metadata['biblStruct'], keyed by the most recent tag name;
          several texts for the same key are joined with ", " unless
          the text is already part of the stored value

        returns: (metadata, body_index), where body_index is the index
                 of the first element after the opening <body> tag
        raises:  ValueError if data contains no opening <body> tag
        '''
        titleStmt = False
        biblStruct = False
        metaTaglist = []
        metadata = {}
        Comment = False
        for index, elem in enumerate(data):
            if bodyStartRE.match(elem):
                return (metadata, index + 1)
            elem = elem.strip('{ ,.}')
            if Comment:
                if commentStopRE.fullmatch(elem):
                    Comment = False
                continue
            if elem == '':
                continue
            if commentStartRE.fullmatch(elem):
                # a comment that also ends in this element needs no state
                if not commentFullRE.fullmatch(elem):
                    Comment = True
                continue
            if 'license' in elem.lower() or 'creative commons' in elem.lower():
                metadata['license'] = elem
                continue

            if elem.startswith('<titleStmt'):
                titleStmt = True
                metadata['titleStmt'] = {}
            elif elem.startswith('</titleStmt'):
                titleStmt = False
            elif elem.startswith(('<biblStruct', '<sourceDesc')):
                biblStruct = True
                metadata['biblStruct'] = {}
            elif elem.startswith(('</biblStruct', '</sourceDesc')):
                biblStruct = False
            elif titleStmt or biblStruct:
                # titleStmt takes precedence if both flags are set
                store = metadata['titleStmt' if titleStmt else 'biblStruct']
                if elem.startswith('<'):
                    # the tag name ends at the first space or, without
                    # attributes, at the closing bracket
                    tag_split = elem.find(' ')
                    if tag_split == -1:
                        tag_split = elem.find('>')
                    metaTaglist.append(elem[1:tag_split])
                elif metaTaglist:
                    # text in front of any tag has no key and is skipped
                    key = metaTaglist[-1]
                    if key not in store:
                        store[key] = elem
                    elif elem not in store[key]:
                        store[key] += f', {elem}'
        raise ValueError('no <body> element found: the start of the text cannot be determined')
    
    def attribsDict(self, data):
        '''Collect all attribute values of the tags in the body.

        data: the list of tags and text chunks of the document;
              only the part from self.body_index onwards is read

        Every tag with attributes is registered under the key
        (tag, (attribute names)), in which every tag starting with
        "div" counts as "div" and the attributes corresp, merge and
        resp are left out of the names. The value is a dict that maps
        each attribute to an OrderedSet of the values encountered.
        Attribute values found in CORR_ATTRIB_VALS are replaced by
        their correction. Opening tags with type="edition" are not
        registered; comments are skipped.

        returns: (attribs_dict, opentags), where opentags is the set
                 of keys that occurred as opening (not self-closing)
                 tag
        '''
        unnamed = {'corresp', 'merge', 'resp'}
        attribs_dict = {}
        opentags = set()
        in_comment = False

        def read_tag(text, tail):
            # Split a tag into its name and corrected attributes; tail is
            # the number of closing characters to drop (">" or "/>").
            text = re.sub(r'\s*=\s*"\s*', '="', text)
            split_at = text.find(' ')
            name = text[1:split_at]
            if name.startswith('div'):
                name = 'div'
            pairs = {key: val.strip() for key, val in
                     [chunk.split('="') for chunk in text[split_at:-tail].strip().split('" ')]}
            for key, val in pairs.items():
                bare = val.strip('"')
                pairs[key] = CORR_ATTRIB_VALS[bare] if bare in CORR_ATTRIB_VALS else bare
            return name, pairs

        def register(name, pairs):
            # Add the attribute values to the entry of this tag signature.
            signature = (name, tuple(key for key in pairs if key not in unnamed))
            entry = attribs_dict.setdefault(signature, {})
            for key, val in pairs.items():
                if key in entry:
                    entry[key].add(val)
                else:
                    entry[key] = OrderedSet([val])
            return signature

        for elem in data[self.body_index:]:
            elem = elem.strip()
            if in_comment:
                if commentStopRE.fullmatch(elem):
                    in_comment = False
                continue
            if elem == '':
                continue
            if commentStartRE.fullmatch(elem):
                if not commentFullRE.fullmatch(elem):
                    in_comment = True
                continue
            if openAttrTagRE.fullmatch(elem):
                name, pairs = read_tag(elem, 1)
                if 'type' in pairs and pairs['type'] == 'edition':
                    continue
                opentags.add(register(name, pairs))
            elif closedAttrTagRE.fullmatch(elem):
                name, pairs = read_tag(elem, 2)
                register(name, pairs)
        return attribs_dict, opentags
    
    def lenAttribsDict(self, dictionary):
        '''Return a copy of a two-level dictionary in which every
        inner value is replaced by its length.
        '''
        sizes = {}
        for tag, attribs in dictionary.items():
            sizes[tag] = {name: len(values) for name, values in attribs.items()}
        return sizes
    
    def sectionElems(self, dictionary):
        '''Derive the list of section names from the attributes dict.

        dictionary: mapping of (tag, (attribute names)) to a dict of
                    attribute -> collection of values

        For the "div" tags, an attribute whose alphabetically first
        value does not start with a digit (and whose first value does
        not start with "urn" or "textpart") provides the section
        list; an attribute with more than one value, of which the
        alphabetically first starts with a digit, marks the numbering.
        The search stops at the first div entry that has both.
        If at most two sections were found, the units of numbered
        milestones are appended, except for the alternative
        numberings (altpage, altnumbering, altref, mspage).

        returns: list of section names
        '''
        excluded_units = {'altpage', 'altnumbering', 'altref', 'mspage'}
        section_list = []
        for tag, attribs in dictionary.items():
            if not tag[0].startswith('div'):
                continue
            has_numbers = False
            has_sections = False
            for values in attribs.values():
                if len(values) == 0:
                    continue
                # digits sort in front of letters, so the first sorted
                # value shows whether numbers are present
                leads_with_digit = sorted(values)[0][0].isdigit()
                if leads_with_digit:
                    if len(values) > 1:
                        has_numbers = True
                elif not values[0].startswith(('urn', 'textpart')):
                    section_list = list(values)
                    has_sections = True
            if has_numbers and has_sections: # Identification of sectioning units
                break
        if len(section_list) <= 2:
            for tag, attribs in dictionary.items():
                if not tag[0].startswith('milestone'):
                    continue
                if 'unit' in tag[1] and 'n' in tag[1]:
                    if sorted(attribs['n'])[0].isdigit():
                        section_list.extend(unit for unit in attribs['unit']
                                            if unit not in excluded_units)
        return section_list
    
    def director(self, cv):
        tm = Timestamp()  
        Comment = False
        NegatedEditionTag = False
        nonIntFeatures = {'otype', 'oslots',}
        excludeTags = {'head', 'note', 'title', 'bibl'}
        counter = dict(_sentence=1, word=0)
        cur = {}
        tagList = []
        closedSectionList = []
        data = self.data
        lemma_counter = [0, 0]
        
        tagList.append('_book')
        cur['_book'] = cv.node('_book')
        cv.feature(cur['_book'], _book=self.generic['_book'])
        nonIntFeatures.add('_book')

        for elem in data[self.body_index:]:
#             print(elem)
#             print(tagList)
            elem = elem.strip()
            if Comment == False:
                if commentStartRE.fullmatch(elem): #DONE
                    if commentFullRE.fullmatch(elem):
                        continue
                    Comment = True
                    continue

                elif openTagRE.fullmatch(elem): #DONE
#                     print(f'openTagRE = {elem}')
                    # These are the features linked to the coming nodes
                    tag_name = elem[1:-1]
                    tagList.append(tag_name)
                    if tag_name in cur:
                        cv.terminate(cur[tag_name])
                    if not tag_name in excludeTags:
                        if tag_name in counter:
                            counter[tag_name] +=1
                        else:
                            counter[tag_name] = 1    
                        cur[tag_name] = cv.node(tag_name)
                        cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                        cv.meta(tag_name, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                    else:
                        if tag_name in cur: cv.terminate(cur[tag_name])
                        cur[tag_name] = cv.node(tag_name)
                    continue

                elif closeTagRE.fullmatch(elem): #DONE
#                     print(f'closeTagRE = {elem}')
                    # These are the signs showing the close of a feature belonging to preceding nodes
                    if bodyStopRE.fullmatch(elem):
                        if NegatedEditionTag == True:
                            for ntp in cur:
                                if not ntp in self.section_elems and not ntp == '_book':
                                    cv.terminate(cur[ntp])
                            for ntp in self.section_elems[::-1]:
                                cv.terminate(cur[ntp])
                            cv.terminate(cur['_book'])
                            break
                        else:
                            for ntp in cur:
                                if not ntp in self.section_elems and not ntp == '_book':
                                    cv.terminate(cur[ntp])
                            for ntp in self.section_elems[::-1]:
                                cv.terminate(cur[ntp])
                            cv.terminate(cur['_book'])
                            del tagList[-1]
                            break
                    if tagList[-1] in excludeTags:
                        pass
                    elif tagList[-1] in self.section_elems:
                        index = self.section_elems.index(tagList[-1])
                        for ntp in self.section_elems[:index:-1]:
                            if ntp in cur: cv.terminate(cur[ntp])     
                    elif not cv.linked(cur[tagList[-1]]):
                        pass
#                     else:
#                         cv.terminate(cur[tagList[-1]])
                    del tagList[-1]

                elif openAttrTagRE.fullmatch(elem):
#                     print(f'openAttrTagRE = {elem}')
                    # These are the features linked to coming nodes
                    elem = re.sub(r'\s*=\s*"\s*', '="', elem)
                    tag_split = elem.find(' ')
                    attribs = {key: val.strip() for key, val in [elem.split('="') for elem in elem[tag_split:-1].strip().split('" ')]}
                    for key, val in attribs.items():
                        if val.strip('"') in CORR_ATTRIB_VALS:
                            attribs[key] = CORR_ATTRIB_VALS[val.strip('"')]
                        else:
                            attribs[key] = val.strip('"')
                    if NegatedEditionTag == False:        
                        if 'type' in attribs:
                            if attribs['type'] == 'edition':
                                NegatedEditionTag = True
                                self.generic['urn'] = attribs['n'] if 'n' in attribs else 'not provided'
                                continue
                    tag_name = elem[1:tag_split]
                    if tag_name.startswith('div'):
                        tag_name = 'div'
                    tag = tuple((tag_name, tuple(key for key in attribs.keys() if key not in {'corresp', 'merge', 'resp'})))
                    if tag_name in excludeTags:
                        if tag_name in cur: cv.terminate(cur[tag_name])
                        tagList.append(tag_name)
                        cur[tag_name] = cv.node(tag_name)
                        continue
                    highest_value_attrib = max(self.len_attribs_dict[tag], 
                                               key=lambda key: self.len_attribs_dict[tag][key])
                    sec = False
                    
                    for v in attribs.values():
                        if v in self.section_elems[:]:
                            sec = True
                            value = v
                    if sec == True:
                        for k, v in attribs.items():
                            if v == value:
                                if v == self.section_elems[0] and not k == 'n':
                                    for ntp in self.section_elems[::-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'first section level'
                                elif len(self.section_elems) > 1 and v == self.section_elems[1] and not k == 'n':
                                    for ntp in self.section_elems[:0:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'second section level'
                                elif len(self.section_elems) > 2 and v == self.section_elems[2] and not k == 'n':
                                    for ntp in self.section_elems[:1:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'third section level'
                                elif len(self.section_elems) > 3 and v == self.section_elems[3] and not k == 'n':
                                    for ntp in self.section_elems[:2:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'fourth section level'    
                                elif len(self.section_elems) > 4 and v == self.section_elems[4] and not k == 'n':
                                    for ntp in self.section_elems[:3:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'fifth section level'    
                                elif len(self.section_elems) > 5 and v == self.section_elems[5] and not k == 'n':
                                    for ntp in self.section_elems[:4:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'sixth section level'   
                                else: #elif k == 'n': # in case k == 'n'!
#                                     print(f'openAttrTagRE = {elem}')
                                    v = attribs['subtype'] if 'subtype' in attribs else v
                                    tagList.append(v)
                                    content = attribs[highest_value_attrib].strip()
                                    desc = 'not provided'
                                    if v in self.section_elems:
                                        index = self.section_elems.index(v) - 1
                                        for ntp in self.section_elems[:index:-1]:
                                            if ntp in cur: cv.terminate(cur[ntp])
                                    if v in cur: cv.terminate(cur[v])
                                    cur[v] = cv.node(v)
                                    if not content.isdigit():
                                        nonIntFeatures.add(v)
                                    cv.feature(cur[v], **{v: content})
                                    cv.meta(v, description=desc,)
                                    break
                                tagList.append(v)
                                content = attribs['n'].strip() if 'n' in attribs else attribs[highest_value_attrib].strip()
                                if v in cur: cv.terminate(cur[v])
                                cur[v] = cv.node(v)
                                if not content.isdigit():
                                    nonIntFeatures.add(v)
                                cv.feature(cur[v], **{v: content})
                                if 'corresp' in attribs:
                                    cv.feature(cur[v], **{'corresp': attribs['corresp']})
                                    nonIntFeatures.add('corresp')
                                    cv.meta('corresp', description='this feature shows a correspondence with another source at the place indicated')
                                cv.meta(v, description=desc,)
                                break
                    else:
                        # If only one attrib differs: it cannot be made clear which name to choose, hence choose everything
                        
                        
                        
                        attribList = []
                        for attr in self.len_attribs_dict[tag]:
                            if self.len_attribs_dict[tag][attr] > 1:
                                attribList.append(attr)
                        if len(attribList) > 1:
                            
                            tag_name += '-' + '-'.join([v for k, v in attribs.items() 
                                                        if k in attribList 
                                                        and not k == highest_value_attrib
                                                        and not v[0].isdigit()])
                            if tag_name.endswith('-'):
                                tag_name += '-'.join([v for k, v in attribs.items() 
                                                        if k in attribList 
                                                        and not v[0].isdigit()])
                        content = attribs['n'] if 'n' in attribs else attribs[highest_value_attrib]
                        tagList.append(tag_name)
                        if tag_name in cur: cv.terminate(cur[tag_name])
                        cur[tag_name] = cv.node(tag_name)
                        if not content.isdigit():
                            nonIntFeatures.add(tag_name)
                        cv.feature(cur[tag_name], **{tag_name: content})
                        cv.meta(tag_name, description="not provided",)
                        continue
                        
                elif closedAttrTagRE.fullmatch(elem):
#                     print(f'closedAttrTagRE = {elem}')
                    elem = re.sub(r'\s*=\s*"\s*', '="', elem)
                    tag_split = elem.find(' ')
                    attribs = {key: val.strip() for key, val in [elem.split('="') for elem in elem[tag_split:-2].strip().split('" ')]}
                    for key, val in attribs.items():
                        if val.strip('"') in CORR_ATTRIB_VALS:
                            attribs[key] = CORR_ATTRIB_VALS[val.strip('"')]
                        else:
                            attribs[key] = val.strip('"')
                    tag_name = elem[1:tag_split]
                    if tag_name.startswith('div'):
                        tag_name = 'div'
                    tag = tuple((tag_name, tuple(key for key in attribs.keys() if key not in {'corresp', 'merge', 'resp'})))
                    highest_value_attrib = max(self.len_attribs_dict[tag], 
                                               key=lambda key: self.len_attribs_dict[tag][key])
                    sec = False
                    for v in attribs.values():
                        if v in self.section_elems[:]:
                            sec = True
                            value = v
                            break
                    if sec == True:
                        for k, v in attribs.items():
                            if v == value:
                                if v == self.section_elems[0] and not k == 'n':
                                    for ntp in self.section_elems[::-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'first section level'
                                elif len(self.section_elems) > 1 and v == self.section_elems[1] and not k == 'n':
                                    for ntp in self.section_elems[:0:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'second section level'
                                elif len(self.section_elems) > 2 and v == self.section_elems[2] and not k == 'n':
                                    for ntp in self.section_elems[:1:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'third section level'
                                elif len(self.section_elems) > 3 and v == self.section_elems[3] and not k == 'n':
                                    for ntp in self.section_elems[:2:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'fourth section level'    
                                elif len(self.section_elems) > 4 and v == self.section_elems[4] and not k == 'n':
                                    for ntp in self.section_elems[:3:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'fifth section level'    
                                elif len(self.section_elems) > 5 and v == self.section_elems[5] and not k == 'n':
                                    for ntp in self.section_elems[:4:-1]:
                                        if ntp in cur: cv.terminate(cur[ntp])
                                    desc = 'sixth section level'    
                                else: #elif k == 'n': # in case k == 'n'!
#                                     print(f'openAttrTagRE = {elem}')
                                    v = attribs['subtype'] if 'subtype' in attribs else v
                                    content = attribs[highest_value_attrib].strip()
                                    desc = 'not provided'
                                    if v in self.section_elems:
                                        index = self.section_elems.index(v) - 1
                                        for ntp in self.section_elems[:index:-1]:
                                            if ntp in cur: cv.terminate(cur[ntp])
                                    if v in cur: cv.terminate(cur[v])
                                    cur[v] = cv.node(v)
                                    if not content.isdigit():
                                        nonIntFeatures.add(v)
                                    cv.feature(cur[v], **{v: content})
                                    cv.meta(v, description=desc,)
                                    break
                                content = attribs['n'].strip() if 'n' in attribs else attribs[highest_value_attrib].strip()
                                if v in cur: cv.terminate(cur[v])
                                cur[v] = cv.node(v)                            
                                if tag in self.opentags:
                                    n = cv.slot()    
                                if not content.isdigit():
                                    nonIntFeatures.add(v)
                                cv.feature(cur[v], **{v: content})
                                if 'corresp' in attribs:
                                    cv.feature(cur[v], **{'corresp': attribs['corresp']})
                                    nonIntFeatures.add('corresp')
                                    cv.meta('corresp', description='this feature shows a correspondence with another source at the place indicated')
                                cv.meta(v, description=desc,)
                                break

                    else:
                        attribList = []
                        for attr in self.len_attribs_dict[tag]:
                            if self.len_attribs_dict[tag][attr] > 1:
                                attribList.append(attr)
                        if len(attribList) > 1:
                            tag_name += '-' + '-'.join([v for k, v in attribs.items() 
                                                        if k in attribList 
                                                        and not k == highest_value_attrib
                                                        and not v[0].isdigit()])
                            if tag_name.endswith('-'):
                                tag_name += '-'.join([v for k, v in attribs.items() 
                                                        if k in attribList 
#                                                         and not k == highest_value_attrib
                                                        and not v[0].isdigit()])
                        content = attribs['n'].strip() if 'n' in attribs else attribs[highest_value_attrib].strip()
                        if tag_name in cur: cv.terminate(cur[tag_name])
                        cur[tag_name] = cv.node(tag_name)
                        if not content.isdigit():
                            nonIntFeatures.add(tag_name)
                        cv.feature(cur[tag_name], **{tag_name: content})
                        cv.meta(tag_name, description="not given",)
                        continue

                elif opencloseTagRE.fullmatch(elem):
#                     print(f'opencloseTagRE = {elem}')
                    tag_name = elem[1:-2]
                    counter[tag_name] = 1 if tag_name not in counter else counter[tag_name] + 1
                    if tag_name in cur: cv.terminate(cur[tag_name])
                    cur[tag_name] = cv.node(tag_name)
                    cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                    cv.meta(tag_name, description="open-close-tag without further specification. See the name of the .tf-file for it's meaning",)

                else: # These are the text nodes themselves
                    if re.fullmatch(r'\s*', elem):
                        continue
                    else:
                        for sec in self.section_elems:
                            if sec not in cv.activeTypes():
                                if sec == self.section_elems[-1]:
                                    cur[sec] = cv.node(sec)
                                    cv.feature(cur[sec], **{sec: 1})
                                else:
                                    cur[sec] = cv.node(sec)
                                    cv.feature(cur[sec], **{sec: 0})
                                    
                        assigned = False
                        for tag in tagList:
                            if tag in excludeTags and tag in cv.activeTypes():
                                elem = normalize(NFD, elem)
#                                 n = cv.slot()
#                                 cv.feature(n, **{tag: elem})
#                                 cur[tag] = cv.node(tag)
                                cv.feature(cur[tag], **{tag: elem})
                                cv.meta(tag, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                                nonIntFeatures.add(tag)
                                assigned = True
                                break
                        if assigned == True:
                            continue
                        
#                         if tagList[-1] in excludeTags:
#                             if tagList[-1] in cur and cv.linked(cur[tagList[-1]]): 
#                                 cv.terminate(cur[tagList[-1]])
#                             elem = normalize(NFD, elem)
#                             cur[tagList[-1]] = cv.node(tagList[-1])
#                             cv.feature(cur[tagList[-1]], **{tagList[-1]: elem})
#                             cv.meta(tagList[-1], description="open tag without further specification. See the name of the .tf-file for it's meaning",)
#                             nonIntFeatures.add(tagList[-1])
#                             continue

                        # In this stage, the unicode NFC format will be used, 
                        # to prevent that letter accents at the start of a word
                        # will be chopped off; later we change to NFD
                        try:
                            elem.encode('ascii')
#                             elem_uni = filter_non_greek(
#                                            normalize(NFC, beta_to_uni.beta_code(elem))
#                                        )
                            elem_uni = normalize(NFC, beta_to_uni.beta_code(elem))
                        except UnicodeEncodeError:
#                             elem_uni = filter_non_greek(
#                                            normalize(NFC, elem)
#                                        )
                            elem_uni = normalize(NFC, elem)
                        # elem_uni is now containing a string of Greek text 
                        # with NFC normalization only
                        for word in elem_uni.split():
                            # word contains the original form of a Greek word
                            if word == '':
                                continue
                                
                            
                            #counter['word'] +=1
                            
                            # pass the original form of the word into cv.feature
                            for (preWord, midWord, postWord) in splitPunc(word):
                                # midWord_pl will be used for various normalization actions
                                midWord_pl = plainLow(midWord)
                                if midWord_pl == '' or midWord_pl == 'ʼ': # ʼ is a single letter modifier
                                    if '.' in postWord:
                                        cv.terminate(cur['_sentence'])
                                        counter['_sentence'] +=1
                                    try:
                                        
                                        cv.resume(w)
                                        orig = cv.get('orig', w) + preWord + midWord + postWord
                                        try:
                                            post = cv.get('post', w) + preWord + midWord + postWord
                                            cv.feature(w, post=post)
                                        # If midWord does not exist, any other sign 
                                        # is authomatically assigned to preWord not postWord.
                                        # This preWord is bound to the end of the previous word
                                        except:
                                            pass
                                        cv.feature(w, orig=orig)
                                        cv.terminate(w)
                                        break
                                    except UnboundLocalError:
                                        break
                                # Then we need to check for any elided form
                                if '_sentence' not in cv.activeTypes():
                                    cur['_sentence'] = cv.node('_sentence')
                                    cv.feature(cur['_sentence'], _sentence=counter['_sentence'])
                                
                                if midWord_pl + '’' in ELISION_norm and postWord != '':
                                    # we normalize the elision accent (many different ones have been used!)
                                    # for the original form (any elision accent will be replaced by the standard one)
                                    word = preWord + midWord + '’' + postWord[1:]
                                    # we modify the midWord and the postWord
                                    midWord = ELISION_norm[midWord_pl + '’'] # midWord gets the un-elided form!
                                    postWord = postWord[1:]          # postWord loses the elision accent!
                                # Deletion of movable-nu
                                if midWord_pl.endswith(('εν', 'σιν', 'στιν')) and len(midWord_pl) >= 3:
                                    midWord = midWord[:-1]
                                # Handling final-sigma
                                if midWord_pl.endswith('σ'):
                                    midWord = midWord[:-1] + 'ς'
                                # Handling various forms of ου
                                if midWord_pl in ('ουχ', 'ουκ'):
                                    midWord = midWord[:-1]
                                # Handling ἐξ
                                if midWord_pl == 'εξ':
                                    midWord = midWord[:-1] + 'κ'
                                # Definition of formats
                                midWord_main = normalize(NFD, midWord.lower())
                                midWord_norm = normalize(NFD, jt_normalise(midWord)[0])
                                midWord_plain = plainLow(midWord)
                                midWord_beta_plain = betacode.conv.uni_to_beta(midWord_plain)
                                # Lemmatization and counter for calculating the coverage ratio
                                lemma = lemmatize(midWord_main, lemmatizer)
                                if lemma.startswith('*'):
                                    lemma = lemmatize(midWord_norm, lemmatizer)
                                    if lemma.startswith('*'):
                                        lemma = lemmatize(midWord_plain, lemmatizer)
                                if lemma.startswith('*'):
                                    lemma_counter[1] +=1
                                else:
                                    lemma_counter[0] +=1
                                
                                # After the pre-processing, we continue to assigning everything
                                # Slot assignment!
                                w = cv.slot()
                                # Feature assignment
                                cv.feature(w, orig=word)
                                cv.feature(w, main=midWord_main)
                                cv.feature(w, norm=midWord_norm)
                                cv.feature(w, plain=midWord_plain)
                                cv.feature(w, beta_plain=midWord_beta_plain)
                                cv.feature(w, lemma=lemma)
#                                 cv.feature(w, _sentence=counter['_sentence'])
                                
                                # Creation of sentence feature at the start of the process
#                                 if counter['_sentence'] == 0:
#                                     counter['_sentence'] +=1
#                                     cur['_sentence'] = cv.node('_sentence')
#                                     cv.feature(cur['_sentence'], _sentence=counter['_sentence'])
#                                     cv.feature(w, _sentence=counter['_sentence'])
                                
                                if preWord != '':
                                    cv.feature(w, pre=preWord)
                                    cv.meta('pre', description='pre gives non-letter characters at the start of a word',)
                                    nonIntFeatures.add('pre')
                                if postWord != '':
                                    cv.feature(w, post=postWord)
                                    cv.meta('post', description='post gives non-letter characters at the end of a word',)
                                    nonIntFeatures.add('post')
                                    if '.' in postWord:
                                        cv.terminate(cur['_sentence'])
                                        counter['_sentence'] +=1
#                                         cur['_sentence'] = cv.node('_sentence')
# #                                         cv.feature(cur['_sentence'], _sentence=counter['_sentence'])
#                                         cv.feature(w, _sentence=counter['_sentence'])
                                        
            else:
                if commentStopRE.fullmatch(elem):
                    Comment = False
                continue
        
        
        nonIntFeatures.update(('word', 'orig', 'main', 'norm', 'plain', 'beta_plain', 'lemma'))        
        cv.meta('lemma', **{'coverage ratio': f'{round(lemma_counter[0] / ((lemma_counter[0] + lemma_counter[1]) / 100 ), 2)}%'})
        for feature in cv.metaData:
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            else:
                if feature == "":
                    pass
                else:
                    cv.meta(feature, valueType='int')
        # Final check of tags
        tm.indent(level=1)
        if len(tagList) == 0:
            tm.info('No tag mistake(s) found...')
        else:
            tm.info(str(len(tagList)) + ' tag error(s) found.')


In [None]:
# Correction table for attribute values found in the TEI sources.
# Keys are values as they literally occur (typos, stray digits or Greek
# letters, leading spaces, namespaced attribute names); each one maps to the
# value that is used instead. The tag handling in the conversion code strips
# the surrounding quotes from every attribute value, looks the result up here
# and substitutes the mapped value when there is a match.
CORR_ATTRIB_VALS = {
    '{http://www.w3.org/XML/1998/namespace}id': 'id',
    '{http://www.w3.org/XML/1998/namespace}lang': 'lang',
    'boo': 'book',
    'fo1otnote': 'footnote',
    'fo6tnote': 'footnote',
    'foo1tnote': 'footnote',
    'foodnote': 'footnote',
    'foonote': 'footnote',
    'footn2ote': 'footnote',
    'footno1te': 'footnote',
    'footno3te': 'footnote',
    'footnot': 'footnote',
    'footnot1e': 'footnote',
    'footnot2e': 'footnote',
    'footnote1': 'footnote',
    'footnote2': 'footnote',
    'footnte': 'footnote',
    'footnτote': 'footnote',
    'footote': 'footnote',
    'fotnote': 'footnote',
    'Τfootnote': 'footnote',
    'm5arginal': 'marginal',
    'margi4nal': 'marginal',
    'margial': 'marginal',
    'marginael': 'marginal',
    'marginai': 'marginal',
    'marginale': 'marginal',
    'marginalp': 'marginal',
    'marginapl': 'marginal',
    'marginaΕl': 'marginal',
    'marginaΣl': 'marginal',
    'marginl': 'marginal',
    'margipnal': 'marginal',
    'margnal': 'marginal',
    'margpinal': 'marginal',
    'marinalΑB': 'marginal',
    'marpginal': 'marginal',
    'märginal': 'marginal',
    ' chapter': 'chapter',
    ' section': 'section',
    'antistrohe': 'antistrophe',
    'chap0ter': 'chapter',
    'chapter1': 'chapter',
    'chapterer': 'chapter',
    'chapters': 'chapter',
    'chaptser': 'chapter',
    'chaspter': 'chapter',
    'ephymn.': 'ephymn',
    'sction': 'section',
    'sectionn': 'section',
    'setence': 'sentence',
    'setion': 'section',
    'secton': 'section',
    'subdsection': 'subsection',
    'subection': 'subsection',
    'pargraph': 'paragraph',
}


# Update the list of James Tauber with some additional forms.
# ELISION is the dict imported from greek_normalisation.norm_data; update()
# mutates that imported object in place, so the extra forms are visible to
# every other user of ELISION in this process as well.
# Keys are elided forms ending in the apostrophe ’, values are the forms they
# are replaced with.
# NOTE(review): 'τοτε' and 'ἠδη' carry no accent, and 'τοσαῦτ’' maps to
# 'τοσοῦτος', which looks like a lemma rather than the un-elided form --
# confirm these values are intended.
ELISION.update(
    {
        'ἔσθ’': 'ἔστι',
        'γ’': 'γέ',
        'μ’': 'μή',
        'τοσαῦτ’': 'τοσοῦτος',
        'ἆρ’': 'ἆρα',
        'προσῆλθ’': 'προσῆλθε',
        'θ’': 'θε',
        'ἐνθάδ’': 'ἐνθάδε',
        'ἔστ’': 'ἔστε',
        'τοτ’': 'τοτε',
        'σ’': 'σε',
        'οὔτ’': 'οὔτε',
        'ἠδ’': 'ἠδη',
        'τ’': 'τε',
    }
    )
# Normalize ELISION to unaccented keys and normalized accented values.
# Each key becomes plainLow(key) + '’' and each value is NFC-normalized; the
# word loop of the converter looks words up with the same
# `plainLow(word) + '’'` pattern.
# NOTE(review): plainLow comes from helpertools.unicodetricks; presumably it
# strips diacritics and the apostrophe and lowercases -- confirm, because keys
# that collapse to the same plain form silently overwrite each other here.
ELISION_norm = {plainLow(k) + '’': normalize(NFC, v) for k, v in ELISION.items()}

attributes = {'id', 'cols', 'hand', 'subtype', 'evidence', 'lang', 'value', 'direct', '{http://www.w3.org/XML/1998/namespace}id', 'status', 'from', 'to', 'corresp', 'who', 'key', 'ed', 'rows', 'cause', 'source', '{http://www.w3.org/XML/1998/namespace}lang', 'extent', 'part', 'targOrder', 'anchored', 'ana', 'target', 'quantity', 'default', 'unit', 'cert', 'reason', 'org', 'TEIform', 'instant', 'n', 'type', 'role', 'rend', 'place', 'break', 'desc', 'sample', 'met', 'resp', 'url'}
attrib_type = {'*marturi/a', '*pro/klhsis', 'sphragis', 'proverb', 'bekker page', 'NarrProof', 'noclass', 'footnot', 'hexameter', 'complaint', 'statute', 'Parabasis', 'tetrameters', 'antiprelude', 'anapests', 'fo6tnote', 'marginaAl', 'marginalXXXIVv', 'Text', 'Continued', 'summary', 'proepirrheme', 'mesode', '*yh/fisma', 'prose', 'agreement', 'marginal919a', 'fragment', '*grafh/', 'num', 'footnote1', 'commentary', '*)ekmarturi/a', 'law', 'marginaΕl', '*xro/nos', 'festival', 'alternative', 'subsection', 'noparse', 'section', '*)ara/', 'challenge', 'footnτote', 'margin', 'eleg', 'meter', 'toc', 'footno1te', 'index', 'ethnic', 'Book', 'decree', 'marpginal', 'winner', 'boo', 'altnum', 'marginale', 'trimeter', 'Agon', 'episode', 'schedule', 'catchword', '*dialogismo\\s tw=n *(hmerw=n', 'marginai', 'Parodos', 'dates', 'footnte', 'marginalB', 'margina6l', 'margina70rl', 'footno3te', '*yhfi/smata', 'marginalW', 'proagon', 'prelude', 'salutation', 'margial', 'poem', 'monody', 'indictment', 'oath', '*sunhtopi/a *boiwtw=n kai\\ *fwke/wn', 'editorial', 'sling', 'testimonium', 'marginalHdt.', 'epirrheme', 'verse paraphrase', '*do/gma *summa/xwn', 'troch', '*sunqh=kai', 'Τfootnote', 'Antikatakeleusmos', 'lease', 'corr', 'strophe', 'footote', '*no/mos', 'continued', 'antepirrheme', 'Epirrheme', 'Lyric-Scene', 'iamb', 'footnot2e', 'm5arginal', 'translation', 'worktitle', 'margi4nal', 'antiproepirrheme', 'verse', 'will', 'names', 'resolution', 'marginal77v', 'antistrophe', '*do/gma *sune/drwn', 'dactyls', 'witnesses', 'inscription', 'group', 'footnote', 'mentioned', 'verse-paraphrase', 'clause', 'margina', 'depositions', 'foonote', 'chapter', 'footn2ote', 'subscription', 'Verse', 'nomorph', 'fo1otnote', 'intro', 'prologue', 'reply', 'Episode', 'Katakeleusmos', 'margpinal', 'constellation', 'elegiacs', 'antikatakeleusmos', 'explanation', 'place', 'language', 'desc', 'footnote2', 'tetralogy', 'marginal', 'part', 'nomSac', 'Choral', 'katakeleusmenos', 'trochees', '*marturi/ai', 
'deposition', 'foo1tnote', 'month', 'marginalE', 'inscript', 'parabasis', 'marginapl', 'märginal', 'speaker', 'marginalC', 'subtitle', 'antipnigos', 'dact', 'suggestion', 'counter-plea', 'person', 'Extract', 'pnigos', 'direct', 'subtext', 'unspecified', 'katakeleusmos', 'textpart', 'term', 'emph', 'marginl', 'dialogue', '*diaqh=kai', 'close', 'Prologue', 'marginalp', 'Name', 'witness', 'terms', 'marginaDl', 'orig', '*ma/rtures', 'race', 'text', '*)epistolh/', 'header', 'footnot1e', 'foodnote', 'marinalΑB', 'iambic', '*(/orkoi', 'title', 'main', 'epode', 'book', 'marginaΣl', 'sub', 'choral', 'letter', 'oracle', 'Papyr', 'antikatakeleusmenos', 'marginael', 'paraphrase', 'iambics', 'antepirrhema', '*yh/fisma peri\\ *dwrea\\s toi=s a)po\\ *fulh=s', 'Exodus', 'drama', 'margipnal', 'lyric', 'fotnote', 'bibliography', 'spoken', 'lemma', 'Prose', 'margnal', '*no/moi', 'argument', 'epirrhema', 'edition', 'work', 'margina15vl'}
attrib_subtype = {'hexameter', 'Parabasis', 'tetrameters', 'anapests', 'antiprelude', 'source', 'sentence', 'comment', 'page', 'Letter', 'fragment', 'conspectus', 'Antepirrheme', 'commentary', 'TOC', 'subsection', 'section', 'quaestio', 'subdsection', 'toc', 'auctorm', 'index', 'fabula', ' chapter', 'epistle', 'ephymn.', 'Book', 'chapterer', 'preface', 'exordium', 'Agon', 'castlist', 'episode', 'Parodos', 'proagon', 'prelude', 'poem', 'monody', 'chap0ter', 'epirrheme', 'ephymnion', 'Antikatakeleusmos', 'chaptser', 'strophe', 'dramatispersonae', 'line', 'antepirrheme', 'Epirrheme', 'sectionn', 'Lyric-Scene', 'ii_loci', 'sction', 'sigla', 'auctores', 'chaspter', 'verse', 'antistrohe', 'Pnigos', 'ancient', 'Antipnigos', 'antistrophe', 'volume', 'dactyls', 'haeresis', 'wolfii', 'chapter', 'appendix', ' section', 'iii_loci', 'number', 'Episode', 'Katakeleusmos', 'paragraph', 'antikatakeleusmos', 'ephymn', 'aphorism', 'corrigenda', 'part', 'Choral', 'hypothesis', 'katakeleusmenos', 'trochees', 'subection', 'parabasis', 'essay', 'proode', 'autorum', 'antipnigos', 'pnigos', 'kommos', 'epigram', 'katakeleusmos', 'addenda', 'dialogue', 'close', 'Prologue', 'supplementa', 'setion', 'ducangii', 'setence', 'entry', 'chapter1', 'index.1', 'chapters', 'epode', 'book', 'epigraph', 'speech', 'loci', 'letter', 'choral', 'antikatakeleusmenos', 'iambics', 'trochaic', 'iv_loci', 'Exodus', 'type', 'lyric', 'index.2', 'homilia', 'work'}
tag_names = {'head', 'pb', 'note', 'hi', 'lg', 'gap', 'div1', 'seg', 'div2', 'sic', 'del', 'add', 'milestone', 'title', 'q', 'div', 'p', 'l', 'lb', 'argument', 'sp', 'div3', 'num', 'quote', 'speaker', 'bibl', 'date', 'ab', 'lemma', 'foreign'} # Biblical and Patristic literature only


# Configuration of the TF director

The function `authorWork(path)` reads some metadata from the source files so that they can be processed properly. Each XML file is then read and passed to the `cv.walk()` function. As a result, valid TF packages should be produced.

In [None]:
tm = Timestamp()

def _collectTagTexts(data, section, metaTaglist, metaData):
    '''Harvest the text chunks found inside one header element.

    data:        list of chunks of the file header; every chunk is either
                 one tag ('<...>') or the text between two tags
    section:     element name that delimits the region to harvest,
                 e.g. 'titleStmt' or 'biblStruct'
    metaTaglist: running list of tag names seen inside the region
                 (appended to in place)
    metaData:    dict mapping a tag name to the text collected for it
                 (updated in place)

    Every text chunk is stored under the most recently seen tag. Closing
    tags are recorded too (as '/name'), so text that follows a closing tag
    ends up under that '/name' key. Repeated text for the same tag is joined
    with ', ' unless it already occurs in the stored value.
    '''
    inside = False
    for elem in data:
        elem = elem.strip('{ ,.}')
        if elem == '':
            continue
        if elem.startswith('<body'):
            break
        if elem.startswith('<' + section):
            inside = True
        elif elem.startswith('</' + section):
            inside = False
        elif inside:
            if elem.startswith('<'):
                # The tag name ends at the first space (tag with attributes)
                # or, failing that, at the closing angle bracket.
                tag_split = elem.find(' ')
                if tag_split == -1:
                    tag_split = elem.find('>')
                metaTaglist.append(elem[1:tag_split])
            elif metaTaglist:
                # Text before any tag has nothing to be attached to; it is
                # skipped instead of raising IndexError on metaTaglist[-1].
                key = metaTaglist[-1]
                if key not in metaData:
                    metaData[key] = elem
                elif elem not in metaData[key]:
                    metaData[key] += f', {elem}'


def authorWork(path):
    '''Derive the author and the title of a work from a TEI file header.

    path: location of the XML file (note that this parameter shadows the
          imported `os.path` module inside this function)

    Only the lines before the first line matching `bodyStartRE` are read.
    The text inside <titleStmt> is collected per tag; when that yields
    neither an author nor an editor, the text inside <biblStruct> is
    collected as well.

    Returns a tuple (author, book):
    author: title-cased author, else title-cased editor, else 'undefined'
    book:   title-cased title with '(Greek)', 'Machine readable text',
            full stops and commas removed; 'undefined' when the header
            contains no title (previously this raised a KeyError)
    '''
    metaTaglist = []
    metaData = {}

    with open(path) as xml:
        # Iterating the file object lazily stops reading at the body,
        # instead of loading the complete file first.
        header = ' '.join(
            line.strip()
            for line in takewhile(lambda line: not bodyStartRE.search(line), xml)
        )
    # Put every tag and every text run into its own list element.
    data = header.replace('<', '#!#<').replace('>', '>#!#').split('#!#')

    _collectTagTexts(data, 'titleStmt', metaTaglist, metaData)
    if 'author' not in metaData and 'editor' not in metaData:
        _collectTagTexts(data, 'biblStruct', metaTaglist, metaData)

    if 'author' in metaData:
        author = metaData['author'].title()
    elif 'editor' in metaData:
        author = metaData['editor'].title()
    else:
        author = 'undefined'

    if 'title' in metaData:
        book = metaData['title']
        for unwanted in ('(Greek)', '.', ',', 'Machine readable text'):
            book = book.replace(unwanted, '')
        book = book.strip().title()
    else:
        book = 'undefined'
    return (author, book)

# Driver: convert every Greek TEI file below SRC_DIR into its own TF dataset.
#
# The imports at the top of the notebook bind `glob` to the *function*
# (`from glob import glob`) and never import the `os` module itself, so
# `glob.glob(...)` and `os.path.isdir(...)` are not available from them.
# Both names are imported here, under names that do not rebind anything the
# rest of the notebook relies on.
import os
from glob import glob as find_files

COUNTER1 = 0  # number of files visited
COUNTER2 = 0  # number of files for which cv.walk() reported success

# Narrower patterns that are handy when debugging a single author or work:
#   SRC_DIR + '/canonical-greekLit/data/tlg0059/tlg004/tlg0059.tlg004.perseus-grc2.xml'
#   SRC_DIR + '/tlg2042' + '/**/*grc*.xml'   (recursive=True)
#   SRC_DIR + '/tlg0031/tlg004/tlg0031.tlg004.perseus-grc2.xml'
#   SRC_DIR + '/tlg0555/tlg002/tlg0555.tlg002.opp-grc1.xml'
#   SRC_DIR + '/tlg0555/tlg001/tlg0555.tlg001.opp-grc1.xml'
#   SRC_DIR + '/tlg0555/**/*grc*.xml'        (recursive=True)
for xmlfile in find_files(SRC_DIR + '/**/*grc*.xml', recursive=True):
    if COUNTER1 >= 1:
        print('\n\n')
    COUNTER1 += 1

    tm.info(f'parsing {xmlfile}\n')
    (author, book) = authorWork(xmlfile)

    # Never write into an existing dataset: when the plain target directory
    # is taken, prefix the book name with the first free number (1_, 2_, ...).
    TF_PATH = f'{TF_DIR}/{author}/{book}/tf/{VERSION}'
    C = 0
    while os.path.isdir(TF_PATH):
        C += 1
        TF_PATH = f'{TF_DIR}/{author}/{C}_{book}/tf/{VERSION}'

    TF = Fabric(locations=TF_PATH)
    cv = CV(TF)
    x = Conversion(xmlfile)
    # NOTE(review): this module-level name is not used in this cell (the walk
    # below takes x.slotType); kept in case other cells read it.
    slotType = 'word'
    good = cv.walk(
        x.director,
        x.slotType,
        otext=x.otext,
        generic=x.generic,
        intFeatures=x.intFeatures,
        featureMeta=x.featureMeta,
        warn=False,
    )
    if good:
        COUNTER2 += 1
tm.info(f'{COUNTER2} of {COUNTER1} works have successfully been converted!')
lemmatizer_open.close()

In [None]:
import re
# Sample input for trying out attribClean below: a correction table for
# known misspelled attribute values and a tag string with untidy spacing.
kwargs = {'attrib_errors': {'Eusebus': 'Eusebius'}}
elem = '<div textpart = "   chapter" ref=" Eusebus   ">' 

def attribClean(elem, **kwargs):
    '''Split a raw XML tag string into its tag name and cleaned attributes.

    It takes the following arguments:
    elem:     the tag as a string, e.g. '<div type="chapter" n="1">';
              surrounding '<', '>', '/', backslashes and spaces are ignored
    **kwargs: optional settings; 'attrib_errors' may contain a dictionary
              that maps known erroneous attribute values to corrections

    Returns a tuple (tag, attribs), in which attribs maps every attribute
    name to its whitespace-trimmed and, if applicable, corrected value.
    A tag without attributes results in an empty dictionary.
    '''
    # Also strip '/', so closing and self-closing tags parse cleanly; the
    # backslash is escaped because '\ ' is not a valid escape sequence.
    elem = elem.strip('<>/\\ ')

    # Split on the first run of whitespace; a tag without attributes has
    # no whitespace at all and must not be truncated.
    parts = elem.split(None, 1)
    tag = parts[0] if parts else ''
    rest = parts[1] if len(parts) > 1 else ''

    # Match name="value" pairs directly, tolerating spaces around '=',
    # instead of splitting on '" ' (which breaks on the last attribute).
    attribs = {name: value.strip()
               for name, value in re.findall(r'([^\s=]+)\s*=\s*"([^"]*)"', rest)}

    corrections = kwargs.get('attrib_errors') or {}
    attribs = {name: corrections.get(value, value)
               for name, value in attribs.items()}
    return (tag, attribs)

# Try the function on the sample tag defined above; the misspelled value
# 'Eusebus' is replaced through the 'attrib_errors' table.
attribClean(elem, **kwargs)

In [None]:
# Check that str.strip() with the characters '<', '>', '/' and ' ' removes
# them from both ends only: this prints 'di t' (the inner space stays).
s = '<di t/ >'
print(s.strip('<>/ '))

In [None]:
def attribClean(elem):
    '''Return the cleaned attributes of a raw XML tag string as a dictionary.

    It takes the following argument:
    elem: the tag as a string, e.g. '<div type="chapter" n="1">';
          surrounding '<', '>', '/' and spaces are ignored

    Every attribute name is mapped to its whitespace-trimmed value. If a
    notebook-level dictionary `kwargs` with an 'attrib_errors' entry
    exists, values found in that table are replaced by their correction.
    A tag without attributes results in an empty dictionary.
    '''
    # Also strip '/', so self-closing tags do not leave it in the last value.
    elem = elem.strip('<>/ ')

    # Everything after the first run of whitespace is the attribute part;
    # a tag without attributes has none.
    parts = elem.split(None, 1)
    rest = parts[1] if len(parts) > 1 else ''

    # Match name="value" pairs directly, tolerating spaces around '='.
    attribs = {name: value.strip()
               for name, value in re.findall(r'([^\s=]+)\s*=\s*"([^"]*)"', rest)}

    # `kwargs` is not a parameter here but a global defined in another
    # cell; do not fail when that cell has not been run.
    try:
        corrections = kwargs['attrib_errors']
    except (NameError, KeyError):
        corrections = {}
    return {name: corrections.get(value, value)
            for name, value in attribs.items()}

In [None]:
class Test:
    '''Toy class with a class-level name and a normalize classmethod.'''

    name = 'tester'

    @classmethod
    def normalize(cls, text):
        '''Return text in lower case with the class-level name appended.'''
        lowered = text.lower()
        return lowered + cls.name

# Keep the classmethod in a dictionary, so that it can be called by key.
dictio = {'norm': Test.normalize}
        
# Sample settings dictionary; 'lang_processor' holds the Test class
# itself, not an instance of it.
test = {
    'lang': 'Custom',
    'version': '1.0',
    'slot_type': 'word',
    'udnorm': 'NFD',
    'dir_struct': ['author', 'book', 'editor'],
    'sentence_delimit': ['.', ';'],
    'lang_processor': Test,
}

# x = test['lang_processor']('test')
# Call the stored classmethod through the dictionary: the argument is
# lowercased and followed by Test.name.
print(dictio['norm']('DiT IS een TeST'))

In [None]:
# Absolute import, like the other helpertools imports in this notebook:
# a relative import fails in a notebook cell, which has no parent package.
from helpertools.tokenizer import splitWord

# Try splitWord on a string with leading, inner and trailing punctuation.
splitWord('.,dit.is?!')

In [3]:
def tokenize(string):
    '''Split a string on single space characters and return
    the non-empty pieces as a list; leading and trailing
    whitespace is removed before splitting.
    '''
    pieces = string.strip().split(' ')
    return [piece for piece in pieces if piece]

# Sample with leading, trailing and repeated spaces.
s = ' dit   is een   hele mond   vol  '
tokenize(s)

['dit', 'is', 'een', 'hele', 'mond', 'vol']