
# TEI XML to Text-Fabric Convertor

XML-TEI text files can be converted to [Text-Fabric format](https://dans-labs.github.io/text-fabric/Model/File-formats/) using this convertor. It was designed for Greek, but with minimal adjustments it should also work for other languages (except for the implemented lemmatizer).

See this [readme](https://github.com/pthu/patristics) for more information about the corpus and this work.

See this [notebook](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) for a simple Text-Fabric conversion setup if you would like to build your own convertor.

In [None]:
import re
import collections
import pickle
import betacode.conv
from pprint import pprint

from os import path
from glob import glob
from collections import OrderedDict, namedtuple
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV
from pprint import pprint
from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.greek.alphabet import filter_non_greek
from greek_normalisation.normalise import Normaliser
# from greek_normalisation.norm_data import ELISIONS, MOVABLE

# Local imports
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from helpertools.xmlparser import xmlSplitter, elemParser, metadataReader, attribsAnalysis, lenAttribsDict, sectionElems
from tf_config import langsettings
from data.attrib_errors import error_dict
from convertor_metadata import convertor_metadata


In [None]:
# Module-level Timestamp instance; Conversion.director() uses it (tm.indent /
# tm.info) to report the result of its final tag check.
tm = Timestamp()

class Conversion:
    def __init__(self, data, meta, sLemmatizer=None, lang='generic', **kwargs):
        """Prepare the conversion state for one parsed TEI document.

        Parameters:
            data:        the sequence of parsed elements that director()
                         iterates over.
            meta:        metadata dict; stored as both ``self.metadata`` and
                         ``self.generic``.
            sLemmatizer: lemmatizer object handed to the 'lemma' text-format
                         function in director(); may be None.
            lang:        language key forwarded to the helper functions.
            **kwargs:    the language settings; kept as ``self.langsettings``.
                         This method reads the keys 'slot_type' and
                         'text_formats' and raises KeyError if they are absent.
        """
        self.data = data
        self.lang = lang
        self.langsettings = kwargs

        # Attribute/section analysis of the input data (helpertools.xmlparser).
        analysis = attribsAnalysis(self.data, lang=self.lang, **kwargs)
        self.attribs_dict, self.section_tags, self.open_section_tags = analysis
        self.len_attribs_dict = lenAttribsDict(self.attribs_dict)
        section_info = sectionElems(self.attribs_dict, self.section_tags, **self.langsettings)
        self.section_dict, self.sections = section_info

        self.struct_list = OrderedSet()
        self.metadata = meta
        self.lemmatizer = sLemmatizer

        # TF SPECIFIC VARIABLES
        self.slotType = self.langsettings['slot_type']
        self.intFeatures = set()
        self.generic = meta
        # TODO: add entry 'availableStructure'

        text_formats = self.langsettings['text_formats']

        # Section types/features: the first two section levels plus the last
        # one when more than two levels exist, otherwise all levels.
        if len(self.sections) > 2:
            section_levels = self.sections[:2] + [self.sections[-1]]
        else:
            section_levels = self.sections
        section_csv = ",".join(section_levels)
        structure_csv = ",".join(self.sections)

        # Definition of text formats, followed by the section/structure config.
        self.otext = {fmt_key: spec['format'] for fmt_key, spec in text_formats.items()}
        self.otext['sectionTypes'] = section_csv
        self.otext['sectionFeatures'] = section_csv
        self.otext['structureTypes'] = structure_csv
        self.otext['structureFeatures'] = structure_csv

        # Feature metadata present in every tf-package to be produced;
        # further metadata is added while director() runs.
        self.featureMeta = {
            spec['name']: {'description': spec['metadata']}
            for spec in text_formats.values()
        }


    def director(self, cv):
        """Walk the parsed elements and emit Text-Fabric nodes and features.

        Presumably used as the director callback of the Text-Fabric walker
        (``tf.convert.walker.CV``), which supplies ``cv`` -- the caller is not
        visible here.

        Every item of ``self.data`` is classified by ``elemParser`` into a
        ``(code, content)`` pair and handled according to ``code``:

        * ``'text'``          -- tokenized; each word becomes a slot carrying
                                 the configured text-format features plus
                                 ``pre``/``post``; ``_sentence`` and ``_phrase``
                                 nodes are opened and closed on delimiters.
        * ``'closeTag'``      -- terminates the node of the innermost open tag.
        * ``'openAttrTag'`` /
          ``'closedAttrTag'`` -- opens a section node or a generic attribute
                                 node.
        * ``'openTag'`` /
          ``'openCloseTag'``  -- opens a node numbered by a per-tag counter.
        * ``'comment'``       -- ignored.
        * ``'bodyStop'``      -- terminates all open nodes and stops the walk.

        Afterwards the value type (``str``/``int``) of every feature in
        ``cv.metaData`` is set and the number of unclosed tags is reported
        through the module-level ``tm``.

        Parameters:
            cv: the walker object providing ``node``, ``slot``, ``feature``,
                ``meta``, ``get``, ``resume``, ``terminate``, ``linked``,
                ``activeTypes`` and ``metaData``.
        """
        nonIntFeatures = {'otype', 'oslots'}
        excludeTags = set()
        counter = dict(_sentence=1, _phrase=1)
        cur = {}            # node type -> most recently created node of that type
        tagList = []        # stack of currently open tags / node types
        secElems = []       # section types that have been opened, in order
        # [lemmas counted as covered, lemmas starting with '*']
        lemma_counter = [0, 0]

        tagList.append('_book')
        cur['_book'] = cv.node('_book')
        cv.feature(cur['_book'], _book=self.metadata['title'])
        nonIntFeatures.add('_book')

        # TEXT is True from the first word after a sentence/phrase boundary
        # until a delimiter closes that sentence/phrase again.
        TEXT = False
        udnorm = self.langsettings['udnorm']
        # Most recent word slot; None until the first word has been seen.
        w = None

        for elem in self.data:
            code, content = elemParser(elem, lang=self.lang, **self.langsettings)

            if code == 'text':
                # Text inside an active non-text element is stored as a
                # feature of that element instead of being tokenized.
                assigned = False
                for tag in tagList:
                    if tag in self.langsettings['non_text_elems'] and tag in cv.activeTypes():
                        elem = normalize(udnorm, elem)
                        cv.feature(cur[tag], **{tag: elem})
                        cv.meta(tag, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                        nonIntFeatures.add(tag)
                        assigned = True
                        break
                if assigned:
                    continue

                if self.lang == 'greek':
                    # Pure-ASCII Greek text is treated as beta code and
                    # converted to unicode before normalization.
                    try:
                        elem.encode('ascii')
                        elem = normalize(udnorm, betacode.conv.beta_to_uni(elem))
                    except UnicodeEncodeError:
                        elem = normalize(udnorm, elem)

                for token in self.langsettings['tokenizer'](elem, **self.langsettings['tokenizer_args']):
                    token_norm = (self.langsettings['replace_func'](token)
                                  if 'replace_func' in self.langsettings
                                  else token)
                    pre, word, post = token_norm

                    if not plainLow(word):
                        # Token without a word: attach its characters to the
                        # previous word slot instead of creating a new slot.
                        if (set(pre) & self.langsettings['sentence_delimit']
                                and '_sentence' in cur):
                            cv.terminate(cur['_sentence'])
                            counter['_sentence'] += 1
                        if w is None:
                            # No word seen yet, so there is nothing to attach to.
                            continue
                        cv.resume(w)
                        prev_post = cv.get('post', w)
                        if prev_post is not None:
                            cv.feature(w, post=prev_post + pre)
                        prev_orig = cv.get('orig', w)
                        if prev_orig is not None:
                            cv.feature(w, orig=prev_orig + pre)
                        cv.terminate(w)
                        continue

                    if not TEXT:
                        self.struct_list.update(('_phrase', '_sentence'))
                    TEXT = True

                    # (Re)open every structure node that is not active.
                    for struct in self.struct_list:
                        if struct not in cv.activeTypes():
                            cur[struct] = cv.node(struct)
                            if struct in {'_phrase', '_sentence'}:
                                cv.feature(cur[struct], **{struct: counter[struct]})
                            else:
                                cv.feature(cur[struct], **{struct: 0})
                    w = cv.slot()

                    for form in self.langsettings['text_formats'].values():
                        name = form['name']
                        func = form['function']
                        if name == 'orig':
                            # 'orig' is derived from the unreplaced token.
                            cv.feature(w, **{name: func(token)})
                        elif name == 'lemma':
                            lemma = func(token, self.lemmatizer)
                            cv.feature(w, lemma=lemma)
                            if lemma.startswith('*'):
                                lemma_counter[1] += 1
                            else:
                                lemma_counter[0] += 1
                        else:
                            cv.feature(w, **{name: func(token_norm)})

                    if pre != '':
                        cv.feature(w, pre=pre)
                        cv.meta('pre', description='pre gives non-letter characters at the start of a word',)
                        nonIntFeatures.add('pre')
                    if post != '':
                        cv.feature(w, post=post)
                        cv.meta('post', description='post gives non-letter characters at the end of a word',)
                        nonIntFeatures.add('post')
                        # A sentence delimiter takes precedence: when it
                        # fires, the phrase is left open (as before).
                        # NOTE(review): confirm a sentence end should not also
                        # close the current phrase.
                        if set(post) & self.langsettings['sentence_delimit']:
                            cv.terminate(cur['_sentence'])
                            counter['_sentence'] += 1
                            TEXT = False
                        elif set(post) & self.langsettings['phrase_delimit']:
                            cv.terminate(cur['_phrase'])
                            counter['_phrase'] += 1
                            TEXT = False

            elif code == 'closeTag':
                closing = tagList[-1]
                if closing in secElems:
                    if not cv.linked(cur[closing]):  # CHECK whether this works as expected!
                        # Give an otherwise empty section node a slot.
                        cv.slot()
                    index = secElems.index(closing)
                    # Terminate the section levels recorded after this one.
                    for ntp in secElems[:index:-1]:
                        if ntp in cur:
                            cv.terminate(cur[ntp])
                cv.terminate(cur[closing])
                del tagList[-1]

            elif code in {'openAttrTag', 'closedAttrTag'}:
                tag, attribs = content
                tag_name = (tag, tuple(key for key in attribs
                                       if key not in self.langsettings['ignore_attrib_keys']))
                # Section elements
                if tag_name in self.section_dict:
                    sec, val = self.section_dict[tag_name]
                    # With a single attribute `sec` is itself the section
                    # type; otherwise it names the attribute that holds it.
                    section = sec if len(attribs) == 1 else attribs[sec]
                    value = attribs[val]
                    if section in cur:
                        cv.terminate(cur[section])
                    tagList.append(section)
                    if section not in self.langsettings['ignore_section_values']:
                        # Make sure the previously recorded section levels
                        # (except the last) have an active node.
                        for el in secElems[:-1]:
                            if el not in cv.activeTypes():
                                cur[el] = cv.node(el)
                                cv.feature(cur[el], **{el: 0})
                        secElems.append(section)
                    cur[section] = cv.node(section)
                    cv.feature(cur[section], **{section: value})
                    continue
                # Non-section elements
                tag_attribs = self.attribs_dict[tag_name]
                if 'n' in attribs:
                    value = attribs['n']
                    name = max(tag_attribs,
                               key=lambda key: tag_attribs[key]
                               if not key == 'n' else OrderedSet())
                else:
                    name = max(tag_attribs,
                               key=lambda key: tag_attribs[key])
                    # NOTE(review): this selects the runner-up attribute *key*;
                    # presumably attribs[<that key>] was intended as the
                    # feature value -- confirm against the expected output.
                    value = max(tag_attribs,
                                key=lambda key: tag_attribs[key]
                                if not key == name else OrderedSet())
                node_type = attribs[name]
                tagList.append(node_type)
                cur[node_type] = cv.node(node_type)
                cv.feature(cur[node_type], **{name: value})
                continue

            elif code == 'openTag':
                tag_name = content[1:-1]
                tagList.append(tag_name)
                if tag_name in cur:
                    cv.terminate(cur[tag_name])
                if tag_name not in excludeTags:
                    counter[tag_name] = counter.get(tag_name, 0) + 1
                    cur[tag_name] = cv.node(tag_name)
                    cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                    cv.meta(tag_name, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                continue

            elif code == 'openCloseTag':
                tag_name = content[1:-2]
                counter[tag_name] = counter.get(tag_name, 0) + 1
                if tag_name in cur:
                    cv.terminate(cur[tag_name])
                cur[tag_name] = cv.node(tag_name)
                cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                cv.meta(tag_name, description="open-close-tag without further specification. See the name of the .tf-file for it's meaning",)

            elif code == 'comment':
                continue

            elif code == 'bodyStop':
                # Close non-section nodes first, then the sections from the
                # most recently recorded one backwards, and finally the book.
                for ntp in cur:
                    if ntp not in secElems and not ntp == '_book':
                        cv.terminate(cur[ntp])
                for ntp in secElems[::-1]:
                    cv.terminate(cur[ntp])
                cv.terminate(cur['_book'])
                del tagList[-1]
                break

        nonIntFeatures.update(('word', 'orig', 'main', 'norm', 'plain', 'beta_plain', 'lemma'))
        # Only report lemma coverage when at least one lemma was produced;
        # this avoids a division by zero for inputs without lemmas.
        total_lemmas = lemma_counter[0] + lemma_counter[1]
        if total_lemmas:
            cv.meta('lemma', **{'coverage ratio': f'{round(lemma_counter[0] / (total_lemmas / 100), 2)}%'})
        for feature in cv.metaData:
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            elif feature != "":
                # The empty key is skipped; it gets no value type.
                cv.meta(feature, valueType='int')
        # Final check of tags
        tm.indent(level=1)
        if not tagList:
            tm.info('No tag mistake(s) found...')
        else:
            tm.info(str(len(tagList)) + ' tag error(s) found.')


In [None]:
def convert(input_path, output_path, lang='generic',
            version='1.0', metadata=convertor_metadata):
    '''Convert the TEI files below `input_path` to Text-Fabric datasets.

    Every file matching `*grc*.xml` anywhere below `input_path` is
    split into elements, its metadata is read, and `cv.walk()` is run
    with a fresh `Conversion` object as director.

    It takes the following arguments:
    input_path:  directory that is searched recursively for TEI files
                 (a leading `~` is expanded)
    output_path: root directory below which the tf-files are written
                 (a leading `~` is expanded)
    lang:        key into `tf_config.langsettings`; the selected entry
                 supplies the slot type, the directory structure, the
                 lemmatizer and the metadata options
    version:     name of the version directory below `tf/`
    metadata:    base metadata shared by all files; it is copied for
                 every file and never modified

    Nothing is returned; progress is reported through a `Timestamp`.
    '''
    from tf_config import langsettings as all_settings
    langsettings = all_settings[lang]
    tm           = Timestamp()
    slot_type    = langsettings['slot_type']
    dir_struct   = langsettings['dir_struct'] #TODO write function that derives the requested data!
    count1       = 0     # counts the number of input files
    count2       = 0     # counts the number of successfully processed files
    sLemmatizer  = langsettings['lemmatizer']()

    # input-output file management
    inpath = path.expanduser(input_path)
    outpath = path.expanduser(output_path)

    # Looping through the inpath and running the tf-conversion
    for xmlfile in glob(f'{inpath}/**/*grc*.xml', recursive=True):
        count1 += 1
        if count1 > 1:
            print('\n')
        tm.info(f'parsing {xmlfile}\n')

        # creation of data to extract metadata
        # and to inject later into the Conversion object
        data = xmlSplitter(xmlfile)
        body_index, metadat = metadataReader(data, lang=lang, **langsettings['metadata'])
        # Work on a per-file copy: updating `metadata` in place would
        # carry the values of one file over to the next one and would
        # also alter the shared default object.
        file_metadata = dict(metadata)
        file_metadata.update(metadat)

        # definition of output dir structure on the basis of metadata:
        # for every level the first candidate key present in the
        # metadata is used, otherwise a placeholder name
        dirs = []
        for candidates in dir_struct:
            for candidate in candidates:
                if candidate in file_metadata:
                    dirs.append(file_metadata[candidate])
                    break
            else:
                dirs.append(f'unknown {"-".join(candidates)}')

        # the levels in dir_struct are defined in config.py; they
        # usually correspond to something like (author, work, editor/edition).
        # If the target directory already exists (e.g. another edition
        # of the same work), a numbered subdirectory 1, 2, ... is
        # inserted before tf/<version>.
        base = f'{outpath}/{"/".join(dirs)}'
        TF_PATH = f'{base}/tf/{version}'
        if path.isdir(TF_PATH):
            C = 1
            while path.isdir(f'{base}/{C}/tf/{version}'):
                C += 1
            TF_PATH = f'{base}/{C}/tf/{version}'

        # setting up the text-fabric engine
        TF = Fabric(locations=TF_PATH)
        cv = CV(TF)
        # initiating the Conversion class that provides all
        # necessary data and methods for cv.walk()
        x = Conversion(data[body_index:], file_metadata, sLemmatizer=sLemmatizer, lang=lang, **langsettings)
        # running cv.walk() to generate the tf-files
        good = cv.walk(
            x.director,
            slotType=slot_type,
            otext=x.otext,
            generic=x.generic,
            intFeatures=x.intFeatures,
            featureMeta=x.featureMeta,
            warn=False,
        )
        # Count number of successfully converted files
        if good:
            count2 += 1
            tm.info('Conversion was successful...')
    tm.info(f'{count2} of {count1} works have successfully been converted!')
 
    
# Run the conversion on the pilot sources with the 'greek' settings;
# both paths start with '~', which convert() expands itself.
convert('~/github/pthu/pilot/sources/pt', '~/github/pthu/out', lang='greek')    

In [None]:
# REPO1 = '~/github/pthu'
# REPO2 = REPO1 + '/sources'
# VERSION = '1.0'

# # Define subcorpus to convert
# SOURCE = 'athenaeus'
# LOC = 'athenaeus'
# # Define the source where the (sub)corpus can be found
# SRC_DIR = os.path.expanduser(f'{REPO2}/{SOURCE}')
# # Define the export path
# TF_DIR = os.path.expanduser(f'{REPO1}/{LOC}') 
# # Define the version of the export


# Setup of the Convertor Environment



In [None]:
# Lookup table from attribute strings as they occur in the sources to
# their corrected form: the two namespaced XML attributes are shortened
# to 'id' and 'lang', all other keys are misspellings (stray digits,
# Greek letters, dropped or doubled characters, leading spaces) of
# 'book', 'footnote', 'marginal', 'chapter', 'section', etc.
# NOTE(review): no consumer of this table is visible in this part of
# the file - confirm where it is applied.
CORR_ATTRIB_VALS = {
    '{http://www.w3.org/XML/1998/namespace}id': 'id',
    '{http://www.w3.org/XML/1998/namespace}lang': 'lang',
    'boo': 'book',
    'fo1otnote': 'footnote',
    'fo6tnote': 'footnote',
    'foo1tnote': 'footnote',
    'foodnote': 'footnote',
    'foonote': 'footnote',
    'footn2ote': 'footnote',
    'footno1te': 'footnote',
    'footno3te': 'footnote',
    'footnot': 'footnote',
    'footnot1e': 'footnote',
    'footnot2e': 'footnote',
    'footnote1': 'footnote',
    'footnote2': 'footnote',
    'footnte': 'footnote',
    'footnτote': 'footnote',
    'footote': 'footnote',
    'fotnote': 'footnote',
    'Τfootnote': 'footnote',
    'm5arginal': 'marginal',
    'margi4nal': 'marginal',
    'margial': 'marginal',
    'marginael': 'marginal',
    'marginai': 'marginal',
    'marginale': 'marginal',
    'marginalp': 'marginal',
    'marginapl': 'marginal',
    'marginaΕl': 'marginal',
    'marginaΣl': 'marginal',
    'marginl': 'marginal',
    'margipnal': 'marginal',
    'margnal': 'marginal',
    'margpinal': 'marginal',
    'marinalΑB': 'marginal',
    'marpginal': 'marginal',
    'märginal': 'marginal',
    ' chapter': 'chapter',
    ' section': 'section',
    'antistrohe': 'antistrophe',
    'chap0ter': 'chapter',
    'chapter1': 'chapter',
    'chapterer': 'chapter',
    'chapters': 'chapter',
    'chaptser': 'chapter',
    'chaspter': 'chapter',
    'ephymn.': 'ephymn',
    'sction': 'section',
    'sectionn': 'section',
    'setence': 'sentence',
    'setion': 'section',
    'secton': 'section',
    'subdsection': 'subsection',
    'subection': 'subsection',
    'pargraph': 'paragraph',
}


# Extend the elision list of James Tauber with some additional forms;
# each key is an elided form ending in ’ and each value is the form it
# is replaced by.
# NOTE(review): `ELISION` is not defined in the visible part of this
# file; the only related import (`ELISIONS` from
# greek_normalisation.norm_data) is commented out at the top - confirm
# that the name is provided elsewhere, e.g. by the wildcard import.
ELISION.update(
    {
        'ἔσθ’': 'ἔστι',
        'γ’': 'γέ',
        'μ’': 'μή',
        'τοσαῦτ’': 'τοσοῦτος',
        'ἆρ’': 'ἆρα',
        'προσῆλθ’': 'προσῆλθε',
        'θ’': 'θε',
        'ἐνθάδ’': 'ἐνθάδε',
        'ἔστ’': 'ἔστε',
        'τοτ’': 'τοτε',
        'σ’': 'σε',
        'οὔτ’': 'οὔτε',
        'ἠδ’': 'ἠδη',
        'τ’': 'τε',
    }
    )
# Derived table: every key is passed through plainLow() and gets the ’
# appended again, every value is passed through normalize().
# NOTE(review): `NFC` is used as a bare name here, while
# unicodedata.normalize expects the form as a string such as 'NFC' -
# confirm that a constant with that name is in scope.
ELISION_norm = {plainLow(k) + '’': normalize(NFC, v) for k, v in ELISION.items()}

# Vocabularies as sets of strings. Judging by the names they list the
# attribute names, the values of the `type` and `subtype` attributes and
# the tag names found in the sources, uncorrected misspellings included.
# NOTE(review): none of these sets is read in the visible part of the
# file - confirm whether they are still needed.

# Attribute names.
attributes = {'id', 'cols', 'hand', 'subtype', 'evidence', 'lang', 'value', 'direct', '{http://www.w3.org/XML/1998/namespace}id', 'status', 'from', 'to', 'corresp', 'who', 'key', 'ed', 'rows', 'cause', 'source', '{http://www.w3.org/XML/1998/namespace}lang', 'extent', 'part', 'targOrder', 'anchored', 'ana', 'target', 'quantity', 'default', 'unit', 'cert', 'reason', 'org', 'TEIform', 'instant', 'n', 'type', 'role', 'rend', 'place', 'break', 'desc', 'sample', 'met', 'resp', 'url'}
# Values of the `type` attribute (presumably; going by the name).
attrib_type = {'*marturi/a', '*pro/klhsis', 'sphragis', 'proverb', 'bekker page', 'NarrProof', 'noclass', 'footnot', 'hexameter', 'complaint', 'statute', 'Parabasis', 'tetrameters', 'antiprelude', 'anapests', 'fo6tnote', 'marginaAl', 'marginalXXXIVv', 'Text', 'Continued', 'summary', 'proepirrheme', 'mesode', '*yh/fisma', 'prose', 'agreement', 'marginal919a', 'fragment', '*grafh/', 'num', 'footnote1', 'commentary', '*)ekmarturi/a', 'law', 'marginaΕl', '*xro/nos', 'festival', 'alternative', 'subsection', 'noparse', 'section', '*)ara/', 'challenge', 'footnτote', 'margin', 'eleg', 'meter', 'toc', 'footno1te', 'index', 'ethnic', 'Book', 'decree', 'marpginal', 'winner', 'boo', 'altnum', 'marginale', 'trimeter', 'Agon', 'episode', 'schedule', 'catchword', '*dialogismo\\s tw=n *(hmerw=n', 'marginai', 'Parodos', 'dates', 'footnte', 'marginalB', 'margina6l', 'margina70rl', 'footno3te', '*yhfi/smata', 'marginalW', 'proagon', 'prelude', 'salutation', 'margial', 'poem', 'monody', 'indictment', 'oath', '*sunhtopi/a *boiwtw=n kai\\ *fwke/wn', 'editorial', 'sling', 'testimonium', 'marginalHdt.', 'epirrheme', 'verse paraphrase', '*do/gma *summa/xwn', 'troch', '*sunqh=kai', 'Τfootnote', 'Antikatakeleusmos', 'lease', 'corr', 'strophe', 'footote', '*no/mos', 'continued', 'antepirrheme', 'Epirrheme', 'Lyric-Scene', 'iamb', 'footnot2e', 'm5arginal', 'translation', 'worktitle', 'margi4nal', 'antiproepirrheme', 'verse', 'will', 'names', 'resolution', 'marginal77v', 'antistrophe', '*do/gma *sune/drwn', 'dactyls', 'witnesses', 'inscription', 'group', 'footnote', 'mentioned', 'verse-paraphrase', 'clause', 'margina', 'depositions', 'foonote', 'chapter', 'footn2ote', 'subscription', 'Verse', 'nomorph', 'fo1otnote', 'intro', 'prologue', 'reply', 'Episode', 'Katakeleusmos', 'margpinal', 'constellation', 'elegiacs', 'antikatakeleusmos', 'explanation', 'place', 'language', 'desc', 'footnote2', 'tetralogy', 'marginal', 'part', 'nomSac', 'Choral', 'katakeleusmenos', 'trochees', '*marturi/ai', 
'deposition', 'foo1tnote', 'month', 'marginalE', 'inscript', 'parabasis', 'marginapl', 'märginal', 'speaker', 'marginalC', 'subtitle', 'antipnigos', 'dact', 'suggestion', 'counter-plea', 'person', 'Extract', 'pnigos', 'direct', 'subtext', 'unspecified', 'katakeleusmos', 'textpart', 'term', 'emph', 'marginl', 'dialogue', '*diaqh=kai', 'close', 'Prologue', 'marginalp', 'Name', 'witness', 'terms', 'marginaDl', 'orig', '*ma/rtures', 'race', 'text', '*)epistolh/', 'header', 'footnot1e', 'foodnote', 'marinalΑB', 'iambic', '*(/orkoi', 'title', 'main', 'epode', 'book', 'marginaΣl', 'sub', 'choral', 'letter', 'oracle', 'Papyr', 'antikatakeleusmenos', 'marginael', 'paraphrase', 'iambics', 'antepirrhema', '*yh/fisma peri\\ *dwrea\\s toi=s a)po\\ *fulh=s', 'Exodus', 'drama', 'margipnal', 'lyric', 'fotnote', 'bibliography', 'spoken', 'lemma', 'Prose', 'margnal', '*no/moi', 'argument', 'epirrhema', 'edition', 'work', 'margina15vl'}
# Values of the `subtype` attribute (presumably; going by the name).
attrib_subtype = {'hexameter', 'Parabasis', 'tetrameters', 'anapests', 'antiprelude', 'source', 'sentence', 'comment', 'page', 'Letter', 'fragment', 'conspectus', 'Antepirrheme', 'commentary', 'TOC', 'subsection', 'section', 'quaestio', 'subdsection', 'toc', 'auctorm', 'index', 'fabula', ' chapter', 'epistle', 'ephymn.', 'Book', 'chapterer', 'preface', 'exordium', 'Agon', 'castlist', 'episode', 'Parodos', 'proagon', 'prelude', 'poem', 'monody', 'chap0ter', 'epirrheme', 'ephymnion', 'Antikatakeleusmos', 'chaptser', 'strophe', 'dramatispersonae', 'line', 'antepirrheme', 'Epirrheme', 'sectionn', 'Lyric-Scene', 'ii_loci', 'sction', 'sigla', 'auctores', 'chaspter', 'verse', 'antistrohe', 'Pnigos', 'ancient', 'Antipnigos', 'antistrophe', 'volume', 'dactyls', 'haeresis', 'wolfii', 'chapter', 'appendix', ' section', 'iii_loci', 'number', 'Episode', 'Katakeleusmos', 'paragraph', 'antikatakeleusmos', 'ephymn', 'aphorism', 'corrigenda', 'part', 'Choral', 'hypothesis', 'katakeleusmenos', 'trochees', 'subection', 'parabasis', 'essay', 'proode', 'autorum', 'antipnigos', 'pnigos', 'kommos', 'epigram', 'katakeleusmos', 'addenda', 'dialogue', 'close', 'Prologue', 'supplementa', 'setion', 'ducangii', 'setence', 'entry', 'chapter1', 'index.1', 'chapters', 'epode', 'book', 'epigraph', 'speech', 'loci', 'letter', 'choral', 'antikatakeleusmenos', 'iambics', 'trochaic', 'iv_loci', 'Exodus', 'type', 'lyric', 'index.2', 'homilia', 'work'}
# Tag names.
tag_names = {'head', 'pb', 'note', 'hi', 'lg', 'gap', 'div1', 'seg', 'div2', 'sic', 'del', 'add', 'milestone', 'title', 'q', 'div', 'p', 'l', 'lb', 'argument', 'sp', 'div3', 'num', 'quote', 'speaker', 'bibl', 'date', 'ab', 'lemma', 'foreign'} # Biblical and Patristic literature only


# Configuration of the TF director

The function `authorWork(path)` reads the author (or, if there is none, the editor) and the title from the header of a source file, so that the output can be stored under the right author and work. After that, every XML file is read and passed to `cv.walk()`; each successful run should produce a valid TF package.

In [None]:
def _harvestHeaderSection(data, section, metaTaglist, metaData):
    '''Collect the text found between `<section ...>` and `</section>`.

    data:        list of tags and text pieces of the file header
    section:     tag name of the section to read, e.g. 'titleStmt'
    metaTaglist: list to which the name of every tag met inside the
                 section is appended (closing tags as '/name')
    metaData:    dict that receives the text pieces, keyed by the name
                 of the tag that was seen last; a second piece for the
                 same key is appended after ', ' unless it is already
                 contained in the stored text

    Both containers are modified in place; nothing is returned.
    '''
    inside = False
    for elem in data:
        elem = elem.strip('{ ,.}')
        if elem == '':
            continue
        if elem.startswith('<body'):
            break
        if elem.startswith(f'<{section}'):
            inside = True
        elif elem.startswith(f'</{section}'):
            inside = False
        elif inside:
            if elem.startswith('<'):
                # the tag name ends at the first space or, without
                # attributes, at the closing bracket
                tag_split = elem.find(' ')
                if tag_split == -1:
                    tag_split = elem.find('>')
                metaTaglist.append(elem[1:tag_split])
            elif metaTaglist:
                # text that precedes every tag cannot be attributed to
                # a key and is skipped instead of raising IndexError
                tag = metaTaglist[-1]
                if tag not in metaData:
                    metaData[tag] = elem
                elif elem not in metaData[tag]:
                    metaData[tag] += f', {elem}'


def authorWork(path):
    '''Read the author and the title of a TEI file from its header.

    The lines up to the first one matched by `bodyStartRE` are cut into
    tags and text pieces. The text inside `<titleStmt>` is collected
    first; if that yields neither an author nor an editor, the text
    inside `<biblStruct>` is collected as well.

    path: location of the XML file, read as UTF-8

    Returns the tuple (author, book):
    author: the title-cased author, else the title-cased editor, else
            'undefined'
    book:   the title without '(Greek)', 'Machine readable text', dots
            and commas, stripped and title-cased; 'undefined' if the
            header contains no title
    '''
    metaTaglist = []
    metaData = {}

    with open(path, encoding='utf-8') as xml:
        header = ' '.join(
            line.strip()
            for line in takewhile(lambda line: not bodyStartRE.search(line), xml)
        )
    # put every tag and every text piece into its own list element
    data = header.replace('<', '#!#<').replace('>', '>#!#').split('#!#')

    _harvestHeaderSection(data, 'titleStmt', metaTaglist, metaData)
    if 'author' not in metaData and 'editor' not in metaData:
        _harvestHeaderSection(data, 'biblStruct', metaTaglist, metaData)

    if 'author' in metaData:
        author = metaData['author'].title()
    elif 'editor' in metaData:
        author = metaData['editor'].title()
    else:
        author = 'undefined'

    title = metaData.get('title')
    if title is None:
        # same placeholder as for a missing author instead of a KeyError
        book = 'undefined'
    else:
        book = title.replace('(Greek)', '').replace('.', '').replace(',', '').replace('Machine readable text', '').strip().title()
    return (author, book)

# Older driver loop; the convert() function covers the same ground.
# NOTE(review): this cell still depends on names that are not defined
# in the visible part of the file (SRC_DIR, TF_DIR and VERSION are only
# present in a commented-out cell; lemmatizer_open is not visible at
# all), and `Conversion(xmlfile)` does not match the constructor
# `Conversion(data, meta, ...)` - confirm whether the cell can be dropped.
COUNTER1 = 0    # number of input files
COUNTER2 = 0    # number of successfully converted files

# `glob` and `path` are used the way they are imported at the top of
# the file (`from glob import glob`, `from os import path`).
for xmlfile in glob(SRC_DIR+'/**/*grc*.xml', recursive=True):
# for xmlfile in glob(SRC_DIR + '/canonical-greekLit/data/tlg0059/tlg004/tlg0059.tlg004.perseus-grc2.xml'):
# for xmlfile in glob(SRC_DIR + '/tlg2042'+'/**/*grc*.xml', recursive=True):
# for xmlfile in glob(SRC_DIR + '/tlg0031/tlg004/tlg0031.tlg004.perseus-grc2.xml'):
# for xmlfile in glob(SRC_DIR + '/tlg0555/tlg002/tlg0555.tlg002.opp-grc1.xml'):
# for xmlfile in glob(SRC_DIR + '/tlg0555/tlg001/tlg0555.tlg001.opp-grc1.xml'):
# for xmlfile in glob(SRC_DIR +'/tlg0555/**/*grc*.xml', recursive=True):
    if COUNTER1 >= 1:
        print('\n\n')
    COUNTER1 += 1

    tm.info(f'parsing {xmlfile}\n')
    (author, book) = authorWork(xmlfile)
    # if the work already has an output directory, the first free
    # number is prefixed to the book name
    TF_PATH = f'{TF_DIR}/{author}/{book}/tf/{VERSION}'
    if path.isdir(TF_PATH):
        C = 1
        while path.isdir(f'{TF_DIR}/{author}/{C}_{book}/tf/{VERSION}'):
            C += 1
        TF_PATH = f'{TF_DIR}/{author}/{C}_{book}/tf/{VERSION}'
    TF = Fabric(locations=TF_PATH)
    cv = CV(TF)
    x = Conversion(xmlfile)
    slotType = 'word'
    good = cv.walk(
        x.director,
        x.slotType,
        otext=x.otext,
        generic=x.generic,
        intFeatures=x.intFeatures,
        featureMeta=x.featureMeta,
        warn=False,
    )
    if good:
        COUNTER2 += 1
tm.info(f'{COUNTER2} of {COUNTER1} works have successfully been converted!')
lemmatizer_open.close()

In [None]:
import re
kwargs = {'attrib_errors': {'Eusebus': 'Eusebius'}}
elem = '<div textpart = "   chapter" ref=" Eusebus   ">'

def attribClean(elem, **kwargs):
    '''Split a tag string into its name and a dict of its attributes.

    elem:   a tag including its angle brackets, e.g. '<div n="1">'
    kwargs: if the key 'attrib_errors' is given, it maps wrong
            attribute values to their corrected form

    Returns the tuple (tag, attribs). White space around '=' and
    around the attribute values is removed. Tags without attributes
    give an empty dict, and the closing '/' of a self-closing tag is
    ignored; a closing tag keeps its leading '/' in the tag name.
    '''
    # the strip set is '<', '>', backslash and space
    elem = elem.strip('<>\\ ')
    if elem.endswith('/'):
        # self-closing tag: the slash is not part of name or value
        elem = elem[:-1].rstrip(' ')
    elem = re.sub(r'\s*=\s*"\s*', '="', elem)
    parts = elem.split(None, 1)
    tag = parts[0] if parts else ''
    rest = parts[1] if len(parts) > 1 else ''
    attribs = {name: value.strip('" ')
               for name, value in re.findall(r'([^\s=]+)="([^"]*)"', rest)}
    corrections = kwargs.get('attrib_errors')
    if corrections:
        attribs = {name: corrections.get(value, value)
                   for name, value in attribs.items()}
    return (tag, attribs)

attribClean(elem, **kwargs)

In [None]:
# Scratch check: strip() with the set '<>/ ' removes angle brackets,
# slashes and spaces from both ends only, so this prints 'di t'.
s = '<di t/ >'
print(s.strip('<>/ '))

In [None]:
def attribClean(elem):
    '''Return the attributes of a tag string as a dict.

    elem: a tag including its angle brackets, e.g. '<div n="1">'

    White space around '=' and around the attribute values is removed.
    Tags without attributes give an empty dict and the closing '/' of
    a self-closing tag is ignored.

    NOTE: the corrections are read from the module-level name `kwargs`
    (key 'attrib_errors'), which has to exist when the function is
    called.
    '''
    elem = elem.strip('<> ')
    if elem.endswith('/'):
        # self-closing tag: the slash is not part of name or value
        elem = elem[:-1].rstrip(' ')
    elem = re.sub(r'\s*=\s*"\s*', '="', elem)
    parts = elem.split(None, 1)
    rest = parts[1] if len(parts) > 1 else ''
    attribs = {name: value.strip('" ')
               for name, value in re.findall(r'([^\s=]+)="([^"]*)"', rest)}
    if 'attrib_errors' in kwargs:
        corrections = kwargs['attrib_errors']
        attribs = {name: (corrections[value] if value in corrections else value)
                   for name, value in attribs.items()}
    return attribs

In [None]:
class Test():
    '''Small stand-in class to try out passing a class method around.'''
    name = 'tester'

    @classmethod
    def normalize(cls, text):
        '''Return `text` in lowercase, followed by the class attribute `name`.'''
        lowered = text.lower()
        return lowered + cls.name


# the bound class method is stored as a plain dict value
dictio = dict(norm=Test.normalize)

# settings dict that carries the class itself as 'lang_processor'
test = dict(
    lang='Custom',
    version='1.0',
    slot_type='word',
    udnorm='NFD',
    dir_struct=['author', 'book', 'editor'],
    sentence_delimit=['.', ';'],
    lang_processor=Test,
)

normalizer = dictio['norm']
print(normalizer('DiT IS een TeST'))

In [None]:
# Absolute import, like the other helpertools imports at the top of the
# file: a relative import cannot be resolved from a notebook cell,
# because the cell is not part of a package.
from helpertools.tokenizer import splitWord

splitWord('.,dit.is?!')

In [None]:
def tokenize(string):
    '''Cut a string into tokens at every space character.

    The string is stripped first; runs of spaces do not produce
    empty tokens. Other white space inside the string does not
    separate tokens.
    '''
    pieces = string.strip().split(' ')
    return [piece for piece in pieces if piece]

s = ' dit   is een   hele mond   vol  '
tokenize(s)

In [None]:
# Scratch check: find the dict key with the "largest" tuple as value.
dictio = {"a": (1,), "b": (1, 2, 3, 4), "n": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
# print(max(dictio, key=lambda key: len(v) for k, v in dictio.items() ))
# NOTE(review): the key function returns the tuple itself, so the tuples
# are compared element by element and not by length. With this data both
# orders give 'n', because every tuple is a prefix of the next one; the
# commented-out attempt above suggests that the length was intended.
max(dictio, key=lambda key: dictio[key])

In [None]:


# Scratch check: reduce the nested format definitions to a flat dict
# that maps every format name to its 'structure' string.
dictio = {
    'text_formats': {
        'orig': {
            'structure': '{orig} ',
            'metadata': 'original format of the word including punctuation',
        },
        'main': {
            'structure': '{main} ',
            'metadata': 'normalized format of the word excluding punctuation',
        },
        'norm': {
            'structure': '{norm} ',
            'metadata': 'normalized format (James Tauber) of the word excluding punctuation',
        },
        'plain': {
            'structure': '{plain} ',
            'metadata': 'plain format in lowercase',
        },
        'beta_plain': {
            'structure': '{beta_plain} ',
            'metadata': 'plain format in lowercase betacode (=Greek in Roman characters',
        },
        'lemma': {
            'structure': '{lemma} ',
            'metadata': 'possible lemmata of the original words',
        },
    },
}

dic = {name: spec['structure'] for name, spec in dictio['text_formats'].items()}
# one extra pair is added after the format names
dic[1] = 1
print(dic)



In [None]:
# Scratch check of plainLow() on a string that contains two forms
# separated by '/'.
# NOTE(review): plainLow is not defined in the visible part of the
# file; presumably it comes from the wildcard import of
# helpertools.unicodetricks - confirm.
plainLow('ἔχοντα/ἔχονται')

In [None]:
# Scratch check: print the elements from 'c' up to the end in reverse
# order, i.e. ['e', 'd', 'c'].
check = list('abcde')
index = check.index('c')
tail = check[index:]
print(list(reversed(tail)))

In [None]:
from pprint import pprint

# Scratch check: text format definitions keyed by their otext name;
# the name -> metadata overview is printed below.
langsettings = {
    'text_formats': {
        'fmt:text-orig-full': {
            'name': 'orig',
            'format': '{pre}{orig}{post} ',
            'metadata': 'original format of the word including punctuation',
        },
        'fmt:text-orig-main': {
            'name': 'main',
            'format': '{main} ',
            'metadata': 'normalized format of the word excluding punctuation',
        },
        'fmt:text-orig-norm': {
            'name': 'norm',
            'format': '{norm} ',
            'metadata': 'normalized format (James Tauber) of the word excluding punctuation',
        },
        'fmt:text-orig-plain': {
            'name': 'plain',
            'format': '{plain} ',
            'metadata': 'plain format in lowercase',
        },
        'fmt:text-orig-beta-plain': {
            'name': 'beta_plain',
            'format': '{beta_plain} ',
            'metadata': 'plain format in lowercase betacode (=Greek in Roman characters',
        },
        'fmt:text-orig-lemma': {
            'name': 'lemma',
            'format': '{lemma} ',
            'metadata': 'possible lemmata of the original words',
        },
    },
}
pprint({fmt['name']: fmt['metadata'] for fmt in langsettings['text_formats'].values()})

In [None]:
# Scratch check: fill an empty OrderedSet with several values at once
# by passing a tuple to update(), then print the result.
p = OrderedSet()
p.update(('a', 'b'))
print(p)

In [None]:
# Scratch check: create a nested dict under `key` if it is missing and
# update it in the same statement.
key = 'name'
value = 'value'
dictio = {}
# NOTE(review): `value` is a plain string here; dict.update() iterates
# over it and expects key/value pairs, so this line raises ValueError
# and the print below is not reached. Presumably a mapping or a list of
# pairs was meant - confirm the intended call.
dictio.setdefault(key, {}).update(value)
print(dictio)