
# TEI XML to Text-Fabric Convertor

XML-TEI text files can be converted to [Text-Fabric format](https://dans-labs.github.io/text-fabric/Model/File-formats/) with this convertor. It has been designed for Greek, but with minimal adjustments it should also work for other languages (except for the implemented lemmatizer, which is Greek-specific).

See this [readme](https://github.com/pthu/patristics) for more information about the corpus and this work.

See this [notebook](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) for a simple setup of a Text-Fabric (tf) conversion if you would like to build your own convertor.

In [None]:
import re
import collections
import pickle
import betacode.conv
# import ray

from multiprocessing import Pool
from pprint import pprint
from os import path
from glob import glob
from collections import OrderedDict, namedtuple
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV
from pprint import pprint
# from cltk.corpus.greek.beta_to_unicode import Replacer
from cltk.corpus.greek.alphabet import filter_non_greek
from greek_normalisation.normalise import Normaliser
# from greek_normalisation.norm_data import ELISIONS, MOVABLE

# Local imports
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from helpertools.xmlparser import xmlSplitter, dataParser, metadataReader, attribsAnalysis #, lenAttribsDict, sectionElems
from tf_config import langsettings
from data.tlge_metadata import tlge_metadata
from data.attrib_errors import error_dict
from convertor_metadata import convertor_metadata


In [None]:
# Module-level timer; xmlConversion.director() reports its final tag check through it.
tm = Timestamp()
# Multiprocessing via ray is currently disabled.
# ray.init()

class csvConversion:
    """Walk tab-separated text lines and emit Text-Fabric nodes and features.

    Every data line consists of tab-separated fields: all fields but the last
    are the section reference, the last field is the text of that section.
    An instance exposes ``slotType``, ``otext``, ``intFeatures``, ``generic``
    and ``featureMeta`` plus the ``director`` callback that does the walk.

    The keyword arguments are stored as ``self.langsettings``.  This class
    reads the keys ``slot_type``, ``text_formats``, ``udnorm``, ``tokenizer``,
    ``tokenizer_args`` (including ``non_splitters``), ``sentence_delimit``,
    ``phrase_delimit`` and, optionally, ``replace_func``.
    """

    def __init__(self, data, metadat, header=True, sLemmatizer=None, lang='generic', **kwargs):
        """Store the input and derive section names and text-format settings.

        Parameters:
            data:        sequence of tab-separated lines; when ``header`` is
                         True the first line names the sections
            metadat:     metadata dictionary of the text; ``citation_scheme``
                         is consulted for the section names if there is no header
            header:      whether ``data[0]`` is a header line
            sLemmatizer: lemmatizer object handed to the 'lemma' format function
            lang:        language name; 'greek' switches on betacode handling
            **kwargs:    language settings (see the class docstring)
        """
        self.data              = data
        self.lang              = lang
        self.langsettings      = kwargs
        self.metadata          = metadat
        self.header            = header

        # Section names: from the header line, else from the citation scheme
        # in the metadata, else asked from the user.
        if header == True:
            section_names = data[0].split('\t')[:-1]
        elif 'citation_scheme' in self.metadata:
            section_names = self.metadata['citation_scheme'].split('/')
        else:
            section_names = input("No header data could be found; please enter an appropriate header: ").split()
        self.sections          = list(filter(None, section_names))
        self.structs           = list(tuple(self.sections) + ('_sentence', '_phrase'))
        self.lemmatizer        = sLemmatizer

        # TF SPECIFIC VARIABLES
        self.slotType         = self.langsettings['slot_type']
        self.intFeatures      = set()
        self.generic          = metadat

        # Definition of text formats
        # With more than two sections, the first two and the last one are used
        # as the section levels.
        if len(self.sections) > 2:
            section_levels = self.sections[:2] + [self.sections[-1]]
        else:
            section_levels = self.sections
        section_spec = ",".join(section_levels)
        structure_spec = f'_book,{",".join(self.structs)}'
        self.otext = {
            **{k: v['format'] for k, v in self.langsettings['text_formats'].items()},
            'sectionTypes': section_spec,
            'sectionFeatures': section_spec,
            'structureTypes': structure_spec,
            'structureFeatures': structure_spec,
        }

        # These are the feature metadata that are present in all tf-packages to be produced...
        # Other metadata will be added during the run of the director()...
        self.featureMeta = {v['name']: {'description': v['metadata']}
                            for v in self.langsettings['text_formats'].values()}

    @staticmethod
    def _ordinal(num):
        """Return ``num`` with the suffix used in the level descriptions (1st, 2nd, 3rd, 4th, ...)."""
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num, 'th' if num > 3 else '')
        return f'{num}{suffix}'

    def _closes_phrase(self, post):
        """Tell whether ``post`` contains a phrase delimiter or a sentence delimiter."""
        delimiters = self.langsettings['phrase_delimit'] | self.langsettings['sentence_delimit']
        return bool(set(post) & delimiters)

    def director(self, cv):
        """Create all nodes and features for the data by driving the walker ``cv``.

        Per data line the section nodes are (re)opened when the reference
        changes, after which the text is tokenized and every word becomes a
        slot carrying the features defined in ``text_formats`` plus ``pre``
        and ``post``.  At the end every feature gets a ``valueType``.
        """
        nonIntFeatures = {'otype', 'oslots',}
        counter = dict(_sentence=1, _phrase=1)
        cur = {}
        lemma_counter = [0, 0]      # [lemmas not starting with '*', lemmas starting with '*']
        udnorm = self.langsettings['udnorm']
        w = None                    # most recent slot; None until the first word has been seen

        cur['_book'] = cv.node('_book')
        cv.feature(cur['_book'], _book=self.metadata['title'])
        cv.meta('_book', description=self.metadata['title_full'])
        nonIntFeatures.add('_book')

        if self.header == True:
            csv_data = self.data[1:]
        else:
            csv_data = self.data

        head = False
        res_text = None
        non_splitters = self.langsettings['tokenizer_args']['non_splitters']

        for line in csv_data:
            splitline = line.split('\t')
            ref = splitline[:-1]
            text = splitline[-1].strip()

            # Handle sectioning
            for ind, sec in enumerate(self.sections):
                num = ind + 1
                if sec in cur and cv.active(cur[sec]):
                    cur_sec = cv.get(sec, cur[sec])
                    new_sec = ref[ind]
                    if not cur_sec == new_sec:
                        # Close the deeper levels first, then this level itself
                        for s in self.sections[:ind:-1]:
                            cv.terminate(cur[s])
                        cv.terminate(cur[sec])
                        cur[sec] = cv.node(sec)
                        cv.feature(cur[sec], **{sec: ref[ind]})
                        cv.meta(sec, description=f'structure feature of the {self._ordinal(num)} level',)
                else:
                    cur[sec] = cv.node(sec)
                    cv.feature(cur[sec], **{sec: ref[ind]})
                    cv.meta(sec, description=f'structure feature of the {self._ordinal(num)} level',)
                if not ref[ind].isdigit():
                    nonIntFeatures.add(sec)

            # Handle text
            # A line that ends in a non-splitter hands its last (stripped) chunk
            # over to the start of the next line.
            if res_text is not None:
                text = res_text + text
                res_text = None
            if text.endswith(non_splitters):
                ptext = text.split(' ')
                res_text = ptext[-1].rstrip(''.join(non_splitters))
                text = ' '.join(ptext[:-1])

            if self.lang == 'greek':
                try:
                    # Pure ASCII text is treated as betacode
                    text.encode('ascii')
                    text = normalize(udnorm, betacode.conv.beta_to_uni(text))
                except UnicodeEncodeError:
                    text = normalize(udnorm, text)

            for token in self.langsettings['tokenizer'](text, **self.langsettings['tokenizer_args']):
                pre, origword, post = token

                # Handle headers
                if '{' in pre:
                    head = True

                if not plainLow(token[1]):
                    # Token without word content: only its `pre` part is used
                    if set(pre) & self.langsettings['sentence_delimit']:
                        if '_phrase' in cur:
                            cv.terminate(cur['_phrase'])
                            counter['_phrase'] +=1
                        if '_sentence' in cur:
                            cv.terminate(cur['_sentence'])
                            counter['_sentence'] +=1
                    if w is None:
                        # No previous word to attach the characters to
                        continue
                    cv.resume(w)
                    orig = cv.get('orig', w) + pre
                    try:
                        post = cv.get('post', w) + pre
                        cv.feature(w, post=post)
                    except Exception:
                        # Best effort: `post` of the previous word is left as it is
                        pass
                    cv.feature(w, orig=orig)
                    cv.terminate(w)
                    continue
                for s in ('_phrase', '_sentence'):
                    if s not in cv.activeTypes():
                        cur[s] = cv.node(s)
                        cv.feature(cur[s], **{s: counter[s]})

                token_norm = self.langsettings['replace_func'](token) \
                          if 'replace_func' in self.langsettings \
                          else token
                _, words, __ = token_norm

                if head:
                    if '}' in post:
                        head = False
                    if not '}' in pre:
                        content = normalize(udnorm, f'{pre}{origword}{post}')
                        if 'head' in cur and not cv.linked(cur['head']):
                            content = f"{cv.get('head', cur['head'])} {content}"
                            cv.feature(cur['head'], **{'head': content})
                            continue
                        if 'head' in cv.activeTypes() and cv.linked(cur['head']):
                            cv.terminate(cur['head'])
                        cur['head'] = cv.node('head')
                        cv.feature(cur['head'], **{'head': content})
                        cv.meta('head', description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                        nonIntFeatures.add('head')
                        continue
                    else:
                        head = False
                        if 'head' in cur and not cv.linked(cur['head']):
                            content = f"{cv.get('head', cur['head'])} {pre}"
                            cv.feature(cur['head'], **{'head': content})

                # The normalised token may consist of several words: one slot each
                words = tuple((('', word, '') for word in words.split(' ')))
                preAssigned = False
                origAssigned = False
                postAssigned = False
                for word in words:
                    w = cv.slot()
                    if not preAssigned:
                        cv.feature(w, pre=pre)
                        cv.meta('pre', description='pre gives non-letter characters at the start of a word',)
                        nonIntFeatures.add('pre')
                        preAssigned = True
                    if not postAssigned:
                        cv.feature(w, post=post)
                        cv.meta('post', description='post gives non-letter characters at the end of a word',)
                        nonIntFeatures.add('post')
                        postAssigned = True

                    for form in self.langsettings['text_formats'].values():
                        name = form['name']
                        func = form['function']
                        meta = form['metadata']
                        nonIntFeatures.add(name)

                        if name == 'orig':
                            # Only the first slot of a token carries the original form
                            if not origAssigned:
                                cv.feature(w, **{name: func(token)})
                                cv.meta(name, description=meta)
                                origAssigned = True
                            else:
                                cv.feature(w, **{name: ''})
                        elif name == 'lemma':
                            if len(words) == 1:
                                lemma = func(token, self.lemmatizer)
                            else:
                                lemma = func(word, self.lemmatizer)
                            cv.feature(w, **{name: lemma})
                            cv.meta(name, description=meta)
                            if lemma.startswith('*'):
                                lemma_counter[1] +=1
                            else:
                                lemma_counter[0] +=1
                        else:
                            cv.feature(w, **{name: func(word)})
                            cv.meta(name, description=meta)

                if post != '':
                    cv.feature(w, post=post)
                    cv.meta('post', description='post gives non-letter characters at the end of a word',)
                    nonIntFeatures.add('post')
                    # A phrase ends on a phrase delimiter or a sentence delimiter
                    if self._closes_phrase(post):
                        cv.terminate(cur['_phrase'])
                        counter['_phrase'] +=1
                    if set(post) & self.langsettings['sentence_delimit']:
                        cv.terminate(cur['_sentence'])
                        counter['_sentence'] +=1

        # Close the structure nodes from the deepest level upwards, then
        # everything else that was opened (such as '_book' and 'head').
        for ntp in self.structs[::-1]:
            if ntp in cur: cv.terminate(cur[ntp])
        for ntp in cur:
            cv.terminate(cur[ntp])

        if not lemma_counter == [0, 0]:
            cv.meta('lemma', **{'coverage_ratio': f'{round(lemma_counter[0] / ((lemma_counter[0] + lemma_counter[1]) / 100 ), 2)}%'})
        cv.meta('_sentence', description=f"sentences defined by the following delimiters: {self.langsettings['sentence_delimit']}",)
        cv.meta('_phrase', description=f"phrases defined by the following delimiters: {self.langsettings['phrase_delimit']}",)
        # Features for which no non-numeric value was registered are typed as int
        for feature in cv.metaData:
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            elif feature != "":
                cv.meta(feature, valueType='int')

        

In [None]:

class xmlConversion:
    """Walk parsed XML events and emit Text-Fabric nodes and features.

    ``data`` is iterated as ``(code, content)`` pairs; the codes handled by
    ``director`` are 'text', 'closeTag', 'openAttrTag', 'closedAttrTag',
    'openTag', 'openCloseTag', 'comment' and 'bodyStop'.  An instance exposes
    ``slotType``, ``otext``, ``intFeatures``, ``generic`` and ``featureMeta``
    plus the ``director`` callback that does the walk.

    The keyword arguments are stored as ``self.langsettings``.  This class
    reads the keys ``slot_type``, ``text_formats``, ``udnorm``, ``tokenizer``,
    ``tokenizer_args``, ``sentence_delimit``, ``phrase_delimit``,
    ``non_text_tags``, ``feature_attribs`` and, optionally, ``replace_func``.
    """

    def __init__(self, data, meta, sLemmatizer=None, lang='generic', **kwargs):
        """Store the input and derive section names and text-format settings.

        Parameters:
            data:        the parsed XML as a sequence of ``(code, content)`` pairs
            meta:        metadata dictionary of the text (``title`` is used)
            sLemmatizer: lemmatizer object handed to the 'lemma' format function
            lang:        language name; 'greek' switches on betacode handling
            **kwargs:    language settings (see the class docstring); they are
                         also passed on to ``attribsAnalysis``
        """
        self.data              = data
        self.lang              = lang
        self.langsettings      = kwargs
        # attribsAnalysis supplies the per-tag lookup table and the section names
        self.analyzed_dict, self.sections = attribsAnalysis(self.data, lang=self.lang, **kwargs)
        self.structs           = tuple(self.sections) + ('_sentence', '_phrase')
        self.metadata          = meta
        self.lemmatizer        = sLemmatizer

        # TF SPECIFIC VARIABLES
        self.slotType         = self.langsettings['slot_type']
        self.intFeatures      = set()
        self.generic          = meta

        # Definition of text formats
        # With more than two sections, the first two and the last one are used
        # as the section levels.
        if len(self.sections) > 2:
            section_levels = self.sections[:2] + [self.sections[-1]]
        else:
            section_levels = self.sections
        section_spec = ",".join(section_levels)
        structure_spec = f'_book,{",".join(self.structs)}'
        self.otext = {
            **{k: v['format'] for k, v in self.langsettings['text_formats'].items()},
            'sectionTypes': section_spec,
            'sectionFeatures': section_spec,
            'structureTypes': structure_spec,
            'structureFeatures': structure_spec,
        }

        # These are the feature metadata that are present in all tf-packages to be produced...
        # Other metadata will be added during the run of the director()...
        self.featureMeta = {v['name']: {'description': v['metadata']}
                            for v in self.langsettings['text_formats'].values()}

    @staticmethod
    def _ordinal(num):
        """Return ``num`` with the suffix used in the level descriptions (1st, 2nd, 3rd, 4th, ...)."""
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num, 'th' if num > 3 else '')
        return f'{num}{suffix}'

    def _closes_phrase(self, post):
        """Tell whether ``post`` contains a phrase delimiter or a sentence delimiter."""
        delimiters = self.langsettings['phrase_delimit'] | self.langsettings['sentence_delimit']
        return bool(set(post) & delimiters)

    def director(self, cv):
        """Create all nodes and features for the data by driving the walker ``cv``.

        Text events are tokenized into slots; tag events open and close nodes.
        ``tagList`` keeps the stack of open tags; whatever is left on it after
        the walk is reported as tag errors through the module-level timer ``tm``.
        """
        nonIntFeatures = {'otype', 'oslots',}
        counter = dict(_sentence=1, _phrase=1)
        cur = {}
        tagList = []
        linked_features_dict = {}   # tag name -> feature nodes to close together with that tag
        lemma_counter = [0, 0]      # [lemmas not starting with '*', lemmas starting with '*']
        udnorm = self.langsettings['udnorm']
        w = None                    # most recent word slot; None until the first word has been seen

        tagList.append('_book')
        cur['_book'] = cv.node('_book')
        cv.feature(cur['_book'], _book=self.metadata['title'])
        cv.meta('_book', description="bookname as given in the metadata part of the xml")
        nonIntFeatures.add('_book')

        for code, content in self.data:
            if code == 'text':
                if tagList[-1] in self.langsettings['non_text_tags']:
                    # Text inside a non-text tag becomes the value of that tag's feature
                    tag = tagList[-1]
                    content = normalize(udnorm, content)
                    if not cv.linked(cur[tag]):
                        content = f'{cv.get(tag, cur[tag])} {content}'
                        cv.feature(cur[tag], **{tag: content})
                    else:
                        cv.feature(cur[tag], **{tag: content})
                        cv.meta(tag, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                    nonIntFeatures.add(tag)
                    continue

                if self.lang == 'greek':
                    try:
                        # Pure ASCII text is treated as betacode
                        content.encode('ascii')
                        content = normalize(udnorm, betacode.conv.beta_to_uni(content))
                    except UnicodeEncodeError:
                        content = normalize(udnorm, content)
                if not set(self.structs) <= cv.activeTypes():
                    # Open every section level that is not active yet, with value 0
                    for struct in self.structs:
                        if struct not in cv.activeTypes() and struct not in {'_phrase', '_sentence'}:
                            cur[struct] = cv.node(struct)
                            cv.feature(cur[struct], **{struct: 0})

                for token in self.langsettings['tokenizer'](content, **self.langsettings['tokenizer_args']):
                    pre, _, post = token
                    if not plainLow(token[1]):
                        # Token without word content: only its `pre` part is used
                        if set(pre) & self.langsettings['sentence_delimit']:
                            if '_phrase' in cur:
                                cv.terminate(cur['_phrase'])
                                counter['_phrase'] +=1
                            if '_sentence' in cur:
                                cv.terminate(cur['_sentence'])
                                counter['_sentence'] +=1
                        if w is None:
                            # No previous word to attach the characters to
                            continue
                        cv.resume(w)
                        orig = cv.get('orig', w) + pre
                        try:
                            post = cv.get('post', w) + pre
                            cv.feature(w, post=post)
                        except Exception:
                            # Best effort: `post` of the previous word is left as it is
                            pass
                        cv.feature(w, orig=orig)
                        cv.terminate(w)
                        continue
                    for s in ('_phrase', '_sentence'):
                        if s not in cv.activeTypes():
                            cur[s] = cv.node(s)
                            cv.feature(cur[s], **{s: counter[s]})

                    token_norm = self.langsettings['replace_func'](token) \
                              if 'replace_func' in self.langsettings \
                              else token
                    _, words, __ = token_norm
                    # The normalised token may consist of several words: one slot each
                    words = tuple((('', word, '') for word in words.split(' ')))

                    preAssigned = False
                    origAssigned = False
                    postAssigned = False
                    for word in words:
                        w = cv.slot()
                        if not preAssigned:
                            cv.feature(w, pre=pre)
                            cv.meta('pre', description='pre gives non-letter characters at the start of a word',)
                            nonIntFeatures.add('pre')
                            preAssigned = True
                        if not postAssigned:
                            cv.feature(w, post=post)
                            cv.meta('post', description='post gives non-letter characters at the end of a word',)
                            nonIntFeatures.add('post')
                            postAssigned = True

                        for form in self.langsettings['text_formats'].values():
                            name = form['name']
                            func = form['function']
                            meta = form['metadata']
                            nonIntFeatures.add(name)

                            if name == 'orig':
                                # Only the first slot of a token carries the original form
                                if not origAssigned:
                                    cv.feature(w, **{name: func(token)})
                                    cv.meta(name, description=meta)
                                    origAssigned = True
                                else:
                                    cv.feature(w, **{name: ''})
                            elif name == 'lemma':
                                if len(words) == 1:
                                    lemma = func(token, self.lemmatizer)
                                else:
                                    lemma = func(word, self.lemmatizer)
                                cv.feature(w, **{name: lemma})
                                cv.meta(name, description=meta)
                                if lemma.startswith('*'):
                                    lemma_counter[1] +=1
                                else:
                                    lemma_counter[0] +=1
                            else:
                                cv.feature(w, **{name: func(word)})
                                cv.meta(name, description=meta)

                    if post != '':
                        cv.feature(w, post=post)
                        cv.meta('post', description='post gives non-letter characters at the end of a word',)
                        nonIntFeatures.add('post')
                        # A phrase ends on a phrase delimiter or a sentence delimiter
                        if self._closes_phrase(post):
                            cv.terminate(cur['_phrase'])
                            counter['_phrase'] +=1
                        if set(post) & self.langsettings['sentence_delimit']:
                            cv.terminate(cur['_sentence'])
                            counter['_sentence'] +=1

            elif code == 'closeTag':
                if tagList[-1] in self.sections:
                    index = self.sections.index(tagList[-1])
                    # Close the deeper section levels; a node that is not linked
                    # to any slot gets a slot first
                    for ntp in self.sections[:index:-1]:
                        if ntp in cur:
                            if not cv.linked(cur[ntp]):
                                cv.slot()
                            cv.terminate(cur[ntp])
                    if not cv.linked(cur[tagList[-1]]):
                        cv.slot()

                elif tagList[-1] in self.langsettings['non_text_tags']:
                    del tagList[-1]
                    continue
                if tagList[-1] in linked_features_dict:
                    for i in linked_features_dict[tagList[-1]]:
                        cv.terminate(cur[i])
                    del linked_features_dict[tagList[-1]]
                cv.terminate(cur[tagList[-1]])
                del tagList[-1]

            elif code in {'openAttrTag','closedAttrTag'}:
                tag_name, attribs = content
                value_key, name_keys = self.analyzed_dict[tag_name]
                value = attribs[value_key]
                if name_keys == 'tag':
                    name = tag_name[0]
                else:
                    name = '-'.join([attribs[key] for key in name_keys])
                if not value.isdigit():
                    nonIntFeatures.add(name)
                if code == 'openAttrTag':
                    tagList.append(name)
                if name in self.structs:
                    if name in cur and cv.get(name, cur[name]) == 0 and value.isdigit():
                        # A node that was opened with value 0 gets the number
                        # just before the one that starts now
                        cv.feature(cur[name], **{name: str(int(value) - 1)})
                    ind = self.structs.index(name)
                    for struct in self.structs[:ind:-1]:
                        if struct in cur:
                            if not cv.linked(cur[struct]):
                                cv.slot()
                            cv.terminate(cur[struct])
                    if name in cur:
                        if not cv.linked(cur[name]):
                            cv.slot()
                        cv.terminate(cur[name])
                    for struct in self.structs[:ind]:
                        if not struct in cv.activeTypes():
                            cur[struct] = cv.node(struct)
                            cv.feature(cur[struct], **{struct: 0})
                    cur[name] = cv.node(name)
                    cv.feature(cur[name], **{name: value})
                    # `ind` is 0-based; levels are described 1-based (1st, 2nd, ...)
                    cv.meta(name, description=f'structure feature of the {self._ordinal(ind + 1)} level',)
                else:
                    if name in cur and cv.linked(cur[name]):
                        cv.terminate(cur[name])
                        cur[name] = cv.node(name)
                        cv.feature(cur[name], **{name: value})
                    elif name in cur and not cv.linked(cur[name]):
                        pass
                    else:
                        cur[name] = cv.node(name)
                        cv.feature(cur[name], **{name: value})
                        cv.meta(name, description='no feature metadata have been provided; look at the name of the feature and at the data itself to get some clues')
                if set(attribs) & self.langsettings['feature_attribs']:
                    features = tuple(set(attribs) & self.langsettings['feature_attribs'])
                    for f in features:
                        if f in cur:
                            cv.terminate(cur[f])
                        cur[f] = cv.node(f)
                        cv.feature(cur[f], **{f: attribs[f]})
                        cv.meta(f, description='no feature metadata have been provided; look at the name of the feature and at the data itself to get some clues')
                        if not attribs[f].isdigit():
                            nonIntFeatures.add(f)
                        # Remember the feature node so that it is closed with its tag
                        linked_features_dict.setdefault(name, []).append(f)

            elif code == 'openTag':
                tag_name = content
                tagList.append(tag_name)
                if tag_name in self.langsettings['non_text_tags']:
                    nonIntFeatures.add(tag_name)
                    if not tag_name in cur:
                        cur[tag_name] = cv.node(tag_name)
                        cv.feature(cur[tag_name], **{tag_name: ''})
                        cv.meta(tag_name, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                    elif cv.linked(cur[tag_name]):
                        cv.terminate(cur[tag_name])
                        cur[tag_name] = cv.node(tag_name)
                        cv.feature(cur[tag_name], **{tag_name: ''})
                        cv.meta(tag_name, description="open tag without further specification. See the name of the .tf-file for it's meaning",)
                    # An existing node that is not linked to slots yet is reused
                    continue
                if tag_name in cur:
                    cv.terminate(cur[tag_name])
                counter[tag_name] = counter.get(tag_name, 0) + 1
                cur[tag_name] = cv.node(tag_name)
                cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                cv.meta(tag_name, description="open tag without further specification. See the name of the .tf-file for it's meaning",)

            elif code == 'openCloseTag':
                tag_name = content
                counter[tag_name] = counter.get(tag_name, 0) + 1
                if tag_name in cur: cv.terminate(cur[tag_name])
                cur[tag_name] = cv.node(tag_name)
                cv.feature(cur[tag_name], **{tag_name: counter[tag_name]})
                cv.meta(tag_name, description="open-close-tag without further specification. See the name of the .tf-file for it's meaning",)

            elif code == 'comment':
                continue

            elif code == 'bodyStop':
                # First everything that is neither a section nor the book ...
                for ntp in cur:
                    if not ntp in self.sections and not ntp == '_book':
                        cv.terminate(cur[ntp])
                # ... then the sections from the deepest level upwards, and the book
                for ntp in self.sections[::-1]:
                    if not cv.linked(cur[ntp]):
                        cv.slot()
                    cv.terminate(cur[ntp])
                cv.terminate(cur['_book'])
                if tagList:
                    del tagList[-1]
                break
        if not lemma_counter == [0, 0]:
            cv.meta('lemma', **{'coverage_ratio': f'{round(lemma_counter[0] / ((lemma_counter[0] + lemma_counter[1]) / 100 ), 2)}%'})
        cv.meta('_sentence', description=f"sentences defined by the following delimiters: {self.langsettings['sentence_delimit']}",)
        cv.meta('_phrase', description=f"phrases defined by the following delimiters: {self.langsettings['phrase_delimit']}",)
        # Features for which no non-numeric value was registered are typed as int
        for feature in cv.metaData:
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            elif feature != "":
                cv.meta(feature, valueType='int')
        # Final check of tags
        tm.indent(level=1)
        if len(tagList) == 0:
            tm.info('No tag mistake(s) found...')
        else:
            tm.info(str(len(tagList)) + ' tag error(s) found.')


In [None]:
def convert(input_path, output_path, lang='generic',
            version='2.0', metadata=convertor_metadata, langsettings=langsettings):
    '''The convert function is the core of the tei2tf module

    Every file matching <input_path>/**/*.* is handed to a multiprocessing
    pool. Files ending in .csv or .xml are converted to tf-files; all
    other files are skipped silently.

    It takes the following arguments:
    input_path:   the path that contains the source texts ('~' is expanded)
    output_path:  the path below which the tf-files are written ('~' is expanded)
    lang:         the key that selects the configuration within langsettings
    version:      the last component of the tf output directory
    metadata:     a dictionary that is merged into the metadata of every text
    langsettings: a dictionary with one configuration per language, usually
                  derived from tf_config; the configuration selected by lang
                  has to provide 'slot_type', 'dir_struct', 'lemmatizer'
                  and, for .xml input, 'metadata'

    Nothing is returned; progress is reported by means of tm.info().
    '''

    tm           = Timestamp()
    langsettings = langsettings[lang]
    slot_type    = langsettings['slot_type']
    dir_struct   = langsettings['dir_struct']
    sLemmatizer  = langsettings['lemmatizer']()

    # input-output file management
    inpath  = path.expanduser(input_path)
    outpath = path.expanduser(output_path)

    def tf_location(metadat):
        '''Return the first tf output directory that does not exist yet'''
        # Every entry of dir_struct is a sequence of alternative metadata keys
        # for one directory level (usually something like author, work,
        # editor/edition); the first key that is present in metadat is used.
        dirs = []
        for alternatives in dir_struct:
            for key in alternatives:
                if key in metadat:
                    dirs.append(metadat[key])
                    break
            else:
                dirs.append(f'unknown {"-".join(alternatives)}')
        base = f'{outpath}/{"/".join(dirs)}'

        # In case of multiple editions of the same work, the editions are numbered.
        # NOTE(review): the check is not atomic; two workers that handle editions
        # of the same work at the same moment may pick the same number.
        number = 1
        while path.isdir(f'{base}/{number}/tf/{version}'):
            number += 1
        return f'{base}/{number}/tf/{version}'

    def run_walk(conversion_class, data, metadat):
        '''Set up the text-fabric engine and run cv.walk(); return its result'''
        TF = Fabric(locations=tf_location(metadat), silent=True)
        cv = CV(TF, silent=True)
        # the Conversion object provides all necessary data
        # and methods for cv.walk()
        x = conversion_class(data, metadat, sLemmatizer=sLemmatizer, lang=lang, **langsettings)
        return cv.walk(
            x.director,
            slotType=slot_type,
            otext=x.otext,
            generic=x.generic,
            intFeatures=x.intFeatures,
            featureMeta=x.featureMeta,
            warn=False,
        )

    # Pool has to pickle the worker function, and functions are pickled by
    # name; a purely local function cannot be looked up that way, hence the
    # module-level name.
    # NOTE(review): this presumably relies on the 'fork' start method, because
    # the name only exists after convert() has been called -- confirm.
    global process_file

    def process_file(file):
        '''Convert one source file; files other than .csv/.xml are ignored'''
        if file.endswith('.csv'):
            tm.info(f'parsing {file}\n')
            filename = path.splitext(path.basename(file))[0]
            # the file is only needed for reading; close it before converting
            with open(file, 'r', encoding='utf-8') as file_open:
                data = file_open.readlines()
            # work on a copy, so the shared tlge_metadata entry is not altered
            metadat = tlge_metadata[filename].copy()
            metadat.update(metadata)

            if run_walk(csvConversion, data, metadat):
                tm.info('Conversion was successful...\n')
            else:
                tm.info('Unfortunately, conversion was not successful...\n')

        elif file.endswith('.xml'):
            tm.info(f'parsing {file}\n')

            # creation of data to extract metadata
            # and to inject later into the Conversion object
            data = dataParser(xmlSplitter(file), lang=lang)
            body_index, metadat = metadataReader(data, lang=lang, **langsettings['metadata'])
            metadat.update(metadata)

            name = path.basename(file)
            if run_walk(xmlConversion, data[body_index:], metadat):
                tm.info(f'Conversion of {name} was successful...!\n')
            else:
                tm.info(f'Unfortunately, conversion of {name} was not successful...\n')

    # Looping through the inpath and running the tf-conversion
    file_list = glob(f'{inpath}/**/*.*', recursive=True)

    # leaving the with-block terminates the workers, also when map() raises
    with Pool() as pool:
        pool.map(process_file, file_list)
        pool.close()
        pool.join()
        
        
#     tm.info(f'{count2} of {count1} works have successfully been converted!')
 
    


In [None]:
# convert('~/github/pthu/sources/pt', '~/github/pthu/out', lang='greek')
# convert('~/github/pthu/sources/sourcetexts', '~/github/pthu/out/sources', lang='greek')
# convert('~/github/pthu/sources/sourcetexts/First1KGreek', '~/github/pthu/greek_literature/Open Greek and Latin Project', lang='greek')
# convert('~/github/pthu/sources/sourcetexts/canonical-greekLit', '~/github/pthu/greek_literature/Perseus Digital Library', lang='greek')
# convert('~/github/pthu/sources/sourcetexts/canonical-greekLit/data/tlg0007/tlg081', '~/github/pthu/out/sources', lang='greek')

In [None]:
# Convert the library of the Open Greek and Latin Project
# convert('~/github/pthu/sources/sourcetexts/First1KGreek', '~/github/pthu/greek_literature/Open Greek and Latin Project', lang='greek')

In [None]:
# Convert the library of the Perseus Digital Library
# convert('~/github/pthu/sources/sourcetexts/canonical-greekLit', '~/github/pthu/greek_literature/Perseus Digital Library', lang='greek')

In [None]:
# Convert csv files
# convert('~/github/tlgu-1/TEST/csv_test', '~/github/tlgu-1/TEST/csv_test/out', lang='greek')
# convert('~/github/tlgu-1/TEST/test/', '~/github/tlgu-1/TEST/test/out', lang='greek')
# convert('~/github/tlgu-1/out/csv', '~/github/tlgu-1/out/tf', lang='greek')

In [None]:
# Convert manuscripts in XML (Münster & Birmingham)
# Runs the conversion of the manuscript test set with the 'greek_ntmss' settings.
convert(
    input_path='~/github/pthu/sources/manuscripts/test/subtest',
    output_path='~/github/pthu/sources/manuscripts/test/out',
    lang='greek_ntmss',
)

In [None]:
# Scratch cell: print the elements after position ind in reversed order.
a = (1, 2, 3, 4, 5)
ind = 0
print(tuple(reversed(a[ind + 1:])))

In [None]:
# Scratch cell: show what splitPunc returns for a string with brackets,
# punctuation and placeholders (splitPunc presumably comes from
# helpertools.unicodetricks -- verify).
s = '{Ζα !!![_ _ _]}'
split_result = splitPunc(s)
print(split_result)

        
    