In [None]:
# General imports
import re, pickle, betacode.conv
from os import path
from glob import glob
from pprint import pprint
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from collections import OrderedDict, namedtuple
from multiprocessing import Pool

# Text Fabric imports
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV

# Local imports ##TODO! Cleanup...
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from helpertools.xmlparser import xmlSplitter, dataParser, metadataReader, attribsAnalysis #, lenAttribsDict, sectionElems
from tf_config import langsettings, generic_metadata
from data.tlge_metadata import tlge_metadata
from data.attrib_errors import error_dict

In [None]:
class Conversion:
    """Shared base for the converters: holds the settings and tokenizes text.

    Every keyword argument is stored as an instance attribute. The methods
    below read the following attributes, which therefore have to be supplied
    (normally via the language settings in tf_config):

    * ``token_out``: dict ``{part: {'text': bool, 'description': str}}`` whose
      order matches the positions in the tuples produced by the tokenizer;
    * ``text_formats``: dict ``{name: {'function': callable, 'description': str}}``;
      the key ``'orig'`` is compulsory;
    * ``struct_counter_metadata``: dict ``{feature: description}``;
    * ``tokenizer``, ``tokenizer_args`` (with key ``'non_splitters'``) and
      ``replace_func``;
    * ``nonIntFeatures``: iterable of feature names that hold string values.
    """

    def __init__(self, data, **kwargs):
        self.data = data                                       # Data in preprocessed XML or CSV
        for setting, value in kwargs.items():                  # Set langsettings in tf_config as class attributes
            setattr(self, setting, value)                      # NB 'lang' defines the part of langsettings

        # Take a private copy BEFORE token_features() runs: that method adds
        # names to self.nonIntFeatures, and without the copy it would modify
        # the set object that was passed in (shared by every conversion).
        self.nonIntFeatures = set(kwargs.get('nonIntFeatures', ())) \
                              | set(self.token_out) \
                              | set(self.text_formats)

        # Define indexes of features in the token output of the tokenizer
        self.featuresInd = self.token_features(self.token_out)

        # Collect feature restricted metadata from tf_config
        self.featureMeta = {
            **{k: {'description': v['description']} for k, v in self.text_formats.items()},
            **{k: {'description': v['description']} for k, v in self.token_out.items()},
            **{k: {'description': v} for k, v in self.struct_counter_metadata.items()},
        }

        # Variables used in processing
        self.res_text = None    # Word fragment carried over from a text that ended with a non_splitter

    def token_features(self, token_out):
        """Return ``((index, part), ...)`` for the token parts flagged ``'text': False``.

        Side effect: every part name in ``token_out`` is added to
        ``self.nonIntFeatures``.
        """
        featuresInd = []
        for i, (part, value) in enumerate(token_out.items()):
            if value['text'] == False:  # explicit comparison kept from the original configuration contract
                featuresInd.append((i, part))
            # Add to nonIntFeatures, because all stringparts are expected to be non-ints
            self.nonIntFeatures.add(part)
        return tuple(featuresInd)

    def process_text(self, text):
        """Tokenize ``text`` and return a list with one dict per output token.

        Each dict holds a value for every text format in ``self.text_formats``
        and for every feature part listed in ``self.featuresInd``. When
        ``replace_func`` splits one token into several, only the first of them
        carries the original form under ``'orig'``; the others get ``''``.

        A text that ends with one of ``tokenizer_args['non_splitters']`` is
        treated as ending in a broken word: the trailing fragment is kept in
        ``self.res_text`` and prepended to the text of the next call.
        """
        text_output = []
        non_splitters = self.tokenizer_args['non_splitters']

        # Handle wordbreaks
        if self.res_text is not None:
            text, self.res_text = self.res_text + text, None
        if text.endswith(non_splitters):
            stripped = text.rstrip(''.join(non_splitters))
            before, space, fragment = stripped.rpartition(' ')
            if not space:
                # The whole text is one broken word: carry all of it over.
                # (Unpacking rsplit(' ', 1) raised ValueError in this case.)
                self.res_text = stripped
                return text_output
            text = before + ' '    # Add the space deleted by the split
            self.res_text = fragment

        # Process text
        # NB 'orig' is compulsory to have in self.text_formats!
        for t in self.tokenizer(text, **self.tokenizer_args):
            # Define original word
            origAssigned = False
            orig_word = self.text_formats['orig']['function'](t)

            # NB The replace_func can return multiple tokens if words are split like greek crasis forms
            for token in self.replace_func(t):
                token_processed = {}

                # Assign orig format to the first token only
                if not origAssigned:
                    token_processed['orig'] = orig_word
                    origAssigned = True
                else:
                    token_processed['orig'] = ''

                # Process text data
                for form, values in self.text_formats.items():
                    if form not in token_processed:  # Prevent the replacement of 'orig'
                        token_processed[form] = values['function'](token)

                # Process feature data
                for i, part in self.featuresInd:
                    token_processed[part] = token[i]

                # Append dict to output list
                text_output.append(token_processed)

        # Output list of dicts with text and feature data
        return text_output


In [None]:
class Csv2tf(Conversion):
    """Converter for tab-separated text: reference columns followed by a text column.

    Every line of ``data`` is split on tabs; the last field is the text and
    the fields before it are the reference (section values) of that line.
    Besides the attributes required by ``Conversion``, this class reads
    ``header``, ``generic``, ``struct_counter``, ``typ``, ``phrase_delimit``,
    ``sentence_delimit``, ``ignore_empty`` and ``intFeatures`` from the
    keyword arguments.
    """

    def __init__(self, data, **kwargs):
        super().__init__(data, **kwargs)
        self.head = self.get_header(self.header)

        # Section names: from the header row, else from the citation scheme
        # in the metadata, else asked from the user.
        if self.header == True:
            self.sections = self.head[:-1]
        elif 'citation_scheme' in self.generic:
            self.sections = list(filter(None, self.generic['citation_scheme'].lower().split('/')))
        else:
            self.sections = list(filter(None, input("No header data could be found; please enter an appropriate header: ").lower().split()))

        self.structs = tuple(('_book',) + tuple(self.head[:-1]) + tuple(self.struct_counter))

        # At most three section levels are listed: the first two and the last one
        if len(self.sections) > 2:
            section_levels = self.sections[:2] + [self.sections[-1]]
        else:
            section_levels = self.sections
        # NOTE(review): self.structs already starts with '_book', so the two
        # structure entries below contain '_book' twice. Kept as it was;
        # confirm against the Text-Fabric otext documentation whether the
        # duplicate is intended.
        self.otext = {
            **{v['otext_name']: v['format'] for k, v in self.text_formats.items()},
            'sectionTypes': f'{",".join(section_levels)}',
            'sectionFeatures': f'{",".join(section_levels)}',
            'structureTypes': f'_book,{",".join(self.structs)}',
            'structureFeatures': f'_book,{",".join(self.structs)}',
        }

        # Calculate metadata for struct levels
        for num, struct in enumerate(self.structs[1:], 1):
            self.featureMeta[struct] = {'description': f'structure feature of the {num}{"st" if num == 1 else ""}{"nd" if num == 2 else ""}{"rd" if num == 3 else ""}{"th" if num > 3 else ""} level',}

        # Handle tlg head text marked by {head}
        self.head_signs = {'start': {'{',},
                           'stop' : {'}',},}

    def get_header(self, head):
        """Return the lower-cased column titles of the data.

        ``head`` may be
        * ``False``: the titles are asked from the user;
        * a list or tuple: taken as the titles;
        * ``True``: the first line of ``self.data`` is the header row; it is
          removed from ``self.data``.

        The number of titles is checked against the number of tab-separated
        fields of the first data line; on a mismatch the user is asked again
        until the numbers agree. When there is no data line to compare with,
        the check is skipped.

        Raises ``ValueError`` for any other value of ``head``.
        """

        def check_header(measure, typed_input):
            # Iterative retry; the former recursive retry dropped its result
            # and made the caller receive None.
            typed_input = list(typed_input)
            while len(typed_input) != measure:
                print(f'The inputed number of header titles is {len(typed_input)}, while it should be {measure}')
                typed_input = list(filter(None, input("No header data could be found; please enter an appropriate header split by spaces:").split()))
            return [h.lower() for h in typed_input]

        def column_count():
            # Number of tab-separated fields of the first line; 0 without data
            return len(self.data[0].split('\t')) if self.data else 0

        if head == False:
            levels = column_count()
            if levels == 0:
                header = []
            else:
                header = check_header(levels, list(filter(None, input("No header data could be found; please enter an appropriate header: ").split())))
        elif isinstance(head, (list, tuple)):
            levels = column_count()
            header = check_header(levels, head) if levels else list(head)
        elif head == True:
            if self.data:
                # Drop the line ending so it does not end up in the last title
                header = self.data[0].rstrip('\r\n').split('\t')
                self.data = self.data[1:]
                if self.data:
                    header = check_header(column_count(), header)
                # else: header-only file, nothing to compare the header with
            else:
                header = []
        else:
            raise ValueError(f'unsupported header setting: {head!r} (expected True, False, a list or a tuple)')
        return [h.lower() for h in header]

    def _restart_counter(self, cv, cur, counter, name):
        """Close the running ``name`` node and open the next numbered one.

        Nothing happens when the running node has no slots linked to it.
        """
        if cv.linked(cur[name]):
            cv.terminate(cur[name])
            counter[name] += 1
            cur[name] = cv.node(name)
            cv.feature(cur[name], **{name: counter[name]})

    def _update_counters(self, cv, cur, counter, marks):
        """Restart the phrase and/or sentence node if ``marks`` contains a delimiter."""
        marks = set(marks)
        if marks & self.phrase_delimit:
            self._restart_counter(cv, cur, counter, '_phrase')
        if marks & self.sentence_delimit:
            for name in ('_phrase', '_sentence'):
                self._restart_counter(cv, cur, counter, name)

    def _emit_head(self, cv, cur, nonIntFeatures, content):
        """Create a 'head' node carrying ``content``; a previous head node is terminated first."""
        if 'head' in cur:
            cv.terminate(cur['head'])
        cur['head'] = cv.node('head')
        cv.feature(cur['head'], **{'head': content})
        cv.meta('head', description="head title",)
        nonIntFeatures.add('head')

    def director(self, cv):
        """Walker callback: feed all lines of ``self.data`` into ``cv``.

        Creates one '_book' node, one node per struct counter, section nodes
        per reference level and one slot per non-empty token. Returns
        ``False`` when no slot could be made and ``self.ignore_empty`` is set;
        otherwise returns ``None``.
        """
        nonIntFeatures = self.nonIntFeatures.copy()    # keep track of features that are not ints
        counter        = self.struct_counter.copy()    # keep track of calculated struct features defined in tf_config
        lemma_counter  = [0, 0]                 # [lemmatized, not lemmatized (lemma starts with '*')]
        cur            = {}                     # keep track of node number assignments
        # VARIABLES TO PROCESS PREPROCESSED TLG-E OUTPUT
        tlg_head       = False                  # if true text will be processed as head-feature
        tlg_head_cont  = ''                     # variable to store head text elements
        start_signs    = self.head_signs['start']
        stop_signs     = self.head_signs['stop']

        # Define bookname and start first node assignment to cur
        cur['_book'] = cv.node('_book')
        book_title = self.generic['title'] if 'title' in self.generic else 'no title found in metadata'
        book_title_full = self.generic['title_full'] if 'title_full' in self.generic else book_title
        cv.feature(cur['_book'], _book=book_title)
        cv.meta('_book', description=book_title_full)
        nonIntFeatures.add('_book')

        # Initiate counters
        for count in counter:
            cur[count] = cv.node(count)
            cv.feature(cur[count], **{count: counter[count]})

        w = False   # last assigned slot; stays False until the first slot is made

        # PROCESS CSV-DATA LINE BY LINE
        for line in self.data:
            # Split reference and text; NB text is always the last element!
            splitline = line.split('\t')
            ref = splitline[:-1]
            text = splitline[-1].strip()
            if not text.endswith(self.tokenizer_args['non_splitters']):
                text += ' '

            # The reference has to be looked at once for EVERY line. This flag
            # used to be initialised only before the loop, so that only the
            # reference of the first line ever produced section nodes.
            refAssigned = False

            # PROCESS TEXT
            # NB token_out is a dictionary with all the text/feature formats
            for token_out in self.process_text(text):

                # ------------------------------------------------
                # Handle TLG head titles (words enclosed in {...})
                if self.typ == 'tlge':
                    pre_marks  = set(token_out['pre'])
                    post_marks = set(token_out['post'])
                    if tlg_head:
                        if stop_signs & (pre_marks | post_marks):
                            # The head ends at this token. If the stop sign is
                            # in 'post', the token itself still belongs to the
                            # head; if it is only in 'pre', the token is
                            # processed as normal text below.
                            tlg_head = False
                            closes_in_post = bool(stop_signs & post_marks)
                            if closes_in_post:
                                content = tlg_head_cont + f"{token_out['orig']}"
                            else:
                                content = tlg_head_cont
                            self._emit_head(cv, cur, nonIntFeatures, content)
                            tlg_head_cont = ''
                            if closes_in_post:
                                continue
                        else:
                            # The token is fully part of the tlg head
                            tlg_head_cont += f"{token_out['orig']}"
                            continue
                    if not tlg_head:
                        if start_signs & pre_marks:
                            if stop_signs & post_marks:
                                # The whole head is in one token
                                self._emit_head(cv, cur, nonIntFeatures, f"{token_out['orig']}")
                                continue
                            tlg_head = True
                            tlg_head_cont += f"{token_out['orig']}"
                            continue
                        elif start_signs & post_marks:
                            # The head starts inside 'post': what precedes the
                            # start sign stays with the token, the rest opens
                            # the head. partition() splits at the first sign
                            # only, so a second sign cannot break the unpacking.
                            tlg_head = True
                            start_sign = ''.join(start_signs)
                            token_out['post'], _, tlg_head_cont = token_out['post'].partition(start_sign)
                            tlg_head_cont = start_sign + tlg_head_cont
                # End tlg heads
                # ----------------------------------------------

                # Handle empty tokens that still have a pre feature, by adding them to the previous post and orig
                if 'plain' in token_out and token_out['plain'] == '':
                    # Without a previous slot (w is still False) there is nothing to attach to
                    if 'pre' in token_out and 'post' in token_out and w:
                        try:
                            cv.resume(w)
                            pre = token_out['pre']
                            orig = cv.get('orig', w) + pre
                            cv.feature(w, orig=orig)
                            post = cv.get('post', w) + pre
                            cv.feature(w, post=post)
                            # Check phrase and sentence counters
                            self._update_counters(cv, cur, counter, pre)
                            cv.terminate(w)
                        except Exception:
                            # Best effort, as before: a token that cannot be
                            # merged into the previous slot is dropped. Only
                            # ordinary errors are swallowed now, so that e.g.
                            # KeyboardInterrupt still gets through.
                            pass
                    continue

                # HANDLE SECTIONING
                if refAssigned == False:
                    for ind, sec in enumerate(self.sections):
                        if sec in cur and cv.active(cur[sec]):          # Check whether section level is active
                            cur_sec = cv.get(sec, cur[sec])             # Get the current section value
                            new_sec = ref[ind]                          # Get the section value in the present line
                            if not cur_sec == new_sec:                  # Check whether the old and the new value are equal
                                for s in self.sections[:ind:-1]:        # If not, terminate all lower section levels
                                    if s in cur:
                                        cv.terminate(cur[s])
                                cv.terminate(cur[sec])                  # Terminate the current section level
                                cur[sec] = cv.node(sec)                 # Create new section node
                                cv.feature(cur[sec], **{sec: ref[ind]}) # Add new value to the new section node
                        else:                                           # In case the section is not present in cur OR not active
                            cur[sec] = cv.node(sec)                     # Create new section node
                            cv.feature(cur[sec], **{sec: ref[ind]})     # Add new value to section node
                        if not ref[ind].isdigit():                      # Check whether the value is not an int
                            nonIntFeatures.add(sec)                     # In case the value is no int, add the FEATURE to the set of nonIntFeatures
                    refAssigned = True

                # SLOT ASSIGNMENT!
                # ================
                w = cv.slot()
                # Handle the data dictionary with text formats and features
                for name, value in token_out.items():
                    cv.feature(w, **{name: value})
                # ================

                # Check phrase and sentence counters
                if 'post' in token_out:
                    self._update_counters(cv, cur, counter, token_out['post'])

                # Run lemma counter
                if 'lemma' in token_out:
                    if token_out['lemma'].startswith('*'):
                        lemma_counter[1] += 1
                    else:
                        lemma_counter[0] += 1

        # In case the csv-file has a header, but is empty: make one empty slot
        if not w:
            if self.ignore_empty == False:
                # NOTE(review): the feature nodes below are created after the
                # only slot has been made — confirm with the walker
                # documentation that such nodes get linked to a slot.
                w = cv.slot()
                for feat in self.nonIntFeatures:
                    cur[feat] = cv.node(feat)
                    cv.feature(cur[feat], **{feat: ''})
                for feat in self.intFeatures:
                    cur[feat] = cv.node(feat)
                    cv.feature(cur[feat], **{feat: ''})
            else:
                return False

        # Terminate structs (includes sections!)
        for ntp in self.structs[::-1]:
            if ntp in cur:
                cv.terminate(cur[ntp])
        # Terminate any remaining active nodes in cur
        for node in cur.values():
            cv.terminate(node)

        # Calculate lemmatizer coverage of lemmata
        if not lemma_counter == [0, 0]:
            cv.meta('lemma', **{'coverage_ratio': f'{round(lemma_counter[0] / ((lemma_counter[0] + lemma_counter[1]) / 100 ), 2)}%'})

        # Assign the correct valueType to features
        for feature in list(cv.metaData):
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            elif feature != "":
                cv.meta(feature, valueType='int')
            

In [None]:
class Tlg2tf(Csv2tf):
    """``Csv2tf`` with the language fixed to ``'greek'``."""

    def __init__(self, data, **kwargs):
        # The former signature took no parameters at all (not even self) and
        # passed self explicitly to super().__init__, so the class could not
        # be instantiated.
        kwargs['lang'] = 'greek'    # always greek, also when the caller passes another lang
        super().__init__(data, **kwargs)
        



In [None]:
class Xml2tf(Conversion):
    """Converter for XML input. The director is still a stub that does nothing."""

    def __init__(self, data, lang='generic', **kwargs):
        # super() already binds self; passing it again (and lang positionally)
        # raised a TypeError, because Conversion.__init__ accepts only `data`
        # as positional argument.
        super().__init__(data, lang=lang, **kwargs)

    def director(self, cv):
        """Placeholder: no nodes are created yet."""
        pass
        

In [None]:
def _metadata_dirs(dir_struct, metadat):
    """Return one directory name per level of ``dir_struct``.

    Every level is a sequence of metadata keys; the value of the first key
    present in ``metadat`` is used, otherwise 'unknown <keys joined by ->'.
    """
    dirs = []
    for candidates in dir_struct:
        for key in candidates:
            if key in metadat:
                dirs.append(metadat[key])
                break
        else:
            dirs.append(f'unknown {"-".join(candidates)}')
    return dirs


def _free_tf_path(outpath, dirs, version):
    """Return '<outpath>/<dirs>/<n>/tf/<version>' for the lowest n >= 1 that is not yet a directory."""
    number = 1
    while path.isdir(f'{outpath}/{"/".join(dirs)}/{number}/tf/{version}'):
        number += 1
    return f'{outpath}/{"/".join(dirs)}/{number}/tf/{version}'


def convert(
        input_path, 
        output_path,
        tlg_out         = False,
        ignore_empty    = True,
        generic         = generic_metadata,  # Generic metadata from tf_config
        lang            = 'generic',         # Chosen language as available in langsettings in tf_config
        typ             = False,             # Used to introduce subclases of a language; e.g. 'tlge' in addition to 'greek'
        header          = False,             # If True: first line of csv would be taken as header. Also tuple and list are allowed
        version         = '1.0',             # Version number to be added in the metadata of every tf-file
        langsettings    = langsettings,      # Reference to langsettings
        multiprocessing = False,             # Can be used if many files need to be converted. If 'True', the program checks number of available cores authomatically; if int, it will try to use that number of cores 
        chunksize       = 1,                 # Defines the number of files to be send to each core in multiprocessing mode
        silent          = False,             # Keeps TF messages silent
    
        ):
    '''Convert every .csv and .xml file below input_path into a Text-Fabric dataset.

    Arguments:
    input_path:      directory that is searched recursively for input files
                     ('~' is expanded); files with other extensions are skipped
    output_path:     directory below which the tf-files are written
    tlg_out:         if True, the output subdirectories of a csv file are the
                     space-separated parts of the 'key' entry of its metadata;
                     otherwise they follow 'dir_struct' of the language settings
    ignore_empty:    passed on to the converter (see Csv2tf.director)
    generic:         metadata shared by all files; it is not modified
    lang:            key of the language settings in langsettings
    typ:             subtype of the language, e.g. 'tlge'
    header:          True (first csv line is the header), False, or a
                     list/tuple with the column titles
    version:         version directory/number of the produced dataset
    langsettings:    dict with the settings per language; it is not modified
    multiprocessing: False, True (default number of processes) or an int
                     (that number of processes)
    chunksize:       number of files handed to a worker at a time
    silent:          passed on to Text-Fabric

    The per-file metadata of csv files is looked up in tlge_metadata by file
    name (without extension); a missing entry raises KeyError.

    NOTE(review): the xml branch depends on Xml2tf, whose director is still a
    stub in this notebook; it is kept, with its name errors repaired.
    '''
    tm           = Timestamp()
    # Work on a copy: the settings of the language are extended below and
    # must not leak into the shared langsettings.
    kwargs       = dict(langsettings[lang])
    dir_struct   = kwargs['dir_struct']
    sLemmatizer  = kwargs['lemmatizer']()

    # Add parameters to kwargs
    kwargs['ignore_empty'] = ignore_empty
    kwargs['generic']      = generic
    kwargs['lang']         = lang
    kwargs['typ']          = typ
    kwargs['header']       = header
    kwargs['version']      = version

    # input-output file management
    inpath = path.expanduser(input_path)
    outpath = path.expanduser(output_path)

    # Necessary to make process_file picklable for multiprocessing
    global process_file

    def process_file(file):
        '''Convert one file. Returns None if the file is skipped, else True/False for success.'''
        if file.endswith('.csv'):
            tm.info(f'parsing {file}')
            filename = path.splitext(path.basename(file))[0]
            with open(file, 'r', encoding='utf-8') as file_open:
                data = file_open.readlines()
            metadat = tlge_metadata[filename]
            # Fresh dict per file: updating `generic` in place made the
            # metadata of one file show up in all following files.
            file_generic = {**generic, **metadat}

            if tlg_out == True:
                dirs = file_generic['key'].split(' ')
            # definition of output dir structure on the basis of metadata
            else:
                dirs = _metadata_dirs(dir_struct, metadat)

            # in case of multiple editions of the same work, the next free number is used
            TF_PATH = _free_tf_path(outpath, dirs, version)

            # setting up the text-fabric engine
            TF = Fabric(locations=TF_PATH, silent=silent)
            cv = CV(TF, silent=silent)
            # initiating the Conversion class that provides all
            # necessary data and methods for cv.walk()
            x = Csv2tf(data, **{**kwargs, 'generic': file_generic})
            # running cv.walk() to generate the tf-files
            good = cv.walk(
                x.director,
                slotType=x.slot_type,
                otext=x.otext,
                generic=x.generic,
                intFeatures=x.intFeatures,
                featureMeta=x.featureMeta,
                warn=True,
            )
            if good:
                tm.info('   |    Conversion was successful...\n')
            else:
                tm.info('   |    Unfortunately, conversion was not successful...')
                if ignore_empty == True:
                    tm.info('   |    The most probable reason is that no slot numbers could be assigned...\n')
            return bool(good)

        elif file.endswith('.xml'):
            tm.info(f'parsing {file}')

            # creation of data to extract metadata
            # and to inject later into the Conversion object
            data = dataParser(xmlSplitter(file), lang=lang)
            body_index, metadat = metadataReader(data, lang=lang, **langsettings['metadata'])
            # NOTE(review): this line referred to an undefined name `metadata`;
            # the generic metadata is assumed to be meant — confirm.
            metadat.update(generic)

            # definition of output dir structure on the basis of metadata
            dirs = _metadata_dirs(dir_struct, metadat)
            TF_PATH = _free_tf_path(outpath, dirs, version)

            # setting up the text-fabric engine
            TF = Fabric(locations=TF_PATH, silent=silent)
            cv = CV(TF, silent=silent)
            # The metadata goes in as `generic` (read back below as x.generic).
            # It used to be passed positionally, where Xml2tf expects `lang`,
            # next to lang=lang: a TypeError. The settings of the chosen
            # language are passed instead of the whole langsettings dict,
            # since Conversion reads them as attributes.
            x = Xml2tf(data[body_index:], **{**kwargs, 'generic': metadat, 'sLemmatizer': sLemmatizer})
            # running cv.walk() to generate the tf-files
            good = cv.walk(
                x.director,
                slotType=x.slot_type,       # was the undefined name `slot_type`
                otext=x.otext,
                generic=x.generic,
                intFeatures=x.intFeatures,
                featureMeta=x.featureMeta,
                warn=True,
            )
            if good:
                tm.info(f'   |    Conversion of {file.split("/")[-1]} was successful...!\n')
            else:
                tm.info(f'   |    Unfortunately, conversion of {file.split("/")[-1]} was not successful...\n')
            return bool(good)

        return None

    # Define list of files to be processed
    file_list = glob(f'{inpath}/**/*.*', recursive=True)

    if multiprocessing:
        # True -> default number of processes; int -> manual assignment of cores
        processes = None if isinstance(multiprocessing, bool) else multiprocessing
        with Pool(processes=processes) as pool:
            # The results have to be consumed here: counters incremented in a
            # worker never reach the parent process, and an exception raised
            # in a worker only surfaces when its result is fetched.
            results = list(pool.imap_unordered(process_file, file_list, chunksize=chunksize))
    else:
        results = [process_file(file) for file in file_list]

    count1 = sum(result is not None for result in results)   # number of input files
    count2 = sum(result is True for result in results)       # number of successfully processed files
    tm.info(f'{count2} of {count1} works have successfully been converted!')
 
    



In [None]:
# def convert(input_path, output_path, lang='generic', typ=False, **kwargs):
#     # For how to change the kwargs arguments: https://stackoverflow.com/questions/44784577/in-method-call-args-how-to-override-keyword-argument-of-unpacked-dict
    
    
#     if typ == 'tlge':
#         kwargs['head_signs'] =  {'start': '{',
#                                  'stop': '}',}
        
#     #Check for original or preprocessed tlg-E files
    
#     elif typ == 'mss':
#         pass
    
        

In [None]:
# convert('~/github/tlgu-1/out/csv', 
#         '~/github/tlgu-1/out/csv/tf',
#         tlg_out=True,
#         lang='greek', 
#         typ='tlge', 
#         header=True,
#         multiprocessing=False,
#         chunksize=1,
#         silent=True,
#        )

# Debug run on the TLG-E test sample: greek csv files with a header row,
# processed serially and with the Text-Fabric messages switched off.
convert(
    input_path='~/github/tlgu-1/debug/test',
    output_path='~/github/tlgu-1/out/csv/tf',
    lang='greek',
    typ='tlge',
    header=True,
    tlg_out=True,
    multiprocessing=False,
    chunksize=50,
    silent=True,
)

# Test
# convert('~/github/tlgu-1/TEST/csv_test', '~/github/tlgu-1/TEST/csv_test/out', lang='greek', typ='tlge', header=True, multiprocessing=False, silent=True)

