In [None]:
# General imports
import re, pickle, betacode.conv
from os import path
from glob import glob
from pprint import pprint
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from collections import OrderedDict, namedtuple
from multiprocessing import Pool

# Text Fabric imports
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV

# Local imports ##TODO! Cleanup...
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from helpertools.xmlparser import xmlSplitter, dataParser, metadataReader, attribsAnalysis #, lenAttribsDict, sectionElems
from tf_config import langsettings, generic_metadata
from data.tlge_metadata import tlge_metadata
from data.attrib_errors import error_dict

In [None]:
class Conversion:
    """Base class holding the settings and text processing shared by the converters.

    Parameters
    ----------
    data : the data to convert (preprocessed XML or CSV).
    lang : name of the language; ``kwargs[lang]`` must be a dict of settings,
        each of which is stored as an attribute on the instance.
    typ : optional subspecification of the language (e.g. ``'tlge'``).
    **kwargs : language settings keyed by language name.

    Raises
    ------
    KeyError : if ``kwargs`` holds no settings for ``lang``.
    """

    def __init__(self, data, lang='generic', typ=None, **kwargs):
        self.data        = data                                # Data in preprocessed XML or CSV
        self.lang        = lang                                # Language
        self.typ         = typ                                 # Subspecification of language; e.g. tlge
        self.generic     = generic_metadata                    # Generic TF metadata
        for setting, value in kwargs[self.lang].items():       # Set langsettings in tf_config as class attributes
            setattr(self, setting, value)                      # NB 'lang' defines the part of langsettings

        # Names of features whose values are not integers. Work on a private
        # copy so a set supplied through the settings is not mutated, and make
        # sure the attribute exists before token_features() fills it.
        self.nonIntFeatures = set(getattr(self, 'nonIntFeatures', ()))

        # Define indexes of features in the token output of the tokenizer
        self.featuresInd = self.token_features(self.token_out)

        # Collect feature restricted metadata from tf_config
        self.featureMeta = {
            **{v['name']: {'description': v['metadata']} for v in self.text_formats.values()},
            **{k: {'description': v['metadata']} for k, v in self.token_out.items()},
        }

        # Variables used in processing
        self.res_text = None    # Handle text that ends with non_splitter

    def __repr__(self):
        """Return a listing of all instance attributes (one per line)."""
        lines = ['The current class attributes are:']
        lines.extend(f'{key:<20} = {value}' for key, value in self.__dict__.items())
        return '\n'.join(lines)

    def token_features(self, token_out):
        """Return ``(index, name)`` pairs of the token parts that are features.

        ``token_out`` maps part names to dicts with a ``'text'`` flag; parts
        whose flag is false are features. Every part name is also registered
        in ``self.nonIntFeatures``, because all string parts are expected to
        be non-ints.
        """
        featuresInd = []
        for i, (name, spec) in enumerate(token_out.items()):
            if not spec['text']:
                featuresInd.append((i, name))
            self.nonIntFeatures.add(name)
        return tuple(featuresInd)

    def process_text(self, text):
        """Tokenize ``text`` and return one dict of formats/features per token.

        A word that is broken off at the end of ``text`` (the text ends with
        one of ``self.non_splitters``) is held back in ``self.res_text`` and
        prepended to the text of the next call.
        """
        text_output = []

        # Handle wordbreaks
        if self.res_text is not None:
            text, self.res_text = self.res_text + text, None
        non_splitters = self.non_splitters
        if not isinstance(non_splitters, str):
            non_splitters = tuple(non_splitters)    # str.endswith needs str or tuple
        if non_splitters and text.endswith(non_splitters):
            stripped = text.rstrip(''.join(non_splitters))
            # rpartition also copes with text without any space: the whole
            # text is then carried over to the next call.
            text, _, self.res_text = stripped.rpartition(' ')

        # Process text
        # NB 'orig' is compulsory to have in self.text_formats!
        for t in self.tokenizer(text, **self.tokenizer_args):
            # Define original word
            origAssigned = False
            orig_word = self.text_formats['orig']['function'](t)

            # NB The replace_func can return multiple tokens if words are split like greek crasis forms
            for token in self.replace_func(t):
                token_processed = {}

                # TODO: tokens that have only features and no text are not yet
                # handled separately (the original check was left unfinished).

                # Assign orig format: only the first token of a split word carries it
                if not origAssigned:
                    token_processed['orig'] = orig_word
                    origAssigned = True
                else:
                    token_processed['orig'] = ''

                # Process text data ('orig' has been assigned above)
                for key, form in self.text_formats.items():
                    if key == 'orig':
                        continue
                    token_processed[form['name']] = form['function'](token)

                # Process feature data
                for i, part in self.featuresInd:
                    token_processed[part] = token[i]

                # Append dict to output list
                text_output.append(token_processed)

        return text_output
                
        


In [None]:
class Csv2tf(Conversion):
    """Converter for tab-separated data: reference columns followed by a text column.

    Parameters
    ----------
    data : sequence of lines; each line holds the section references and, as
        last tab-separated column, the text.
    lang : name of the language settings to use (see ``Conversion``).
    header : ``True`` if the first line of ``data`` is a header line, a list or
        tuple with the header titles, or ``False`` to be prompted for them.
    **kwargs : passed on to ``Conversion``.
    """

    def __init__(self, data, lang='generic', header=False, **kwargs):
        super().__init__(data, lang, **kwargs)
        self.header = self.get_header(header)

        if header == True:
            self.sections = list(self.header)
        elif 'citation_scheme' in self.generic:
            self.sections = list(filter(None, self.generic['citation_scheme'].split('/')))
        else:
            self.sections = list(filter(None, input("No header data could be found; please enter an appropriate header: ").split()))

        self.structs = ('_book',) + tuple(self.header) + tuple(self.struct_counter)

        # Only the first two and the last section level are used as sections
        section_levels = self.sections[:2] + [self.sections[-1]] if len(self.sections) > 2 else self.sections
        section_spec = ",".join(section_levels)
        # self.structs already starts with '_book', so it is not prefixed again
        struct_spec = ",".join(self.structs)
        self.otext = {
            **{k: v['format'] for k, v in self.text_formats.items()},
            'sectionTypes': section_spec,
            'sectionFeatures': section_spec,
            'structureTypes': struct_spec,
            'structureFeatures': struct_spec,
        }

        # Extend (rather than replace) the feature metadata collected by
        # Conversion with a description for every structure feature.
        for level, struct in enumerate(self.structs, 1):
            self.featureMeta[struct] = {'description': f'structure feature of the {level}{self._ordinal_suffix(level)} level',}

        # Handle tlg head text marked by {head}
        self.head_signs = {'start': {'{',},
                           'stop' : {'}',},}

    @staticmethod
    def _ordinal_suffix(num):
        """Return the suffix used in structure descriptions: 1st, 2nd, 3rd, 4th, ..."""
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(num, 'th')

    def _lang_setting(self, key):
        """Look up a language setting in ``self.langsettings`` or as plain attribute.

        Returns ``None`` if the setting cannot be found in either place.
        """
        settings = getattr(self, 'langsettings', None)
        if isinstance(settings, dict) and key in settings:
            return settings[key]
        return getattr(self, key, None)

    def get_header(self, head):
        """Determine the titles of the reference columns.

        head == False    : the titles are asked from the user (none if the data
                           has no reference columns).
        head list/tuple  : the given titles are used once their number matches
                           the number of reference columns; otherwise the user
                           is prompted for new titles.
        otherwise        : the first line of the data is taken as header line
                           and removed from ``self.data``.
        """
        prompt = "No header data could be found; please enter an appropriate header: "

        def ask_header():
            return list(filter(None, input(prompt).split()))

        def check_header(measure, typed_input):
            # Keep asking until the number of titles matches the number of columns
            while len(typed_input) != measure:
                print(f'The inputed number of header titles is {len(typed_input)}, while it should be {measure}')
                typed_input = ask_header()
            print('header successfully entered!')
            return typed_input

        columns = (self.data[0] if self.data else '').split('\t')
        levels = len(columns) - 1           # the last column holds the text

        if head == False:
            header = [] if levels == 0 else check_header(levels, ask_header())
        elif isinstance(head, (list, tuple)):
            header = check_header(levels, list(head))
        else:
            header = columns[:-1]
            self.data = self.data[1:]
        return header

    def director(self, cv):
        """Walk through ``self.data`` and feed nodes, slots and features to ``cv``.

        ``cv`` is the walker object on which node(), slot(), feature(), meta(),
        terminate(), active(), linked(), get() and activeTypes() are called.
        For every line the section nodes are updated from the reference
        columns, after which every token of the text becomes a slot. If
        ``self.typ == 'tlge'``, text between the head signs ({ and }) is not
        turned into slots but collected in the 'head' feature of a 'head' node.
        """
        nonIntFeatures = self.nonIntFeatures
        start_signs = self.head_signs['start']
        stop_signs = self.head_signs['stop']

        # [lemmas counted as covered, lemmas starting with '*']
        lemma_counter = [0, 0]
        cur = {}

        def head_text(tok):
            return f"{tok['pre']}{tok['orig']}{tok['post']}"

        def head_under_construction():
            # A head node that has no slots linked yet is still being filled
            return 'head' in cur and not cv.linked(cur['head'])

        def extend_head(content):
            cv.feature(cur['head'], **{'head': f"{cv.get('head', cur['head'])}{content}"})

        def open_head(content):
            if 'head' in cv.activeTypes() and cv.linked(cur['head']):
                cv.terminate(cur['head'])
            cur['head'] = cv.node('head')
            cv.feature(cur['head'], **{'head': content})
            cv.meta('head', description="head title",)
            nonIntFeatures.add('head')

        # Designate bookname and start first node assignment
        cur['_book'] = cv.node('_book')
        book_title = self.generic['title'] if 'title' in self.generic else 'no title found in metadata'
        book_title_full = self.generic['title_full'] if 'title_full' in self.generic else book_title
        cv.feature(cur['_book'], _book=book_title)
        cv.meta('_book', description=book_title_full)
        nonIntFeatures.add('_book')

        # State of the tlg head handling, kept across tokens and lines
        tlg_head = False    # True while the tokens belong to a head
        head_res = None     # head start found in the 'post' of the previous token

        for line in self.data:
            if not line.strip():
                continue                     # blank lines carry no reference or text
            splitline = line.split('\t')
            ref = splitline[:-1]
            text = splitline[-1].strip()

            # Handle sectioning
            for ind, sec in enumerate(self.sections):
                if sec in cur and cv.active(cur[sec]):
                    if cv.get(sec, cur[sec]) != ref[ind]:
                        # Close all deeper levels (deepest first) before renewing this one
                        for s in self.sections[:ind:-1]:
                            if s in cur:
                                cv.terminate(cur[s])
                        cv.terminate(cur[sec])
                        cur[sec] = cv.node(sec)
                        cv.feature(cur[sec], **{sec: ref[ind]})
                else:
                    cur[sec] = cv.node(sec)
                    cv.feature(cur[sec], **{sec: ref[ind]})
                if not ref[ind].isdigit():
                    nonIntFeatures.add(sec)

            # Process text
            # NB token_out is a dictionary with all the text/feature formats
            for token_out in self.process_text(text):

                # Handle TLG heads {head words}
                if self.typ == 'tlge':
                    if tlg_head:
                        if stop_signs & set(token_out['pre']):
                            # 'pre' has the head end sign: the token itself is normal text
                            tlg_head = False
                        else:
                            # The token is part of the head
                            content = (head_res or '') + head_text(token_out)
                            head_res = None
                            if head_under_construction():
                                extend_head(content)
                            else:
                                open_head(content)
                            # 'post' has the head end sign: this was the last head token
                            if stop_signs & set(token_out['post']):
                                tlg_head = False
                            continue

                    if not tlg_head:
                        if start_signs & set(token_out['pre']):
                            open_head(head_text(token_out))
                            # A one word head ({word}) is closed by its own 'post'
                            tlg_head = not (stop_signs & set(token_out['post']))
                            continue
                        elif start_signs & set(token_out['post']):
                            tlg_head = True
                            # Split 'post' at the first head start sign; the part from
                            # the sign onwards is kept for the head text.
                            post = token_out['post']
                            cut = min(post.index(sign) for sign in start_signs if sign in post)
                            token_out['post'], head_res = post[:cut], post[cut:]

                # TODO: handle empty slots that still have features, by adding them to the previous slot

                # SLOT ASSIGNMENT!
                w = cv.slot()
                # Handle the data dictionary with text formats and features
                cv.feature(w, **token_out)

                # Run lemma counter
                if 'lemma' in token_out:
                    if token_out['lemma'].startswith('*'):
                        lemma_counter[1] += 1
                    else:
                        lemma_counter[0] += 1

        # Terminate every open node once: structure nodes deepest first, then the rest
        for ntp in self.structs[::-1]:
            if ntp in cur:
                cv.terminate(cur[ntp])
        for ntp in cur:
            if ntp not in self.structs:
                cv.terminate(cur[ntp])

        total_lemmas = lemma_counter[0] + lemma_counter[1]
        if total_lemmas:
            cv.meta('lemma', **{'coverage_ratio': f'{round(lemma_counter[0] / (total_lemmas / 100), 2)}%'})

        # Describe the sentence/phrase features if their delimiters are configured
        sentence_delimit = self._lang_setting('sentence_delimit')
        if sentence_delimit is not None:
            cv.meta('_sentence', description=f"sentences defined by the following delimiters: {sentence_delimit}",)
        phrase_delimit = self._lang_setting('phrase_delimit')
        if phrase_delimit is not None:
            cv.meta('_phrase', description=f"phrases defined by the following delimiters: {phrase_delimit}",)

        # Set the value type of every feature; the "" entry is skipped
        for feature in list(cv.metaData):
            if feature in nonIntFeatures:
                cv.meta(feature, valueType='str')
            elif feature != "":
                cv.meta(feature, valueType='int')
            

In [None]:
class Tlg2tf(Csv2tf):
    """Csv2tf converter with the language fixed to 'greek'.

    Parameters
    ----------
    data : the data to convert, as accepted by ``Csv2tf``.
    header : header specification, as accepted by ``Csv2tf``.
    **kwargs : passed on to ``Csv2tf``; must hold the 'greek' settings.
    """

    def __init__(self, data, header=False, **kwargs):
        # NOTE(review): the head handling of Csv2tf.director only runs when
        # typ == 'tlge'; callers have to pass typ='tlge' themselves -- confirm
        # whether this class should set it by default.
        super().__init__(data, lang='greek', header=header, **kwargs)
        



In [None]:
class Xml2tf(Conversion):
    """Converter for preprocessed XML data (the director is not implemented yet)."""

    def __init__(self, data, lang='generic', **kwargs):
        super().__init__(data, lang, **kwargs)

    def director(self, cv):
        """Placeholder for the XML walker.

        Raises
        ------
        NotImplementedError : always, until the XML conversion is written.
        """
        raise NotImplementedError('Xml2tf.director has not been implemented yet')
        
        

In [None]:
def convert(input_path, output_path, lang='generic', typ=False, **kwargs):
    """Prepare the conversion settings for the given type of source (unfinished).

    For ``typ == 'tlge'`` the head signs are stored in ``kwargs['head_signs']``;
    ``typ == 'mss'`` has no special handling yet. Nothing is returned.
    """
    # For how to change the kwargs arguments:
    # https://stackoverflow.com/questions/44784577/in-method-call-args-how-to-override-keyword-argument-of-unpacked-dict

    if typ == 'mss':
        # No special handling yet
        return

    if typ == 'tlge':
        kwargs['head_signs'] = dict(start='{', stop='}')
        # TODO: check for original or preprocessed tlg-E files
    
        