In [None]:
# General imports
import re, pickle, betacode.conv
from os import path
from glob import glob
from pprint import pprint
from itertools import takewhile
from ordered_set import OrderedSet
from unicodedata import category, normalize
from collections import OrderedDict, namedtuple
from multiprocessing import Pool

# Text Fabric imports
from tf.fabric import Fabric, Timestamp
from tf.convert.walker import CV

# Local imports ##TODO! Cleanup...
from helpertools.lemmatizer import lemmatize
from helpertools.unicodetricks import *
from helpertools.xmlparser import xmlSplitter, dataParser, metadataReader, attribsAnalysis #, lenAttribsDict, sectionElems
from tf_config import langsettings
from data.tlge_metadata import tlge_metadata
from data.attrib_errors import error_dict
from convertor_metadata import convertor_metadata

In [None]:
class Conversion:
    """Base class with the settings and tokenisation logic shared by the converters.

    The language settings passed in ``kwargs[lang]`` are copied onto the
    instance as attributes. The methods below read the following of them:
    ``token_out``, ``tokenizer``, ``tokenizer_args``, ``replace_func``,
    ``text_formats`` and ``non_splitters`` (and ``nonIntFeatures`` if present).
    """

    def __init__(self, data, lang='generic', typ=False, **kwargs):
        """Store the input and unpack the settings of the chosen language.

        Parameters:
            data:   the raw input to convert (stored as-is).
            lang:   key into ``kwargs`` that selects the settings dictionary.
            typ:    optional type tag of the source (stored as-is).
            kwargs: settings dictionaries keyed by language name.

        Raises:
            KeyError: if ``kwargs`` has no entry for ``lang``.
        """
        self.data = data
        self.lang = lang
        self.typ = typ

        # Set all keys in langsettings (tf_config.py) as class attributes
        # NB Make sure that metadata are uploaded to langsettings somewhere in the process!
        for setting, value in kwargs[self.lang].items():
            setattr(self, setting, value)

        # Define the index number of the parts of the token that represent the text and features
        # NOTE(review): token_out is treated as a mapping {part name: spec dict};
        # this is the only reading under which the part can serve both as the
        # owner of a 'text' flag and as a feature name -- confirm against tf_config.
        text_ind = None
        features_ind = []
        for i, (name, spec) in enumerate(self.token_out.items()):
            if spec['text']:
                text_ind = i
            else:
                features_ind.append((i, name))

        self.textInd = text_ind
        self.featuresInd = tuple(features_ind)

        # Variables used in processing
        self.res_text = None    # word fragment carried over to the next call of process_text()
        self.tlg_head = False

        # Add token features to nonIntFeatures set (create the set if the
        # settings did not provide one)
        if hasattr(self, 'nonIntFeatures'):
            self.nonIntFeatures.update(self.token_out)
        else:
            self.nonIntFeatures = set(self.token_out)

    def __repr__(self):
        """Return an overview of all instance attributes, one per line."""
        lines = ['The current class attributes are:']
        lines.extend(f'{key:<20} = {value}' for key, value in self.__dict__.items())
        return '\n'.join(lines)

    def process_text(self, text):
        """Tokenise ``text`` and return a list with one feature dict per token.

        If ``text`` ends with one of ``self.non_splitters`` the last word is
        considered broken off: it is withheld in ``self.res_text`` and glued in
        front of the text of the next call.

        Each returned dict holds
        * ``'orig'`` (only for the first token that ``replace_func`` yields for
          a tokenizer item),
        * one entry per non-'orig' text format, keyed by the format's ``name``,
        * one entry per non-text part of the token (see ``self.featuresInd``).
        """
        text_output = []

        # Handle wordbreaks
        if self.res_text is not None:
            text, self.res_text = self.res_text + text, None
        if text.endswith(self.non_splitters):
            stripped = text.rstrip(''.join(self.non_splitters))
            # rpartition also copes with a text that has no space at all: the
            # whole text is then withheld and nothing is tokenised this round.
            text, _, self.res_text = stripped.rpartition(' ')

        # Process text
        # NB 'orig' is compulsory to have in self.text_formats!
        orig_func = self.text_formats['orig']['function']
        for t in self.tokenizer(text, **self.tokenizer_args):
            orig_word = orig_func(t)
            orig_assigned = False

            # NB The replace_func can return multiple tokens if words are split like greek crasis forms
            for token in self.replace_func(t):
                token_processed = {}
                if not orig_assigned:
                    token_processed['orig'] = orig_word
                    orig_assigned = True

                # Process text formats ('orig' has been dealt with above)
                # NOTE(review): the format function receives the whole token,
                # as the 'orig' function does -- confirm this is intended.
                for form, spec in self.text_formats.items():
                    if form != 'orig':
                        token_processed[spec.get('name', form)] = spec['function'](token)

                # Copy the non-text parts of the token as features
                for i, name in self.featuresInd:
                    token_processed[name] = token[i]

                text_output.append(token_processed)

        return text_output
                
            



In [None]:
class Csv2tf(Conversion):
    """Converter for tab-separated input: reference columns followed by one text column.

    Every line of ``data`` is split on tabs; the last column is the text, the
    preceding columns are the section references. Besides the settings used
    by the base class this class reads ``self.metadata``,
    ``self.struct_counter`` and ``self.text_formats`` (and ``self.head_signs``
    when ``typ == 'tlge'``).
    """

    _HEADER_PROMPT = "No header data could be found; please enter an appropriate header: "

    def __init__(self, data, lang='generic', header=False, **kwargs):
        """Set up header, sections and the Text-Fabric ``otext``/feature metadata.

        Parameters:
            data:   list of tab-separated lines.
            lang:   key of the language settings in ``kwargs``.
            header: ``False`` to ask the user for a header, a list/tuple to
                    supply one, anything else truthy to read it from the
                    first line of ``data``.
            kwargs: passed on to ``Conversion.__init__``.
        """
        super().__init__(data, lang, **kwargs)
        self.header = self.get_header(header)

        # Sections come from the file header only when header is exactly True;
        # otherwise from the citation scheme in the metadata, or else from the user.
        if header == True:
            self.sections = self.header
        elif 'citation_scheme' in self.metadata:
            self.sections = list(filter(None, self.metadata['citation_scheme'].split('/')))
        else:
            self.sections = list(filter(None, input(self._HEADER_PROMPT).split()))

        self.structs = [*self.header, *self.struct_counter]

        # Text-Fabric section config: the first two levels plus the last one
        # when there are more than two levels.
        secs = list(self.sections)
        section_spec = ','.join(secs[:2] + [secs[-1]] if len(secs) > 2 else secs)
        struct_spec = f'_book,{",".join(self.structs)}'
        self.otext = {
            **{k: v['format'] for k, v in self.text_formats.items()},
            'sectionTypes': section_spec,
            'sectionFeatures': section_spec,
            'structureTypes': struct_spec,
            'structureFeatures': struct_spec,
        }
        self.featureMeta = {v['name']: {'description': v['metadata']}
                            for k, v in self.text_formats.items()}

    @classmethod
    def _ask_header(cls):
        """Prompt the user for whitespace-separated header titles."""
        return list(filter(None, input(cls._HEADER_PROMPT).split()))

    @classmethod
    def _check_header(cls, measure, typed_input):
        """Return ``typed_input`` once it has ``measure`` titles, re-prompting until it does."""
        while len(typed_input) != measure:
            print(f'The inputed number of header titles is {len(typed_input)}, while it should be {measure}')
            typed_input = cls._ask_header()
        print('header successfully entered!')
        return typed_input

    def get_header(self, head):
        """Determine the header titles of the reference columns.

        * ``head`` is a list/tuple: use it, after checking its length.
        * ``head`` is falsy: ask the user (or return ``[]`` when the data has
          no reference columns).
        * otherwise: take the header from the first line of ``self.data`` and
          remove that line from the data.
        """
        if not self.data:
            return []

        # Number of reference columns: the last column holds the text.
        levels = len(self.data[0].split('\t')) - 1

        if isinstance(head, (list, tuple)):
            return self._check_header(levels, list(head))
        if not head:
            if levels == 0:
                return []
            return self._check_header(levels, self._ask_header())

        header = self.data[0].split('\t')[:-1]
        self.data = self.data[1:]
        return header

    @staticmethod
    def _ordinal(num):
        """Return ``num`` with the suffix used in the section descriptions (1st, 2nd, 3rd, 4th...)."""
        return f'{num}{"st" if num == 1 else ""}{"nd" if num == 2 else ""}{"rd" if num == 3 else ""}{"th" if num > 3 else ""}'

    def _open_section(self, cv, cur, sec, value, num):
        """Create a node for section ``sec`` with value ``value`` and record it in ``cur``."""
        cur[sec] = cv.node(sec)
        cv.feature(cur[sec], **{sec: value})
        cv.meta(sec, description=f'structure feature of the {self._ordinal(num)} level',)

    def director(self, cv):
        """Walk through ``self.data`` and feed nodes and features to the TF walker ``cv``.

        For every line the section nodes are opened/closed according to the
        reference columns, and every token returned by ``process_text`` becomes
        a slot carrying the token's features.
        """
        nonIntFeatures = self.nonIntFeatures
        cur = {}

        # Reset the state that process_text() carries over from line to line
        self.tlg_head = False
        self.res_text = None

        # Designate bookname and start first node assignment
        cur['_book'] = cv.node('_book')
        cv.feature(cur['_book'], _book=self.metadata['title'])
        cv.meta('_book', description=self.metadata['title_full'])
        nonIntFeatures.add('_book')

        for line in self.data:
            splitline = line.split('\t')
            ref = splitline[:-1]
            text = splitline[-1].strip()

            # Handle sectioning
            for ind, sec in enumerate(self.sections):
                if sec in cur and cv.active(cur[sec]):
                    if cv.get(sec, cur[sec]) != ref[ind]:
                        # A changed level closes all deeper levels first
                        # (deepest first), then the level itself.
                        for s in self.sections[:ind:-1]:
                            cv.terminate(cur[s])
                        cv.terminate(cur[sec])
                        self._open_section(cv, cur, sec, ref[ind], ind + 1)
                else:
                    self._open_section(cv, cur, sec, ref[ind], ind + 1)
                if not ref[ind].isdigit():
                    nonIntFeatures.add(sec)

            # Process text
            # NOTE(review): the loop body was missing in the original; one slot
            # per token with the token dict as features is assumed -- confirm.
            for token_out in self.process_text(text):
                slot = cv.slot()
                cv.feature(slot, **token_out)

                # Handle TLG heads {head words}
                # NOTE(review): assumes the punctuation around a word is stored
                # under the token keys 'pre' and 'post' -- confirm.
                if self.typ == 'tlge':
                    marks = set(token_out.get('pre') or '') | set(token_out.get('post') or '')
                    if set(self.head_signs['start']) & marks:
                        self.tlg_head = True
                    if set(self.head_signs['stop']) & marks:
                        self.tlg_head = False

                cv.terminate(slot)

        # Close whatever is still open so that no node is left unterminated
        for sec in reversed(self.sections):
            if sec in cur and cv.active(cur[sec]):
                cv.terminate(cur[sec])
        cv.terminate(cur['_book'])
            

In [None]:
class Tlg2tf(Csv2tf):
    """Csv2tf converter that uses the 'greek' language settings by default."""

    def __init__(self, data, lang='greek', **kwargs):
        """Initialise like ``Csv2tf``, with ``lang`` defaulting to ``'greek'``.

        Parameters:
            data:   input lines, passed on unchanged.
            lang:   key of the language settings in ``kwargs``.
            kwargs: passed on to ``Csv2tf.__init__`` (including ``header``).
        """
        super().__init__(data, lang=lang, **kwargs)
        



In [None]:
class Xml2tf(Conversion):
    """Converter for XML input; the director has not been written yet."""

    def __init__(self, data, lang='generic', **kwargs):
        """Initialise the shared conversion settings (see ``Conversion.__init__``)."""
        super().__init__(data, lang, **kwargs)

    def director(self, cv):
        """Feed nodes and features to the TF walker ``cv`` -- not implemented yet.

        Raises:
            NotImplementedError: always, until the XML director is written.
        """
        raise NotImplementedError('Xml2tf.director has not been implemented yet')
        
        

In [None]:
def convert(input_path, output_path, lang='generic', typ=False, **kwargs):
    """Entry point of the conversion (unfinished stub).

    At present the function only extends its local ``kwargs`` with the
    head-word delimiters when ``typ == 'tlge'``; nothing is read from
    ``input_path`` or written to ``output_path`` and ``None`` is returned.
    """
    # For how to change the kwargs arguments: https://stackoverflow.com/questions/44784577/in-method-call-args-how-to-override-keyword-argument-of-unpacked-dict

    # TLG-E sources mark head words with curly brackets
    if typ == 'tlge':
        kwargs['head_signs'] = dict(start='{', stop='}')
        # Check for original or preprocessed tlg-E files

    # Manuscript sources need no extra settings (yet)
    elif typ == 'mss':
        pass
    
        