# tfbuilder tutorial
The whole machinery of tfbuilder can be used by importing the convert function from the tfbuilder library.

In [1]:
from tfbuilder import convert

### arguments `convert()`
    input_path      = folder path in which the files to be converted reside
    output_path     = folder path to which all tf-modules are to be written
    tlg_out         = `True` if one wants TLG codes as folder names; `False` if folder names are to be taken from the metadata
    ignore_empty    = `True` if source files that don't produce slot numbers need to be ignored
    generic         = Generic metadata to be present in every tf-file to be produced
    lang            = language (referring to languages available in `langsettings`)
    typ             = subtype of a language, if special behaviour is required, like `tlge` (tlg-e cdrom)
    header          = if True, the convertor expects csv-files to have a header
    version         = version number to be assigned to the tf-module
    langsettings    = langsettings to be imported; usually, this is the langsettings provided by tfbuilder
    multiprocessing = False --> no multiprocessing
                    = True  --> active multiprocessing; automatic assignment of number of processor threads
                    = int   --> manual assignment of number of processor threads
    chunksize       = number of files to be assigned to each thread each cycle
    inspect         = return useful information about tags and attributes of XML input to inspect the source
    silent          = if True, all TF-messages are suppressed
    
    
#### remarks `generic` and `langsettings`:
Both are accessible and changeable in `tf_config.py`. However, you can also pass your own settings (each as a dictionary) to the `convert` function, as shown further below.

In [2]:
# Convert the English XML sample. No `generic`/`langsettings` are passed here,
# so the defaults of convert() apply.
convert(
    '~/github/pthu/lorenz2020/tfbuilder/test/english',      # input_path
    '~/github/pthu/lorenz2020/tfbuilder/test/english/out',  # output_path
    lang='generic',
    typ=False,
    header=False,
    tlg_out=False,
    ignore_empty=False,
    multiprocessing=False,  # no multiprocessing
    chunksize=1,
    silent=False,           # keep the TF messages visible
)

  0.00s parsing /home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/tlg0031.tlg004.perseus-eng2.xml
This is Text-Fabric 7.10.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

0 features found and 0 ignored
  0.00s Warp feature "otype" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/out/Rainbow Missions, Inc/World English Bible - John, Machine readable text/Rainbow Missions, Inc/1/tf/1.0/
  0.00s Warp feature "oslots" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/out/Rainbow Missions, Inc/World English Bible - John, Machine readable text/Rainbow Missions, Inc/1/tf/1.0/
  0.01s Warp feature "otext" not found. Working without Text-API

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |   SECTION   TYPES:    chapter, verse
   |   SECTION   FEATURES: chapter, verse
   |   STRUCTURE TYPES:    _book, chapter, verse, _sentence, _phrase
   |   STRUC

## Convert csv file


### Preparation Homer (James Tauber)

In [3]:
import os
from pprint import pprint

PATH = os.path.expanduser('~/github/pthu/lorenz2020/tfbuilder/test')

# Input:  plain-text file in which every line looks like "<ref1>.<ref2> <text>".
# Output: tab-separated file with the columns <ref1>, <ref2>, <text>
#         (the file that is fed to convert() afterwards).
txt_path = os.path.join(PATH, 'greek_csv', 'tlg0012-001.txt')
csv_path = os.path.join(PATH, 'greek_csv', 'tlg0012-001.csv')

# Both files are managed by `with`, so they are closed even if a line fails to
# parse. The encoding is explicit because the text is polytonic Greek and the
# platform default encoding is not guaranteed to be UTF-8.
with open(txt_path, 'r', encoding='utf-8') as james, \
        open(csv_path, 'w', encoding='utf-8') as csvfile:
    for line in james:
        if not line.strip():
            # Blank lines carry no reference and would break the unpacking below.
            continue
        ref, text = line.split(' ', 1)   # reference, then the rest of the line
        ref1, ref2 = ref.split('.')      # the two parts of the reference
        csvfile.write(f'{ref1}\t{ref2}\t{text}')  # `text` keeps its own newline

# Show the first ten converted rows as a sanity check.
with open(csv_path, 'r', encoding='utf-8') as ernst:
    pprint(ernst.readlines()[:10])

['1\t1\tμῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος\n',
 '1\t2\tοὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,\n',
 '1\t3\tπολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν\n',
 '1\t4\tἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν\n',
 '1\t5\tοἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,\n',
 '1\t6\tἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε\n',
 '1\t7\tἈτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.\n',
 '1\t8\tτίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;\n',
 '1\t9\tΛητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς\n',
 '1\t10\tνοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,\n']


In [4]:
# Convert the tab-separated file prepared above. With header=False the convertor
# does not expect a header row in the csv-file.
convert(
    '~/github/pthu/lorenz2020/tfbuilder/test/greek_csv',      # input_path
    '~/github/pthu/lorenz2020/tfbuilder/test/greek_csv/out',  # output_path
    lang='greek',
    typ='tlge',
    header=False,
    tlg_out=False,
    ignore_empty=False,
    multiprocessing=False,  # no multiprocessing
    chunksize=1,
    silent=False,           # keep the TF messages visible
)

  0.00s parsing /home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/greek_csv/tlg0012-001.csv
This is Text-Fabric 7.10.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

0 features found and 0 ignored
  0.00s Warp feature "otype" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/greek_csv/out/Homerus/Ilias/Allen, T.W./1/tf/1.0/
  0.00s Warp feature "oslots" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/greek_csv/out/Homerus/Ilias/Allen, T.W./1/tf/1.0/
  0.00s Warp feature "otext" not found. Working without Text-API

No header data could be found; please enter an appropriate header: book line text
  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |   SECTION   TYPES:    book, line
   |   SECTION   FEATURES: book, line
   |   STRUCTURE TYPES:    _book, book, line, _sentence, _phrase
   |   STRUCTURE FEATURES: _book, book, line, _sentence, _phrase
   |   TEXT      FEATUR

## `generic` in `tf_config`


In [5]:
# Custom generic metadata, to be handed to convert() through its `generic` argument.
generic_metadata_modified = dict(
    convertor_execution='Donald Duck',             # Please replace this by your own name!
    convertor_author='Willie Wortel',              # Idem!
    convertor_date='February, 1816',               # Replace by appropriate date
    convertor_institution='The Duck University',   # Replace by your own institution/company
    famous_uncle='Dagobert Duck',                  # example of an extra, self-chosen entry

    # DO NOT CHANGE!
    convertor_version='1.0.0',                     # NO CHANGE!
)

## `langsettings` in `tf_config`

In [6]:
from helpertools import langtools
from collections import OrderedDict

# Custom language settings, to be handed to convert() through its `langsettings`
# argument. The top-level key is the language name that is selected with `lang=`.
langsettings_modified = {
    'english': {
        # OUTPUT DIR STRUCTURE
        # Output dir struct; NB these variable names need to be defined in the metadata!
        # Multiple items in a list define multiple options that will be checked from left to right.
        # Output author/title/editor (or one of the other options if they are not provided).
        'dir_struct': [['author', 'editor'],
                       ['title', 'book', 'work'],
                       ['editor']],

        # TF variables!
        'slot_type': 'word',
        'intFeatures': set(),
        'nonIntFeatures': {'otype', 'oslots', 'otext'},
        'struct_counter': OrderedDict([('_sentence', 1), ('_phrase', 1)]),
        # Human-readable descriptions of the counted structure types.
        # `{{` / `}}` are literal braces; the inner `{...}` interpolates the tuple of delimiters.
        'struct_counter_metadata': {
            '_sentence': f"sentences defined by the following delimiters: {{{'.', '?', '!',}}}",
            '_phrase': f"phrases defined by the following delimiters: {{{',', ';', ':',}}}",
        },
        'generic': {},  # = Metadata used by TF

        # LANGUAGE VARIABLES
        # Unicode norm
        'udnorm': 'NFD',
        # Package of langtools
        'langtool': langtools.Generic,
        'replace_func': langtools.Generic.replace,
        # Tokenizer
        'tokenizer': langtools.Generic.splitTokenize,
        'tokenizer_args': {'punc': True,
                           'clean': False,
                           'splitters': None,
                           'non_splitters': ("-", "'"),},
        'token_out': OrderedDict([('pre', {'text': False, 'description': 'interpunction before word'}),
                                  ('orig', {'text': True, 'description': 'the original format of the word without interpunction'}),
                                  ('post', {'text': False, 'description': 'interpunction after word'}),
                                  ]),
        # Lemmatizer
        'lemmatizer': None,
        # Text formats
        'text_formats': {'orig': {'otext_name': 'fmt:text-orig-full',
                                  'format': '{pre}{orig}{post}',
                                  'function': langtools.Generic.origWord,
                                  'description': 'original format of the word including punctuation'},
                         'main': {'otext_name': 'fmt:text-orig-main',
                                  'format': '{main} ',
                                  'function': langtools.Generic.mainWord,
                                  'description': 'normalized format of the word excluding punctuation'},
                         'plain': {'otext_name': 'fmt:text-orig-plain',
                                   'format': '{plain} ',
                                   'function': langtools.Generic.plainWord,
                                   'description': 'plain format in lowercase'},
                         },

        # XML VARIABLES
        # Define the fields from xml metadata that need to be preserved
        # concat = True means that subfields are concatenated
        # concat = False means that subfields get their own metadata entry
        'xmlmetadata': {'titleStmt': {'concat': False, 'delimit': ', ', 'end': ''},
                        'publicationStmt': {'concat': True, 'delimit': ', ', 'end': '.'},
                        'sourceDesc': {'concat': True, 'delimit': ', ', 'end': '.'},
                        'license': {'concat': True, 'delimit': ', ', 'end': '.'},
                        'availability': {'concat': True, 'delimit': ', ', 'end': '.'},
                        },

        # Define the tag in which sectioning can be found
        'section_tags': {'div',},
        # Define in which key the section name can be found
        'section_keys': {'subtype'},
        # Define attribute keys that are superfluous and need to be ignored
        'ignore_attrib_keys': set(),
        # Define attribute keys that do not contain a section name
        'non_section_keys': set(),
        # Define values that are no sections, although they are in the right key
        'non_section_values': set(),
        ## Define attribute values that are superfluous and need to be ignored
        ## 'ignore_attrib_values': set(),
        # Define tags that contain text elements that need not to be processed as regular text but as features
        'non_text_tags': set(),
        # Define attributes that have values that are feature names (values will be calculated automatically)
        'feature_attribs': {'corresp', 'source'},
        # Define sentence delimiters to be counted by struct_counter
        'sentence_delimit': {'.', '?', '!',},
        # Define phrase delimiters to be counted by struct_counter
        'phrase_delimit': {',', ';', ':',},
    },
}
    

In [7]:
# Same English source as before, but now with the custom metadata and the custom
# language settings defined above; lang='english' selects the 'english' entry
# of `langsettings_modified`.
convert(
    '~/github/pthu/lorenz2020/tfbuilder/test/english',      # input_path
    '~/github/pthu/lorenz2020/tfbuilder/test/english/out',  # output_path
    lang='english',
    typ=False,
    langsettings=langsettings_modified,
    generic=generic_metadata_modified,
    header=False,
    tlg_out=False,
    ignore_empty=False,
    multiprocessing=False,  # no multiprocessing
    chunksize=1,
    silent=False,           # keep the TF messages visible
)

  0.00s parsing /home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/tlg0031.tlg004.perseus-eng2.xml
This is Text-Fabric 7.10.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

0 features found and 0 ignored
  0.00s Warp feature "otype" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/out/Rainbow Missions, Inc/World English Bible - John, Machine readable text/Rainbow Missions, Inc/2/tf/1.0/
  0.00s Warp feature "oslots" not found in
/home/ernstboogert/github/pthu/lorenz2020/tfbuilder/test/english/out/Rainbow Missions, Inc/World English Bible - John, Machine readable text/Rainbow Missions, Inc/2/tf/1.0/
  0.00s Warp feature "otext" not found. Working without Text-API

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |   SECTION   TYPES:    chapter, verse
   |   SECTION   FEATURES: chapter, verse
   |   STRUCTURE TYPES:    _book, chapter, verse, _sentence, _phrase
   |   STRUC