In [1]:
import re, glob
from copy import copy, deepcopy
from os import path
from pprint import pprint
from collections import OrderedDict
from ordered_set import OrderedSet
from lxml import etree

# Earlier regex-based cleaning rules, kept for reference only. They appear to be
# superseded by the tree-based cleaning in mss_clean(), which works from the
# DELETE / DELETE_full / CHANGE tag sets instead.
# fullSub  = re.compile(r'<(note|fw|num).+?</(note|fw|num)>')
# wordSub = re.compile(r'</*w>|<w [^>]*?>')
# elemSub  = re.compile(r'</*(div|pb|pc|cb|hi|supplied|unclear|seg|ex|space)[^/>]*?/*>')
# lbYesSub = re.compile(r'</*lb[^/>]*?/*>')
# lbNoSub  = re.compile(r'<lb [^>]*?break="no"[^/>]*?/*>')
# gabSub   = re.compile(r'<gap.*?/>')
# # partSub  = re.compile(r'<ab[^>]+?part[^>]*?>')
# Matches "&", a lazy run of one or more non-space characters, and ";".
# mss_clean() deletes every match from the serialised XML.
ampSub   = re.compile(r'&[^ ]+?;')
# Matches a whitespace run that contains at least two consecutive spaces.
# mss_clean() replaces every match by a single space.
spaceSub = re.compile(r'\s*  \s*')
# Matches an opening <ab ...> tag whose first attribute is n="..." and captures
# the word characters of that value (surrounding spaces are tolerated).
abRE = re.compile(r'<ab n= *" *(\w+?) *"[^>]*>')


# hands = set()
# Collectors for element tags with / without text. The only code that adds to
# them is the commented-out loop in mss_clean(), so they currently stay empty.
textElems = set()
nontextElems = set()

# XML parser
# parser = etree.XMLParser(recover=True, strip_cdata=True)
# Shared lxml parser handed to etree.parse() in mss_clean(). recover=True asks
# lxml to keep parsing through malformed markup instead of raising.
parser = etree.XMLParser(recover=True, huge_tree=True, compact=False, remove_pis=True, strip_cdata=True)

# XML namespaces, written as '{uri}' prefixes so they can be glued onto a local name
NS1 = '{http://www.tei-c.org/ns/1.0}'
NS2 = '{http://www.w3.org/XML/1998/namespace}'

# Tags that mss_clean() passes to etree.strip_tags(): the markup itself is
# dropped, the text and children of the element are kept.
DELETE = {NS1 + local_name for local_name in (
    'abbr',
    'cb',
    'div',
    'ex',
    'handshift',
    'hi',
    'num',
    'pb',
    'pc',
    'seg',
    'space',
    'supplied',
    'unclear',
    'w',
)}

# Tags that mss_clean() passes to etree.strip_elements(): the element is
# removed together with everything inside it.
DELETE_full = {NS1 + local_name for local_name in (
    'fw',
    'note',
)}

# Tags that mss_clean() rewrites instead of simply dropping: the misspelled
# 'rgd', line breaks (plain 'lb' plus a few fused 'lbP...' tag names) and gaps.
CHANGE = {NS1 + local_name for local_name in (
    'rgd',
    'lb',
    'lbP48vyC1L-P13',
    'lbP49vyC1L-P13',
    'lbP50vyC4L-P13',
    'lbP63vyC1L-P13',
    'lbP64vyC1L-P13',
    'gap',
)}

def _manuscript_filename(input_file):
    """Build the output base name from the input path (file stem plus institution prefix)."""
    file_path = path.splitext(input_file)[0].split('/')
    filename = file_path[-1]
    # The institution is the directory two levels above the file; a short
    # path simply has none (the original indexed [-3] and raised IndexError).
    institution = file_path[-3] if len(file_path) >= 3 else ''

    # Five-character names not ending in 'S' get their leading digit replaced
    # and the zero padding dropped (presumably manuscript-class sigla such as
    # P = papyrus, 0 = majuscule, L = lectionary -- TODO confirm).
    if len(filename) == 5 and not filename.endswith('S'):
        prefixes = {'1': 'P', '2': '0', '3': '', '4': 'L'}
        if filename[0] in prefixes:
            filename = prefixes[filename[0]] + filename[1:].lstrip('0')

    if institution == 'Birmingham':
        filename = 'B_' + filename
    elif institution == 'Muenster':
        filename = 'M_' + filename
    return filename


def _keep_reading_of_hand(app, wit):
    """Reduce the direct <rdg> children of `app` to one: the last by `wit`, else the last by 'firsthand'."""
    rdg_tag = f'{NS1}rdg'
    # As in the original XPath test, a reading anywhere below the app counts
    # when deciding whether this hand is involved at all.
    involved = any(rdg.get('hand') == wit for rdg in app.iter(rdg_tag))
    hand = wit if involved else 'firsthand'

    # Work on a snapshot: removing children while iterating the live child
    # list is fragile, and identity checks need stable element proxies.
    readings = list(app.iterchildren(tag=rdg_tag))
    own = [rdg for rdg in readings if rdg.get('hand') == hand]
    keep = own[-1] if own else None  # only the last intervention survives
    for rdg in readings:
        if rdg is not keep:
            app.remove(rdg)


def _insert_section_divs(xml_string):
    """Replace every <ab n="B..K..V.."> tag by <div> openers for the levels that start there."""
    previous = {}

    def openers(match):
        nonlocal previous
        parts = [part for part in re.split(r'[BKV]', match.group(1)) if part]
        current = OrderedDict(zip(('book', 'chapter', 'verse'), parts))
        divs = []
        changed = False
        for level, number in current.items():
            # Once a higher level changes, every lower level starts anew even
            # if its number happens to equal the previous one (e.g. chapter 1
            # of the next book, or verse 1 of the next chapter).
            if not changed and previous.get(level) != number:
                changed = True
            if changed:
                divs.append(f'<div type="{level}" n="{number}">')
        previous = current
        return ''.join(divs)

    return abRE.sub(openers, xml_string)


def _write_text(target, text):
    """Write `text` to `target` as UTF-8, closing the file even if writing fails."""
    with open(target, 'w', encoding='utf-8') as out:
        out.write(text)


def mss_clean(input_file, output_path, show_body_elements=False):
    """Clean one TEI transcription and write it to `output_path`.

    The body is stripped of the DELETE tags (content kept) and the DELETE_full
    elements (content dropped); line breaks become a space (or nothing for
    break="no"), gaps become the marker '###gap###' and the misspelled <rgd>
    is renamed to <rdg>.

    If the file lists no <witness> elements the cleaned tree is written as
    '<name>.xml' without further processing. Otherwise one file
    '<name>_<witness>.xml' is written per witness (hand); in each, every <app>
    keeps only the last reading of that hand (or of 'firsthand' when the hand
    is absent), <ab n="..."> tags become <div type=... n=...> openers, and
    entities, newlines, surplus spaces and remaining <ab>/</ab> tags are removed.
    NOTE(review): the <div> openers are never closed, so the per-witness output
    is not well-formed XML -- confirm that downstream tooling expects this.

    Parameters
    ----------
    input_file : str
        Path of the TEI XML file, using '/' separators.
    output_path : str
        Existing directory that receives the output file(s).
    show_body_elements : bool
        Currently unused; kept for backward compatibility.

    Raises
    ------
    ValueError
        If the parsed document has no TEI text/body element.
    """
    filename = _manuscript_filename(input_file)

    # Make XML tree
    tree = etree.parse(input_file, parser)
    root = tree.getroot()
    body = root.find(f'{NS1}text/{NS1}body') if root is not None else None
    if body is None:
        raise ValueError(f'{input_file}: no TEI text/body element found')

    # Retrieve witnesses
    witnesses = [wit.get(f'{NS2}id') for wit in root.findall(f'.//{NS1}witness')]

    # Birmingham transcriptions get different treatment: every word is followed
    # by a space. This must happen before ANY tag is stripped; otherwise text
    # that a stripped sibling (e.g. <pc>) merged into a word's tail would be
    # overwritten, depending on the arbitrary iteration order of the DELETE set.
    if f'{NS1}w' in DELETE and filename.startswith('B_'):
        for word in root.findall(f'.//{NS1}w'):
            word.tail = ' '

    # Clean tags (markup removed, content kept)
    if DELETE:
        etree.strip_tags(body, *DELETE)

    # Clean elements (removed with their content, tail text kept)
    if DELETE_full:
        etree.strip_elements(body, *DELETE_full, with_tail=False)

    # Correct misspelling
    if f'{NS1}rgd' in CHANGE:
        for misspelled in root.findall(f'.//{NS1}rgd'):
            misspelled.tag = f'{NS1}rdg'

    # Handle line breaks once, for the plain <lb> and for the fused 'lbP...'
    # tag names listed in CHANGE (the latter used to be left in the output).
    lb_tags = sorted(tag for tag in CHANGE if tag.startswith(f'{NS1}lb'))
    for lb_tag in lb_tags:
        for lb in root.findall(f'.//{lb_tag}'):
            # A breaking lb separates two words, so it leaves a space behind;
            # break="no" sits inside a word and must vanish without a trace.
            if lb.get('break') != 'no':
                lb.text = ' ' + (lb.text or '')
    if lb_tags:
        etree.strip_tags(body, *lb_tags)

    # Handle gaps
    if f'{NS1}gap' in CHANGE:
        for gap in root.findall(f'.//{NS1}gap'):
            gap.text = '###gap###'
        etree.strip_tags(body, f'{NS1}gap')

    if not witnesses:
        _write_text(f'{output_path}/{filename}.xml', etree.tostring(tree, encoding='unicode'))
        return

    for rdg in root.findall(f'.//{NS1}rdg'):
        # rdg's without hand, like: <rdg type="orig">
        if 'hand' not in rdg.attrib:
            rdg.set('hand', 'firsthand')
        # rdg's with empty hand, like: <rdg type="corr" hand="">
        elif rdg.get('hand') == '':
            rdg.set('hand', 'unknown')
            if 'unknown' not in witnesses:
                witnesses.append('unknown')

    for wit in witnesses:
        # Each witness works on its own deep copy of the cleaned tree
        wit_tree = deepcopy(tree)

        # Select in case of variation the correct witness (hand!)
        for app in wit_tree.getroot().findall(f'.//{NS1}app'):
            _keep_reading_of_hand(app, wit)

        # Write xml tree to string for final manipulations
        xml_string = etree.tostring(wit_tree, encoding='unicode')

        # Apply section substitution (state starts fresh for every witness)
        xml_string = _insert_section_divs(xml_string)

        # Final cleanup
        xml_string = ampSub.sub('', xml_string)          # Delete html markup
        xml_string = xml_string.replace('\n', '')
        xml_string = spaceSub.sub(' ', xml_string)       # Delete superfluous spaces
        xml_string = xml_string.replace('</ab>', '')     # Delete end-tags <ab>
        xml_string = xml_string.replace('<ab>', '')

        # '*' is not usable in a file name
        label = 'star' if wit == '*' else wit
        _write_text(f'{output_path}/{filename}_{label}.xml', xml_string)


In [2]:

# Input pattern and output directory of the corpus being converted.
# Other pairs used before:
#   ~/github/pthu/sources/manuscripts/mss_transcriptions/**/*.xml
#   ~/github/manuscripts/Birmingham/John/*.xml  ->  ~/github/manuscripts/Birmingham/John_preprocessed
#   ~/github/manuscripts/Muenster/test_in/*.xml ->  ~/github/manuscripts/Muenster/test_out
input_pattern = path.expanduser('~/github/manuscripts/Muenster/mss/*.xml')
output_dir = path.expanduser('~/github/manuscripts/Muenster/preprocessed')

file_list = glob.glob(input_pattern, recursive=True)

# Convert the files one after another, announcing each on its own line
for xml_file in file_list:
    print(f'Converting {xml_file}...\n')
    mss_clean(xml_file, output_dir, show_body_elements=True)

# Left-over inspection helpers:
# print(len(textElems))
# pprint(textElems)
# print(len(nontextElems))
# pprint(nontextElems)
# pprint(hands)
# mss_clean(path.expanduser(path.join('~/github/pthu/sources/manuscripts/mss_transcriptions/Muenster/majuscules/20011.xml')), path.expanduser(path.join('~/github/pthu/sources/manuscripts/test/out')))

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10107.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32680.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30201.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10055.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20037.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31546.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31534.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32831.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10070.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20131.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30378.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20083.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10128.xml...

Converting /

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32737.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20057.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30105.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/40023.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20020.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30117.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/41825.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32464.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10085.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20011.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32147.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31729.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31409.xml...

Converting /

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30348.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30069.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30665.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32865.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30006.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30423.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31831.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10063.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10074.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/40060.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30454.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32606.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30436.xml...

Converting /

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31574.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31837.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20017.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30517.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32188.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20003.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/40587.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32351.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10097.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10052.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30016.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10051.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31637.xml...

Converting /

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30915.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31243.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20167.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10122.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/40950.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/32726.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31006.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30330.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30687.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31933.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10121.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10072.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30498.xml...

Converting /

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10065.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10045.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30238.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30752.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20211.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10093.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10119.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30038.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/10078.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/20099.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/30876.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31082.xml...

Converting /home/ernstboogert/github/manuscripts/Muenster/mss/31838.xml...

Converting /

In [3]:
# Pretty-print both tag collectors, elements with text first
for collected in (textElems, nontextElems):
    pprint(collected)

set()
set()


In [4]:
import re
from collections import OrderedDict
from ordered_set import OrderedSet

# ab = '<ab n="B04K4V6">'

# Matches an opening <ab ...> tag whose first attribute is n="..." and captures its value
abRE = re.compile(r'<ab n= *" *(\w+?) *"[^>]*>')

test = 'εμπροσθεν εκεινου ·<ab n="B04K3V29"> ο εχων την <ab n="B04K4V1"> νυμφην ·'

refs = ['<ab n="B04K4V6">', '<ab n="B04K4V7">', '<ab n="B04K4V8">', '<ab n="B04K5V1">', '<ab n="B04K5V2">']
# Reference (book/chapter/verse) of the most recently converted <ab> tag
ref_dict = OrderedDict()

def ab_modifier(ab):
    """Turn one <ab n="B..K..V.."> tag into <div> openers for the levels that start there.

    The reference is compared with the previous one kept in the global
    `ref_dict`, which is updated on every call. From the first level that
    differs downwards, one '<div type="..." n="...">' is emitted per level;
    an unchanged reference yields the empty string.
    """
    global ref_dict
    ab_split = [part for part in re.split(r'[BKV]', re.sub(abRE, r'\g<1>', ab)) if part]
    ab_dict = OrderedDict(zip(('book', 'chapter', 'verse'), ab_split))
    res = ''
    changed = False
    for level, number in ab_dict.items():
        # Once a higher level changes, every lower level starts anew even if
        # its number equals the previous one (e.g. chapter 1 of the next book).
        # Comparing the levels independently used to drop those openers.
        if not changed and ref_dict.get(level) != number:
            changed = True
        if changed:
            res += f'<div type="{level}" n="{number}">'
    ref_dict = ab_dict
    return res
    
# for ab in refs:
#     print(ab_modifier(ab))
#     print(ref_dict)
    

print(re.sub(abRE, lambda m: ab_modifier(m.group(0)), test))


# res = list(filter(None, re.split(r'[BKV]', re.sub(abRE, r'\g<1>', ab))))
# print(res)


εμπροσθεν εκεινου ·<div type="book" n="04"><div type="chapter" n="3"><div type="verse" n="29"> ο εχων την <div type="chapter" n="4"><div type="verse" n="1"> νυμφην ·


In [5]:
import re

n = 'B27K1V2'

# ab_split = list(filter(None, re.split(r'[BKV]', re.sub(abRE, r'\g<1>', ab))))
# ab_dict = OrderedDict(zip(('book', 'chapter', 'verse'), ab_split))

# Split the reference at its section letters; the leading 'B' yields an empty
# first item, which is why the real code filters out empty strings.
section_letters = re.compile(r'[BKV]')
section_letters.split(n)

['', '27', '1', '2']

In [6]:
# Slicing an empty list with [:-1] gives an empty list, so nothing is printed
piet = []
for entry in piet[:-1]:
    print(entry)

In [7]:
# Compare two one-character string literals and report when they are equal.
# NOTE(review): look-alike characters can be different code points, so keep
# both literals exactly as typed.
if '¯' == '¯':
    print('yes')

yes
