# Prepare necessary metadata files from TLG library

In [None]:
import os, re, string
from pprint import pprint
from itertools import takewhile

# Directory that contains data/refs_1.txt and data/refs_2.txt.
REF = os.path.expanduser('~/github/pthu/tfbuilder/tlg_to_tf')

# Both files stay open until they are closed explicitly in the parsing cell;
# f1 is fed to workMetadata() and f2 to refs() there.
f1 = open(f'{REF}/data/refs_1.txt', encoding='utf-8')
f2 = open(f'{REF}/data/refs_2.txt', encoding='utf-8')

In [None]:
def splitLine(line):
    """Split one reference-file line into a ``(code, content)`` pair.

    A line that starts with four spaces is a continuation line: it has no
    code of its own, so ``False`` is returned in place of the code together
    with the stripped text.

    Any other line is stripped and split at its first space into the code
    and the remaining content (both stripped).  A line that holds a single
    token and no space yields ``(token, '')``.
    """
    if line.startswith('    '):
        return (False, line.strip())
    # partition() keeps the whole token as the code when there is no space;
    # slicing at the -1 returned by find() would cut off the last character.
    code, _, content = line.strip().partition(' ')
    return (code.strip(), content.strip())

def authorMetadata(file):
    """Collect the author-level fields of every author record in *file*.

    A stripped line ``key <author>`` (two tokens) opens an author record,
    a stripped line ``key <author> <work>`` (three tokens) closes it.  Every
    line in between is split with splitLine() and stored as a field of the
    open author.

    Returns a dict that maps the author token to a dict of its fields.

    NOTE(review): lines are stripped before they reach splitLine(), so its
    four-space continuation check cannot trigger here.
    NOTE(review): while an author record is open, a two-token ``key`` line
    is stored as a field instead of opening the next author.
    """
    authors = {}
    current = None
    in_author = False
    for raw in file.readlines():
        text = raw.strip()
        tokens = text.split() if text.startswith('key') else []
        if len(tokens) == 3:
            # A work header ends the author-level section.
            in_author = False
            continue
        if in_author:
            code, content = splitLine(text)
            authors[current][code] = content
            continue
        if len(tokens) == 2:
            in_author = True
            current = tokens[1]
            authors[current] = {}
    return authors


def workMetadata(file):
    """Collect a short name and the full description of every work in *file*.

    Each line is stripped and classified as follows:

    * exactly four digits: closes the current work; the lines that follow
      are ignored until the next work header;
    * a work header, i.e. at least four digits, one space and at least three
      word characters (such as ``0001 001``): opens a new work;
    * anything else while a work is open: part of that work's description.

    The first description line also provides the short name: the text in
    front of its first comma, or the whole line when it has no comma.

    Returns a dict that maps ``tuple(header.split(' '))`` to a
    ``(name, description)`` pair, where the description is all description
    lines joined by single spaces.  A work without description lines maps to
    ``('', '')``.
    """
    # Same language as the former '^([0-9]+){4} (\w+){3}$' under fullmatch(),
    # without the nested quantifiers.
    workRE = re.compile(r'[0-9]{4,} \w{3,}')
    work_dict = {}
    work = None
    name = ''
    workName = ''
    nameFound = False
    workFound = False

    for raw in file:
        line = raw.strip()
        if len(line) == 4 and line.isdigit():
            workFound = False
        elif workRE.fullmatch(line):
            if work is not None:
                work_dict[work] = (name, workName)
            work = tuple(line.split(' '))
            # Reset per-work state so nothing leaks in from the previous work.
            name = ''
            workName = ''
            nameFound = False
            workFound = True
        elif workFound:
            if not nameFound:
                name = line.partition(',')[0]
                nameFound = True
                workName = line
            else:
                workName += ' ' + line

    # The last work has no following header to trigger its storage.
    if work is not None:
        work_dict[work] = (name, workName)
    return work_dict


def bibliography(file):
    """Collect the reference text that follows every work header in *file*.

    Ignoring spaces, a line of seven digits is a work header and a line of
    four digits ends the current work.  The non-blank lines after a work
    header are joined with single spaces and terminated with a full stop.

    Returns a dict that maps the work header, with its whitespace-separated
    parts joined by ``-`` (e.g. ``'0001-001'``), to that reference string.
    Every work is stored: when the next work header arrives, when a
    four-digit line arrives, and at the end of the file.
    """
    bibl_dict = {}
    work_key = None
    parts = []

    for raw in file:
        line = raw.strip()
        digits = line.replace(' ', '')
        if digits.isdigit() and len(digits) in (4, 7):
            # Any header closes the work that was being collected.
            if work_key is not None:
                bibl_dict[work_key] = ' '.join(parts) + '.'
            work_key = '-'.join(line.split()) if len(digits) == 7 else None
            parts = []
        elif work_key is not None and line:
            parts.append(line)

    if work_key is not None:
        bibl_dict[work_key] = ' '.join(parts) + '.'
    return bibl_dict


def refs(file):
    """Build one metadata dict per work from the reference file *file*.

    Blank lines are skipped.  The remaining lines are handled as follows:

    * ``key <author>`` (two tokens) starts a new author section;
    * ``key <author> <work>`` (three tokens) starts a new work, keyed by
      ``(author, work)`` and seeded with a copy of the current author
      fields;
    * every line inside a section, the ``key`` line included, is split with
      splitLine(); a line with a code sets that field, a continuation line
      is appended, after a space, to the field set last.

    When a work section ends, its fields are merged into the work's entry,
    so a work-level field overrides an author-level field with the same
    code.

    Returns the dict of all work entries; it is empty when the file
    contains no work section.
    """
    refs_dict = {}
    author_dict = {}
    work_dict = {}
    authorFound = False
    workFound = False
    CUR = None

    for line in file:
        sline = line.strip()
        if sline == '':
            continue

        n_tokens = len(sline.split()) if sline.startswith('key') else 0
        if n_tokens == 2:
            if CUR is not None:
                refs_dict[CUR].update(work_dict)
            author_dict = {}
            work_dict = {}
            authorFound = True
            workFound = False
        elif n_tokens == 3:
            if CUR is not None:
                refs_dict[CUR].update(work_dict)
            work_dict = {}
            _, author, work = sline.split()
            CUR = (author, work)
            refs_dict[CUR] = author_dict.copy()
            authorFound = False
            workFound = True

        # The unstripped line is passed on purpose: splitLine() recognises
        # continuation lines by their leading spaces.
        if authorFound:
            cod1, cont1 = splitLine(line)
            if cod1:
                code1, content1 = cod1, cont1
            else:
                content1 = content1 + ' ' + cont1
            author_dict[code1] = content1
        elif workFound:
            cod2, cont2 = splitLine(line)
            if cod2:
                code2, content2 = cod2, cont2
            else:
                content2 = content2 + ' ' + cont2
            work_dict[code2] = content2

    # Merge the fields of the last work; there is none in a file without
    # work sections.
    if CUR is not None:
        refs_dict[CUR].update(work_dict)

    return refs_dict
        

# Parse the reference files.  Every parser reads its file object to the end,
# so rewind it (e.g. f2.seek(0)) before enabling a second parser on the same
# file.
try:
    # author_result = authorMetadata(f2)
    works_result = workMetadata(f1)
    # bibl_result = bibliography(f2)
    refs_result = refs(f2)
finally:
    # Close both files even when one of the parsers raises.
    f1.close()
    f2.close()

# pprint(author_result)
# pprint(works_result)
# pprint(bibl_result)
# pprint(refs_result)

In [None]:
# Collect every work whose metadata has no citation system ('cit').
s = {i for i in refs_result if 'cit' not in refs_result[i]}
pprint(s)

# Works starting with 'X' don't have a citation system, so only report the
# other works that lack one.  (Testing "i not in s" here could never print
# anything, because s already holds every work without 'cit'.)
for i in refs_result:
    if i in s and not i[1].startswith('X'):
        print(i)

# Check whether every entry has a title ('tit' | 'wrk')
p = {i for i in refs_result
     if 'tit' not in refs_result[i] and 'wrk' not in refs_result[i]}
print(p)

In [None]:
# Derive the citation scheme: for every work that does not start with 'X',
# the lower-cased, non-empty levels of its 'cit' field.
refs_list = []

for key, meta in refs_result.items():
    if key[1].startswith('X'):
        continue
    levels = [level for level in meta['cit'].lower().split('/') if level]
    refs_list.append(levels)

pprint(refs_list)

In [None]:

def tlge2csv(file_path, out_path, ref_dict, metadata_dict):
    """Convert tlgu text output into tab-separated files, one per work.

    Parameters:
        file_path: a ``.txt`` file or a directory of them ('~' is expanded).
            Only regular files with the extension ``.txt`` are converted, so
            sub-directories (such as an output directory inside the input
            directory) are ignored.
        out_path: directory for the ``<name>.csv`` files ('~' is expanded);
            it is created when missing.
        ref_dict: maps ``(author number, work number)`` to the work's
            metadata.  Its ``'cit'`` value, split on ``/``, gives the column
            names of the citation levels; a missing ``'cit'`` is treated as
            an empty scheme.
        metadata_dict: accepted for interface compatibility; not used.

    The author and work number are taken from the file name as ``name[3:7]``
    and ``name[8:]`` (e.g. ``tlg0555-002``).  Every non-blank input line is
    expected to be ``<reference>\\t<text>``.  The reference is split on
    ``.``, each part is cleaned, and the result is aligned to the right of
    the citation scheme: surplus leading levels are dropped, missing leading
    levels are written as empty columns.

    Raises KeyError when a file's number is not in *ref_dict*.
    """
    fpath = os.path.expanduser(file_path)
    file_list = []
    if os.path.isdir(fpath):
        with os.scandir(fpath) as it:
            file_list = [entry.path for entry in it if entry.is_file()]
    elif os.path.isfile(fpath):
        file_list.append(fpath)
    else:
        print('It looks like something is wrong with the file_path')

    out_path = os.path.expanduser(out_path)
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    def cleanRef(ref):
        """Reduce one reference level to its leading number or letter part."""
        if ref.isdigit() or ref.isalpha():
            return ref
        # Drop everything from the first underscore onwards.
        ref = ref.partition('_')[0]
        if set(string.ascii_letters) & set(ref):
            # Keep only the leading run: letters up to the first digit, or
            # the other way round.
            if ref[0].isalpha():
                ref = ''.join(takewhile(lambda c: not c.isdigit(), ref))
            else:
                ref = ''.join(takewhile(lambda c: not c.isalpha(), ref))
        ref_out = ''.join(c for c in ref if c.isalnum() or c == '-')
        if not ref_out.isalpha():
            ref_out = ref_out.rstrip(string.ascii_letters)
        return ref_out

    for file in file_list:
        print(file)
        # Check the extension before opening anything.
        n, ext = os.path.splitext(file)
        if ext != '.txt':
            continue
        name = os.path.basename(n)
        tlgNum = (name[3:7], name[8:])
        # tuple(citlevel1, citlevel2, citlevel3, etc.)
        citScheme = tuple(filter(None, ref_dict[tlgNum].get('cit', '').split('/')))

        csv_path = os.path.join(out_path, f'{name}.csv')
        with open(file, encoding='utf-8') as source, \
                open(csv_path, 'w', encoding='utf-8') as csv_output:
            csv_output.write('\t'.join(citScheme) + '\t' + 'text' + '\n')
            remainder = ''

            for raw in source:
                line = raw.strip()
                if line == '':
                    continue
                line += remainder
                remainder = ''
                # tlgu sometimes breaks a line unexpectedly, which leaves a
                # fragment without a tab.  Hold it back and combine it with
                # the next line instead of writing a row for it.
                # NOTE(review): the fragment is appended to the END of the
                # next line, as before; confirm against real tlgu output
                # whether it should be prepended or belongs to the previous
                # row.  A fragment on the last line is dropped.
                ref, tab, text = line.partition('\t')
                if not tab:
                    remainder = line
                    continue

                clean_ref = tuple(cleanRef(r) for r in ref.split('.') if r)

                # Align the reference to the right of the citation scheme,
                # as seems to be most appropriate.
                missing = len(citScheme) - len(clean_ref)
                if missing > 0:
                    clean_ref = ('',) * missing + clean_ref
                elif missing < 0:
                    clean_ref = clean_ref[-missing:]

                split_ref = '\t'.join(clean_ref)
                csv_output.write(split_ref + '\t' + text.strip().replace('\t', ' ') + '\n')
            
            
# Run the conversion on the whole tlgu output directory.
# NOTE: the output directory (out/csv/) lies inside the input directory (out/).
tlge2csv('~/github/tlgu-1/out/', '~/github/tlgu-1/out/csv/', refs_result, works_result)
# Single-file runs:
# tlge2csv('~/github/tlgu-1/out/tlg0555-002.txt', '~/github/tlgu-1/out/csv/', refs_result, works_result)
# tlge2csv('~/github/tlgu-1/out/tlg0533-017.txt', '~/github/tlgu-1/out/csv/', refs_result, works_result)



In [None]:
# Scratch check: keep only the part of a reference before its underscore.
sample = '120_3'
index = sample.find('_')
ref = sample[:index]
print(ref)

In [None]:
# Scratch check: a reference with more levels than the citation scheme keeps
# only its rightmost levels.
citScheme = ('chapter', 'section', 'line')
clean_ref = ('1', 'B', '2', '18', '2')

start = len(clean_ref) - len(citScheme)
if start:
    clean_ref = clean_ref[start:]
    print(clean_ref)

In [None]:
import string

# Scratch check: next() yields only the FIRST character that passes the
# filter, so the result is a single character.
ref = 'p1'
has_letters = any(ch in string.ascii_letters for ch in ref)
if has_letters:
    rejected = str.isdigit if ref[0].isalpha() else str.isalpha
    ref = ''.join(next(ch for ch in ref if not rejected(ch)))
print(ref)

In [None]:
import string
from itertools import takewhile

def cleanRef(ref):
    """Reduce one reference level to its leading number or letter part.

    Purely numeric and purely alphabetic references are returned unchanged.
    Otherwise everything from the first underscore onwards is dropped; if
    ASCII letters are present, only the leading run is kept (letters up to
    the first digit, or non-letters up to the first letter); characters
    other than alphanumerics and '-' are removed; and trailing ASCII letters
    are stripped unless the result is purely alphabetic.
    """
    if ref.isdigit() or ref.isalpha():
        return ref

    head = ref.partition('_')[0]
    if any(ch in string.ascii_letters for ch in head):
        stop = str.isdigit if head[0].isalpha() else str.isalpha
        head = ''.join(takewhile(lambda ch: not stop(ch), head))

    cleaned = ''.join(ch for ch in head if ch.isalnum() or ch == '-')
    if cleaned.isalpha():
        return cleaned
    return cleaned.rstrip(string.ascii_letters)

# Manual check of cleanRef(); the first sample is overwritten right away, so
# only '1*' is actually cleaned and printed.
r = 'asdfp**10**(?)**'
r = '1*'
print(cleanRef(r))

