In [225]:
import os

def split(lines, page_break="\x0c"):
    """Group text lines into pages, starting a new page at each page break.

    A line that begins with ``page_break`` opens a new page and is kept as the
    first line of that page. The marker is written as an explicit escape
    (form feed, ``\\x0c``) because an empty-string marker matches every line
    and would put each line on its own page.

    Args:
        lines: iterable of text lines (as returned by ``readlines()``).
        page_break: prefix that marks the first line of a new page.

    Returns:
        A list of pages, each a list of lines. If the very first line starts
        with ``page_break`` the result begins with an empty page, so page
        indices stay aligned with the number of page breaks seen.
    """
    pages = []
    current = []
    for line in lines:
        if line.startswith(page_break):
            # Close the page collected so far and start the next one.
            pages.append(current)
            current = [line]
        else:
            current.append(line)
    # Flush the trailing page, unless nothing was collected for it.
    if current:
        pages.append(current)
    return pages
            


In [226]:
# Read the whole volume into memory and cut it into pages. The context
# manager closes the input file as soon as the lines have been read.
with open("Volume-IX-Tome-IX.txt","r",encoding="utf-8") as ipf:
    lines = ipf.readlines()
pages = split(lines)

In [227]:
# Write every page to its own file, named after the page's index in ``pages``.
# NOTE(review): the output directory is named for volume I while the input
# read above is volume IX — confirm this is the intended target.
opfdir = "vol_I - kopie"
for i,page in enumerate(pages):
    # ``with`` closes the file even if a write fails part-way through.
    with open(os.path.join(opfdir,str(i)+".txt"),"w",encoding="utf-8") as opf:
        opf.writelines(page)

In [1]:
import re
def is_roman_numeral(s):
    """Return True when every character of ``s`` is one of I, X, L or V.

    Only these four upper-case letters are accepted. An empty string has no
    offending character and therefore also yields True.
    """
    return all(ch in ("I", "X", "L", "V") for ch in s)

def find_lemmata(headers):
    """Extract the lemma names and the page numbers from a page's header lines.

    The first header line is split on runs of four or more whitespace
    characters. When exactly two such runs are found, the text before the
    first run is taken as the first page number, the text between the runs as
    the lemma name(s), and the text after the second run as the second page
    number. Any further header lines are treated as additional names.

    Reads the module-level ``vol_nr`` and writes problems to the module-level
    ``opf_error`` log.

    Args:
        headers: list of header lines; ``headers[0]`` is the primary header.

    Returns:
        A tuple ``(all_names, page_begin, page_end)``. ``all_names`` is a list
        of name strings; the page values are strings, set to ``"OCR_ERROR"``
        when neither of them is numeric. Returns ``None`` (after logging) when
        the primary header does not contain exactly two wide whitespace runs.
    """
    primary_header = headers[0]
    # NOTE(review): the argument of this strip() shows up as an empty string
    # here, in which case the call removes nothing; presumably it held a
    # non-printing character (such as a form feed) — verify in the raw file.
    primary_header = primary_header.strip("")
    primary_header = primary_header.strip(" ")
    if vol_nr == "IX":
        # NOTE(review): the search string also shows up as empty here. With a
        # truly empty search string, replace() inserts the padding between
        # every character; presumably a non-printing separator is meant —
        # verify in the raw file.
        primary_header = primary_header.replace("","                      ")
    # A run of four or more whitespace characters separates the header fields.
    query = r"\s\s\s\s+"
    ms = re.finditer(query,primary_header)
    ms = list(ms)
    if len(ms) == 2:
        names = primary_header[ms[0].end(0):ms[1].start(0)].strip()
        page_begin = primary_header[:ms[0].start(0)].strip()
        page_end = primary_header[ms[1].end(0):].strip()
    else:
        opf_error.write("\tError in header extraction\n")
        opf_error.write("\t"+primary_header+"\n")
        return None

    primary_header = primary_header.strip()
    # NOTE(review): ``parts`` is not used anywhere else in this function.
    parts = [part for part in primary_header.split(" ") if len(part)>0]
    # Repair a non-numeric page number from the other one: the second page is
    # taken to be the first plus one, and the first to be the second minus one.
    if not page_begin.isnumeric():
        page_begin = "OCR_ERROR"
    if not page_end.isnumeric():
        if not page_begin == "OCR_ERROR":
            page_end = str(int(page_begin) + 1)
        else:
            page_end = "OCR_ERROR"
    if page_begin == "OCR_ERROR":
        if not page_end == "OCR_ERROR":
            page_begin = str(int(page_end)-1)


    # Normalise dash variants in the names to a plain hyphen, which is then
    # used as the separator between names.
    # NOTE(review): the two middle replacements look like no-ops as displayed;
    # presumably their search strings are look-alike hyphen characters —
    # verify in the raw file.
    if "—" in names:
        names = names.replace("—","-")
    if "-" in names:
        names = names.replace("-","-")
    if "-" in names:
        names = names.replace("-","-")
    if "—" in names:
        names = names.replace("—","-")
    name_sep = "-"
        
    # Fragments of one character or less are dropped; for volume IX the names
    # are additionally upper-cased.
    if vol_nr == "IX":
        all_names = [name.strip().upper() for name in names.split(name_sep) if len(name.strip())>1]
    else:
        all_names = [name.strip() for name in names.split(name_sep) if len(name.strip())>1]
    
    # Extra header lines contribute more names; purely numeric fragments on
    # those lines are ignored.
    if len(headers)>1:
        for header in headers[1:]:
            extra_names = header.strip().strip(" ")
            if vol_nr == "IX":
                all_names.extend([name.strip().upper() for name in extra_names.split(name_sep) if (len(name.strip())>1 and not name.strip().isnumeric())])
            else:
                all_names.extend([name.strip() for name in extra_names.split(name_sep) if (len(name.strip())>1 and not name.strip().isnumeric())])
    return all_names,page_begin,page_end

def find_column_separation(page):
    """Collect the whitespace gaps of every line of a page.

    A gap is a run of two or more whitespace characters.

    Args:
        page: list of text lines.

    Returns:
        A tuple ``(ends_dic, lines_dic)``. ``ends_dic`` maps the end offset of
        a gap to the number of gaps on the page ending at that offset;
        ``lines_dic`` maps each line index to the list of match objects for
        the gaps found on that line.
    """
    gap_pattern = re.compile(r"\s\s+")
    ends_dic = {}
    lines_dic = {}
    for line_no, text in enumerate(page):
        gaps = list(gap_pattern.finditer(text))
        lines_dic[line_no] = gaps
        for gap in gaps:
            stop = gap.end(0)
            ends_dic[stop] = ends_dic.get(stop, 0) + 1
    return ends_dic, lines_dic
    
def _find_column_split(gaps, boundaries, boundary_set):
    """Return the offset at which a line is cut into two columns, or None.

    ``gaps`` are the whitespace-gap match objects of one line. Three passes
    are tried in order, and the first gap that qualifies wins.
    """
    # Pass 1: a gap ends exactly on a selected boundary.
    for gap in gaps:
        if gap.end(0) in boundary_set:
            return gap.end(0)
    # Pass 2: a selected boundary falls inside a gap.
    for boundary in boundaries:
        for gap in gaps:
            if gap.start(0) <= boundary < gap.end(0):
                return gap.end(0)
    # Pass 3: a gap ends one or two characters before a selected boundary.
    for gap in gaps:
        if gap.end(0) + 1 in boundary_set or gap.end(0) + 2 in boundary_set:
            return gap.end(0)
    return None


def divide_in_columns(page,ends_dic,lines_dic):
    """Split the lines of a two-column page into a left and a right column.

    A gap end offset counts as a column boundary when it occurs on more than
    ``threshold`` (10%) of the page's lines and lies beyond offset 20. Lines
    shorter than the smallest boundary go entirely to the left column. Lines
    of fewer than two characters are skipped. Lines that cannot be split are
    written to the module-level ``opf_error`` log and left out of the result.

    When no offset qualifies as a boundary, every remaining line is put in
    the left column instead of failing on an empty ``min()``.

    Args:
        page: list of text lines.
        ends_dic: gap end offset -> number of occurrences on the page.
        lines_dic: line index -> list of gap match objects for that line.

    Returns:
        A tuple ``(kolom1, kolom2)`` of lists of stripped strings.
    """
    threshold = 0.1
    selected_boundaries = [
        boundary for boundary, count in ends_dic.items()
        if count / len(page) > threshold and boundary > 20
    ]
    # The list keeps the search order for pass 2; the set gives fast lookups.
    boundary_set = set(selected_boundaries)
    # With no boundary at all, every line counts as "shorter than the boundary".
    min_b = min(selected_boundaries, default=float("inf"))

    kolom1 = []
    kolom2 = []
    errors = []
    for i, line in enumerate(page):
        if len(line) < 2:
            continue
        if len(line) < min_b:
            # Too short to reach the second column.
            kolom1.append(line.strip())
            continue
        split_at = _find_column_split(lines_dic[i], selected_boundaries, boundary_set)
        if split_at is None:
            errors.append(line)
        else:
            kolom1.append(line[:split_at].strip())
            kolom2.append(line[split_at:].strip())
    if errors:
        opf_error.write("\tSegmentation errors: \n")
        for e in errors:
            opf_error.write(e)
    return kolom1,kolom2

def divide_in_records(text,names):
    """Cut the lines of one column into records, one per lemma name found.

    A line containing one of ``names`` starts a new record under that name.
    Lines before the first name are collected under the placeholder name
    ``"INHERIT"``. For lines without a name, every upper-case alphabetic word
    longer than two characters that is not a roman numeral is reported to the
    module-level ``opf_error`` log, as it may be a name missing from ``names``.

    Args:
        text: list of lines of one column.
        names: list of lemma names expected on the page.

    Returns:
        A tuple ``(records, names_found)``. ``records`` is a list of dicts with
        keys ``"name"`` and ``"text"`` (a list of lines); ``names_found`` lists
        the names that matched, in order of appearance.
    """
    records = []
    current_record = {"name":"INHERIT","text":[]}
    names_found = []
    for line in text:
        name_found = False
        for name in names:
            if name in line:
                # Close the running record (if it has any text) and open a
                # new one that starts with this line.
                if current_record["text"]:
                    records.append(current_record)
                current_record = {"name": name, "text": [line]}
                name_found = True
                names_found.append(name)
        if name_found:
            continue

        current_record["text"].append(line)
        # The original guard tested the bound method ``w.isupper`` instead of
        # calling it, so it was always true; the per-word check is what counts.
        suspicious = [
            w for w in line.split(" ")
            if w.isupper() and len(w) > 2 and w.isalpha() and not is_roman_numeral(w)
        ]
        for w in suspicious:
            opf_error.write("\tDetected uppercase word %s\n"%(w))
        if suspicious:
            opf_error.write("\t**  Names to look for: %s\n"%(names,))
    if current_record["text"]:
        records.append(current_record)
    return records, names_found

def merge_records(records1,records2,begin_page,end_page):
    """Join the records of the two columns of a page into one list.

    Records of the first column get ``begin_page`` as their ``"pages"`` value
    and records of the second column get ``end_page``. When the second column
    opens with an ``"INHERIT"`` record, its text is appended to the last record
    of the first column, whose ``"pages"`` becomes ``"begin-end"``.

    Empty inputs are tolerated: an empty second column leaves the first column
    untouched, and an opening ``"INHERIT"`` record with no first-column record
    to attach to is kept as a record of its own.

    Args:
        records1: records of the first column (the dicts are modified in place).
        records2: records of the second column (the dicts are modified in place).
        begin_page: page label for the first column.
        end_page: page label for the second column.

    Returns:
        The combined list of record dicts.
    """
    records = []
    for r in records1:
        r["pages"] = str(begin_page)
        records.append(r)
    if not records2:
        return records

    first, rest = records2[0], records2[1:]
    if first.get("name") == "INHERIT" and records:
        # The text continues the last record of the first column.
        records[-1]["text"].extend(first["text"])
        records[-1]["pages"] = "%s-%s" % (begin_page,end_page)
    else:
        first["pages"] = str(end_page)
        records.append(first)
    for r in rest:
        r["pages"] = str(end_page)
        records.append(r)
    return records

def merge_text(record):
    """Collapse the list of lines in ``record["text"]`` into a single string.

    An empty line becomes a newline, a line ending in ``-`` is joined to what
    follows without the hyphen, and any other line is followed by one space.
    The record is modified in place and also returned.
    """
    def _piece(line):
        if not line:
            return "\n"
        if line.endswith("-"):
            return line[:-1]
        return line + " "

    record["text"] = "".join(_piece(line) for line in record.get("text"))
    return record
                

In [26]:
# Volume to process and the range of page files to read (BEGIN .. END-1).
vol_nr = "IX"
BEGIN = 8
END = 228
# Page numbers to leave out. The processing loop tests membership in SKIP, so
# the name has to exist even when no page is skipped; list pages here to
# exclude them (an earlier run used [215]).
SKIP = []
#II_BEGIN = 5
#II_END = 502
#III_BEGIN = 19
#III_END = 490
#IV_BEGIN = 23
#IV_END = 507
#V_BEGIN = 21
#V_END = 472
#VI_BEGIN = 22
#VI_END = 591
#VIIa_BEGIN = 4
#VIIa_END = 252
#VIIb_BEGIN = 4
#VIIb_END = 204
#VIIc_BEGIN = 16
#VIIc_END = 208
#VIII_BEGIN = 6
#VIII_END = 243

In [27]:
import json

# Process every page of the volume into lemma records and write them as JSON.
# Problems are written to a per-volume error log. The helper functions write
# to that log through the module-level name ``opf_error``, so the handle has
# to be bound under exactly that name.
with open("processing_errors_vol_%s.txt"% vol_nr,"w",encoding="utf-8") as opf_error:
    # Read the page files numbered BEGIN .. END-1.
    pages = []
    for i in range(BEGIN,END):
        with open("verbeterd/vol_%s/%s.txt" % (vol_nr,i),"r",encoding="utf-8") as ipf:
            pages.append(ipf.readlines())

    all_records = []

    begin_index = 0

    last_name_of_previous_page = "INHERIT"
    for i,page in enumerate(pages[begin_index:]):
        page_i = begin_index+i+1
        page_nr = page_i+BEGIN-1  # number of the page file being processed
        opf_error.write("Processing %s\n" % (page_nr))
        print("Processing %s"%(page_nr))
        if page_nr in SKIP:
            continue

        # The header runs from the top of the page to the first blank line.
        cutoff_header = next(
            (k for k,header_line in enumerate(page) if len(header_line.strip()) == 0),
            None,
        )
        if cutoff_header is None:
            opf_error.write("\tNo whitespace between header and body in page %s\n"%(page_nr))
            continue
        headers = page[:cutoff_header]
        if not headers:
            # The page starts with a blank line; find_lemmata needs headers[0].
            opf_error.write("\tNo header found in page %s\n"%(page_nr))
            continue

        # The body starts at the first non-blank line after the header.
        cutoff_body = next(
            (k for k in range(cutoff_header,len(page)) if len(page[k].strip()) > 0),
            None,
        )
        if cutoff_body is None:
            opf_error.write("\tNo body text found in page %s\n"%(page_nr))
            continue
        body = page[cutoff_body:]
        if vol_nr == "IX":
            # For volume IX the last line of each page is dropped.
            # NOTE(review): presumably a footer line — confirm.
            body = body[:-1]

        # find_lemmata returns None (and logs the reason) for a bad header.
        lemmata = find_lemmata(headers)
        if lemmata is None:
            continue
        names,page_begin,page_end = lemmata

        ends_dic,lines_dic = find_column_separation(body)
        kolom1,kolom2 = divide_in_columns(body,ends_dic,lines_dic)

        records_kolom1,names_found1 = divide_in_records(kolom1,names)
        records_kolom2,names_found2 = divide_in_records(kolom2,names)

        names_found1.extend(names_found2)

        # A first header name equal to the last name of the previous page is
        # not expected to appear again in the text.
        if names and names[0] == last_name_of_previous_page:
            names_to_spot = names[1:]
        else:
            names_to_spot = names

        names_not_found = list(set(names_to_spot)-set(names_found1))

        if names:
            last_name_of_previous_page = names[-1]

        if len(names_not_found) > 0:
            opf_error.write("\tFollowing names not found in page %s: %s\n"% (page_nr,names_not_found))
        records = merge_records(records_kolom1,records_kolom2,page_begin,page_end)

        # Text before the first name on a page continues the last record of
        # the preceding pages.
        if all_records and records and records[0].get("name") == "INHERIT":
            all_records[-1]["text"].extend(records[0]["text"])
            all_records[-1]["pages"] +=  "-%s" % (page_begin)
            all_records.extend(records[1:])
        else:
            all_records.extend(records)

all_records_clean = [merge_text(r) for r in all_records]

# Not used in the visible code; kept in case a later cell reads it.
db = json.dumps(all_records_clean,ensure_ascii=False).encode("utf-8")

with open("vol_%s.json"%(vol_nr),"w",encoding="utf-8") as opf:
    json.dump(all_records_clean,opf,ensure_ascii=False,indent=4)





Processing 8
Processing 9
Processing 10
Processing 11
Processing 12
Processing 13
Processing 14
Processing 15
Processing 16
Processing 17
Processing 18
Processing 19
Processing 20
Processing 21
Processing 22
Processing 23
Processing 24
Processing 25
Processing 26
Processing 27
Processing 28
Processing 29
Processing 30
Processing 31
Processing 32
Processing 33
Processing 34
Processing 35
Processing 36
Processing 37
Processing 38
Processing 39
Processing 40
Processing 41
Processing 42
Processing 43
Processing 44
Processing 45
Processing 46
Processing 47
Processing 48
Processing 49
Processing 50
Processing 51
Processing 52
Processing 53
Processing 54
Processing 55
Processing 56
Processing 57
Processing 58
Processing 59
Processing 60
Processing 61
Processing 62
Processing 63
Processing 64
Processing 65
Processing 66
Processing 67
Processing 68
Processing 69
Processing 70
Processing 71
Processing 72
Processing 73
Processing 74
Processing 75
Processing 76
Processing 77
Processing 78
Processi