# Make Translations JSONL

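Downloads ORACC and CDLI texts, pairs each normalized transliteration with its human (English) translation, and writes the pairs as JSONL files to the `../data` folder.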

http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/inlinetutorial/index.html

In [1]:
import os, sys, math
import requests
import json
import time
import random
import shutil
import zipfile
from collections import defaultdict
from glob import glob
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
import cdli
import oracc
import languages

In [3]:
# Working directory passed to the oracc download/load helpers below.
# The location is machine-specific, so it can be overridden with the
# ORACC_ZIPS_DIR environment variable; the default is the original path.
out_dir = os.path.abspath(os.environ.get("ORACC_ZIPS_DIR", "/Volumes/FrankDisk/oracc_zips"))
os.makedirs(out_dir, exist_ok=True)
out_dir

'/Volumes/FrankDisk/oracc_zips'

In [4]:
# Maps publication id -> parsed ORACC publication; filled in by the
# "Load ORACC Translations into Memory" cell further down.
# NOTE(review): presumably initialized up here, away from that loading cell, so
# that re-running cells near it does not accidentally reset the dict — confirm.
oracc_translated_pubs = dict()

## Download ORACC Projects

In [5]:
# List of local project zip paths under out_dir (a sample is printed in the next cell).
# NOTE(review): presumably downloads any archive that is missing — confirm in oracc.py.
project_zips = oracc.get_all_project_zips(out_dir, verbose=False, tqdm=tqdm)

  0%|          | 0/136 [00:00<?, ?it/s]

In [6]:
print(len(project_zips))
project_zips[:3]

115


['/Volumes/FrankDisk/oracc_zips/adsd.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart1.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart2.zip']

In [7]:
# Collect the corpus object ids reported by the project zips.
# project_zips[:] passes a shallow copy of the full list; narrow the slice to try a subset.
all_corpus_object_ids = oracc.get_all_corpus_object_ids(project_zips[:], tqdm=tqdm)

  0%|          | 0/115 [00:00<?, ?it/s]

In [8]:
len(all_corpus_object_ids)

25522

## Load ORACC Transliterations

In [9]:
# Two results:
#   oracc_pub_ids_and_langs    - iterated later as sequences whose [0] is a pub id
#                                and whose [1] is a language code.
#   transliterated_oracc_corpi - mapping keyed by pub id; each value has a "lang" entry.
# NOTE(review): the recorded run printed many JSONDecodeError lines, so some project
# files were apparently not valid JSON — check how oracc.py treats those projects.
oracc_pub_ids_and_langs, transliterated_oracc_corpi = oracc.load_all_project_pub_ids(out_dir, tqdm=tqdm)

  0%|          | 0/136 [00:00<?, ?it/s]

Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd473f53540>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd2a7dbccc0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd276430140>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd276fcc6c0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd260238ac0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fd276ffa9c0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <t

In [10]:
# Ids of every ORACC publication that has a transliteration (the mapping's keys).
transliterated_oracc_pub_ids = set(transliterated_oracc_corpi)

In [11]:
print(len(oracc_pub_ids_and_langs), "oracc pubs")
print(len(transliterated_oracc_pub_ids), "oracc transliterated pubs")

132703 oracc pubs
21607 oracc transliterated pubs


In [12]:
# Peek at the five largest ids in sort order.
sorted(transliterated_oracc_pub_ids)[-5:]

['X900957', 'X900975', 'X900978', 'X987003', 'X999985']

## Download CDLI for Reference

In [13]:
# Fetch and parse the CDLI ATF dump (the log below shows the download URL).
# cdli_pubs is iterated later as publication objects exposing .id, .language
# and .text_areas.
cdli_pubs = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


## Download ORACC Translations

In [14]:
# Pairs that the next cell unpacks as (pid, oid) and hands to the translation downloader.
# NOTE(review): tqdm is passed positionally here but by keyword in the other
# oracc calls — confirm it lands on the intended parameter.
reported_translated_ids = oracc.get_all_translated_object_ids(project_zips, tqdm)
print(len(reported_translated_ids), "reported translations")

  0%|          | 0/115 [00:00<?, ?it/s]

16277 reported translations


In [15]:
# Download the translation page of every reported (pid, oid) pair into out_dir.
# The returned path is only needed for the debugging print below.
for pid, oid in tqdm(reported_translated_ids[:]):
    tpath = oracc.download_object_translation(out_dir, pid, oid)
#     print(tpath)

  0%|          | 0/16277 [00:00<?, ?it/s]

In [16]:
# Index the downloaded pages by object id, then keep a sorted list of those ids.
all_object_html_paths = oracc.get_all_object_html_paths(out_dir)
all_translated_ids = sorted(all_object_html_paths)
print(len(all_object_html_paths))
all_translated_ids[:3]

16102


['P010092', 'P010452', 'P010573']

## Normalize ORACC Transliterations

In [17]:
def load_html(path, parser=None):
    """Read an HTML file from disk and parse it with BeautifulSoup.

    Args:
        path: Path of the HTML file.
        parser: Optional parser name forwarded to BeautifulSoup as ``features``
            (for example ``"html.parser"``).  The default ``None`` keeps the
            previous behavior of letting BeautifulSoup pick a parser.

    Returns:
        The parsed ``BeautifulSoup`` document.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so the
    # transliteration characters (š, ŋ, subscript digits, ...) read the same everywhere.
    with open(path, "rt", encoding="utf-8") as f:
        return BeautifulSoup(f.read(), features=parser)
    
def load_html_for_object_id(object_id):
    """Parse the downloaded page of one object.

    The page path is looked up in ``all_object_html_paths``; an unknown id
    raises ``KeyError``.
    """
    html_path = all_object_html_paths[object_id]
    return load_html(html_path)




In [18]:
def get_object_id_pub(object_id):
    """Parse the downloaded ORACC HTML page of one object into a cdli.Publication.

    Every ``div.text`` on the page that contains a ``table.transliteration`` is
    walked row by row:

    * Header rows (class ``h``) set the current surface or column name and start
      a new text area; the area is only created once its first line arrives.
    * Rows that do not have exactly one ``lnum`` cell are skipped.
    * Rows with one ``tlit`` cell add one line.  If the row also has an ``xtr``
      cell, its text is stored as an ``"en"`` paragraph covering ``rowspan``
      lines starting at this one.
    * Rows with several ``c`` cells and the same number of ``xtr`` cells add one
      line and one single-line paragraph per cell; the line number gets a
      ``.<index>`` suffix.
    * Rows without a ``tlit`` cell are ignored.

    Args:
        object_id: Key into ``all_object_html_paths`` (e.g. ``"P250815"``).

    Returns:
        The populated ``cdli.Publication``.  Its ``language`` is the code that
        ``tlit_to_normalized_ascii`` returned most often for the parsed lines
        (this can be ``"?"``), or ``None`` when no line was parsed.

    Raises:
        ValueError: A row has several ``tlit`` cells in an unsupported layout.
    """
    pub = cdli.Publication(object_id)

    surface = ""
    column = ""
    text_area = None

    def add_line(number, cuneiform):
        """Append a line to the current text area, creating the area on first use."""
        nonlocal text_area
        if text_area is None:
            # Area name is "<surface> <column>", leaving out whichever part is empty.
            name = " ".join(part for part in (surface, column) if len(part) > 0)
            text_area = cdli.TextArea(name=name)
            pub.text_areas.append(text_area)
        text_area.lines.append(cdli.TextLine(number=number, text=cuneiform))

    html = load_html_for_object_id(object_id)
    texts = html.find_all("div", class_="text")
    langs = defaultdict(lambda: 0)

    for text in texts:
        surface = ""
        column = ""
        text_area = None
        line_index = 0
        table = text.find("table", class_="transliteration")
        if table is None:
            continue
        # The text's <h1> title used to be read here only for a debug print; the
        # lookup crashed on texts without an <h1>, so it has been removed.
        for r in table.find_all("tr"):
            cols = r.find_all("td")
            rclasses = r["class"] if r.has_attr("class") else []
            if "h" in rclasses:
                htext = cols[0].text.strip()
                if "surface" in rclasses:
                    surface = htext
                    column = ""
                elif "column" in rclasses:
                    column = htext
                # Any header row closes the current area and restarts line numbering.
                text_area = None
                line_index = 0
                continue

            lnums = [x for x in cols if x.has_attr("class") and "lnum" in x["class"]]
            if len(lnums) != 1:
                continue
            lnum = lnums[0].text.strip()
            tlits = [x for x in cols if x.has_attr("class") and "tlit" in x["class"]]
            ntlits = len(tlits)
            cs = [x for x in cols if x.has_attr("class") and "c" in x["class"]]
            xtrs = [x for x in cols if x.has_attr("class") and "xtr" in x["class"]]
            if ntlits == 1:
                tlit, lang = tlit_to_normalized_ascii(tlits[0])
                langs[lang] += 1
                add_line(lnum, tlit)
                if len(xtrs) > 0:
                    # A translation cell may span several transliteration rows.
                    rowspan = 1
                    if xtrs[0].has_attr("rowspan"):
                        rowspan = int(xtrs[0]["rowspan"])
                    para = cdli.TextParagraph(line_index, line_index + rowspan)
                    para.languages["en"] = xtr_to_en(xtrs[0])
                    text_area.paragraphs.append(para)
                line_index += 1
            elif len(cs) > 0 and len(cs) == len(xtrs):
                # Multi-column row: one line plus one paragraph per column cell.
                for i, c in enumerate(cs):
                    tlit, lang = tlit_to_normalized_ascii(c)
                    langs[lang] += 1
                    add_line(lnum + f".{i}", tlit)
                    para = cdli.TextParagraph(line_index, line_index + 1)
                    para.languages["en"] = xtr_to_en(xtrs[i])
                    text_area.paragraphs.append(para)
                    line_index += 1
            elif ntlits == 0:
                pass
            else:
                raise ValueError(
                    f"Unsupported format in {object_id} line {lnum!r}: ntlits={ntlits}"
                )

    # Most frequent line language wins; ties go to the language seen first.
    pub.language = max(langs.items(), key=lambda kv: kv[1])[0] if len(langs) > 0 else None
    return pub

def print_pub_paragraphs(pub, tgt_len="en"):
    """Print every paragraph of ``pub`` as a source line and a target line.

    For each text area a ``<pub id> <area name>`` header is printed, then per
    paragraph the space-joined source lines (``>>>>``) and the translation
    stored under ``tgt_len`` (``<<<<``), and finally an empty line.

    Args:
        pub: Publication with ``id`` and ``text_areas``.
        tgt_len: Key of the target language in each paragraph's ``languages``;
            a paragraph that lacks it raises ``KeyError``.
    """
    for area in pub.text_areas:
        print(pub.id, area.name)
        for para in area.paragraphs:
            covered = area.lines[para.start_line_index:para.end_line_index]
            source = " ".join(line.text for line in covered)
            target = para.languages[tgt_len]
            print("   >>>>", source)
            print("   <<<<", target)
        print("")
        
def print_pub_lines(pub, tgt_len="en"):
    """Print the raw lines of every text area of ``pub``.

    Each area starts with a ``<pub id> <area name>`` header and ends with an
    empty line.  ``tgt_len`` is accepted but not used by this function.
    """
    for area in pub.text_areas:
        print(pub.id, area.name)
        for text in (entry.text for entry in area.lines):
            print(text)
        print("")
        

def xtr_to_en(xtr):
    """Extract the translation text from a translation (``xtr``) table cell.

    Looks for a ``p.tr`` inside the cell and, within it, a ``span.cell``.  The
    stripped text of the span is returned when present, otherwise the stripped
    text of the paragraph; a cell with no ``p.tr`` yields ``""``.
    """
    paragraph = xtr.find("p", class_="tr")
    if paragraph is None:
        return ""
    inner = paragraph.find("span", class_="cell")
    holder = paragraph if inner is None else inner
    return holder.text.strip()
            
# CSS classes whose nodes tlit_to_normalized_ascii drops entirely.
tlit_ignore_classes = {"marker"}

def is_node_sign(node):
    """Tell whether a child node counts as part of a sign.

    A plain string counts only when it is exactly ``"."``.  An element counts
    when it is a ``<sup>`` or carries the ``sign`` class.
    """
    if isinstance(node, str):
        return node == "."
    if node.name == "sup":
        return True
    return node.has_attr("class") and "sign" in node["class"]

def tlit_to_normalized_ascii(tlit):
    """Flatten a transliteration cell into normalized ASCII and detect its language.

    The node tree is walked depth first and its text pieces are collected:

    * nodes carrying a class listed in ``tlit_ignore_classes`` are dropped;
    * ``<sup>`` content is wrapped in ``{`` ... ``}``; a ``<sup>`` with class
      ``sux`` whose text is ``m`` becomes ``{disz}``;
    * classes of non-sign ``<span>`` nodes that appear in
      ``languages.all_languages`` are counted as language votes.

    Args:
        tlit: The cell node (iterable over its children, with ``name``,
            ``has_attr`` and item access for attributes).

    Returns:
        ``(text, lang)`` where ``text`` is the collected tokens passed through
        ``oracc_unicode_words_to_normalized_ascii`` and ``lang`` is the language
        with the most votes (first seen wins ties), or ``"?"`` without votes.
    """
    langs = defaultdict(lambda: 0)

    def node_to_str(node, in_sign):
        """Return the list of text pieces produced by ``node`` and its children."""
        if isinstance(node, str):
            return [node]
        classes = node["class"] if node.has_attr("class") else []
        if any(c in tlit_ignore_classes for c in classes):
            return []
        if node.name == "span" and "sign" not in classes:
            for c in classes:
                if c in languages.all_languages:
                    langs[c] += 1
        parts = []
        is_sup = node.name == "sup"
        is_sign = all(is_node_sign(x) for x in node)
        if is_sup:
            parts.append("{")
            # Use the safely computed class list: a <sup> without a class
            # attribute would raise KeyError on node["class"].
            if "sux" in classes and node.text == "m":
                parts.extend(["disz", "}"])
                return parts
        # in_sign is only tracked while recursing; it currently does not change
        # the produced tokens.
        children_in_sign = is_sign and not in_sign
        for c in node:
            parts.extend(node_to_str(c, in_sign=in_sign or children_in_sign))
        if is_sup:
            parts.append("}")
        return parts

    tokens = node_to_str(tlit, in_sign=False)
    lang = max(langs.items(), key=lambda kv: kv[1])[0] if len(langs) > 0 else "?"
    return oracc_unicode_words_to_normalized_ascii(tokens), lang

# get_object_id_pub(all_translated_ids[12900])

In [19]:
def oracc_unicode_words_to_normalized_ascii(tokens):
    """Convert Unicode ATF tokens to one normalized ASCII ATF string.

    A token found in ``unicode_atf_to_ascii_atf_token_replacements`` is swapped
    for its table value as a whole.  Any other token gets every ``(old, new)``
    pair of ``languages.unicode_atf_to_ascii_atf_replacements`` applied in
    order.  The converted tokens are concatenated without separator, passed
    through ``languages.underline_sign_names`` and stripped.
    """
    converted = []
    for token in tokens:
        if token in unicode_atf_to_ascii_atf_token_replacements:
            converted.append(unicode_atf_to_ascii_atf_token_replacements[token])
            continue
        for old, new in languages.unicode_atf_to_ascii_atf_replacements:
            token = token.replace(old, new)
        converted.append(token)
    joined = "".join(converted)
    return languages.underline_sign_names(joined).strip()

In [20]:
# Whole-token overrides, ORACC Unicode reading -> ASCII ATF reading.
# oracc_unicode_words_to_normalized_ascii consults this table first; only tokens
# that are not listed here go through the substring replacements in
# languages.unicode_atf_to_ascii_atf_replacements.
unicode_atf_to_ascii_atf_token_replacements = {
    "bán": "ban2",
    "buru": "bur'u",
    "èše": "esze3",
    "géš": "gesz2",
    "šár": "szar2",
    "bùr": "bur3",
    "GÁN": "GAN2",
    "sá": "sa2",
    "sìla": "sila3",
    "lú": "lu2",
    "gur₇": "guru7",
    "taka₄": "tak4",
    "líd": "lid2",
    "zíd": "zi3",
    "teŋ₄": "ti",
    "šaru": "szar'u",
    "úš": "us2",
    "kùš": "kusz3",
    "kug": "ku3",
    "sig₁₀": "si3",
    "zid₂": "zi3",
    "gud": "gu4",
    "bí": "bi2",
    "dug₄": "du11",
    "diŋir": "dingir",
    "àm": "am3",
    "íb": "ib2",
    "íl": "il2",
    "su₁₃": "su3",
    "GIR₃": "GIRI3",
    "ŋiri₂": "gir2",
    "IR₃": "ARAD2",
    "ti₇": "te",
    "giggi": "kukku5",
    "sag₁₀": "saga",
    "gurum": "gur2",
    "aḫ₃": "had2",
    "tu₄": "tum",
    "gab₂": "kab",
    "zid": "zi",
    "tum₁₂": "tu",
    "GEME₂": "dam",
    "šag₄": "sza3",
    "kud": "ku5",
    # Disabled bu/pu swap, kept for reference:
#     "bu": "pu",
#     "pu": "bu",
}

# NOTE(review): not referenced by any code visible in this notebook; presumably a
# record of spelling differences within CDLI itself — confirm before relying on it.
cdli_inconsistencies = {
    "sumun2": "sun2",
}



In [21]:
# Sample object ids for spot-checking the normalization by hand; the trailing
# comments name the reading each sample is meant to exercise.
test_object_ids = [
    "P250815", # szag4 to sza3, kud to ku5
    "P271132", # geme2 to dam
    "P332924", # gab2 to kab, zid to zi, tum12 to tu
    "P271187", # tu4 to tum, ir3 to ARAD2
    "P271030", # pu to bu
    "P228726", # sag10 to saga, gurum to gur2, ah3 to had2
    "P247541", # giggi to kukku5
    "P503256", # Links in cuneiform
    "P237767",
    "P503256", # NOTE(review): duplicate of the "Links in cuneiform" entry above
    "P237730",

    "P010627", # Notes: o ii 66

    "Q000041",
    "Q000057",
]

In [22]:
# Smoke test: parse and print only the first sample; widen the slice to check more.
for t in test_object_ids[:1]:
    print_pub_lines(get_object_id_pub(t))

P250815 


P250815 Obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<{disz}>>ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
{iti}szu-numun-a
{gesz}sza3-gesznimbar nu-ba-an-ku5

P250815 Reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti



## Compare with CDLI Transliterations

In [23]:
# Index the CDLI publications by id and keep the id set for comparisons.
cdli_index = {pub.id: pub for pub in cdli_pubs}
cdli_pub_ids = set(cdli_index)
len(cdli_pub_ids), "cdli pubs"

(134752, 'cdli pubs')

In [24]:
# Ids present both in CDLI and among the downloaded ORACC translation pages.
common_pub_ids = sorted(cdli_pub_ids.intersection(all_translated_ids))
len(common_pub_ids), "common"

(775, 'common')

In [25]:
# Print the CDLI lines and then the normalized ORACC lines of one shared
# publication so the two can be compared by eye.  In the recorded run, index 281
# is P250815, the first id in test_object_ids; the index depends on the current
# contents of common_pub_ids.
for pid in common_pub_ids[281:282]:
    print("="*40)
    cdli_pub = cdli_index[pid]
    print_pub_lines(cdli_pub)
    print("-"*40)
    oracc_pub = get_object_id_pub(pid)
    print_pub_lines(oracc_pub)

P250815 tablet

P250815 obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<disz>> ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
iti szu-numun-a
{gesz}sza3-geszimmar nu-ba-an-ku5

P250815 reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti

----------------------------------------
P250815 


P250815 Obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<{disz}>>ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
{iti}szu-numun-a
{gesz}sza3-gesznimbar nu-ba-an-ku5

P250815 Reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti



## Load ORACC Translations into Memory

In [26]:
oracc_langs = defaultdict(lambda: 0)

# Parse the pages in shuffled order, but with a fixed seed.  The insertion order
# of oracc_translated_pubs decides which translation output_translations keeps
# when two paragraphs share the same source text, so an unseeded shuffle made the
# written JSONL differ from run to run.
random_ids = list(all_translated_ids)
random.Random(0).shuffle(random_ids)

for pid in tqdm(random_ids[:]):
    p = get_object_id_pub(pid)
    oracc_translated_pubs[pid] = p
    if p.language is not None:
        oracc_langs[p.language] += 1

# Publication counts per detected language, most frequent first.
oracc_langs = sorted([(x, oracc_langs[x]) for x in oracc_langs.keys()], key=lambda x:-x[1])
oracc_langs

  0%|          | 0/16102 [00:00<?, ?it/s]

[('akk', 9755),
 ('sux', 2535),
 ('?', 1812),
 ('xur', 335),
 ('peo', 95),
 ('elx', 8),
 ('arc', 8),
 ('grc', 1)]

In [27]:
len(oracc_translated_pubs), "oracc translated pubs"

(16102, 'oracc translated pubs')

## Show CDLI and ORACC Corpus Stats

In [28]:
def cdli_text_area_is_translated(pub, text_area, tgt_lang):
    """Return True when at least one line of ``text_area`` has a ``tgt_lang`` entry.

    ``pub`` is accepted for symmetry with the caller but is not inspected.
    """
    return any(tgt_lang in entry.languages for entry in text_area.lines)

def cdli_pub_is_translated(pub, tgt_lang):
    """Return True when any text area of ``pub`` has a line translated to ``tgt_lang``.

    The predicate result itself is tested.  The previous form,
    ``any(x for x in ... if pred(x))``, tested the truthiness of the matching
    text area object, which is wrong for any area object that evaluates as false.
    """
    return any(
        cdli_text_area_is_translated(pub, area, tgt_lang) for area in pub.text_areas
    )

# CDLI publications that have at least one line with an "en" translation, keyed by id.
cdli_translated_pubs = {x.id: x for x in cdli_pubs if cdli_pub_is_translated(x, "en")}
len(cdli_translated_pubs), "cdli translated pubs"

(5369, 'cdli translated pubs')

In [29]:

def show_language_stats(src_lang, tgt_lang="en"):
    """Print transliteration and translation coverage for one source language.

    Counts are taken from the notebook globals ``cdli_pubs``,
    ``transliterated_oracc_corpi``, ``cdli_translated_pubs``,
    ``oracc_translated_pubs`` and ``oracc_pub_ids_and_langs``, restricted to
    entries whose language equals ``src_lang``.  ``tgt_lang`` is accepted but
    not used by this function.
    """
    # Publications that have a transliteration, per corpus and combined.
    cdli_tlit_ids = {pub.id for pub in cdli_pubs if pub.language == src_lang}
    oracc_tlit_ids = {
        pub_id
        for pub_id, entry in transliterated_oracc_corpi.items()
        if entry["lang"] == src_lang
    }
    tlit_ids = cdli_tlit_ids | oracc_tlit_ids

    # Publications that already have a translation.
    cdli_done_ids = {
        pub.id for pub in cdli_translated_pubs.values() if pub.language == src_lang
    }
    oracc_done_ids = {
        pub_id
        for pub_id, pub in oracc_translated_pubs.items()
        if pub.language == src_lang
    }
    done_ids = cdli_done_ids | oracc_done_ids

    # Transliterated but translated in neither corpus.
    cdli_todo_ids = cdli_tlit_ids - done_ids
    oracc_todo_ids = oracc_tlit_ids - done_ids
    todo_ids = cdli_todo_ids | oracc_todo_ids

    oracc_ids = {entry[0] for entry in oracc_pub_ids_and_langs if entry[1] == src_lang}
    publication_ids = cdli_tlit_ids | oracc_ids

    print(f"                       lang: {src_lang}")
    print()
    print(f"    num transliterated cdli: {len(cdli_tlit_ids)}")
    print(f"        num translated cdli: {len(cdli_done_ids)}")
    print(f" num need translations cdli: {len(cdli_todo_ids)}")

    print()
    print(f"                  num oracc: {len(oracc_ids)}")
    print(f"   num transliterated oracc: {len(oracc_tlit_ids)}")
    print(f"       num translated oracc: {len(oracc_done_ids)}")
    print(f"num need translations oracc: {len(oracc_todo_ids)}")
    print("-"*42)
    print(f"           num publications: {len(publication_ids)}")
    print(f"       num transliterations: {len(tlit_ids)}")
    print(f"           num translations: {len(done_ids)}")
    print(f"      num need translations: {len(todo_ids)}")

# Coverage report for Sumerian (sux), a separator line, then Akkadian (akk).
show_language_stats(src_lang="sux")
print("="*42)
show_language_stats(src_lang="akk")

                       lang: sux

    num transliterated cdli: 99433
        num translated cdli: 4153
 num need translations cdli: 95180

                  num oracc: 104256
   num transliterated oracc: 5204
       num translated oracc: 2535
num need translations oracc: 4025
------------------------------------------
           num publications: 116060
       num transliterations: 104151
           num translations: 6636
      num need translations: 98896
                       lang: akk

    num transliterated cdli: 21890
        num translated cdli: 976
 num need translations cdli: 20678

                  num oracc: 26993
   num transliterated oracc: 16403
       num translated oracc: 9755
num need translations oracc: 6716
------------------------------------------
           num publications: 45207
       num transliterations: 35724
           num translations: 10728
      num need translations: 25063


## Output Translations

In [57]:
# Folder (relative to the notebook's working directory) that receives the JSONL files.
translations_out_dir = "../data"
os.makedirs(translations_out_dir, exist_ok=True)

In [59]:
def lines_to_paragraphs(self, src_lang, tgt_lang, max_length=128):
    """Rebuild ``self.paragraphs`` by grouping the lines of a text area.

    Written like a method, but called as a plain function with the text area
    as ``self``.

    * A line for which ``cdli.looks_like_li`` is true becomes its own
      single-line paragraph with tag ``"li"``.
    * Any other line extends the previous paragraph when that paragraph has tag
      ``"p"`` and the summed line lengths stay below ``max_length``; otherwise
      it starts a new paragraph.
    * If any line carries a ``tgt_lang`` translation, each paragraph gets the
      space-joined translations of its lines (missing ones count as ``""``),
      cleaned with ``languages.remove_extraneous_space``.

    Args:
        self: Text area with ``lines``; its ``paragraphs`` attribute is replaced.
        src_lang: Source language code handed to ``cdli.looks_like_li``.
        tgt_lang: Key of the translation in each line's ``languages``.
        max_length: Exclusive upper bound on the summed text length of the lines
            merged into one paragraph.

    Returns:
        The new ``self.paragraphs`` list.
    """
    self.paragraphs = list()
    plen = 0  # summed text length of the lines in the paragraph being built
    for iline, line in enumerate(self.lines):
        line_len = len(line.text)
        if cdli.looks_like_li(line.text, src_lang):
            self.paragraphs.append(cdli.TextParagraph(iline, iline+1, "li"))
            plen = line_len
        elif len(self.paragraphs) > 0 and self.paragraphs[-1].tag == "p" and plen + line_len < max_length:
            self.paragraphs[-1].end_line_index += 1
            plen += line_len
        else:
            self.paragraphs.append(cdli.TextParagraph(iline, iline+1))
            plen = line_len
    # Test the membership itself; the old any(l for l in ... if ...) form tested
    # the truthiness of the matching line object instead.
    if any(tgt_lang in l.languages for l in self.lines):
        for p in self.paragraphs:
            lines = self.lines[p.start_line_index:p.end_line_index]
            tlines = [(x.languages[tgt_lang] if tgt_lang in x.languages else "") for x in lines]
            p.languages[tgt_lang] = languages.remove_extraneous_space(" ".join(tlines))
    return self.paragraphs



In [60]:
def output_translations(corpus_pubs, src_lang, encoding="ascii", tgt_lang="en"):
    """Write the deduplicated source/target pairs of one language to a JSONL file.

    For every ``(corpus name, {id: publication})`` entry, publications whose
    language equals ``src_lang`` are scanned.  Text areas of the ``"cdli"``
    corpus first get their paragraphs rebuilt with ``lines_to_paragraphs``.
    Each paragraph with a ``tgt_lang`` translation yields one pair: the source
    is the space-joined lines cleaned by the ``languages`` helpers, the target
    is the translation without ``[ ] ( )``.  A pair is kept when the source is
    non-empty, ``languages.target_ok`` accepts the target, and the source has
    not been seen before (first occurrence wins, across corpora).

    The kept pairs are written, sorted and newline-separated with no trailing
    newline, to ``{translations_out_dir}/translations_{src_lang}_to_{tgt_lang}.jsonl``.

    Args:
        corpus_pubs: Iterable of ``(corpus name, dict of publications)``.
        src_lang: Source language code; also the JSON key of the source text.
        encoding: Accepted but not used by this function.
        tgt_lang: Target language code; also the JSON key of the target text.

    Returns:
        Number of pairs written.
    """
    seen_sources = set()
    json_lines = []

    for corpus, pubs in corpus_pubs:
        selected = [pub for pub in pubs.values() if pub.language == src_lang]
        print(f"{corpus} {src_lang} with {len(selected)} translated publications")
        longest_line_len = 0
        for pub in tqdm(selected):
            for area in pub.text_areas:
                if corpus == "cdli" and len(area.lines) > 0:
                    lines_to_paragraphs(area, src_lang, tgt_lang)
                for para in area.paragraphs:
                    if tgt_lang not in para.languages:
                        continue
                    covered = area.lines[para.start_line_index:para.end_line_index]
                    src = " ".join([line.text for line in covered])
                    for clean in (
                        languages.remove_blanks,
                        languages.underline_sign_names,
                        languages.dashes_to_dots,
                        languages.remove_extraneous_space,
                    ):
                        src = clean(src)
                    tgt = para.languages[tgt_lang]
                    for bracket in "[]()":
                        tgt = tgt.replace(bracket, "")
                    if len(src) == 0 or not languages.target_ok(tgt) or src in seen_sources:
                        continue
                    longest_line_len = max(len(src) + len(tgt), longest_line_len)
                    json_lines.append(json.dumps({src_lang:src,tgt_lang:tgt}))
                    seen_sources.add(src)

        print(f"{corpus} {src_lang} longest line length:", longest_line_len)

    path = f"{translations_out_dir}/translations_{src_lang}_to_{tgt_lang}.jsonl"

    # One JSON object per line, sorted, without a newline after the last one.
    with open(path, "wt") as f:
        f.write("\n".join(sorted(json_lines)))

    return len(json_lines)
    
# Corpora in priority order: when both contain the same source text, the pair
# from the earlier corpus (cdli) is the one that gets written.
corpi = [("cdli", cdli_translated_pubs), ("oracc", oracc_translated_pubs)]
    
# One JSONL file per source language; the cell displays the sux pair count.
output_translations(corpi, "akk")
output_translations(corpi, "sux")


cdli akk with 976 translated publications


  0%|          | 0/976 [00:00<?, ?it/s]

cdli akk longest line length: 508
oracc akk with 9755 translated publications


  0%|          | 0/9755 [00:00<?, ?it/s]

oracc akk longest line length: 5966
cdli sux with 4153 translated publications


  0%|          | 0/4153 [00:00<?, ?it/s]

cdli sux longest line length: 788
oracc sux with 2535 translated publications


  0%|          | 0/2535 [00:00<?, ?it/s]

oracc sux longest line length: 4185


59419