# Download ORACC

In [50]:
import os, sys, math
import requests
import json
import time
import random
import shutil
import zipfile
from collections import defaultdict
from glob import glob
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
import cdli
import oracc
import languages

In [3]:
out_dir = os.path.abspath(f"/Volumes/FrankDisk/oracc_zips")
os.makedirs(out_dir, exist_ok=True)
out_dir

'/Volumes/FrankDisk/oracc_zips'

## Download ORACC Projects

In [4]:
project_zips = oracc.get_all_project_zips(out_dir, verbose=False, tqdm=tqdm)

  0%|          | 0/136 [00:00<?, ?it/s]

In [5]:
print(len(project_zips))
project_zips[:3]

115


['/Volumes/FrankDisk/oracc_zips/adsd.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart1.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart2.zip']

In [6]:
all_corpus_object_ids = oracc.get_all_corpus_object_ids(project_zips[:], tqdm=tqdm)

  0%|          | 0/115 [00:00<?, ?it/s]

## Download CDLI for Reference

In [7]:
cdli_pubs = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


## Download ORACC Translations

In [8]:
reported_translated_ids = oracc.get_all_translated_object_ids(project_zips, tqdm)
print(len(reported_translated_ids), "reported translations")

  0%|          | 0/115 [00:00<?, ?it/s]

16277 reported translations


In [9]:
for pid, oid in tqdm(reported_translated_ids[:]):
    tpath = oracc.download_object_translation(out_dir, pid, oid)
#     print(tpath)

  0%|          | 0/16277 [00:00<?, ?it/s]

In [10]:
all_object_html_paths = oracc.get_all_object_html_paths(out_dir)
all_translated_ids = sorted(list(all_object_html_paths.keys()))
print(len(all_object_html_paths))
all_translated_ids[:3]

16102


['P010092', 'P010452', 'P010573']

## Normalization

In [11]:
def load_html(path):
    with open(path, "rt") as f:
        return BeautifulSoup(f.read())
    
def load_html_for_object_id(object_id):
#     print(object_id)
    return load_html(all_object_html_paths[object_id])




In [12]:
test_object_ids = [
    "P503256", # Links in cuneiform
    "P237767",
    "P503256",
    "P237730",

    "P010627", # Notes: o ii 66

    "Q000041",
    "Q000057",
]

In [13]:
unicode_atf_to_ascii_atf_token_replacements = {
    "bán": "ban2",
    "buru": "bur'u",
    "èše": "esze3",
    "géš": "gesz2",
    "šár": "szar2",
    "bùr": "bur3",
    "GÁN": "GAN2",
    "sá": "sa2",
    "sìla": "sila3",
    "lú": "lu2",
    "gur₇": "guru7",
    "taka₄": "tak4",
    "líd": "lid2",
    "zíd": "zi3",
    "teŋ₄": "ti",
    "šaru": "szar'u",
    "úš": "us2",
    "kùš": "kusz3",
    "kug": "ku3",
    "bu": "pu",
    "sig₁₀": "si3",
    "zid₂": "zi3",
    "gud": "gu4",
    "bí": "bi2",
    "dug₄": "du11",
    "diŋir": "dingir",
    "àm": "am3",
    "íb": "ib2",
    "íl": "il2",
    "su₁₃": "su3",
    "GIR₃": "GIRI3",
    "ti₇": "te",
}

cdli_inconsistencies = {
    "sumun2": "sun2",
}



In [14]:
def oracc_unicode_words_to_normalized_ascii(tokens):
#     return repr(tokens)
    def proc_token(token):
        if token in unicode_atf_to_ascii_atf_token_replacements:
            return unicode_atf_to_ascii_atf_token_replacements[token]
        for s, t in languages.unicode_atf_to_ascii_atf_replacements:
            token = token.replace(s, t)
        return token
    return languages.underline_sign_names("".join(proc_token(x) for x in tokens).strip())
#     return "".join(tokens)

# for t in test_object_ids:
#     print_pub_paragraphs(get_object_id_pub(t))

In [46]:
def get_object_id_pub(object_id):
    pub = cdli.Publication(object_id)
    
    surface = ""
    column = ""
    text_area = None
    def add_line(number, cuneiform):
        nonlocal surface, column, text_area, pub
        if text_area is None:
            name = surface
            if len(column) > 0:
                if len(name) > 0:
                    name += " " + column
                else:
                    name = column
            text_area = cdli.TextArea(name=name)
            pub.text_areas.append(text_area)
        line = cdli.TextLine(number=number, text=cuneiform)
        text_area.lines.append(line)

    html = load_html_for_object_id(object_id)
    texts = html.find_all("div", class_="text")
    langs = defaultdict(lambda: 0)

    for text in texts:
        surface = ""
        column = ""
        text_area = None
        line_index = 0
        table = text.find("table", class_="transliteration")
        if table is None:
            continue
        text_title = text.find("h1").text
        rows = table.find_all("tr")
        for r in rows:
            cols = r.find_all("td")
            rclasses = r["class"] if r.has_attr("class") else []
            if "h" in rclasses:
                htext = cols[0].text.strip()
                if "surface" in rclasses:
                    surface = htext
                    column = ""
                elif "column" in rclasses:
                    column = htext
                text_area = None
                line_index = 0
#                 print("")
#                 print(object_id, text_title, surface, column)
            else:
                lnums = [x for x in cols if x.has_attr("class") and "lnum" in x["class"]]
                if len(lnums) != 1:
                    continue
                lnum = lnums[0].text.strip() if len(lnums) > 0 else ""                
                tlits = [x for x in cols if x.has_attr("class") and "tlit" in x["class"]]
                ntlits = len(tlits)
                cs = [x for x in cols if x.has_attr("class") and "c" in x["class"]]
                xtrs = [x for x in cols if x.has_attr("class") and "xtr" in x["class"]]
                if ntlits == 1:
                    tlit, lang = tlit_to_normalized_ascii(tlits[0])
                    langs[lang] += 1
                    add_line(lnum, tlit)
                    xtr = ""
                    rowspan = 1
                    if len(xtrs) > 0:
                        if xtrs[0].has_attr("rowspan"):
                            rowspan = int(xtrs[0]["rowspan"])
                        xtr = xtr_to_en(xtrs[0])
                        para = cdli.TextParagraph(line_index, line_index + rowspan)
                        para.languages["en"] = xtr
                        text_area.paragraphs.append(para)
                    line_index += 1
                elif len(cs) > 0 and len(cs) == len(xtrs):
                    for i, c in enumerate(cs):
                        tlit, lang = tlit_to_normalized_ascii(c)
                        langs[lang] += 1
                        add_line(lnum + f".{i}", tlit)
                        xtr = xtr_to_en(xtrs[i])
                        para = cdli.TextParagraph(line_index, line_index + 1)
                        para.languages["en"] = xtr
                        text_area.paragraphs.append(para)
                        line_index += 1
                elif ntlits == 0:
                    pass
                else:
                    raise ValueError("Unsupported format: ntlits=", len(tlits))
#                 print(line_index, lnum, "\t", tlit, "\t", rowspan, "\t", xtr)
                
#                 print("row", r["class"], "with", len(cols), "cols", [(x["class"] if x.has_attr("class") else []) for x in cols])
#         print("")
    langs = sorted([(x, langs[x]) for x in langs.keys()], key=lambda x:-x[1])
#     print(langs)
    pub.language = langs[0][0] if len(langs) > 0 else None
    return pub

def print_pub_paragraphs(pub, tgt_len="en"):
    for a in pub.text_areas:
        print(pub.id, a.name)
        for p in a.paragraphs:
            lines = a.lines[p.start_line_index:p.end_line_index]
            cuneiform = " ".join(x.text for x in lines)
            tgt = p.languages[tgt_len]
            print("   >>>>", cuneiform)
            print("   <<<<", tgt)
        print("")
        
def print_pub_lines(pub, tgt_len="en"):
    for a in pub.text_areas:
        print(pub.id, a.name)
        for line in a.lines:
            print(line.text)
        print("")
        

def xtr_to_en(xtr):
    ptr = xtr.find("p", class_="tr")
    if ptr is None:
        return ""
    cell = ptr.find("span", class_="cell")
    if cell is not None:
        ptr = cell
    return ptr.text.strip()
            
tlit_ignore_classes = set(["marker"])

def is_node_sign(node):
    if isinstance(node, str):
        return node == "."
    return node.name=="sup" or (node.has_attr("class") and "sign" in node["class"])

def tlit_to_normalized_ascii(tlit):
    langs = defaultdict(lambda: 0)
    def node_to_str(node, in_sign):
        if isinstance(node, str):
            return [node]
        children_in_sign = False
        ignore = False
        classes = node["class"] if node.has_attr("class") else []
        for c in classes:
            ignore = ignore or (c in tlit_ignore_classes)
        if ignore:
            return []
        if node.name == "span" and "sign" not in classes:
            for c in classes:
                if c in languages.all_languages:
                    langs[c] += 1
        parts = []
        is_sup = node.name == "sup"
        is_sign = all(is_node_sign(x) for x in node)
        if is_sup:
            parts.append("{")
            if "sux" in node["class"] and node.text == "m":
                parts.append("disz")
                parts.append("}")
                return parts
        if is_sign and not in_sign:
#             parts.append("_")
            children_in_sign = True
        for c in node:
            parts.extend(node_to_str(c, in_sign=in_sign or children_in_sign))
#         if is_sign and not in_sign:
#             parts.append("_")
        if is_sup:
            parts.append("}")
        return parts
    tokens = node_to_str(tlit, in_sign=False)
    langs = sorted([(x, langs[x]) for x in langs.keys()], key=lambda x:-x[1])
#     print(langs)
    lang = langs[0][0] if len(langs) > 0 else "?"
    return oracc_unicode_words_to_normalized_ascii(tokens), lang

get_object_id_pub(all_translated_ids[12900])

Publication('Q004592', 'akk', [TextArea('', [TextLine('', '', {})], []), TextArea('Obverse', [TextLine('11', '{disz}asz-szur-_pap_-_a gar_ {d}_bad szid_ asz-szur', {}), TextLine('22', 'A tukul-ti-{d}_masz gar_ {d}_bad szid_ asz-szur', {}), TextLine('33', '_a 10_-_erim-t_ÁH _gar_ {d}_bad szid_ asz-szur', {}), TextLine('44', '5 na-me#-ru isz-tu# na-me-ri', {}), TextLine('55', 'sza KÁ {d}kal-kal-la{?}# a-di na-me-ri#', {}), TextLine('66', 'sza KÁ-_gal-me_ a-na _kisal_{?} {d{?}}nun{?}-nam{?}-nir{?}#', {}), TextLine('77', '[---]  (traces)', {}), TextLine('88', '[---]  (traces)', {})], [TextParagraph(0, 3, {'en': 'Ashurnasirpal, appointee of the god Enlil, vice-regent of Aššur, son of Tukultī-Ninurta (II), appointee of the god Enlil, vice-regent of Aššur, son of Adad-nārārī (II), appointee of the god Enlil, vice-regent of Aššur:'}), TextParagraph(3, 8, {'en': 'The five towers from the towers of the Kalkal Gate to the towers of the gates [(which one uses) when entering] the forecourt of the g

In [52]:
oracc_langs = defaultdict(lambda: 0)

random_ids = list(all_translated_ids)
random.shuffle(random_ids)

oracc_pubs = dict()

for pid in tqdm(random_ids[:]):
    p = get_object_id_pub(pid)
    oracc_pubs[pid] = p
    if p.language is not None:
        oracc_langs[p.language] += 1
    
oracc_langs = sorted([(x, oracc_langs[x]) for x in oracc_langs.keys()], key=lambda x:-x[1])
oracc_langs

  0%|          | 0/16102 [00:00<?, ?it/s]

[('akk', 9755),
 ('sux', 2535),
 ('?', 1812),
 ('xur', 335),
 ('peo', 95),
 ('elx', 8),
 ('arc', 8),
 ('grc', 1)]

In [16]:
def load_project_corpus(project_id, oracc_dir):
    project_zip_path = oracc.get_project_zip_path(project_id, oracc_dir)
    result = dict()
    project_zip = zipfile.ZipFile(project_zip_path, "r")
#     for f in project_zip.filelist:
#         print(f)
    corpi = [x for x in project_zip.filelist if "/corpusjson/" in x.filename and x.filename.endswith(".json")]
    if len(corpi) == 0:
        return result
    for corpus_file_info in corpi:
#         print(corpus_file_info)
        corpus = None
        with project_zip.open(corpus_file_info, "r") as f:
#             corpus = json.load(f)
            try:
                corpus = json.loads(str(f.read(), "utf8"))
            except json.JSONDecodeError:
                print("JSON Error", corpus_file_info.filename)
                corpus = None
#         print(corpus.keys())
        if corpus is not None:
            result[corpus_file_info.filename] = corpus
    return result




test_project_id, test_object_id = [x for x in all_corpus_object_ids if x[1] == "Q007064"][0]
test_corpus = load_project_corpus(test_project_id, out_dir)
test_corpus.keys()

JSON Error ecut/corpusjson/Q007826.json
JSON Error ecut/corpusjson/Q000000.json
JSON Error ecut/corpusjson/Q007087.json
JSON Error ecut/corpusjson/Q008031.json
JSON Error ecut/corpusjson/Q008091.json
JSON Error ecut/corpusjson/Q008092.json
JSON Error ecut/corpusjson/Q008089.json
JSON Error ecut/corpusjson/Q008226.json
JSON Error ecut/corpusjson/Q007089.json
JSON Error ecut/corpusjson/Q008217.json
JSON Error ecut/corpusjson/Q008040.json


dict_keys(['ecut/corpusjson/Q008241.json', 'ecut/corpusjson/Q007858.json', 'ecut/corpusjson/Q007991.json', 'ecut/corpusjson/Q006887.json', 'ecut/corpusjson/Q006924.json', 'ecut/corpusjson/X003142.json', 'ecut/corpusjson/X003069.json', 'ecut/corpusjson/Q008023.json', 'ecut/corpusjson/Q008266.json', 'ecut/corpusjson/Q008304.json', 'ecut/corpusjson/X003047.json', 'ecut/corpusjson/Q007816.json', 'ecut/corpusjson/Q006976.json', 'ecut/corpusjson/Q007788.json', 'ecut/corpusjson/Q007831.json', 'ecut/corpusjson/Q008150.json', 'ecut/corpusjson/Q008924.json', 'ecut/corpusjson/Q007026.json', 'ecut/corpusjson/Q007099.json', 'ecut/corpusjson/Q008072.json', 'ecut/corpusjson/Q008093.json', 'ecut/corpusjson/Q006968.json', 'ecut/corpusjson/Q008923.json', 'ecut/corpusjson/Q008224.json', 'ecut/corpusjson/Q007049.json', 'ecut/corpusjson/Q007058.json', 'ecut/corpusjson/Q008015.json', 'ecut/corpusjson/X003141.json', 'ecut/corpusjson/Q008087.json', 'ecut/corpusjson/Q007986.json', 'ecut/corpusjson/Q007079.json

In [17]:
test_corpus['ecut/corpusjson/Q007064.json']

{'type': 'cdl',
 'project': 'ecut',
 'source': 'http://oracc.org/ecut',
 'license': 'This data is released under the CC0 license',
 'license-url': 'https://creativecommons.org/publicdomain/zero/1.0/',
 'more-info': 'http://oracc.org/doc/opendata/',
 'UTC-timestamp': '2022-02-01T19:42:34',
 'textid': 'Q007064',
 'cdl': [{'node': 'c',
   'type': 'text',
   'id': 'Q007064.U0',
   'cdl': [{'node': 'd', 'type': 'object', 'ref': ''},
    {'node': 'd', 'type': 'surface', 'ref': ''},
    {'node': 'c',
     'type': 'discourse',
     'subtype': 'body',
     'id': 'Q007064.U1',
     'cdl': [{'node': 'c',
       'type': 'sentence',
       'implicit': 'yes',
       'id': 'Q007064.U2',
       'label': '1 - 15',
       'cdl': [{'node': 'd',
         'type': 'line-start',
         'ref': 'Q007064.1',
         'n': '1',
         'label': '1'},
        {'node': 'l',
         'frag': '{d}hal-di-ni-ni',
         'id': 'Q007064.l05f3c',
         'ref': 'Q007064.1.1',
         'inst': 'Haldi=i=NI=ni[Urartia

http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/inlinetutorial/index.html

In [18]:
languages.all_languages

{'akk': 'Akkadian',
 'sux': 'Sumerian',
 'qpn': 'Proper Nouns',
 'arc': 'Aramaic',
 'elx': 'Elamite',
 'grc': 'Greek',
 'peo': 'Old Persian',
 'ug': 'Ugaritic',
 'xur': 'Urartian',
 'akkts': 'Akkadian',
 'suxts': 'Sumerian',
 'qpnts': 'Proper Nouns',
 'arcts': 'Aramaic',
 'elxts': 'Elamite',
 'grcts': 'Greek',
 'peots': 'Old Persian',
 'ugts': 'Ugaritic',
 'de': 'German',
 'en': 'English',
 'es': 'Spanish',
 'fr': 'French',
 'it': 'Italian',
 'ml_de': 'German',
 'ml_en': 'English',
 'ml_es': 'Spanish',
 'ml_fr': 'French',
 'ml_it': 'Italian'}

## Compare with CDLI Transliterations

In [19]:
cdli_index = {x.id: x for x in cdli_pubs}
len(cdli_index)

134677

In [20]:
cdli_pub_ids = set(cdli_index.keys())
len(cdli_pub_ids)

134677

In [21]:
common_pub_ids = sorted(list(cdli_pub_ids.intersection(all_translated_ids)))
len(common_pub_ids), "common"

(775, 'common')

In [22]:
for pid in common_pub_ids[531:532]:
    print("="*40)
    cdli_pub = cdli_index[pid]
    print_pub_lines(cdli_pub)
    print("-"*40)
    oracc_pub = get_object_id_pub(pid)
    print_pub_lines(oracc_pub)

P271197 tablet

P271197 obverse
a-na _lugal_-ri _en_-ia u3 _{d}utu_-ia
um-ma la-ab-a-ya _ARAD2_-ka
u3 ip-ru sza ka-ba2-szi-ka
a-na _giri3-mesz lugal_-ri _en_-ia
u3 _{d}utu_-ia 7(disz)-szu 7(disz)-ta-a-an
am-qut isz-te-me a-wa-te-_mesz_
sza _lugal_-ru isz-tap-ra-an-ni
u3 mi-ia-ti a-na-ku u3
ya-ah-li-qu2 _lugal_-ru _kur{ki}_-szu
_ugu_-ia a-mur a-na-ku _ARAD2_ ki-ti
_lugal_-ri u3 la-a ar-na-ku
u3 la-a ha-t,a2-ku u3
la-a a-kal-li _gu-un-hi-a_-ia
u3 la-a a-kal-li
e-ri-isz-ti7 _lu2_ ra-bi-s,i2-ia
a-nu-ma yi-ka-lu ka-ar-s,i2-ia
ha-ba-lu-ma u3 la-a
yu-sa3-an-ni-qu2 _lugal_-ru _en_-ia
ar-ni-ia sza-ni-tam
i15-ba-asz-szi ar-ni-ia
i-nu-ma ir-ru-ba-ti
a-na _iri_ gaz-ri

P271197 reverse
u3 aq-ta-bu
pu-uh-ri-isz-mi
yi-il-te-qu2 _lugal_-ru
mim-me2-ia u3 mim-me
{disz}mil-ki-li3 a-yi2-ka-am
i15-de ep-sze-et-szu sza
{disz}mil-ki-li _ugu_-ia
sza-ni-tam
a-na {disz}dumu-mu-ia sza-par2 _lugal_-ru
u2-ul i15-de i-nu-ma
{disz}dumu-mu-ia it-ti
_lu2-mesz sa-gaz_
it-ta-na-la-ku
u3 al-lu-u2 na-ad-na-ti7-szu
i-na _s

## Load All Translations into Memory

In [23]:
print(len(all_translated_ids))
all_translated_ids[0]

16102


'P010092'

## Calculate How Many are Untranslated

In [24]:
print(len(cdli_pub_ids), "objects in CDLI")
print(len(all_corpus_object_ids), "objects in ORACC")
print(len(common_pub_ids), "objects common between ORACC and CDLI")
print(len(all_object_html_paths), "translated objects in ORACC")
print(len(all_corpus_object_ids) - len(all_object_html_paths), "untranslated objects in ORACC")

134677 objects in CDLI
25522 objects in ORACC
775 objects common between ORACC and CDLI
16102 translated objects in ORACC
9420 untranslated objects in ORACC
