# Make Translations JSONL

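Collects the human-made English translations from the CDLI and ORACC corpora and writes them, paired with their transliterations, to JSONL files in the `../data` folder.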

Reference for the ATF inline notation used in the transliterations: http://oracc.museum.upenn.edu/doc/help/editinginatf/primer/inlinetutorial/index.html

In [1]:
import os, sys, math
import requests
import json
import time
import random
import shutil
import zipfile
from collections import defaultdict
from glob import glob
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import importlib

In [2]:
import languages
import cdli
import oracc

In [3]:
# Directory that holds the downloaded ORACC project zips.
# The default is the original author's external disk; set the ORACC_ZIPS_DIR
# environment variable to point the notebook at a different location.
oracc_dir = os.path.abspath(os.environ.get("ORACC_ZIPS_DIR", "/Volumes/FrankDisk/oracc_zips"))
os.makedirs(oracc_dir, exist_ok=True)
oracc_dir

'/Volumes/FrankDisk/oracc_zips'

In [4]:
# Filled in later by the "Load ORACC Translations into Memory" cell.
# Kept in its own early cell (original note: "so I don't accidentally run it").
oracc_translated_pubs = {}

## Download ORACC Projects

In [5]:
# Gather the ORACC project zips under oracc_dir (presumably downloading any that
# are missing -- confirm in oracc.py). The result is a list of local zip paths.
project_zips = oracc.get_all_project_zips(oracc_dir, verbose=False, tqdm=tqdm)

  0%|          | 0/140 [00:00<?, ?it/s]

In [6]:
# Sanity check: how many project zips there are, and what the first few paths look like.
print(len(project_zips))
project_zips[:3]

119


['/Volumes/FrankDisk/oracc_zips/adsd.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart1.zip',
 '/Volumes/FrankDisk/oracc_zips/adsd-adart2.zip']

In [7]:
# Collect the corpus object ids from every project zip.
# The [:] slice passes a full copy; presumably kept so the range can be narrowed while debugging.
all_corpus_object_ids = oracc.get_all_corpus_object_ids(project_zips[:], tqdm=tqdm)

  0%|          | 0/119 [00:00<?, ?it/s]

In [8]:
# Number of corpus object ids collected above.
len(all_corpus_object_ids)

26374

## Load ORACC Transliterations

In [9]:
# Returns two things:
#   oracc_pub_ids_and_langs    -- records indexed later as (pub id, language)
#   transliterated_oracc_corpi -- mapping of pub id -> entry with a "lang" field
# NOTE(review): the run below logs JSONDecodeError messages; presumably some
# project files are not valid JSON -- confirm they are safe to ignore.
oracc_pub_ids_and_langs, transliterated_oracc_corpi = oracc.load_all_project_pub_ids(oracc_dir, tqdm=tqdm)

  0%|          | 0/140 [00:00<?, ?it/s]

Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fcf20dc0780>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fccfeaf5380>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fcd1b6ad5c0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fcd1b051b80>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fcf38049a40>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fcd1b039c00>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <t

In [10]:
# Only the ids are needed for the set operations below
# (assumes transliterated_oracc_corpi is a plain dict, whose iteration yields its keys).
transliterated_oracc_pub_ids = set(transliterated_oracc_corpi)

In [11]:
# Compare the number of ORACC (id, language) records with the number of ids
# that actually have a transliteration.
print(len(oracc_pub_ids_and_langs), "oracc pubs")
print(len(transliterated_oracc_pub_ids), "oracc transliterated pubs")

133312 oracc pubs
22076 oracc transliterated pubs


In [12]:
# Peek at the five highest ids in sort order.
sorted(transliterated_oracc_pub_ids)[-5:]

['X900957', 'X900975', 'X900978', 'X987003', 'X999985']

## Download CDLI for Reference

In [13]:
# Download and parse the CDLI ATF dump (see the log lines below).
# The result is iterated later as a collection of publications with .id and .language.
cdli_pubs = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


## Download ORACC Translations

In [14]:
# Ids that the ORACC projects report as translated. Each entry is unpacked later
# as a (pid, oid) pair -- presumably (project id, object id); confirm in oracc.py.
reported_translated_ids = oracc.get_all_translated_object_ids(project_zips, tqdm)
print(len(reported_translated_ids), "reported translations")

  0%|          | 0/119 [00:00<?, ?it/s]

16777 reported translations


In [15]:
# Download the translation for every reported (pid, oid) pair into oracc_dir.
# The returned path is only kept for the optional debug print below.
for pid, oid in tqdm(reported_translated_ids[:]):
    tpath = oracc.download_object_translation(oracc_dir, pid, oid)
#     print(tpath)

  0%|          | 0/16777 [00:00<?, ?it/s]

## Test Normalized ORACC Transliterations

In [16]:
# Object ids used to spot-check the normalised ORACC transliterations.
# The trailing comments note what each tablet exercises.
# Each id appears once ("P503256" used to be listed twice).
test_object_ids = [
    "P250815", # szag4 to sza3, kud to ku5
    "P271132", # geme2 to dam
    "P332924", # gab2 to kab, zid to zi, tum12 to tu
    "P271187", # tu4 to tum, ir3 to ARAD2
    "P271030", # pu to bu
    "P228726", # sag10 to saga, gurum to gur2, ah3 to had2
    "P247541", # giggi to kukku5
    "P503256", # Links in cuneiform
    "P237767",
    "P237730",

    "P010627", # Notes: o ii 66

    "Q000041",
    "Q000057",
]

In [17]:
# Development aid: re-import the local oracc module so edits to oracc.py
# take effect without restarting the kernel.
importlib.reload(oracc)

<module 'oracc' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/oracc.py'>

In [18]:
# Print the ORACC transliteration of the first test object only ([:1]);
# widen the slice to inspect more of the list.
for t in test_object_ids[:1]:
    cdli.print_pub_lines(oracc.get_object_id_pub(t, oracc_dir))

P250815 


P250815 Obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<{disz}>>ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
{iti}szu-numun-a
{gesz}sza3-gesznimbar nu-ba-an-ku5

P250815 Reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti



## Compare with CDLI Transliterations

In [23]:
# Index the CDLI publications by id so they can be looked up and compared with ORACC.
cdli_index = {x.id: x for x in cdli_pubs}
cdli_pub_ids = set(cdli_index.keys())
len(cdli_pub_ids), "cdli pubs"

(135201, 'cdli pubs')

In [24]:
# Ids for which get_all_object_html_paths finds an entry under oracc_dir;
# these are treated as the translated ORACC publications from here on.
all_translated_ids = set(oracc.get_all_object_html_paths(oracc_dir).keys())
len(all_translated_ids), "oracc translated pubs"

(16442, 'oracc translated pubs')

In [25]:
# Publications present in CDLI that also have an ORACC translation, in id order.
common_pub_ids = sorted(cdli_pub_ids & all_translated_ids)
len(common_pub_ids), "common"

(775, 'common')

In [26]:
# Compare one publication in both corpora: the CDLI version is printed above
# the dashed rule and the ORACC version below it.
# NOTE(review): index 281 selects P250815 in the run shown; it depends on the
# sorted contents of common_pub_ids and will drift when the corpora change.
for pid in common_pub_ids[281:282]:
    print("="*40)
    cdli_pub = cdli_index[pid]
    cdli.print_pub_lines(cdli_pub)
    print("-"*40)
    oracc_pub = oracc.get_object_id_pub(pid, oracc_dir)
    cdli.print_pub_lines(oracc_pub)

P250815 tablet

P250815 obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<disz>> ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
iti szu-numun-a
{gesz}sza3-geszimmar nu-ba-an-ku5

P250815 reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti

----------------------------------------
P250815 


P250815 Obverse
5(asz) gur zu2-lum
nig2-gar {gesz}kiri6
{gesz}kiri6 ib-ni-{d}suen
<<{disz}>>ki ib-ni-{d}suen
{disz}ar-bi-tu-ra-am
szu ba-an-ti
{iti}szu-numun-a
{gesz}sza3-gesznimbar nu-ba-an-ku5

P250815 Reverse
{gesz}kid-da gesznimbar
szu# ba-an-ti



## Load ORACC Translations into Memory

In [27]:
# Tally of source languages among the translated ORACC publications.
oracc_lang_counts = defaultdict(int)

# Sort first, then shuffle with a fixed seed, so the load order is the same on
# every run. The order matters: it becomes the insertion order of
# oracc_translated_pubs, and output_translations keeps only the first pair it
# sees for a given source text.
random_ids = sorted(all_translated_ids)
random.Random(0).shuffle(random_ids)

for pid in tqdm(random_ids):
    p = oracc.get_object_id_pub(pid, oracc_dir)
    oracc_translated_pubs[pid] = p
    if p.language is not None:
        oracc_lang_counts[p.language] += 1

# (language, count) pairs, most frequent first. The sort is stable, so ties keep
# the order in which the languages were first encountered.
oracc_langs = sorted(oracc_lang_counts.items(), key=lambda item: -item[1])
oracc_langs

  0%|          | 0/16442 [00:00<?, ?it/s]

[('akk', 10079),
 ('sux', 2544),
 ('?', 1818),
 ('xur', 335),
 ('peo', 95),
 ('elx', 8),
 ('arc', 8),
 ('grc', 1)]

In [29]:
# Should match the number of translated ids loaded in the previous cell.
len(oracc_translated_pubs), "oracc translated pubs"

(16442, 'oracc translated pubs')

## Show CDLI and ORACC Corpus Stats

In [30]:
def cdli_text_area_is_translated(pub, text_area, tgt_lang):
    """Return True if at least one line of ``text_area`` has a ``tgt_lang`` entry.

    Args:
        pub: The publication that owns ``text_area``. Not used; kept so the
            signature stays unchanged.
        text_area: Object with a ``lines`` iterable; each line exposes a
            ``languages`` container that supports ``in``.
        tgt_lang: Language code to look for, e.g. ``"en"``.
    """
    return any(tgt_lang in line.languages for line in text_area.lines)

def cdli_pub_is_translated(pub, tgt_lang):
    """Return True if any text area of ``pub`` has a line translated into ``tgt_lang``.

    The result depends only on the per-area check, not on the truthiness of the
    text-area objects themselves.
    """
    return any(cdli_text_area_is_translated(pub, area, tgt_lang) for area in pub.text_areas)

# CDLI publications, keyed by id, that carry at least one English ("en") line.
cdli_translated_pubs = {x.id: x for x in cdli_pubs if cdli_pub_is_translated(x, "en")}
len(cdli_translated_pubs), "cdli translated pubs"

(5369, 'cdli translated pubs')

In [40]:

def show_language_stats(src_lang, tgt_lang="en"):
    """Print coverage counts for one source language across CDLI and ORACC.

    Reads the notebook-level collections ``cdli_pubs``, ``cdli_translated_pubs``,
    ``transliterated_oracc_corpi``, ``oracc_translated_pubs`` and
    ``oracc_pub_ids_and_langs``.

    Args:
        src_lang: Language code to report on, e.g. ``"sux"`` or ``"akk"``.
        tgt_lang: Not used by the body; kept so the signature stays unchanged.

    Returns:
        None. The report is written to standard output.
    """
    # Ids with a transliteration in this language, per corpus.
    transliterated_cdli_ids = {pub.id for pub in cdli_pubs if pub.language == src_lang}
    transliterated_oracc_ids = {
        pub_id
        for pub_id, entry in transliterated_oracc_corpi.items()
        if entry["lang"] == src_lang
    }
    transliterated_total = transliterated_cdli_ids | transliterated_oracc_ids

    # Ids with a translation in this language, per corpus.
    translated_cdli_ids = {
        pub.id for pub in cdli_translated_pubs.values() if pub.language == src_lang
    }
    translated_oracc_ids = {
        pub_id
        for pub_id, pub in oracc_translated_pubs.items()
        if pub.language == src_lang
    }
    translated_total = translated_cdli_ids | translated_oracc_ids

    # Transliterated but not translated in either corpus.
    cdli_needs_translation_ids = transliterated_cdli_ids - translated_total
    oracc_needs_translation_ids = transliterated_oracc_ids - translated_total
    needs_translation_total = cdli_needs_translation_ids | oracc_needs_translation_ids

    # Every ORACC record for this language, whether or not it is transliterated.
    oracc_ids = {record[0] for record in oracc_pub_ids_and_langs if record[1] == src_lang}
    all_ids = transliterated_cdli_ids | oracc_ids

    print("="*42)
    print(f"                   lang: {src_lang}")
    print()
    print(f"    transliterated cdli: {len(transliterated_cdli_ids)}")
    print(f"        translated cdli: {len(translated_cdli_ids)}")
    print(f" need translations cdli: {len(cdli_needs_translation_ids)}")

    print()
    print(f"                  oracc: {len(oracc_ids)}")
    print(f"   transliterated oracc: {len(transliterated_oracc_ids)}")
    print(f"       translated oracc: {len(translated_oracc_ids)}")
    print(f"need translations oracc: {len(oracc_needs_translation_ids)}")
    print("-"*42)
    print(f"                  total: {len(all_ids)}")
    print(f"   transliterated total: {len(transliterated_total)}")
    print(f"       translated total: {len(translated_total)}")
    print(f"need translations total: {len(needs_translation_total)}")

# Report coverage for the two source languages that are exported below.
show_language_stats(src_lang="sux")
show_language_stats(src_lang="akk")

                   lang: sux

    transliterated cdli: 99819
        translated cdli: 4153
 need translations cdli: 95566

                  oracc: 104441
   transliterated oracc: 5249
       translated oracc: 2544
need translations oracc: 4069
------------------------------------------
                  total: 116630
   transliterated total: 104582
       translated total: 6645
need translations total: 99326
                   lang: akk

    transliterated cdli: 21953
        translated cdli: 976
 need translations cdli: 20741

                  oracc: 27417
   transliterated oracc: 16827
       translated oracc: 10079
need translations oracc: 6816
------------------------------------------
                  total: 45694
   transliterated total: 36211
       translated total: 11052
need translations total: 25226


## Output Translations

In [41]:
# Folder that receives the generated translations_*.jsonl files,
# relative to the notebook's working directory.
translations_out_dir = "../data"
os.makedirs(translations_out_dir, exist_ok=True)

In [42]:
def output_translations(corpus_pubs, src_lang, encoding="ascii", tgt_lang="en"):
    """Write de-duplicated ``src_lang`` -> ``tgt_lang`` paragraph pairs to a JSONL file.

    Args:
        corpus_pubs: Sequence of ``(corpus_name, pubs)`` pairs, where ``pubs``
            maps publication id to publication. Corpora are scanned in order,
            and the first pair seen for a given source text is the one kept.
        src_lang: Source language code; only publications whose ``language``
            equals it are used.
        encoding: Not used by the body; kept so the signature stays unchanged.
        tgt_lang: Target language code looked up in each paragraph.

    Returns:
        The number of pairs written to
        ``{translations_out_dir}/translations_{src_lang}_to_{tgt_lang}.jsonl``.
        Lines are sorted and the file has no trailing newline.
    """
    seen_sources = set()
    json_lines = []

    for corpus, pub_index in corpus_pubs:
        selected = [pub for pub in pub_index.values() if pub.language == src_lang]
        print(f"{corpus} {src_lang} with {len(selected)} translated publications")
        longest_line_len = 0
        for pub in tqdm(selected):
            for area in pub.text_areas:
                # Only the cdli corpus has its paragraphs built here from the lines.
                if corpus == "cdli" and len(area.lines) > 0:
                    area.lines_to_paragraphs(src_lang, tgt_lang)
                for para in area.paragraphs:
                    if tgt_lang not in para.languages:
                        continue

                    # Source side: join the paragraph's lines, then normalise.
                    span = area.lines[para.start_line_index:para.end_line_index]
                    src = " ".join(line.text for line in span)
                    src = languages.remove_blanks(src)
                    src = languages.underline_sign_names(src)
                    src = languages.dashes_to_dots(src)
                    src = languages.remove_extraneous_space(src)

                    # Target side: drop brackets/parentheses, flatten to one line.
                    tgt = para.languages[tgt_lang]
                    for bracket in ("[", "]", "(", ")"):
                        tgt = tgt.replace(bracket, "")
                    for separator in ("\n", "\t"):
                        tgt = tgt.replace(separator, " ")
                    tgt = languages.remove_extraneous_space(tgt)

                    # Skip empty sources, rejected targets and repeated sources.
                    if len(src) == 0 or not languages.target_ok(tgt) or src in seen_sources:
                        continue

                    longest_line_len = max(len(src) + len(tgt), longest_line_len)
                    json_lines.append(json.dumps({src_lang:src,tgt_lang:tgt}))
                    seen_sources.add(src)

        print(f"{corpus} {src_lang} longest line length:", longest_line_len)

    path = f"{translations_out_dir}/translations_{src_lang}_to_{tgt_lang}.jsonl"

    with open(path, "wt") as f:
        # Newline-separated, with no newline after the last record.
        f.write("\n".join(sorted(json_lines)))

    return len(json_lines)
    
# (corpus name, {pub id: publication}) pairs, in the order they are scanned.
# output_translations keeps the first pair seen for a given source text, so a
# cdli entry wins over an oracc entry with the same source.
corpi = [("cdli", cdli_translated_pubs), ("oracc", oracc_translated_pubs)]
    
# Write one JSONL file per source language under translations_out_dir.
# Only the second call's return value is shown as the cell result.
output_translations(corpi, "akk")
output_translations(corpi, "sux")


cdli akk with 976 translated publications


  0%|          | 0/976 [00:00<?, ?it/s]

cdli akk longest line length: 508
oracc akk with 10079 translated publications


  0%|          | 0/10079 [00:00<?, ?it/s]

oracc akk longest line length: 5966
cdli sux with 4153 translated publications


  0%|          | 0/4153 [00:00<?, ?it/s]

cdli sux longest line length: 788
oracc sux with 2544 translated publications


  0%|          | 0/2544 [00:00<?, ?it/s]

oracc sux longest line length: 4185


59700