# Make Web Site

In [1]:
import sys, os, io, datetime
import json
import random
import requests
import zipfile
import glob
import re
import shutil
import pandas as pd
from slugify import slugify
from tqdm.notebook import tqdm
from html import escape
from collections import defaultdict

In [2]:
import corpi
import cdli
import oracc
import languages

In [3]:
import importlib

In [4]:
pd.set_option("display.max_columns", None)

In [5]:
supported_langs = set(["akk", "sux"])

In [6]:
wwwroot = os.path.abspath("../dist")
os.makedirs(wwwroot, exist_ok=True)
wwwroot

'/Users/fak/Dropbox/Projects/CuneiformTranslators/dist'

## CDLI

In [7]:
importlib.reload(cdli)
importlib.reload(oracc)
importlib.reload(corpi)

<module 'corpi' from '/Users/fak/Dropbox/Projects/CuneiformTranslators/tools/corpi.py'>

In [8]:
cdli_corpus = corpi.CDLI()

## ORACC

In [9]:
oracc_dir="/Volumes/FrankDisk/oracc_zips"
oracc_corpus = corpi.ORACC(oracc_dir=oracc_dir, tqdm=tqdm)

## Merge

In [10]:
all_pubs = dict()
for p in oracc_corpus.oracc_pubs.values():
    if p.language in supported_langs:
        p.corpus = "oracc"
        all_pubs[p.id] = p
for p in cdli_corpus.cdli_pubs.values():
    if p.language in supported_langs:
        p.corpus = "cdli"
        all_pubs[p.id] = p
print(f"Found {len(all_pubs)} unique publications")

Found 134796 unique publications


In [11]:
# Group publications by the first 4 characters of pub.id to create a directory structure
all_pubs_by_dir = defaultdict(list)
for p in all_pubs.values():
    all_pubs_by_dir[p.id[:4].lower()].append(p)
print(f"Found {len(all_pubs_by_dir)} directories")


Found 380 directories


## Get the ML Translations

In [12]:
translations_zip_path = "../data/ml_translations.zip"
with open(translations_zip_path, "rb") as f:
    with zipfile.ZipFile(f) as zf:
        json_name = [n for n in zf.namelist() if n.endswith(".json")][0]
        translations = json.loads(str(zf.read(json_name), "utf-8"))
for k in translations.keys():
    if "_to_" in k:
        print(k, len(translations[k]))

akk_to_en 301778
sux_to_en 553296


In [13]:
num_transliterations = len(all_pubs)
num_translations = len([x for x in all_pubs.values() if x.has_translations()])
print("num_transliterations:", num_transliterations)
print("    num_translations:", num_translations)

num_transliterations: 134796
    num_translations: 14632


## Find Publications we have translations for

In [39]:
tgt_lang = "en"

translated_pubs = []

for pub in tqdm(list(all_pubs.values())):
    has_new_translations = False
    has_ml_translations = True
    has_lines = False
    st_key = f"{pub.language}_to_{tgt_lang}"
    if st_key not in translations:
        continue
    st_translations = translations[st_key]
    for a in pub.text_areas:
        if pub.corpus == "cdli" and len(a.lines) > 0 and len(a.paragraphs) == 0:
            a.lines_to_paragraphs(pub.language, tgt_lang)
        paras = a.paragraphs_to_lines()
        for i, plines in enumerate(paras):
            p = a.paragraphs[i]
            p.languages["ml_"+tgt_lang] = ""
            head = ""
            for si,ei,s in plines:
                if len(s) > 1:
                    has_lines = True
                    has_new_translations = has_new_translations or (tgt_lang not in p.languages)
                    if s in st_translations:
                        p.languages["ml_"+tgt_lang] += head + st_translations[s]
                        head = " "
                    else:
                        raise Exception(f"pub {pub.id} has no translation ({st_key}) for: {s}")
                        has_ml_translations = False
    pub.has_new_translations = has_new_translations
    pub.has_ml_translations = has_ml_translations
    if has_ml_translations and has_lines:
        translated_pubs.append(pub)
    
newly_translated_pubs = [x for x in translated_pubs if x.has_new_translations]
print(len(translated_pubs), "ml translated_pubs")
print(len(newly_translated_pubs), "newly_translated_pubs")

  0%|          | 0/134796 [00:00<?, ?it/s]

Exception: pub P334661 has no translation (akk_to_en) for: [ina _szu-2_] LÚ{v}-A—szip-ri-ia ina pa-an _lugal en_-ia [a]-sa-ap-ra-ász-szu _lugal#_ be-lí lisz-al-szú

In [15]:
len([x for x in translated_pubs if x.language == "sux"]), "sux"

(99420, 'sux')

In [16]:
newly_translated_pubs[0]

Publication('P519714', 'akk', [TextArea('object stele', [], []), TextArea('surface a', [], []), TextArea("column 1'", [TextLine("1'.", '[...] ($ blank space $)', {}), TextLine("2'.", '[...] x', {}), TextLine("3'.", '[...] x', {}), TextLine("4'.", '[...] x', {}), TextLine("5'.", '[...] x', {})], [TextParagraph(0, 5, {'ml_en': 'No translation possible'})]), TextArea("column 2'", [TextLine("1'.", 'x [...]', {}), TextLine("2'.", 'x [...]', {}), TextLine("3'.", 'i-[...]', {}), TextLine("4'.", '{d}[...]', {}), TextLine("5'.", 'x [...]', {}), TextLine("6'.", '[...]', {})], [TextParagraph(0, 6, {'ml_en': '... ... ... ... the god ... ... ... ...'})])])

## Data Dimensions

In [17]:
browser_dimensions = [
#     ("new", lambda p: ["new" if p.has_new_translations else "old"]),
#     ("language", lambda p: [p.language]),
    ("object_type", lambda p: [cdli.get_object_type(p.object_type)]),
    ("genre", lambda p: cdli.get_genres(p.genre)),
    ("period", lambda p: [cdli.period_slug_from_period[x] for x in cdli.get_periods(p.period)]),
]

## HTML Components

## HTML Pages

In [18]:
def get_file_path(site_path):
    return f"{wwwroot}{site_path}"

def get_page_file_path(site_path):
    return f"{get_file_path(site_path)}.html"
    

In [19]:
def header(title, f):
    f.write(f"<DOCTYPE html>\n")
    f.write(f"<html>\n<head>\n")
    f.write(f"<meta charset='utf-8'>\n")
    f.write(f"<title>{escape(title)}</title>\n")
    f.write(f"<meta name='viewport' content='width=device-width, initial-scale=1'>\n")
    f.write(f"<link rel='stylesheet' href='/main.css'>\n")
    f.write(f"</head>\n")
    f.write(f"<body>\n<div class='content'><h1 id='page-title'>{escape(title)}</h1>\n")
    
def footer(f, script=None):
    f.write(f"</div>\n")
    f.write(f"<footer>\n")
    f.write(f"<p class='otitle'>ML Translations by <a href=\"https://huggingface.co/praeclarum/cuneiform\">praeclarum/cuneiform</a></p>\n")
    f.write(f"</footer>\n")
    f.write(f"<script src='/main.js'></script>\n")
    if script is not None:
        f.write(f"<script>{script}</script>\n")
    f.write(f"</body>\n</html>")

In [20]:
def start_page(path, title):
    file_path = get_page_file_path(path)
    file_dir = os.path.dirname(file_path)
    os.makedirs(file_dir, exist_ok=True)
    f = open(file_path, "wt")
#     print(f"Writing {path} at {file_path}")
    header(title, f)
    return f

def end_page(f):
    footer(f)

### Publication Index Pages

In [21]:
data_links = [
    ("Oracc - Open Richly Annotated Cuneiform Corpus", "http://oracc.museum.upenn.edu"),
    ("CDLI - Cuneiform Digital Library Initiative", "https://cdli.ucla.edu"),
    ("ETCSL - Electronic Text Corpus of Sumerian Literature", "https://etcsl.orinst.ox.ac.uk"),
]

In [22]:
language_sort = {
    "akk": 0,
#     "akkts": 1,
#     "elx": 2,
#     "elxts": 3,
    "sux": 4,
#     "suxts": 5,
    "ml_en": 100,
    "en": 1000,
#     "fr": 1001,
}

bad_translators = {"uncertain", "NaN", "no translation", "", "check"}

In [23]:
def paragraph_lines_to_html(a, paragraphs, text_lines):
    html = []
    for pi, plines in enumerate(paragraphs):
        p = a.paragraphs[pi]
        tag = p.tag
        html.append(f"<{tag}>")
        for line_start_index, line_end_index, text in plines:
            for i in range(line_start_index, line_end_index):
                if i >= len(text_lines):
                    continue
                html.append(f"<span class='line line-{i}'>{escape(text_lines[i])}</span>\n")
        html.append(f"</{tag}>\n")
    return "".join(html)

def paragraphs_to_html(a, paragraphs, lang):
    html = []
    for pi, plines in enumerate(paragraphs):
        p = a.paragraphs[pi]
        tag = p.tag
        html.append(f"<{tag}>")
        line_index = plines[0][0] if len(plines) > 0 else 0
        html.append(f"<span class='line line-{line_index}'>{escape(p.languages[lang])}</span>\n")
        html.append(f"</{tag}>\n")
    return "".join(html)

def title_case(str):
    if len(str) == 0:
        return str
    if len(str) == 1:
        return str.upper()
    return str[0].upper() + str[1:]

def output_pub(p, f):
    f.write(f"<h1 class='otitle'>{p.id}: {' and '.join(cdli.get_genres(p.genre))} {cdli.get_object_type(p.object_type)}</h1>\n")
    src_a = ""
    if p.corpus == "cdli":
        src_a = f"<a href='https://cdli.ucla.edu/search/archival_view.php?ObjectID={p.id}'>CDLI</a>"
    elif p.corpus == "oracc":
        src_a = f"<a href='{p.src_url}'>Oracc</a>"
    f.write(f"<p class='otitle'>{p.period} {src_a}</p>\n")
    areas_with_paras = [x for x in p.text_areas if len(x.lines) > 0 and len(x.paragraphs) > 0 and len(x.lines[0].text) > 0]
    for a in areas_with_paras:
        f.write(f"<section class='textarea'>\n")
        if len(areas_with_paras) > 1:
            f.write(f"<h1>{escape(title_case(a.name))}</h1>\n")
        f.write(f"<div class='translations-container'>\n")
        paragraphs = a.paragraphs_to_lines()
        texts = {p.language: paragraph_lines_to_html(a, paragraphs, [l.text for l in a.lines])}
        langs = set()
        for para in a.paragraphs:
            for lang in para.languages:
                if lang in language_sort:
                    langs.add(lang)
        for lang in langs:
            texts[lang] = paragraphs_to_html(a, paragraphs, lang)
        langs.add(p.language)
#             f.write(f"<p><pre>{escape(repr(paragraphs))}</pre></p> ")
#             if "akkts" in langs and "akk" in langs:
#                 langs.remove("akk")
#             if "suxts" in langs and "sux" in langs:
#                 langs.remove("sux")
        langs = sorted(list(langs), key=lambda x:language_sort[x])
        for lang in langs:
            f.write(f"<div class='lang-{lang} text'>\n")
            translator = "ML Translation" if lang.startswith("ml_") else (languages.all_languages[lang])
            if lang == tgt_lang:
                if p.translation_source is not None and p.translation_source not in bad_translators:
                    translator = escape(p.translation_source)
                else:
                    translator = "Unknown"
            f.write(f"<div class='langid'>{translator}</div>\n")
            f.write(texts[lang])
            f.write(f"</div>\n")
        f.write(f"</div></section>\n")

In [24]:
for pdir in tqdm(sorted(list(all_pubs_by_dir.keys()))):
    pubs = sorted(all_pubs_by_dir[pdir], key=lambda p:p.id)
    with start_page(f"/p/{pdir}", pdir) as f:
        for p in pubs:
            f.write(f"<section id='{p.id}' class='pub'>\n")
            output_pub(p, f)
            f.write(f"</section>\n")
        end_page(f)

  0%|          | 0/380 [00:00<?, ?it/s]

In [25]:
def output_browser(path, pubs, ignore_dims, f):
    next_pages = []
    if len(pubs) == 0:
        return next_pages
    f.write(f"<section>\n")
    for dname, dselect in browser_dimensions:
        if dname in ignore_dims:
            continue
        vgroups = defaultdict(lambda: [])
        for p in pubs:
            for v in dselect(p):
                vgroups[v].append(p)
        if len(vgroups) < 2:
            continue
        f.write(f"<h1>{escape(dname)}</h1>\n")    
        for gv in vgroups.keys():
            gpubs = vgroups[gv]
            if len(gpubs) > 0:
                next_pages.append((dname, gv, gpubs))
                f.write(f"<a href='{path}/{gv}/'>{len(gpubs):,} {escape(gv)}</a>\n")
    f.write(f"</section>\n")
    f.write(f"<section>\n")
    max_on_page = 200
    if len(pubs) <= max_on_page or len(next_pages) == 0:
        for p in pubs:
            output_pub(p, f)        
    else:
        f.write(f"<p>{len(pubs):,} publications. Narrow the list to less than {max_on_page:,} by choosing links above.</p>\n")
    f.write(f"</section>\n")
    return next_pages

def output_browser_page(path, dim_value, pubs, ignore_dims):
    with start_page(path + "/index", f"{len(pubs)} {escape(dim_value)}") as f:
        next_pages = output_browser(path, pubs, ignore_dims, f)
        end_page(f)
    for gk, gv, gpubs in next_pages:
        ignores = set(ignore_dims)
        ignores.add(gk)
        output_browser_page(f"{path}/{gv}", gv, gpubs, ignores)


In [26]:
os.makedirs(f"{wwwroot}/fonts", exist_ok=True)
for font in glob.glob("../fonts/*.woff"):
    shutil.copy2(font, f"{wwwroot}/fonts/{os.path.basename(font)}")

In [27]:
languages.old_languages

{'akk': 'Akkadian',
 'sux': 'Sumerian',
 'qpn': 'Proper Nouns',
 'arc': 'Aramaic',
 'elx': 'Elamite',
 'grc': 'Greek',
 'peo': 'Old Persian',
 'ug': 'Ugaritic',
 'xur': 'Urartian'}

In [28]:
languages.cuneiform_text_to_unicode("ki na-lu5-ta ur-nigar{gar} szu ba-ti iti masz-da3-gu7 mu us2-sa ki-masz{ki} ba-hul", "sux")

'𒆠 𒈾lu5𒋫 𒌨nigar{𒃻} szu 𒁀𒋾 iti maszda3gu7 𒈬 us2𒊓 𒆠masz{𒆠} 𒁀hul'

In [29]:
languages.cuneiform_text_to_unicode("iti", "sux")

'iti'

In [30]:
#!rm -r /Users/fak/Dropbox/Projects/CuneiformTranslators/dist

In [31]:
with start_page("/404", "Not Found") as f:
    f.write(f"<p>The page you are looking for is not here.</p>\n")
    footer(f)


In [32]:
shutil.copy("../web/translator.html", f"{wwwroot}/translator.html")
shutil.copy("../web/main.css", f"{wwwroot}/main.css")
shutil.copy("../web/main.js", f"{wwwroot}/main.js")

'/Users/fak/Dropbox/Projects/CuneiformTranslators/dist/main.js'

In [33]:
with start_page("/browse", "Browse") as f:
    f.write(f"<div id='browser'></div>\n")
    script = """
    // get the q query parameter
    const q = new URLSearchParams(window.location.search).get('q');
    if (q) {
    document.title = 'Browse ' + q;
    document.getElementById('page-title').innerText = 'Browse ' + q;
    (async function() { await createPublicationBrowserAsync(document.getElementById('browser'), '/i/'+q+'.json'); })();
    }"""
    footer(f, script=script)



In [34]:
importlib.reload(cdli)
print("Writing /")
with start_page("/index", "AI Cuneiform Translation Corpus") as f:
    f.write(f"<p>The Largest Online Corpus of Translated Cuneiform Texts</p>\n")
    # f.write(f"<a href='/translator'>Online Translator!</a>")
#     pubs = [p for p in output_pubs if p.id>393000 and p.id<394000]
    by_lang = defaultdict(lambda: [])
    for p in all_pubs.values():
        by_lang[p.language].append(p)
    next_pages = []
    f.write(f"<nav>\n")
    f.write(f"<ul>\n")
    f.write(f"<li>{len(all_pubs):,} publications</li>\n")
    for lang in sorted(list(by_lang.keys())):
        gpubs = by_lang[lang]
        # f.write(f"<li><a href='browse.html?q={lang}'>{len(gpubs):,} {escape(lang)}</a></li>\n")
        f.write(f"<li><a href='/{lang}/'>{len(gpubs):,} {escape(lang)}</a></li>\n")
        next_pages.append(("language", lang, gpubs))
    f.write(f"</nav>\n")
    next_pages.extend(output_browser("", list(all_pubs.values()), "", f))
    footer(f)
    
for gk, gv, gpubs in tqdm(next_pages):
    output_browser_page(f"/{gv}", gv, gpubs, set([gk]))
    pass
    
for f in glob.glob("../dist/*"):
    print(f)

Writing /


  0%|          | 0/64 [00:00<?, ?it/s]

../dist/akk
../dist/omen
../dist/other-genre
../dist/tablet
../dist/administrative record
../dist/uruk-iii
../dist/index.html
../dist/lexical
../dist/uncertain
../dist/parthian
../dist/browse.html
../dist/hellenistic
../dist/ur-iii
../dist/barrel
../dist/private-votive
../dist/old-akkadian
../dist/prism
../dist/404.html
../dist/cone
../dist/ed-i-ii
../dist/letter
../dist/other-period
../dist/neo-elamite
../dist/main.css
../dist/early-neo-babylonian
../dist/old-babylonian
../dist/ed-iiib
../dist/seal
../dist/vase
../dist/other-object
../dist/neo-babylonian
../dist/astronomical
../dist/early-old-babylonian
../dist/bulla
../dist/neo-assyrian
../dist/old-assyrian
../dist/main.js
../dist/historical
../dist/envelope
../dist/achaemenid
../dist/ebla
../dist/sux
../dist/translator.html
../dist/administrative
../dist/middle-assyrian
../dist/uruk-iv
../dist/brick
../dist/seleucid
../dist/medical
../dist/vessel
../dist/lentil
../dist/sealing
../dist/ritual
../dist/royal-monumental
../dist/mathemat