# Make Web Site

In [1]:
import sys, os, io, datetime
import json
import random
import requests
import zipfile
import glob
import re
import shutil
import pandas as pd
from slugify import slugify
from tqdm.notebook import tqdm
from html import escape
from collections import defaultdict

In [2]:
import corpi
import cdli
import oracc
import languages

In [3]:
pd.set_option("display.max_columns", None)

In [4]:
# Language codes of the publications the site includes (Akkadian and Sumerian).
supported_langs = {"akk", "sux"}

In [5]:
# Absolute path of the generated site's root directory; created when missing.
wwwroot = os.path.abspath(os.path.join(os.pardir, "dist"))
os.makedirs(wwwroot, exist_ok=True)
wwwroot

'/Users/fak/Dropbox/Projects/CuneiformTranslators/dist'

## Download the Catalog

In [6]:
# Load the CDLI catalog; later cells treat `cat` as a pandas DataFrame
# (it is filtered with boolean indexing and `cat.columns` is inspected).
cat = cdli.get_catalog()

Downloading https://github.com/cdli-gh/data/raw/master/cdli_cat.csv


  cat = pd.read_csv(io.StringIO(cat_csv))


In [7]:
cat

Unnamed: 0,accession_no,accounting_period,acquisition_history,alternative_years,ark_number,atf_source,atf_up,author,author_remarks,cdli_collation,cdli_comments,citation,collection,composite_id,condition_description,date_entered,date_of_origin,date_remarks,date_updated,dates_referenced,db_source,designation,dumb,dumb2,electronic_publication,elevation,excavation_no,external_id,findspot_remarks,findspot_square,genre,google_earth_collection,google_earth_provenience,height,id,id_text2,id_text,join_information,language,lineart_up,material,museum_no,object_preservation,object_type,period,period_remarks,photo_up,primary_publication,provenience,provenience_remarks,publication_date,publication_history,published_collation,seal_id,seal_information,stratigraphic_level,subgenre,subgenre_remarks,surface_preservation,text_remarks,thickness,translation_source,width,object_remarks
0,,,,,21198/zz001q0dtm,"Englund, Robert K.",,CDLI,"31x61x18; Lú A 14-16.30-32.48-50; M XVIII, auf...",,,,"Vorderasiatisches Museum, Berlin, Germany",Q000002,,12/4/2001,00.00.00.00,,2020-03-14,00.00.00.00,20011204 protocuneiform_catalogue,"CDLI Lexical 000002, ex. 065",,,,,"W 06435,a",,auf Hügeloberfläche in der Nähe des Südbaues,"M XVIII,?",Lexical,,,31,1,0,1,,undetermined,150ppi 20160630,clay,VAT 01533,,tablet,Uruk III (ca. 3200-3000 BC),,,"CDLI Lexical 000002, ex. 065",Uruk (mod. Warka),,2015ff.,"Englund, Robert K. & Nissen, Hans J., ATU 3 (1...",,,,,Archaic Lu2 A (witness),,,,18,no translation,61,
1,,,,,21198/zz001q0dv4,"Englund, Robert K.",,CDLI,30x48x13; Lú A 13-15.23-25.?; Fundstelle wie W...,,,,"Vorderasiatisches Museum, Berlin, Germany",Q000002,,12/4/2001,00.00.00.00,,2018-10-20,00.00.00.00,20011204 protocuneiform_catalogue,"CDLI Lexical 000002, ex. 066",,,,,"W 06435,b",,auf der Hügeloberfläche in der Nähe des Südbaues,"M XVIII,?",Lexical,,,30,2,0,2,,undetermined,150ppi 20160630,clay,VAT 15263,,tablet,Uruk III (ca. 3200-3000 BC),,,"CDLI Lexical 000002, ex. 066",Uruk (mod. Warka),,2015ff.,"Englund, Robert K. & Nissen, Hans J., ATU 3 (1...",,,,,Archaic Lu2 A (witness),,,,13,no translation,48,
2,,,,,21198/zz001q0dwn,"Englund, Robert K.",,"Englund, Robert K. & Nissen, Hans J.","42x53x19; Vocabulary 9; Qa XVI,2, unter der Ab...",,,,"Vorderasiatisches Museum, Berlin, Germany",,,12/4/2001,,,2020-01-26,,20011204 protocuneiform_catalogue,"ATU 3, pl. 081, W 9123,d",,,,,"W 09123,d",,"unter der Abgleichung der Schicht III, 1,5 m ü...","Qa XVI,2",Lexical,,,42,3,0,3,,undetermined,150ppi 20160630,clay,VAT 15253,,tablet,Uruk IV (ca. 3350-3200 BC),,,"ATU 3, pl. 081, W 9123,d",Uruk (mod. Warka),,1993,"ATU 1, 539",,,,,Archaic Vocabulary (witness),Text category: 15-09; Foreign ID: LVO 9,,,19,no translation,53,
3,,,,,21198/zz001q0dx5,"Englund, Robert K.",,CDLI,26x23x23; Lú A 9-10.?.?; Fundstelle wie W 9123...,,,,"Vorderasiatisches Museum, Berlin, Germany",Q000002,,12/4/2001,00.00.00.00,,2018-10-20,00.00.00.00,20011204 protocuneiform_catalogue,"CDLI Lexical 000002, ex. 051",,,,,"W 09169,d",,"unter der Abgleichung der Schicht III, 1,5 m ü...","Qa XVI,2",Lexical,,,26,4,0,4,,undetermined,150ppi 20160630,clay,VAT 15168,,tablet,Uruk IV (ca. 3350-3200 BC),,,"CDLI Lexical 000002, ex. 051",Uruk (mod. Warka),,2015ff.,"Englund, Robert K. & Nissen, Hans J., ATU 3 (1...",,,,,Archaic Lu2 A (witness),,,,23,no translation,23,
4,,,,,21198/zz001q0dzp,"Englund, Robert K.",,CDLI,"29x36x20; Lú A Vorläufer; Qa XVI,2, unter der ...",,,,"Vorderasiatisches Museum, Berlin, Germany",Q000002,,12/4/2001,00.00.00.00,,2018-10-20,00.00.00.00,20011204 protocuneiform_catalogue,"CDLI Lexical 000002, ex. 172",,,,,"W 09206,k",,"unter der Abgleichung der Schicht III, 1,5 m ü...","Qa XVI,2",Lexical,,,29,5,0,5,,undetermined,150ppi 20160630,clay,VAT 15153,,tablet,Uruk IV (ca. 3350-3200 BC),,,"CDLI Lexical 000002, ex. 172",Uruk (mod. Warka),,2015ff.,"Englund, Robert K. & Nissen, Hans J., ATU 3 (1...",,,,,Archaic Lu2 A (witness),,,,20,no translation,36,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353278,,,,,,no atf,,"Fahad, Saad Salman & Al-Hussainy, Abbas A.",,,,,"National Museum of Iraq, Baghdad, Iraq",,,8/13/2022,,,2022-08-13,,20220813 jagersma,"Adab Al-Rafidayn 63, 83-92 no. 2",,,,,Marad 047,,,,Legal,,,,362114,0,532443,,Sumerian,,clay,IM —,,tablet,Old Babylonian (ca. 1900-1600 BC),,,"Adab Al-Rafidayn 63, 83-92 no. 2",Marad (mod. Wanna-wa-Sadum),,2012,,,,,,,,,,,no translation,,
353279,,,,,,no atf,,"Postgate, J. Nicholas",,,,,"National Museum of Iraq, Baghdad, Iraq",,,8/13/2022,,,2022-08-13,,20220813 jagersma,"Iraq 35, 173-175 (pl. 71-72) TA 2100",,,,,TA 2100,,,,Administrative,,,33,362115,0,532444,,Akkadian,,clay,IM —,,tablet,Old Babylonian (ca. 1900-1600 BC),,,"Iraq 35, 173-175 (pl. 71-72) TA 2100",,,1973,"Lacambre, Denis, Fs Charpin (2019) 525-526",,,,,,,,,20,no translation,34,
353280,,,,,,no atf,,"Postgate, J. Nicholas",,,,,"National Museum of Iraq, Baghdad, Iraq",,,8/13/2022,,,2022-08-13,,20220813 jagersma,"Iraq 35, 173-175 (pl. 71-72) TA 2101",,,,,TA 2101,,,,Administrative,,,32,362116,0,532445,,Akkadian,,clay,IM —,,tablet,Old Babylonian (ca. 1900-1600 BC),,,"Iraq 35, 173-175 (pl. 71-72) TA 2101",,,1973,"Lacambre, Denis, Fs Charpin (2019) 525-526",,,,,,,,,20,no translation,34,
353281,,,"purchased from M. Gejou, Paris, in the summer ...",,,no atf,,"Grant, Elihu",,,,,"private: William T. Grant Jr., Pelham Manor, N...",,,8/19/2022,,,2022-08-19,,20220819 jagersma,"AJSL 34, 199-204",,,,,,,,,Legal,,,,362117,0,532446,,Akkadian,,clay,Grant 17,,tablet & envelope,Old Babylonian (ca. 1900-1600 BC),,,"AJSL 34, 199-204",Larsa (mod. Tell as-Senkereh),,1918,"Koschaker & Ungnad, HG 6 (1923) no. 1470",,,,,,,,,,no translation,,


In [8]:
len(cat)

353283

In [9]:
cat.columns

Index(['accession_no', 'accounting_period', 'acquisition_history',
       'alternative_years', 'ark_number', 'atf_source', 'atf_up', 'author',
       'author_remarks', 'cdli_collation', 'cdli_comments', 'citation',
       'collection', 'composite_id', 'condition_description', 'date_entered',
       'date_of_origin', 'date_remarks', 'date_updated', 'dates_referenced',
       'db_source', 'designation', 'dumb', 'dumb2', 'electronic_publication',
       'elevation', 'excavation_no', 'external_id', 'findspot_remarks',
       'findspot_square', 'genre', 'google_earth_collection',
       'google_earth_provenience', 'height', 'id', 'id_text2', 'id_text',
       'join_information', 'language', 'lineart_up', 'material', 'museum_no',
       'object_preservation', 'object_type', 'period', 'period_remarks',
       'photo_up', 'primary_publication', 'provenience', 'provenience_remarks',
       'publication_date', 'publication_history', 'published_collation',
       'seal_id', 'seal_information', 's

https://cdli.ucla.edu/search/archival_view.php?ObjectID=P256681

In [10]:
cat[cat["id_text"]==256681]

Unnamed: 0,accession_no,accounting_period,acquisition_history,alternative_years,ark_number,atf_source,atf_up,author,author_remarks,cdli_collation,cdli_comments,citation,collection,composite_id,condition_description,date_entered,date_of_origin,date_remarks,date_updated,dates_referenced,db_source,designation,dumb,dumb2,electronic_publication,elevation,excavation_no,external_id,findspot_remarks,findspot_square,genre,google_earth_collection,google_earth_provenience,height,id,id_text2,id_text,join_information,language,lineart_up,material,museum_no,object_preservation,object_type,period,period_remarks,photo_up,primary_publication,provenience,provenience_remarks,publication_date,publication_history,published_collation,seal_id,seal_information,stratigraphic_level,subgenre,subgenre_remarks,surface_preservation,text_remarks,thickness,translation_source,width,object_remarks
106189,,,,,21198/zz001s1m1w,"de Ridder, Alba",,"Stol, Marten",,,,,University of Pennsylvania Museum of Archaeolo...,,,2/24/2005,,,2021-06-12,,20050224 fitzgerald_upenn,"AbB 11, 029",,,,,,,,,Letter,,,?,106191,0,256681,,Akkadian,,clay,UM 29-16-076,,tablet,Old Babylonian (ca. 1900-1600 BC),,600ppi 20160630,"AbB 11, 029",Nippur (mod. Nuffar),,1986,,,,,,,Letter; 7x8x1 line,,,?,no translation,?,


## Get the Human Transliterations

In [11]:
# Build the CDLI corpus object; its `cdli_pubs` attribute is merged with the
# catalog in the next cell.
cdli_corpus = corpi.CDLI()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [12]:
print("Merging transliterations with catalog")
# Combine the parsed publications with the catalog; `tqdm` is passed in so the
# helper can report progress. Later cells read `language`, `genre`, `period`,
# `object_type` and `translation_source` from the publications — presumably
# attached by this merge (TODO confirm in cdli.merge_atf_with_catalog).
cdli_pubs = cdli.merge_atf_with_catalog(cdli_corpus.cdli_pubs, cat, tqdm)

Merging transliterations with catalog


  0%|          | 0/135256 [00:00<?, ?it/s]

In [14]:
# Directory handed to corpi.ORACC as `oracc_dir`. Set the ORACC_DIR environment
# variable to point somewhere else; the default keeps the original local path
# so existing runs behave the same.
oracc_dir = os.environ.get("ORACC_DIR", "/Volumes/FrankDisk/oracc_zips")
oracc_corpus = corpi.ORACC(oracc_dir=oracc_dir, tqdm=tqdm)

  0%|          | 0/140 [00:00<?, ?it/s]

Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fab8cc5e980>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fa9bf2214c0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fa9bf241dc0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fa9c9f7c7c0>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fa9ce915a40>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <traceback object at 0x7fa9c9f5cd80>)
Error: (<class 'json.decoder.JSONDecodeError'>, JSONDecodeError('Expecting value: line 1 column 1 (char 0)'), <t

  0%|          | 0/22076 [00:00<?, ?it/s]

  0%|          | 0/22076 [00:00<?, ?it/s]

In [15]:
cdli_pubs[0]

Publication('P000001', 'qpc', [TextArea('tablet', [], []), TextArea('obverse', [], []), TextArea('column 1', [TextLine("1'.", '1(N01) , [...]', {}), TextLine("2'.", '1(N01) , TIM ABGAL#', {}), TextLine("3'.", '1(N01) , KINGAL#', {})], []), TextArea('column 2', [TextLine("1'.", '1(N01) , [...]', {}), TextLine("2'.", '1(N01) , GAL~a# UMUN2#', {}), TextLine("3'.", '1(N01) , GAL~a UMUN2 KU3~a', {})], []), TextArea('column 3', [TextLine("1'.", '1(N01) , DUB~a SANGA~a#', {}), TextLine("2'.", '1(N01) , SUG5# SAG#', {}), TextLine("3'.", '1(N01) , UB SAG#', {})], []), TextArea('reverse', [TextLine('1.', '[N] , [...]', {})], [])])

In [16]:
# Collect every publication written in a supported language, keyed by id.
# ORACC entries are inserted first, so a CDLI publication with the same id
# replaces the ORACC one. Each publication is tagged with its source corpus.
all_pubs = {}
for p in oracc_corpus.oracc_transliterated_pubs.values():
    if p.language not in supported_langs:
        continue
    p.corpus = "oracc"
    all_pubs[p.id] = p
for p in cdli_pubs:
    if p.language not in supported_langs:
        continue
    p.corpus = "cdli"
    all_pubs[p.id] = p
print(f"Found {len(all_pubs)} unique publications")

Found 136614 unique publications


In [17]:
# Bucket publications by the lower-cased first four characters of their id.
# Each bucket key is later used as a page name under /p/.
all_pubs_by_dir = defaultdict(list)
for p in all_pubs.values():
    dir_key = p.id[:4].lower()
    all_pubs_by_dir[dir_key].append(p)
print(f"Found {len(all_pubs_by_dir)} directories")


Found 398 directories


## Get the ML Translations

In [18]:
# Load the machine-translation tables. Keys of the form "<src>_to_<tgt>" map
# source text to translated text; other keys are ignored by the summary below.
translations_json_path = "../data/ml_translations.json"
# Use a context manager so the file handle is closed once the JSON is parsed
# (the original left it open until garbage collection).
with open(translations_json_path, "r", encoding="utf8") as translations_file:
    translations = json.load(translations_file)
for k in translations.keys():
    if "_to_" in k:
        print(k, len(translations[k]))

akk_to_en 380004
sux_to_en 50000


In [19]:
# Summary counts: all collected publications, and those for which
# `has_translations()` is true.
num_transliterations = len(all_pubs)
num_translations = sum(1 for x in all_pubs.values() if x.has_translations())
print("num_transliterations:", num_transliterations)
print("    num_translations:", num_translations)

num_transliterations: 136614
    num_translations: 17058


## Find Publications we have translations for

In [20]:
# Target language code; the ML text is stored under the "ml_" + tgt_lang key.
tgt_lang = "en"

# Publications that have at least one text line and for which every such line
# was found in the translation table.
translated_pubs = []

for pub in tqdm(list(all_pubs.values())):
    has_new_translations = False
    # Starts True and is cleared as soon as one line has no table entry.
    has_ml_translations = True
    has_lines = False
    st_key = f"{pub.language}_to_{tgt_lang}"
    # Skip publications whose language pair has no table. Note that these never
    # get the has_new_translations / has_ml_translations attributes set.
    if st_key not in translations:
        continue
    st_translations = translations[st_key]
    for a in pub.text_areas:
        # Areas that have lines but no paragraphs get paragraphs generated first
        # (presumably derived from the lines — TODO confirm in TextArea).
        if len(a.lines) > 0 and len(a.paragraphs) == 0:
            a.lines_to_paragraphs(pub.language, tgt_lang)
        # One entry per paragraph; each entry is a list of 3-tuples whose last
        # element is the text that is looked up in the table.
        paras = a.paragraphs_to_lines()
        for i, plines in enumerate(paras):
            p = a.paragraphs[i]
            # Start from an empty string; found translations are appended below,
            # separated by single spaces.
            p.languages["ml_"+tgt_lang] = ""
            head = ""
            for si,ei,s in plines:
                # Only text longer than one character counts as a line.
                if len(s) > 1:
                    has_lines = True
                    # "New" means the paragraph has no existing tgt_lang entry.
                    has_new_translations = has_new_translations or (tgt_lang not in p.languages)
                    if s in st_translations:
                        p.languages["ml_"+tgt_lang] += head + st_translations[s]
                        head = " "
                    else:
                        has_ml_translations = False
    pub.has_new_translations = has_new_translations
    pub.has_ml_translations = has_ml_translations
    if has_ml_translations and has_lines:
        translated_pubs.append(pub)
    
# Subset of translated_pubs where at least one paragraph lacked a tgt_lang entry.
newly_translated_pubs = [x for x in translated_pubs if x.has_new_translations]
print(len(translated_pubs), "translated_pubs")
print(len(newly_translated_pubs), "newly_translated_pubs")

  0%|          | 0/136614 [00:00<?, ?it/s]

848 translated_pubs
746 newly_translated_pubs


In [21]:
len([x for x in translated_pubs if x.language == "sux"]), "sux"

(164, 'sux')

In [22]:
newly_translated_pubs[0]

Publication('P466946', 'akk', [TextArea('object brick', [], []), TextArea('surface a', [TextLine('1.', '{disz}{d}sin-pap-mesz-su _man szu2 man kur_ asz-szur _du3_-isz3 s,a-lam an-szar2 u _dingir-mesz gal-mesz_', {}), TextLine('2.', 'ana-ku _e2_ ti-ka-a-ti sza _kisal_ sa-ad-rum man-za-az {d}i2-gi3-gi3', {}), TextLine('3.', 'ina a-gur2-ri _udun ku3_-ti esz-szisz u-sze-pisz-ma u-zaq-qir6 hur-sza2-nisz', {})], [TextParagraph(0, 1, {'ml_en': 'Sin-papmesh, king of the totality, king of Assyria, protector of the great gods'}), TextParagraph(1, 2, {'ml_en': 'I entered the house of Tikatu, the main courtyard of the temple of the god Igigi'}), TextParagraph(2, 3, {'ml_en': 'in the threshing floor of the holy oven he made a fire and he placed a snare on the hill'})])])

## Data Dimensions

In [23]:
# Each entry is (dimension name, function mapping a publication to the list of
# values it has in that dimension). output_browser groups publications by these
# values and writes one link per group.
browser_dimensions = [
#     ("new", lambda p: ["new" if p.has_new_translations else "old"]),
#     ("language", lambda p: [p.language]),
    ("object_type", lambda p: [cdli.get_object_type(p.object_type)]),
    ("genre", lambda p: cdli.get_genres(p.genre)),
    ("period", lambda p: [cdli.period_slug_from_period[x] for x in cdli.get_periods(p.period)]),
]

## HTML Components

## HTML Pages

In [24]:
def get_file_path(site_path):
    """Return the on-disk location of a site path by prefixing it with ``wwwroot``.

    ``site_path`` is appended verbatim, so it is expected to start with ``/``.
    """
    return "{}{}".format(wwwroot, site_path)

def get_page_file_path(site_path):
    """Return the on-disk path of the ``.html`` file backing a site path."""
    return get_file_path(site_path) + ".html"
    

In [92]:
def header(title, f):
    """Write the opening HTML of a page to ``f``.

    Emits the doctype, the ``<head>`` (charset, title, viewport, stylesheet)
    and opens ``<body>`` plus the ``content`` div, which ``footer`` closes.

    Args:
        title: Plain-text page title; it is HTML-escaped here, so callers must
            pass it unescaped.
        f: Writable text stream.
    """
    # The declaration needs the leading "!"; "<DOCTYPE html>" is parsed as an
    # unknown tag and leaves the page without a valid doctype.
    f.write("<!DOCTYPE html>\n")
    f.write("<html>\n<head>\n")
    f.write("<meta charset='utf-8'>\n")
    f.write(f"<title>{escape(title)}</title>\n")
    f.write("<meta name='viewport' content='width=device-width, initial-scale=1'>\n")
    f.write("<link rel='stylesheet' href='/main.css'>\n")
    f.write("</head>\n")
    f.write(f"<body>\n<div class='content'><h1 id='page-title'>{escape(title)}</h1>\n")
    
def footer(f, script=None):
    """Write the closing HTML of a page to ``f``.

    Closes the ``content`` div opened by ``header``, adds the attribution
    footer and the ``/main.js`` script tag, then closes ``<body>`` and
    ``<html>``.

    Args:
        f: Writable text stream.
        script: Optional JavaScript source. When not ``None`` it is inserted
            verbatim (unescaped) in an inline ``<script>`` element.
    """
    chunks = [
        "</div>\n",
        "<footer>\n",
        "<p class='otitle'>ML Translations by <a href=\"https://huggingface.co/praeclarum/cuneiform\">praeclarum/cuneiform</a></p>\n",
        "</footer>\n",
        "<script src='/main.js'></script>\n",
    ]
    if script is not None:
        chunks.append(f"<script>{script}</script>\n")
    chunks.append("</body>\n</html>")
    for chunk in chunks:
        f.write(chunk)

In [93]:
def start_page(path, title):
    """Create the HTML file for a site path and write the page header into it.

    Args:
        path: Site-relative path without extension (e.g. ``/p/p000``).
        title: Plain-text page title, passed on to ``header``.

    Returns:
        The open text file, positioned after the header. The caller is
        responsible for writing the footer and closing it (the file object
        can be used as a context manager).
    """
    file_path = get_page_file_path(path)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # The header declares <meta charset='utf-8'>, so write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    f = open(file_path, "wt", encoding="utf-8")
    try:
        header(title, f)
    except Exception:
        # Don't leak the handle if the header cannot be written.
        f.close()
        raise
    return f

def end_page(f):
    """Write the standard page footer (without an inline script) to ``f``."""
    footer(f, script=None)

### Publication Index Pages

In [75]:
data_links = [
    ("Oracc - Open Richly Annotated Cuneiform Corpus", "http://oracc.museum.upenn.edu"),
    ("CDLI - Cuneiform Digital Library Initiative", "https://cdli.ucla.edu"),
    ("ETCSL - Electronic Text Corpus of Sumerian Literature", "https://etcsl.orinst.ox.ac.uk"),
]

In [76]:
# Sort key for the language columns of a text area: lower values are written
# first. output_pub also ignores paragraph languages that are not keys here.
language_sort = {
    "akk": 0,
#     "akkts": 1,
#     "elx": 2,
#     "elxts": 3,
    "sux": 4,
#     "suxts": 5,
    "ml_en": 100,
    "en": 1000,
#     "fr": 1001,
}

# translation_source values for which output_pub shows "Unknown" instead.
bad_translators = {"uncertain", "NaN", "no translation", "", "check"}

In [77]:
def paragraph_lines_to_html(a, paragraphs, text_lines):
    """Render the source-language lines of a text area, grouped by paragraph.

    Args:
        a: Text area; ``a.paragraphs[i].tag`` supplies the wrapping element
            name for the i-th entry of ``paragraphs``.
        paragraphs: One list per paragraph of ``(start, end, text)`` tuples;
            only the half-open index range ``start..end`` is used here.
        text_lines: Line texts indexed by line number.

    Returns:
        HTML string with one escaped ``<span class='line line-N'>`` per line.
        Indices at or beyond ``len(text_lines)`` are skipped.
    """
    line_count = len(text_lines)
    parts = []
    for para_index, spans in enumerate(paragraphs):
        tag = a.paragraphs[para_index].tag
        parts.append(f"<{tag}>")
        for first, stop, _text in spans:
            parts.extend(
                f"<span class='line line-{n}'>{escape(text_lines[n])}</span>\n"
                for n in range(first, stop)
                if n < line_count
            )
        parts.append(f"</{tag}>\n")
    return "".join(parts)

def paragraphs_to_html(a, paragraphs, lang):
    """Render one translation language of a text area, one span per paragraph.

    Args:
        a: Text area; ``a.paragraphs[i]`` supplies the ``tag`` and the
            ``languages`` dict for the i-th entry of ``paragraphs``.
        paragraphs: One list per paragraph of ``(start, end, text)`` tuples;
            the first tuple's start index labels the span (0 when empty).
        lang: Key to look up in each paragraph's ``languages`` dict.

    Returns:
        HTML string with one escaped ``<span class='line line-N'>`` per
        paragraph. A paragraph without text for ``lang`` gets an empty span.
    """
    html = []
    for pi, plines in enumerate(paragraphs):
        p = a.paragraphs[pi]
        tag = p.tag
        line_index = plines[0][0] if plines else 0
        # The caller passes the union of languages over all paragraphs, so a
        # given paragraph may lack this one; render it empty instead of
        # raising KeyError (or failing in escape() on None).
        text = p.languages.get(lang) or ""
        html.append(f"<{tag}>")
        html.append(f"<span class='line line-{line_index}'>{escape(text)}</span>\n")
        html.append(f"</{tag}>\n")
    return "".join(html)

def title_case(str):
    """Upper-case the first character of ``str`` and leave the rest unchanged.

    An empty string is returned as an empty string. The parameter name shadows
    the built-in ``str``; it is kept so existing callers are unaffected.
    """
    head, tail = str[:1], str[1:]
    return head.upper() + tail

def output_pub(p, f):
    """Write the HTML for one publication to ``f``.

    Output is a title line (id, genres, object type), a line with the period
    and a link to the source corpus, then one ``<section class='textarea'>``
    per text area that has paragraphs and a non-empty first line. Each section
    holds one column per language: the source text plus every paragraph
    language listed in ``language_sort``, ordered by ``language_sort``.

    All catalog-derived values are HTML-escaped before being written.

    Note: reads the notebook-level ``tgt_lang``, ``language_sort`` and
    ``bad_translators``, so the cells defining them must have run first.

    Args:
        p: Publication to render.
        f: Writable text stream.
    """
    pub_id = escape(str(p.id))
    genres = escape(" and ".join(cdli.get_genres(p.genre)))
    object_type = escape(str(cdli.get_object_type(p.object_type)))
    f.write(f"<h1 class='otitle'>{pub_id}: {genres} {object_type}</h1>\n")

    src_a = ""
    if p.corpus == "cdli":
        src_a = f"<a href='https://cdli.ucla.edu/search/archival_view.php?ObjectID={pub_id}'>CDLI</a>"
    elif p.corpus == "oracc":
        # escape() also encodes quotes, so the URL cannot break out of the attribute.
        src_a = f"<a href='{escape(str(p.src_url))}'>Oracc</a>"
    f.write(f"<p class='otitle'>{escape(str(p.period))} {src_a}</p>\n")

    areas_with_paras = [x for x in p.text_areas if len(x.lines) > 0 and len(x.paragraphs) > 0 and len(x.lines[0].text) > 0]
    for a in areas_with_paras:
        f.write("<section class='textarea'>\n")
        # Area headings only help when there is more than one area to tell apart.
        if len(areas_with_paras) > 1:
            f.write(f"<h1>{escape(title_case(a.name))}</h1>\n")
        f.write("<div class='translations-container'>\n")
        paragraphs = a.paragraphs_to_lines()
        # The source-language column is rendered line by line ...
        texts = {p.language: paragraph_lines_to_html(a, paragraphs, [l.text for l in a.lines])}
        langs = set()
        for para in a.paragraphs:
            for lang in para.languages:
                if lang in language_sort:
                    langs.add(lang)
        # ... while every other column is rendered paragraph by paragraph.
        for lang in langs:
            texts[lang] = paragraphs_to_html(a, paragraphs, lang)
        langs.add(p.language)
        langs = sorted(list(langs), key=lambda x: language_sort[x])
        for lang in langs:
            f.write(f"<div class='lang-{lang} text'>\n")
            translator = "ML Translation" if lang.startswith("ml_") else (languages.all_languages[lang])
            if lang == tgt_lang:
                # Human translation column: credit the catalog's translator
                # unless the value is a known placeholder.
                if p.translation_source is not None and p.translation_source not in bad_translators:
                    translator = p.translation_source
                else:
                    translator = "Unknown"
            # Escape once, at the point of output, whatever the label's origin.
            f.write(f"<div class='langid'>{escape(str(translator))}</div>\n")
            f.write(texts[lang])
            f.write("</div>\n")
        f.write("</div></section>\n")

In [78]:
# Write one page per id-prefix bucket under /p/, with its publications in id order.
for pdir in tqdm(sorted(all_pubs_by_dir)):
    pubs = sorted(all_pubs_by_dir[pdir], key=lambda pub: pub.id)
    with start_page("/p/" + pdir, pdir) as f:
        for p in pubs:
            f.write(f"<section id='{p.id}' class='pub'>\n")
            output_pub(p, f)
            f.write("</section>\n")
        end_page(f)

  0%|          | 0/398 [00:00<?, ?it/s]

In [33]:
def output_browser(path, pubs, ignore_dims, f, max_on_page=200):
    """Write the drill-down links and, if short enough, the publication list.

    For every dimension in ``browser_dimensions`` that is not ignored and that
    splits ``pubs`` into at least two groups, a heading and one link per group
    are written. The publications themselves are written when there are at
    most ``max_on_page`` of them or when no further narrowing is possible.

    Args:
        path: Site path of the current page; group links are ``path/value/``.
        pubs: Publications to show on this page.
        ignore_dims: Names of dimensions to skip.
        f: Writable text stream.
        max_on_page: Largest number of publications listed inline.

    Returns:
        List of ``(dimension name, group value, publications)`` tuples, one
        per link written. Empty (and nothing is written) when ``pubs`` is empty.
    """
    next_pages = []
    if len(pubs) == 0:
        return next_pages
    f.write("<section>\n")
    for dname, dselect in browser_dimensions:
        if dname in ignore_dims:
            continue
        vgroups = defaultdict(list)
        for p in pubs:
            for v in dselect(p):
                vgroups[v].append(p)
        # A dimension with a single value cannot narrow the list.
        if len(vgroups) < 2:
            continue
        f.write(f"<h1>{escape(dname)}</h1>\n")
        # Groups only exist once a publication was appended, so none is empty.
        for gv, gpubs in vgroups.items():
            next_pages.append((dname, gv, gpubs))
            # Escape the attribute value too; a quote or "&" in the group
            # value would otherwise produce broken markup.
            href = escape(f"{path}/{gv}/")
            f.write(f"<a href='{href}'>{len(gpubs)} {escape(gv)}</a>\n")
    f.write("</section>\n")
    f.write("<section>\n")
    if len(pubs) <= max_on_page or len(next_pages) == 0:
        for p in pubs:
            output_pub(p, f)
    else:
        f.write(f"<p>{len(pubs)} publications. Narrow the list to less than {max_on_page} by choosing links above.</p>\n")
    f.write("</section>\n")
    return next_pages

def output_browser_page(path, dim_value, pubs, ignore_dims):
    """Write the browser page at ``path/index`` and, recursively, its sub-pages.

    Args:
        path: Site path of the page's directory.
        dim_value: Dimension value this page represents; used in the title.
        pubs: Publications belonging to this page.
        ignore_dims: Names of dimensions already chosen on the way here.
    """
    # Pass the title unescaped: header() escapes it, and escaping here as well
    # would double-encode it (e.g. "&" rendered as "&amp;").
    with start_page(path + "/index", f"{len(pubs)} {dim_value}") as f:
        next_pages = output_browser(path, pubs, ignore_dims, f)
        end_page(f)
    for gk, gv, gpubs in next_pages:
        # Each sub-page must not offer the dimension that led to it.
        ignores = set(ignore_dims)
        ignores.add(gk)
        output_browser_page(f"{path}/{gv}", gv, gpubs, ignores)


In [71]:
# Copy every .woff file from ../fonts into the site's fonts directory.
os.makedirs(os.path.join(wwwroot, "fonts"), exist_ok=True)
for font in glob.glob("../fonts/*.woff"):
    shutil.copy2(font, os.path.join(wwwroot, "fonts", os.path.basename(font)))

In [37]:
languages.old_languages

{'akk': 'Akkadian',
 'sux': 'Sumerian',
 'qpn': 'Proper Nouns',
 'arc': 'Aramaic',
 'elx': 'Elamite',
 'grc': 'Greek',
 'peo': 'Old Persian',
 'ug': 'Ugaritic',
 'xur': 'Urartian'}

In [38]:
languages.cuneiform_text_to_unicode("ki na-lu5-ta ur-nigar{gar} szu ba-ti iti masz-da3-gu7 mu us2-sa ki-masz{ki} ba-hul", "sux")

'𒆠 𒈾lu5𒋫 𒌨nigar{𒃻} szu 𒁀𒋾 iti maszda3gu7 𒈬 us2𒊓 𒆠masz{𒆠} 𒁀hul'

In [39]:
languages.cuneiform_text_to_unicode("iti", "sux")

'iti'

In [42]:
#!rm -r /Users/fak/Dropbox/Projects/CuneiformTranslators/dist

In [47]:
# Static "Not Found" page, written to /404.html.
with start_page("/404", "Not Found") as f:
    f.write("<p>The page you are looking for is not here.</p>\n")
    end_page(f)


In [90]:
# Copy the static assets from ../web into the site root. main.css and main.js
# are the files referenced by header() and footer().
shutil.copy("../web/translator.html", f"{wwwroot}/translator.html")
shutil.copy("../web/main.css", f"{wwwroot}/main.css")
shutil.copy("../web/main.js", f"{wwwroot}/main.js")

'/Users/fak/Dropbox/Projects/CuneiformTranslators/dist/main.js'

In [98]:
# Browse page: an empty container that the inline script fills client-side
# from /i/<q>.json, where <q> is the page's "q" query parameter.
with start_page("/browse", "Browse") as f:
    f.write("<div id='browser'></div>\n")
    script = """
    // get the q query parameter
    const q = new URLSearchParams(window.location.search).get('q');
    if (q) {
    document.title = 'Browse ' + q;
    document.getElementById('page-title').innerText = 'Browse ' + q;
    (async function() { await createPublicationBrowserAsync(document.getElementById('browser'), '/i/'+q+'.json'); })();
    }"""
    footer(f, script)



In [89]:
   
print("Writing /")
# Landing page: total publication count plus one browse link per language.
with start_page("/index", "AI Cuneiform Translation Corpus") as f:
    f.write("<p>The Largest Online Corpus of Translated Cuneiform Texts</p>\n")
    # f.write(f"<a href='/translator'>Online Translator!</a>")
    by_lang = defaultdict(list)
    for p in all_pubs.values():
        by_lang[p.language].append(p)
    next_pages = []
    f.write("<nav>\n")
    f.write("<ul>\n")
    f.write(f"<li>{len(all_pubs):,} publications</li>\n")
    for lang in sorted(by_lang):
        gpubs = by_lang[lang]
        f.write(f"<li><a href='browse.html?q={escape(lang)}'>{len(gpubs):,} {escape(lang)}</a></li>\n")
    # Close the list before closing the nav; the <ul> was previously left open.
    f.write("</ul>\n")
    f.write("</nav>\n")
    # next_pages.extend(output_browser("", pubs, "", f))
    footer(f)

# Placeholder for static browser pages; next_pages is empty while the
# output_browser call above stays commented out.
for gk, gv, gpubs in tqdm(next_pages):
    # output_browser_page(f"/{gv}", gv, gpubs, set([gk]))
    pass

# List what was generated. Uses its own loop variable so the name of the page
# handle `f` is not reused for a path string.
for dist_path in glob.glob("../dist/*"):
    print(dist_path)

Writing /


0it [00:00, ?it/s]

../dist/index.html
../dist/browse.html
../dist/404.html
../dist/main.css
../dist/main.js
../dist/translator.html
../dist/fonts
../dist/p
