# Make Web Site

In [1]:
import sys, os, io, datetime
import json
import random
import requests
import zipfile
import glob
import pandas as pd
from slugify import slugify
from tqdm.notebook import tqdm
from html import escape

In [2]:
import cdli

In [3]:
# Lift pandas' column display limit so wide DataFrames are shown with every column.
pd.set_option("display.max_columns", None)

In [4]:
# Language codes of the publications that are kept for site generation.
supported_langs = {"akk"}

In [5]:
# Absolute path of the site output directory ("../dist" relative to the working directory).
wwwroot = os.path.abspath("../dist")
# Create the directory if needed; exist_ok keeps re-running this cell from failing.
os.makedirs(wwwroot, exist_ok=True)
# Display the resolved path in the notebook output.
wwwroot

'/Users/fak/Dropbox/Projects/CuneiformTranslators/dist'

## Download the Catalog

In [None]:
# Load the CDLI catalog through the project's cdli helper module.
# NOTE(review): presumably returns a pandas DataFrame (it is indexed by column below) - confirm in cdli.
cat = cdli.get_catalog()

Downloading https://github.com/cdli-gh/data/raw/master/cdli_cat.csv


In [None]:
# Display the catalog in the notebook output.
cat

In [None]:
# Size of the catalog (presumably the number of rows, one per catalog entry - confirm).
len(cat)

In [None]:
# List the catalog's column names.
cat.columns

https://cdli.ucla.edu/search/archival_view.php?ObjectID=P256681

In [None]:
# Select the catalog row(s) whose id_text is 256681 (spot check of a single entry).
cat[cat["id_text"]==256681]

## Get the Human Transliterations

In [None]:
# Fetch the ATF publications through the project's cdli helper module.
# NOTE(review): presumably downloads and parses the CDLI ATF dump - confirm in cdli.get_atf.
all_publications = cdli.get_atf()

In [None]:
# Combine the ATF publications with the catalog; tqdm is passed in so the helper can report progress.
# NOTE(review): the attributes read later (language, genre, period, ...) presumably come from this merge - confirm.
print("Merging transliterations with catalog")
merged_pubs = cdli.merge_atf_with_catalog(all_publications, cat, tqdm)

In [None]:
# Keep only the publications written in one of the supported languages.
output_pubs = list(
    filter(lambda pub: pub.language in supported_langs, merged_pubs)
)
len(output_pubs)

## Get the ML Translations

In [None]:
# Load the machine translations produced offline. The JSON file maps a
# direction key (e.g. "akk_to_en") to a dict of source line -> translation.
translations_json_path = "../data/ml_translations.json"
# Use a context manager so the file handle is closed deterministically, and
# let json.load decode the UTF-8 text instead of decoding the bytes by hand.
with open(translations_json_path, "r", encoding="utf8") as translations_file:
    translations = json.load(translations_file)
# Only the Akkadian -> English direction is used by this notebook.
translations = translations["akk_to_en"]
len(translations)

In [None]:
# Summary counts: catalog size, publications kept for output, and how many of
# those report existing translations via has_translations().
num_publications = len(cat)
num_transliterations = len(output_pubs)
num_translations = sum(1 for pub in output_pubs if pub.has_translations())
print("    num_publications:", num_publications)
print("num_transliterations:", num_transliterations)
print("    num_translations:", num_translations)

## Find Publications we have translations for

In [None]:
src_lang = "akk"
tgt_lang = "en"

# translated_pubs: every non-empty line has an ML translation available.
# newly_translated_pubs: subset where at least one non-empty line had no
# existing target-language translation.
translated_pubs = []
newly_translated_pubs = []

ml_key = "ml_" + tgt_lang

for pub in tqdm(output_pubs):
    if pub.language != src_lang:
        continue

    lacks_existing_translation = False
    missing_ml_translation = False

    for area in pub.text_areas:
        for line in area.lines:
            text = line.text
            if len(text) == 0:
                continue
            if tgt_lang not in line.languages:
                lacks_existing_translation = True
            if text in translations:
                # Attach the ML translation to the line under the "ml_<lang>" key.
                line.languages[ml_key] = translations[text]
            else:
                # Keep scanning so the remaining lines still get their
                # translations attached, but remember the gap.
                missing_ml_translation = True

    if missing_ml_translation:
        continue

    translated_pubs.append(pub)
    if lacks_existing_translation:
        newly_translated_pubs.append(pub)

print(len(translated_pubs), "translated_pubs")
print(len(newly_translated_pubs), "newly_translated_pubs")

In [None]:
# Inspect the first newly translated publication (raises IndexError if the list is empty).
newly_translated_pubs[0]

## Output the HTML

In [None]:
def get_file_path(site_path):
    """Return the filesystem path for *site_path* under the output root.

    *site_path* is appended to ``wwwroot`` as-is, so it is expected to
    start with a slash (e.g. ``"/new"``).
    """
    return "{}{}".format(wwwroot, site_path)

def get_page_file_path(site_path):
    """Return the filesystem path of the ``.html`` page for *site_path*."""
    base_path = get_file_path(site_path)
    return base_path + ".html"
    

In [None]:
# Stylesheet text shared by the generated pages; header() inlines it in a <style> element.
style = """
body { font-family: sans-serif; }
div.content { margin: 0 auto; max-width: 512px; }
body h1 { text-align: center; }
p.src { font-size: 80%; font-style: italic;}
.otitle { text-align: center; }
h1.otitle { margin-top: 1em; padding-top: 1em; border-top:solid 2px rgba(128,128,128,0.5); }
"""

In [None]:
def header(title, f):
    """Write the opening HTML of a page to *f*.

    Emits the ``<head>`` (title, viewport meta tag, inline stylesheet) and
    opens the ``<body>`` and the ``content`` div, which footer() closes.
    *title* is HTML-escaped and used for both ``<title>`` and the page ``<h1>``.
    """
    safe_title = escape(title)
    f.write(
        "<html>\n<head>\n"
        f"<title>{safe_title}</title>\n"
        "<meta name='viewport' content='width=device-width, initial-scale=1'>\n"
        f"<style>{style}</style></head>\n"
        "<body>\n"
        f"<div class='content'><h1>{safe_title}</h1>\n"
    )
    
def footer(f):
    """Write the closing tags for the content div, body and document to *f*."""
    closing_markup = "</div></body>\n</html>"
    f.write(closing_markup)

In [None]:


def output_pub(p, f, *, show_lines=False):
    """Write one publication to *f* as an HTML fragment.

    The fragment has a heading (zero-padded P-number, genres, object type),
    the period, and one ``<section>`` per non-empty text area.

    Args:
        p: Publication object; reads ``id``, ``genre``, ``object_type``,
            ``period`` and ``text_areas``. Each area provides ``name`` and
            ``lines``; each line provides ``text`` and a ``languages`` mapping.
        f: Writable text file object.
        show_lines: When true, render each area as a two-column table
            (source line / ML translation) instead of two paragraphs.
            Defaults to False, the previously hard-coded behaviour.
    """
    genres = " and ".join(cdli.get_genres(p.genre))
    object_type = cdli.get_object_type(p.object_type)
    # Catalog-derived text is escaped like every other field written here.
    # Formatting into a string first keeps a missing value rendering as text
    # (e.g. "None"), as before, instead of making escape() raise.
    heading = escape(f"P{p.id:06}: {genres} {object_type}")
    f.write(f"<h1 class='otitle'>{heading}</h1>\n")
    f.write(f"<p class='otitle'>{escape(str(p.period))}</p>\n")

    for area in p.text_areas:
        if not area.lines:
            continue
        f.write("<section>\n")
        f.write(f"<h1>{escape(area.name)}</h1>\n")
        if show_lines:
            f.write("<table>")
            for line in area.lines:
                f.write("<tr>")
                f.write(f"<td>{escape(line.text)}</td>\n")
                if "ml_en" in line.languages:
                    f.write(f"<td>{escape(line.languages['ml_en'])}</td>\n")
                else:
                    f.write("<td></td>\n")
                f.write("</tr>")
            f.write("</table>")
        else:
            # Join all lines into one source paragraph and one translation
            # paragraph; lines without an ML translation contribute a blank.
            source_text = " ".join(line.text for line in area.lines)
            target_text = " ".join(
                line.languages.get("ml_en", " ") for line in area.lines
            )
            f.write(f"<p class='src'>{escape(source_text)}</p>\n")
            f.write(f"<p class='tgt'>{escape(target_text)}</p>\n")
        f.write("</section>\n")

# All pages are written as UTF-8 explicitly: without an encoding argument,
# open() uses the platform's locale encoding, which can raise
# UnicodeEncodeError or produce mis-encoded pages for non-ASCII text.

# /new: every newly translated publication except administrative texts.
print("Writing /new")
with open(get_page_file_path("/new"), "wt", encoding="utf-8") as f:
    header("ML Translations", f)
    f.write("<p class='otitle'>Translated by <a href=\"https://huggingface.co/praeclarum/cuneiform\">praeclarum/cuneiform</a></p>\n")
    newly_translated_pubs.sort(key=lambda a: a.id)
    for p in tqdm(newly_translated_pubs):
        # Administrative texts are deliberately left off this page.
        if "administrative" in cdli.get_genres(p.genre):
            continue
        f.write("<section>\n")
        output_pub(p, f)
        f.write("</section>\n")
    footer(f)

# /404: static "not found" page.
print("Writing /404")
with open(get_page_file_path("/404"), "wt", encoding="utf-8") as f:
    header("Not Found", f)
    f.write("<p>The page you are looking for is not here.</p>\n")
    footer(f)

# /: landing page linking to the list of new translations.
print("Writing /")
with open(get_page_file_path("/index"), "wt", encoding="utf-8") as f:
    header("ML Cuneiform Translations", f)
    f.write("<a href=\"/new\">New Translations</a>\n")
    footer(f)

In [None]:
# Order the fully translated publications by id and derive each one's page path.
# NOTE: nothing is written here; the loop only computes the destination path.
translated_pubs.sort(key=lambda entry: entry.id)
for pub in tqdm(translated_pubs):
    page_site_path = "/p{:06}".format(pub.id)
    page_file_path = get_page_file_path(page_site_path)



In [None]:
# List everything that ended up in the output directory.
for dist_entry in glob.glob("../dist/*"):
    print(dist_entry)