# Export ATF File Contents from CDLI

Downloads: https://cdli.ucla.edu/downloads
Languages: http://oracc.museum.upenn.edu/doc/help/languages/index.html

Exports the human translations to `translations.jsonl`, which is used by TrainTranslator to train the neural network.

In [1]:
import sys, os, io
import requests
import zipfile
from tqdm.notebook import tqdm
import json

In [2]:
# Destination of the exported JSON Lines file (written by the final cell).
output_json_path = "../data/translations.jsonl"

## Language Support

Make a list of old and modern languages that we're interested in.

In [3]:
# Source (ancient) languages, keyed by language code.
old_languages = {
    "akk": "Akkadian",
    "sux": "Sumerian",
    "qpn": "Proper Nouns",
    "arc": "Aramaic",
    "elx": "Elamite",
    "grc": "Greek",
    "peo": "Old Persian",
    "ug": "Ugaritic",
}
# Modern languages, keyed by language code.
modern_languages = {
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
}
# Every modern code, plus each old code both bare and with a "ts" suffix.
language_codes = set(modern_languages) | {
    code + suffix for code in old_languages for suffix in ("", "ts")
}
language_codes

{'akk',
 'akkts',
 'arc',
 'arcts',
 'de',
 'elx',
 'elxts',
 'en',
 'es',
 'fr',
 'grc',
 'grcts',
 'it',
 'peo',
 'peots',
 'qpn',
 'qpnts',
 'sux',
 'suxts',
 'ug',
 'ugts'}

## Download CDLI

In [4]:
import cdli

In [5]:
# Load the CDLI ATF data through the cdli module; the cell output shows it
# downloading the ATF file and then parsing it.
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [6]:
# Display how many publications were loaded.
len(publications), "publications"

(134560, 'publications')

## Find Publications with Human Translations

In [7]:
# for p in publications:
#     for a in p.text_areas:
#         n = len(a.lines)
#         skip = 0
#         while skip < n and len(a.lines[skip].text) == 0:
#             skip += 1
#         if skip > 0:
# #             print("SKIP", skip, a)
#             a.lines = a.lines[skip:]    

In [8]:
# Keep only publications that have a language set and report having translations.
translated_publications = [
    publication
    for publication in publications
    if publication.language is not None and publication.has_translations()
]
len(translated_publications), "translated publications"

(5413, 'translated publications')

In [9]:
# Show the first translated publication to inspect its structure.
translated_publications[0]

Publication('P001282', 'qpc', [TextArea('tablet', []), TextArea('obverse', [TextLine('1.', '3(N01) 3(N08) , U8 UR2# UMBIN~a', {'en': '3 + 3 (lamb) ewes, ...;'}), TextLine('2.', '2(N01) , MASZ GURUSZDA MUSZ3~a', {'en': '2 goats, ...;'})]), TextArea('reverse', [])])

## Merge Lines to Form Full Sentences

In [11]:
# Quick check of genre parsing on a messy input string; the cell output
# shows the resulting set of normalized genre names.
cdli.get_genres(" fake; poTTery, other; royal/votive")

{'fake', 'other-genre', 'pottery', 'royal/votive'}

In [12]:
# Placeholder texts that carry no content and are skipped during export.
ignore_texts = {"xxx", "", "(subscript)"}

In [13]:
def get_publication_raw_lines(p):
    """Build one record per usable line of publication ``p``.

    Text areas without translations are skipped, as are lines whose source
    text is listed in ``ignore_texts``.

    Each record starts with the keys ``"p"`` (publication id), ``"a"``
    (text area name) and ``"l"`` (index of the line within its area). The
    source text is stored under the publication's language code when that
    code is in ``language_codes``. Each translation of the line is stored
    under its own code when the code is in ``language_codes`` and the text
    is not in ``ignore_texts``.

    A record is kept only when it holds more than four keys, i.e. at least
    two language texts next to the three bookkeeping keys. Kept records get
    a ``None`` entry for every remaining code in ``language_codes``.

    Returns:
        The list of kept records, in area order and then line order.
    """
    records = []
    source_code = p.language
    for area in p.text_areas:
        if not area.has_translations():
            continue
        for index, line in enumerate(area.lines):
            candidates = line.languages.items()
            source_text = line.text
            if source_text in ignore_texts:
                continue
            record = {"p": p.id, "a": area.name, "l": index}
            if source_code in language_codes:
                record[source_code] = source_text
            for code, text in candidates:
                if code in language_codes and text not in ignore_texts:
                    record[code] = text
            # Three bookkeeping keys plus at least two language texts.
            if len(record) <= 4:
                continue
            # Pad so every record exposes the same set of language keys.
            for code in language_codes:
                record.setdefault(code, None)
            records.append(record)
    return records

## Export them to JSON

In [14]:
# Flatten the per-publication line records into a single list, with a
# progress bar over the publications.
translations = [
    record
    for publication in tqdm(translated_publications)
    for record in get_publication_raw_lines(publication)
]

print(len(translations), "translations")
translations[:10]

  0%|          | 0/5413 [00:00<?, ?it/s]

91566 translations


[{'p': 'P010481',
  'a': 'column 1',
  'l': 0,
  'sux': '2(u@c) 2(asz@c) uruda ma-na',
  'en': '22 mana copper:',
  'arcts': None,
  'de': None,
  'suxts': None,
  'arc': None,
  'peo': None,
  'akkts': None,
  'ugts': None,
  'qpn': None,
  'peots': None,
  'ug': None,
  'grcts': None,
  'qpnts': None,
  'elxts': None,
  'it': None,
  'akk': None,
  'elx': None,
  'es': None,
  'fr': None,
  'grc': None},
 {'p': 'P010481',
  'a': 'column 1',
  'l': 1,
  'sux': 'sa10 GAN2',
  'en': '(this is) the price of the field;',
  'arcts': None,
  'de': None,
  'suxts': None,
  'arc': None,
  'peo': None,
  'akkts': None,
  'ugts': None,
  'qpn': None,
  'peots': None,
  'ug': None,
  'grcts': None,
  'qpnts': None,
  'elxts': None,
  'it': None,
  'akk': None,
  'elx': None,
  'es': None,
  'fr': None,
  'grc': None},
 {'p': 'P010481',
  'a': 'column 1',
  'l': 2,
  'sux': '1(esze3@c) 2(iku@c) GAN2-bi',
  'en': 'its surface (is) 8 iku;',
  'arcts': None,
  'de': None,
  'suxts': None,
  'arc': N

In [27]:
# Write the records as JSON Lines: one UTF-8 encoded JSON object per line.
with open(output_json_path, "wb") as out_file:
    out_file.writelines(
        json.dumps(record).encode("utf8") + b"\n" for record in translations
    )