# Export ATF File Contents from CDLI

Downloads: https://cdli.ucla.edu/downloads
Languages: http://oracc.museum.upenn.edu/doc/help/languages/index.html

Exports:
    
* 

In [1]:
import sys, os, io
import requests
import zipfile
import tqdm
import json

In [2]:
output_json_path = "../data/translations.jsonl"

## Language Support

Make a list of old and modern languages that we're interested in.

In [3]:
old_languages = {
    "akk": "Akkadian",
    "sux": "Sumerian",
    "qpn": "Proper Nouns",
    "arc": "Aramaic",
    "elx": "Elamite",
    "grc": "Greek",
    "peo": "Old Persian",
    "ug": "Ugaritic",
}
modern_languages = {
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian"
}
language_codes = set(list(modern_languages.keys()) + [x + suffix for x in old_languages.keys() for suffix in ["","ts"]])
language_codes

{'akk',
 'akkts',
 'arc',
 'arcts',
 'de',
 'elx',
 'elxts',
 'en',
 'es',
 'fr',
 'grc',
 'grcts',
 'it',
 'peo',
 'peots',
 'qpn',
 'qpnts',
 'sux',
 'suxts',
 'ug',
 'ugts'}

## Download CDLI

In [4]:
atf_url = "https://cdli.ucla.edu/tools/cdlifiles/cdliatf_unblocked.zip"
atf_zip = zipfile.ZipFile(io.BytesIO(requests.get(atf_url).content))

In [5]:
atf_lines = str(atf_zip.open("cdliatf_unblocked.atf", "r").read(), "utf8").split("\n")
for l in atf_lines[:50]:
    print(l)

&P000001 = CDLI Lexical 000002, ex. 065
#atf: lang qpc 
@tablet 
@obverse 
@column 1 
$ beginning broken 
1'. 1(N01) , [...] 
>>Q000002 014 
2'. 1(N01) , ABGAL# 
>>Q000002 015 
3'. 1(N01) , KINGAL# 
>>Q000002 016 
@column 2 
$ beginning broken 
1'. 1(N01) , [...] 
>>Q000002 030 
2'. 1(N01) , GAL~a# UMUN2# 
>>Q000002 031 
3'. 1(N01) , GAL~a UMUN2 KU3~a 
>>Q000002 032 
@column 3 
$ beginning broken 
1'. 1(N01) , DUB~a SANGA~a# 
>>Q000002 048 
2'. 1(N01) , SUG5# SAG# 
>>Q000002 049 
3'. 1(N01) , UB SAG# 
>>Q000002 050 
@reverse 
1. [N] , [...] 
>>Q000002 colophon 
 

&P000002 = CDLI Lexical 000002, ex. 066
#atf: lang qpc 
@tablet 
@obverse 
@column 1 
$ beginning broken 
1'. [1(N01)] , [...] 
>>Q000002 013 
2'. [1(N01)] , GAL~a# SZITA~a1 
>>Q000002 014 
3'. [1(N01)] , ABGAL# 
>>Q000002 015 
$ rest broken 
@column 2 
$ beginning broken 
1'. [1(N01)] , [...] NIM~a#? 
>>Q000002 023 


In [6]:
class Publication():
    def __init__(self, id):
        self.id = id
        self.text_areas = list()
        self.language = None
    def __repr__(self):
        return f"Publication({repr(self.id)}, {repr(self.language)}, {repr(self.text_areas)})"
    def has_translations(self):
        return any(x.has_translations() for x in self.text_areas)
    
class TextArea():
    def __init__(self, name):
        self.name = name
        self.lines = list()
    def __repr__(self):
        return f"TextArea({repr(self.name)}, {repr(self.lines)})"
    def has_translations(self):
        return len(self.lines) > 0 and self.lines[0].has_translations()
    
class TextLine():
    def __init__(self, number, text):
        self.number = number
        self.text = text
        self.languages = dict()
    def __repr__(self):
        return f"TextLine({repr(self.number)}, {repr(self.text)}, {repr(self.languages)})"
    def has_translations(self):
        for k in self.languages:
            v = self.languages[k]
            if len(v) > 0:
                return True
        return False
    

In [19]:
publications = []
pub = None
text = None
tline = None
all_langs = set()

for line in tqdm.tqdm(atf_lines):
    line = line.replace("\t", " ").strip()
    if len(line) < 1:
        continue
    
    if line[0] == "&":
        pub = Publication(line[1:].split(" ", 1)[0])
        publications.append(pub)
    elif line[0] == "@":
        text = TextArea(line[1:])
        pub.text_areas.append(text)
    elif line[0].isdigit():
#         print(line)
        parts = line.split(" ", 1)
        number, t = parts if len(parts) == 2 else (line, "")
        t = " ".join(t.strip().split(" "))
        tline = TextLine(number, t)
        text.lines.append(tline)
    elif len(line) > 4 and line.startswith("#atf: lang "):
        lang = line.split(" ")[-1]
        all_langs.add(lang)
        pub.language = lang
    elif len(line) > 4 and line.startswith("#tr."):
        parts = line.split(":", 1)
        lang, t = parts if len(parts) == 2 else (line, "")
        lang = lang[4:]
        if lang == "ts" and pub.language is not None:
            lang = pub.language + "ts"
        all_langs.add(lang)
        t = " ".join(t.strip().split(" "))
        tline.languages[lang] = t
    else:
#         print("Unknown start:", line[0])
        pass

publications[:2]

100%|██████████| 3012008/3012008 [00:15<00:00, 190540.50it/s]


[Publication('P000001', 'qpc', [TextArea('tablet', []), TextArea('obverse', []), TextArea('column 1', [TextLine("1'.", '1(N01) , [...]', {}), TextLine("2'.", '1(N01) , ABGAL#', {}), TextLine("3'.", '1(N01) , KINGAL#', {})]), TextArea('column 2', [TextLine("1'.", '1(N01) , [...]', {}), TextLine("2'.", '1(N01) , GAL~a# UMUN2#', {}), TextLine("3'.", '1(N01) , GAL~a UMUN2 KU3~a', {})]), TextArea('column 3', [TextLine("1'.", '1(N01) , DUB~a SANGA~a#', {}), TextLine("2'.", '1(N01) , SUG5# SAG#', {}), TextLine("3'.", '1(N01) , UB SAG#', {})]), TextArea('reverse', [TextLine('1.', '[N] , [...]', {})])]),
 Publication('P000002', 'qpc', [TextArea('tablet', []), TextArea('obverse', []), TextArea('column 1', [TextLine("1'.", '[1(N01)] , [...]', {}), TextLine("2'.", '[1(N01)] , GAL~a# SZITA~a1', {}), TextLine("3'.", '[1(N01)] , ABGAL#', {})]), TextArea('column 2', [TextLine("1'.", '[1(N01)] , [...] NIM~a#?', {}), TextLine("2'.", '1(N01) , GAL~a SILA4~b', {}), TextLine("3'.", '1(N01) , GAL~a SZAB~a',

In [20]:
all_langs

{' en',
 ' inspections, sealed documents',
 ' they shall release them;',
 '1',
 '_akk_',
 '_logo',
 '_sux',
 'akk',
 'akkts',
 'arc',
 'asux',
 'ca',
 'de',
 'dk',
 'egy',
 'elx',
 'en',
 'en (he) said',
 'en (this weight at) five minas',
 'en (this weight at) one-half mina',
 'en For Nanna,',
 'en I personaly will [...] our gate [...]',
 'en Kurub-Isztar, his brother Aszszur-imitti, son of Ikkupija,',
 'en Puzur-szadue and Aszszur-makik, son of Luzina, were my witnesses.',
 'en Puzur-szadue and Aszszur-malik, son of Luzina,',
 'en When the iron and the textiles were sold in Aszszur-malik,',
 'en When the money was paid Ammurumbani, Szu-Kubum,',
 'en [...]',
 'en [...] in accordance with',
 'en and I will take [...]',
 'en and king of the four world quarters,',
 'en curtailment and deduction',
 'en dedicated (this).',
 'en had drawn away, his wife',
 'en he (Enki) had given to him regarding it,',
 'en his master,',
 'en king of Ur',
 'en of my partners [...] the iron [...]',
 'en of th

## Find Publications with Translations

In [21]:
for p in publications:
    for a in p.text_areas:
        n = len(a.lines)
        skip = 0
        while skip < n and len(a.lines[skip].text) == 0:
            skip += 1
        if skip > 0:
#             print("SKIP", skip, a)
            a.lines = a.lines[skip:]    

In [22]:
translated_publications = [x for x in publications if x.language is not None and x.has_translations()]
len(translated_publications), "translated publications"

(4035, 'translated publications')

In [23]:
translated_publications[0]

Publication('P001282', 'qpc', [TextArea('tablet', []), TextArea('obverse', [TextLine('1.', '3(N01) 3(N08) , U8 UR2# UMBIN~a', {'en': '3 + 3 (lamb) ewes, ...;'}), TextLine('2.', '2(N01) , MASZ GURUSZDA MUSZ3~a', {'en': '2 goats, ...;'})]), TextArea('reverse', [])])

## Export them to JSON

In [31]:
ignore_texts = set(["xxx", ""])

In [35]:
translations = []

for p in tqdm.tqdm(translated_publications):
    src_lang = p.language
    for a in p.text_areas:        
        if not a.has_translations():
            continue
        for iline, line in enumerate(a.lines):
            langs = list(line.languages.keys())
            src_text = line.text
            if src_text in ignore_texts:
                continue
            tr = {"p": p.id, "a": a.name, "l": iline }
            if src_lang in language_codes:
                tr[src_lang] = src_text
            for tgt_lang in langs:
                if tgt_lang in language_codes:
                    tgt_text = line.languages[tgt_lang] if tgt_lang in line.languages else ""
                    if not (tgt_text in ignore_texts):
                        tr[tgt_lang] = tgt_text
            if len(tr) > 3:
                translations.append(tr)

print(len(translations), "translations")
translations[0:10]

100%|██████████| 4035/4035 [00:00<00:00, 37397.09it/s]

74584 translations





[{'p': 'P001282', 'a': 'obverse', 'l': 0, 'en': '3 + 3 (lamb) ewes, ...;'},
 {'p': 'P001282', 'a': 'obverse', 'l': 1, 'en': '2 goats, ...;'},
 {'p': 'P001392',
  'a': 'column 1',
  'l': 0,
  'en': '3 (adult), 2 (child) slaves, female and male: ZATU693.KID;'},
 {'p': 'P001392',
  'a': 'column 1',
  'l': 1,
  'en': '1 male slave, ZATU693.3(N57);'},
 {'p': 'P001392', 'a': 'column 1', 'l': 2, 'en': '...'},
 {'p': 'P003118',
  'a': 'column 1',
  'l': 0,
  'en': '1200 (ninda, ca. 6m), the (first) length,'},
 {'p': 'P003118', 'a': 'column 1', 'l': 1, 'en': '1200, the (second) length;'},
 {'p': 'P003118', 'a': 'column 2', 'l': 0, 'en': '930, the (first) width,'},
 {'p': 'P003118', 'a': 'column 2', 'l': 1, 'en': '870, the (second) width.'},
 {'p': 'P003118', 'a': 'column 1', 'l': 0, 'en': '990, the (first) length,'}]

In [36]:
with open(output_json_path, "wb") as f:
    for t in translations:
        f.write(bytes(json.dumps(t), "utf8"))
        f.write(b"\n")