# Export ATF File Contents from CDLI

Downloads: https://cdli.ucla.edu/downloads
Languages: http://oracc.museum.upenn.edu/doc/help/languages/index.html

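Exports:

* `../data/translations.jsonl`: one JSON object per exported line, with the publication id (`p`), text area name (`a`), line index (`l`), and the line's text keyed by language code.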

In [10]:
import sys, os, io
import requests
import zipfile
from tqdm.notebook import tqdm
import json

In [2]:
# Destination of the export: the final cell writes one JSON object per line here.
output_json_path = "../data/translations.jsonl"

## Language Support

Make a list of old and modern languages that we're interested in.

In [3]:
# Ancient languages of interest, keyed by the code used for them in the data.
old_languages = {
    "akk": "Akkadian",
    "sux": "Sumerian",
    "qpn": "Proper Nouns",
    "arc": "Aramaic",
    "elx": "Elamite",
    "grc": "Greek",
    "peo": "Old Persian",
    "ug": "Ugaritic",
}

# Modern languages that translations may be written in.
modern_languages = {
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "it": "Italian",
}

# Every accepted code: each modern code as-is, plus each ancient code both
# bare and with a "ts" suffix (e.g. "akk" and "akkts").
language_codes = set(modern_languages) | {
    code + suffix for code in old_languages for suffix in ("", "ts")
}
language_codes

{'akk',
 'akkts',
 'arc',
 'arcts',
 'de',
 'elx',
 'elxts',
 'en',
 'es',
 'fr',
 'grc',
 'grcts',
 'it',
 'peo',
 'peots',
 'qpn',
 'qpnts',
 'sux',
 'suxts',
 'ug',
 'ugts'}

## Download CDLI

In [28]:
import cdli

In [29]:
# Load the publications through the project-local `cdli` helper module.
# NOTE(review): presumably this downloads and parses the CDLI ATF dump - confirm in cdli.py.
publications = cdli.get_publications()

100%|██████████████████████████████████████████████████████████████████████████████████| 3541243/3541243 [00:14<00:00, 250180.73it/s]


## Find Publications with Translations

In [30]:
# Drop the run of empty-text lines at the start of every text area, so each
# area begins at its first line whose text is non-empty (or becomes empty if
# no line has any text).
for pub in publications:
    for area in pub.text_areas:
        # Index of the first line with text; falls back to the line count
        # when every line is empty.
        first_filled = next(
            (pos for pos, entry in enumerate(area.lines) if len(entry.text) != 0),
            len(area.lines),
        )
        if first_filled > 0:
            area.lines = area.lines[first_filled:]

In [31]:
# Keep only publications that declare a language and for which
# has_translations() is true.
translated_publications = [
    pub
    for pub in publications
    if pub.language is not None and pub.has_translations()
]
len(translated_publications), "translated publications"

(5412, 'translated publications')

In [32]:
# Show the first translated publication to inspect its structure.
translated_publications[0]

Publication('P001282', 'qpc', [TextArea('tablet', []), TextArea('obverse', [TextLine('1.', '3(N01) 3(N08) , U8 UR2# UMBIN~a', {'en': '3 + 3 (lamb) ewes, ...;'}), TextLine('2.', '2(N01) , MASZ GURUSZDA MUSZ3~a', {'en': '2 goats, ...;'})]), TextArea('reverse', [])])

## Export them to JSON

In [37]:
# Texts treated as carrying no content: a source line or a translation equal
# to one of these is left out of the export.
ignore_texts = {"xxx", "", "(subscript)"}

In [38]:
# Flatten the translated publications into one record per text line.
# Each record holds the publication id ("p"), the text area name ("a"), the
# line index within that area ("l"), and the text keyed by language code.
translations = []

for pub in tqdm(translated_publications):
    source_code = pub.language
    # Source text is only exported when its language is one we track.
    keep_source = source_code in language_codes
    for area in pub.text_areas:
        if not area.has_translations():
            continue
        for line_index, text_line in enumerate(area.lines):
            # Translation languages on this line that we track.
            target_codes = [
                code for code in text_line.languages.keys() if code in language_codes
            ]
            source_text = text_line.text
            if source_text in ignore_texts:
                continue
            record = {"p": pub.id, "a": area.name, "l": line_index }
            if keep_source:
                record[source_code] = source_text
            for code in target_codes:
                rendered = text_line.languages[code]
                if rendered not in ignore_texts:
                    record[code] = rendered
            # A record starts with three bookkeeping keys; keep it only when
            # at least one text entry was added on top of those.
            if len(record) > 3:
                translations.append(record)

print(len(translations), "translations")
translations[:10]

  0%|          | 0/5412 [00:00<?, ?it/s]

97825 translations


[{'p': 'P001282', 'a': 'obverse', 'l': 0, 'en': '3 + 3 (lamb) ewes, ...;'},
 {'p': 'P001282', 'a': 'obverse', 'l': 1, 'en': '2 goats, ...;'},
 {'p': 'P001392',
  'a': 'column 1',
  'l': 0,
  'en': '3 (adult), 2 (child) slaves, female and male: ZATU693.KID;'},
 {'p': 'P001392',
  'a': 'column 1',
  'l': 1,
  'en': '1 male slave, ZATU693.3(N57);'},
 {'p': 'P001392', 'a': 'column 1', 'l': 2, 'en': '...'},
 {'p': 'P001684',
  'a': 'column 1',
  'l': 0,
  'en': '... 23 female slaves, inspected, ...;'},
 {'p': 'P001684', 'a': 'column 2', 'l': 0, 'en': '22 ... slaves, ...;'},
 {'p': 'P001684',
  'a': 'reverse',
  'l': 0,
  'en': '213(?) female and male slaves, ... .'},
 {'p': 'P003118',
  'a': 'column 1',
  'l': 0,
  'en': '1200 (ninda, ca. 6m), the (first) length,'},
 {'p': 'P003118', 'a': 'column 1', 'l': 1, 'en': '1200, the (second) length;'}]

In [39]:
# Write the records as JSON Lines: one UTF-8 encoded JSON object per line,
# each terminated by a single "\n" byte.
with open(output_json_path, "wb") as out_file:
    out_file.writelines(
        json.dumps(record).encode("utf8") + b"\n" for record in translations
    )