# Make Dataset

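Builds a train/test index of the Akkadian (`akk`) and Sumerian (`sux`) CDLI publication IDs that have English translations, and saves it as JSON.

Based on: https://huggingface.co/docs/transformers/tasks/translation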

In [1]:
import sys, os, datetime
import json
import torch
import random
import glob
from tqdm.notebook import tqdm
# from transformers import AutoTokenizer
# from transformers import DataCollatorForSeq2Seq
# from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# from transformers import TranslationPipeline
from datasets import load_dataset, Dataset
import cdli
import languages

In [2]:
# Ask the Hugging Face `tokenizers` library not to use its own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Language codes of the source texts handled by this notebook.
source_langs = {"akk", "sux"}

# Translation languages to keep; a wider candidate set would be
# {"en", "it", "es", "fr", "de"}.
target_langs = {"en"}

# Not referenced by the cells visible here -- presumably shared settings for
# the later training notebooks (TODO confirm).
model_max_length = 512
paragraphs = True

# Publications that must always be placed in the test split.
test_publication_ids = {"P393923"}

# Destination of the JSON train/test index.
output_path = "../data/dataset_index.json"


## Load All Publications

In [3]:
# Load every publication through the project-local `cdli` helper. Per the
# saved cell output this downloads the CDLI ATF dump from GitHub and parses it.
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [4]:
# Sanity check: how many publications were loaded.
len(publications), "publications"

(134712, 'publications')

## Find publications with English translations

In [5]:
def target_ok(target_text):
    """Return True when ``target_text`` looks like a real translation.

    A text is rejected when it is empty or when, after removing plain
    spaces, it is made of fewer than two distinct characters (for example
    ``"xx xxx x"`` or ``".. . .. "``, which are placeholder fillers).
    """
    if not len(target_text):
        return False
    # Only the ASCII space is stripped; other whitespace counts as a character.
    distinct_chars = set(target_text.replace(" ", ""))
    return len(distinct_chars) >= 2
    

def test_target_ok(text):
    """Print the ``target_ok`` verdict for ``text`` followed by its repr."""
    print(target_ok(text), repr(text))
    
# Spot-check the filter on empty, blank, filler and genuine inputs.
for sample in ("", " ", "xx xxx x", ".. . .. ", "Hi"):
    test_target_ok(sample)

False ''
False ' '
False 'xx xxx x'
False '.. . .. '
True 'Hi'


In [6]:
def pub_has_text(pub):
    """Tell whether a publication has at least one usable English line.

    Walks every line of every text area of ``pub`` and returns True as soon
    as one line has non-empty source text and an ``"en"`` entry in its
    ``languages`` mapping that passes ``target_ok``; otherwise returns False.
    """
    return any(
        len(line.text) > 0
        and "en" in line.languages
        and target_ok(line.languages["en"])
        for area in pub.text_areas
        for line in area.lines
    )

In [7]:
# Publications tagged "akk" that carry at least one usable English line.
akk_pubs = [
    pub
    for pub in publications
    if pub.language == "akk" and pub_has_text(pub)
]
print(len(akk_pubs), "akk pubs")

978 akk pubs


In [8]:
# Eyeball the first matching "akk" publication to see its structure.
akk_pubs[0]

Publication('P464358', 'akk', [TextArea('object composite text', []), TextArea('surface a', []), TextArea('prologue', [TextLine('1.', 'i3-nu an s,i-ru-um', {'akkts': 'īnu anum ṣīrum', 'en': 'When the august Anu,'}), TextLine('2.', '_lugal_ {d}a-nun-na-ki', {'akkts': 'šar anunnakī', 'en': 'king of the Anunnaku,'}), TextLine('3.', '{d}en-lil2', {'akkts': 'enlil', 'en': 'and Enlil,'}), TextLine('4.', 'be-el sza-me-e', {'akkts': 'bēl šamê', 'en': 'lord of heaven'}), TextLine('5.', 'u3 er-s,e-tim', {'akkts': 'u erṣetim', 'en': 'and earth,'}), TextLine('6.', 'sza-i-im', {'akkts': 'šā’im', 'en': 'who determines'}), TextLine('7.', 'szi-ma-at _kalam_', {'akkts': 'šīmāt mātim', 'en': 'the destinies of the land,'}), TextLine('8.', 'a-na {d}marduk', {'akkts': 'ana marduk', 'en': 'to Marduk,'}), TextLine('9.', '_dumu_ re-esz-ti-im', {'akkts': 'mārim rēštîm', 'en': 'the firstborn son'}), TextLine('10.', 'sza {d}en-ki', {'akkts': 'ša ea', 'en': 'of Ea,'}), TextLine('11.', '{d}en-lil2-ut', {'akkts': '

In [9]:
# Publications tagged "sux" that carry at least one usable English line.
sux_pubs = [
    pub
    for pub in publications
    if pub.language == "sux" and pub_has_text(pub)
]
print(len(sux_pubs), "sux pubs")

4149 sux pubs


In [10]:
# Eyeball the first matching "sux" publication to see its structure.
sux_pubs[0]

Publication('P010481', 'sux', [TextArea('tablet', []), TextArea('obverse', []), TextArea('column 1', [TextLine('1.', '2(u@c) 2(asz@c) uruda ma-na', {'en': '22 mana copper:'}), TextLine('2.', 'sa10 GAN2', {'en': '(this is) the price of the field;'}), TextLine('3.', '1(esze3@c) 2(iku@c) GAN2-bi', {'en': 'its surface (is) 8 iku;'}), TextLine('4.', '1(u@c) 6(asz@c) uruda ma-na', {'en': '16 mana copper:'}), TextLine('5.', 'nig2-diri', {'en': '(this is) the additional payment;'}), TextLine('6.', '2(asz@c) sze lid2-ga', {'en': '2 lidga-measures of barley:'}), TextLine('7.', 'nig2-ba', {'en': '(this is) the gift;'}), TextLine('8.', '1(asz@c) tug2 szu', {'en': '1 šu-garment,'})]), TextArea('column 2', [TextLine('1.', '1(asz@c) me-gal2 tug2', {'en': '1 megal-garment:'}), TextLine('2.', 'tug2', {'en': '(this is) the garment (payment);'}), TextLine('3.', '4(u@c) ninda', {'en': '40 breads,'}), TextLine('4.', '1(gesz2@c) gug2', {'en': '60 cakes,'}), TextLine('5.', '6(asz@c) kur2 tu7(|HIxASZ@c|)', {'

## Train and test split

In [19]:
def train_test_split(pubs, test_percent=10, seed=None, forced_test_ids=None):
    """Randomly partition publications into train and test id lists.

    Args:
        pubs: Iterable of objects exposing an ``id`` attribute.
        test_percent: Chance, in percent, that a publication is drawn into
            the test set. Defaults to 10, the value that used to be
            hard-coded.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the split, making it reproducible without touching the
            global generator. When ``None`` the module-level ``random``
            functions are used, exactly as before.
        forced_test_ids: Container of ids that always go to the test set.
            Defaults to the notebook-level ``test_publication_ids``.

    Returns:
        ``{"train": [...], "test": [...]}`` where each value is a sorted
        list of publication ids. The two sizes are also printed.
    """
    rng = random if seed is None else random.Random(seed)
    if forced_test_ids is None:
        forced_test_ids = test_publication_ids

    shuffled = list(pubs)
    rng.shuffle(shuffled)

    train = []
    test = []
    for pub in shuffled:
        # Draw before checking the forced ids (same order as the original
        # expression) so one random number is consumed per publication.
        drawn = rng.uniform(0, 100) < test_percent
        if drawn or pub.id in forced_test_ids:
            test.append(pub.id)
        else:
            train.append(pub.id)

    print(len(train), "train")
    print(len(test), "test")
    return {"train": sorted(train), "test": sorted(test)}

# Seed the global generator right before splitting so that
# Restart Kernel -> Run All reproduces the same train/test index.
split_seed = 42
random.seed(split_seed)

print("akk")
akk_split = train_test_split(akk_pubs)
print("sux")
sux_split = train_test_split(sux_pubs)

akk
870 train
108 test
sux
3753 train
396 test


In [20]:
# Confirm the pinned publication ids really ended up in the akk test split.
[pub_id for pub_id in akk_split["test"] if pub_id in test_publication_ids]

['P393923']

In [21]:
# One entry per source language, each holding its train/test id lists.
output_obj = dict(akk=akk_split, sux=sux_split)
output_obj.keys()

dict_keys(['akk', 'sux'])

In [22]:
# Serialise the split index to a JSON string.
output_json = json.dumps(output_obj)
# Preview the first 100 characters.
# NOTE(review): the saved output of this cell starts the akk train list with
# an empty-string id ("") -- verify whether some publication is parsed
# without an id upstream.
output_json[:100]

'{"akk": {"train": ["", "P212432", "P212952", "P215544", "P216437", "P216609", "P216611", "P216615", '

In [23]:
# Persist the serialised index at the configured location.
with open(output_path, mode="wt") as index_file:
    index_file.write(output_json)