<a href="https://colab.research.google.com/github/skolachi/rongorongo/blob/master/rongorongo_scriptanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the two concordance archives (concord1.zip, concord2.zip) from kohaumotu.org
# into the current working directory.
!wget http://kohaumotu.org/rongorongo_org/concord/concord1.zip
!wget http://kohaumotu.org/rongorongo_org/concord/concord2.zip

In [None]:
# Unpack both archives into rongorongo/ and merge every .CCD file into a single
# corpus file, then report its line count.
# Written to survive Restart & Run All: `mkdir -p` tolerates an existing directory,
# `unzip -o` overwrites without prompting, and the merged file is deleted first so
# that it is neither matched by the *.CCD glob nor appended to a second time.
!mkdir -p rongorongo
!unzip -o concord1.zip -d rongorongo/
!unzip -o concord2.zip -d rongorongo/
!rm -f concord1.zip concord2.zip
!rm -f rongorongo/fullconcordance.CCD
!cat rongorongo/*.CCD > rongorongo/fullconcordance.CCD
!wc -l rongorongo/fullconcordance.CCD

In [None]:
import re
def read_corpus(corpusfile):
  """Return the text part of every concordance entry found in ``corpusfile``.

  An entry is introduced by a reference of the form ``Aa01.001:`` (one uppercase
  letter, one lowercase letter, two digits, a dot, three digits, a colon). The
  value returned for it is everything after the colon up to the end of the line.
  The reference is not anchored to the start of a line, so it is also recognised
  in the middle of one.

  Args:
    corpusfile: Path of the concordance file to read.

  Returns:
    A list of strings, one per matched entry, in file order.
  """
  # Use a context manager so the file handle is closed deterministically
  # instead of being left open until garbage collection.
  with open(corpusfile) as f:
    content = f.read()
  return re.findall(r'[A-Z][a-z][0-9]{2}\.[0-9]{3}:([^\n]*)', content)

In [None]:
# Parse the merged concordance file into one string per entry.
corpus = read_corpus('rongorongo/fullconcordance.CCD')
# Only the last expression of a cell is displayed, so the entry count has to be
# printed explicitly; a bare `len(corpus)` in the middle of the cell is discarded.
print(len(corpus))
corpus[:10]

In [None]:
# Characters treated as separators between signs in a concordance line: - . : ' *
samp = ''  # NOTE(review): not used in any visible cell; kept in case other code reads it
def line2sign(line):
  """Convert one concordance line into a list of zero-padded sign codes.

  Processing steps:
    1. Every parenthesised single-digit range such as ``(1-2)`` is replaced by
       the placeholder code ``000``.
    2. The line is split on the separator characters ``- . : ' *``.
    3. For each non-empty piece, everything except ASCII digits is removed and
       the remaining digits are left-padded with zeros to at least three
       characters. A piece without any digit therefore becomes ``000``; a piece
       with more than three digits is kept at full length.

  Args:
    line: Text of one concordance entry.

  Returns:
    A list of sign-code strings in the order they occur in ``line``.
  """
  signs = []
  # Raw string: `\(`, `\-` and `\)` are invalid escape sequences in a normal
  # string literal and raise a SyntaxWarning on Python 3.12+.
  line = re.sub(r'\([0-9]-[0-9]\)', '-000-', line)
  for chunk in re.split(r"[-.:'*]", line):
    if chunk != '':
      digits = re.sub(r'[^0-9]', '', chunk)
      # zfill pads on the left to width 3 and leaves longer strings untouched.
      signs.append(digits.zfill(3))

  return signs

In [None]:
# Write the corpus to disk, one entry per line, as space-separated sign codes.
with open('sign_sequences.txt','w') as out_file:
  out_file.writelines(
    '{}\n'.format(' '.join(line2sign(entry))) for entry in corpus
  )

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

In [None]:
import os
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from tokenizers import Tokenizer
from transformers import BertTokenizerFast
from tokenizers.models import Unigram, WordLevel, WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import decoders

In [None]:
def train_tokenizer():
  """Train a WordPiece tokenizer on the sign sequences and save its model files.

  Trains a ``BertWordPieceTokenizer`` on ``sign_sequences.txt`` with
  ``min_frequency=2`` and the five BERT special tokens, then writes the result
  into the ``rongorongoLM`` directory, creating that directory if it is missing.
  """
  special_tokens = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
  output_dir = "rongorongoLM"

  wp_tokenizer = BertWordPieceTokenizer()
  wp_tokenizer.pre_tokenizer = Whitespace()
  wp_tokenizer.decoder = decoders.WordPiece()

  wp_tokenizer.train(
    files=['sign_sequences.txt'],
    min_frequency=2,
    special_tokens=special_tokens,
  )

  # Create the output directory only when it does not exist yet.
  os.makedirs(output_dir, exist_ok=True)
  wp_tokenizer.save_model(output_dir)

In [None]:
# Train the tokenizer and save its files to rongorongoLM/, then reload those files
# as a transformers fast tokenizer. The order matters: the directory must be
# populated before from_pretrained reads it.
train_tokenizer()
tokenizer = BertTokenizerFast.from_pretrained("rongorongoLM")

In [None]:
from transformers import LineByLineTextDataset

# Build the training dataset from the sign-sequence file using the tokenizer
# loaded from rongorongoLM/.
# NOTE(review): block_size=128 is presumably the maximum number of tokens kept per
# line, and LineByLineTextDataset appears to be deprecated in recent transformers
# releases -- confirm both against the installed version.
dataset = LineByLineTextDataset(
  tokenizer=tokenizer,
  file_path="./sign_sequences.txt",
  block_size=128,
)


In [None]:
from transformers import BertConfig

# Small BERT configuration: 3 hidden layers, 6 attention heads, one token type.
# vocab_size is taken from the trained tokenizer; without it BertConfig falls back
# to its default vocabulary size (30522), which oversizes the embedding matrix and
# the MLM output layer and lets the model predict ids the tokenizer does not have.
config = BertConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=514,
    num_attention_heads=6,
    num_hidden_layers=3,
    type_vocab_size=1,
)

In [None]:
from transformers import BertForMaskedLM

# Build a masked-language model directly from `config` (not via from_pretrained).
model = BertForMaskedLM(config=config)

# Report the size of the model.
print("Number of model parameters: ",model.num_parameters())

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator used for masked-language-model training (mlm=True).
# NOTE(review): mlm_probability=0.5 is well above the 0.15 commonly used for BERT;
# presumably deliberate for this small corpus -- confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.5
)

In [None]:
from transformers import Trainer, TrainingArguments

def train_lm():
    """Train the masked-language model and save it to ``./rongorongoLM``.

    Relies on the module-level names ``model``, ``data_collator`` and
    ``dataset``, which must be defined before this function is called.
    Runs one epoch with a batch size of 128 per device and writes the final
    model to ``./rongorongoLM``, the directory also used as ``output_dir``.
    """
    training_args = TrainingArguments(
        output_dir="./rongorongoLM",
        overwrite_output_dir=True,
        num_train_epochs=1,
        # Replaces the deprecated per_gpu_train_batch_size, same value.
        per_device_train_batch_size=128,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    # Persist the final weights and config next to the tokenizer files.
    trainer.save_model("./rongorongoLM")

In [None]:
# Run training; train_lm also saves the trained model to ./rongorongoLM.
train_lm()

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline, loading both the model and the tokenizer from
# ./rongorongoLM.
fill_mask = pipeline(
  "fill-mask",
  model="./rongorongoLM",
  tokenizer="./rongorongoLM"
)

In [None]:
# Show the raw text of corpus entry 10 next to its space-separated sign-code form.
corpus[10], ' '.join(line2sign(corpus[10]))

In [None]:
# Query the pipeline with a sign sequence in which two positions are replaced by [MASK].
# NOTE(review): input with more than one [MASK] is presumably only accepted by newer
# transformers releases -- confirm against the installed version.
fill_mask("004 000 000 004 [MASK] 000 000 001 000 [MASK] 008 600 001 385 001 000 059 022 022 008")