The aim of this notebook is to transform an ancient Greek Dictionary Morpheus (https://github.com/gcelano/LemmatizedAncientGreekXML/tree/master/Morpheus) into a form of lookup table suitable for `spacy train`, e.g. the one present in Spacy (modern) Greek language model.

# Requirements

In [2]:
import os
import json
import pandas as pd
import numpy as np
import json
import unicodedata
import requests
import re
from importlib import reload
import pyconll # universal dependencies parser
import zipfile
import io
from bs4 import BeautifulSoup


import sddk

#from anda import gr
# OR uncomment the following:
#script_url = "https://raw.githubusercontent.com/sdam-au/anda_py/master/anda/gr.py"
#exec(requests.get(script_url).content)

In [14]:
# first try to read the morpheus dict from our local data harmonia:
try:
    with open("../data/morpheus_dict.json") as json_file:
        morpheus_dict = json.load(json_file)
# if not available:
except:
    # read it from a public harmonia on sciencedata
    publicfolder = "8fe7d59de1eafe5f8eaebc0044534606"
    morpheus_dict = json.loads(requests.get("https://sciencedata.dk/public/" + publicfolder + "/morpheus_dict.json").content)
    # save it locally for future usage
    with open("../data/morpheus_dict.json", "w") as outfile:  
        json.dump(morpheus_dict, outfile)



In [3]:
# you can parse your own version following these steps, but parsing the xml takes some time:
response = input("Enter 'y' and 'Enter' if you want to parse the data from scratch: ") 
if response == "y":
    url = "https://github.com/gcelano/LemmatizedAncientGreekXML/blob/master/Morpheus/MorpheusUnicode.xml.zip?raw=true"
    resp = requests.get(url)
    zipped = zipfile.ZipFile(io.BytesIO(resp.content))
    zipped.extractall("../data/") 

    with open("../data/MorpheusUnicode.xml", "r") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = BeautifulSoup(content, "lxml")
    ### to be further elaborated (from the bs_content object on...)

Enter 'y' and 'Enter' if you want to parse the data from scratch: n


In [4]:
# number of wordforms
morpheus_len = len(morpheus_dict)
morpheus_len

706506

In [None]:
print("ok")

In [4]:
# morpheus has the following structure
[it for it in morpheus_dict.items()][10:12]

[('ἀᾱ́ατος',
  [{'i': '6',
    'f': 'ἀᾱ́ατος',
    'b': 'αᾱατος',
    'l': 'ἀάατος',
    'e': 'ααατος',
    'p': 'a-s---fn-',
    'd': '1',
    's': 'not to be injured, inviolable',
    'a': None},
   {'i': '13',
    'f': 'ἀᾱ́ατος',
    'b': 'αᾱατος',
    'l': 'ἀάατος',
    'e': 'ααατος',
    'p': 'a-s---mn-',
    'd': '1',
    's': 'not to be injured, inviolable',
    'a': None}]),
 ('αᾱατος',
  [{'i': '6',
    'f': 'ἀᾱ́ατος',
    'b': 'αᾱατος',
    'l': 'ἀάατος',
    'e': 'ααατος',
    'p': 'a-s---fn-',
    'd': '1',
    's': 'not to be injured, inviolable',
    'a': None},
   {'i': '13',
    'f': 'ἀᾱ́ατος',
    'b': 'αᾱατος',
    'l': 'ἀάατος',
    'e': 'ααατος',
    'p': 'a-s---mn-',
    'd': '1',
    's': 'not to be injured, inviolable',
    'a': None}])]

POS tag used in morpheus (extracted from https://github.com/gcelano/LemmatizedAncientGreekXML/blob/master/README.md)
  * ```n```: noun
  * ```v```: verb
  * ```a```: adjective
  * ```d```: adverb
  * ```l```: article
  * ```g```: particle
  * ```c```: conjunction
  * ```r```: preposition
  * ```p```: pronoun
  * ```m```: numeral
  * ```i```: interjection
  * ```u```: punctuation

We have to transform this data into the following format: https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/el_lemma_exc.json


# "p" (="xpos") into universal dependency "upos"
As a first step: to translate the "p" tag here into the standardized tags used by universal dependency treebanks and spacy:

In [5]:
corpus_perseus = pyconll.load.iter_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu")
corpus_proiel = pyconll.load.iter_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu")

In [6]:
upos_xpos = []
for sentence in corpus_perseus:
    for token in sentence:
        upos_xpos.append((token.upos, token.xpos))
for sentence in corpus_proiel:
    for token in sentence:
        upos_xpos.append((token.upos, token.xpos))        

In [7]:
upos_xpos[:10]

[('VERB', 'v3spia---'),
 ('ADV', 'd--------'),
 ('ADJ', 'a-s---mn-'),
 ('NOUN', 'n-s---mn-'),
 ('VERB', 'v--ana---'),
 ('NOUN', 'n-s---fa-'),
 ('PUNCT', 'u--------'),
 ('NOUN', 'n-s---mn-'),
 ('CCONJ', 'c--------'),
 ('NOUN', 'n-s---fa-')]

In [8]:
upos_unique = list(set([tup[0] for tup in upos_xpos]))
upos_unique

['ADJ',
 'PRON',
 'ADP',
 'PUNCT',
 'DET',
 'CCONJ',
 'INTJ',
 'PART',
 'PROPN',
 'AUX',
 'SCONJ',
 'ADV',
 'NOUN',
 'X',
 'NUM',
 'VERB']

In [9]:
upos_dict = {}
for upos in upos_unique:
    upos_dict[upos] = list(set([tup[1][:2] for tup in upos_xpos if tup[0] == upos]))
upos_dict

{'ADJ': ['Px', 'S-', 'A-', 'Pd', 'Ps', 'a-', 'Mo'],
 'PRON': ['p3', 'Pc', 'Pr', 'Pi', 'Pp', 'Pk', 'p2', 'p1', 'p-'],
 'ADP': ['r-', 'R-'],
 'PUNCT': ['u-'],
 'DET': ['S-', 'l-', 'Pd', 'Px'],
 'CCONJ': ['c-', 'C-'],
 'INTJ': ['I-', 'i-'],
 'PART': ['g-'],
 'PROPN': ['Ne'],
 'AUX': ['V-'],
 'SCONJ': ['c-', 'G-'],
 'ADV': ['Dq', 'Df', 'Du', 'd-'],
 'NOUN': ['Nb', 'n-'],
 'X': ['-3', 'F-', '--', 'x-'],
 'NUM': ['Ma', 'm-'],
 'VERB': ['v2', 'V-', 'v3', 'v-', 'v1']}

In [10]:
upos_dict_reverse = {}
for key in upos_dict.keys():
    for el in upos_dict[key]:
        try:
            old_value = upos_dict_reverse[el] 
            # new_value = old_value.append(key)
            upos_dict_reverse[el] = old_value + [key]
        except:
            upos_dict_reverse[el] = [key]
upos_dict_reverse

{'Px': ['ADJ', 'DET'],
 'S-': ['ADJ', 'DET'],
 'A-': ['ADJ'],
 'Pd': ['ADJ', 'DET'],
 'Ps': ['ADJ'],
 'a-': ['ADJ'],
 'Mo': ['ADJ'],
 'p3': ['PRON'],
 'Pc': ['PRON'],
 'Pr': ['PRON'],
 'Pi': ['PRON'],
 'Pp': ['PRON'],
 'Pk': ['PRON'],
 'p2': ['PRON'],
 'p1': ['PRON'],
 'p-': ['PRON'],
 'r-': ['ADP'],
 'R-': ['ADP'],
 'u-': ['PUNCT'],
 'l-': ['DET'],
 'c-': ['CCONJ', 'SCONJ'],
 'C-': ['CCONJ'],
 'I-': ['INTJ'],
 'i-': ['INTJ'],
 'g-': ['PART'],
 'Ne': ['PROPN'],
 'V-': ['AUX', 'VERB'],
 'G-': ['SCONJ'],
 'Dq': ['ADV'],
 'Df': ['ADV'],
 'Du': ['ADV'],
 'd-': ['ADV'],
 'Nb': ['NOUN'],
 'n-': ['NOUN'],
 '-3': ['X'],
 'F-': ['X'],
 '--': ['X'],
 'x-': ['X'],
 'Ma': ['NUM'],
 'm-': ['NUM'],
 'v2': ['VERB'],
 'v3': ['VERB'],
 'v-': ['VERB'],
 'v1': ['VERB']}

In [11]:
["NOUN", "ADV", "ADB", "VERB", "NUM"]

['NOUN', 'ADV', 'ADB', 'VERB', 'NUM']

In [12]:
morph_list_dict = list(morpheus_dict.values())
morph_list_dict[0][:3]

[{'i': '1',
  'f': 'ἀάατον',
  'b': 'ααατον',
  'l': 'ἀάατος',
  'e': 'ααατος',
  'p': 'a-s---fa-',
  'd': '1',
  's': 'not to be injured, inviolable',
  'a': None},
 {'i': '8',
  'f': 'ἀάατον',
  'b': 'ααατον',
  'l': 'ἀάατος',
  'e': 'ααατος',
  'p': 'a-s---ma-',
  'd': '1',
  's': 'not to be injured, inviolable',
  'a': None},
 {'i': '15',
  'f': 'ἀάατον',
  'b': 'ααατον',
  'l': 'ἀάατος',
  'e': 'ααατος',
  'p': 'a-s---na-',
  'd': '1',
  's': 'not to be injured, inviolable',
  'a': None}]

In [13]:
# create empty dictionary to fill in by values
ag_lemma_lookup = {}
for upos in upos_unique: 
    ag_lemma_lookup[upos] = {}
ag_lemma_lookup

{'ADJ': {},
 'PRON': {},
 'ADP': {},
 'PUNCT': {},
 'DET': {},
 'CCONJ': {},
 'INTJ': {},
 'PART': {},
 'PROPN': {},
 'AUX': {},
 'SCONJ': {},
 'ADV': {},
 'NOUN': {},
 'X': {},
 'NUM': {},
 'VERB': {}}

In [14]:
v_ud = "’"
v_agt = "ʼ"
re.sub(v_ud, v_agt, "Ἀλλ’")

'Ἀλλʼ'

In [15]:
for item in morph_list_dict:
    for el in item:
        for key in upos_dict_reverse.keys(): # e.g. "V-" or "a-"...            
            if el["p"][:2] == key:
                for tag in upos_dict_reverse[key]: # list of values of the individual key
                    ag_lemma_lookup[tag][re.sub(v_ud, v_agt, el["f"])] = el["l"]
                    
for item in morph_list_dict:
    for el in item:
        if el["a"] == None: # ignore dialect specific words at this moment
            for key in upos_dict_reverse.keys(): # e.g. "V-" or "a-"...            
                if el["p"][:2] == key:
                    for tag in upos_dict_reverse[key]: # list of values of the individual key
                        ag_lemma_lookup[tag][re.sub(v_ud, v_agt, el["f"])] = el["l"]

In [16]:
sum([len(ag_lemma_lookup[key]) for key in ag_lemma_lookup.keys()]) # 259647

331498

In [17]:
dict([it for it in ag_lemma_lookup["ADV"].items()][:20])

{'ἄτως': 'ἄατος',
 'ἅτʼ': 'ἅτε',
 'κἄτʼ': 'ἔτι',
 'κᾆτʼ': 'εἴτε',
 'κἆτʼ': 'ἔτι',
 'κἆθʼ': 'ἔτι',
 'κᾆθʼ': 'εἴτε',
 'κάθʼ': 'καθό',
 'καθʼ': 'καθό',
 'κᾆτα': 'εἶτα',
 'τώ': 'τῷ',
 'τῴ': 'τῷ',
 'τῶ': 'τῷ',
 'τῷ': 'τῷ',
 'τῳ': 'τῷ',
 'ἁτός': 'ἐτός',
 'ἅτε': 'ἅτε',
 'ἇτε': 'ἅτε',
 'ᾗ': 'ᾗ',
 'ἤ': 'ἦ'}

# Update/extend Morpheus by word forms and lemmata from universal dependencies

In [18]:
len(ag_lemma_lookup["NOUN"])

81298

In [19]:
corpus_perseus = pyconll.load.iter_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu")
corpus_proiel = pyconll.load.iter_from_url("https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-PROIEL/master/grc_proiel-ud-train.conllu")

In [21]:
for sentence in corpus_perseus:
    for token in sentence:
        ag_lemma_lookup[token.upos][re.sub(v_ud, v_agt, token.form)] = re.sub(v_ud, v_agt, token.lemma)
for sentence in corpus_proiel:
    for token in sentence:
        ag_lemma_lookup[token.upos][re.sub(v_ud, v_agt, token.form)] = re.sub(v_ud, v_agt, token.lemma)

In [22]:
ag_lemma_lookup["NOUN"].update(ag_lemma_lookup["PROPN"])

In [23]:
ag_lemma_lookup["NOUN"]["Ἰησοῦ"]

'Ἰησοῦς'

In [24]:
with open ("../data/ag_lemma_lookup.json", "w") as f:
    json.dump(ag_lemma_lookup, f)

In [97]:
ag_lemma_lookup.keys()

dict_keys(['NOUN', 'ADV', 'ADJ', 'VERB', 'PRON', 'SCONJ', 'X', 'PUNCT', 'CCONJ', 'NUM', 'ADP', 'PROPN', 'AUX', 'DET', 'INTJ', 'PART'])

In [25]:
ag_lemma_lookup_merged = {}
for key in ag_lemma_lookup.keys():
    ag_lemma_lookup_merged.update(ag_lemma_lookup[key])

In [26]:
# overwrite overthing by keys in this order
keys = ["ADV", "VERB", "ADJ", "NOUN", "PROPN"]
for key in keys:
    ag_lemma_lookup_merged.update(ag_lemma_lookup[key])

In [27]:
ag_lemma_lookup_merged["ἐνεργειῶν"]

'ἐνέργεια'

In [28]:
with open ("../data/ag_lemma_lookup_merged.json", "w") as f:
    json.dump(ag_lemma_lookup_merged, f)

# Verbs for Rule-based lemmatizer

In [3]:
# if you don't have your data already accessible:
ag_lemma_lookup = json.load(open("../data/ag_lemma_lookup.json"))

In [5]:
verbs_wordform_lemma_dict = ag_lemma_lookup["VERB"]

In [6]:
conf = sddk.configure("SDAM_root", "648597@au.dk") # shared harmonia, harmonia owner
sddk.write_file("SDAM_data/AGT/verbs_wordform_lemma_dict.json", verbs_wordform_lemma_dict, conf)

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/
Your <class 'dict'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/AGT/verbs_wordform_lemma_dict.json"
