In [5]:
#==================================================#
# Author:      Ernst Boogert                       #
# Institution: Protestant Theological University   #
# Date:        August 27, 2018                     #
# Version:     0.0.2                               #
#==================================================#

#==============Documentation============================#
# What this lemmatizer does:
# It takes the MorpheusUnicode data as its starting point.
# It then builds a dictionary keyed on every concrete word form
# in this database, with the set of all possible lemmata for
# that form as the value.
# Next, the lemmatizer lemmatizes tokens according to
# this dictionary, which means that it does not give the best lemma
# in context, but a set with all possibilities
# according to the MorpheusUnicode database.
# This lemmatizer is best used in tasks like text comparison.
#==============End Documentation========================#

from lxml import etree
import unicodedata as ud
import pickle
import zipfile

#%run normalization_accents.ipynb
%run tokenizer.ipynb

# Module-level lookup table: word form -> set of lemmata (both passed through
# strip_accents). buildLemmatizerDictionary() fills it in place, so callers can
# use either the return value or this global afterwards.
lemmatizer_dictionary = {}

def buildLemmatizerDictionary():
    """Populate and return the global ``lemmatizer_dictionary`` from the Morpheus data.

    Extracts ``data/MorpheusUnicode.xml.zip`` into ``data``, parses the
    extracted ``data/MorpheusUnicode.xml`` and, for every ``<t>`` element,
    records the text of its ``<l>`` child (lemma) in the set stored under the
    text of its ``<f>`` child (word form). Both texts are passed through
    ``strip_accents`` first (not defined in this cell; presumably provided by
    the ``%run tokenizer.ipynb`` line above -- verify).

    Returns:
        dict: the global ``lemmatizer_dictionary`` (same object, mutated in
        place), mapping each word form to a set of its possible lemmata.
    """
    # Unzip datafile; the archive is only needed for the extraction itself.
    with zipfile.ZipFile("data/MorpheusUnicode.xml.zip", 'r') as lemmatizer_zipfile:
        lemmatizer_zipfile.extractall("data")

    utf8_parser = etree.XMLParser(encoding='utf-8') # Make sure that the XML is parsed in utf-8 encoding
    tree = etree.parse("data/MorpheusUnicode.xml", parser=utf8_parser)

    # Find all the word-lemma units and store them in the dictionary.
    # Using sets as values avoids duplicate lemmata for the same word.
    for unit in tree.iter('t'):
        form_text = unit.findtext('f')
        lemma_text = unit.findtext('l')
        # findtext returns None when the child element is missing; such a unit
        # cannot contribute a word/lemma pair, so skip it.
        if form_text is None or lemma_text is None:
            continue

        word = strip_accents(form_text)                        # Word = key
        lemma = strip_accents(lemma_text)                      # Lemma = member of the value set
        lemmatizer_dictionary.setdefault(word, set()).add(lemma)
    return lemmatizer_dictionary
    

def buildDictionaryPickle(data, path="data/lemmatizer_dict.pickle"):
    """Serialize ``data`` to a pickle file.

    Args:
        data: the object to pickle (in this notebook, the lemmatizer dictionary).
        path: destination file; defaults to ``data/lemmatizer_dict.pickle``,
            the location the lemmatizer cell reads from.
    """
    # The context manager guarantees the file is flushed and closed, so the
    # pickle is complete on disk before anything tries to load it again.
    with open(path, "wb") as pickle_file:
        pickle.dump(data, pickle_file)
    
# # Execute functions
#buildLemmatizerDictionary()
#buildDictionaryPickle(lemmatizer_dictionary)


In [6]:
import pickle
import unicodedata as ud
import operator
from functools import reduce

#token_list = ("κεκρότηται", "κρηπὶς", "ἀληθείας", "ὦ", "παῖδες", "ὑμεῖς", "ἡμῖν", "αὐτοῖς", "ἁγίου", "νεὼ", "μεγάλου", "θεοῦ", "θεμέλιος", "γνώσεως", "ἀρραγής", "προτροπὴ")
# Sample Greek tokens that are fed to the lemmatizer at the end of this cell.
greek_token_list = (
    'λέγει',
    'αὐτοῖς',
    'Τί',
    'ζητεῖτε',
    'οἱ',
    'δὲ',
    'εἶπαν',
    'αὐτῷ',
    'Ῥαββεί',
    'ὃ',
    'λέγεται',
    'μεθερμηνευόμενον',
    'Διδάσκαλε',
    'ποῦ',
    'μένεις',
)

def lemmatizer_dict(lemma_dict, token_list):
    """Map every token to its candidate lemmata.

    Each token is passed through ``strip_accents`` (defined outside this cell)
    and the result is looked up in ``lemma_dict``.

    Args:
        lemma_dict: mapping from stripped word form to its lemmata.
        token_list: iterable of tokens to lemmatize.

    Returns:
        dict: keyed on the original (unstripped) token. The value is the
        ``lemma_dict`` entry for the stripped token when one exists, otherwise
        a one-element set containing the stripped token itself. A token that
        occurs more than once appears only once in the result.
    """
    result = {}
    for original in token_list:
        bare = strip_accents(original)
        # Unknown forms fall back to the stripped token as their own lemma.
        result[original] = lemma_dict.get(bare) if bare in lemma_dict else {bare}
    return result

def lemmatizer(lemma_dict, token_list):
    """Lemmatize tokens one by one, keeping the input order.

    Each token is passed through ``strip_accents`` (defined outside this cell)
    and the result is looked up in ``lemma_dict``.

    Args:
        lemma_dict: mapping from stripped word form to its lemmata.
        token_list: iterable of tokens to lemmatize.

    Returns:
        list: one entry per token. A known token yields a tuple built from its
        ``lemma_dict`` entry (if that entry is a set, the order inside the
        tuple is whatever order the set iterates in). An unknown token yields
        the stripped token as a plain string, not a tuple.
    """
    def candidates(raw_token):
        bare = strip_accents(raw_token)
        if bare not in lemma_dict:
            # No entry: hand back the stripped token itself.
            return bare
        return tuple(lemma_dict.get(bare))

    return [candidates(raw_token) for raw_token in token_list]

#     for word, lemmata in lemmatized_dict.items(): # Check whether a lemma exists in the value sets
#         if 'ιημι' in lemmata:
#             print(word)
#         else:
#             pass
#    lemmatized_dict.keys() # returns a list with all dictionary keys!
#    lemmatized_dict.values() # returns a list with all the value sets!
    

# Load the prebuilt dictionary inside a context manager so the file handle is
# closed again. NOTE: pickle.load can execute arbitrary code; only load a
# pickle file that this notebook produced itself.
with open("data/lemmatizer_dict.pickle", "rb") as pickle_file:
    loaded_lemma_dict = pickle.load(pickle_file)

# Keep the call as the last expression so the notebook displays the result.
lemmatizer(loaded_lemma_dict, greek_token_list)




[('λεγω',),
 ('αυτεω', 'εαυτου', 'αυτος'),
 ('τιω', 'ετι', 'τις', 'τιο'),
 ('ζητεω',),
 ('οιις', 'ο', 'ε', 'ιημι', 'οιος', 'ος', 'οις'),
 ('δει', 'δεω', 'δε'),
 ('ειπον',),
 ('αυτεω', 'αυτου', 'εαυτου', 'αυτος'),
 'ραββει',
 ('ος', 'ο'),
 ('λεγω',),
 'μεθερμηνευομενον',
 ('διδασκαλος',),
 ('που',),
 ('μενω',)]