In [1]:
import operator
from functools import reduce
from unicodedata import category, normalize, decomposition, lookup

In [2]:
# Major Unicode general-category classes, i.e. the first character of the
# two-letter code returned by unicodedata.category().
letter = set('L')            # letters
dia = set('M')               # marks (diacritics)
punc = set('P')              # punctuation
letter_dia = letter | dia    # letters together with marks

# Normalization form used as the default `norm` argument of the helpers below.
udnorm = 'NFD'

In [3]:
def splitWord(word, norm=udnorm):
    """Extract the letter runs contained in a chunk of text.

    The chunk is normalized with ``unicodedata.normalize(norm, word)`` and
    then scanned left to right. A token starts at a character whose major
    Unicode category is in ``letter`` and extends over the following
    characters whose major category is in ``letter_dia`` (letters and
    combining marks). Every other character -- punctuation, digits, marks
    that do not follow a letter -- acts as a separator and is dropped.

    Args:
        word: the text chunk to split.
        norm: normalization form applied to ``word`` before scanning.

    Returns:
        A tuple of tokens in order of appearance. The tuple is empty when
        the chunk contains no letter at all, so no empty-string token is
        ever produced.
    """
    text = normalize(norm, word)
    size = len(text)
    tokens = []
    pos = 0
    # A single iterative pass: no recursion and no re-normalization of tails.
    while pos < size:
        # Skip separators until the next letter (or the end of the chunk).
        while pos < size and category(text[pos])[0] not in letter:
            pos += 1
        start = pos
        # Consume the letter together with any following letters and marks.
        while pos < size and category(text[pos])[0] in letter_dia:
            pos += 1
        # `start == pos` only happens when the chunk ran out without a letter.
        if pos > start:
            tokens.append(text[start:pos])
    return tuple(tokens)


def tokenize(sentence, norm=udnorm):
    """Split ``sentence`` into a flat tuple of word tokens.

    The sentence is split on whitespace and every chunk is handed to
    ``splitWord``; the per-chunk results are concatenated in order.

    Args:
        sentence: the text to tokenize.
        norm: normalization form forwarded to ``splitWord``.

    Returns:
        A tuple of tokens; empty when ``sentence`` contains only whitespace.
    """
    # str.split() without a separator already ignores leading and trailing
    # whitespace, and flattening in one pass avoids repeated tuple copies.
    return tuple(
        token
        for chunk in sentence.split()
        for token in splitWord(chunk, norm=norm)
    )

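# NOTE: abandoned first draft of strip_accents, kept for reference only.
# It does not work as written: NFD is an undefined name, the full two-letter
# category is compared against `letter`, and indices are summed instead of
# characters being collected. See the working definition that follows.
# def strip_accents(word):
#     w_strip = normalize(NFD, word)
#     stripped_word = 0
#     for i in range(len(w_strip)):
#         if category(w_strip[i]) in letter:
#             stripped_word += i
#         else:
#             pass
#     return stripped_word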

def strip_accents(word):
    """Return ``word`` lower-cased and reduced to its bare letters.

    The lower-cased word is decomposed with NFD normalization, and only the
    characters whose major Unicode category is in ``letter`` are kept, so
    combining marks and every other non-letter character are removed.
    """
    bare_letters = []
    for char in normalize('NFD', word.lower()):
        major_class = category(char)[0]
        if major_class in letter:
            bare_letters.append(char)
    return ''.join(bare_letters)


In [4]:
# Sample Greek passage used to demonstrate tokenize(); example2 is a single
# accented word used to demonstrate strip_accents().
example = '''
λέγει αὐτοῖς Τί ζητεῖτε; οἱ δὲ εἶπαν αὐτῷ Ῥαββεί,
?̔ὃ λέγεται μεθερμηνευόμενον Διδάσκαλε?̓ ποῦ μένεις; 
'''
example2 = 'αὐτοῖς'

In [5]:
# Tokenize the sample passage; the resulting tuple is shown as the cell output.
tokenize(example)

('λέγει',
 'αὐτοῖς',
 'Τί',
 'ζητεῖτε',
 'οἱ',
 'δὲ',
 'εἶπαν',
 'αὐτῷ',
 'Ῥαββεί',
 'ὃ',
 'λέγεται',
 'μεθερμηνευόμενον',
 'Διδάσκαλε',
 'ποῦ',
 'μένεις')

In [6]:
# Strip the diacritics from the single sample word and display the result.
strip_accents(example2)

'αυτοις'

In [7]:
# Stress-test variant of the sample passage: punctuation and digits are glued
# between words without whitespace ('.', ':', ';', '123') and it ends in '...'.
contrivedExample = '''
λέγει.αὐτοῖς:Τί ζητεῖτε;οἱ δὲ123εἶπαν αὐτῷ Ῥαββεί,
?̔ὃ λέγεται μεθερμηνευόμενον Διδάσκαλε?̓ ποῦ μένεις... 
'''

In [8]:
# Tokenize the stress-test passage; the displayed tokens match those of the
# plain sample even though separators are glued directly to the words.
tokenize(contrivedExample)

('λέγει',
 'αὐτοῖς',
 'Τί',
 'ζητεῖτε',
 'οἱ',
 'δὲ',
 'εἶπαν',
 'αὐτῷ',
 'Ῥαββεί',
 'ὃ',
 'λέγεται',
 'μεθερμηνευόμενον',
 'Διδάσκαλε',
 'ποῦ',
 'μένεις')