In [None]:
from unicodedata import category, normalize

# Initials of Unicode general-category classes, i.e. the first character of
# what unicodedata.category() returns, grouped for membership tests.
letter, space, dia, punc = {'L'}, {'Z'}, {'M'}, {'P'}
letter_space = letter | space
letter_dia = letter | dia
# Default Unicode normalization form used by the functions below.
udnorm = 'NFC'

# Sample inputs for the splitting functions below.
# test1a: one word with punctuation before, inside and after it.
test1a = '-:Κεκρ;?ότη-ται᾿,᾿'
# test1b: the same word without leading punctuation and without the hyphen.
test1b = 'Κεκρ;?ότηται᾿,᾿'
# test1c: the same word with additional punctuation scattered inside it.
test1c = '-:Κε.:κρ;?ότη,.τα...ι᾿,᾿'
# test2: a multi-line passage with leading whitespace and stray punctuation
# around and inside words (the line breaks are part of the string).
test2 = '''   Κεκ.,ρότη-ται;?    ?κρη/πὶς ..ἀληθείας, ;ὦ παῖδες ὑμεῖς, ἡμῖν αὐτοῖς, 
ἁγίου νεὼ μεγάλου θεοῦ θεμέλιος γνώσεως ἀρραγής, προτροπὴ καλή, 
δι᾿ ὑπακοῆς εὐλόγου ζωῆς ἀιδίουὄρεξις, νοερῷ καταβληθεῖσα χωρίῳ.'''

In [None]:
def rsplitPunc(word, norm=udnorm, clean=False):
    '''Separate trailing non-word characters from a word.

    The input is first normalized to the Unicode form ``norm``. Every
    character at the end whose general category is neither a letter (L)
    nor a mark (M) is moved to the second element of the result.

    clean=True additionally removes every non-letter/non-mark character
    from the word part (also leading and internal ones).

    returns (word, punc)
    '''
    text = normalize(norm, word)
    # Walk back from the end until a letter or mark is found.
    cut = len(text)
    while cut > 0 and category(text[cut - 1])[0] not in letter_dia:
        cut -= 1
    head, tail = text[:cut], text[cut:]
    if not clean:
        return (head, tail)
    kept = [char for char in head if category(char)[0] in letter_dia]
    return (''.join(kept), tail)

In [None]:
# Demo: split the trailing punctuation off test1a and clean the word part.
right_split = rsplitPunc(test1a, clean=True)
print(right_split)

In [None]:
def lsplitPunc(word, norm=udnorm, clean=False):
    '''Separate leading non-word characters from a word.

    The input is first normalized to the Unicode form ``norm``. Every
    character at the start whose general category is neither a letter (L)
    nor a mark (M) is moved to the first element of the result.

    A string without any letter or mark (including the empty string) is
    returned entirely as punctuation, with an empty word part.

    clean=True additionally removes every non-letter/non-mark character
    from the word part (internal and trailing ones).

    returns (punc, word)
    '''
    w = normalize(norm, word)
    # Index of the first letter/mark. It stays at len(w) when there is none,
    # so punctuation-only input is not cut one character short.
    beforeWord = len(w)
    for i, char in enumerate(w):
        if category(char)[0] in letter_dia:
            beforeWord = i
            break
    leading, rest = w[:beforeWord], w[beforeWord:]
    if clean:
        rest = ''.join(c for c in rest if category(c)[0] in letter_dia)
    return (leading, rest)


In [None]:
# Demo: split the leading punctuation off test1a and clean the word part.
left_split = lsplitPunc(test1a, clean=True)
print(left_split)


In [None]:
def splitPunc(words, norm=udnorm, clean=False,
              splitters=None, non_splitters=None):
    '''Split a string into (pre, word, after) triples.

    The string is normalized once to the Unicode form ``norm`` and then
    scanned from left to right. For every token the result holds the
    punctuation before the word, the word itself (letters and marks only)
    and the punctuation after it. ``pre`` and ``after`` are stripped of
    surrounding whitespace. A whitespace character (category Z) always
    ends the current token.

    clean=False:
        punctuation inside a word ends the token, so the word is split
        in two. A character listed in non_splitters does not split:
        the letters following it are glued to the word and the
        character itself is dropped.
    clean=True:
        punctuation inside a word is deleted and the letters are glued
        together. A character listed in splitters ends the token and
        is kept in ``after``.

    splitters=['character', 'character', ...]
    non_splitters=['character', 'character', ...]

    The scan is iterative, so the number of tokens is not limited by the
    recursion depth. An empty input yields (('', '', ''),).

    returns ((pre, word, after), (pre, word, after), ...)
    '''
    if splitters is None: splitters = ()
    if non_splitters is None: non_splitters = ()
    w = normalize(norm, words)
    end = len(w)
    triples = []
    start = 0
    while True:
        # 1. Everything up to the first letter/mark is leading punctuation.
        pP = start
        while pP < end and category(w[pP])[0] not in letter_dia:
            pP += 1
        preWord = w[start:pP].strip()

        # 2. First uninterrupted run of letters/marks.
        pW = pP
        while (pW < end and w[pW] not in non_splitters
               and category(w[pW])[0] in letter_dia):
            pW += 1
        parts = [w[pP:pW]]

        # 3. Trailing punctuation, possibly glueing further letters on.
        #    pW: index just past the last character that belongs to the word.
        #    pA: index just past the last character consumed by this token;
        #        it is advanced for every consumed character, including
        #        non-splitters, so no character is handed to the next token
        #        by accident.
        pA = pW
        nsplit = False
        for i in range(pW, end):
            char = w[i]
            major = category(char)[0]
            if major in space:
                pA = i + 1
                break
            if clean:
                if char in splitters:
                    pA = i + 1
                    break
                if major in letter_dia:
                    parts.append(char)
                    pW = i + 1
            elif char in non_splitters:
                nsplit = True
            elif major not in letter_dia:
                nsplit = False
            elif nsplit:
                # Letter directly after a non-splitter: keep the word going.
                parts.append(char)
                pW = i + 1
            else:
                # Letter after ordinary punctuation: it starts the next token.
                break
            pA = i + 1

        triples.append((preWord, ''.join(parts), w[pW:pA].strip()))
        if pA >= end:
            return tuple(triples)
        start = pA

In [None]:
# Demo: show the raw passage, then split it while '-' and '/' keep a word together.
print(test2)
keep_joined = ('-', '/')
splitPunc(test2, non_splitters=keep_joined, clean=False)

In [None]:
def cleanWords(words, norm=udnorm, clean=False,
               splitters=None, non_splitters=None): 
    """Return the words of a string with all punctuation removed.

    The string is normalized once to the Unicode form ``norm`` and then
    scanned from left to right. It can be used for cleaning single
    words, or to tokenize full sentences. A whitespace character
    (category Z) always ends the current word.

    clean=False:
        letter characters that have punctuation inbetween but no space
        are split on that punctuation. Exceptions can be defined in
        non_splitters: letters following such a character are glued to
        the word, the character itself is dropped.

    clean=True:
        words with punctuation within (without whitespace) are glued
        together without the punctuation. Exceptions can be defined in
        splitters: such a character ends the word.

    The scan is iterative, so the number of words is not limited by the
    recursion depth. Input without any letter yields an empty tuple.

    returns: ('string', 'string', ...)
    """
    if splitters is None: splitters = ()
    if non_splitters is None: non_splitters = ()
    w = normalize(norm, words)
    end = len(w)
    tokens = []
    start = 0
    while start < end:
        # Skip everything that precedes the next word.
        pP = start
        while pP < end and category(w[pP])[0] not in letter_dia:
            pP += 1
        if pP == end:
            break  # only punctuation/whitespace left
        # First uninterrupted run of letters/marks.
        pW = pP
        while pW < end and category(w[pW])[0] in letter_dia:
            pW += 1
        parts = [w[pP:pW]]
        # pA: index where the scan for the next word resumes.
        pA = pW
        nsplit = False
        for i in range(pW, end):
            char = w[i]
            major = category(char)[0]
            if major in space:
                # Whitespace ends the word in both modes, so a non-splitter
                # after a space cannot glue two separate words together.
                break
            if clean:
                if char in splitters:
                    break
                if major in letter_dia:
                    parts.append(char)
            elif char in non_splitters:
                nsplit = True
            elif major not in letter_dia:
                nsplit = False
            elif nsplit:
                # Letter directly after a non-splitter: keep the word going.
                parts.append(char)
            else:
                # Letter after ordinary punctuation: it starts the next word.
                break
            pA = i + 1
        tokens.append(''.join(parts))
        start = pA
    return tuple(tokens)


In [None]:
# Demo: show the raw passage, then tokenize it in clean mode.
print(test2)
# splitters must be a one-element tuple; ('-') without the comma is just the
# string '-'. non_splitters is not consulted when clean=True.
cleanWords(test2, clean=True,
              splitters=('-',), non_splitters=('-', '/'))

In [None]:
def tokenizer(sentence, norm=udnorm, punc=False, clean=False,
              splitter=None, non_splitter=None, func=None):
    """Tokenize a sentence string into a tuple of strings.

    func:
        if given, the sentence is handed to this callable and its
        result is returned as is; all other options are ignored.
    punc=True:
        tokens keep their punctuation: the sentence is fed to splitPunc
        and every (pre, word, post) triple is concatenated into one string.
    punc=False:
        tokens are bare words, produced by cleanWords.

    clean=False (or any other falsy value, e.g. None):
        split on punctuation without whitespace
    clean=True:
        delete punctuation inside words

    norm, splitter and non_splitter are passed on to splitPunc/cleanWords
    as their ``norm``, ``splitters`` and ``non_splitters`` arguments.

    returns: ('string', 'string', ...)
    """
    if func:
        return func(sentence)
    clean = bool(clean)
    if punc:
        triples = splitPunc(sentence, norm=norm, clean=clean,
                            splitters=splitter, non_splitters=non_splitter)
        return tuple(f'{pre}{word}{post}' for pre, word, post in triples)
    return cleanWords(sentence, norm=norm, clean=clean,
                      splitters=splitter, non_splitters=non_splitter)


        

In [None]:
error = dict(punc=True, nop=False)


def func(word, **kwargs):
    """Print ``word`` in lower case when the keyword ``punc`` is truthy.

    ``punc`` must be present in the keyword arguments (KeyError otherwise);
    any other keyword is ignored.
    """
    if not kwargs['punc']:
        return
    print(word.lower())


func('DEZE!', **error)

In [None]:
dictio = {
    'publicationStmt': {'concat': True, 'delimit': ', ', 'end': '.'},
}
# Print every key whose settings define an 'end' string, followed by that string.
for i, settings in dictio.items():
    if 'end' not in settings:
        continue
    print(i, settings['end'], sep='\n')

In [None]:
import operator

attribs = dict(a=4, b=1, c=25, d=1)
# Order the (key, value) pairs by value; equal values keep their insertion order.
by_value = operator.itemgetter(1)
sorted_attribs = list(attribs.items())
sorted_attribs.sort(key=by_value)
print(sorted_attribs)

In [None]:
from helpertools.unicodetricks import plainLow
from helpertools.data.greek_elisions import ELISIONS

# Same table as ELISIONS, keyed by the entries with the '᾽' mark stripped
# from both ends of the key.
ELISIONS_norm = dict((k.strip('᾽'), v) for k, v in ELISIONS.items())

def greekReplacements(word):
    """Return a normalized spelling for a Greek word form.

    Rules are tried in this order; the first one that applies wins:

    1. the word is a key of ELISIONS_norm: the mapped value is returned;
    2. movable nu: the plain form ends in 'εν', 'σιν' or 'στιν' and is at
       least three characters long: the last character is dropped;
    3. the plain form ends in 'σ': the last character becomes 'ς';
    4. the plain form is 'ουχ' or 'ουκ': the last character is dropped;
    5. the plain form is 'εξ': the last character becomes 'κ'.

    A word that matches none of the rules is returned unchanged.

    NOTE(review): ``plainLow`` is a project helper; the comparisons assume
    it returns the word lower-cased and without diacritics -- confirm.
    """
    if word in ELISIONS_norm:
        return ELISIONS_norm[word]
    plain_word = plainLow(word)
    # Deletion of movable-nu (the length check keeps two-letter words intact)
    if plain_word.endswith(('εν', 'σιν', 'στιν')) and len(plain_word) >= 3:
        return word[:-1]
    # Handling final-sigma
    if plain_word.endswith('σ'):
        return word[:-1] + 'ς'
    # Handling various forms of ου
    if plain_word in ('ουχ', 'ουκ'):
        return word[:-1]
    # Handling ἐξ
    if plain_word == 'εξ':
        return word[:-1] + 'κ'
    return word
    
# Print the size of the original and of the stripped-key table; a smaller
# second number means some keys became identical after stripping.
for table in (ELISIONS, ELISIONS_norm):
    print(len(table))

In [None]:
from lxml import etree

import xmlschema
import urllib.request
from os import path
from pprint import pprint

# Earlier attempt, kept for reference: download the EpiDoc schema and load it
# with xmlschema.
# url = 'http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng'
# scheme = urllib.request.urlretrieve(url, path.expanduser('~/github/pthu/tfbuilder/tfbuilder/temp/scheme.xsd'))
# xml_scheme = xmlschema.XMLSchema(url)
# XML document to validate (path inside the user's home directory).
file = path.expanduser('~/github/pthu/sources/pt/tlg0555/tlg002/tlg0555.tlg002.opp-grc1.xml')

# pprint()
# Parse the locally stored schema file and compile it into a RELAX NG validator.
# NOTE(review): the file carries an .xsd extension but is compiled with
# etree.RelaxNG -- confirm it really contains a RELAX NG schema.
relaxng_doc = etree.parse(path.expanduser('~/github/pthu/tfbuilder/tfbuilder/temp/scheme.xsd'))
print(relaxng_doc)
relaxng = etree.RelaxNG(relaxng_doc)
print(relaxng)

# Parse the document and validate it; the result of validate() is not stored,
# it is only shown as the value of the cell's last expression.
doc = etree.parse(file)
print(doc)
relaxng.validate(doc)



In [None]:
s = {'a|b'}
# print(s)
# Set membership compares whole elements, so 'a' is not found inside 'a|b'.
found = any(member == 'a' for member in s)
if found:
    print('yes!')

In [None]:
elem = ['dit is een testinstru-   ', 'ment om te kijken of het werkt.    ', 'en anders is het ge- ', 'woon een proef!']


def _with_separator(line):
    """Strip a line; append one space unless it ends in a hyphen."""
    text = line.strip()
    return text if text.endswith('-') else text + ' '


# Glue the lines into one string (the hyphen of a broken word is kept), put
# the marker '#!#' before every '<' and after every '>', and cut the string
# on that marker.
joined = ''.join(map(_with_separator, elem))
marked = joined.replace('<', '#!#<').replace('>', '>#!#')
data = marked.split('#!#')
print(data)

In [None]:
s = 'LUDOVICUS OF DINDORFIUS.    '
if s.isupper():
    # First variant: plain title-casing of the whole string.
    print(s.title())
    # Second variant: title-case every word except the ones listed here,
    # which are lower-cased; surrounding spaces are dropped.
    lowercase_words = {'of', 'the'}
    recased = []
    for c in s.strip(' ').split():
        lowered = c.lower()
        recased.append(lowered if lowered in lowercase_words else c.title())
    print(' '.join(recased))


In [None]:
st = 'name, sdfsdf '
# name = st[:st.find(',')]
# Keep what follows the first comma and the single character after it.
comma_at = st.find(',')
name = st[comma_at + 2:]
print(name)

In [1]:
# import ray
# ray.init()

# @ray.remote
# def f(x):
#     return x * x

# futures = [f.remote(i) for i in range(4)]
# print(ray.get(futures))

2019-10-16 11:38:08,931	INFO resource_spec.py:205 -- Starting Ray with 4.79 GiB memory available for workers and up to 2.4 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).






[0, 1, 4, 9]


In [4]:
from multiprocessing import Pool

lis = list(range(1000))


def f(x):
    """Print the square of ``x``.

    Returns None, because that is what print() returns; the square is only
    written to stdout.
    """
    return print(x * x)


# Worker processes are only started when this runs as the main module (a
# script or a notebook cell). Creating the pool at import time fails under
# the 'spawn' start method, where every child re-imports this module.
if __name__ == '__main__':
    pool = Pool()
    try:
        pool.map(f, lis)
    finally:
        # close() + join() lets the workers finish and exit normally, also
        # when map() raised.
        pool.close()
        pool.join()

15876
35721
3969
0
16129
36100
4096
1
36481
16384
4225
4
16641
36864
9
4356
16900
37249
16
37636
4489
25
17161
4624
38025
17424
36
38416
4761
17689
49
38809
4900
17956
64
39204
5041
18225
81
39601
5184
18496
100
40000
5329
18769
121
40401
5476
19044
144
5625
40804
19321
169
19600
5776
41209
196
5929
41616
19881
225
42025
20164
6084
256
42436
20449
6241
289
42849
20736
21025
324
43264
6400
361
21316
43681
6561
21609
400
6724
44100
6889
441
44521
21904
484
7056
44944
22201
529
7225
45369
22500
7396
576
45796
22801
7569
625
23104
46225
7744
676
46656
23409
7921
729
47089
23716
8100
784
24025
47524
841
8281
24336
47961
900
8464
48400
24649
24964
8649
48841
961
25281
8836
49284
1024
49729
25600
1089
9025
50176
25921
9216
1156
50625
9409
26244
1225
26569
9604
1296
51076
9801
26896
1369
51529
10000
51984
1444
27225
27556
52441
1521
10201
1600
52900
27889
10404
28224
53361
10609
1681
53824
28561
10816
54289
1764
28900
11025
54756
1849
29241
11236
1936
55225
29584
11449
2025
55696
11664
29929
2