In [100]:
import text_utilities as tu
import regex as rex
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:

# Load the verb corpus; its counted_corpus is the token set handed to WordMapper below.
pc = tu.PosCorpus('../data/experiment/verbs')
# Inspect the top-level keys of the counted corpus.
# NOTE(review): WordMapper's docstring calls this dict "sorted by syllables", so the
# integer keys are presumably syllable counts — confirm against text_utilities.
pc.counted_corpus.keys()

dict_keys([3, 1, 2, 4, 5, 6, 7, 8])

In [97]:
class WordMapper:
    """Generates a wordmap for a target token by comparing it to its POS-members.

    Every token in ``tokenset`` (a dict whose values are iterables of tokens;
    described by the author as sorted by syllables) for which
    ``tu.match_ends`` reports a match contributes one map. The maps are
    optionally noise-filtered and then summed column-wise into ``wordmap``.

    Attributes:
        target: the token the maps are built for.
        tokenset: the dict of comparison tokens.
        pattern: regex handed to ``filter_map_noise``.
        syllables: result of ``tu.count_syllables(target)``.
        maps: raw maps collected by ``stack_maps``.
        clean_maps: noise-filtered maps, or ``None`` when ``clean`` is false.
        wordmap: column-wise sum of ``clean_maps`` (of ``maps`` when ``clean``
            is false).
    """

    def __init__(self, target: str, tokenset: dict, clean=True, pattern='([^^1][0*1*]+[^$1])'):
        self.target = target
        self.tokenset = tokenset
        self.pattern = pattern
        self.syllables = tu.count_syllables(target)
        self.maps = self.stack_maps(target, tokenset, self.syllables)
        self.clean_maps = None
        if clean:
            self.filter_map_noise(self.maps, self.pattern)
            self.wordmap = self.sum_map_stack(self.clean_maps)
        else:
            self.wordmap = self.sum_map_stack(self.maps)

    def stack_maps(self, target, tokenset, syllables):
        """Collect one map per token that ``tu.match_ends`` matches with ``target``.

        Args:
            target: token to compare against; its length is the padding width.
            tokenset: dict whose values are iterables of tokens.
            syllables: syllable count of ``target``. When it equals 1, "first"
                matches between tokens of different length are skipped.

        Returns:
            A list of maps as produced by ``tu.wordmap``, zero-padded up to
            ``len(target)`` for tokens whose length differs from the target's.
        """
        target_len = len(target)
        maps = []
        for key in tokenset:
            for token in tokenset[key]:
                ends = tu.match_ends(token, target)
                if not ends.get("any"):
                    continue

                # Always pair the token with the *argument* target, so the method
                # stays consistent when called with a target other than self.target.
                pair = (token, target)
                shorter = min(pair, key=len)
                longer = max(pair, key=len)
                diff = len(longer) - len(shorter)

                if not diff:
                    # Same length: no alignment offset and no padding required.
                    maps.append(tu.wordmap(longer=token, shorter=target))
                    continue

                # NOTE(review): maps longer than the target are not trimmed here;
                # sum_map_stack's zip() silently cuts every map to the shortest one.
                # Whether tu.wordmap can return such maps is not visible from here.
                if ends.get("first") and syllables != 1:
                    wm = tu.wordmap(longer=longer, shorter=shorter)
                    wm.extend([0] * (target_len - len(wm)))  # right-pad
                    maps.append(wm)

                if ends.get("last"):
                    wm = tu.wordmap(longer=longer, shorter=shorter, start=diff)
                    wm[:0] = [0] * (target_len - len(wm))  # left-pad
                    maps.append(wm)
        return maps

    def filter_map_noise(self, maps, pattern):
        """Zero out every span of a map that ``pattern`` matches.

        Each map is joined into a string of digits, every regex match is
        overwritten with as many "0" characters as the match is long (so the
        length of a map never changes), and the result is converted back to a
        list of ints. With the default pattern this removes '1' entries that are
        enclosed by zeros while keeping runs of '1' at the start or end.

        The result is stored in ``self.clean_maps`` and also returned.
        Assumes every map entry is a single digit (0/1) — TODO confirm against
        tu.wordmap.
        """
        cleaned = []
        for single_map in maps:
            as_text = "".join(str(entry) for entry in single_map)
            denoised = rex.sub(
                pattern=pattern,
                # group(0) is the full replaced span, which keeps the map length
                # stable even if a custom pattern has no (or a partial) group.
                repl=lambda hit: "0" * len(hit.group(0)),
                string=as_text,
            )
            cleaned.append([int(digit) for digit in denoised])
        self.clean_maps = cleaned
        return cleaned

    def sum_map_stack(self, maps):
        """Return the column-wise sums of ``maps`` (empty list for no maps)."""
        return [sum(column) for column in zip(*maps)]


In [99]:
# Build the noise-filtered wordmap of "verdutzt" against the whole verb corpus.
wordmapper = WordMapper("verdutzt", pc.counted_corpus, clean=True)
# Column-wise sums of the cleaned maps (8 values here, one per letter of the target).
wordmapper.wordmap

[6407, 5622, 5578, 256, 57, 319, 623, 24793]

In [125]:
class MapToken:
    """Record for one token: its text, its wordmap and its share of the corpus metrics.

    ``metrics`` must be the metrics dict of a text_utilities.PosCorpus. Only the
    entries whose key occurs in ``token`` are kept, in the order ``metrics``
    yields them.
    """
    def __init__(self, token: str, wordmap: list, metrics):
        selected = {}
        for key in metrics:
            if key in token:
                selected[key] = metrics[key]

        self.wordmap = wordmap
        self.freqmap = selected
        self.str = token

In [126]:
# Bundle the target token, its summed wordmap and the corpus metrics into one MapToken.
mt = MapToken(wordmapper.target, wordmapper.wordmap, pc.metrics)

In [127]:
# Entries of pc.metrics whose key occurs in the token.
# NOTE(review): values look like (absolute count, relative frequency) pairs — confirm in PosCorpus.
mt.freqmap

{'e': (129905, 0.16219310599704342),
 't': (77211, 0.09640192376842863),
 'r': (68048, 0.08496144472412002),
 'u': (29988, 0.03744156778137361),
 'd': (14059, 0.01755338806983899),
 'z': (12020, 0.015007591194214711),
 'v': (8892, 0.011102121539014742)}

In [128]:
def map_subword(target: str, map:str) -> str:
    """Return the subword of ``target`` that ``map`` marks, with "##" on the open side.

    ``map`` is a string of "0"/"1" characters aligned with ``target``. Only the
    contiguous run of "1" at the matched end is used, so stray "1" characters
    elsewhere in the map do not change the length of the subword.

    Args:
        target: the full token.
        map: the map string. The name shadows the builtin but is kept for
            keyword-argument compatibility.

    Returns:
        ``prefix + "##"`` when the map starts with "1", ``"##" + suffix`` when
        it only ends with "1", and ``None`` when neither end is marked
        (including the empty map).

    Maps with "1" on both ends are yet to be implemented; they currently yield
    the prefix only.
    """
    if map.startswith("1"):
        leading = len(map) - len(map.lstrip("1"))  # length of the leading run
        return target[:leading] + "##"
    if map.endswith("1"):
        trailing = len(map) - len(map.rstrip("1"))  # length of the trailing run
        return "##" + target[-trailing:]
    return None
# Smoke check: a map ending in two 1s selects the last two letters -> "##en".
print(map_subword("verstehen", "000000011"))

##en
