Lets see what we can do with a wordmap (plot, noise reduction and possibly partitioning into morphemes?)

In [193]:
import wordmapper
import text_utilities as tu
import numpy as np # arrays
import scipy.stats as stats # arithmetics
import matplotlib.pyplot as pp # plotting

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [194]:
pc = tu.PosCorpus('../data/experiment/verbs/joined/')

targets = "beschissen,stehen,anschauen,viertelt,verunglimpfst"
wm_mt = []
for t in targets.split(","):
    wm = wordmapper.WordMapper(t, pc.counted_corpus)
    mt = wordmapper.MapToken(t, wm.wordmap, pc.metrics)
    wm_mt.append((mt.token, mt.wordmap))

TypeError: __init__() takes 3 positional arguments but 4 were given

In [109]:
def plot_targets(maptokens, absolute=False, zscore=False, med=False, std=False, mean=False, mmmix=False, zeroline=False, derive=False):
    for (target, wm) in maptokens:

        arr = np.array(wm)
        wm_med = np.array([np.median(wm) for i in wm])
        wm_mea = np.array([np.mean(wm) for i in wm])
        wm_std = np.array([np.std(wm) for i in wm])
        wm_mix = [(x+y)/2 for x, y, in zip(wm_mea, wm_med)]
        wm_zero = [0 for i in range(len(arr))]
        wm_derive = wm_derive = np.array(stats.zscore(tu.derive_wordmap(wm)))
        wm_zscore = np.array(stats.zscore(arr))

        if absolute:
            pp.plot(range(len(arr)), arr)
        if med:
            pp.plot(range(len(arr)), wm_med)
        if mean:
            pp.plot(range(len(arr)), wm_mea)
        if std:
            pp.plot(range(len(arr)), wm_std)
        if mmmix:
            pp.plot(range(len(arr)), wm_mix)
        if zscore:
            pp.plot(range(len(arr)), wm_zscore)
        if zeroline:
            pp.plot(range(len(arr)), wm_zero)
        if derive:
            pp.plot(range(len(arr)), wm_derive)


    pp.xlabel('Character positions')
    pp.ylabel('Frequency')
    pp.xticks(np.arange(stop=len(arr),step=1), labels=list(target))
    pp.savefig("/home/gnom/Pictures/wordmaps/zscore" + target + ".png")
    pp.clf()

In [113]:
for (x,y) in wm_mt:
    plot_targets(
        maptokens=[(x,y)],
        absolute=False,
        med=  False,
        mmmix=False,
        mean= False,
        zscore=  True,
        zeroline=True,
        derive=  True
    )

<Figure size 640x480 with 0 Axes>

0.6054367328205298


In [211]:
from tqdm import tqdm as tq

lex_morphemes = []
fun_morphemes = []


for k in pc.counted_corpus:
    for v in tq(pc.counted_corpus[k]):
        wm = wordmapper.WordMapper(v, pc.counted_corpus)
        mt = wordmapper.MapToken(v, wm.wordmap)
        lex_morphemes.append(mt.stem)
        fun_morphemes.extend(mt.affix)

#with open("../new_tokenizer/lex_vocab.txt", encoding="utf8", mode="w") as lv:

100%|██████████| 28675/28675 [4:20:50<00:00,  1.83it/s]  
100%|██████████| 24964/24964 [2:55:14<00:00,  2.37it/s]  
100%|██████████| 14169/14169 [2:13:02<00:00,  1.78it/s] 
100%|██████████| 4257/4257 [40:48<00:00,  1.74it/s]
100%|██████████| 5008/5008 [36:24<00:00,  2.29it/s]
100%|██████████| 974/974 [09:43<00:00,  1.67it/s]
100%|██████████| 182/182 [01:52<00:00,  1.62it/s]
100%|██████████| 32/32 [00:20<00:00,  1.59it/s]


In [276]:
with open("../new_tokenizer/lex_vocab_raw.txt", encoding="utf8", mode="w") as lv:
    lv.write("".join(s + "\n" for s in lex_morphemes))
    lv.close()

with open("../new_tokenizer/fun_vocab_raw.txt", encoding="utf8", mode="w") as fv:
    fv.write("".join([s + "\n" for s in fun_morphemes]))
    fv.close()

CONVERT FUNCTIONAL MORPHEME MAP TO HASHABLE WITH RELATIVE FREQUENCIES

In [232]:
import collections as cl

In [None]:
fm_clean = [i for i in fun_morphemes if i != ""]  # select non-empty morphemes
fm_ncount = cl.Counter(fm_clean).most_common()
n_o_fm = len(fm_clean)
fm_rel = {k:v/n_o_fm if len(k)>1 else 0 for k, v in fm_ncount } # unary morphemes get no weight
fm_rel

LEXEMIC MORPHEMES

In [None]:
lm_clean = [i for i in lex_morphemes if len(i)>1]  # select only morphemes longer than 1 character
lm_ncount = cl.Counter(lm_clean).most_common()
n_o_lm = len(lm_clean)
lm_rel = {k:v/n_o_lm if len(k)>1 else 0 for k, v in lm_ncount } # unary morphemes get no weight
lm_rel

In [273]:
[i for i in lm_clean if i.startswith("sch")]

['schwindl',
 'schnöker',
 'schleuder',
 'schleuder',
 'schlagzeil',
 'scheuch',
 'schlagnahm',
 'schmutz',
 'schlechter',
 'schlosser',
 'schmiss',
 'schütte',
 'schloss',
 'schling',
 'schücht',
 'schrei',
 'schluder',
 'schlotter',
 'schüttgeh',
 'schmäler',
 'schild',
 'schreck',
 'scherbel',
 'schrumpel',
 'schmurgel',
 'schnüff',
 'schnauf',
 'schmölz',
 'schände',
 'schott',
 'schnüff',
 'schoss',
 'schob',
 'schloss',
 'schädi',
 'schwefel',
 'schütt',
 'schmach',
 'schlacker',
 'schlimmer',
 'schnüffel',
 'schädig',
 'schaff',
 'schor',
 'schrieb',
 'schränk',
 'schaud',
 'schlu',
 'schücht',
 'schuster',
 'schwäbel',
 'schnupper',
 'schreib',
 'schüchter',
 'schlagnahm',
 'schaff',
 'schlank',
 'schrie',
 'schattier',
 'schwamm',
 'schmalz',
 'schwatz',
 'schnörkel',
 'schachte',
 'schweig',
 'schmetter',
 'schlüp',
 'schiss',
 'schlag',
 'scheffel',
 'schnüffe',
 'schätztho',
 'schuld',
 'schrumpl',
 'schmett',
 'schnitt',
 'schwäger',
 'schlimmer',
 'schien',
 'schied',
 's

True