Lets see what we can do with a wordmap (plot, noise reduction and possibly partitioning into morphemes?)

In [193]:
import wordmapper
import text_utilities as tu
import numpy as np # arrays
import scipy.stats as stats # arithmetics
import matplotlib.pyplot as pp # plotting

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [194]:
pc = tu.PosCorpus('../data/experiment/verbs/joined/')

targets = "beschissen,stehen,anschauen,viertelt,verunglimpfst"
wm_mt = []
for t in targets.split(","):
    wm = wordmapper.WordMapper(t, pc.counted_corpus)
    mt = wordmapper.MapToken(t, wm.wordmap, pc.metrics)
    wm_mt.append((mt.token, mt.wordmap))

TypeError: __init__() takes 3 positional arguments but 4 were given

In [109]:
def plot_targets(maptokens, absolute=False, zscore=False, med=False, std=False, mean=False, mmmix=False, zeroline=False, derive=False):
    for (target, wm) in maptokens:

        arr = np.array(wm)
        wm_med = np.array([np.median(wm) for i in wm])
        wm_mea = np.array([np.mean(wm) for i in wm])
        wm_std = np.array([np.std(wm) for i in wm])
        wm_mix = [(x+y)/2 for x, y, in zip(wm_mea, wm_med)]
        wm_zero = [0 for i in range(len(arr))]
        wm_derive = wm_derive = np.array(stats.zscore(tu.derive_wordmap(wm)))
        wm_zscore = np.array(stats.zscore(arr))

        if absolute:
            pp.plot(range(len(arr)), arr)
        if med:
            pp.plot(range(len(arr)), wm_med)
        if mean:
            pp.plot(range(len(arr)), wm_mea)
        if std:
            pp.plot(range(len(arr)), wm_std)
        if mmmix:
            pp.plot(range(len(arr)), wm_mix)
        if zscore:
            pp.plot(range(len(arr)), wm_zscore)
        if zeroline:
            pp.plot(range(len(arr)), wm_zero)
        if derive:
            pp.plot(range(len(arr)), wm_derive)


    pp.xlabel('Character positions')
    pp.ylabel('Frequency')
    pp.xticks(np.arange(stop=len(arr),step=1), labels=list(target))
    pp.savefig("/home/gnom/Pictures/wordmaps/zscore" + target + ".png")
    pp.clf()

In [113]:
for (x,y) in wm_mt:
    plot_targets(
        maptokens=[(x,y)],
        absolute=False,
        med=  False,
        mmmix=False,
        mean= False,
        zscore=  True,
        zeroline=True,
        derive=  True
    )

<Figure size 640x480 with 0 Axes>

In [72]:
def signaltonoise(a, axis=0, ddof=0):
    """
    The signal-to-noise ratio of the input data.
    Returns the signal-to-noise ratio of `a`, here defined as the mean
    divided by the standard deviation.
    Parameters
    ----------
    a : array_like
        An array_like object containing the sample data.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    ddof : int, optional
        Degrees of freedom correction for standard deviation. Default is 0.
    Returns
    -------
    s2n : ndarray
        The mean to standard deviation ratio(s) along `axis`, or 0 where the
        standard deviation is 0.
    """
    a = np.asanyarray(a)
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    return np.where(sd == 0, 0, m/sd)

In [52]:
print(signaltonoise(wordmap1))

0.6054367328205298


In [27]:
S = np.fft.fft(mt.wordmap)

In [None]:
t = np.arange(8)
#s = np.sin(0.15*2*np.pi*t)

S_mag = np.abs(S)
S_phase = np.angle(S)
pp.plot(t,S_mag,'.-')
pp.plot(t,S_phase,'.-')

In [None]:
Fs = 1 # Hz
N = 8 # number of points to simulate, and our FFT size

t = np.arange(N) # because our sample rate is 1 Hz

S = np.fft.fftshift(np.fft.fft(mt.wordmap))
S_mag = np.abs(S)
S_phase = np.angle(S)
f = np.arange(Fs/-2, Fs/2, Fs/N)
pp.figure(0)
pp.plot(f, S_mag,'.-')
pp.figure(1)
pp.plot(f, S_phase,'.-')
pp.show()

In [211]:
from tqdm import tqdm as tq

lex_morphemes = []
fun_morphemes = []


for k in pc.counted_corpus:
    for v in tq(pc.counted_corpus[k]):
        wm = wordmapper.WordMapper(v, pc.counted_corpus)
        mt = wordmapper.MapToken(v, wm.wordmap)
        lex_morphemes.append(mt.stem)
        fun_morphemes.extend(mt.affix)

#with open("../new_tokenizer/lex_vocab.txt", encoding="utf8", mode="w") as lv:

100%|██████████| 28675/28675 [4:20:50<00:00,  1.83it/s]  
100%|██████████| 24964/24964 [2:55:14<00:00,  2.37it/s]  
100%|██████████| 14169/14169 [2:13:02<00:00,  1.78it/s] 
100%|██████████| 4257/4257 [40:48<00:00,  1.74it/s]
100%|██████████| 5008/5008 [36:24<00:00,  2.29it/s]
100%|██████████| 974/974 [09:43<00:00,  1.67it/s]
100%|██████████| 182/182 [01:52<00:00,  1.62it/s]
100%|██████████| 32/32 [00:20<00:00,  1.59it/s]


In [219]:
with open("../new_tokenizer/lex_vocab.txt", encoding="utf8", mode="w") as lv:
    lv.write("".join(s + "\n" for s in set(lex_morphemes)))
    lv.close()

with open("../new_tokenizer/fun_vocab.txt", encoding="utf8", mode="w") as fv:
    fv.write("".join([s + "\n" for s in set(fun_morphemes)]))
    fv.close()

In [163]:
def zip_wordmap(t: str, wm: list):
    "t = target string, wm = boolean wordmap."
    stem = "".join([c for b, c in zip(wm, t) if b])
    return stem, t.partition(stem)

In [213]:
print(lex_morphemes[:10], fun_morphemes[:10])

['inhef', 'erkram', 'nachlief', 'rwahr', 'rückstreb', 'umrechne', 'korrigier', 'ngefüll', 'betä', 'gefall'] ['e', 'te', 'v', 'e', '', 'e', 'e', 'te', '', 'te']


True

In [209]:
with open("../new_tokenizer/fun_vocab.txt", encoding="utf8", mode="w") as fv:
    fv.write("".join([s + "\n" for s in set(morfs)]))

['über', 'en']