In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import io
import itertools as itls

In [3]:
import cytoolz as tlz
from cytoolz import curried as tlzc

# Curried version of itertools.starmap: calling it with only the function
# returns a callable that is still waiting for the iterable.
c_starmap = tlz.curry(itls.starmap)

In [4]:
from text2math import raw2text as r2t
from text2math import text2tokens as t2t

In [5]:
from h_topic_model import textproc_utils as tpu

---

In [6]:
# Input file read by load_file_lines below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
SEGMENTED_SENTS_FILE = "/Users/steven_c/projects/h_topic_model/data/labled_data/ex_segments/ex_sentences_w_segmented_words.txt"

In [7]:
# Output file: the annotation lines are written here in the last cell of the notebook.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
ANNOTATIONS_FILE = "/Users/steven_c/projects/h_topic_model/data/labled_data/traindata.txt"

---

In [8]:
def load_file_lines(filename):
    """
    Reads `filename` and returns its lines as a list.

    Each line is passed through text2math.raw2text.adv_decode before being
    returned.
    """
    with open(filename) as f:
        raw_lines = f.readlines()
    return list(map(r2t.adv_decode, raw_lines))

In [9]:
# Load every line of the input file; the trailing expression displays how many were read.
LINES = load_file_lines(SEGMENTED_SENTS_FILE)
len(LINES)

89365

---

## Parsing out the tokens

In [10]:
# Marker that get_seg_tokens substitutes for OGMARKER before lines are split and cleaned.
# NOTE(review): presumably an arbitrary hex string chosen because it survives
# punctuation removal and is unlikely to occur in real text -- confirm.
SPLITMARKER = u"f60968a6d89e"

In [11]:
# Marker that get_seg_tokens replaces with SPLITMARKER by default.
# NOTE(review): presumably the character marking segment boundaries in the input file -- confirm.
OGMARKER = u"~"

In [26]:
def clean_txt(txt):
    """
    Strips most punctuation from `txt`.

    Runs text2math.text2tokens.drop_punct first and feeds its result to
    h_topic_model.textproc_utils.remove_h_punct (called with replace=u"").
    """
    strip_h_punct = tpu.remove_h_punct(replace=u"")
    return strip_h_punct(t2t.drop_punct(txt))

In [28]:
def get_seg_tokens(lines, ogmarker=OGMARKER, segmarker=SPLITMARKER):
    """
    Collects the cleaned, segmented tokens found in `lines`.

    For every line, `ogmarker` is swapped for `segmarker` and the line is
    split with h_topic_model.textproc_utils.basic_split.  Tokens that do not
    contain `segmarker` are skipped, the rest are passed through clean_txt,
    and tokens that come back falsy (e.g. empty) are dropped.

    Returns a list of the remaining tokens, in input order.
    """
    seg_tokens = []
    for line in lines:
        marked_line = line.replace(ogmarker, segmarker)
        for token in tpu.basic_split(marked_line):
            if segmarker not in token:
                continue  # not a segmented token
            cleaned = clean_txt(token)
            if cleaned:
                seg_tokens.append(cleaned)
    return seg_tokens

In [15]:
# Extract the cleaned, segmented tokens from all lines (timed), then display how many there are.
%time TOKENS = get_seg_tokens(LINES)
len(TOKENS)

CPU times: user 35.3 s, sys: 637 ms, total: 35.9 s
Wall time: 37.5 s


1178441

---

## Finding unique tokens

In [34]:
def unique(seq):
    """
    Returns the distinct items of `seq` as a list, in first-seen order.

    The items must be hashable.  Unlike a plain list(set(seq)), the order of
    the result does not depend on hashing, so repeated runs over the same
    input give the same list (and therefore the same downstream output).
    """
    seen = set()
    distinct = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            distinct.append(item)
    return distinct

In [43]:
# De-duplicate the tokens (timed) and display the number of distinct tokens.
%time UTOKENS = unique(TOKENS)
len(UTOKENS)

CPU times: user 301 ms, sys: 2.45 ms, total: 304 ms
Wall time: 310 ms


104582

---

## Putting together compounds and annotations

In [430]:
def replace_seg_marker(txt, replacement, segmarker=SPLITMARKER):
    """Returns `txt` with every occurrence of `segmarker` replaced by `replacement`."""
    swapped = txt.replace(segmarker, replacement)
    return swapped

In [431]:
def merg_segs(txt, segmarker=SPLITMARKER):
    """Joins the segments of `txt` by deleting every occurrence of `segmarker`."""
    return replace_seg_marker(txt, replacement=u"", segmarker=segmarker)

In [432]:
def seg_marker_to_space(txt, segmarker=SPLITMARKER, replacement=u" "):
    """
    Returns `txt` with every `segmarker` turned into `replacement`
    (a single space unless overridden).
    """
    return replace_seg_marker(txt, replacement=replacement, segmarker=segmarker)

In [433]:
def make_comps_and_annots(seq):
    """
    Groups the tokens in `seq` by their merged (marker-free) form.

    Returns a list of (compound, tokens) pairs, where `compound` is
    merg_segs(token) and `tokens` is the list of items of `seq` that share
    that compound.
    """
    grouped = tlz.groupby(merg_segs, seq)
    # dict.items() exists on both Python 2 and 3, whereas dict.iteritems()
    # is Python 2 only and raises AttributeError on Python 3.
    return list(grouped.items())

In [435]:
# Group the unique tokens by their merged form, i.e. the token with all segment markers removed (timed).
%time GROUPED_ON_COMPS = make_comps_and_annots(UTOKENS)

CPU times: user 294 ms, sys: 29.3 ms, total: 323 ms
Wall time: 386 ms


In [436]:
def filter_non_annots(seq):
    """
    Keeps only real annotations for each compound.

    `seq` is an iterable of (compound, tokens) pairs.  For every pair, tokens
    equal to the compound itself are removed; pairs left with no tokens are
    dropped entirely.

    Returns a list of (compound, annotations) tuples where `annotations` is a
    non-empty list.
    """
    kept = []
    for compound, tokens in seq:
        # Build a real list: a bare filter(...) is a lazy, always-truthy
        # object on Python 3, which would defeat the emptiness check below
        # and break later len() calls on the annotations.
        annotations = [t for t in tokens if t != compound]
        if annotations:
            kept.append((compound, annotations))
    return kept

In [440]:
# Remove tokens identical to their compound and drop compounds left without annotations (timed),
# then display how many compounds remain.
%time COMPOUNDS_AND_ANNOTATIONS = filter_non_annots(GROUPED_ON_COMPS)
len(COMPOUNDS_AND_ANNOTATIONS)

CPU times: user 220 ms, sys: 10.1 ms, total: 230 ms
Wall time: 229 ms


104580

In [441]:
%time MULTI = list(filter(lambda t: len(tlz.second(t)) > 1, COMPOUNDS_AND_ANNOTATIONS))
len(MULTI)

CPU times: user 55.3 ms, sys: 17.4 ms, total: 72.7 ms
Wall time: 61.7 ms


2

---

## Make Annotation Strings

### [Annotation file](http://morfessor.readthedocs.io/en/latest/filetypes.html#annotation-file)

An annotation file contains one compound and one or more annotations per
compound on each line. The separators between the annotations (default ', ')
and between the constructions (default ' ') are configurable.

**Specification**

```
<compound> <analysis1construction1>[ <analysis1constructionN>][, <analysis2construction1> [<analysis2constructionN>]*]*
```

**Example**
```
kahvikakku kahvi kakku, kahvi kak ku
kahvikilon kahvi kilon
kahvikoneemme kahvi konee mme, kah vi ko nee mme
```

In [442]:
def format_comp_and_annot(seq):
    """
    Builds "<compound> <annotation1>, <annotation2>, ..." from `seq`.

    seq[0] is the compound and seq[1] an iterable of its annotation strings;
    annotations are separated by ", " and the compound is separated from them
    by a single space.
    """
    compound = seq[0]
    annotations = ", ".join(seq[1])
    return u" ".join((compound, annotations))

In [443]:
def comp_and_annot_string(seq):
    """
    Turns one (compound, annotations) pair into an annotation-file line.

    The pair is formatted with format_comp_and_annot and the segment markers
    in the result are then replaced via seg_marker_to_space.
    """
    formatted = format_comp_and_annot(seq)
    return seg_marker_to_space(formatted)

In [444]:
def comp_and_annot_strings(seq):
    """
    Returns a lazy iterator over the annotation-file line of each
    (compound, annotations) pair in `seq`.
    """
    return (comp_and_annot_string(pair) for pair in seq)

In [448]:
# Render every (compound, annotations) pair as one annotation-file line (timed).
%time ANC_STRINGS = list(comp_and_annot_strings(COMPOUNDS_AND_ANNOTATIONS))

CPU times: user 212 ms, sys: 5.32 ms, total: 217 ms
Wall time: 241 ms


---

In [416]:
# Write one annotation line per compound to ANNOTATIONS_FILE as UTF-8.
# io.open encodes on write, so the text lines are written directly; formatting
# an already-encoded line into a str would emit the b'...' repr on Python 3.
# The file is only written, never read back, so plain "w" is enough.
with io.open(ANNOTATIONS_FILE, "w", encoding="utf-8") as f:
    for line in ANC_STRINGS:
        f.write(line + u"\n")