In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import cytoolz as tlz

In [3]:
from text2math import raw2text as r2t
from text2math import text2tokens as t2t
from text2math import tokens2numbers as t2n
import text2math as txt2m

# The Text

Posts from [RaspberryPi StackExchange](http://raspberrypi.stackexchange.com/)

Stored as a single large XML file.

24,925 posts

**`get_text_from_xml_file`** extracts the Title and Body for each post.

In [4]:
# Stream of post texts (Title + Body) read from the XML dump; it is consumed as
# an iterator further down.
# NOTE(review): the path says "stackoverflow" while the prose above says the
# posts come from RaspberryPi StackExchange -- confirm the data directory name.
TXT_STREAM = r2t.get_text_from_xml_file("../data/stackoverflow/Posts.xml")

In [5]:
# Take the first post off the stream. The builtin next() is used instead of the
# Python-2-only .next() method so the cell runs unchanged on Python 2 and 3.
TXT = next(TXT_STREAM)
print(TXT)

How do I build a GCC 4.7 toolchain for cross-compiling?


<p>I already asked this <a href="http://stackoverflow.com/questions/10973020/cross-compilation-for-raspberry-pi-in-gcc-where-to-start">question</a> on Stack Overflow, but  I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one.</p>

<p>Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes.</p>

<p>I am eager to get compiling and I would like to use the latest and the best tools.</p>



---

# Initial Cleaning of Text

## Removing HTML Tags

**`remove_html_bits`** uses BeautifulSoup and lxml to remove the HTML tags.

In [6]:
# Strip the HTML markup from the raw post.
NO_HTML = r2t.remove_html_bits(TXT)
# Print the stored result itself: re-running remove_html_bits on already
# stripped text would display something other than what later cells consume.
print(NO_HTML)

How do I build a GCC 4.7 toolchain for cross-compiling?


I already asked this question on Stack Overflow, but  I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one.
Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes.
I am eager to get compiling and I would like to use the latest and the best tools.



## Fixing text encoding

In [7]:
# Decode and repair the text encoding. This post is already clean, so the
# printed text is the same as in the previous cell.
UNICODE = r2t.decode_and_fix(NO_HTML)
print(UNICODE)

How do I build a GCC 4.7 toolchain for cross-compiling?


I already asked this question on Stack Overflow, but  I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one.
Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes.
I am eager to get compiling and I would like to use the latest and the best tools.



**Here's a better example of fixing text encoding**

In [8]:
# Deliberately garbled (mis-encoded) byte string used to demonstrate the repair.
# Do not "fix" the literal: the broken characters are the point of the example.
MESSED_UP_TEXT = b'uÌˆnicode'
print(MESSED_UP_TEXT)

uÌˆnicode


In [9]:
# One-shot repair of the garbled bytes; the cell output shows the resulting
# unicode string.
r2t.decode_and_fix(MESSED_UP_TEXT)

u'unicode'

Broken down by component:

In [10]:
# Step 1 -- presumably decodes the raw bytes to unicode (TODO confirm against
# r2t.adv_decode). The printed text is still garbled at this point.
UNI_MESSED_UP_TEXT = r2t.adv_decode(MESSED_UP_TEXT)
print(UNI_MESSED_UP_TEXT)

uÌˆnicode


In [11]:
# Step 2 -- repair the mis-encoded characters; the printed text now shows the
# intended accented letter.
CLEAN_UNI_MESSED_UP_TEXT = r2t.clean_unicode(UNI_MESSED_UP_TEXT)
print(CLEAN_UNI_MESSED_UP_TEXT)

ünicode


In [12]:
# Step 3 -- normalise the text; in the printed result the accent has been
# removed, leaving plain "unicode".
print(r2t.normize_text(CLEAN_UNI_MESSED_UP_TEXT))

unicode


## Putting All Those Steps together

In [13]:
# Full cleaning pipeline: the raw post flows through each stage top to bottom.
CLEAN_TXT = tlz.pipe(
    TXT,
    r2t.remove_html_bits,  # drop the HTML tags
    r2t.adv_decode,        # first of the three encoding-repair steps shown above
    r2t.clean_unicode,     # repair mis-encoded characters
    r2t.normize_text,      # final normalisation
)
print(CLEAN_TXT)

How do I build a GCC 4.7 toolchain for cross-compiling?


I already asked this question on Stack Overflow, but  I would like to know if anyone managed to build a GCC 4.7 toolchain for ARM cross-compilation (for a x86/x86-64 Linux host). There are many instructins for building GCC from source and many available cross-compilers for pre-4.7 GCC versions, just not the latest one.
Compiling on Rasp Pi itself works fine but is just a bit too slow for practical purposes.
I am eager to get compiling and I would like to use the latest and the best tools.



# Tokenizing

**The Goal:**

```python
@tlz.curry
def ngram_tuples(n, string, minlen=3, maxlen=25):
    """Tokenize `string` into "_"-joined n-grams.

    n      -- sliding-window size (1 = unigrams, 2 = bigrams, ...)
    string -- text to tokenize
    minlen -- passed to filter_shorter_than
    maxlen -- passed to filter_longer_than

    Returns the result of the final mapping step (the joined n-gram strings).
    """
    return tlz.pipe(string,
                    lower,
                    simple_split,
                    filter_longer_than(maxlen),
                    # Split every token, then flatten the per-token results.
                    # map_c must be given the splitter here; composing the
                    # bare `map` with the splitter would call the splitter on
                    # the whole token sequence instead of on each token.
                    compose(concat, map_c(splitter_of_words)),
                    filter_shorter_than(minlen),
                    filter_stopwords,
                    sliding_window_c(n),
                    map_c(join_strings("_")))
```

**`lower`**

In [14]:
# Lower-case the whole cleaned text.
LOW_TXT = t2t.lower(CLEAN_TXT)
print(LOW_TXT)

how do i build a gcc 4.7 toolchain for cross-compiling?


i already asked this question on stack overflow, but  i would like to know if anyone managed to build a gcc 4.7 toolchain for arm cross-compilation (for a x86/x86-64 linux host). there are many instructins for building gcc from source and many available cross-compilers for pre-4.7 gcc versions, just not the latest one.
compiling on rasp pi itself works fine but is just a bit too slow for practical purposes.
i am eager to get compiling and i would like to use the latest and the best tools.



**`simple_split`**

In [15]:
# First, coarse split into tokens. Punctuation is still attached at this stage
# (the output contains tokens such as u'cross-compiling?' and u'host).').
SMPL_SPLIT_TXT = t2t.simple_split(LOW_TXT)
print(SMPL_SPLIT_TXT)

[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'cross-compiling?', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow,', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'arm', u'cross-compilation', u'(for', u'a', u'x86/x86-64', u'linux', u'host).', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross-compilers', u'for', u'pre-4.7', u'gcc', u'versions,', u'just', u'not', u'the', u'latest', u'one.', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes.', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools.']


In [16]:
# Apply the max-length filter with a limit of 25. No token in this post is
# affected, so the printed list matches the previous cell.
LONG_FILTERED = list(t2t.filter_longer_than(25, SMPL_SPLIT_TXT))
print(LONG_FILTERED)

[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'cross-compiling?', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow,', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4.7', u'toolchain', u'for', u'arm', u'cross-compilation', u'(for', u'a', u'x86/x86-64', u'linux', u'host).', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross-compilers', u'for', u'pre-4.7', u'gcc', u'versions,', u'just', u'not', u'the', u'latest', u'one.', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes.', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools.']


In [17]:
# Split each token into its word pieces and flatten the pieces into one list.
# mapcat(f, seq) is the map-then-concat combination in a single call.
NON_ALPHANUM_SPLIT = list(
    tlz.mapcat(t2t.splitter_of_words, LONG_FILTERED)
)
print(NON_ALPHANUM_SPLIT)

[u'how', u'do', u'i', u'build', u'a', u'gcc', u'4', u'7', u'toolchain', u'for', u'cross', u'compiling', u'', u'i', u'already', u'asked', u'this', u'question', u'on', u'stack', u'overflow', u'', u'but', u'i', u'would', u'like', u'to', u'know', u'if', u'anyone', u'managed', u'to', u'build', u'a', u'gcc', u'4', u'7', u'toolchain', u'for', u'arm', u'cross', u'compilation', u'', u'for', u'a', u'x86', u'x86', u'64', u'linux', u'host', u'', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross', u'compilers', u'for', u'pre', u'4', u'7', u'gcc', u'versions', u'', u'just', u'not', u'the', u'latest', u'one', u'', u'compiling', u'on', u'rasp', u'pi', u'itself', u'works', u'fine', u'but', u'is', u'just', u'a', u'bit', u'too', u'slow', u'for', u'practical', u'purposes', u'', u'i', u'am', u'eager', u'to', u'get', u'compiling', u'and', u'i', u'would', u'like', u'to', u'use', u'the', u'latest', u'and', u'the', u'best', u'tool

In [18]:
# Apply the min-length filter with a limit of 3. In the output the one- and
# two-character tokens and the empty strings left by the previous split are
# gone, while three-character tokens such as u'how' remain.
SHORT_FILTERED = list(t2t.filter_shorter_than(3, NON_ALPHANUM_SPLIT))
print(SHORT_FILTERED)

[u'how', u'build', u'gcc', u'toolchain', u'for', u'cross', u'compiling', u'already', u'asked', u'this', u'question', u'stack', u'overflow', u'but', u'would', u'like', u'know', u'anyone', u'managed', u'build', u'gcc', u'toolchain', u'for', u'arm', u'cross', u'compilation', u'for', u'x86', u'x86', u'linux', u'host', u'there', u'are', u'many', u'instructins', u'for', u'building', u'gcc', u'from', u'source', u'and', u'many', u'available', u'cross', u'compilers', u'for', u'pre', u'gcc', u'versions', u'just', u'not', u'the', u'latest', u'one', u'compiling', u'rasp', u'itself', u'works', u'fine', u'but', u'just', u'bit', u'too', u'slow', u'for', u'practical', u'purposes', u'eager', u'get', u'compiling', u'and', u'would', u'like', u'use', u'the', u'latest', u'and', u'the', u'best', u'tools']


In [19]:
# Remove stopwords (e.g. u'how', u'for', u'this' no longer appear in the output).
NON_STOP = list(t2t.filter_stopwords(SHORT_FILTERED))
print(NON_STOP)

[u'build', u'gcc', u'toolchain', u'cross', u'compiling', u'asked', u'question', u'stack', u'overflow', u'like', u'know', u'managed', u'build', u'gcc', u'toolchain', u'arm', u'cross', u'compilation', u'x86', u'x86', u'linux', u'host', u'instructins', u'building', u'gcc', u'source', u'available', u'cross', u'compilers', u'pre', u'gcc', u'versions', u'latest', u'compiling', u'rasp', u'works', u'fine', u'bit', u'slow', u'practical', u'purposes', u'eager', u'compiling', u'like', u'use', u'latest', u'best', u'tools']


In [20]:
# Sliding window of size 1: every token becomes its own 1-tuple.
UNI_T = list(t2t.sliding_window_c(1, NON_STOP))
print(UNI_T)

[(u'build',), (u'gcc',), (u'toolchain',), (u'cross',), (u'compiling',), (u'asked',), (u'question',), (u'stack',), (u'overflow',), (u'like',), (u'know',), (u'managed',), (u'build',), (u'gcc',), (u'toolchain',), (u'arm',), (u'cross',), (u'compilation',), (u'x86',), (u'x86',), (u'linux',), (u'host',), (u'instructins',), (u'building',), (u'gcc',), (u'source',), (u'available',), (u'cross',), (u'compilers',), (u'pre',), (u'gcc',), (u'versions',), (u'latest',), (u'compiling',), (u'rasp',), (u'works',), (u'fine',), (u'bit',), (u'slow',), (u'practical',), (u'purposes',), (u'eager',), (u'compiling',), (u'like',), (u'use',), (u'latest',), (u'best',), (u'tools',)]


In [21]:
# Sliding window of size 2: tuples of adjacent tokens (overlapping pairs).
BI_T = list(t2t.sliding_window_c(2, NON_STOP))
print(BI_T)

[(u'build', u'gcc'), (u'gcc', u'toolchain'), (u'toolchain', u'cross'), (u'cross', u'compiling'), (u'compiling', u'asked'), (u'asked', u'question'), (u'question', u'stack'), (u'stack', u'overflow'), (u'overflow', u'like'), (u'like', u'know'), (u'know', u'managed'), (u'managed', u'build'), (u'build', u'gcc'), (u'gcc', u'toolchain'), (u'toolchain', u'arm'), (u'arm', u'cross'), (u'cross', u'compilation'), (u'compilation', u'x86'), (u'x86', u'x86'), (u'x86', u'linux'), (u'linux', u'host'), (u'host', u'instructins'), (u'instructins', u'building'), (u'building', u'gcc'), (u'gcc', u'source'), (u'source', u'available'), (u'available', u'cross'), (u'cross', u'compilers'), (u'compilers', u'pre'), (u'pre', u'gcc'), (u'gcc', u'versions'), (u'versions', u'latest'), (u'latest', u'compiling'), (u'compiling', u'rasp'), (u'rasp', u'works'), (u'works', u'fine'), (u'fine', u'bit'), (u'bit', u'slow'), (u'slow', u'practical'), (u'practical', u'purposes'), (u'purposes', u'eager'), (u'eager', u'compiling'), (

In [22]:
# Join each 1-tuple into a single string token. list() materialises the result
# so the print shows the tokens even where map() is lazy (Python 3), matching
# the list(...) wrapping used by the neighbouring cells.
_UNIGRAMS = list(map(t2t.join_strings("_"), UNI_T))
print(_UNIGRAMS)

[u'build', u'gcc', u'toolchain', u'cross', u'compiling', u'asked', u'question', u'stack', u'overflow', u'like', u'know', u'managed', u'build', u'gcc', u'toolchain', u'arm', u'cross', u'compilation', u'x86', u'x86', u'linux', u'host', u'instructins', u'building', u'gcc', u'source', u'available', u'cross', u'compilers', u'pre', u'gcc', u'versions', u'latest', u'compiling', u'rasp', u'works', u'fine', u'bit', u'slow', u'practical', u'purposes', u'eager', u'compiling', u'like', u'use', u'latest', u'best', u'tools']


In [23]:
# Join each token pair with "_" to form one bigram string. list() materialises
# the result so the print shows the tokens even where map() is lazy (Python 3),
# matching the list(...) wrapping used by the neighbouring cells.
_BIGRAMS = list(map(t2t.join_strings("_"), BI_T))
print(_BIGRAMS)

[u'build_gcc', u'gcc_toolchain', u'toolchain_cross', u'cross_compiling', u'compiling_asked', u'asked_question', u'question_stack', u'stack_overflow', u'overflow_like', u'like_know', u'know_managed', u'managed_build', u'build_gcc', u'gcc_toolchain', u'toolchain_arm', u'arm_cross', u'cross_compilation', u'compilation_x86', u'x86_x86', u'x86_linux', u'linux_host', u'host_instructins', u'instructins_building', u'building_gcc', u'gcc_source', u'source_available', u'available_cross', u'cross_compilers', u'compilers_pre', u'pre_gcc', u'gcc_versions', u'versions_latest', u'latest_compiling', u'compiling_rasp', u'rasp_works', u'works_fine', u'fine_bit', u'bit_slow', u'slow_practical', u'practical_purposes', u'purposes_eager', u'eager_compiling', u'compiling_like', u'like_use', u'use_latest', u'latest_best', u'best_tools']


### Putting it together

In [24]:
# The whole tokenizing pipeline in one expression, with a window of 3.
# Helpers called with only their first argument are curried: they return a
# function that is still waiting for the token sequence.
TRIGRAMS = tuple(
    tlz.pipe(
        CLEAN_TXT,
        t2t.lower,
        t2t.simple_split,
        t2t.filter_longer_than(25),  # curried: max token length
        # split every token, then flatten the per-token pieces
        tlz.compose(tlz.concat, t2t.map_c(t2t.splitter_of_words)),
        t2t.filter_shorter_than(3),  # curried: min token length
        t2t.filter_stopwords,
        t2t.sliding_window_c(3),  # curried: trigram window
        t2t.map_c(t2t.join_strings("_")),  # curried twice: joiner inside mapper
    )
)
print(TRIGRAMS)

(u'build_gcc_toolchain', u'gcc_toolchain_cross', u'toolchain_cross_compiling', u'cross_compiling_asked', u'compiling_asked_question', u'asked_question_stack', u'question_stack_overflow', u'stack_overflow_like', u'overflow_like_know', u'like_know_managed', u'know_managed_build', u'managed_build_gcc', u'build_gcc_toolchain', u'gcc_toolchain_arm', u'toolchain_arm_cross', u'arm_cross_compilation', u'cross_compilation_x86', u'compilation_x86_x86', u'x86_x86_linux', u'x86_linux_host', u'linux_host_instructins', u'host_instructins_building', u'instructins_building_gcc', u'building_gcc_source', u'gcc_source_available', u'source_available_cross', u'available_cross_compilers', u'cross_compilers_pre', u'compilers_pre_gcc', u'pre_gcc_versions', u'gcc_versions_latest', u'versions_latest_compiling', u'latest_compiling_rasp', u'compiling_rasp_works', u'rasp_works_fine', u'works_fine_bit', u'fine_bit_slow', u'bit_slow_practical', u'slow_practical_purposes', u'practical_purposes_eager', u'purposes_eage

# Counts

In [25]:
# Count the unigram tokens of the sample post; the output is a list of
# (token, count) pairs.
print(t2n.freq(list(t2t.unigram(CLEAN_TXT))))

[(u'pre', 1), (u'managed', 1), (u'overflow', 1), (u'purposes', 1), (u'linux', 1), (u'tools', 1), (u'arm', 1), (u'best', 1), (u'gcc', 4), (u'slow', 1), (u'source', 1), (u'fine', 1), (u'question', 1), (u'cross', 3), (u'eager', 1), (u'compiling', 3), (u'build', 2), (u'rasp', 1), (u'compilers', 1), (u'available', 1), (u'x86', 2), (u'compilation', 1), (u'use', 1), (u'host', 1), (u'know', 1), (u'bit', 1), (u'stack', 1), (u'building', 1), (u'toolchain', 2), (u'like', 2), (u'versions', 1), (u'practical', 1), (u'instructins', 1), (u'works', 1), (u'asked', 1), (u'latest', 2)]


In [5]:
# Raw post -> tuple of tokens. compose() applies its arguments right to left,
# so the stages below execute from the bottom up.
tkn_maker = tlz.compose(
    tuple,                      # 4. materialise the tokens
    t2t.uni_and_bigram_tuples,  # 3. tokenize
    r2t.decode_and_fix,         # 2. decode / repair the encoding
    r2t.remove_html_bits,       # 1. strip the HTML tags
)

In [6]:
# Read every post into an in-memory list so the same data can be fed to both
# the serial and the parallel runs below.
POST_STREAM = list(r2t.get_text_from_xml_file("../data/stackoverflow/Posts.xml"))

**Serial Version**

In [29]:
# Serial baseline: tokenize the posts with tlz.map in this process, then
# aggregate the counts.
%time OUT0 = txt2m.total_counts(tlz.map(tkn_maker, POST_STREAM))

CPU times: user 1min 9s, sys: 583 ms, total: 1min 10s
Wall time: 1min 10s


In [7]:
# Same serial computation written as a pipe; note that it overwrites OUT0.
%time OUT0 = tlz.pipe(tlz.map(tkn_maker, POST_STREAM), txt2m.total_counts)

CPU times: user 1min 16s, sys: 1.16 s, total: 1min 17s
Wall time: 1min 20s


**Parallel Version**

In [9]:
from multiprocessing import Pool
# Pool of 8 worker processes. It is never closed in this notebook because the
# following cells keep using it.
p = Pool(8)
# Bound method used as a parallel stand-in for map(func, iterable).
pmap = p.map

In [10]:
# Parallel run: tokenize the posts across the worker pool, then aggregate the
# counts in this process.
%time OUT1 = txt2m.total_counts(pmap(tkn_maker, POST_STREAM))

CPU times: user 2.86 s, sys: 680 ms, total: 3.54 s
Wall time: 34.9 s


In [10]:
# Same parallel computation written as a pipe; note that it overwrites OUT1.
%time OUT1 = tlz.pipe(pmap(tkn_maker, POST_STREAM), txt2m.total_counts)

CPU times: user 3.51 s, sys: 907 ms, total: 4.42 s
Wall time: 36.8 s


In [11]:
# Number of entries in the aggregated counts (one entry per counted token).
len(OUT1)

789541

In [17]:
# Rank the (token, count) pairs by count, highest first, and show ranks
# 250-299 -- a slice past the most frequent tokens.
sorted(
    OUT1,
    key=lambda token_count: token_count[1],
    reverse=True,
)[250:300]

[(u'problems', 1066),
 (u'wrong', 1066),
 (u'enable', 1065),
 (u'lot', 1065),
 (u'powered', 1060),
 (u'won', 1056),
 (u'long', 1055),
 (u'download', 1035),
 (u'best', 1033),
 (u'info', 1021),
 (u'service', 1016),
 (u'similar', 1009),
 (u'idea', 1003),
 (u'interfaces', 1003),
 (u'sound', 997),
 (u'commands', 996),
 (u'return', 985),
 (u'gpio_pins', 984),
 (u'gui', 983),
 (u'function', 981),
 (u'media', 968),
 (u'include', 965),
 (u'reading', 964),
 (u'worked', 961),
 (u'ports', 961),
 (u'connecting', 951),
 (u'signal', 948),
 (u'configure', 948),
 (u'light', 947),
 (u'raspbmc', 946),
 (u'motion', 946),
 (u'debug', 938),
 (u'daemon', 936),
 (u'free', 935),
 (u'message', 935),
 (u'remove', 934),
 (u'specific', 932),
 (u'copy', 930),
 (u'little', 930),
 (u'post', 922),
 (u'isn', 919),
 (u'machine', 919),
 (u'format', 912),
 (u'order', 912),
 (u'standard', 911),
 (u'correct', 909),
 (u'load', 907),
 (u'maybe', 894),
 (u'fix', 889),
 (u'cards', 883)]

In [None]:
tkn_maker_uni = tlz.compose(tuple, t2t., r2t.decode_and_fix, r2t.remove_html_bits)
%time OUT2 = txt2m.total_counts(pmap(tkn_maker_uni, POST_STREAM))