In [142]:
#from argparse import Namespace
import re
from typing import List
import dask.bag as db
from dask import delayed
from dask.distributed import Client
import torch

In [143]:
client = Client(asynchronous=True)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 49914 instead


In [144]:
filename = 'data/p1ch4/jane-austen/1342-0.txt'

In [145]:
raw_text0 = db.read_text(filename, encoding='utf8')
raw_text1 = await client.gather(raw_text0)

In [146]:
await client.submit(lambda x: x.take(10), raw_text1)

('\ufeffThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n',
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with\n',
 'almost no restrictions whatsoever.  You may copy it, give it away or\n',
 're-use it under the terms of the Project Gutenberg License included\n',
 'with this eBook or online at www.gutenberg.org\n',
 '\n',
 '\n',
 'Title: Pride and Prejudice\n',
 '\n')

In [147]:
#we're now going to take the text of each element and convert it to a dictionary, with the key as
#"text" and the value as the text
#in the book we split the file, but we don't do that here because it's already split as a dask bag
raw_text_future0 = client.scatter(raw_text1)
#raw_text2 = client.submit(lambda x: x.map(lambda y: y.split('\n')), await raw_text_future0)
raw_text2 = client.submit(lambda x: x.map(lambda y: {'text': y}), await raw_text_future0)
raw_text3 = await client.gather(raw_text2)

In [148]:
display_future0 = client.scatter(raw_text3)
await client.submit(lambda x: x.take(25), await display_future0)

({'text': '\ufeffThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n'},
 {'text': '\n'},
 {'text': 'This eBook is for the use of anyone anywhere at no cost and with\n'},
 {'text': 'almost no restrictions whatsoever.  You may copy it, give it away or\n'},
 {'text': 're-use it under the terms of the Project Gutenberg License included\n'},
 {'text': 'with this eBook or online at www.gutenberg.org\n'},
 {'text': '\n'},
 {'text': '\n'},
 {'text': 'Title: Pride and Prejudice\n'},
 {'text': '\n'},
 {'text': 'Author: Jane Austen\n'},
 {'text': '\n'},
 {'text': 'Posting Date: August 26, 2008 [EBook #1342]\n'},
 {'text': 'Release Date: June, 1998\n'},
 {'text': 'Last Updated: March 10, 2018\n'},
 {'text': '\n'},
 {'text': 'Language: English\n'},
 {'text': '\n'},
 {'text': 'Character set encoding: UTF-8\n'},
 {'text': '\n'},
 {'text': '*** START OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***\n'},
 {'text': '\n'},
 {'text': '\n'},
 {'text': '\n'},
 {'text': '\n'})

In [210]:
#await client.submit(lambda x: x.take(25), raw_text5)
def clean_words(input_str: str) -> List:
    '''Takes an input string, removes a list of characters that will
    not work with deep learning, then returns a list'''
    punctuation = '.,;:""\'_-'
    word_list = input_str.lower().replace('\n', '').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

def add_clean_words_to_element(element):
    '''Takes an element and creates a field called cleaned text.'''
    element['cleaned text'] = clean_words(element['text'])
    return element

In [211]:
raw_text_future2 = client.scatter(raw_text3)
raw_text4 = client.submit(lambda x: x.map(add_clean_words_to_element),
                         await raw_text_future2)
raw_text5 = await client.gather(raw_text4)

In [212]:
display_future1 = client.scatter(raw_text5)
await client.submit(lambda x: x.take(15), await display_future1)

({'text': '\ufeffThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n',
  'cleaned text': ['\ufeffthe',
   'project',
   'gutenberg',
   'ebook',
   'of',
   'pride',
   'and',
   'prejudice',
   'by',
   'jane',
   'austen']},
 {'text': '\n', 'cleaned text': []},
 {'text': 'This eBook is for the use of anyone anywhere at no cost and with\n',
  'cleaned text': ['this',
   'ebook',
   'is',
   'for',
   'the',
   'use',
   'of',
   'anyone',
   'anywhere',
   'at',
   'no',
   'cost',
   'and',
   'with']},
 {'text': 'almost no restrictions whatsoever.  You may copy it, give it away or\n',
  'cleaned text': ['almost',
   'no',
   'restrictions',
   'whatsoever',
   'you',
   'may',
   'copy',
   'it',
   'give',
   'it',
   'away',
   'or']},
 {'text': 're-use it under the terms of the Project Gutenberg License included\n',
  'cleaned text': ['re-use',
   'it',
   'under',
   'the',
   'terms',
   'of',
   'the',
   'project',
   'gutenberg',
   'license',
   'included']

In [213]:
def make_word_list(element):
    element['word list'] = sorted(set(element['cleaned text']))
    return element

In [214]:
raw_text_future3 = client.scatter(raw_text5)
raw_text6 = client.submit(lambda x: x.map(make_word_list),
                         await raw_text_future3)
raw_text7 = await client.gather(raw_text6)

In [215]:
await client.submit(lambda x: x.take(3), raw_text7)

({'text': '\ufeffThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n',
  'cleaned text': ['\ufeffthe',
   'project',
   'gutenberg',
   'ebook',
   'of',
   'pride',
   'and',
   'prejudice',
   'by',
   'jane',
   'austen'],
  'word list': ['and',
   'austen',
   'by',
   'ebook',
   'gutenberg',
   'jane',
   'of',
   'prejudice',
   'pride',
   'project',
   '\ufeffthe']},
 {'text': '\n', 'cleaned text': [], 'word list': []},
 {'text': 'This eBook is for the use of anyone anywhere at no cost and with\n',
  'cleaned text': ['this',
   'ebook',
   'is',
   'for',
   'the',
   'use',
   'of',
   'anyone',
   'anywhere',
   'at',
   'no',
   'cost',
   'and',
   'with'],
  'word list': ['and',
   'anyone',
   'anywhere',
   'at',
   'cost',
   'ebook',
   'for',
   'is',
   'no',
   'of',
   'the',
   'this',
   'use',
   'with']})

In [216]:
#let's try pluck
words_in_line_future0 = client.scatter(raw_text7)
words_in_line0 = client.submit(lambda x: x.pluck('cleaned text'), await words_in_line_future0)
words_in_line1 = await client.gather(raw_text8)

In [217]:
await client.submit(lambda x: x.take(5), raw_text9)

(['\ufeffthe',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'pride',
  'and',
  'prejudice',
  'by',
  'jane',
  'austen'],
 [],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or'],
 ['re-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'])

In [226]:
words_in_line2 = await client.compute(words_in_line1)
word_list0 = await client.compute(words_in_line1.flatten())

In [227]:
word_list_future0 = client.scatter(word_list0)
word_list1 = client.submit(lambda x: sorted(list(set(x))), await word_list_future0)
word_list2 = await client.gather(word_list1)

In [228]:
#raw_text12
#word_list2
#word_list_future0 = client.scatter(word_list2)
enumerated_word_list0 = client.submit(lambda x: enumerate(x), word_list2)
enumerated_word_list1 = await client.gather(enumerated_word_list0)

In [229]:
#
#word_list_future1 = client.scatter(enumerated_word_list1)
word2indexdict0 = client.submit(lambda x: {word: i for i, word in x},
                              enumerated_word_list1)
word2indexdict1 = await client.gather(word2indexdict0)

In [230]:
len(word2indexdict1), word2indexdict1['impossible']

(8831, 3980)

In [232]:
word_t = torch.zeros(len(words_in_line2), len(word2indexdict1))

In [261]:
words_in_line3 = [word for word in words_in_line2 if word]
        #word_t[i][word_index] = 1
        #print(index, word)
for i, k in enumerate(words_in_line3[:10]):
    print(i, k)

0 ['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'pride', 'and', 'prejudice', 'by', 'jane', 'austen']
1 ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with']
2 ['almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or']
3 ['re-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included']
4 ['with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org']
5 ['title', 'pride', 'and', 'prejudice']
6 ['author', 'jane', 'austen']
7 ['posting', 'date', 'august', '26', '2008', '[ebook', '#1342]']
8 ['release', 'date', 'june', '1998']
9 ['last', 'updated', 'march', '10', '2018']


In [301]:
#word_list = [(i, words, torch.zeros(len(words), len(word2indexdict1))) for i, words in enumerate(words_in_line3)]
del word_list

In [300]:
words_in_line = ['a',
  'large',
  'party',
  'for',
  'the',
  'ball',
  'and',
  'a',
  'report',
  'soon',
  'followed',
  'that',
  'mr',
  'bingley']

In [303]:
word_t = torch.zeros(len(words_in_line), len(word2indexdict1))

In [307]:
for i, word in enumerate(words_in_line):
    word_index = word2indexdict1[word]
    word_t[i][word_index] = 1
    print("{:2} {:4} {}".format(i, word_index, word))
print(word_t.shape)

 0  153 a
 1 4485 large
 2 5633 party
 3 3160 for
 4 7630 the
 5  807 ball
 6  498 and
 7  153 a
 8 6483 report
 9 7172 soon
10 3143 followed
11 7626 that
12 5117 mr
13  959 bingley
torch.Size([14, 8831])


  cluster.close(timeout=10)
  cluster.close(timeout=10)
