In [51]:
from functools import partial
import dask.bag as db
from dask import delayed
from dask.distributed import Client
import torch

In [52]:
client = Client(asynchronous=True)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 55893 instead


In [53]:
#new_bag = db.read_text('data/p1ch4/jane-austen/1342-0.txt')
def read_text(filename: str, encoding: str) -> str:
    '''Return the entire contents of ``filename`` decoded with ``encoding``.

    Kept as a plain function so it can be wrapped for delayed execution.
    '''
    with open(filename, encoding=encoding) as handle:
        contents = handle.read()
    return contents


# Single-argument variant with the encoding pinned to UTF-8.
read_text_utf8 = partial(read_text, encoding='utf8')

In [54]:
# Build a lazy task for reading the book; nothing is read until it is computed.
lazy_read_text = delayed(read_text_utf8)
text = lazy_read_text('data/p1ch4/jane-austen/1342-0.txt')

In [58]:
# Lazily split the (delayed) text into a list of lines on '\n'.
lines = delayed(str.split)(text, '\n')

In [70]:
#lines2 = await client.compute(lines)
line0 = delayed(lambda x: x[200])(lines)
line = await client.compute(line0)

In [71]:
letter_t1 = client.submit(lambda x: torch.zeros(len(x), 128), line)
letter_t2 = await client.gather(letter_t1)

In [72]:
# Show the gathered tensor together with its shape.
letter_t2, letter_t2.shape

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 torch.Size([70, 128]))

In [78]:
# One-hot encode each character of the lower-cased, stripped line into row i
# of letter_t2. Characters whose code point is 128 or above fall back to
# column 0.
for i, letter in enumerate(line.lower().strip()):
    letter_index = ord(letter)
    if letter_index >= 128:
        letter_index = 0
    letter_t2[i, letter_index] = 1

In [79]:
# Display letter_t2 after the one-hot loop has written into it.
letter_t2

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [94]:
def clean_words(input_str):
    '''Lower-case a string, split it on whitespace and trim punctuation.

    Newlines are replaced by spaces before splitting. Every resulting token
    has leading and trailing characters from the punctuation set removed;
    characters inside a token are left untouched. Returns a list of str, in
    which a token made up only of punctuation becomes ''.
    '''
    strippable = '.,;:"“_-'
    flattened = input_str.lower().replace('\n', ' ')
    cleaned = []
    for token in flattened.split():
        cleaned.append(token.strip(strippable))
    return cleaned

In [99]:
# Wrap the function itself and then call the wrapper, so the cleaning runs
# lazily when the graph is computed. delayed(clean_words(line)) would run
# clean_words immediately and only wrap the finished list.
words_in_line0 = delayed(clean_words)(line)

In [96]:
# Display the line that was computed from index 200 of the split text.
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [100]:
words_in_line = await client.compute(words_in_line)

In [108]:
word_list = sorted(set(clean_words(await client.compute(text))))
word2index_dict = {word: i for (i, word) in enumerate(word_list)}

In [109]:
# Vocabulary size and the index assigned to the word 'impossible'.
len(word2index_dict), word2index_dict['impossible']

(8615, 4003)

In [110]:
# One row per word of the line, one column per vocabulary entry.
word_t = torch.zeros((len(words_in_line), len(word2index_dict)))
for i, word in enumerate(words_in_line):
    # Look up the word's vocabulary position and set that cell in row i.
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))

 0 4003 impossible
 1 5144 mr
 2  928 bennet
 3 4003 impossible
 4 8385 when
 5 3912 i
 6  465 am
 7 5308 not
 8  254 acquainted
 9 8463 with
10 3781 him
