In [10]:
import os
import ray
import numpy as np
from transformers import RobertaTokenizer
from corpus_loader import SentenceSegmenter, CorpusLoader

In [2]:
# Initialise Ray with its default settings. Nothing below uses Ray yet; see the
# "extend it with Ray" TODO in CorpusBatchIterator.__iter__.
ray.init()

2020-02-17 23:58:44,234	INFO resource_spec.py:212 -- Starting Ray with 32.42 GiB memory available for workers and up to 16.22 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '192.168.50.100',
 'redis_address': '192.168.50.100:25724',
 'object_store_address': '/tmp/ray/session_2020-02-17_23-58-44_233721_31054/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-02-17_23-58-44_233721_31054/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2020-02-17_23-58-44_233721_31054'}

In [3]:
# Load the pretrained "roberta-base" tokenizer; it is passed to the corpus loaders below.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [4]:
# Standalone loader for ad-hoc inspection (only referenced by the commented-out
# load_sector cell further down); CorpusBatchIterator constructs its own loader.
corpus_loader = CorpusLoader(tokenizer, max_seq_length=128, corpus_path="/data/SECTOR/")

8


In [5]:
class CorpusIterator:
    """Walk through every segment of every document, visiting documents in random order.

    Iterating yields ``(segment, is_first)`` tuples, where ``is_first`` is True
    only for the first segment of a document. Segments within a document keep
    their original order; the document order is reshuffled on every pass.
    """

    def __init__(self, processed_docs):
        """
        Args:
            processed_docs: indexable, sized collection of documents, each an
                iterable of segments.
        """
        self.processed_docs = processed_docs
        self.total_num_docs = len(processed_docs)

    def __iter__(self):
        # Draw a fresh random visiting order each time iteration starts.
        visit_order = np.arange(self.total_num_docs)
        np.random.shuffle(visit_order)

        for position in visit_order:
            segments = self.processed_docs[position]
            # Flag each segment with whether it opens its document.
            yield from ((segment, offset == 0) for offset, segment in enumerate(segments))

In [14]:
class CorpusBatchIterator:
    """Batch generator over a corpus stored as numbered sectors.

    ``batch_size`` independent ``CorpusIterator`` streams are opened over the
    current sector, and every batch takes one item from each stream. When a
    stream is exhausted the next sector is loaded, wrapping around after the
    last one, so iteration does not end on its own.
    """

    def __init__(self, tokenizer, corpus_path:str, batch_size:int, max_seq_length:int, rank:int = 0):
        """
        Args:
            tokenizer: tokenizer handed to ``CorpusLoader``.
            corpus_path: directory path to store the corpus sectors
            batch_size: number of parallel streams, i.e. items per batch.
            max_seq_length: forwarded to ``CorpusLoader``.
            rank: for distributed learning; used as the id of the first sector to load.
        """
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.current_sector_id = rank
        self.corpus_loader = CorpusLoader(tokenizer,
                                          max_seq_length=max_seq_length,
                                          corpus_path=corpus_path)
        # Every directory entry is counted as one sector.
        self.total_num_sectors = len(os.listdir(corpus_path))

        # TODO: process the data and save it into cache

    def __iter__(self):
        """Yield batches (lists of ``batch_size`` items) indefinitely.

        NOTE: if every sector produces no items at all, this loop keeps
        reloading sectors without ever yielding.
        """
        iterators = self.create_corpus_iterators(self.current_sector_id)

        while True:
            try:
                # TODO: extend it with Ray
                batch = [next(iterators[i]) for i in range(self.batch_size)]
            except StopIteration:
                # A stream ran dry: advance to the next sector, wrapping around.
                # The original referenced the bare name `rank`, which is not in
                # scope here and raised NameError on the first sector switch.
                self.current_sector_id = (self.current_sector_id + 1) % self.total_num_sectors
                iterators = self.create_corpus_iterators(self.current_sector_id)
                continue

            # Yield outside the try block so only the stream reads are guarded.
            yield batch

    def create_corpus_iterators(self, corpus_sector_id):
        """Load sector ``corpus_sector_id`` and open ``batch_size`` iterators over it.

        Args:
            corpus_sector_id: id passed to ``CorpusLoader.load_sector``.

        Returns:
            A list of ``batch_size`` independent ``CorpusIterator`` iterators,
            all over the same loaded documents.
        """
        # Use the argument rather than self.current_sector_id, which the
        # original read here while ignoring the parameter.
        processed_docs = self.corpus_loader.load_sector(corpus_sector_id)
        iterators = [iter(CorpusIterator(processed_docs)) for _ in range(self.batch_size)]
        return iterators

In [7]:
# processed_docs = corpus_loader.load_sector(0)

Loadding Cache


In [8]:
# iterator = CorpusIterator(processed_docs)

In [16]:
# Build a batch iterator with two parallel streams (batch_size=2); rank defaults
# to 0, so iteration starts from sector 0.
corpus_iter = CorpusBatchIterator(tokenizer, corpus_path="/data/SECTOR/", batch_size=2, max_seq_length=128)

8


In [17]:
# Pull a single batch as a sanity check; each element is a
# (segment, is_document_start) pair produced by CorpusIterator.
next(iter(corpus_iter))

Loadding Cache


[(['New',
   'ĠYork',
   'ĠGiants',
   'Ġquarterback',
   'ĠEli',
   'ĠManning',
   'Ġand',
   'Ġhis',
   'Ġwife',
   ',',
   'ĠAbby',
   ',',
   'Ġhave',
   'Ġpledged',
   'Ġ$',
   '1',
   'Ġmillion',
   'Ġto',
   'Ġthe',
   'ĠChildren',
   "'s",
   'Ġof',
   'ĠMississippi',
   'Ġ$',
   '100',
   'Ġmillion',
   'Ġcapital',
   'Ġcampaign',
   '.',
   'The',
   'ĠMann',
   'ings',
   'Ġwill',
   'Ġalso',
   'Ġlend',
   'Ġtheir',
   'Ġnames',
   'Ġand',
   'Ġfaces',
   'Ġfor',
   'Ġpublicity',
   'Ġfor',
   'Ġthe',
   'Ġcampaign',
   'Ġand',
   'Ġwill',
   'Ġserve',
   'Ġas',
   'Ġhonorary',
   'Ġchairs',
   'Ġon',
   'Ġthe',
   'Ġcampaign',
   'Ġfundraising',
   'Ġcommittee',
   '.',
   'The',
   'Ġgoal',
   'Ġof',
   'Ġthe',
   'Ġproject',
   'Ġis',
   'Ġto',
   'Ġexpand',
   'Ġand',
   'Ġupdate',
   'Ġthe',
   'Ġneon',
   'atal',
   'Ġintensive',
   'Ġcare',
   'Ġunit',
   ',',
   'Ġadd',
   'Ġmore',
   'Ġpediatric',
   'ĠIC',
   'U',
   'Ġrooms',
   'Ġand',
   'Ġsurgical',
   'Ġsuite

In [None]:
# Scratch cell: CPU count reported by the OS (presumably for sizing future Ray workers — confirm).
os.cpu_count()

In [None]:
# Scratch cell: step is 10 // 2 == 5, so this evaluates to [0, 5, 10].
list(range(0, 11, 10//2))