In [1]:
# Path to the MS MARCO document collection: a ~22GB tab-separated file that
# contains the text of every document. This is an absolute path on the
# author's machine; point it at wherever the file lives locally.
msdocs_path = r'/home/ubuntu/efs/data/msmarco_docs/msmarco-docs.tsv'

In [None]:
# Peek at the first few records instead of loading the whole 22GB file.
# Each line holds four tab-separated fields: docid, url, title and text.
# The encoding is given explicitly so decoding does not depend on the locale.
with open(msdocs_path, 'rt', encoding='utf-8') as dfile:
    docs = []
    # zip() with the range first stops after 5 lines, or earlier at end of
    # file, so a short file no longer fails on an empty readline() result.
    for _, line in zip(range(5), dfile):
        # Drop the line terminator so it does not end up inside 'text'.
        # maxsplit=3 keeps any stray tab inside the text field intact.
        # A line with fewer than four fields raises ValueError here.
        docid, url, title, text = line.rstrip('\n').split('\t', 3)
        docs.append({'docid': docid, 'url': url, 'title': title, 'text': text})

In [None]:
# Display the fourth sampled record (docid, url, title, text).
docs[3]

In [2]:
import nltk
import nltk.tokenize as tk
import numpy as np
import transformers

# Must run this once before using sentence tokenizer.
# nltk.download('punkt')

In [3]:
# Load the T5 tokenizer published with the castorini/doc2query-t5-base-msmarco
# checkpoint. split_doc uses it to encode sentences and to measure their
# length in tokens.
t5_tokenizer = transformers.T5Tokenizer.from_pretrained(
    'castorini/doc2query-t5-base-msmarco')

In [6]:
# Look up the vocabulary ID that the tokenizer assigns to its padding token.
t5_tokenizer.convert_tokens_to_ids(t5_tokenizer.pad_token)

0

In [7]:
# Reverse lookup: check which token string vocabulary ID 0 maps to.
t5_tokenizer.convert_ids_to_tokens(0)

'<pad>'

In [116]:
def split_doc(doc, max_len=512, tgt_len=256, min_len=128,
              tokenize_fn=None, verbose=True):
    """Splits a document into token-ID passages at sentence boundaries.

    Sentences are packed greedily into the current passage. A passage is
    closed as soon as adding a sentence moves it past `tgt_len` while still
    fitting in `max_len`. When a sentence would overflow `max_len`, the
    current passage is closed first if it is already longer than `min_len`;
    otherwise the sentence is cut so the passage is filled to exactly
    `max_len`. Any remainder longer than `max_len` is chopped into
    `max_len`-sized chunks. Every token of the input appears in exactly one
    passage, in order. The final passage may be shorter than `min_len`.

    Args
        doc: String, the document to be split.
        max_len: int, the maximum allowed passage length in tokens.
            Sentences are split if needed to stay at or below this limit.
            Must be at least 1.
        tgt_len: int, target passage length. No further sentences are added
            to a passage once it is longer than this. Values above
            `max_len` are treated as `max_len`.
        min_len: int, a passage that is not yet longer than this is topped
            up with part of the next sentence rather than being closed
            early.
        tokenize_fn: optional callable that takes `doc` and returns a
            sequence with one sequence of token IDs per sentence. When
            None, sentences come from `tk.sent_tokenize` and are encoded
            with the notebook-level `t5_tokenizer`.
        verbose: bool, when True (the default) prints the total number of
            tokens in the document.

    Returns
        A list of dictionaries, one per passage. The keys are:
            position: passage position within the document, starting at 0.
            input_ids: list of token ID values for the passage.
            attention_mask: list of ones with the same length as
                input_ids.

    Raises
        ValueError: if `max_len` is smaller than 1.
    """
    if max_len < 1:
        # With max_len == 0 the chunking loop below would never shrink the
        # remainder and would spin forever.
        raise ValueError('max_len must be at least 1')
    # A target above the hard limit would let passages exceed max_len.
    tgt_len = min(tgt_len, max_len)

    if tokenize_fn is None:
        # NOTE(review): the decoded sample output in this notebook shows
        # '</s>' after every sentence, so the tokenizer appears to append an
        # end-of-sequence ID per sentence. Confirm that mid-passage EOS
        # tokens are intended for the downstream model.
        tokenized_sentences = [t5_tokenizer(sentence)['input_ids']
                               for sentence in tk.sent_tokenize(doc)]
    else:
        tokenized_sentences = tokenize_fn(doc)

    if verbose:
        print('Total Tokens:', sum(len(ids) for ids in tokenized_sentences))

    passage = []
    passages = []
    for sentence_ids in tokenized_sentences:
        curr_len = len(passage) + len(sentence_ids)
        if curr_len <= tgt_len:
            # Not long enough yet: keep accumulating sentences.
            passage.extend(sentence_ids)
        elif curr_len <= max_len:
            # Past the target but within the limit: close the passage.
            passage.extend(sentence_ids)
            passages.append(passage)
            passage = []
        else:
            # Adding the whole sentence would exceed max_len.
            if len(passage) > min_len:
                # Current passage is long enough; the sentence starts the
                # next one. Copy it so later extend() calls do not mutate
                # the tokenized sentence itself.
                passages.append(passage)
                passage = list(sentence_ids)
            else:
                # Passage too short to close: fill it to exactly max_len
                # with the head of the sentence and carry the tail over.
                sentence_break = max_len - len(passage)
                passage.extend(sentence_ids[:sentence_break])
                passages.append(passage)
                passage = list(sentence_ids[sentence_break:])
            # The carried-over part can itself be longer than max_len.
            while len(passage) > max_len:
                passages.append(passage[:max_len])
                passage = passage[max_len:]
    if passage:
        # Flush whatever is left after the last sentence.
        passages.append(passage)
    return [{'position': pos,
             'input_ids': passage,
             'attention_mask': [1] * len(passage)}
            for pos, passage in enumerate(passages)]

In [121]:
# Split one sample document and check that no tokens were lost: the summed
# passage lengths should equal the 'Total Tokens' figure printed by split_doc.
passages = split_doc(docs[3]['text'])
# Keep the per-passage lengths in a name; previously the list was built,
# discarded, and then recomputed for the sum.
passage_lens = [len(psg['input_ids']) for psg in passages]
sum(passage_lens)

Total Tokens: 1753


1753

In [122]:
# Decode each passage's token IDs back to text to eyeball where the passage
# boundaries fall.
t5_tokenizer.batch_decode([psg['input_ids'] for psg in passages])

['Global Business Dubai Opens a Tower to Beat All By LANDON THOMAS Jr. JAN. 4, 2010A visitor gets a view of Dubai from the 124th floor of Burj Khalifa, the world’s tallest building, on Monday.</s> Ali Haider/European Pressphoto Agency Burdened by debt and a devastating real estate crash, Dubai is doing what it does best: doubling down.</s> Just one month after a close brush with bankruptcy, Dubai celebrated the opening of the world’s tallest building on Monday — a rocket-shaped edifice that soars 2,717 feet and has views that reach 60 miles.</s> The glittering celebration may have been an attempt by Dubai’s ruler, Sheik Mohammed bin Rashid al-Maktoum, to shift the focus from Dubai’s current economic troubles to a future filled with more promise.</s> All the same, the tower’s success by no means signals a recovery in Dubai’s beaten-down real estate market, where prices have collapsed by as much as 50 percent and many developers are having trouble finding occupants for their buildings.</

In [111]:
# Display the raw text of the second sampled document.
docs[1]['text']

'"School-Age Kids Growth & Development Developmental Milestones and Your 8-Year-Old Child8-Year-Olds Are Expanding Their Worlds By Katherine Lee | Reviewed by Joel Forman, MDUpdated February 10, 2018Share Pin Email Print Eight-year-olds are becoming more confident about themselves and who they are. At age 8, your child will likely have developed some interests and hobbies and will know what he or she likes or doesn\'t like. At the same time, children this age are learning more about the world at large and are also better able to navigate social relationships with others more independently, with less guidance from parents. At home, 8-year-olds are able to tackle more complicated household chores and take on more responsibility for taking care of themselves, even helping out with younger siblings. In general, according to the CDC, these are some changes you may see in your child: Shows more independence from parents and family. Starts to think about the future. Understands more about his

In [123]:
# Show the full passage dictionaries (position, input_ids, attention_mask).
passages

[{'position': 0,
  'input_ids': [3699,
   1769,
   9959,
   2384,
   7,
   3,
   9,
   10677,
   12,
   14117,
   432,
   938,
   3,
   21147,
   4170,
   332,
   6299,
   23010,
   8206,
   5,
   446,
   5033,
   5,
   6464,
   2735,
   188,
   7019,
   2347,
   3,
   9,
   903,
   13,
   9959,
   45,
   8,
   3,
   22504,
   189,
   1501,
   13,
   4152,
   354,
   12877,
   40,
   99,
   9,
   6,
   8,
   296,
   22,
   7,
   5065,
   222,
   740,
   6,
   30,
   2089,
   5,
   1,
   5429,
   13758,
   588,
   87,
   11351,
   152,
   3373,
   17720,
   7038,
   4152,
   26,
   4632,
   57,
   2814,
   11,
   3,
   9,
   18827,
   490,
   2052,
   8420,
   6,
   9959,
   19,
   692,
   125,
   34,
   405,
   200,
   10,
   3,
   30421,
   323,
   5,
   1,
   1142,
   80,
   847,
   227,
   3,
   9,
   885,
   6453,
   28,
   14160,
   6,
   9959,
   9443,
   8,
   2101,
   13,
   8,
   296,
   22,
   7,
   5065,
   222,
   740,
   30,
   2089,
   3,
   318,
   3,
   9,
   15721,
   