In [67]:
import os
from transformers import BertTokenizer, RobertaTokenizer

In [39]:
def _notebook_global(name):
    """Return the module-level variable `name`; raise NameError if it is not set."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError(f"name '{name}' is not defined") from None


def sents_to_segments(doc_sentences, tokenizer=None, max_seq_length=None):
    """Pack tokenized sentences into segments of at most `max_seq_length` tokens.

    Sentences are tokenized one at a time and greedily appended to the current
    segment. When the next sentence does not fit, the current segment is closed
    and the sentence starts a new one. A sentence that is by itself longer than
    `max_seq_length` is cut into consecutive chunks of `max_seq_length` tokens;
    its last chunk (full or partial) stays open so that following sentences can
    still be appended to it.

    Args:
        doc_sentences: iterable of sentence strings. Every sentence except the
            first is prefixed with a single space before tokenization.
        tokenizer: object exposing `tokenize(str) -> list`. Defaults to the
            notebook-level `tokenizer` variable.
        max_seq_length: maximum number of tokens per segment. Defaults to the
            notebook-level `max_seq_length` variable.

    Returns:
        A list of non-empty token lists, each no longer than `max_seq_length`.

    Raises:
        NameError: if an argument is omitted and the notebook-level variable of
            the same name has not been defined yet.
        ValueError: if `max_seq_length` is smaller than 1.
    """
    # Explicit arguments win; otherwise fall back to the notebook globals so
    # that existing one-argument calls keep working.
    if tokenizer is None:
        tokenizer = _notebook_global("tokenizer")
    if max_seq_length is None:
        max_seq_length = _notebook_global("max_seq_length")
    if max_seq_length < 1:
        raise ValueError("max_seq_length must be a positive integer")

    token_segments = []
    current_seq = []

    for count, sent in enumerate(doc_sentences):
        if count > 0:
            sent = " " + sent

        token_sent = tokenizer.tokenize(sent)

        if len(token_sent) > max_seq_length:
            # Close whatever was being accumulated before the long sentence.
            if current_seq:
                token_segments.append(current_seq)

            chunks = [
                token_sent[start:start + max_seq_length]
                for start in range(0, len(token_sent), max_seq_length)
            ]
            # Every chunk but the last is emitted as-is; the last one becomes
            # the open segment that later sentences may extend.
            token_segments.extend(chunks[:-1])
            current_seq = chunks[-1]
            continue

        if len(current_seq) + len(token_sent) > max_seq_length:
            token_segments.append(current_seq)
            current_seq = token_sent
        else:
            current_seq = current_seq + token_sent

    if current_seq:
        token_segments.append(current_seq)

    return token_segments

In [53]:
def print_segment(seg):
    """Print the string the notebook-level `tokenizer` rebuilds from the tokens in `seg`."""
    segment_text = tokenizer.convert_tokens_to_string(seg)
    print(segment_text)
    

In [69]:
# Bind the notebook-level `tokenizer` to the pretrained "bert-base-uncased" vocabulary.
# NOTE(review): a later cell rebinds `tokenizer` to a RoBERTa tokenizer, so which
# one is active depends on the order in which the cells were executed.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [81]:
# Tokenize a sample string with whichever tokenizer is currently bound.
# NOTE(review): the recorded output contains cased, 'Ġ'-prefixed tokens and this
# cell's execution count (81) is later than the RoBERTa cell's (71), so it was
# presumably run after `tokenizer` was rebound to RoBERTa — confirm and reorder.
tokenizer.tokenize("This is not good. I am a dog adsaasdasda")

['This',
 'Ġis',
 'Ġnot',
 'Ġgood',
 '.',
 'ĠI',
 'Ġam',
 'Ġa',
 'Ġdog',
 'Ġads',
 'a',
 'as',
 'd',
 'as',
 'da']

In [71]:
# Rebind the notebook-level `tokenizer` to the pretrained "roberta-base" vocabulary,
# replacing the BERT tokenizer assigned in the earlier cell.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [76]:
# Display the padding token of the currently bound tokenizer.
tokenizer.pad_token

'<pad>'

In [59]:
# Token budget per segment: sents_to_segments compares token counts against this
# notebook-level name, so it must be defined before that function is called.
max_seq_length = 10

In [63]:
# Sample document as a list of sentence strings. In the recorded outputs below the
# third sentence is spread over several segments, i.e. it exceeds max_seq_length.
doc = ["i'm good.", "it not only a time.", "asdasd asko koa asdkaok asdkoas asdkoaskd asdkasodka asdko", "not my way"]

In [64]:
# Print each segment converted back to text to inspect where the boundaries fall.
for seg in sents_to_segments(doc):
    print_segment(seg)

i'm good.it not only a time.
asdasd asko koa asd
kaok asdkoas asdkoask
d asdkasodka asdko
not my way


In [65]:
# Show the raw token lists produced for the same document.
sents_to_segments(doc)

[['i', "'m", 'Ġgood', '.', 'it', 'Ġnot', 'Ġonly', 'Ġa', 'Ġtime', '.'],
 ['as', 'd', 'as', 'd', 'Ġask', 'o', 'Ġk', 'oa', 'Ġas', 'd'],
 ['ka', 'ok', 'Ġas', 'd', 'ko', 'as', 'Ġas', 'd', 'ko', 'ask'],
 ['d', 'Ġas', 'dk', 'as', 'odka', 'Ġas', 'd', 'ko'],
 ['not', 'Ġmy', 'Ġway']]