In [1]:
import glob
import os
import sys
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from transformers import AutoConfig, AutoTokenizer, AutoModelForQuestionAnswering
from typing import NamedTuple, Dict, List, Callable
from tqdm import tqdm

In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device:{device}')

# Per-GPU diagnostics (only meaningful on CUDA): device name plus the memory
# currently allocated / reserved by torch, converted from bytes to GiB.
if device.type == 'cuda':
    bytes_per_gib = 1024**3
    for i in range(torch.cuda.device_count()):
        print(f"{i}: {torch.cuda.get_device_name(i)}")
        allocated_gib = round(torch.cuda.memory_allocated(i) / bytes_per_gib, 1)
        print('Memory Allocated:\t', allocated_gib, 'GB')
        reserved_gib = round(torch.cuda.memory_reserved(i) / bytes_per_gib, 1)
        print('Memory Cached:\t\t', reserved_gib, 'GB')

Using device:cuda
0: NVIDIA GeForce GTX 1060 6GB
Memory Allocated:	 0.0 GB
Memory Cached:		 0.0 GB


In [3]:
# Notebook-wide configuration: tokenizer threading, pandas display limits,
# tqdm/pandas integration and the global random seed.
SEED = 31

# Set explicitly so the HF fast tokenizers do not use their own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Fully-qualified option names are used on purpose: abbreviated names only
# resolve through pattern matching and can stop working if pandas later adds
# an option with a similar name.
# NOTE(review): "mode.use_inf_as_na" appears to be deprecated in recent pandas
# releases -- confirm against the pinned pandas version before upgrading.
pandas_options = {
    "mode.use_inf_as_na": True,
    "display.max_info_columns": 9999,
    "display.max_columns": 9999,
    "display.max_rows": 9999,
    "display.max_colwidth": 9999,
}
for option_name, option_value in pandas_options.items():
    pd.set_option(option_name, option_value)

# Registers the tqdm-backed `progress_apply` family on pandas objects.
tqdm.pandas()

# Kept as the final expression so the cell still echoes the seed that was set.
pl.seed_everything(SEED)

Global seed set to 31


31

In [4]:
# Tokenizer of the local XLM-RoBERTa SQuAD2 checkpoint (the "sentencepiece"
# entry of `tokenizers` below). `input_keys` is "labels" followed by the
# input names this tokenizer reports for its model.
sp = AutoTokenizer.from_pretrained("pretrained/deepset/xlm-roberta-base-squad2")
input_keys = ["labels", *sp.model_input_names]
print(f"{sp!r}\ninput_keys={input_keys}")

PreTrainedTokenizerFast(name_or_path='pretrained/deepset/xlm-roberta-base-squad2', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})
input_keys=['labels', 'input_ids', 'attention_mask']


In [5]:
# Tokenizer of the local ELECTRA-small discriminator checkpoint (the
# "wordpiece" entry of `tokenizers` below). `input_keys` is "labels" followed
# by the input names this tokenizer reports for its model.
wp = AutoTokenizer.from_pretrained("pretrained/google/electra-small-discriminator")
input_keys = ["labels", *wp.model_input_names]
print(f"{wp!r}\ninput_keys={input_keys}")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
input_keys=['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [6]:
# Tokenizer of the local RoBERTa-base checkpoint (the "byte_pair_encoding"
# entry of `tokenizers` below). `input_keys` is "labels" followed by the
# input names this tokenizer reports for its model.
bpe = AutoTokenizer.from_pretrained("pretrained/roberta-base")
input_keys = ["labels", *bpe.model_input_names]
print(f"{bpe!r}\ninput_keys={input_keys}")

PreTrainedTokenizerFast(name_or_path='pretrained/roberta-base', vocab_size=50265, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})
input_keys=['labels', 'input_ids', 'attention_mask']


In [7]:
# Registry of the three tokenizers loaded above, keyed by the name printed
# in the comparison cells; insertion order fixes the order of that output.
tokenizers = dict(
    wordpiece=wp,
    sentencepiece=sp,
    byte_pair_encoding=bpe,
)

In [8]:
# Baseline: a question plus a short context, encoded by every tokenizer with
# a 10-token budget. The recorded output shows a single padded window each.
s1 = ["question question"]
s2 = ["one two three"]
encode_kwargs = dict(
    truncation="only_second",  # only the second sequence (s2) may be cut
    max_length=10,
    padding="max_length",
    stride=0,
    return_overflowing_tokens=True,
    return_token_type_ids=True,
    return_special_tokens_mask=True,
)
for name, tokenizer in tokenizers.items():
    print(f"========\n{name}")
    x = tokenizer(s1, s2, **encode_kwargs)
    print(x.keys())
    # One line of tokens per produced window.
    for input_ids in x["input_ids"]:
        print(tokenizer.convert_ids_to_tokens(input_ids))
    # Each auxiliary field is popped out of `x` and then shown.
    special_tokens_mask = x.pop("special_tokens_mask")
    print(f"special_tokens_mask={special_tokens_mask!r}")
    token_type_ids = x.pop("token_type_ids")
    print(f"token_type_ids={token_type_ids!r}")
    overflow_to_sample_mapping = x.pop("overflow_to_sample_mapping")
    print(f"{len(overflow_to_sample_mapping)} overflow_to_sample_mapping={overflow_to_sample_mapping!r}")

wordpiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['[CLS]', 'question', 'question', '[SEP]', 'one', 'two', 'three', '[SEP]', '[PAD]', '[PAD]']
special_tokens_mask=[[1, 0, 0, 1, 0, 0, 0, 1, 1, 1]]
token_type_ids=[[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]
1 overflow_to_sample_mapping=[0]
sentencepiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['<s>', '▁question', '▁question', '</s>', '</s>', '▁one', '▁two', '▁three', '</s>', '<pad>']
special_tokens_mask=[[1, 0, 0, 1, 1, 0, 0, 0, 1, 1]]
token_type_ids=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
1 overflow_to_sample_mapping=[0]
byte_pair_encoding
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['<s>', 'question', 'Ġquestion', '</s>', '</s>', 'one', 'Ġtwo', 'Ġthree', '</s>', '<pad>']
special_tokens_mask=[[1, 0, 0, 1, 1, 0, 0, 0, 1, 1]]
token_type_ids=[[

# Non-overlapping windows

In [9]:
# The context no longer fits the 10-token budget, so each tokenizer emits
# several windows; with stride=0 the recorded windows share no context tokens.
s1 = ["question question"]
s2 = ["one two three four five six"]
encode_kwargs = dict(
    truncation="only_second",  # only the second sequence (s2) may be cut
    max_length=10,
    padding="max_length",
    stride=0,
    return_overflowing_tokens=True,
    return_token_type_ids=True,
    return_special_tokens_mask=True,
)
for name, tokenizer in tokenizers.items():
    print(f"========\n{name}")
    x = tokenizer(s1, s2, **encode_kwargs)
    print(x.keys())
    # One line of tokens per produced window.
    for input_ids in x["input_ids"]:
        print(tokenizer.convert_ids_to_tokens(input_ids))
    # Each auxiliary field is popped out of `x` and then shown.
    special_tokens_mask = x.pop("special_tokens_mask")
    print(f"special_tokens_mask={special_tokens_mask!r}")
    token_type_ids = x.pop("token_type_ids")
    print(f"token_type_ids={token_type_ids!r}")
    overflow_to_sample_mapping = x.pop("overflow_to_sample_mapping")
    print(f"{len(overflow_to_sample_mapping)} overflow_to_sample_mapping={overflow_to_sample_mapping!r}")

wordpiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['[CLS]', 'question', 'question', '[SEP]', 'one', 'two', 'three', 'four', 'five', '[SEP]']
['[CLS]', 'question', 'question', '[SEP]', 'six', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
special_tokens_mask=[[1, 0, 0, 1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 1, 0, 1, 1, 1, 1, 1]]
token_type_ids=[[0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]]
2 overflow_to_sample_mapping=[0, 0]
sentencepiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['<s>', '▁question', '▁question', '</s>', '</s>', '▁one', '▁two', '▁three', '▁four', '</s>']
['<s>', '▁question', '▁question', '</s>', '</s>', '▁five', '▁six', '</s>', '<pad>', '<pad>']
special_tokens_mask=[[1, 0, 0, 1, 1, 0, 0, 0, 0, 1], [1, 0, 0, 1, 1, 0, 0, 1, 1, 1]]
token_type_ids=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
2 overflow

# Overlapping windows

In [10]:
# Same experiment with a longer context and stride=2: in the recorded output
# consecutive windows repeat the last two context tokens of the previous one.
s1 = ["question question"]
s2 = ["one two three four five six seven eight nine"]
encode_kwargs = dict(
    truncation="only_second",  # only the second sequence (s2) may be cut
    max_length=10,
    padding="max_length",
    stride=2,
    return_overflowing_tokens=True,
    return_token_type_ids=True,
    return_special_tokens_mask=True,
)
for name, tokenizer in tokenizers.items():
    print(f"========\n{name}")
    x = tokenizer(s1, s2, **encode_kwargs)
    print(x.keys())
    # One line of tokens per produced window.
    for input_ids in x["input_ids"]:
        print(tokenizer.convert_ids_to_tokens(input_ids))
    # Each auxiliary field is popped out of `x` and then shown.
    special_tokens_mask = x.pop("special_tokens_mask")
    print(f"special_tokens_mask={special_tokens_mask!r}")
    token_type_ids = x.pop("token_type_ids")
    print(f"token_type_ids={token_type_ids!r}")
    overflow_to_sample_mapping = x.pop("overflow_to_sample_mapping")
    print(f"{len(overflow_to_sample_mapping)} overflow_to_sample_mapping={overflow_to_sample_mapping!r}")

wordpiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['[CLS]', 'question', 'question', '[SEP]', 'one', 'two', 'three', 'four', 'five', '[SEP]']
['[CLS]', 'question', 'question', '[SEP]', 'four', 'five', 'six', 'seven', 'eight', '[SEP]']
['[CLS]', 'question', 'question', '[SEP]', 'seven', 'eight', 'nine', '[SEP]', '[PAD]', '[PAD]']
special_tokens_mask=[[1, 0, 0, 1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 1, 0, 0, 0, 0, 0, 1], [1, 0, 0, 1, 0, 0, 0, 1, 1, 1]]
token_type_ids=[[0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]
3 overflow_to_sample_mapping=[0, 0, 0]
sentencepiece
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'overflow_to_sample_mapping'])
['<s>', '▁question', '▁question', '</s>', '</s>', '▁one', '▁two', '▁three', '▁four', '</s>']
['<s>', '▁question', '▁question', '</s>', '</s>', '▁three', '▁four', '▁five', '▁six', '</s>']
['<s>', '▁que