In [1]:
import os
import requests
import json

In [2]:
def read_squad(path):
    with open(path, "rb") as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict["data"]:
        for passage in group["paragraphs"]:
            context = passage["context"]
            for qa in passage["qas"]:
                question = qa["question"]
                if "plausible_answers" in qa.keys():
                    access = "plausible_answers"
                else:
                    access = "answers"
                for answer in qa["answers"]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

In [4]:
(train_contexts, train_questions, train_answers) = \
    read_squad('data/f21mp_test.json')
    
(val_contexts, val_questions, val_answers) = \
    read_squad('data/f21mp_train.json')

In [5]:
from tqdm.auto import tqdm

text_data = []
file_count = 0

for sample in tqdm(train_contexts):
    sample = sample.replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 10_000:
        # once we git the 10K mark, save to file
        with open(f'data/tokenizer/hosp_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# after saving in 10K chunks, we will have ~2082 leftover samples, we save those now too
with open(f'data/tokenizer/hosp_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))



100%|██████████| 18/18 [00:00<?, ?it/s]


In [6]:
from pathlib import Path

paths = [str(x) for x in Path('data/tokenizer/').glob('**/*.txt')]

paths[-5:]

['data\\tokenizer\\hosp_0.txt']

In [9]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer()

In [10]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# Customize training
tokenizer.train(files=paths, vocab_size=8192, min_frequency=2,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])

In [11]:
tokenizer

Tokenizer(vocabulary_size=579, model=ByteLevelBPE, add_prefix_space=False, lowercase=True, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [None]:
tokenizer.save_model(tokenizer_folder)

In [12]:
import os

#os.mkdir('Robertahospitality')

tokenizer.save_model('Robertahospitality')

['Robertahospitality\\vocab.json', 'Robertahospitality\\merges.txt']

In [15]:
# Create the tokenizer using vocab.json and mrege.txt files
tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join('Robertahospitality','vocab.json')),
    os.path.abspath(os.path.join('Robertahospitality','merges.txt'))
)

In [16]:
# Prepare the tokenizer
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [17]:
# Show the tokens created
tokenizer.encode("mundo").tokens

['<s>', 'm', 'undo', '</s>']

In [18]:
TRAIN_BATCH_SIZE = 16    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 8    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 15        # number of epochs to train (default: 10)
LEARNING_RATE = 1e-4    # learning rate (default: 0.001)
WEIGHT_DECAY = 0.01
SEED = 42               # random seed (default: 42)
MAX_LEN = 128
SUMMARY_LEN = 7

In [19]:
# Check that we have a GPU
!nvidia-smi

Thu Nov 25 09:51:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla M60          WDDM  | 00000001:00:00.0 Off |                  Off |
| N/A   39C    P8    17W / 150W |   7437MiB /  8192MiB |     13%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [21]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=8192,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [22]:
from transformers import RobertaForQuestionAnswering

model = RobertaForQuestionAnswering(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  49217282


In [23]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('Robertahospitality', max_len=MAX_LEN)

file Robertahospitality\config.json not found
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
file Robertahospitality\config.json not found
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [13]:

# test our tokenizer on a simple sentence
tokens = tokenizer('Mundo')

In [14]:
print(tokens)

{'input_ids': [0, 408, 83, 2], 'attention_mask': [1, 1, 1, 1]}


In [56]:
tokenizer.convert_ids_to_tokens(tokens['input_ids'])

['<s>', 'M', 'undo', '</s>']

In [None]:
TRAIN_BATCH_SIZE = 16    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 8    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 15        # number of epochs to train (default: 10)
LEARNING_RATE = 1e-4    # learning rate (default: 0.001)
WEIGHT_DECAY = 0.01
SEED = 42               # random seed (default: 42)
MAX_LEN = 128
SUMMARY_LEN = 7