# 1. Create Synthetic data (V5)

In [1]:
import torch
import numpy as np
import pandas as pd

import torch.nn.functional as F
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical

In [2]:
def is_in(z, a=1, b=3):
    return (z >= a ) * (z <= b)

xdim = 100  # number of billing codes
N = 20_0000  # number of patients
T = 5  # total time steps

# model parameters
Psi = torch.arange(0, xdim * 0.01, 0.01)

# random variables
Z_true = torch.zeros((N, T)) # latent 
X = torch.zeros((N, T)) # observed
X_onehot = torch.zeros((N, T, xdim))
Y = torch.zeros((N,1))

for t in range(0,T):
    # Zit | Zi,t-1, Yi
    meanz = (0.9 * Z_true[:, t - 1]) if t != 0 else torch.zeros((N,))
    Zt = Normal(meanz, 1)
    Z_true[:, t] = Zt.sample()
    
    # Xit | Zit
    Psi_z = Z_true[:,t].view(N, 1) * Psi.view(1, xdim)
    PX = F.softmax(Psi_z, dim = 1)
    Xt = Categorical(PX)
    Xit = Xt.sample()
    X[:, t] = Xit
    X_onehot[:, t] = F.one_hot(Xit, num_classes = xdim)

for t in range(T-2):
    Y[:,0] += is_in(Z_true[:,t]) * is_in(Z_true[:,t+1]) * is_in(Z_true[:,t+2])

# Truncate the Y values which are greater than 1.
Y = torch.cat((Y, torch.ones((N,1))), 1).min(dim = 1).values


# Save data to txt file, separate with whitespace ' '.
X_ = X.cpu().data.numpy().astype(int)
Y_ = Y.cpu().data.numpy().astype(int)

np.savetxt('synthetic_X.txt', X_, delimiter=' ', fmt='%s')

# 2 Create customized Tokenizer.

This task is separated into three substeps:

*NOTE: Maybe I didn't do it completely correct in this version. Now it just can work.*

1. Create a word-level vocabulary based on `Tokenizer` library. See [Issue 232](https://github.com/huggingface/tokenizers/issues/232), [Issue 243](https://github.com/huggingface/tokenizers/issues/243#issuecomment-617860020) for codes.

2. Create our own `WordLevelTokenizer` based on `Tokenizer` library. 
See [whitespace/word level](https://github.com/huggingface/tokenizers/issues/244) for codes.

3. Create our own `WordLevelBertTokenizer` that can be used to train Transformers(Bert/Roberta) by wrapping the class from last step based on `Transformer` library. See [Why doesn't this library share the same tokenizer api as the transformers library?](https://github.com/huggingface/tokenizers/issues/259) for tutorial to do this.

## 2.1 Create a word-level vocabulary

In [3]:
# %%time 
import time
import os
from pathlib import Path

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE, WordLevel
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import BertProcessing

# We build our custom tokenizer:
tokenizer = Tokenizer(WordLevel()) 
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = WhitespaceSplit()

# We can train this tokenizer by giving it a list of path to text files:
trainer = trainers.BpeTrainer(special_tokens=
        ["[SEP]",
        "[PAD]",
        "[CLS]",
        "[UNK]",
        "[MASK]",
    ])


files = [str(x) for x in Path(".").glob("**/synthetic_X.txt")]
print(files)

tokenizer.train(trainer, files)

# Add post_processor.

tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")), # SEP 
    ("[CLS]", tokenizer.token_to_id("[CLS]")), # CLS
)

    
if not os.path.exists('Synthetic'):
    os.makedirs('Synthetic')
    
# Set truncation.
tokenizer.enable_truncation(max_length=128)

# And now it is ready, we can save the vocabulary with
tokenizer.model.save("./Synthetic")

# And simply use it
tokenizer.encode('30, 63, 48, 31, 31').tokens

['synthetic_X.txt']


['[CLS]', '6', '64', '9', '0', '48', '[SEP]']

In [24]:
X_[0,]

tokenizer.encode('30, 63, 48, 31, 31').tokens

['[CLS]', '30,', '63,', '48,', '31,', '31', '[SEP]']

## 2.2 Create WordLevelTokenizer

In [4]:
from typing import Optional, Union

import tokenizers
from tokenizers.models import WordLevel, TokenizedSequence, TokenizedSequenceWithOffsets
from tokenizers import Tokenizer, Encoding, AddedToken
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit
from tokenizers.implementations import BaseTokenizer 


class WordLevelTokenizer(BaseTokenizer):
    """ WordLevelBertTokenizer
    Represents a simple word level tokenization for BERT.
    """

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        
        
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None:
            tokenizer = Tokenizer(WordLevel(vocab_file, unk_token='[UNK]'))
        else:
            tokenizer = Tokenizer(WordLevel())

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.WhitespaceSplit()

        if vocab_file is not None:
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
                
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")


            tokenizer.post_processor = tokenizers.processors.BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
        }

        super().__init__(tokenizer, parameters)

## 2.3 Create WordLevelBertTokenizer

In [5]:
from transformers import PreTrainedTokenizerFast
from typing import List, Optional, Union

class WordLevelBertTokenizer(PreTrainedTokenizerFast):

    def __init__(
        self,
        tokenizer,
        bos_token="[CLS]",
        eos_token="[SEP]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        **kwargs
    ):
        super().__init__(
            tokenizer,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        
    # Copied from [BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html?highlight=get_special_tokens_mask#transformers.BertTokenizer.get_special_tokens_mask)
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
    
    # Copied from [BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html?highlight=get_special_tokens_mask#transformers.BertTokenizer.get_special_tokens_mask)
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep


## 2.4 Instantize Tokenizes we have. 

In [6]:
tokenizer = WordLevelTokenizer('./Synthetic/vocab.json')
BertTokenizer = WordLevelBertTokenizer(tokenizer = tokenizer, unk_token='[UNK]')

# 3 Instantize Bert, Dataset, Data_collator, Trainer.

In [34]:
# !pip install -U numpy
import torch
from transformers import BertForMaskedLM
import os

# Specify visible CUDA for the script, try to avoid encounder out of memory issue.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="7"

TRAIN_NEW_MODEL = True

if TRAIN_NEW_MODEL:
    
    from transformers import BertConfig
    
    config = BertConfig(
        vocab_size=len(BertTokenizer),
        max_position_embeddings=128,
        num_attention_heads=1,
        num_hidden_layers=1,
        type_vocab_size=1,
    )

else:
    
    # load a pre-trained model.
    from transformers import AutoConfig
    
    config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
    
model = BertForMaskedLM(config=config)

print(f'The Bert model contains {model.num_parameters()} parameters.')

The Bert model contains 8451945 parameters.


In [35]:
%%time
from transformers import LineByLineTextDataset

# Define a dataset.
# Each output is the encoded row.
dataset = LineByLineTextDataset(
    tokenizer=BertTokenizer,
    file_path="./synthetic_X.txt",
    block_size=128,
)

# Define a data-collator. 
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=BertTokenizer, mlm=True, mlm_probability=0.15,
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.


CPU times: user 5.51 s, sys: 848 ms, total: 6.36 s
Wall time: 2.79 s


In [36]:
# for idx, tmp_data in enumerate(dataset):
#     print(idx)
#     print(tmp_data)
#     break

In [37]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./Synthetic/",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [38]:
%%time
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=6250.0, style=ProgressStyle(description_w…



CPU times: user 1min 44s, sys: 1.85 s, total: 1min 45s
Wall time: 1min 45s


TrainOutput(global_step=6250, training_loss=4.335336205024719)

In [39]:
trainer.save_model("./Synthetic")

In [30]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./Synthetic",
    tokenizer=BertTokenizer
)



In [31]:
# We can see that the Bert can fill the mask at least.
fill_mask("6 64 0 [MASK] 99")

[{'sequence': '[CLS] 6 64 0 99 99 [SEP]',
  'score': 0.012895897962152958,
  'token': 15,
  'token_str': '99'},
 {'sequence': '[CLS] 6 64 0 0 99 [SEP]',
  'score': 0.012775693088769913,
  'token': 5,
  'token_str': '0'},
 {'sequence': '[CLS] 6 64 0 4 99 [SEP]',
  'score': 0.011902562342584133,
  'token': 9,
  'token_str': '4'},
 {'sequence': '[CLS] 6 64 0 1 99 [SEP]',
  'score': 0.011871114373207092,
  'token': 6,
  'token_str': '1'},
 {'sequence': '[CLS] 6 64 0 2 99 [SEP]',
  'score': 0.011860175058245659,
  'token': 7,
  'token_str': '2'}]