Installs

In [None]:
pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/e9/ee/fedc3509145ad60fe5b418783f4a4c1b5462a4f0e8c7bbdbda52bdcda486/tokenizers-0.8.1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 9.0MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.8.1


In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 16.6MB/s eta 0:00:01[K     |▉                               | 20kB 6.1MB/s eta 0:00:01[K     |█▎                              | 30kB 7.1MB/s eta 0:00:01[K     |█▊                              | 40kB 8.0MB/s eta 0:00:01[K     |██▏                             | 51kB 6.4MB/s eta 0:00:01[K     |██▋                             | 61kB 6.9MB/s eta 0:00:01[K     |███                             | 71kB 7.3MB/s eta 0:00:01[K     |███▍                            | 81kB 8.0MB/s eta 0:00:01[K     |███▉                            | 92kB 8.3MB/s eta 0:00:01[K     |████▎                           | 102kB 8.7MB/s eta 0:00:01[K     |████▊                           | 112kB 8.7MB/s eta 0:00:01[K     |█████▏                          | 122kB 8.7M

Imports

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import BertConfig, BertForMaskedLM, BertModel
from transformers import Trainer, TrainingArguments
from tokenizers import BertWordPieceTokenizer

In [None]:
from _dataset import BERT16SDataset
from _collator import DataCollatorForBertWordPieceTokenizer

Mount Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the call prompts interactively for an
# authorization code before the mount becomes available.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Directory holding the vocabulary, data files and saved model.
# NOTE(review): this is relative to the working directory, while Drive is mounted
# at the absolute path /content/drive, so it only resolves while the working
# directory is /content.
drive_path = './drive/My Drive/Colab Notebooks/NLP/model'

Check Resources

In [None]:
# Check that we have a GPU: `!` runs nvidia-smi as a shell command and shows its report
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Check that PyTorch sees it: evaluates to True when torch can use a CUDA device
torch.cuda.is_available()

True

Prepare Model Config

In [None]:
# Vocabulary size handed to the model config.
# Chosen to parallel k=6 in classic k-mers (for this corpus).
vocab_size = 15_621

In [None]:
# Small BERT: 4 layers, 4 attention heads over a 256-wide hidden state
# (64 dimensions per head), sequences of up to 512 positions.
config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=512,
    num_hidden_layers=4,
    num_attention_heads=4,
    hidden_size=256,
    intermediate_size=1024,
)

Create BERT model

In [None]:
# Build a masked-LM BERT from the config above (weights are freshly initialised,
# nothing is loaded from disk here) and report its size in millions of parameters.
model = BertForMaskedLM(config)
print(f"BERT model has {model.num_parameters()/10**6}M parameters")

BERT model has 7.437829M parameters


Create Dataset

In [None]:
# Vocabulary file and parsed sequence table, both located under drive_path.
vocab_path, data_path = (
    os.path.join(drive_path, filename)
    for filename in ('vocab.txt', 'SILVA_parsed_V2.tsv')
)

In [None]:
# Project-local dataset built from the parsed sequence table and the vocabulary;
# block_size matches the 512-position limit used by the model config and tokenizer.
dataset = BERT16SDataset(
    data_path=data_path,
    vocab_path=vocab_path,
    block_size=512,
)

  if self.run_code(code, result):


Create Data Collator

In [None]:
# WordPiece tokenizer over the corpus vocabulary. Case is preserved and the
# Chinese-character handling is switched off.
bert_special_tokens = {
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
}
tokenizer = BertWordPieceTokenizer(
    vocab_path,
    handle_chinese_chars=False,
    lowercase=False,
    **bert_special_tokens,
)

# Every encoding is cut or padded to exactly this many tokens.
max_tokens = 512
tokenizer.enable_truncation(max_tokens)
tokenizer.enable_padding(length=max_tokens)

In [None]:
# Number of entries in the tokenizer vocabulary; compare against `vocab_size`
# passed to the model config.
len(tokenizer.get_vocab())

15621

In [None]:
# Project-local collator configured for masked-language-modelling with a
# masking probability of 0.15.
data_collator = DataCollatorForBertWordPieceTokenizer(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Smoke test: collate a single sample, run it through the model and display the
# shape of the first output.
smoke_batch = data_collator.collate_batch([dataset[0]])
smoke_output = model(smoke_batch['input_ids'])
smoke_output[0].shape

torch.Size([1, 512, 15621])

Config Training

In [None]:
# Optional: uncomment to continue from the weights stored in drive_path instead
# of the freshly initialised model.
#model = BertForMaskedLM.from_pretrained(drive_path)

In [None]:
# Checkpoints are written under drive_path/checkpoints.
checkpoint_dir = os.path.join(drive_path, 'checkpoints')

training_args = TrainingArguments(
    # optimisation
    learning_rate=5e-4,
    num_train_epochs=3,
    per_device_train_batch_size=40,
    # logging and checkpointing
    logging_steps=500,
    save_steps=10_000,
    save_total_limit=2,
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,
)

In [None]:
from typing import Callable, Dict, List, Optional, Tuple
import json
from transformers.optimization import get_constant_schedule_with_warmup

class CustomTrainer(Trainer):
    """Trainer that prints each log record as a JSON line and uses a
    constant-with-warmup learning-rate schedule."""

    def _log(self, logs: Dict[str, float], iterator: Optional = None) -> None:
        """Print ``logs`` as a single JSON line.

        ``logs`` is updated in place with an ``"epoch"`` entry when ``self.epoch``
        is set; the printed record additionally carries ``"step"`` taken from
        ``self.global_step``. ``iterator`` is accepted but not used.
        """
        current_epoch = self.epoch
        if current_epoch is not None:
            logs["epoch"] = current_epoch

        record = dict(logs)
        record["step"] = self.global_step
        print(json.dumps(record))

    def get_optimizers(self, num_training_steps: int) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
        """Return the parent's optimizer together with a constant-with-warmup scheduler.

        The scheduler returned by the parent implementation is discarded; the
        replacement uses ``self.args.warmup_steps`` warmup steps.
        """
        optimizer, _parent_scheduler = super().get_optimizers(num_training_steps)
        warmup_steps = self.args.warmup_steps
        constant_schedule = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
        return optimizer, constant_schedule

In [None]:
# Wire the model, the dataset and the MLM collator into the customised trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)



Train!

In [None]:
%%time
# Run training with the trainer built above; %%time reports how long the cell took.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10801.0, style=ProgressStyle(description_…

{"loss": 2.1806337041854857, "learning_rate": 0.0005, "epoch": 0.04629200999907416, "step": 500}
{"loss": 2.178268550634384, "learning_rate": 0.0005, "epoch": 0.09258401999814832, "step": 1000}
{"loss": 2.161776591539383, "learning_rate": 0.0005, "epoch": 0.13887602999722248, "step": 1500}
{"loss": 2.166535542011261, "learning_rate": 0.0005, "epoch": 0.18516803999629664, "step": 2000}
{"loss": 2.1483408017158507, "learning_rate": 0.0005, "epoch": 0.2314600499953708, "step": 2500}
{"loss": 2.1512983677387236, "learning_rate": 0.0005, "epoch": 0.27775205999444497, "step": 3000}
{"loss": 2.14949022769928, "learning_rate": 0.0005, "epoch": 0.32404406999351915, "step": 3500}
{"loss": 2.149521510362625, "learning_rate": 0.0005, "epoch": 0.37033607999259327, "step": 4000}
{"loss": 2.142380564212799, "learning_rate": 0.0005, "epoch": 0.41662808999166745, "step": 4500}
{"loss": 2.139502103805542, "learning_rate": 0.0005, "epoch": 0.4629200999907416, "step": 5000}
{"loss": 2.1218170828819276, "l

FileNotFoundError: ignored

In [None]:
# Write the trained model into drive_path, the directory the embedding-extraction
# section loads it from.
trainer.save_model(drive_path)

In [None]:
# Save the tokenizer's model files into drive_path; the cell output lists the
# paths that were written.
tokenizer.save_model(drive_path)

['./drive/My Drive/Colab Notebooks/NLP/model/vocab.txt']

Extract Embeddings

In [None]:
# Load the weights saved in drive_path into a plain BertModel (rather than
# BertForMaskedLM) for embedding extraction.
loaded_model = BertModel.from_pretrained(drive_path)

In [None]:
# Iterate the dataset in its stored order so row i of the extracted embeddings
# corresponds to dataset item i.
batch_size = 32
in_order_sampler = torch.utils.data.SequentialSampler(dataset)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=in_order_sampler,
    num_workers=0,
    pin_memory=True,
)

In [None]:
from tqdm import tqdm

In [None]:
%%time 

# Mean-pool the output of the model's embedding layer over all positions of each
# sequence (so any padding positions are averaged in as well).
#
# Feature extraction needs no gradients, so the forward pass runs under
# torch.no_grad(). Per-batch results are collected in a list and concatenated
# once at the end; concatenating onto a growing tensor inside the loop re-copies
# everything accumulated so far on every iteration.
model_device = next(loaded_model.parameters()).device
pooled_batches = []
first_batch = True

with torch.no_grad():
    for batch in tqdm(dataloader, position=0, leave=True):
        # Inputs must live on the same device as the model weights.
        model_outputs = loaded_model.embeddings(batch.to(model_device)).mean(dim=1)
        if first_batch:
            # One-off shape check: (batch, tokens) in, (batch, hidden) out.
            assert batch.shape == torch.Size([batch_size, 512])
            assert model_outputs.shape == torch.Size([batch_size, 256])
            first_batch = False

        pooled_batches.append(model_outputs.cpu())

if pooled_batches:
    averaged_embeddings = torch.cat(pooled_batches, 0)
else:
    # Nothing was iterated: keep the empty float tensor the loop used to start from.
    averaged_embeddings = torch.tensor([], dtype=torch.float)

100%|██████████| 13502/13502 [40:43<00:00,  5.53it/s]

CPU times: user 39min 38s, sys: 1min 6s, total: 40min 44s
Wall time: 40min 43s





In [None]:
# Persist the mean-pooled embeddings next to the model files.
averaged_embeddings_path = os.path.join(drive_path, 'averaged_embeddings')
torch.save(averaged_embeddings, averaged_embeddings_path)

In [None]:
# One embedding row is expected per dataset item.
assert len(dataset) == averaged_embeddings.size(0)

Extract Weighted Embedding Without Token Padding

In [None]:
# Load the per-token weights table and build a token -> weight lookup from its
# 'token' and 'weight2' columns.
token_weights_file = os.path.join(drive_path, 'bpe_token_weights.tsv')
token_weights_df = pd.read_csv(token_weights_file, sep='\t')
weight2_by_token = token_weights_df.set_index('token')['weight2']
token_weights = weight2_by_token.to_dict()

In [None]:
# Second tokenizer over the same vocabulary and special tokens; no truncation or
# padding is enabled on this instance.
bert_special_tokens = {
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
}
unpadded_tokenizer = BertWordPieceTokenizer(
    vocab_path,
    handle_chinese_chars=False,
    lowercase=False,
    **bert_special_tokens,
)

In [None]:
# For each of the first 50000 samples: scale every real token's embedding-layer
# vector by that token's weight and take the mean over those tokens.
#
# Changes from the previous version of this cell:
#   * runs under torch.no_grad(), since no gradients are needed here;
#   * collects rows in a list and concatenates once, instead of re-copying a
#     growing tensor on every iteration;
#   * builds the weights as a (1, n_tokens, 1) tensor in the embedding's dtype and
#     broadcasts it over the hidden axis. The earlier float64 tensor, built from a
#     list of NumPy arrays with a hard-coded width of 256, promoted the product
#     to float64.
weighted_rows = []

with torch.no_grad():
    for sample in tqdm(dataset.samples[:50000], position=0, leave=True):

        sample_splitted = dataset._split_sequence_by_max_word_length(sample)
        tokens = dataset.tokenizer.encode(sample_splitted)
        token_ids = torch.tensor(tokens.ids, dtype=torch.long).expand(1, -1)
        embedding = loaded_model.embeddings(token_ids)

        # Re-encode without special tokens to learn how many real tokens there
        # are and which weight belongs to each.
        tokens_unpadded = unpadded_tokenizer.encode(sample_splitted, add_special_tokens=False)
        original_len = len(tokens_unpadded.ids)

        # Raw weights: dividing by their sum is deliberately not applied, so the
        # result is a plain mean of the weight-scaled vectors.
        weights_tensor = torch.tensor(
            [token_weights[k] for k in tokens_unpadded.tokens],
            dtype=embedding.dtype,
        ).view(1, -1, 1)

        # Positions 1..original_len: skips the leading token at index 0.
        # NOTE(review): assumes index 0 is [CLS] and that the dataset tokenizer
        # did not truncate the sample below original_len tokens - confirm.
        averaged_embedding = torch.mul(embedding[:, 1:original_len + 1, :], weights_tensor).mean(dim=1)
        weighted_rows.append(averaged_embedding.cpu())

if weighted_rows:
    averaged_embeddings = torch.cat(weighted_rows, 0)
else:
    # No samples: keep the empty float tensor the loop used to start from.
    averaged_embeddings = torch.tensor([], dtype=torch.float)

100%|██████████| 50000/50000 [21:04<00:00, 39.54it/s]


In [None]:
# Persist the weighted, padding-free embeddings next to the model files.
weighted_embeddings_path = os.path.join(drive_path, 'averaged_embeddings_no_padding__weight2')
torch.save(averaged_embeddings, weighted_embeddings_path)

In [None]:
# Only the first 50000 entries of dataset.samples are embedded in this section,
# so the row count is checked against that slice rather than len(dataset)
# (which made this assertion fail).
assert averaged_embeddings.shape[0] == len(dataset.samples[:50000])

AssertionError: ignored