Installs

In [None]:
pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/e9/ee/fedc3509145ad60fe5b418783f4a4c1b5462a4f0e8c7bbdbda52bdcda486/tokenizers-0.8.1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |                                | 10kB 19.3MB/s eta 0:00:01[K     |▏                               | 20kB 6.4MB/s eta 0:00:01[K     |▎                               | 30kB 7.4MB/s eta 0:00:01[K     |▍                               | 40kB 8.6MB/s eta 0:00:01[K     |▌                               | 51kB 7.1MB/s eta 0:00:01[K     |▋                               | 61kB 7.5MB/s eta 0:00:01[K     |▊                               | 71kB 8.0MB/s eta 0:00:01[K     |▉                               | 81kB 8.9MB/s eta 0:00:01[K     |█                               | 92kB 9.2MB/s eta 0:00:01[K     |█                               | 102kB 9.6MB/s eta 0:00:01[K     |█▏                              | 112kB 9.6MB/s eta 0:00:01[K     |█▎                              

In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 22.4MB/s eta 0:00:01[K     |▉                               | 20kB 6.1MB/s eta 0:00:01[K     |█▎                              | 30kB 7.2MB/s eta 0:00:01[K     |█▊                              | 40kB 8.1MB/s eta 0:00:01[K     |██▏                             | 51kB 6.5MB/s eta 0:00:01[K     |██▋                             | 61kB 6.9MB/s eta 0:00:01[K     |███                             | 71kB 7.9MB/s eta 0:00:01[K     |███▍                            | 81kB 7.7MB/s eta 0:00:01[K     |███▉                            | 92kB 7.7MB/s eta 0:00:01[K     |████▎                           | 102kB 8.0MB/s eta 0:00:01[K     |████▊                           | 112kB 8.0MB/s eta 0:00:01[K     |█████▏                          | 122kB 8.0M

Imports

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import BertConfig, BertForMaskedLM, BertModel
from transformers import Trainer, TrainingArguments
from tokenizers import BertWordPieceTokenizer

In [None]:
from _kmers import BERT16SKmerDataset, KmerTokenizer
from _collator import DataCollatorForBertWordPieceTokenizer

Mount Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Absolute path under the Drive mount point, so it keeps working even if the
# kernel's current working directory is not /content.
drive_path = '/content/drive/My Drive/Colab Notebooks/NLP/kmer_model'

Check Resources

In [None]:
# Check that a GPU is attached to this runtime (runs the `nvidia-smi` shell command)
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Check that PyTorch sees the GPU; the cell displays the returned bool
torch.cuda.is_available()

True

Prepare Model Config

In [None]:
# Passed to BERT16SKmerDataset as `k` when the dataset is built
k = 6
# Passed to BertConfig; matches the length reported by tokenizer.get_vocab() further down
vocab_size = 14_989

In [None]:
# Hyper-parameters for a small BERT: 4 layers, 4 attention heads,
# 256-wide hidden states, and up to 512 positions.
bert_hyperparams = dict(
    vocab_size=vocab_size,
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=4,
    num_attention_heads=4,
    max_position_embeddings=512,
)
config = BertConfig(**bert_hyperparams)

Create BERT model

In [None]:
# Build the masked-LM model from `config` alone (no checkpoint is loaded in this cell)
model = BertForMaskedLM(config=config)

# Report the parameter count in millions
print(f"BERT model has {model.num_parameters()/10**6}M parameters")

BERT model has 7.275405M parameters


Create Dataset

In [None]:
# Both input files live directly under `drive_path`
vocab_path, data_path = (
    os.path.join(drive_path, filename)
    for filename in ('kmer_vocab.txt', 'SILVA_parsed_V2.tsv')
)

In [None]:
# Build the training dataset from the vocabulary file and the SILVA TSV at `data_path`.
# block_size=512 is the same value as max_position_embeddings in the model config
# and the padding length set on the tokenizer.
dataset = BERT16SKmerDataset(
    vocab_path=vocab_path,
    data_path=data_path,
    block_size=512,
    k=k
)

  if self.run_code(code, result):


Create Data Collator

In [None]:
# Tokenizer over the same vocabulary file, with padding enabled at length 512
tokenizer = KmerTokenizer(vocab_path)
tokenizer.enable_padding(length=512)

In [None]:
# Sanity check: should equal the `vocab_size` (14989) given to the model config
len(tokenizer.get_vocab())

14989

In [None]:
# Collator configured for masked-language-modelling with mlm_probability=0.15
data_collator = DataCollatorForBertWordPieceTokenizer(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Smoke test: collate a single example, run it through the model,
# and display the shape of element 0 of the model output.
smoke_batch = data_collator.collate_batch([dataset[0]])
smoke_output = model(smoke_batch['input_ids'])
smoke_output[0].shape

torch.Size([1, 512, 14989])

Config Training

In [None]:
checkpoint_dir = os.path.join(drive_path, 'checkpoints')

training_args = TrainingArguments(
    # Checkpointing
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=40,
    learning_rate=5e-4,
    # Reporting
    logging_steps=500,
)

In [None]:
from typing import Callable, Dict, List, Optional, Tuple
import json
from transformers.optimization import get_constant_schedule_with_warmup

class CustomTrainer(Trainer):
    """Trainer variant that prints logs as JSON lines and swaps in a constant-with-warmup LR schedule."""

    def _log(self, logs: Dict[str, float], iterator: Optional = None) -> None:
        """Print `logs` as a single JSON object.

        Adds the current epoch to `logs` in place when `self.epoch` is set, and
        appends the global step to the printed record. `iterator` is accepted
        but not used.
        """
        if self.epoch is not None:
            logs["epoch"] = self.epoch

        record = {**logs, "step": self.global_step}
        print(json.dumps(record))

    def get_optimizers(self, num_training_steps: int) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
        """Return the parent's optimizer paired with a constant-with-warmup scheduler.

        The scheduler produced by the parent implementation is discarded; the
        warmup length is read from `self.args.warmup_steps`.
        """
        optimizer, _parent_scheduler = super().get_optimizers(num_training_steps)
        constant_scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args.warmup_steps
        )
        return optimizer, constant_scheduler

In [None]:
# Wire the model, training arguments, collator, and dataset into the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

Tesla T4 with CUDA capability sm_75 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the Tesla T4 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



Train!

In [None]:
%%time
# Run the training loop; CustomTrainer._log prints each log record as a JSON line
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10801.0, style=ProgressStyle(description_…

{"loss": 6.5577001962661745, "learning_rate": 0.0005, "epoch": 0.04629200999907416, "step": 500}
{"loss": 4.894051997184754, "learning_rate": 0.0005, "epoch": 0.09258401999814832, "step": 1000}
{"loss": 4.049568943500518, "learning_rate": 0.0005, "epoch": 0.13887602999722248, "step": 1500}
{"loss": 3.6060651655197145, "learning_rate": 0.0005, "epoch": 0.18516803999629664, "step": 2000}
{"loss": 3.2885485310554503, "learning_rate": 0.0005, "epoch": 0.2314600499953708, "step": 2500}
{"loss": 3.047599423408508, "learning_rate": 0.0005, "epoch": 0.27775205999444497, "step": 3000}
{"loss": 2.8713842153549196, "learning_rate": 0.0005, "epoch": 0.32404406999351915, "step": 3500}
{"loss": 2.7416285972595213, "learning_rate": 0.0005, "epoch": 0.37033607999259327, "step": 4000}
{"loss": 2.6016853723526, "learning_rate": 0.0005, "epoch": 0.41662808999166745, "step": 4500}
{"loss": 2.529984164237976, "learning_rate": 0.0005, "epoch": 0.4629200999907416, "step": 5000}
{"loss": 2.4390352659225463, "



{"loss": 2.0360802092552186, "learning_rate": 0.0005, "epoch": 0.9721322099805574, "step": 10500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10801.0, style=ProgressStyle(description_…

{"loss": 2.0271493604183197, "learning_rate": 0.0005, "epoch": 1.0184242199796316, "step": 11000}
{"loss": 2.0041087489128113, "learning_rate": 0.0005, "epoch": 1.0647162299787056, "step": 11500}
{"loss": 1.9763193168640136, "learning_rate": 0.0005, "epoch": 1.1110082399777799, "step": 12000}
{"loss": 1.9649719727039336, "learning_rate": 0.0005, "epoch": 1.157300249976854, "step": 12500}
{"loss": 1.9429760653972625, "learning_rate": 0.0005, "epoch": 1.2035922599759281, "step": 13000}
{"loss": 1.9328555047512055, "learning_rate": 0.0005, "epoch": 1.2498842699750023, "step": 13500}
{"loss": 1.901951913833618, "learning_rate": 0.0005, "epoch": 1.2961762799740764, "step": 14000}
{"loss": 1.8865125699043275, "learning_rate": 0.0005, "epoch": 1.3424682899731506, "step": 14500}
{"loss": 1.8715765659809112, "learning_rate": 0.0005, "epoch": 1.3887602999722248, "step": 15000}
{"loss": 1.8618672909736633, "learning_rate": 0.0005, "epoch": 1.435052309971299, "step": 15500}
{"loss": 1.835839368104

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=10801.0, style=ProgressStyle(description_…

{"loss": 1.6892852051258087, "learning_rate": 0.0005, "epoch": 2.0368484399592632, "step": 22000}
{"loss": 1.6666832859516143, "learning_rate": 0.0005, "epoch": 2.083140449958337, "step": 22500}
{"loss": 1.6708894917964936, "learning_rate": 0.0005, "epoch": 2.1294324599574113, "step": 23000}
{"loss": 1.6473406405448914, "learning_rate": 0.0005, "epoch": 2.1757244699564855, "step": 23500}
{"loss": 1.6531950652599334, "learning_rate": 0.0005, "epoch": 2.2220164799555597, "step": 24000}
{"loss": 1.636396633386612, "learning_rate": 0.0005, "epoch": 2.268308489954634, "step": 24500}
{"loss": 1.62535635638237, "learning_rate": 0.0005, "epoch": 2.314600499953708, "step": 25000}
{"loss": 1.5994646651744842, "learning_rate": 0.0005, "epoch": 2.360892509952782, "step": 25500}
{"loss": 1.6134627084732056, "learning_rate": 0.0005, "epoch": 2.4071845199518562, "step": 26000}
{"loss": 1.5922247302532195, "learning_rate": 0.0005, "epoch": 2.4534765299509305, "step": 26500}
{"loss": 1.5921565408706666

TrainOutput(global_step=32403, training_loss=2.0962373362983175)

In [None]:
# Save the trained model directly into `drive_path`, which the cells below reload from
trainer.save_model(drive_path)

Extract Embeddings

In [None]:
# Reload the checkpoint saved above as a `BertModel` rather than the `BertForMaskedLM` used for training
loaded_model = BertModel.from_pretrained(drive_path)

In [None]:
batch_size = 32

# Sequential sampling walks the dataset in index order, so output rows line up with dataset items.
sequential_sampler = torch.utils.data.SequentialSampler(dataset)
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    sampler=sequential_sampler,
    num_workers=0,
    pin_memory=True,
)

In [None]:
from tqdm import tqdm

In [None]:
%%time 

averaged_embeddings = torch.tensor([], dtype=torch.float)
first_batch = True

for batch in tqdm(dataloader, position=0, leave=True):
    model_outputs = loaded_model.embeddings(batch).mean(dim=1)
    if first_batch:
      assert batch.shape == torch.Size([batch_size, 512])
      assert model_outputs.shape == torch.Size([batch_size, 256])
      first_batch = False

    averaged_embeddings = torch.cat((averaged_embeddings, model_outputs.detach().cpu()), 0)

100%|██████████| 13502/13502 [24:58<00:00,  9.01it/s]

CPU times: user 23min 57s, sys: 1min 5s, total: 25min 3s
Wall time: 24min 58s





In [None]:
# Persist the pooled embeddings next to the model files
averaged_embeddings_path = os.path.join(drive_path, 'averaged_embeddings')
torch.save(averaged_embeddings, averaged_embeddings_path)

In [None]:
# One pooled embedding row is expected per dataset item
assert averaged_embeddings.size(0) == len(dataset)

Extract Weighted Embedding Without Token Padding

In [None]:
# Tab-separated table of per-token weights; the `token` and `weight2` columns are used.
token_weights_path = os.path.join(drive_path, 'kmer_token_weights.tsv')
token_weights_df = pd.read_csv(token_weights_path, sep='\t')

# token -> weight lookup
weights_indexed_by_token = token_weights_df.set_index('token')
token_weights = weights_indexed_by_token['weight2'].to_dict()

In [None]:
# NOTE(review): repeats the `BertModel.from_pretrained(drive_path)` load from the previous section
loaded_model = BertModel.from_pretrained(drive_path)

In [None]:
# Second tokenizer on the same vocabulary, without the enable_padding() call applied to `tokenizer`
unpadded_tokenizer = KmerTokenizer(vocab_path)

In [None]:
# Weighted pooling over the unpadded positions of the first 50,000 samples.
# Rows are gathered in a list and concatenated once after the loop, instead of
# re-copying a growing tensor with torch.cat on every iteration.
embedding_rows = []

# Inference only: no autograd graph is needed.
with torch.no_grad():
    for sample in tqdm(dataset.samples[:50000], position=0, leave=True):

        sample_splitted = dataset.split_seq_into_kmers(sample)
        tokens = dataset.tokenizer.encode(sample_splitted)
        embedding = loaded_model.embeddings(torch.tensor(tokens, dtype=torch.long).expand(1, -1))

        # The unpadded encoding is only used to learn how many tokens the sample has.
        tokens_ids_unpadded = unpadded_tokenizer.encode(sample_splitted, add_special_tokens=False)
        original_len = len(tokens_ids_unpadded)

        # One weight per k-mer, shaped (1, n_tokens, 1) so it broadcasts over the hidden
        # dimension, and cast to the embedding dtype so the product is not promoted to float64.
        # Weights are used as-is (not normalised to sum to 1), so `.mean` below divides
        # the weighted sum by the token count.
        weights = np.array([token_weights[kmer] for kmer in sample_splitted])
        weights_tensor = torch.tensor(weights, dtype=embedding.dtype).view(1, -1, 1)

        # Skip position 0 and keep the next `original_len` positions.
        # assumes position 0 holds a special token added by `dataset.tokenizer` — TODO confirm
        averaged_embedding = torch.mul(embedding[:, 1:original_len + 1, :], weights_tensor).mean(dim=1)
        embedding_rows.append(averaged_embedding.detach().cpu())

# Fall back to an empty float tensor when no samples were processed.
averaged_embeddings = (
    torch.cat(embedding_rows, 0)
    if embedding_rows
    else torch.tensor([], dtype=torch.float)
)

100%|██████████| 50000/50000 [15:58<00:00, 52.14it/s]


In [None]:
# Persist the weighted, padding-free embeddings under their own file name
weighted_embeddings_path = os.path.join(drive_path, 'averaged_embeddings_no_padding__weight2')
torch.save(averaged_embeddings, weighted_embeddings_path)

In [None]:
# The loop above only embeds `dataset.samples[:50000]`, so compare against that
# subset's length rather than the full dataset.
assert averaged_embeddings.shape[0] == len(dataset.samples[:50000])