In [1]:
import time
import numpy as np
import pandas as pd
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn
from torch.utils.data import dataloader

from _classifier import BertClassifier, BERT16SKmerDatasetForPhylaClassification, GeneratePhylumLabels, TrainTestSplit

### Add Phylum Lables to Dataset 

In [2]:
label_generator = GeneratePhylumLabels(data_path='SILVA_parsed_V2.tsv')
label_generator.save('SILVA_parsed_V2__labeled.tsv')
num_classes = label_generator.num_classes

  if (await self.run_code(code, result,  async_=asy)):


In [3]:
label_generator.other_label

array([41])

In [3]:
num_classes

42

### Train-Test Split 

In [5]:
train_df, test_df = TrainTestSplit('SILVA_parsed_V2__labeled.tsv').train_test_split()

train_df.to_csv('SILVA_parsed_V2__labeled__train.tsv', sep='\t')
test_df.to_csv('SILVA_parsed_V2__labeled__test.tsv', sep='\t')

  if (await self.run_code(code, result,  async_=asy)):


### Create Dataset 

In [4]:
trainset = BERT16SKmerDatasetForPhylaClassification(
    vocab_path='kmer_model/kmer_vocab.txt', 
    data_path='SILVA_parsed_V2__labeled__train.tsv')

testset = BERT16SKmerDatasetForPhylaClassification(
    vocab_path='kmer_model/kmer_vocab.txt', 
    data_path='SILVA_parsed_V2__labeled__test.tsv')

I0814 11:11:27.015621 4598826432 _kmers.py:66] Loading K-mer tokenizer using vocab file kmer_model/kmer_vocab.txt
I0814 11:11:27.023921 4598826432 _kmers.py:70] Loading 16S dataset file at SILVA_parsed_V2__labeled__train.tsv...
  exec(code_obj, self.user_global_ns, self.user_ns)
I0814 11:11:31.415036 4598826432 _kmers.py:72] 16S corpus is of shape (345626, 16)
I0814 11:11:31.427878 4598826432 _kmers.py:66] Loading K-mer tokenizer using vocab file kmer_model/kmer_vocab.txt
I0814 11:11:31.436480 4598826432 _kmers.py:70] Loading 16S dataset file at SILVA_parsed_V2__labeled__test.tsv...
I0814 11:11:32.555785 4598826432 _kmers.py:72] 16S corpus is of shape (86407, 16)


In [5]:
batch_size = 32
num_workers = 4

In [6]:
train_loader = dataloader.DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    num_workers=num_workers
)

test_loader = dataloader.DataLoader(
    dataset=testset,
    batch_size=batch_size,
    num_workers=num_workers
)

### Define Model 

In [7]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [9]:
def initialize_model(epochs):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(path='kmer_model/', num_classes=num_classes, freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(
        bert_classifier.parameters(),
        lr=5e-5,    # Default learning rate
        eps=1e-8    # Default epsilon value
    )

    # Total number of training steps
    total_steps = len(trainset) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0, # Default value
        num_training_steps=total_steps)
    
    return bert_classifier, optimizer, scheduler

In [10]:
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

### Define Train Loop 

In [11]:
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """
    Train loop.
    """
    for epoch_i in range(epochs):
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^15} | {'LR':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*90)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()
        num_steps = len(train_dataloader)
        for step, batch in enumerate(train_dataloader):
              
            batch_counts += 1
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model(b_input_ids)

            loss = loss_fn(logits, b_labels.view(-1,))
            batch_loss += loss.item()
            total_loss += loss.item()

            # back-propagation
            loss.backward()
            # clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 50 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7}/{num_steps:^7} | {np.round(scheduler.get_lr()[-1], 7):^7}| {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()           

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^15} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*90)
        print("\n")


In [12]:
def evaluate(model, val_dataloader):
    """
    Evaluate model performance.
    """
    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids)

        loss = loss_fn(logits, b_labels.view(-1,))
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels.view(-1,)).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

### Train! 

In [None]:
%%time
bert_classifier, optimizer, scheduler = initialize_model(epochs=5)
train(bert_classifier, train_loader, test_loader, epochs=5, evaluation=True)

I0814 11:11:57.569875 4598826432 configuration_utils.py:263] loading configuration file kmer_model/config.json
I0814 11:11:57.570791 4598826432 configuration_utils.py:301] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 14989
}

I0814 11:11:57.572025 4598826432 modeling_utils.py:648] loading weights file kmer_model/pytorch_model.bin


 Epoch  |      Batch      |   LR    |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------------------------------------




   1    |   50   / 10801  |  5e-05 |   3.550413   |     -      |     -     |   9.74   
   1    |   100  / 10801  |  5e-05 |   2.915901   |     -      |     -     |   9.33   
   1    |   150  / 10801  |  5e-05 |   2.241002   |     -      |     -     |   9.34   
   1    |   200  / 10801  |  5e-05 |   2.107441   |     -      |     -     |   9.51   
   1    |   250  / 10801  |  5e-05 |   2.167109   |     -      |     -     |   9.86   
   1    |   300  / 10801  |  5e-05 |   2.142150   |     -      |     -     |   9.76   
   1    |   350  / 10801  |  5e-05 |   2.106351   |     -      |     -     |   9.89   
   1    |   400  / 10801  |  5e-05 |   2.044989   |     -      |     -     |   10.08  
   1    |   450  / 10801  |  5e-05 |   2.040657   |     -      |     -     |   10.07  
   1    |   500  / 10801  |  5e-05 |   2.053548   |     -      |     -     |   9.65   
   1    |   550  / 10801  |  5e-05 |   2.071611   |     -      |     -     |   9.32   
   1    |   600  / 10801  |  5e-05 |   1.99