In [10]:
import time
import pandas as pd
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn

from _classifier import BertClassifier, BERT16SDatasetForPhylaClassification, GeneratePhylumLabels, TrainTestSplit

### Add Phylum Labels to Dataset

In [2]:
# Read the parsed SILVA table and write a copy to a new TSV file. The labelling
# logic itself lives in _classifier.GeneratePhylumLabels and is not visible here;
# presumably the saved file carries an added phylum-label column -- TODO confirm.
label_generator = GeneratePhylumLabels(data_path='SILVA_parsed_V2.tsv')
# The output file is the input of the train-test split cell below.
label_generator.save('SILVA_parsed_V2__labeled.tsv')

  if (await self.run_code(code, result,  async_=asy)):


### Train-Test Split 

In [6]:
# Split the labelled table into two frames. The split ratio/strategy is decided
# inside _classifier.TrainTestSplit and is not visible in this notebook.
train_df, test_df = TrainTestSplit('SILVA_parsed_V2__labeled.tsv').train_test_split()

# Persist both splits as tab-separated files for the dataset cell below.
# NOTE(review): assuming these are pandas DataFrames, to_csv also writes the
# index as a leading column by default -- confirm the dataset loader expects it.
train_df.to_csv('SILVA_parsed_V2__labeled__train.tsv', sep='\t')
test_df.to_csv('SILVA_parsed_V2__labeled__test.tsv', sep='\t')

  if (await self.run_code(code, result,  async_=asy)):


### Create Dataset 

In [7]:
# Build the train and test datasets from their TSV splits; both use the same
# vocabulary file. The generator is consumed in order by the tuple unpacking,
# so the training set is constructed before the test set.
trainset, testset = (
    BERT16SDatasetForPhylaClassification(vocab_path='vocab.txt', data_path=split_path)
    for split_path in (
        'SILVA_parsed_V2__labeled__train.tsv',
        'SILVA_parsed_V2__labeled__test.tsv',
    )
)

I0811 08:34:53.569885 4732683712 _dataset.py:27] Loading BERT tokenizer using vocab file vocab.txt
I0811 08:34:53.591789 4732683712 _dataset.py:35] Loading 16S dataset file at SILVA_parsed_V2__labeled__train.tsv...
  exec(code_obj, self.user_global_ns, self.user_ns)
I0811 08:34:58.341029 4732683712 _dataset.py:37] 16S corpus is of shape (345626, 16)
I0811 08:34:58.358277 4732683712 _dataset.py:27] Loading BERT tokenizer using vocab file vocab.txt
I0811 08:34:58.376846 4732683712 _dataset.py:35] Loading 16S dataset file at SILVA_parsed_V2__labeled__test.tsv...
I0811 08:34:59.927132 4732683712 _dataset.py:37] 16S corpus is of shape (86407, 16)


### Define Model 

In [6]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected so the choice is visible in the cell output.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [8]:
def initialize_model(epochs, steps_per_epoch=None):
    """Create the BERT classifier, its optimizer and the learning-rate scheduler.

    Args:
        epochs: Number of training epochs the schedule should span.
        steps_per_epoch: Number of optimizer steps (i.e. batches) per epoch.
            When omitted, ``len(train_dataloader)`` is used if a module-level
            ``train_dataloader`` exists; otherwise it falls back to
            ``len(trainset)``, which is only correct for a batch size of 1.

    Returns:
        A ``(bert_classifier, optimizer, scheduler)`` tuple.
    """
    # Instantiate the classifier with the encoder left trainable (freeze_bert=False).
    bert_classifier = BertClassifier(path='', freeze_bert=False)

    # Move the model to the device selected earlier in the notebook (GPU or CPU).
    bert_classifier.to(device)

    # Create the optimizer over all model parameters.
    optimizer = AdamW(
        bert_classifier.parameters(),
        lr=5e-5,
        eps=1e-8
    )

    # The training loop calls scheduler.step() once per batch, so the schedule
    # length must be counted in batches, not in samples. Counting samples would
    # stretch the linear decay by a factor of the batch size.
    if steps_per_epoch is None:
        loader = globals().get("train_dataloader")
        steps_per_epoch = len(loader) if loader is not None else len(trainset)
    total_steps = steps_per_epoch * epochs

    # Linear decay from the initial learning rate to 0 over total_steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps)

    return bert_classifier, optimizer, scheduler

In [9]:
# Loss shared by train() and evaluate(): cross-entropy between the logits
# returned by the model and the labels of the batch. Constructed with default
# arguments, so the loss of a batch is the mean over its samples.
# NOTE(review): assumes the dataset yields class-index labels -- confirm.
loss_fn = nn.CrossEntropyLoss()

### Define Train Loop 

In [11]:
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          optimizer=None, scheduler=None):
    """Run the training loop and print a progress table.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        train_dataloader: Sized iterable of batches; each batch is a pair of
            tensors ``(input_ids, labels)``.
        val_dataloader: Batches passed to ``evaluate`` after every epoch.
            Required when ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When true, evaluate on ``val_dataloader`` after each epoch.
        optimizer: Optimizer to step after each batch. When omitted, the
            module-level ``optimizer`` is used (original behaviour).
        scheduler: Learning-rate scheduler stepped after each batch. When
            omitted, the module-level ``scheduler`` is used (original behaviour).

    Raises:
        ValueError: If no optimizer/scheduler can be resolved, or if
            ``evaluation`` is true but ``val_dataloader`` is ``None``.
    """
    # Backward compatibility: earlier callers relied on notebook-level
    # `optimizer` / `scheduler` variables instead of passing them in.
    if optimizer is None:
        optimizer = globals().get("optimizer")
    if scheduler is None:
        scheduler = globals().get("scheduler")
    if optimizer is None or scheduler is None:
        raise ValueError(
            "train() needs an optimizer and a scheduler: pass them explicitly or "
            "define module-level `optimizer` and `scheduler` first.")

    # Fail fast instead of crashing inside evaluate() after a full epoch of training.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader.")

    last_step = len(train_dataloader) - 1

    for epoch_i in range(epochs):
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch and of each reporting window
        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss spans the epoch; batch_loss/batch_counts span one reporting window
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into training mode (evaluate() switches it to eval mode)
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()
            logits = model(b_input_ids)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Back-propagation
            loss.backward()
            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the parameters, then advance the learning-rate schedule
            optimizer.step()
            scheduler.step()

            # Report every 20 steps and on the last batch of the epoch
            if (step % 20 == 0 and step != 0) or (step == last_step):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Mean of the per-batch losses over the whole epoch
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")


In [12]:
def evaluate(model, val_dataloader):
    """Compute the mean loss and accuracy of ``model`` over ``val_dataloader``.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits with
            the classes along dimension 1.
        val_dataloader: Iterable of batches; each batch is a pair of tensors
            ``(input_ids, labels)``.

    Returns:
        ``(val_loss, val_accuracy)`` as Python floats, with the accuracy in
        percent. Both are averaged per sample, so a smaller final batch does
        not skew the result. Both are ``nan`` when the loader yields nothing.

    Note:
        The model is left in eval mode; ``train`` switches it back at the
        start of each epoch.
    """
    model.eval()

    loss_sum = 0.0      # sum of per-sample losses
    num_correct = 0     # correctly classified samples
    num_samples = 0     # samples seen so far

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            logits = model(b_input_ids)
            loss = loss_fn(logits, b_labels)

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1).flatten()
            hits = preds == b_labels
            batch_size = hits.numel()

            # loss_fn returns the batch mean (default reduction of the
            # notebook's CrossEntropyLoss), so weight it by the batch size.
            loss_sum += loss.item() * batch_size
            num_correct += hits.sum().item()
            num_samples += batch_size

    if num_samples == 0:
        return float("nan"), float("nan")

    val_loss = loss_sum / num_samples
    val_accuracy = 100.0 * num_correct / num_samples

    return val_loss, val_accuracy

### Train! 

In [None]:
# Training configuration for this run.
EPOCHS = 2
# NOTE(review): the batch size is an assumption -- no DataLoader was defined
# anywhere in the notebook; adjust to the available memory.
BATCH_SIZE = 32

# Wrap the datasets in loaders so the run works from a fresh kernel.
# Assumes each dataset item is an (input_ids, label) pair of equally sized
# tensors, which is what train()/evaluate() unpack from a batch -- TODO confirm.
train_dataloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE)

# optimizer and scheduler are bound at notebook level on purpose.
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)
train(bert_classifier, train_dataloader, val_dataloader, epochs=EPOCHS, evaluation=True)