<a href="https://colab.research.google.com/github/soutrik71/MInMaxBERT/blob/main/DistilBertClassification_GENAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torcheval



In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
import logging
import os
import random
from typing import List, Mapping, Dict
from transformers import AutoConfig, AutoModel
import torch.nn as nn
from torcheval.metrics import MulticlassAccuracy,BinaryAccuracy
logging.basicConfig(level=logging.DEBUG)

In [None]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

In [40]:
# Set manual seed since nn.Parameter are randomly initialzied
set_seed(42)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
MAX_LEN = 512
BATCH_SIZE = 25
EPOCHS = 10
LEARNING_RATE = 1e-05

Random seed set as 42
cuda


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [None]:
from transformers import AutoTokenizer
tokenizer_cp = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [None]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenizer_cp

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Basic Data Preprocessing

In [None]:
full_df = pd.read_csv("https://raw.githubusercontent.com/soutrik71/MInMaxBERT/main/data/classifier_data.csv")

In [None]:
full_df.head()

Unnamed: 0,question,label
0,What is the percentage change in volume for sa...,simple
1,what should beThe target average time in route...,simple
2,Which social media platform has the highest nu...,simple
3,What Are Tools And Equipment Used In Truck Dep...,simple
4,"What is the net revenue per HL in Aug 2023, Ju...",simple


In [None]:
full_df.shape

(123, 2)

In [None]:
full_df['label'].value_counts()

simple     108
complex     15
Name: label, dtype: int64

In [None]:
target_dict = {'simple':0, 'complex':1}

In [None]:
full_df['target'] = full_df['label'].map(target_dict)

In [None]:
full_df.head()

Unnamed: 0,question,label,target
0,What is the percentage change in volume for sa...,simple,0
1,what should beThe target average time in route...,simple,0
2,Which social media platform has the highest nu...,simple,0
3,What Are Tools And Equipment Used In Truck Dep...,simple,0
4,"What is the net revenue per HL in Aug 2023, Ju...",simple,0


In [None]:
# train validation split
train_df, val_df = train_test_split(full_df, test_size=0.2, random_state=42, stratify=full_df['target'])

In [None]:
train_df.shape, val_df.shape

((98, 3), (25, 3))

In [None]:
NUM_CLASSES = len(target_dict)
print(NUM_CLASSES)

2


## Custom Torch Dataset Class

In [None]:
idx = np.random.randint(0, len(train_df))
sample_text = train_df.iloc[idx]['question']
sample_label = train_df.iloc[idx]['target']
print(sample_text)
print(sample_label)

Outline of all Steps of the Huddle
0


In [None]:
outputs = tokenizer.encode_plus(
    text = sample_text,
    add_special_tokens=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="pt",
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=True
)

In [None]:
outputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
ids = outputs['input_ids']

In [None]:
ids.squeeze(0).shape

torch.Size([512])

In [None]:
masks = outputs['attention_mask']
print(masks.shape)

torch.Size([1, 512])


**Both input and masks will be of [1,n] shape and we are not squeezing out the 1 extra dim as it will be internally handled and if not then we have to do it manually**

In [41]:
class BertClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = 512,
        model_name: str = "distilbert-base-uncased",
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            self.label_dict = dict(zip(sorted(set(labels)), range(len(set(labels)))))

        self.tokenizer =  DistilBertTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """

        # encoding the text
        input_text = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict  = self.tokenizer.encode_plus(
              text = input_text,
              add_special_tokens=True,
              padding="max_length",
              max_length=self.max_seq_length,
              return_tensors="pt",
              truncation=True,
              return_attention_mask=True,
              return_token_type_ids=True
          )

        # dealing with attention masks - there's a 1 for each input token and
        # if the sequence is shorter that `max_seq_length` then the rest is
        # padded with zeroes. Attention mask will be passed to the model in
        # order to compute attention scores only with input data

        ids = output_dict['input_ids'].squeeze(0)
        mask = output_dict['attention_mask']

        # encoding target
        if self.labels is not None:
            y = self.labels[index]
            y_encoded = torch.Tensor([self.label_dict.get(y,)]).float().squeeze(0)


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': y_encoded
        }


In [42]:
train_dataset = BertClassificationDataset(
        texts=train_df["question"].values.tolist(),
        labels=train_df["label"].values,
        max_seq_length=MAX_LEN,
        model_name="distilbert-base-cased",
        label_dict=target_dict
)

In [43]:
valid_dataset = BertClassificationDataset(
        texts=val_df["question"].values.tolist(),
        labels=val_df["label"].values,
        max_seq_length=MAX_LEN,
        model_name="distilbert-base-cased",
        label_dict=target_dict
)

In [44]:
# data loader stuffs
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [45]:
for batch in train_loader:
    ids = batch['ids']
    mask = batch['mask']
    targets = batch['targets']
    print(ids.shape)
    print(mask.shape)
    print(targets.shape)
    break

torch.Size([25, 512])
torch.Size([25, 1, 512])
torch.Size([25])


## Model Building

### Baseline Models using Distil Bert Classifier

In [None]:
class BertForSequenceClassification_A(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.1):
        super(BertForSequenceClassification_A, self).__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)
        print(config.hidden_size)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config) # alternate DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):

        assert attention_mask is not None, "attention mask is none"

        bert_output = self.model(input_ids=features, attention_mask=attention_mask)
        # we only need the hidden state here and don't need transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        # mean pooling, i.e. getting average representation of all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)
        return scores

In [None]:
baseline_model1 = BertForSequenceClassification_A("distilbert-base-cased",1).to(device)
baseline_model1

768


BertForSequenceClassification_A(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Creating the loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  baseline_model1.parameters(), lr=LEARNING_RATE)
metric = BinaryAccuracy(threshold= 0.5 , device = device)

In [None]:
def train_module(model:torch.nn.Module,
                 device:torch.device,
                 train_dataloader:torch.utils.data.DataLoader ,
                 optimizer:torch.optim.Optimizer,
                 criterion:torch.nn.Module,
                 metric,
                 train_losses:list,
                 train_metrics:list):

  # setting model to train mode
  model.train()
  pbar = tqdm(train_dataloader)

  # batch metrics
  train_loss = 0
  train_metric = 0
  processed_batch = 0

  for _,data in enumerate(pbar):

    ids = data['ids'].to(device)
    mask = data['mask'].to(device)
    targets = data['targets'].to(device)

    outputs = model(ids, mask).squeeze()
    # print(targets)
    # print(outputs)

    # calc loss
    loss = criterion(outputs, targets)
    train_loss += loss.item()
    # print(f"training loss for batch {idx} is {loss}")

    # backpropagation
    optimizer.zero_grad() # flush out  existing grads
    loss.backward() # back prop of weights wrt loss
    optimizer.step() # optimizer step -> minima

    # print(f"preds:: {preds}")
    preds = outputs.clone()
    # metric calc
    metric.update(preds,targets)
    train_metric += metric.compute().detach().item()

    #updating batch count
    processed_batch += 1

    pbar.set_description(f"Avg Train Loss: {train_loss/processed_batch} Avg Train Metric: {train_metric/processed_batch}")

  # It's typically called after the epoch completes
  metric.reset()
  # updating epoch metrics
  train_losses.append(train_loss/processed_batch)
  train_metrics.append(train_metric/processed_batch)

  return train_losses, train_metrics


In [None]:
def test_module(model:torch.nn.Module,
                device:torch.device,
                test_dataloader:torch.utils.data.DataLoader,
                criterion:torch.nn.Module,
                metric,
                test_losses,
                test_metrics):
  # setting model to eval mode
  model.eval()
  pbar = tqdm(test_dataloader)

  # batch metrics
  test_loss = 0
  test_metric = 0
  processed_batch = 0

  with torch.inference_mode():
    for _, data in enumerate(pbar, 0):
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      targets = data['targets'].to(device)
      outputs = model(ids, mask).squeeze()
      # print(preds.shape)
      # print(label.shape)

     # calc loss
      loss = criterion(outputs, targets)
      test_loss += loss.item()

      # metric calc
      preds = outputs.clone()
      metric.update(outputs, targets)
      test_metric += metric.compute().detach().item()

      #updating batch count
      processed_batch += 1

      pbar.set_description(f"Avg Test Loss: {test_loss/processed_batch} Avg Test Metric: {test_metric/processed_batch}")

    # It's typically called after the epoch completes
    metric.reset()
    # updating epoch metrics
    test_losses.append(test_loss/processed_batch)
    test_metrics.append(test_metric/processed_batch)

  return test_losses, test_metrics

In [None]:
# Place holders----
train_losses = []
train_metrics = []
test_losses = []
test_metrics = []

for epoch in range(0,EPOCHS):
  print(f'Epoch {epoch}')
  train_losses, train_metrics = train_module(baseline_model1, device, train_loader, optimizer, criterion, metric, train_losses, train_metrics)
  test_losses , test_metrics = test_module(baseline_model1, device, valid_loader, criterion, metric, test_losses, test_metrics)

Epoch 0


Avg Train Loss: 0.48201113045215604 Avg Train Metric: 0.8902987480163574: 100%|██████████| 10/10 [00:06<00:00,  1.46it/s]
Avg Test Loss: 0.40513624747594196 Avg Test Metric: 0.8933333158493042: 100%|██████████| 3/3 [00:00<00:00,  4.59it/s]


Epoch 1


Avg Train Loss: 0.3562739208340645 Avg Train Metric: 0.8150685966014862: 100%|██████████| 10/10 [00:04<00:00,  2.24it/s]
Avg Test Loss: 0.3704611659049988 Avg Test Metric: 0.8933333158493042: 100%|██████████| 3/3 [00:00<00:00,  5.86it/s]


Epoch 2


Avg Train Loss: 0.3459030032157898 Avg Train Metric: 0.8905923902988434: 100%|██████████| 10/10 [00:04<00:00,  2.23it/s]
Avg Test Loss: 0.3354926109313965 Avg Test Metric: 0.8933333158493042: 100%|██████████| 3/3 [00:00<00:00,  5.65it/s]


Epoch 3


Avg Train Loss: 0.30963268876075745 Avg Train Metric: 0.8465924084186554: 100%|██████████| 10/10 [00:04<00:00,  2.22it/s]
Avg Test Loss: 0.27915628751118976 Avg Test Metric: 0.8933333158493042: 100%|██████████| 3/3 [00:00<00:00,  5.58it/s]


Epoch 4


Avg Train Loss: 0.2546938944607973 Avg Train Metric: 0.8810923933982849: 100%|██████████| 10/10 [00:04<00:00,  2.21it/s]
Avg Test Loss: 0.2021191567182541 Avg Test Metric: 0.8933333158493042: 100%|██████████| 3/3 [00:00<00:00,  5.66it/s]


Epoch 5


Avg Train Loss: 0.17250319831073285 Avg Train Metric: 0.8705765306949615: 100%|██████████| 10/10 [00:04<00:00,  2.21it/s]
Avg Test Loss: 0.12360640366872151 Avg Test Metric: 0.9066666563351949: 100%|██████████| 3/3 [00:00<00:00,  5.57it/s]


Epoch 6


Avg Train Loss: 0.10996524505317211 Avg Train Metric: 0.8638645112514496: 100%|██████████| 10/10 [00:04<00:00,  2.22it/s]
Avg Test Loss: 0.06665678198138873 Avg Test Metric: 0.9699999888737997: 100%|██████████| 3/3 [00:00<00:00,  4.86it/s]


Epoch 7


Avg Train Loss: 0.053915971983224155 Avg Train Metric: 0.9379982888698578: 100%|██████████| 10/10 [00:04<00:00,  2.15it/s]
Avg Test Loss: 0.035670303429166474 Avg Test Metric: 1.0: 100%|██████████| 3/3 [00:00<00:00,  5.64it/s]


Epoch 8


Avg Train Loss: 0.029951273696497083 Avg Train Metric: 0.9915232419967651: 100%|██████████| 10/10 [00:04<00:00,  2.21it/s]
Avg Test Loss: 0.032655637400845684 Avg Test Metric: 1.0: 100%|██████████| 3/3 [00:00<00:00,  5.68it/s]


Epoch 9


Avg Train Loss: 0.02756611634977162 Avg Train Metric: 0.9867131590843201: 100%|██████████| 10/10 [00:04<00:00,  2.18it/s]
Avg Test Loss: 0.03209601497898499 Avg Test Metric: 1.0: 100%|██████████| 3/3 [00:00<00:00,  5.03it/s]


## Extended Model

In [46]:
class BertForSequenceClassification_B(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.1):
        super(BertForSequenceClassification_B, self).__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)
        print(config.hidden_size)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config) # alternate DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.fc1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):

        assert attention_mask is not None, "attention mask is none"

        bert_output = self.model(input_ids=features, attention_mask=attention_mask)
        # we only need the hidden state here and don't need transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        pooler = seq_output[:, 0] # take out the first hideen state
        pooler = self.dropout(pooler)
        pooler = self.fc1(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        scores = self.classifier(pooler)

        return scores

In [47]:
baseline_model2 = BertForSequenceClassification_B("distilbert-base-cased",1).to(device)
baseline_model2

768


BertForSequenceClassification_B(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [48]:
# Creating the loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  baseline_model2.parameters(), lr=LEARNING_RATE)
metric = BinaryAccuracy(threshold= 0.5 , device = device)

In [49]:
# Place holders----
train_losses = []
train_metrics = []
test_losses = []
test_metrics = []

for epoch in range(0,EPOCHS):
  print(f'Epoch {epoch}')
  train_losses, train_metrics = train_module(baseline_model2, device, train_loader, optimizer, criterion, metric, train_losses, train_metrics)
  test_losses , test_metrics = test_module(baseline_model2, device, valid_loader, criterion, metric, test_losses, test_metrics)

Epoch 0


Avg Train Loss: 0.6331150531768799 Avg Train Metric: 0.8777210712432861: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]
Avg Test Loss: 0.594943642616272 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]


Epoch 1


Avg Train Loss: 0.5729416608810425 Avg Train Metric: 0.8477210849523544: 100%|██████████| 4/4 [00:04<00:00,  1.04s/it]
Avg Test Loss: 0.5431259274482727 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


Epoch 2


Avg Train Loss: 0.5283570140600204 Avg Train Metric: 0.8543877601623535: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]
Avg Test Loss: 0.4982055127620697 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.74it/s]


Epoch 3


Avg Train Loss: 0.48714953660964966 Avg Train Metric: 0.8993877619504929: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]
Avg Test Loss: 0.4589575529098511 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


Epoch 4


Avg Train Loss: 0.4522959813475609 Avg Train Metric: 0.8943877518177032: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Avg Test Loss: 0.4243566691875458 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]


Epoch 5


Avg Train Loss: 0.41842344403266907 Avg Train Metric: 0.9260544180870056: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]
Avg Test Loss: 0.39477255940437317 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s]


Epoch 6


Avg Train Loss: 0.38188544660806656 Avg Train Metric: 0.8477210998535156: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]
Avg Test Loss: 0.3728202283382416 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]


Epoch 7


Avg Train Loss: 0.3640465512871742 Avg Train Metric: 0.8893877565860748: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]
Avg Test Loss: 0.3553853929042816 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.73it/s]


Epoch 8


Avg Train Loss: 0.34949876368045807 Avg Train Metric: 0.8693877458572388: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
Avg Test Loss: 0.33892932534217834 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


Epoch 9


Avg Train Loss: 0.3185894936323166 Avg Train Metric: 0.8943877518177032: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Avg Test Loss: 0.3165799677371979 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
