<a href="https://colab.research.google.com/github/soutrik71/MInMaxBERT/blob/main/notebook/DistilBertClassification_GENAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In BERT uncased, the text has been lowercased before WordPiece tokenization step while in BERT cased, the text is same as the input text (no changes).

In [1]:
!pip install transformers
!pip install torcheval



In [2]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset, WeightedRandomSampler
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
import logging
import os
import random
from typing import List, Mapping, Dict
from transformers import AutoConfig, AutoModel
import torch.nn as nn
from torcheval.metrics import MulticlassAccuracy,BinaryAccuracy
logging.basicConfig(level=logging.DEBUG)

In [3]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

In [4]:
# Set manual seed since nn.Parameter are randomly initialzied
set_seed(71)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
MAX_LEN = 512
BATCH_SIZE = 25
EPOCHS = 10
LEARNING_RATE = 1e-05

Random seed set as 71
cuda


In [5]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
from transformers import AutoTokenizer
tokenizer_cp = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
tokenizer_cp

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Basic Data Preprocessing

In [9]:
full_df = pd.read_csv("https://raw.githubusercontent.com/soutrik71/MInMaxBERT/main/data/classifier_data.csv")

In [10]:
full_df.head()

Unnamed: 0,question,label
0,What is the percentage change in volume for sa...,simple
1,what should beThe target average time in route...,simple
2,Which social media platform has the highest nu...,simple
3,What Are Tools And Equipment Used In Truck Dep...,simple
4,"What is the net revenue per HL in Aug 2023, Ju...",simple


In [11]:
full_df.shape

(123, 2)

In [12]:
full_df['label'].value_counts()

simple     108
complex     15
Name: label, dtype: int64

In [13]:
target_dict = {'simple':0, 'complex':1}

In [14]:
full_df['target'] = full_df['label'].map(target_dict)

In [15]:
full_df.head()

Unnamed: 0,question,label,target
0,What is the percentage change in volume for sa...,simple,0
1,what should beThe target average time in route...,simple,0
2,Which social media platform has the highest nu...,simple,0
3,What Are Tools And Equipment Used In Truck Dep...,simple,0
4,"What is the net revenue per HL in Aug 2023, Ju...",simple,0


In [16]:
# train validation split
train_df, val_df = train_test_split(full_df, test_size=0.2, random_state=42, stratify=full_df['target'])

In [17]:
train_df.shape, val_df.shape

((98, 3), (25, 3))

In [18]:
NUM_CLASSES = len(target_dict)
print(NUM_CLASSES)

2


## Custom Torch Dataset Class

In [19]:
idx = np.random.randint(0, len(train_df))
sample_text = train_df.iloc[idx]['question']
sample_label = train_df.iloc[idx]['target']
print(sample_text)
print(sample_label)

Hello
0


In [20]:
outputs = tokenizer.encode_plus(
    text = sample_text,
    add_special_tokens=True,
    padding="max_length",
    max_length=MAX_LEN,
    return_tensors="pt",
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=True
)

In [21]:
outputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [22]:
ids = outputs['input_ids']

In [23]:
ids.squeeze(0).shape

torch.Size([512])

In [24]:
masks = outputs['attention_mask']
print(masks.shape)

torch.Size([1, 512])


**Both input and masks will be of [1,n] shape and we are not squeezing out the 1 extra dim as it will be internally handled and if not then we have to do it manually**

In [25]:
class BertClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = 512,
        model_name: str = "distilbert-base-uncased",
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            self.label_dict = dict(zip(sorted(set(labels)), range(len(set(labels)))))

        self.tokenizer =  DistilBertTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """

        # encoding the text
        input_text = self.texts[index].lower()

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict  = self.tokenizer.encode_plus(
              text = input_text,
              add_special_tokens=True,
              padding="max_length",
              max_length=self.max_seq_length,
              return_tensors="pt",
              truncation=True,
              return_attention_mask=True,
              return_token_type_ids=True
          )

        # dealing with attention masks - there's a 1 for each input token and
        # if the sequence is shorter that `max_seq_length` then the rest is
        # padded with zeroes. Attention mask will be passed to the model in
        # order to compute attention scores only with input data

        ids = output_dict['input_ids'].squeeze(0)
        mask = output_dict['attention_mask'].squeeze(0)

        # encoding target
        if self.labels is not None:
            y = self.labels[index]
            y_encoded = torch.Tensor([self.label_dict.get(y,)]).float().squeeze(0)


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': y_encoded
        }


In [38]:
train_dataset = BertClassificationDataset(
        texts=train_df["question"].values.tolist(),
        labels=train_df["label"].values,
        max_seq_length=MAX_LEN,
        model_name="distilbert-base-cased",
        label_dict=target_dict
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [39]:
valid_dataset = BertClassificationDataset(
        texts=val_df["question"].values.tolist(),
        labels=val_df["label"].values,
        max_seq_length=MAX_LEN,
        model_name="distilbert-base-cased",
        label_dict=target_dict
)

In [40]:
# data loader stuffs
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [41]:
for batch in train_loader:
    ids = batch['ids']
    mask = batch['mask']
    targets = batch['targets']
    print(ids.shape)
    print(mask.shape)
    print(targets.shape)
    break

torch.Size([25, 512])
torch.Size([25, 512])
torch.Size([25])


## Model Building

### Baseline Models using Distil Bert Classifier

In [42]:
class BertForSequenceClassification_A(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.1):
        super(BertForSequenceClassification_A, self).__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)
        print(config.hidden_size)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config) # alternate DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):

        assert attention_mask is not None, "attention mask is none"

        bert_output = self.model(input_ids=features, attention_mask=attention_mask)
        # we only need the hidden state here and don't need transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        # mean pooling, i.e. getting average representation of all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)
        return scores

In [51]:
baseline_model1 = BertForSequenceClassification_A("distilbert-base-cased",1).to(device)
baseline_model1

768


BertForSequenceClassification_A(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [56]:
# Creating the loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  baseline_model1.parameters(), lr=LEARNING_RATE)
metric = BinaryAccuracy(threshold= 0.5 , device = device)

In [58]:
def train_module(model:torch.nn.Module,
                 device:torch.device,
                 train_dataloader:torch.utils.data.DataLoader ,
                 optimizer:torch.optim.Optimizer,
                 criterion:torch.nn.Module,
                 metric,
                 train_losses:list,
                 train_metrics:list):

  # setting model to train mode
  model.train()
  pbar = tqdm(train_dataloader)

  # batch metrics
  train_loss = 0
  train_metric = 0
  processed_batch = 0

  for _,data in enumerate(pbar):

    ids = data['ids'].to(device)
    mask = data['mask'].to(device)
    targets = data['targets'].to(device)

    outputs = model(ids, mask).squeeze()
    # print(targets)
    # print(outputs)

    # calc loss
    loss = criterion(outputs, targets)
    train_loss += loss.item()
    # print(f"training loss for batch {idx} is {loss}")

    # backpropagation
    optimizer.zero_grad() # flush out  existing grads
    loss.backward() # back prop of weights wrt loss
    optimizer.step() # optimizer step -> minima

    # metric calc
    metric.update(outputs,targets)
    train_metric += metric.compute().detach().item()

    #updating batch count
    processed_batch += 1

    pbar.set_description(f"Avg Train Loss: {train_loss/processed_batch} Avg Train Metric: {train_metric/processed_batch}")

  # It's typically called after the epoch completes
  metric.reset()
  # updating epoch metrics
  train_losses.append(train_loss/processed_batch)
  train_metrics.append(train_metric/processed_batch)

  return train_losses, train_metrics


In [59]:
def test_module(model:torch.nn.Module,
                device:torch.device,
                test_dataloader:torch.utils.data.DataLoader,
                criterion:torch.nn.Module,
                metric,
                test_losses,
                test_metrics):
  # setting model to eval mode
  model.eval()
  pbar = tqdm(test_dataloader)

  # batch metrics
  test_loss = 0
  test_metric = 0
  processed_batch = 0

  with torch.inference_mode():
    for _, data in enumerate(pbar, 0):
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      targets = data['targets'].to(device)
      outputs = model(ids, mask).squeeze()
      # print(preds.shape)
      # print(label.shape)

     # calc loss
      loss = criterion(outputs, targets)
      test_loss += loss.item()

      # metric calc
      metric.update(outputs, targets)
      test_metric += metric.compute().detach().item()

      #updating batch count
      processed_batch += 1

      pbar.set_description(f"Avg Test Loss: {test_loss/processed_batch} Avg Test Metric: {test_metric/processed_batch}")

    # It's typically called after the epoch completes
    metric.reset()
    # updating epoch metrics
    test_losses.append(test_loss/processed_batch)
    test_metrics.append(test_metric/processed_batch)

  return test_losses, test_metrics

In [60]:
# Place holders----
train_losses = []
train_metrics = []
test_losses = []
test_metrics = []

for epoch in range(0,EPOCHS):
  print(f'Epoch {epoch}')
  train_losses, train_metrics = train_module(baseline_model1, device, train_loader, optimizer, criterion, metric, train_losses, train_metrics)
  test_losses , test_metrics = test_module(baseline_model1, device, valid_loader, criterion, metric, test_losses, test_metrics)

Epoch 0


Avg Train Loss: 0.2962701991200447 Avg Train Metric: 0.8743877559900284: 100%|██████████| 4/4 [00:04<00:00,  1.14s/it]
Avg Test Loss: 0.2732808291912079 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]


Epoch 1


Avg Train Loss: 0.2570966072380543 Avg Train Metric: 0.8877210766077042: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Avg Test Loss: 0.23101018369197845 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


Epoch 2


Avg Train Loss: 0.21894674748182297 Avg Train Metric: 0.9193877428770065: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]
Avg Test Loss: 0.19483321905136108 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


Epoch 3


Avg Train Loss: 0.18963734433054924 Avg Train Metric: 0.8943877518177032: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]
Avg Test Loss: 0.1601826399564743 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


Epoch 4


Avg Train Loss: 0.15293970704078674 Avg Train Metric: 0.8910544216632843: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]
Avg Test Loss: 0.12787745893001556 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


Epoch 5


Avg Train Loss: 0.12527434900403023 Avg Train Metric: 0.8769387602806091: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]
Avg Test Loss: 0.09719528257846832 Avg Test Metric: 0.9599999785423279: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


Epoch 6


Avg Train Loss: 0.10195595864206553 Avg Train Metric: 0.8729251474142075: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Avg Test Loss: 0.07304985076189041 Avg Test Metric: 0.9599999785423279: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


Epoch 7


Avg Train Loss: 0.07264623790979385 Avg Train Metric: 0.9647959172725677: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Avg Test Loss: 0.052518151700496674 Avg Test Metric: 1.0: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


Epoch 8


Avg Train Loss: 0.05843451712280512 Avg Train Metric: 0.9915646314620972: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Avg Test Loss: 0.0376325361430645 Avg Test Metric: 1.0: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Epoch 9


Avg Train Loss: 0.04805159009993076 Avg Train Metric: 0.9915646314620972: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Avg Test Loss: 0.027342937886714935 Avg Test Metric: 1.0: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


## Extended Model

In [61]:
class BertForSequenceClassification_B(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.1):
        super(BertForSequenceClassification_B, self).__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)
        print(config.hidden_size)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config) # alternate DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.fc1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):

        assert attention_mask is not None, "attention mask is none"

        bert_output = self.model(input_ids=features, attention_mask=attention_mask)
        # we only need the hidden state here and don't need transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        pooler = seq_output[:, 0] # take out the first hideen state
        pooler = self.dropout(pooler)
        pooler = self.fc1(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        scores = self.classifier(pooler)

        return scores

In [62]:
baseline_model2 = BertForSequenceClassification_B("distilbert-base-cased",1).to(device)
baseline_model2

768


BertForSequenceClassification_B(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [63]:
# Creating the loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  baseline_model2.parameters(), lr=LEARNING_RATE)
metric = BinaryAccuracy(threshold= 0.5 , device = device)

In [64]:
# Place holders----
train_losses = []
train_metrics = []
test_losses = []
test_metrics = []

for epoch in range(0,EPOCHS):
  print(f'Epoch {epoch}')
  train_losses, train_metrics = train_module(baseline_model2, device, train_loader, optimizer, criterion, metric, train_losses, train_metrics)
  test_losses , test_metrics = test_module(baseline_model2, device, valid_loader, criterion, metric, test_losses, test_metrics)

Epoch 0


Avg Train Loss: 0.6644908934831619 Avg Train Metric: 0.8760544210672379: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
Avg Test Loss: 0.6207653880119324 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]


Epoch 1


Avg Train Loss: 0.6048317700624466 Avg Train Metric: 0.8610544204711914: 100%|██████████| 4/4 [00:04<00:00,  1.12s/it]
Avg Test Loss: 0.5616757869720459 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


Epoch 2


Avg Train Loss: 0.5455614253878593 Avg Train Metric: 0.8560544103384018: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Avg Test Loss: 0.5053665041923523 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


Epoch 3


Avg Train Loss: 0.49134328961372375 Avg Train Metric: 0.846054419875145: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
Avg Test Loss: 0.4522315263748169 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


Epoch 4


Avg Train Loss: 0.4394281581044197 Avg Train Metric: 0.9027210921049118: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Avg Test Loss: 0.40575897693634033 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


Epoch 5


Avg Train Loss: 0.4075167626142502 Avg Train Metric: 0.8827210813760757: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]
Avg Test Loss: 0.3737267553806305 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]


Epoch 6


Avg Train Loss: 0.371970035135746 Avg Train Metric: 0.8560544103384018: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
Avg Test Loss: 0.35572826862335205 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]


Epoch 7


Avg Train Loss: 0.3479520305991173 Avg Train Metric: 0.8643877506256104: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]
Avg Test Loss: 0.3379162549972534 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


Epoch 8


Avg Train Loss: 0.3410060927271843 Avg Train Metric: 0.8943877518177032: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]
Avg Test Loss: 0.3106881380081177 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


Epoch 9


Avg Train Loss: 0.31717853248119354 Avg Train Metric: 0.8760544210672379: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]
Avg Test Loss: 0.2841821610927582 Avg Test Metric: 0.8799999952316284: 100%|██████████| 1/1 [00:00<00:00,  1.63it/s]


### Evaluation Framework

In [89]:
# Create sentence and label lists
raw_sentences = ["What was the sales for brand Hurricane in India?",
                 "How did the sales for brand X varied over last one year?",
                 "Explain how has the impact on water shortage on the production of beer in India?",
                 "How is the performance of Classic lager segment in LP TY compared to LP YA?",
                 "Detail the reasons for drop in sales for brnad Corona in the west?"]

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in raw_sentences:
  encoded_dict = tokenizer.encode_plus(
              text = sent.lower(),
              add_special_tokens=True,
              padding="max_length",
              max_length=MAX_LEN,
              return_tensors="pt",
              truncation=True,
              return_attention_mask=True,
              return_token_type_ids=True
          )
  # Add the encoded sentence to the list.
  input_ids.append(encoded_dict['input_ids'])

  # And its attention mask (simply differentiates padding from non-padding).
  attention_masks.append(encoded_dict['attention_mask'])


In [90]:
input_ids[0].shape,attention_masks[0].shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [91]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels)

In [92]:
# Create the DataLoader.
# prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_dataloader = DataLoader(prediction_data, batch_size=BATCH_SIZE)

In [93]:
for batch in prediction_dataloader:
  print(batch)
  break

[tensor([[ 101, 1184, 1108,  ...,    0,    0,    0],
        [ 101, 1293, 1225,  ...,    0,    0,    0],
        [ 101, 4137, 1293,  ...,    0,    0,    0],
        [ 101, 1293, 1110,  ...,    0,    0,    0],
        [ 101, 6505, 1103,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])]


In [94]:
def eval_module(model:torch.nn.Module,
                device:torch.device,
                prediction_dataloader:torch.utils.data.DataLoader,
              ):
  """
  Function to evaluate the model on a given dataset.
  """
  # setting model to eval mode

  predictions = []
  model.eval()
  pbar = tqdm(prediction_dataloader)

  with torch.inference_mode():
    for _, data in enumerate(pbar, 0):
      ids = data[0].to(device)
      mask = data[1].to(device)
      outputs = model(ids, mask).squeeze()

      outputs = torch.sigmoid(outputs)
      predictions.append(outputs.cpu().detach().numpy())
  return predictions




In [95]:
predictions = eval_module(baseline_model1, device, prediction_dataloader)

100%|██████████| 1/1 [00:00<00:00,  9.29it/s]


In [96]:
predictions

[array([0.01225466, 0.414234  , 0.48458377, 0.9456737 , 0.17545688],
       dtype=float32)]