### Specific package requirement

In [None]:
pip install datasets transformers[sentencepiece]

### Imports

In [None]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd
from datasets import load_metric
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW,get_scheduler
import numpy as np
from statistics import NormalDist

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2024-04-11 14:58:45.949989: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-11 14:58:46.008075: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 14:58:46.271691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-11 14:58:46.271719: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

### Load datasets & tokenize, pre-trained model checkpoint

In [None]:
data=load_dataset("json",data_files="/home/ssrini27/random/Sarcasm_Headlines_Dataset_v2.json")
data=data.rename_column("is_sarcastic","label")

data=data.remove_columns(['article_link'])

data.set_format('pandas')
data=data['train'][:]

data.drop_duplicates(subset=['headline'],inplace=True)
data=data.reset_index()[['headline','label']]
data=Dataset.from_pandas(data)

# 80% train, 20% test + validation
train_testvalid = data.train_test_split(test_size=0.2,seed=15)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

# gather everyone if you want to have a single DatasetDict
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

data

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [None]:
checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

In [None]:
def tokenize(batch):
  return tokenizer(batch["headline"], truncation=True,max_length=512)

tokenized_dataset = data.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/22802 [00:00<?, ? examples/s]

Map:   0%|          | 0/2851 [00:00<?, ? examples/s]

Map:   0%|          | 0/2850 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})

In [None]:
tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Define our WHOLE model

In [None]:
class CustomModel(nn.Module):
  def __init__(self,checkpoint,num_labels):
    super(CustomModel,self).__init__()
    self.num_labels = num_labels

    #Load Model with given checkpoint and extract its body
    self.model = model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    self.dropout = nn.Dropout(0.2)
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    logits = self.classifier(sequence_output[:,0,:].view(-1,768))

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

Model with couple extra layers tested (Not considered)

In [None]:
class CustomModel(nn.Module):
  def __init__(self,checkpoint,num_labels):
    super(CustomModel,self).__init__()
    self.num_labels = num_labels

    #Load Model with given checkpoint and extract its body
    self.model = model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    self.dropout = nn.Dropout(0.2)
    self.linear1 = nn.Linear(768, 768)
    self.dropout2 = nn.Dropout(0.2)
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    sequence_output = self.linear1(sequence_output)  # Apply linear layer
    sequence_output = self.dropout2(sequence_output)  # Apply dropout

    logits = self.classifier(sequence_output[:,0,:].view(-1,768))

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [None]:
model=CustomModel(checkpoint=checkpoint,num_labels=2).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create Dataloaders

In [None]:
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=32, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset["valid"], batch_size=32, collate_fn=data_collator
)

Set training params

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

3565




### Training the model

In [None]:
metric = load_metric("f1")

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
  model.train()
  for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar_train.update(1)

  model.eval()

  for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar_eval.update(1)

  print(metric.compute())

In [None]:
model.eval()

test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=32, collate_fn=data_collator
)

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'f1': 0.9276241000378932}

### Saving and loading the model

In [None]:
torch.save(model.state_dict(), "/home/ssrini27/random/model_5.pt")

In [None]:
model = CustomModel(checkpoint=checkpoint,num_labels=2)
model.load_state_dict(torch.load("/home/ssrini27/random/model_5.pt"))
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

### Defining our custom SPLIT model

In [None]:
class CustomModel_body(nn.Module):
  def __init__(self,checkpoint):
    super(CustomModel_body,self).__init__()

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))

  def forward(self, input_ids=None, attention_mask=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    return outputs[0], outputs.hidden_states, outputs.attentions

class CustomModel_head(nn.Module):
  def __init__(self,num_labels):
    super(CustomModel_head,self).__init__()
    self.num_labels = num_labels
    self.dropout = nn.Dropout(0.2)
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, sequence_output, transformer_hidden_states, transformer_attentions, labels=None):
    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=transformer_hidden_states, attentions=transformer_attentions)

Load saved model into our split model classes

In [None]:
model_body = CustomModel_body(checkpoint)
model_head = CustomModel_head(num_labels=2)

model_body.load_state_dict(model.state_dict(), strict=False)
model_head.load_state_dict(model.state_dict(), strict = False)

model_body.to(device)
model_head.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel_head(
  (dropout): Dropout(p=0.2, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

Perform normal evaluation on the combied model to test

In [None]:
model_body.eval()
model_head.train()
metric = load_metric("f1")

test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=32, collate_fn=data_collator
)

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        sequence_output, hidden_states, transformer_attentions = model_body(batch['input_ids'], batch['attention_mask'])
        outputs = model_head(sequence_output, hidden_states, transformer_attentions, labels = batch['labels'])

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'f1': 0.9284361984096933}

Perform MC evaluation on the SPLIT model

In [None]:
model_body.eval()
model_head.train()
metric = load_metric("f1")

no_MC_iterations = 100
confidence_percent = 0.95
z_value = NormalDist().inv_cdf((1 + confidence_percent) / 2.)
threshold_value = 0.5

# batch size 1 to perform UQ on single sample
test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=1, collate_fn=data_collator
)

considered_samples = []
ignored_samples = []

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Only get body outputs for 1 itr
    with torch.no_grad():
        sequence_output, hidden_states, transformer_attentions = model_body(batch['input_ids'], batch['attention_mask'])

    # Perform rest of the iterations ONLY on the head of the model
    MC_outputs = np.zeros((no_MC_iterations, 2))

    for i in range(no_MC_iterations):
        with torch.no_grad():
            outputs = model_head(sequence_output, hidden_states, transformer_attentions, labels = batch['labels'])
        logits = outputs.logits

        MC_outputs[i] = torch.sigmoid(logits)[0].numpy(force = True)

        # predictions = torch.argmax(logits, dim=-1).to('cpu')
        # metric.add_batch(predictions=predictions, references=batch["labels"])

    mean_outputs = np.mean(MC_outputs, axis = 0)
    std_outputs = np.std(MC_outputs, axis = 0)

    ignored_sample_flag = False

    for i in range(len(mean_outputs)):
        interval_min = mean_outputs[i] - z_value*std_outputs[i]
        interval_max = mean_outputs[i] + z_value*std_outputs[i]
        if interval_max < threshold_value:
            final_class_pred = 0
        elif interval_min > threshold_value:
            final_class_pred = 1
        else:
            ignored_sample_flag = True

    if ignored_sample_flag:
        ignored_samples.append([np.argmax(mean_outputs), batch['labels'].numpy(force = True)[0]])
    else:
        considered_samples.append([final_class_pred, batch['labels'].numpy(force = True)[0]])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
considered_samples = np.array(considered_samples)
ignored_samples = np.array(ignored_samples)

Slightly higher F1 score among chosen samples

In [None]:
metric = load_metric("f1")
metric.add_batch(predictions=considered_samples[:, 0], references=considered_samples[:, 1])
metric.compute()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'f1': 0.9392649903288202}

Low F1 score among other samples

In [None]:
metric = load_metric("f1")
metric.add_batch(predictions=ignored_samples[:, 0], references=ignored_samples[:, 1])
metric.compute()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'f1': 0.45614035087719296}

In [None]:
ignored_samples.shape

(47, 2)

In [None]:
metric = load_metric('accuracy')
metric.add_batch(predictions=ignored_samples[:, 0], references=ignored_samples[:, 1])
metric.compute()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.3404255319148936}

Evaluation on WHOLE model on test dataset (not necessary)

In [None]:
model.eval()
model.dropout.train()

test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=1, collate_fn=data_collator
)

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    print("one" + str(logits))
    with torch.no_grad():

        outputs = model(**batch)

    logits = outputs.logits
    print("two" + str(logits))


    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

### Timing the models with and without splitting

In [None]:
test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=1, collate_fn=data_collator
)
extracted_batch = 0
for batch in test_dataloader:
    extracted_batch = {k: v.to(device) for k, v in batch.items()}
    break

model.eval()
model.dropout.train()
model_body.eval()
model_head.train()

CustomModel_head(
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

In [None]:
%%timeit
with torch.no_grad():
    outputs = model(**extracted_batch)

1.91 ms ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
with torch.no_grad():
    sequence_output, hidden_states, transformer_attentions = model_body(extracted_batch['input_ids'], extracted_batch['attention_mask'])
    outputs = model_head(sequence_output, hidden_states, transformer_attentions, labels = extracted_batch['labels'])

1.94 ms ± 1.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Wee see on avg splitting the model costs more over 1 iteration





Trying over 50 iterations

In [None]:
%%timeit
for i in range(50):
    with torch.no_grad():
        outputs = model(**extracted_batch)

95.8 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
sequence_output, hidden_states, transformer_attentions = model_body(extracted_batch['input_ids'], extracted_batch['attention_mask'])
for i in range(50):
    with torch.no_grad():
        outputs = model_head(sequence_output, hidden_states, transformer_attentions, labels = extracted_batch['labels'])

3.78 ms ± 8.83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


But running over 50 iterations, there is a speedup of 25

Trying 100 iterations

In [None]:
%%timeit
for i in range(100):
    with torch.no_grad():
        outputs = model(**extracted_batch)

190 ms ± 93.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
sequence_output, hidden_states, transformer_attentions = model_body(extracted_batch['input_ids'], extracted_batch['attention_mask'])
for i in range(100):
    with torch.no_grad():
        outputs = model_head(sequence_output, hidden_states, transformer_attentions, labels = extracted_batch['labels'])

5.61 ms ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Running over 100 iterations we have a 33 times speedup!