In [92]:
!pip install torchmetrics



In [93]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import math

from tabulate import tabulate
from tqdm import trange
import random
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision

In [94]:
FILL_IN = "FILL_IN"

In [95]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
!unzip -o smsspamcollection.zip

--2024-12-04 04:07:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.8’

smsspamcollection.z     [   <=>              ] 198.65K   309KB/s    in 0.6s    

2024-12-04 04:07:31 (309 KB/s) - ‘smsspamcollection.zip.8’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [96]:
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [97]:
!head -10 SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [98]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
!unzip -o smsspamcollection.zip

--2024-12-04 04:07:32--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.9’

smsspamcollection.z     [   <=>              ] 198.65K   311KB/s    in 0.6s    

2024-12-04 04:07:33 (311 KB/s) - ‘smsspamcollection.zip.9’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [99]:
file_path = 'SMSSpamCollection'
df = pd.DataFrame({'label':int(), 'text':str()}, index = [])
with open(file_path) as f:
    for line in f.readlines():
        split = line.split('\t')
        df = pd.concat([
                df,
                pd.DataFrame.from_dict({
                    'label': [1 if split[0] == 'spam' else 0],
                    'text': [split[1]]
                })
            ],
            ignore_index=True
        )
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
text = df.text.values
labels = df.label.values

In [101]:
# Get the bert-base-uncased tokenizer and set lower case to True
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [102]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text)-1)
    table = np.array([tokenizer.tokenize(text[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ yes      │        2748 │
├──────────┼─────────────┤
│ baby     │        3336 │
├──────────┼─────────────┤
│ !        │         999 │
├──────────┼─────────────┤
│ i        │        1045 │
├──────────┼─────────────┤
│ need     │        2342 │
├──────────┼─────────────┤
│ to       │        2000 │
├──────────┼─────────────┤
│ stretch  │        7683 │
├──────────┼─────────────┤
│ open     │        2330 │
├──────────┼─────────────┤
│ your     │        2115 │
├──────────┼─────────────┤
│ pussy    │       22418 │
├──────────┼─────────────┤
│ !        │         999 │
╘══════════╧═════════════╛


In [103]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(input_text, add_special_tokens=True,
                               max_length=32, pad_to_max_length = True,
                               return_attention_mask=True, return_tensors='pt')

for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [104]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]
    table = np.array([tokens, token_ids, attention]).T
    print(
        tabulate(
            table,
            headers = ['Tokens', 'Token IDs', 'Attention Mask'],
            tablefmt = 'fancy_grid')
    )

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ [CLS]    │         101 │                1 │
├──────────┼─────────────┼──────────────────┤
│ tired    │        5458 │                1 │
├──────────┼─────────────┼──────────────────┤
│ .        │        1012 │                1 │
├──────────┼─────────────┼──────────────────┤
│ i        │        1045 │                1 │
├──────────┼─────────────┼──────────────────┤
│ haven    │        4033 │                1 │
├──────────┼─────────────┼──────────────────┤
│ '        │        1005 │                1 │
├──────────┼─────────────┼──────────────────┤
│ t        │        1056 │                1 │
├──────────┼─────────────┼──────────────────┤
│ slept    │        7771 │                1 │
├──────────┼─────────────┼──────────────────┤
│ well     │        2092 │                1 │
├──────────┼─────────────┼──────────────────┤
│ the      │        1996 │        

In [105]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(np.arange(len(labels)),
                                      test_size=val_ratio,
                                      stratify=labels,
                                      shuffle=True,
                                      random_state=42)

# Train and validation sets
train_set = train_set = TensorDataset(token_id[train_idx], attention_masks[train_idx], labels[train_idx])

val_set = TensorDataset(token_id[val_idx], attention_masks[val_idx], labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=batch_size)
validation_dataloader = DataLoader(val_set, sampler=SequentialSampler(val_set), batch_size=batch_size)

Define the LoRA specific layers.

In [106]:
# Define a LoRA Layer which has A, B and alpha parameters
class LoRALayer(torch.nn.Module):
  def __init__(self, in_dim, out_dim, rank, alpha):
    super().__init__()
    # Initialize A to be a parameter matrix of dimension in_dim by rank
    self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
    # Initialize all the elements of A via kaiming_uniform with a equal to sqrt(5)
    torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
    # Initialize B to be a zero parameter matrix of the appropriate dimensions
    self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
    self.alpha = alpha

  def forward(self, x):
    # Pass x through the LoRA layer and return the new x
    x = self.alpha * torch.matmul(torch.matmul(x, self.A), self.B)
    return x

# Define a class LoRALinear which has a linear layer and a LoRA layer on top
class LoRALinear(torch.nn.Module):
  def __init__(self, linear, rank, alpha):
    super().__init__()
    self.linear = linear
    self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

  def forward(self, x):
    # Pass x through the linear layer and also the lora layer
    return self.linear(x) + self.lora(x)

def lora_linear_replace(model, rank, alpha):
  # Use model.named_children to go through all layers and if the layer is Linear replace that layer with LoRALinear
  for name, module in model.named_children():
    # If the module is linear, replace the module in the model with a LoRA layer
    if isinstance(module, torch.nn.Linear):
      # Replace the Linear layer with LoRALinear
      setattr(model, name, LoRALinear(module, rank, alpha))
    else:
      # Alterntively, recursively apply the same function to child modules so that other Linear layers get replaced
      lora_linear_replace(module, rank, alpha)

### Load specific versions of the model

In [107]:
# Load the BertForSequenceClassification model
# Do not ouput the attentions and all hidden states
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    output_attentions=False,
    output_hidden_states=False
)

# Turn off all gradients of the model to start
for param in model.parameters():
    param.requires_grad = False

# Set to True if LORA is used; if False, fine_tune flag will be used to decide if you fine tune the entire model or just parts
use_lora  = True
# If this is False, turn off gradients
fine_tune = False
# Set total_parameters to 0; this will count the number of parameters in each case
total_parameters = 0

if use_lora:
  # Use the lora_linear to attach a LoRA layer to each linear later of the original BERT model
  lora_linear_replace(model, rank=8, alpha=16)
  # Get the total number of parameters with gradients
  for param in model.parameters():
        if param.requires_grad:
            total_parameters += param.numel()
else:
  # If fine_tune is off, turn off gradients for all layers other than classifier
  if not fine_tune:
    # Turn off all gradients; count just the 'classifier' layer which is the only one that has gradients
    for name, param in model.named_parameters():
        if "classifier" in name:
            param.requires_grad = True
            total_parameters += param.numel()
        else:
            param.requires_grad = False
  else:
    # Turn off all gradients except for classifier
    for param in model.parameters():
        param.requires_grad = True
        total_parameters += param.numel()

print(total_parameters)

if use_lora:
  assert(total_parameters == 1345552)
else:
  if fine_tune:
    assert(total_parameters == 109483778)
  else:
    assert(total_parameters == 1538)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1345552


### Set the model to the right device

In [108]:
import platform

# Pick the system you have and select GPU if you can
if platform.system() == 'Darwin':
    device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
elif platform.system() == 'Linux':
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
else:
    device = torch.device('cpu')
print(device)

cuda


In [109]:
_ = model.to(device)

# Recommended number of epochs: See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

In [110]:
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5
# See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-08
)

### Train the model

In [111]:
# Use torchmetrics to set up accuracy, recall, precision, and auroc
accuracy = Accuracy(task='binary', num_classes=2).to(device)
recall = Recall(task='binary', num_classes=2).to(device)
precision = Precision(task='binary', num_classes=2).to(device)
auroc = AUROC(task="binary").to('cpu')

In [112]:
model.device

device(type='cuda', index=0)

In [None]:
# Main training / validation loop
import tqdm
for _ in tqdm.tqdm(trange(epochs, desc = 'Epoch')):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in tqdm.tqdm(enumerate(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # Set the gradients to zero
        optimizer.zero_grad()

        # Forward pass
        train_output = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        # Backward pass
        loss = train_output.loss
        loss.backward()
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_auroc = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
            eval_output = model(b_input_ids, attention_mask=b_input_mask)

        # Calculate validation metrics
        labels = b_labels.to('cpu')
        predicted_labels = torch.argmax(eval_output.logits.to('cpu'), dim=-1)

        val_accuracy.append(accuracy(predicted_labels, labels).item())
        val_recall.append(recall(predicted_labels, labels).item())
        val_precision.append(precision(predicted_labels, labels).item())
        val_auroc.append(auroc(predicted_labels.to('cpu'), labels.to('cpu')).item())

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    print('\t - Validation AUROC: {:.4f}\n'.format(sum(val_auroc)/len(val_auroc)))

### Test on a specific sentence, see the outcome

In [None]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
    output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

### Questions

Question 1: Run the above by fine tuning bert and the classfier head and by not doing this (using BERT as a feature encoder). What is the gap between this?

**Metrics**: Fine-tuning BERT and Classifier Head (`fine_tune = True`):

- Epoch 1:
  - Validation Accuracy: 0.9821
  - Validation Precision: 0.8714
  - Validation Recall: 0.7745
  - Validation AUROC: 0.8444
- Epoch 2:
  - Validation Accuracy: 0.9884
  - Validation Precision: 0.8828
  - Validation Recall: 0.8674
  - Validation AUROC: 0.8880
- Predicted Class: Spam

**Metrics**: Using BERT as a Feature Encoder (`fine_tune = False`):

- Epoch 1:
  - Validation Accuracy: 0.8666
  - Validation Precision: 0.0000
  - Validation Recall: 0.0000
  - Validation AUROC: 0.4571
- Epoch 2:
  - Validation Accuracy: 0.8657
  - Validation Precision: 0.0000
  - Validation Recall: 0.0000
  - Validation AUROC: 0.4566
- Predicted Class: Ham

**Metrics**: Fine-tuning LoRA weights (`use_lora = True`):

- Epoch 1:
  - Validation Accuracy: 0.9830
  - Validation Precision: 0.8488
  - Validation Recall: 0.8126
  - Validation AUROC: 0.8607
- Epoch 2:
  - Validation Accuracy: 0.9875
  - Validation Precision: 0.8667
  - Validation Recall: 0.8464
  - Validation AUROC: 0.8783
- Predicted Class: Spam

**Gap Analysis (Case 1 vs Case 2)**:
- Accuracy Gap: 0.1227
- Precision Gap: 0.8828
- Recall Gap: 0.8674
- AUROC Gap: 0.4314
- Predicted Class: Spam (Case 1) vs Ham (Case 2)

**Gap Analysis (Case 1 vs Case 3)**:
- Accuracy Gap: 0.0009
- Precision Gap: 0.0161
- Recall Gap: 0.0210
- AUROC Gap: 0.0097
- Predicted Class: Spam (Case 1) vs Spam (Case 3)

**Observations**:
1. Performance Gap:
  - Case 1 (`fine_tune = True`) vs Case 2 (`fine_tune = False`): Fine-tuning the entire model (Case 1) significantly outperforms using BERT as a frozen feature encoder (Case 2) across all metrics. Case 2 struggles to learn meaningful task-specific representations because it relies entirely on the frozen BERT embeddings, which are not adapted to the spam detection task. This results in almost no learning, as evidenced by `0.0` precision and recall across epochs.

   - Case 1 (`fine_tune = True`) vs Case 3 (`use_lora = True`): Fine-tuning LoRA weights (Case 3) achieves metrics close to those of case 1, with a marginal performance gap. LoRA shows only a slight reduction in validation metrics (e.g., AUROC gap of 0.0097) while maintaining computational efficiency.

2. Both case 1 and case 3 correctly predict the input as Spam, aligning well with typical spam characteristics (e.g., urgency, monetary rewards, and excessive capitalization). However, case 2 incorrectly predicts the input as Ham, highlighting the limitations of using a frozen BERT for tasks requiring deeper contextual understanding and adaptation.

3. Training Efficiency: Fine-tuning the entire model (`fine_tune = True`, Case 1) took approximately 40 seconds per epoch. Fine-tuning LoRA weights (`use_lora = True`, Case 3) took approximately 28 seconds per epoch, showing improved efficiency while achieving similar performance to Case 1. Using BERT as a feature encoder (`fine_tune = False`, Case 2) was the fastest, taking only 11 seconds per epoch, but this speed came at the cost of severely degraded performance, making it unsuitable for meaningful tasks.

4. LoRA Efficiency and Effectiveness (Case 3): LoRA achieves a strong balance between efficiency and performance. Compared to case 1, it reduces training time by about 30% while maintaining nearly the same metrics across validation accuracy, precision, recall, and AUROC. This makes LoRA a compelling alternative to full fine-tuning, particularly for tasks where computational resources are limited.

**Conclusion**: Fine-tuning entire model offers the best performance by fully adapting the BERT model to the spam detection task but requires the most training time, while using BERT as a feature encoder is computationally efficient but fails to adapt to the task, making it unsuitable for meaningful downstream tasks like spam detection. For tasks requiring both high performance and computational efficiency, LoRA emerges as the best choice, striking a balance between accuracy and resource usage.