### Download Dataset

In [1]:
import urllib

In [2]:
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
urllib.request.urlretrieve(url, filename="./data/sms_spam_collection.zip")

('./data/sms_spam_collection.zip', <http.client.HTTPMessage at 0x7f7ff08536a0>)

In [7]:
!unzip ./data/sms_spam_collection.zip -d ./data

Archive:  ./data/sms_spam_collection.zip
  inflating: ./data/SMSSpamCollection  
  inflating: ./data/readme           


In [1]:
import pandas as pd

df = pd.read_csv("./data/SMSSpamCollection", sep="\t", header=None, names=["Label", "Text"])
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [3]:
def create_balanced_dataset(df):
    
    # Count the instances of "spam"
    num_spam = df[df["Label"] == "spam"].shape[0]
    
    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
    
    # Combine ham "subset" with "spam"
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])

    return balanced_df


balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [4]:
balanced_df["class_label_index"] = balanced_df['Label'].map({
    "ham": 0, "spam": 1
})

In [5]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

In [6]:
train_df.shape, validation_df.shape, test_df.shape

((1045, 3), (149, 3), (300, 3))

In [7]:
train_df.head()

Unnamed: 0,Label,Text,class_label_index
0,ham,Dude how do you like the buff wind.,0
1,ham,Tessy..pls do me a favor. Pls convey my birthd...,0
2,spam,Reminder: You have not downloaded the content ...,1
3,spam,Got what it takes 2 take part in the WRC Rally...,1
4,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, £...",1


### Creating Data Loaders

* Note that the text messages have different lengths; if we want to combine multiple training examples in a batch, we have to either

    1. truncate all messages to the length of the shortest message in the dataset or batch
    2. pad all messages to the length of the longest message in the dataset or batch

* We choose option 2 and pad all messages to the longest message in the dataset

* For that, we use <|endoftext|> as a padding token

In [8]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [9]:
tokenizer.eos_token, tokenizer.eos_token_id

('<|endoftext|>', 50256)

##### Spam Dataset

In [10]:
import torch
from torch.utils.data import Dataset


class SpamDataset(Dataset):
    def __init__(self, data_df, tokenizer, max_length=None, pad_token_id=50256):
        self.data = data_df.reset_index(drop=True)

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["class_label_index"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length
        # Note: A more pythonic version to implement this method
        # is the following, which is also used in the next chapter:
        # return max(len(encoded_text) for encoded_text in self.encoded_texts)

In [11]:
train_dataset = SpamDataset(train_df, tokenizer)
val_dataset = SpamDataset(validation_df, max_length=None, tokenizer=tokenizer)
test_dataset = SpamDataset(test_df, max_length=None, tokenizer=tokenizer)

print(train_dataset.max_length, val_dataset.max_length, test_dataset.max_length)

120 71 92


In [12]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [13]:
## Sanity Test
for X, y in train_loader:
    print(X.shape, y.shape)
    break

torch.Size([8, 120]) torch.Size([8])


In [None]:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
19 validation batches
38 test batches


Please Note:
* Handling Masks with nn.MultiheadAttention
  
  Masks are often used in attention mechanisms to prevent attending to certain positions, such as padding tokens in NLP tasks. The nn.MultiheadAttention module supports both key padding masks and attention masks.

### Pretrained GPT2 Model Weights

In [15]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [17]:
text = "Every effort moves you"
encoded_input = tokenizer(text, return_tensors='pt')
print("Encoded:", encoded_input)
print("Encoded Tensor Shape:", encoded_input["input_ids"].shape)

Encoded: {'input_ids': tensor([[6109, 3626, 6100,  345]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
Encoded Tensor Shape: torch.Size([1, 4])


In [18]:
## Applying Greedy Decoding

# Select next token always with max prob using argmax
def generate_text_sample(model, idx, max_new_tokens, context_size):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)["logits"]
        logits = logits[:, -1, :]  # last vector
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [19]:
output_token_ids = generate_text_sample(model, encoded_input["input_ids"], max_new_tokens=40, context_size=model.config.n_ctx)

output_token_ids = output_token_ids.squeeze(dim=0).tolist() # remove batch size
print("Output Text:", tokenizer.decode(output_token_ids))

Output Text: Every effort moves you forward.

The first step is to understand the importance of your work.

The second step is to understand the importance of your work.

The third step is to understand the importance


In [20]:
# Using builtin hugging face method to generate sample text
output_tokens = model.generate(**encoded_input, max_new_tokens=25, top_k=10, temperature=1.5, do_sample=True)
tokenizer.decode(output_tokens[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Every effort moves you. It makes you want to do more. It moves you to become more creative. And then you get tired from the work'

* Before we finetune the model as a classifier, let's see if the model can perhaps already classify spam messages via prompting

In [28]:
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

encoded_input = tokenizer(text_2, return_tensors='pt')
output_token_ids = generate_text_sample(model, encoded_input["input_ids"], max_new_tokens=40, 
                                        context_size=model.config.n_ctx)

output_token_ids = output_token_ids.squeeze(dim=0).tolist() # remove batch size
print("Output Text:", tokenizer.decode(output_token_ids))


Output Text: Is the following text'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'



* As we can see, the model is not very good at following instructions
* This is expected, since it has only been pretrained and not instruction-finetuned (instruction finetuning will be covered in the next chapter)

In [29]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

* The goal is to replace and finetune the output layer
* To achieve this, we first freeze the model, meaning that we make all layers non-trainable

In [30]:
for param in model.parameters():
    param.requires_grad = False

In [31]:
torch.manual_seed(123)
num_classes = 2
model.lm_head = torch.nn.Linear(in_features=768, out_features=num_classes, bias=False)

* Technically, it's sufficient to only train the output layer
* However, as found in [Finetuning Large Language Models](https://magazine.sebastianraschka.com/p/finetuning-large-language-models), experiments show that finetuning additional layers can noticeably improve the performance
* So, we are also making the last transformer block and the final LayerNorm module connecting the last transformer block to the output layer trainable



In [32]:
# make last transformer block trainable
for param in model.transformer.h[-1].parameters():
    param.requires_grad = True

# make final layer norm trainable
for param in model.transformer.ln_f.parameters():
    param.requires_grad = True

In [33]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs token ids:", inputs)
print("Inputs dimensions:", inputs.shape) # shape: (batch_size, num_tokens)

with torch.no_grad():
    outputs = model(inputs)

print("Outputs:\n", outputs["logits"])
print("Outputs dimensions:", outputs["logits"].shape) # shape: (batch_size, num_tokens, num_classes)


Inputs token ids: tensor([[5211,  345,  423,  640]])
Inputs dimensions: torch.Size([1, 4])
Outputs:
 tensor([[[-1.5995,  1.0086],
         [-3.7375,  7.4730],
         [-2.2801,  6.6231],
         [-3.6124,  4.0084]]])
Outputs dimensions: torch.Size([1, 4, 2])


* In chapter 3, we discussed the attention mechanism, which connects each input token to each other input token
* In chapter 3, we then also introduced the causal attention mask that is used in GPT-like models; this causal mask lets a current token only attend to the current and previous token positions
* Based on this causal attention mechanism, the last token contains the most information among all tokens because it's the only token that includes information about all other tokens
Hence, we are particularly interested in this last token, which we will finetune for the spam classification task

In [34]:
print("Last output token:", outputs["logits"][:, -1, :])

Last output token: tensor([[-3.6124,  4.0084]])


In [22]:
class SpamClassificationGPTModel(torch.nn.Module):
    def __init__(self, num_classes: int = 2):
        super().__init__()

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')

        for param in self.model.parameters():
            param.requires_grad = False

        # make last transformer block trainable
        for param in self.model.transformer.h[-1].parameters():
            param.requires_grad = True

        # make final layer norm trainable
        for param in self.model.transformer.ln_f.parameters():
            param.requires_grad = True

        self.model.lm_head = torch.nn.Linear(in_features=768, out_features=num_classes, bias=False)

    
    def forward(self, x):
        # X [BS, SEQ]
        x = self.model(x) # BS, SEQ, 2
        return x["logits"][:, -1, :] #last token

In [23]:
classification_model = SpamClassificationGPTModel(num_classes=2)

In [24]:
# Sanity Test
print("Input Dimension:", X.shape)
classification_model.eval()
with torch.no_grad():
    output = classification_model(X)
print("Output Dimension:", output.shape)

Input Dimension: torch.Size([8, 120])
Output Dimension: torch.Size([8, 2])


In [25]:
@torch.no_grad
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            logits = model(input_batch) 
            
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(123) # For reproducibility due to the shuffling in the training data loader

train_accuracy = calc_accuracy_loader(train_loader, classification_model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(val_loader, classification_model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_loader, classification_model, device, num_batches=10)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")


Training accuracy: 53.75%
Validation accuracy: 55.00%
Test accuracy: 51.25%


In [26]:
from deepml.tasks import NeuralNetTask

task = NeuralNetTask(classification_model, "./model_wights/spam_classification")

In [27]:
from deepml.train import Learner

optimizer = torch.optim.AdamW(classification_model.parameters(), lr=4e-4, weight_decay=0.1)
loss = torch.nn.CrossEntropyLoss()

trainer = Learner(task, optimizer, loss)

In [28]:
trainer.fit(train_loader, val_loader, epochs=5)

Epoch 1/5:


Validation  : 100%|██████████| 19/19 [00:24<00:00,  1.31s/it, loss=0.6213]] 


Training Loss: 0.7670 Validation Loss: 0.6213 [Saving best validation model]
Epoch 2/5:


Training    : 100%|██████████| 130/130 [06:27<00:00,  2.98s/it, loss=0.767]
Validation  : 100%|██████████| 19/19 [06:30<00:00, 20.53s/it, loss=0.1105]


Training Loss: 0.2044 Validation Loss: 0.1105 [Saving best validation model]
Epoch 3/5:


Training    : 100%|██████████| 130/130 [17:01<00:00,  7.86s/it, loss=0.2044]
Validation  : 100%|██████████| 19/19 [01:23<00:00,  4.39s/it, loss=0.0864]8] 


Training Loss: 0.0918 Validation Loss: 0.0864 [Saving best validation model]
Epoch 4/5:


Training    : 100%|██████████| 130/130 [35:36<00:00, 16.43s/it, loss=0.0918]


KeyboardInterrupt: 

In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"
train_accuracy = calc_accuracy_loader(train_loader, classification_model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(val_loader, classification_model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_loader, classification_model, device, num_batches=10)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 98.75%
Validation accuracy: 97.50%
Test accuracy: 95.00%


In [38]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    model.eval()

    supported_context_length = 1024
    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
       
    # Truncate sequences if they too long
    input_ids = input_ids[:min(max_length, supported_context_length)]

    # Pad sequences to the longest sequence
    input_ids += [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "spam" if predicted_label == 1 else "not spam"


In [39]:
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(classify_review(
    text_1, classification_model, tokenizer, device, max_length=train_dataset.max_length
))


spam


In [41]:
text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

print(classify_review(
    text_2, classification_model, tokenizer, device, max_length=train_dataset.max_length
))

not spam
