In [37]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import tiktoken

In [38]:
tokenizer = tiktoken.get_encoding('gpt2')

In [39]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Create an unverified SSL context
    ssl_context = ssl._create_unverified_context()

    # Downloading the file
    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [40]:
df = pd.read_csv(data_file_path, sep="\t", header=None, names=['Label', 'Text'])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [41]:
num_spam = df['Label'].value_counts()
num_spam = num_spam['spam']

In [42]:
df_ham = df[df['Label'] == 'ham'].sample(num_spam)
df_spam = df[df["Label"] == "spam"]
df_balanced = pd.concat([df_ham, df_spam])
df_balanced['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,747
spam,747


In [43]:
df_balanced['Label'] = df_balanced['Label'].map({'ham': 0, 'spam': 1})
df_balanced['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,747
1,747


In [44]:
df_balanced.head()

Unnamed: 0,Label,Text
2813,0,"Say this slowly.? GOD,I LOVE YOU &amp; I NEED ..."
4679,0,It is a good thing I'm now getting the connect...
4178,0,Ok lor then we go tog lor...
2327,0,The Xmas story is peace.. The Xmas msg is love...
3684,0,When did i use soc... I use it only at home......


In [45]:
df_balanced = df_balanced.sample(frac=1, random_state=123)

In [46]:
train_frac = 0.7
val_frac = 0.1

train_end = int(len(df_balanced) * train_frac)
train_data = df_balanced[:train_end]

val_end = int(train_end + len(df_balanced) * val_frac)
val_data = df_balanced[train_end:val_end]

In [47]:
print(len(train_data))
print(len(val_data))

1045
149


In [48]:
test_data = df_balanced[val_end:]

In [49]:
# encoded_text = [tokenizer.encode(text) for text in train_data['Text']]
# max_len = 0
# for text in encoded_text:
#   if len(text) > max_len:
#     max_len = len(text)

# for text in encoded_text:
#   tokens_to_pad = [50257] * (max_len - len(text))
#   text = text + tokens_to_pad

# for text in encoded_text:
#   print(len(text))

In [50]:
# encoded_text = [text + [50257] * (max_len - len(text)) for text in encoded_text]
# print(type(encoded_text))

In [51]:
class SpamDataset(Dataset):
  def __init__(self, data, tokenizer, pad_token_id=50256, max_len=None):
    self.data = data
    self.encoded_text = [tokenizer.encode(text) for text in data['Text']]

    if max_len is None:
      self.max_len = self.cal_max_len()
    else:
      self.max_len = max_len
      self.encoded_text = [text[0:self.max_len] for text in self.encoded_text]

    self.encoded_text = [text + [pad_token_id] * (self.max_len - len(text)) for text in self.encoded_text]

  def __getitem__(self, index):
    encoded = self.encoded_text[index]
    label = self.data.iloc[index]['Label']
    return (
        torch.tensor(encoded, dtype=torch.long),
        torch.tensor(label, dtype=torch.long)
    )

  def __len__(self):
    return len(self.encoded_text)

  def cal_max_len(self):
    max_len = 0
    for text in self.encoded_text:
      if len(text) > max_len:
        max_len = len(text)
    return max_len

In [52]:
train_dataset = SpamDataset(train_data, tokenizer)

In [53]:
val_dataset = SpamDataset(val_data, tokenizer, max_len=train_dataset.max_len)
test_dataset = SpamDataset(val_data, tokenizer, max_len=train_dataset.max_len)

In [54]:
print(len(test_data))

300


In [55]:
train_dataset[0]

(tensor([20459,   588,   326,  7541, 46166,   983,  1312,   892,    13,   921,
          8494, 24367,   287,   257,  1989,  1517, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 5

In [56]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          num_workers = num_workers,
                          shuffle=True,
                          drop_last=True)

val_loader = DataLoader(dataset=val_dataset,
                          batch_size=batch_size,
                          num_workers = num_workers,
                          shuffle=True,
                          drop_last=False)

test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                          num_workers = num_workers,
                          shuffle=True,
                          drop_last=True)

In [57]:
print("Train loader:")
for input_batch, target_batch in train_loader:
    pass

print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions", target_batch.shape)

Train loader:
Input batch dimensions: torch.Size([8, 204])
Label batch dimensions torch.Size([8])


In [58]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
#     f"Dataset length {train_dataset.max_length} exceeds model's context "
#     f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
#     f"`max_length={BASE_CONFIG['context_length']}`"
# )

In [59]:
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), ## Expansion
            GELU(), ## Activation
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), ## Contraction
        )

    def forward(self, x):
        return self.layers(x)

In [60]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [61]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        # 2*4*768
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
        # 2*4*768

In [62]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

torch.manual_seed(123)
model = GPTModel(BASE_CONFIG)
model.eval();  # Disable dropout during inference

In [63]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [64]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in the input_batch.
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen

In [65]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [66]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context

    ###Input batch:
 ###tensor([[6109, 3626, 6100,  345],
        ##[6109, 1110, 6622,  257]])

    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond) ### batch, n_tokens, vocab_size

        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [67]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [68]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [69]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [70]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [71]:
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

from gpt_download3 import download_and_load_gpt2

settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)


checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 155kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 652kiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 194kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [02:54<00:00, 2.86MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 7.00MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:01<00:00, 393kiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:01<00:00, 381kiB/s]


In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);
model.eval();

In [73]:
text_1 = "Every effort moves you"

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer).to(device),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)

print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [74]:
for param in model.parameters():
  param.requires_grad = False

In [76]:
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [77]:
torch.manual_seed(123)

num_classes = 2
model.out_head = nn.Linear(in_features=BASE_CONFIG['emb_dim'], out_features=num_classes)

In [78]:
for param in model.final_norm.parameters():
  param.requires_grad = True

In [79]:
for param in model.trf_blocks[-1].parameters():
  param.requires_grad = True

In [93]:
def cal_accuracy_loader(data_loader, model, device, num_batches=None):
  model.eval()

  correct_preds = 0
  num_examples = 0

  if num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  for i, (input_batch, output_batch) in enumerate(data_loader):
    if i<num_batches:
      input_batch, output_batch = input_batch.to(device), output_batch.to(device)
      with torch.no_grad():
        logits = model(input_batch)[:, -1, :]
      preds = torch.argmax(logits, dim=-1)
      num_examples += input_batch.shape[0]
      correct_preds += (preds == output_batch).sum().item()
    else:
      break

  return correct_preds/num_examples

In [94]:
model.to(device)

train_acc = cal_accuracy_loader(train_loader, model, device, num_batches=10)
val_acc = cal_accuracy_loader(val_loader, model, device, num_batches=10)
test_acc = cal_accuracy_loader(test_loader, model, device, num_batches=10)

In [95]:
print(train_acc)
print(val_acc)
print(test_acc)

0.5625
0.5125
0.5125


In [139]:
def calc_loss_batch_finetuned(input_batch, output_batch, device, model):

  input_batch, output_batch = input_batch.to(device), output_batch.to(device)
  logits = model(input_batch)[:, -1, :]
  loss = torch.nn.functional.cross_entropy(logits, output_batch)

  return loss

In [140]:
def calc_loss_loader_finetuned(data_loader, model, device, num_batches=None):
  total_loss = 0.

  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  for i, (input_batch, output_batch) in enumerate(data_loader):
    if i<num_batches:
      loss = calc_loss_batch_finetuned(input_batch, output_batch, device, model)
      total_loss += loss.item()
    else:
      break

  return total_loss/num_batches

In [141]:
with torch.no_grad():
  train_loss = calc_loss_loader_finetuned(train_loader, model, device, num_batches=5)
  val_loss = calc_loss_loader_finetuned(val_loader, model, device, num_batches=5)
  test_loss = calc_loss_loader_finetuned(test_loader, model, device, num_batches=5)

In [142]:
print(train_loss)
print(val_loss)
print(test_loss)

0.5499268770217896
0.5499030590057373
0.5541114926338195


In [143]:
# Overall the same as `train_model_simple` in chapter 5
def train_classifier_simple_finetuned(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter):
    # Initialize lists to track losses and examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch_finetuned(input_batch, target_batch, device, model)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            examples_seen += input_batch.shape[0] # New: track examples instead of tokens
            global_step += 1

            ## 130 batches: training, eval_Freq = 50 --> after 50 batches are processed in each epoch, we print train loss and val loss

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Calculate accuracy after each epoch
        train_accuracy = cal_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = cal_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [144]:
# Same as chapter 5
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader_finetuned(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader_finetuned(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [171]:
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple_finetuned(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.028, Val loss 0.008
Ep 1 (Step 000050): Train loss 0.010, Val loss 0.008
Ep 1 (Step 000100): Train loss 0.008, Val loss 0.004
Training accuracy: 97.50% | Validation accuracy: 100.00%
Ep 2 (Step 000150): Train loss 0.049, Val loss 0.007
Ep 2 (Step 000200): Train loss 0.093, Val loss 0.032
Ep 2 (Step 000250): Train loss 0.007, Val loss 0.041
Training accuracy: 95.00% | Validation accuracy: 100.00%
Ep 3 (Step 000300): Train loss 0.004, Val loss 0.022
Ep 3 (Step 000350): Train loss 0.010, Val loss 0.056
Training accuracy: 97.50% | Validation accuracy: 100.00%
Ep 4 (Step 000400): Train loss 0.008, Val loss 0.017
Ep 4 (Step 000450): Train loss 0.009, Val loss 0.031
Ep 4 (Step 000500): Train loss 0.003, Val loss 0.021
Training accuracy: 100.00% | Validation accuracy: 100.00%
Ep 5 (Step 000550): Train loss 0.119, Val loss 0.023
Ep 5 (Step 000600): Train loss 0.002, Val loss 0.016
Training accuracy: 100.00% | Validation accuracy: 97.50%
Training completed in 1.5

In [172]:
def classify_reviews(text, model, device, tokenizer, max_length, pad_token_id=50256):
  model.eval()

  encoded_text = tokenizer.encode(text)
  encoded_text = encoded_text[0:max_length]
  encoded_text = encoded_text + [pad_token_id] * (max_length-len(encoded_text))

  input_tensor = torch.tensor(encoded_text, device=device).unsqueeze(0)

  with torch.no_grad():
    logits = model(input_tensor)[:, -1, :]

  print(logits)
  pred_class = torch.argmax(logits, dim=-1).item()
  print(pred_class)
  return "spam" if pred_class == 1 else "not spam"

In [174]:
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(classify_reviews(
    text_1, model, device, tokenizer, max_length=train_dataset.max_len
))

tensor([[-3.1150,  4.9588]], device='cuda:0')
1
spam


In [176]:
text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

print(classify_reviews(
    text_2, model, device, tokenizer, max_length=train_dataset.max_len
))

tensor([[ 2.7164, -1.3788]], device='cuda:0')
0
not spam


In [177]:
text_2 = (
    "DHL Zustellbenachrichtigung – Letzte Erinnerung"

"Wir konnten die Zustellung Ihres Pakets aufgrund unvollständiger oder ungenauer Adressangaben nicht abschließen."
"Geplantes Lieferdatum: 7. Juli 2025"
"Aktueller Status: Das Paket befindet sich zurzeit in unserem lokalen Depot."

"Um zu verhindern, dass Ihr Paket an den Absender zurückgeschickt wird, bitten wir Sie, Ihre Lieferadresse umgehend über den sicheren Link unten zu bestätigen oder zu aktualisieren:"

"[Adresse Bestätigen]"

"https://dhlde.vcvhcr.icu/Pakete"

"Dies ist Ihre letzte Erinnerung. Sollten wir innerhalb der nächsten 12 Stunden keine Bestätigung Ihrer Adresse erhalten, wird das Paket an den Absender zurückgeschickt."

"Um den Link zu öffnen, antworten Sie bitte mit „J“ auf diese Nachricht und öffnen Sie die SMS erneut, oder kopieren Sie die URL und fügen Sie diese in Ihren Browser ein (Google Chrome oder Safari werden empfohlen)."

"Vielen Dank, dass Sie sich für DHL entschieden haben."
)

print(classify_reviews(
    text_2, model, device, tokenizer, max_length=train_dataset.max_len
))

tensor([[-3.6247,  5.0561]], device='cuda:0')
1
spam


In [178]:
torch.save(model.state_dict(), "review_classifier.pth")

In [179]:
model_state_dict = torch.load("review_classifier.pth")
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [181]:
test_data['Text']

Unnamed: 0,Text
2079,85233 FREE>Ringtone!Reply REAL
3230,Ur cash-balance is currently 500 pounds - to m...
3720,"Thanks for your ringtone order, reference numb..."
3610,Joy's father is John. Then John is the ____ of...
3126,1st wk FREE! Gr8 tones str8 2 u each wk. Txt N...
...,...
2109,FREE2DAY sexy St George's Day pic of Jordan!Tx...
2730,Urgent! Please call 09066612661 from your land...
4410,For your chance to WIN a FREE Bluetooth Headse...
4903,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...


In [182]:
test_accuracy = cal_accuracy_loader(test_loader, model, device, num_batches=10)

In [183]:
test_accuracy

0.9875