In [1]:
import os 
from pathlib import Path

# Resolve the repository root only on the first run of this cell.
# os.chdir() below changes what Path.cwd() returns, so recomputing the root
# on a re-run would climb one directory higher every time the cell executes.
if "project_root" not in globals():
    project_root = Path.cwd().parents[0]
os.chdir(project_root)
print("Set project_root:", project_root)

Set project_root: /home/sromo/Repos/lm-workbench


**Downloading the dataset**

In [2]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Source archive and the on-disk locations used for the raw dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
extracted_path = project_root / "data" / "raw" / "sms_spam_collection"
# The zip is stored next to the extraction directory.
zip_path = extracted_path.parent / "sms_spam_collection.zip"
data_file_path = extracted_path / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it and rename the data file to ``.tsv``.

    Args:
        url: Location of the zip archive to download.
        zip_path: Where the downloaded archive is stored temporarily.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final path of the data file. If it already exists,
            nothing is downloaded.

    Returns:
        None. Progress and errors are reported with ``print``; download and
        extraction errors are caught rather than raised.
    """
    # Accept both str and Path arguments.
    zip_path = Path(zip_path)
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download "
              "and extraction."
              )
        return
    try:
        # The target directory may not exist yet (e.g. on a fresh checkout).
        zip_path.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(url) as response: # Download
            with open(zip_path, "wb") as out_file:
                out_file.write(response.read())
        with zipfile.ZipFile(zip_path, "r") as zip_ref: # Unzip
            zip_ref.extractall(extracted_path)
        # The archive ships the file without an extension; add ".tsv".
        original_file_path = extracted_path / "SMSSpamCollection"
        os.rename(original_file_path, data_file_path)
        print(f"File downloaded and saved as {data_file_path}")
    except Exception as e:
        print("Error downloading and extracting data:", e)
    finally:
        # The archive may never have been written (failed download), so a
        # missing file must not raise here and hide the reported error.
        zip_path.unlink(missing_ok=True)

# Usage: fetch and extract the dataset; the call returns early if data_file_path already exists.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

/home/sromo/Repos/lm-workbench/data/raw/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
import pandas as pd
# The file is tab-separated with no header row, so column names are supplied here.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
display(df.head())

# Class distribution of the raw data.
print(df.Label.value_counts())

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label
ham     4825
spam     747
Name: count, dtype: int64


In [4]:
def create_balanced_dataset(df:pd.DataFrame, random_state:int=123):
    """Undersample "ham" rows so both classes have the same number of rows.

    Args:
        df: Frame with a "Label" column containing "ham" and "spam".
        random_state: Seed for sampling the "ham" rows. The default keeps
            the previously hard-coded seed.

    Returns:
        A new frame with the sampled "ham" rows followed by all "spam" rows.
        The original index values are kept.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when there are fewer
            "ham" rows than "spam" rows.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_subset = df[df["Label"] == "ham"].sample(
        len(spam_rows), random_state=random_state
    )
    return pd.concat([ham_subset, spam_rows])

balanced_df = create_balanced_dataset(df)
# Show the class counts while the labels are still strings.
print(balanced_df["Label"].value_counts())

# Map Labels to Integers:
balanced_df = balanced_df.assign(
    Label=balanced_df["Label"].map({"ham":0, "spam":1})
)

Label
ham     747
spam    747
Name: count, dtype: int64


In [5]:
def random_split(df:pd.DataFrame, train_frac:float, validation_frac:float, random_state:int=123):
    """Shuffle ``df`` and split it into train, validation and test frames.

    Args:
        df: Frame to split.
        train_frac: Fraction of rows for the training split.
        validation_frac: Fraction of rows for the validation split.
        random_state: Seed for the shuffle. The default keeps the previously
            hard-coded seed.

    Returns:
        ``(train_df, validation_df, test_df)``. The test split receives all
        rows left over after the other two.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1. Such inputs previously gave truncated or empty
            splits without any error.
    """
    # Small tolerance so float sums such as 0.55 + 0.45 are not rejected.
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and validation_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, validation_frac={validation_frac}"
        )

    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True) # Shuffle the entire df
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Positional slices over the shuffled frame.
    train_df = df.iloc[:train_end]
    validation_df = df.iloc[train_end:validation_end]
    test_df = df.iloc[validation_end:]

    return train_df, validation_df, test_df

# 70% train, 10% validation; the remaining rows form the test split.
train_df, validation_df, test_df = random_split(
    balanced_df, train_frac=0.7, validation_frac=0.1
)

In [6]:
# Directory for the processed splits; created if it does not exist.
out_path = project_root / "data" / "processed" / "sms_spam_collection"
out_path.mkdir(parents=True, exist_ok=True)

# `index` is documented as a bool; index=False omits the row index from the
# CSV files, exactly as the previous falsy `None` did.
train_df.to_csv(out_path / "train.csv", index=False)
validation_df.to_csv(out_path / "validation.csv", index=False)
test_df.to_csv(out_path / "test.csv", index=False)

**Creating dataloaders**

In [7]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
# Print the token id(s) the encoding assigns to the end-of-text marker.
eot_token_ids = tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
print(eot_token_ids)

# We will pad short messages with the <|endoftext|> token to match the length of the longest message

[50256]


In [8]:
%%script echo SKIPPED
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Dataset of tokenized messages padded to a common length.

    The CSV file is read with pandas and must provide a "Text" column
    (encoded with ``tokenizer``) and a "Label" column. Every encoded message
    is right-padded with ``pad_token_id`` up to ``self.max_length``. When
    ``max_length`` is given, longer messages are truncated to it first;
    otherwise the longest encoded message defines the length.
    """

    def __init__(self, csv_file:Path, tokenizer:tiktoken.Encoding, max_length:int=None, pad_token_id:int=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is not None:
            # Caller fixed the length: cut longer sequences down to it.
            self.max_length = max_length
            self.encoded_texts = [token_ids[:self.max_length] for token_ids in self.encoded_texts]
        else:
            self.max_length = self._longest_encoded_length()

        # Right-pad every sequence to max_length.
        padded_texts = []
        for token_ids in self.encoded_texts:
            n_missing = self.max_length - len(token_ids)
            padded_texts.append(token_ids + [pad_token_id] * n_missing)
        self.encoded_texts = padded_texts

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as long tensors."""
        token_tensor = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        label_tensor = torch.tensor(self.data.iloc[index]["Label"], dtype=torch.long)
        return (token_tensor, label_tensor)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

SKIPPED


In [9]:
from src.dataloaders.Classification import SpamDataset

# Training split: no max_length is imposed here.
# NOTE(review): presumably the imported SpamDataset then uses its longest
# message, like the skipped class above — confirm in src.dataloaders.Classification.
train_dataset = SpamDataset(
    csv_file=out_path / "train.csv",
    tokenizer=tokenizer,
    max_length=None,
)

# Validation and test splits reuse the training split's max_length.
val_dataset = SpamDataset(
    csv_file=out_path / "validation.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)

test_dataset = SpamDataset(
    csv_file=out_path / "test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)

print(train_dataset.max_length)

120


In [10]:
import torch
from torch.utils.data import DataLoader

# Loader settings shared by all three splits.
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Only the training loader shuffles and drops the trailing partial batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

# Check dimensions: run through the whole training loader so that
# input_batch / target_batch hold its last batch afterwards.
for input_batch, target_batch in train_loader:
    pass
print("input_batch dim:\n", input_batch.shape)
print("\ntarget_batch dim:\n", target_batch.shape)

print("\nNumber of batches:")
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

input_batch dim:
 torch.Size([8, 120])

target_batch dim:
 torch.Size([8])

Number of batches:
130 training batches
19 validation batches
38 test batches


**Initializing a model with pretrained weights**

In [11]:
from src.configs.GPT2 import GPT_CONFIG_124M
import json

CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Size-specific hyperparameters, keyed by model name.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
# Settings applied on top of the base config regardless of model size.
additional_configs = {
    "drop_rate_attn": 0.0,
    "drop_rate_shortcut": 0.0,
    "drop_rate_emb": 0.0,
    "qkv_bias": True,
}

# Later updates override earlier values: base config, then size, then extras.
BASE_CONFIG = dict(GPT_CONFIG_124M)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
BASE_CONFIG.update(additional_configs)

print(json.dumps(BASE_CONFIG, indent=2))

{
  "vocab_size": 50257,
  "context_length": 1024,
  "emb_dim": 768,
  "n_heads": 12,
  "n_layers": 12,
  "drop_rate_attn": 0.0,
  "drop_rate_shortcut": 0.0,
  "drop_rate_emb": 0.0,
  "qkv_bias": true
}


In [12]:
from src.parameter_loading.GPT2 import load_weights
from src.architectures.GPT2 import GPTModel
from scripts.download_parameters.gpt_download import download_and_load_gpt2


# "gpt2-small (124M)" -> "124M"
model_size = CHOOSE_MODEL.rsplit(" ", 1)[-1].lstrip("(").rstrip(")")
# "gpt2-small (124M)" -> <project_root>/data/model_parameters/openai_GPT2
models_dir = project_root / "data/model_parameters" / ("openai_" + CHOOSE_MODEL.upper()[:4])

settings, params = download_and_load_gpt2(model_size=model_size, models_dir=models_dir)

# Build the model from BASE_CONFIG, load the downloaded parameters into it
# and switch it to evaluation mode.
model = GPTModel(BASE_CONFIG)
load_weights(model, params)
model.eval()

2026-02-19 09:18:13.489261: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/checkpoint
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/encoder.json
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/hparams.json
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/model.ckpt.index
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/model.ckpt.meta
File already exists and is up-to-date: /home/sromo/Repos/lm-workbench/data/model_parameters/openai_GPT2/124M/vocab.bpe


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7