In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["label", "message"])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df["label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
def balance_dataframe(df):
    # Count the number of spam 
    num_spam = df[df["label"] == "spam"].shape[0]

    # Random sample "ham" instance to match with the number of spam 
    ham_df = df[df["label"] == "ham"].sample(num_spam, random_state=42)

    # Concatenate the spam and sampled ham dataframes
    balanced_df = pd.concat([ham_df, df[df["label"] == "spam"]])

    return balanced_df

In [6]:
balanced_df = balance_dataframe(df)
balanced_df["label"].value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [8]:
balanced_df["label"] = balanced_df["label"].map({"ham": 0, "spam": 1})

In [9]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

In [10]:
print(len(train_df))
print(len(validation_df))
print(len(test_df))

1045
149
300


In [11]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

Creating the dataLoaders


In [22]:
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["message"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length

In [23]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
# Load the training dataset in the SpamDataset class
train_dataset = SpamDataset(
    csv_file= "train.csv",
    tokenizer=tokenizer,
    max_length=None,
)

In [25]:
print(train_dataset.max_length)

120


In [27]:
# Load the validation dataset in the SpamDataset class
validation_dataset = SpamDataset(
    csv_file= "validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer,
)

In [28]:
# Load the testing dataset in the SpamDataset class
testing_dataset = SpamDataset(
    csv_file= "test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)

In [29]:
from torch.utils.data import DataLoader

num_workers = 4
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)
val_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True
)
test_loader = DataLoader(
    dataset=testing_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)

In [30]:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
18 validation batches
37 test batches
