In [1]:
import os 
from pathlib import Path

# Resolve the project root as the parent of the current working directory and
# make it the working directory, so later relative paths resolve against it.
# The guard keeps this cell idempotent: without it, every re-run would evaluate
# Path.cwd().parents[0] from the already-changed directory and climb one level
# further up the tree.
if "project_root" not in globals():
    project_root = Path.cwd().parents[0]
    os.chdir(project_root)
print("Set project_root:", project_root)

Set project_root: /home/sromo/Repos/lm-workbench


**Downloading the dataset**

In [2]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Remote archive plus the local locations used while fetching and unpacking it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
raw_dir = project_root / "data" / "raw"
zip_path = raw_dir / "sms_spam_collection.zip"
extracted_path = raw_dir / "sms_spam_collection"
# The archive member is renamed to carry a .tsv extension after extraction.
data_file_path = extracted_path / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the dataset file.

    The call is a no-op when ``data_file_path`` already exists. Failures during
    download/extraction are reported with ``print`` rather than raised
    (best-effort behaviour), and the temporary zip file is removed afterwards
    if it was created.

    Args:
        url: Location of the zip archive (anything ``urllib.request.urlopen``
            accepts).
        zip_path: Where the downloaded archive is stored temporarily
            (``str`` or ``Path``). Its parent directory is created if missing.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final path of the dataset file; the extracted
            ``SMSSpamCollection`` member is renamed to this path.

    Returns:
        None.
    """
    # Normalise so plain strings work as well as Path objects.
    zip_path = Path(zip_path)
    extracted_path = Path(extracted_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download "
              "and extraction."
              )
        return
    try:
        # On a fresh checkout the raw-data directory may not exist yet.
        zip_path.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(url) as response: # Download
            with open(zip_path, "wb") as out_file:
                out_file.write(response.read())
        with zipfile.ZipFile(zip_path, "r") as zip_ref: # Unzip
            zip_ref.extractall(extracted_path)
        original_file_path = extracted_path / "SMSSpamCollection"
        os.rename(original_file_path, data_file_path)
        print(f"File downloaded and saved as {data_file_path}")
    except Exception as e:
        print("Error downloading and extracting data:", e)
    finally:
        # Only remove the archive if it was actually written; an unconditional
        # os.remove would raise FileNotFoundError after a failed download and
        # hide the error reported above.
        if zip_path.exists():
            os.remove(zip_path)

# Usage
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

/home/sromo/Repos/lm-workbench/data/raw/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
import pandas as pd
# The raw file is tab-separated and has no header row, so column names are
# supplied here.
column_names = ["Label", "Text"]
df = pd.read_csv(data_file_path, sep="\t", names=column_names, header=None)
display(df.head(5))

# Class distribution of the raw data.
print(df["Label"].value_counts())

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label
ham     4825
spam     747
Name: count, dtype: int64


In [4]:
def create_balanced_dataset(df:pd.DataFrame, random_state:int=123):
    """Return a class-balanced copy of ``df`` by downsampling the larger class.

    Rows are selected by the ``Label`` column, which is expected to hold the
    strings ``"ham"`` and ``"spam"``. The input frame is not modified.

    Args:
        df: Frame with a ``Label`` column.
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible. Defaults to 123.

    Returns:
        A new DataFrame with the ham rows first, followed by the spam rows,
        both classes having the same number of rows. Original index values
        are kept.
    """
    ham_df = df[df["Label"] == "ham"]
    spam_df = df[df["Label"] == "spam"]
    n_per_class = min(len(ham_df), len(spam_df))

    # Ham is always drawn through sample() so the result matches the usual
    # "ham is the majority" case row for row.
    ham_subset = ham_df.sample(n_per_class, random_state=random_state)
    # Spam is only downsampled when it is the larger class; otherwise it is
    # kept whole and in its original order.
    if len(spam_df) > n_per_class:
        spam_df = spam_df.sample(n_per_class, random_state=random_state)

    balanced_df = pd.concat([ham_subset, spam_df])
    return balanced_df

# Downsample to equal class sizes and confirm the resulting distribution.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

# Map Labels to Integers:
label_to_id = {"ham":0, "spam":1}
balanced_df["Label"] = balanced_df["Label"].map(label_to_id)

Label
ham     747
spam    747
Name: count, dtype: int64


In [5]:
def random_split(df:pd.DataFrame, train_frac:float, validation_frac:float, random_state:int=123):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Args:
        df: Frame to split; it is not modified.
        train_frac: Fraction of rows for the training partition.
        validation_frac: Fraction of rows for the validation partition.
            Whatever remains after train and validation becomes the test
            partition.
        random_state: Seed for the shuffle. Defaults to 123.

    Returns:
        Tuple ``(train_df, validation_df, test_df)`` of consecutive slices of
        the shuffled frame, which has a fresh 0..n-1 index.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.9 + 0.1 are never rejected.
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and validation_frac must be non-negative and sum to at most 1, "
            f"got {train_frac} and {validation_frac}"
        )

    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True) # Shuffle the entire df
    # int() truncates, so any rounding remainder ends up in the test partition.
    train_end = int(len(shuffled) * train_frac)
    validation_end = train_end + int(len(shuffled) * validation_frac)

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [6]:
# Persist the three splits as CSV files in the processed-data directory.
out_path = project_root / "data" / "processed" / "sms_spam_collection"
out_path.mkdir(parents=True, exist_ok=True)

# index=False is the documented way to omit the row index from the output;
# the splits' indices carry no information worth saving.
for file_name, split_df in (
    ("train.csv", train_df),
    ("validation.csv", validation_df),
    ("test.csv", test_df),
):
    split_df.to_csv(out_path / file_name, index=False)

**Creating dataloaders**

In [7]:
import tiktoken

# Load the "gpt2" encoding and print the token id(s) produced for the
# end-of-text special token.
eot_marker = "<|endoftext|>"
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode(eot_marker, allowed_special={eot_marker}))

# We will pad short messages with the <|endoftext|> token to match the length of the longest message

[50256]


In [8]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Dataset of tokenized, fixed-length texts with integer labels.

    Reads a CSV file with ``Text`` and ``Label`` columns, encodes every text
    with ``tokenizer.encode`` and brings all sequences to ``max_length`` by
    truncating longer ones and right-padding shorter ones with
    ``pad_token_id``.

    Attributes set in ``__init__``:
        data: The DataFrame read from ``csv_file``.
        encoded_texts: List of token-id lists, each of length ``max_length``.
        max_length: Common sequence length of all items.
    """

    # The tokenizer annotation is quoted so defining the class does not
    # evaluate ``tiktoken`` at definition time.
    def __init__(self, csv_file:Path, tokenizer:"tiktoken.Encoding", max_length:int=None, pad_token_id:int=50256):
        """Load, encode, truncate and pad the texts.

        Args:
            csv_file: Path of the CSV file to read.
            tokenizer: Object whose ``encode(text)`` returns a list of ids.
            max_length: Target sequence length. ``None`` means "use the
                longest encoded text in this file"; otherwise longer
                sequences are truncated to this length.
            pad_token_id: Id appended to sequences shorter than
                ``max_length``. Defaults to 50256.
        """
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate first so the padding count below is never negative.
            self.encoded_texts = [token_ids[:self.max_length] for token_ids in self.encoded_texts]

        self.encoded_texts = [
            token_ids + [pad_token_id] * (self.max_length - len(token_ids))
            for token_ids in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
        encoded = self.encoded_texts[index]
        # Select the column first: this reads a single value instead of
        # building a whole row Series for every lookup.
        label = self.data["Label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text, or 0 when there are none."""
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

In [9]:
# Build the training set first: with max_length=None the dataset takes its
# sequence length from the longest encoded message in the training file.
csv_path = out_path / "train.csv"

train_dataset = SpamDataset(
    csv_file=csv_path,
    tokenizer=tokenizer,
    max_length=None,
)

print(train_dataset.max_length)

120


In [None]:
# Validation and test reuse the training set's sequence length, so every split
# yields tensors of the same size; longer messages in these splits are
# truncated to that length.
shared_max_length = train_dataset.max_length

val_dataset = SpamDataset(
    csv_file=out_path / "validation.csv",
    tokenizer=tokenizer,
    max_length=shared_max_length,
)

test_dataset = SpamDataset(
    csv_file=out_path / "test.csv",
    tokenizer=tokenizer,
    max_length=shared_max_length,
)