In [2]:
!pip install torchtext==0.6

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Collecting sentencepiece (from torchtext==0.6)
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 992.0/992.0 kB 7.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece, torchtext
Successfully installed sentencepiece-0.2.0 torchtext-0.6.0


In [76]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torchtext
# from google.colab import drive, files
# The commented-out Colab lines here are only needed on Google Colab (mounting Drive / downloading files).

# drive.mount('/content/gdrive/')

# Location of the training CSV; change this to point at your copy of the dataset.
DATA_PATH = "/content/gdrive/MyDrive/Colab Notebooks/Spam Ham/spam_ham_datasets.csv"

# Text column: the tokenizer is the identity function, so the raw string is
# handed on unchanged (presumably giving character-level tokens -- confirm
# against torchtext's Field preprocessing). include_lengths=True makes each
# batch's `.text` a pair, which is why downstream code indexes `batch.text[0]`.
text = torchtext.data.Field(
    sequential=True,
    tokenize=lambda x: x,
    include_lengths=True,
    batch_first=True,
    use_vocab=True,
)

# Label column: already numeric, so no vocabulary is built; stored as float.
label = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=True,
    dtype=torch.float,
)

fields = [("text", text), ("label", label)]
dataset = torchtext.data.TabularDataset(DATA_PATH, "csv", fields, skip_header=True)

# 60% / 20% / 20% split into training, validation and test sets.
train, validate, test = dataset.split(split_ratio=[0.6, 0.2, 0.2])

# The vocabulary is built from the training split only.
text.build_vocab(train)

# Both iterators share the same batching configuration.
_bucket_kwargs = dict(
    batch_size=32,
    sort_key=lambda example: len(example.text),  # group similar lengths to minimize padding
    sort_within_batch=True,                      # sort within each batch
    repeat=False,                                # do not repeat the iterator across epochs
)
train_iter = torchtext.data.BucketIterator(train, **_bucket_kwargs)
valid_iter = torchtext.data.BucketIterator(validate, **_bucket_kwargs)

class RNN(nn.Module):
    """Sequence classifier: embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: Number of entries in the embedding table.
        hidden_units: Size of both the token embeddings and the GRU hidden state.
        num_classes: Number of values the linear head produces per sequence.
    """

    def __init__(self, vocab_size, hidden_units, num_classes):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_units)
        self.rnn = nn.GRU(hidden_units, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x, lengths=None):
        """Return one row of ``num_classes`` raw scores per input sequence.

        Args:
            x: Integer token indices, batch dimension first (the GRU is built
                with ``batch_first=True``).
            lengths: Optional per-sequence count of real (non-padding) tokens,
                as a tensor or sequence of ints. When given, each sequence is
                summarised by the GRU output at its last real token rather
                than at the last padded position. When omitted, the output at
                the final time step is used for every sequence.

        Returns:
            Tensor with one row per sequence and ``num_classes`` columns.
        """
        embedded = self.embedding(x)
        out, _ = self.rnn(embedded)
        if lengths is None:
            out = out[:, -1, :]
        else:
            # The GRU is unidirectional, so the output at step t depends only on
            # tokens 0..t; picking step (length - 1) therefore ignores padding.
            last_step = torch.as_tensor(lengths, dtype=torch.long, device=out.device) - 1
            last_step = last_step.clamp(min=0, max=out.size(1) - 1)
            rows = torch.arange(out.size(0), device=out.device)
            out = out[rows, last_step]
        out = self.fc(out)
        return out
def get_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

    The model's outputs are treated as logits (this file trains with
    ``nn.BCEWithLogitsLoss``), so they are passed through a sigmoid before
    being thresholded at probability 0.5.

    Args:
        model: Module called as ``model(batch.text[0])``.
        dataloader: Iterable of batches exposing ``.text`` (whose element 0 is
            the model input) and ``.label``.

    Returns:
        ``correct / total`` as a float, or ``nan`` if the dataloader yields no
        examples.
    """
    was_training = getattr(model, "training", False)
    model.eval()
    correct = 0
    total = 0
    try:
        # Evaluation needs no gradients; skipping the autograd graph saves memory.
        with torch.no_grad():
            for batch in dataloader:
                text_data = batch.text[0]
                labels = batch.label.unsqueeze(1).float()  # one label per row, as a column
                outputs = model(text_data)
                # sigmoid(logit) > 0.5 is the decision boundary for a logit output.
                predicted = (torch.sigmoid(outputs) > 0.5).float()
                total += labels.shape[0]
                correct += (predicted == labels).sum().item()
    finally:
        # Leave the model in the mode the caller had it in.
        if was_training:
            model.train()
    return correct / total if total else float("nan")

def get_loss(model, dataloader, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    Args:
        model: Callable invoked as ``model(batch.text[0])``.
        dataloader: Iterable of batches exposing ``.text`` (whose element 0 is
            the model input) and ``.label``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.

    Returns:
        The average of the per-batch loss values as a float, or ``nan`` if the
        dataloader yields no batches.
    """
    # Evaluate in eval mode, then put the model back the way the caller had it.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    count = 0
    total_loss = 0
    try:
        # No gradients are needed here; skipping the autograd graph saves memory.
        with torch.no_grad():
            for batch in dataloader:
                text_data = batch.text[0]
                labels = batch.label.unsqueeze(1).float()  # one label per row, as a column
                outputs = model(text_data)
                loss = loss_fn(outputs, labels)
                total_loss += loss.item()
                count += 1
    finally:
        if was_training:
            model.train()
    return total_loss / count if count else float("nan")

def train(model, train_iter, valid_iter, num_epochs, learning_rate, loss_fn):
    """Train ``model`` with Adam and report metrics after every epoch.

    Args:
        model: Module to optimise; called as ``model(batch.text[0])``.
        train_iter: Iterable of training batches exposing ``.text`` and ``.label``.
        valid_iter: Iterable of validation batches with the same layout.
        num_epochs: Number of passes over ``train_iter``.
        learning_rate: Learning rate passed to ``torch.optim.Adam``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.

    Returns:
        A dict with the per-epoch lists ``"epochs"``, ``"train_acc"``,
        ``"train_loss"``, ``"val_acc"`` and ``"val_loss"``.
    """
    torch.manual_seed(26)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    epochs = []
    train_acc = []
    train_loss = []
    val_acc = []
    val_loss = []

    print("Training started...")
    for epoch in range(num_epochs):
        epochs.append(epoch + 1)
        # get_accuracy() switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        total_loss_t = 0
        count_t = 0
        try:
            for batch in train_iter:
                text_data = batch.text[0]
                labels = batch.label.unsqueeze(1).float()  # one label per row, as a column
                outputs = model(text_data)
                loss_train = loss_fn(outputs, labels)
                optimizer.zero_grad()
                loss_train.backward()
                optimizer.step()

                total_loss_t += loss_train.item()
                count_t += 1
        except ValueError as e:
            # Best effort: a ValueError ends the current epoch early but training goes on.
            print(f"Error processing batch: {e}")

        train_acc.append(get_accuracy(model, train_iter))
        # If the very first batch failed there is nothing to average.
        train_loss.append(total_loss_t / count_t if count_t else float("nan"))
        val_acc.append(get_accuracy(model, valid_iter))
        val_loss.append(get_loss(model, valid_iter, loss_fn))

        print("epoch: ", epochs[-1], ", train loss: ", train_loss[-1], ", train acc: ", train_acc[-1], ", val loss: ", val_loss[-1], ", val acc: ", val_acc[-1])

        # Optional per-epoch checkpointing (the download step is only needed on Google Colab):
        # torch.save(model.state_dict(), 'checkpoints/checkpoint.pth')
        # files.download('checkpoint.pth')

    return {
        "epochs": epochs,
        "train_acc": train_acc,
        "train_loss": train_loss,
        "val_acc": val_acc,
        "val_loss": val_loss,
    }

# Hyperparameters
num_epochs = 3
learning_rate = 1e-3
hidden_units = 128
num_classes = 1  # a single logit per example, matching BCEWithLogitsLoss
loss_fn = nn.BCEWithLogitsLoss()

# Seed before building the model so the initial weights are reproducible;
# train() only seeds after the model already exists. Same value as in train().
torch.manual_seed(26)

# Instantiate the model
model = RNN(len(text.vocab.itos), hidden_units, num_classes)
train(model, train_iter, valid_iter, num_epochs=num_epochs, learning_rate=learning_rate, loss_fn=loss_fn)

# torch.save(model.state_dict(), 'model_weights.pth')


Training started...


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 16384 bytes.