In [31]:
import pandas as pd

df = pd.read_csv('../data/interim/training_data.csv')
df.head()

Unnamed: 0,reference,translation
0,shite i dont think theyre very happy,i dont think theyre very happy
1,if you want to understand animals i mean i mea...,if you want to understand game i mean really u...
2,sonia that smells awful,sonio that is a terrible thing
3,sam dolans father is out of his mind,sam dolans father checks out
4,theres a guy out there whos fucking serious,this is one very serious boy we have out there


In [32]:
toxic = df['reference'].tolist()
nontoxic = df['translation'].tolist()
# df['translation'].
df.isna().sum()

reference      0
translation    0
dtype: int64

In [33]:
from collections import Counter

# Let's assume `toxic` and `nontoxic` are lists of sentences (strings).
all_texts = toxic + nontoxic  # Combine both lists for creating the vocab.
# print(all_texts[0].split())
# # Tokenize the text
# tokenized_texts = []
# for i, sentence in enumerate(nontoxic):
#     try:
#         tokenized_texts.append(sentence.split())
#     except:
#         print(i, sentence)
tokenized_texts = [sentence.split() for sentence in all_texts]

# Flatten the list of token lists into a single list of tokens
all_tokens = [token for sublist in tokenized_texts for token in sublist]

# Count the frequency of tokens in the corpus
token_freqs = Counter(all_tokens)

# Create the vocabulary
vocab = {
    '<pad>': 0,
    '<sos>': 1,
    '<eos>': 2,
    '<unk>': 3,  # Tokens not found in the vocab will be replaced with <unk>
}

# Start the index count from 4, as 0-3 are reserved for special tokens
for index, token in enumerate(token_freqs, start=4):
    vocab[token] = index

# Now, vocab is a dictionary mapping each token to a unique index.

In [53]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


# Example of a custom dataset class
class TextDataset(Dataset):
    def __init__(self, toxic_texts, nontoxic_texts, vocab):
        self.toxic_texts = toxic_texts
        self.nontoxic_texts = nontoxic_texts
        self.vocab = vocab

    def __len__(self):
        return len(self.toxic_texts)

    def vectorize(self, text):
        return [self.vocab[token] for token in text.split()]

    def __getitem__(self, idx):
        toxic_vectorized = self.vectorize(self.toxic_texts[idx])
        nontoxic_vectorized = self.vectorize(self.nontoxic_texts[idx])
        return torch.tensor(toxic_vectorized), torch.tensor(nontoxic_vectorized)


# Assume 'vocab' is a dictionary mapping tokens to indices, and 'toxic' and 'nontoxic' are lists of sentences.
dataset = TextDataset(toxic, nontoxic, vocab)


def collate_batch(batch):
    toxic_list, nontoxic_list = [], []
    for toxic, nontoxic in batch:
        toxic_list.append(torch.tensor(toxic, requires_grad=True))
        nontoxic_list.append(torch.tensor(nontoxic, requires_grad=True))
    return pad_sequence(toxic_list, padding_value=vocab["<pad>"]), pad_sequence(
        nontoxic_list, padding_value=vocab["<pad>"]
    )


loader = DataLoader(dataset, batch_size=32, collate_fn=collate_batch)

In [55]:
import torch.nn as nn
from numpy import random

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers)
    
    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.output_dim = output_dim
    
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        # Last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        # First input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1
        
        return outputs


In [56]:
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# device = torch.device('mps' if torch.cuda.is_available() else 'cpu')
device = "mps"

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS)

model = Seq2Seq(enc, dec, device).to(device)


In [57]:
N_EPOCHS = 10

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

model.train()

for epoch in range(N_EPOCHS):
    for src, trg in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        print(output.shape)
        print(trg.shape)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        print(epoch, loss.value())
    print(epoch, loss.value())


  toxic_list.append(torch.tensor(toxic, requires_grad=True))


RuntimeError: Only Tensors of floating point and complex dtype can require gradients