In [27]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Dataset Class
class TextDataset(Dataset):
    def __init__(self, data, vocab=None, max_length=None):
        self.data = data
        self.vocab = vocab or self.build_vocab()
        self.max_length = max_length or self.get_max_length()
        self.data['reference_int'] = self.data['reference'].apply(self.text_to_ints)
        self.data['translation_int'] = self.data['translation'].apply(self.text_to_ints)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        reference_int = self.pad_features(self.data.iloc[idx]['reference_int'])
        translation_int = self.pad_features(self.data.iloc[idx]['translation_int'])
        return (
            torch.tensor(reference_int, dtype=torch.long),
            torch.tensor(translation_int, dtype=torch.long)
        )

    def build_vocab(self):
        word_counts = Counter()
        for _, row in self.data.iterrows():
            word_counts.update(row['reference'].split())
            word_counts.update(row['translation'].split())
        return {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}

    def get_max_length(self):
        return max(
            self.data['reference'].apply(lambda x: len(x.split())).max(),
            self.data['translation'].apply(lambda x: len(x.split())).max()
        )

    def text_to_ints(self, text):
        return [self.vocab.get(word, 0) for word in text.split()]

    def pad_features(self, text_ints):
        return text_ints + [0] * (self.max_length - len(text_ints))

In [4]:

# LSTM Model Class
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=0.5, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.linear(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.lstm.num_layers, sequence_length, self.lstm.hidden_size),
                torch.zeros(self.lstm.num_layers, sequence_length, self.lstm.hidden_size))


In [22]:
def train(model, data_loader, criterion, optimizer, num_epochs, vocab_size):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(total=len(data_loader), desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

        # Initial hidden state
        state_h, state_c = model.init_state(data_loader.batch_size)

        for batch, (x, y) in enumerate(data_loader):
            optimizer.zero_grad()

            # If the last batch is smaller, reinitialize the state with the correct batch size
            if x.size(0) != data_loader.batch_size:
                state_h, state_c = model.init_state(x.size(0))

            # Detach the states from the history of the last batch
            state_h = state_h.detach()
            state_c = state_c.detach()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))

            loss = criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.update()
            progress_bar.set_postfix(loss=loss.item())

        progress_bar.close()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader):.4f}')

    return model


In [17]:
def calculate_bleu(data_loader, model, vocab):
    bleu_scores = []
    model.eval()
    with torch.no_grad():
        for x, y in data_loader:
            predicted, _ = model(x, model.init_state(1))
            predicted = torch.argmax(predicted, dim=2)
            pred_list = predicted.numpy().tolist()
            real_list = y.numpy().tolist()

            # Convert integer sequences to words
            pred_words = [[vocab[i] for i in seq if i != 0] for seq in pred_list]
            real_words = [[[vocab[i] for i in seq if i != 0]] for seq in real_list]

            # Calculate BLEU score
            bleu_scores.append(corpus_bleu(real_words, pred_words, smoothing_function=SmoothingFunction().method1))

    return np.mean(bleu_scores)

In [31]:
def collate_fn(batch):
    x, y = zip(*batch)
    x_padded = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in x],
                            batch_first=True, padding_value=0)
    y_padded = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in y],
                            batch_first=True, padding_value=0)
    return x_padded, y_padded

In [None]:
# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_data = train_data.head(1000)
test_data = test_data.head(1000)

# Create dataset
train_dataset = TextDataset(train_data)
test_dataset = TextDataset(test_data, vocab=train_dataset.vocab, max_length=train_dataset.max_length)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)



In [24]:
# Initialize model
model = LSTMModel(
    vocab_size=len(train_dataset.vocab) + 1,
    embedding_dim=256,
    hidden_dim=512,
    n_layers=2
)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
# Train the model
model = train(model, train_loader, criterion, optimizer, num_epochs=10, vocab_size=len(train_dataset.vocab) + 1)




Epoch 1/10, Loss: 2.5227




Epoch 2/10, Loss: 1.5274




Epoch 3/10, Loss: 1.4512




Epoch 4/10, Loss: 1.4361




Epoch 5/10, Loss: 1.4008




Epoch 6/10, Loss: 1.4008




Epoch 7/10, Loss: 1.3725




Epoch 8/10, Loss: 1.3296




Epoch 9/10, Loss: 1.3017


                                                                       

Epoch 10/10, Loss: 1.2838




In [32]:
# Function to convert integers to text
def ints_to_text(ints, vocab):
    return ' '.join([vocab[i] for i in ints if i > 0])  # assuming 0 is the padding value

# Prediction function
def predict(model, sentence, vocab, max_length):
    model.eval()
    with torch.no_grad():
        # Convert sentence to integers
        text_ints = [vocab.get(word, 0) for word in sentence.split()]

        # Pad sequence
        text_ints_padded = text_ints + [0] * (max_length - len(text_ints))
        text_tensor = torch.tensor(text_ints_padded, dtype=torch.long).unsqueeze(0)

        # Predict
        logits, _ = model(text_tensor, model.init_state(1))
        prediction_ints = torch.argmax(logits, dim=2).squeeze(0).tolist()

        # Convert integers back to text
        predicted_sentence = ints_to_text(prediction_ints, {i: word for word, i in vocab.items()})

        return predicted_sentence


In [35]:
# Example text to de-toxify
example_text = "I like that shit."

# Predict a less toxic version
less_toxic_version = predict(model, example_text, train_dataset.vocab, train_dataset.max_length)
print(f"Original: {example_text}")
print(f"De-toxified: {less_toxic_version}")

Original: I like that shit.
De-toxified: I you you .


In [37]:
# Example text to de-toxify
example_text = "It told you this was a waste of my fucking time."

# Predict a less toxic version
less_toxic_version = predict(model, example_text, train_dataset.vocab, train_dataset.max_length)
print(f"Original: {example_text}")
print(f"De-toxified: {less_toxic_version}")

Original: It told you this was a waste of my fucking time.
De-toxified: you you to a . ,


In [38]:
# Example text to de-toxify
example_text = "Funny how Nazis are always the bad guys."

# Predict a less toxic version
less_toxic_version = predict(model, example_text, train_dataset.vocab, train_dataset.max_length)
print(f"Original: {example_text}")
print(f"De-toxified: {less_toxic_version}")

Original: Funny how Nazis are always the bad guys.
De-toxified: I 's to to the the .
