# Model Architecture

In [4]:
import warnings

warnings.filterwarnings('ignore')

## 1. Loading dataset

In [5]:
import pickle
from src.data.make_dataset import TextDataset

In [6]:
df = pickle.load(open('../data/interim/text_dataset.pkl', 'rb'))

In [7]:
df.data.head()

Unnamed: 0,toxic,normal,toxic_reduction
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.915109
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.999361
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.814971
11,"So now their spirits are cursed, walking back ...","their souls are cursed, they guard the paths, ...",0.698517
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down.",0.999357


In [8]:
from src.data.transforms import apply_transforms

df_transformed = TextDataset(df=df.data.copy())
apply_transforms(df_transformed)

1. Cleaning text...
2. Tokenizing text...
3. Removing stopwords...
4. Lemmatizing text...
Transfomed data sample:
                                                    toxic  \
508062  [claimed, forced, avoid, church, wanted, visit...   
137379                            [useless, even, manage]   
24365          [point, like, right, chick, really, crazy]   
147195  [wondering, whether, really, would, cut, throa...   
125724                                             [kick]   

                                                   normal  toxic_reduction  
508062  [said, felt, compelled, abstain, church, becam...         0.772285  
137379                                        [use, even]         0.998066  
24365             [case, like, right, cat, really, crazy]         0.925730  
147195                  [wonder, able, cut, said, magrat]         0.942295  
125724                                             [kick]         0.923199  
All done.


In [9]:
df_transformed.data.head()

Unnamed: 0,toxic,normal,toxic_reduction
5,"[gon, na, child, genetic, disorder, gon, na, d...","[going, breed, kid, genetic, disorder, make, die]",0.915109
6,"[laughing, u, kick, as]","[laughing, u, show]",0.999361
7,"[maine, short, black, people, back]","[much, black, maine]",0.814971
11,"[spirit, cursed, walking, back, road, waterway...","[soul, cursed, guard, path, say, encounter, un...",0.698517
13,"[come, cal, leave, shit, alone]","[come, cal, put]",0.999357


## 2. Loading Toxic Words set

In [5]:
toxic_words_set = pickle.load(open('../data/interim/toxic_words.pkl', 'rb'))

## 3. Loading Toxicity Classifier

In [25]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from src.models.config import CLF_PATH, DEVICE

model = RobertaForSequenceClassification.from_pretrained(CLF_PATH)
tokenizer = RobertaTokenizer.from_pretrained(CLF_PATH)

In [26]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [41]:
import torch


def get_toxicity(prompts):
    with torch.inference_mode():
        inputs = tokenizer(prompts, return_tensors='pt', padding=True).to(DEVICE)

        out = torch.softmax(model(**inputs).logits, -1)[:, 1].cpu().numpy()

    return out

## 4. Trying different architectures

### 4.1 Baseline Model: removing toxic words from sentences

In [42]:
def apply_remove_model(raw_sent, toxic_words):
    words = raw_sent.split()
    result = []

    for word in words:
        result.append(word) if word not in toxic_words else None

    return ' '.join(result)

#### Applying for random sentences

In [43]:
def test_remove_model(n_samples, toxic_words):
    i = 1
    for index, row in df.data.sample(n=n_samples).iterrows():
        print("-" * 30)
        print(f"SENTENCE {i}/{n_samples}")
        i += 1

        orig_sent = row['toxic']
        target_sent = row['normal']
        result_sent = apply_remove_model(orig_sent, toxic_words)

        toxic_scores = get_toxicity([orig_sent, result_sent, target_sent])

        print("-" * 10)
        print("Original sentence:", orig_sent)
        print("Original toxicity:", toxic_scores[0])
        print("-" * 10)
        print("Result sentence:", result_sent)
        print("Result toxicity:", toxic_scores[1])
        print("-" * 10)
        print("Target sentence:", target_sent)
        print("Target toxicity:", toxic_scores[2])
        print("-" * 10)

In [44]:
test_remove_model(10, toxic_words_set)

------------------------------
SENTENCE 1/10
----------
Original sentence: No matter what side you're on. You're always the looser.
Original toxicity: 0.97803134
----------
Result sentence: No matter what side you're on. You're always the looser.
Result toxicity: 0.97803134
----------
Target sentence: whatever you're on, you'll always lose.
Target toxicity: 0.00016438692
----------
------------------------------
SENTENCE 2/10
----------
Original sentence: Two: why are youfucking llona then?
Original toxicity: 0.9957242
----------
Result sentence: Two: why are then?
Result toxicity: 4.0355728e-05
----------
Target sentence: second: Why are you sleeping with Ilons?
Target toxicity: 9.117348e-05
----------
------------------------------
SENTENCE 3/10
----------
Original sentence: Maybe buy my fathead.
Original toxicity: 0.9798124
----------
Result sentence: Maybe buy my fathead.
Result toxicity: 0.9798124
----------
Target sentence: maybe buy my poster.
Target toxicity: 4.673778e-05
-----

So, my set of toxic words doesn't work and it is expected. Because my set is just simple difference of words between toxic and nontoxic sentences.

#### Trying well-prepared set of toxic and negatice words

In [35]:
with open('../data/external/toxic_words.txt', 'r') as f:
    toxic_words_prepared = list(map(lambda x: x.strip(), f.readlines()))

In [37]:
with open('../data/external/negative-words.txt', 'r') as f:
    negative_words_prepared = list(map(lambda x: x.strip(), f.readlines()))

In [39]:
all_toxic_words = set(toxic_words_prepared).union(set(negative_words_prepared))

In [45]:
test_remove_model(10, all_toxic_words)

------------------------------
SENTENCE 1/10
----------
Original sentence: No, I've been trying to find Olivia's killer, busting my ass--
Original toxicity: 0.99812406
----------
Result sentence: No, I've been trying to find Olivia's killer, busting my ass--
Result toxicity: 0.99812406
----------
Target sentence: no, I'm trying to find Olivia's killer.
Target toxicity: 0.002062821
----------
------------------------------
SENTENCE 2/10
----------
Original sentence: what the fuck are you doing?
Original toxicity: 0.9995204
----------
Result sentence: what the are you doing?
Result toxicity: 7.6862074e-05
----------
Target sentence: What are you doing?
Target toxicity: 4.8589915e-05
----------
------------------------------
SENTENCE 3/10
----------
Original sentence: Boy, I froze my ass last night!
Original toxicity: 0.9973277
----------
Result sentence: Boy, I my last night!
Result toxicity: 5.1524403e-05
----------
Target sentence: I almost froze last night!
Target toxicity: 7.1020346e

Now, the dict model is more accurate and successfully detoxifies a lot more sentences.

### 4.2 Making custom Seq2Seq model

In [138]:
import torch
from torch import nn
import random


class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x, (h, c) = self.lstm(x)

        return x, (h, c)


class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h, c):
        x = self.embedding(x)
        x, (h, c) = self.lstm(x, (h, c))
        x = self.fc(x)

        return x, (h, c)


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, x, y, teacher_forcing_ratio=0.5):
        batch_size = x.shape[1]
        target_len = y.shape[0]
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(self.device)

        encoder_output, (encoder_h, encoder_c) = self.encoder(x)

        decoder_input = y
        h, c = encoder_h, encoder_c

        for t in range(1, target_len):
            output, (h, c) = self.decoder(decoder_input, h, c)
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(2)

            decoder_input = y[t] if teacher_force else top1

        return outputs

In [139]:
from torch.utils.data import DataLoader, Dataset


class ToxicDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=50):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        toxic_sent = row['toxic']
        normal_sent = row['normal']

        toxic_sent = self.tokenizer(toxic_sent, return_tensors='pt', padding='max_length', truncation=True,
                                    max_length=self.max_len)
        normal_sent = self.tokenizer(normal_sent, return_tensors='pt', padding='max_length', truncation=True,
                                     max_length=self.max_len)

        return {
            'toxic_sent': toxic_sent,
            'normal_sent': normal_sent
        }

In [140]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(CLF_PATH)

dataset = ToxicDataset(df.data, tokenizer)

train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [141]:
model = Seq2Seq(
    encoder=Encoder(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.5),
    decoder=Decoder(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.5),
    device=DEVICE
).to(DEVICE)

In [142]:
epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
from tqdm import tqdm

for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        x = batch['toxic_sent']['input_ids'].to(DEVICE).squeeze(1)
        y = batch['normal_sent']['input_ids'].to(DEVICE).squeeze(1)

        output = model(x, y)

        output = output[1:].reshape(-1, output.shape[2])
        y = y[1:].reshape(-1)

        loss = criterion(output, y)

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch [{epoch}/{epochs}]")
        loop.set_postfix(loss=loss.item())

No results here :(