In [1]:
import pandas as pd
import torch
import torch.utils.data as data_utils
import numpy as np

In [2]:
# !unzip ../data/interim/toxicity_levels.zip

In [3]:
df = pd.read_csv('../data/interim/toxicity_levels.csv')
df.head()

Unnamed: 0,text,tox_level
0,"if Alkar floods her with her mental waste, it ...",0.981983
1,"If Alkar is flooding her with psychic waste, t...",0.014195
2,you're becoming disgusting.,0.999039
3,Now you're getting nasty.,0.065473
4,"well, we can spare your life.",0.985068


In [4]:
threeshold = 0.5

df['tox_level'] = df['tox_level'].apply(lambda x: 1 if x > threeshold else 0)
df.head()

Unnamed: 0,text,tox_level
0,"if Alkar floods her with her mental waste, it ...",1
1,"If Alkar is flooding her with psychic waste, t...",0
2,you're becoming disgusting.,1
3,Now you're getting nasty.,0
4,"well, we can spare your life.",1


## Dataset and Dataloader

Нужно добавить ещё норм препроцессинг с токенизацией стеммингом и прочим

In [5]:
from transformers import AutoTokenizer
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
def preprocessing_stage(sample):
    model_inputs = tokenizer(sample['text'], padding='max_length', max_length=256, truncation=True)
    return model_inputs['input_ids']

In [7]:
df['input_ids'] = df.apply(lambda x: preprocessing_stage(x), axis=1)

In [8]:
df.drop(columns=['text'], inplace=True)

In [9]:
from sklearn.model_selection import train_test_split

ratio = 0.2
train, val = train_test_split(
    df, stratify=df['tox_level'], test_size=0.2, random_state=42
)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    text_list, toxicity_list = [], []
    for _toxicity, _text in batch:
        text_list.append(_text)
        toxicity_list.append(_toxicity)
    return torch.LongTensor(text_list).to(device), torch.FloatTensor(toxicity_list).to(device)

In [27]:
batch_size = 512
train_dataloader = data_utils.DataLoader(
    train.to_numpy(), batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)

val_dataloader = data_utils.DataLoader(
    val.to_numpy(), batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

In [28]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class TextClassificationModel(nn.Module):
    def __init__(self, input_dim):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(input_dim, 300)
        self.fc1 = nn.Linear(300, 100)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, text):
        text = self.embedding(text)
        x = F.relu(self.fc1(text))
        return F.sigmoid(self.fc2(x))

In [45]:
vocab_size = 32128

model = TextClassificationModel(vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()

In [48]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=10
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    for i, batch in loop:
        texts, labels = batch
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(texts).squeeze(1)
        # loss calculation
        loss = loss_fn(outputs, labels)
        
        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss / (i * len(labels))})

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1
):
    
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    correct_3 = 0
    correct_4 = 0
    correct_5 = 0
    correct_6 = 0
    correct_7 = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, labels = batch

            # forward pass
            outputs = model(texts).squeeze(1)
            # loss calculation
            loss = loss_fn(outputs, labels)
            
            # test different threesholds
            total += len(labels)
            predicted_3 = torch.where(outputs.data > 0.3, 1.0, 0.0)
            predicted_4 = torch.where(outputs.data > 0.4, 1.0, 0.0)
            predicted_5 = torch.where(outputs.data > 0.5, 1.0, 0.0)
            predicted_6 = torch.where(outputs.data > 0.6, 1.0, 0.0)
            predicted_7 = torch.where(outputs.data > 0.7, 1.0, 0.0)
            
            correct_3 += sum(predicted_3 == labels)
            correct_4 += sum(predicted_4 == labels)
            correct_5 += sum(predicted_5 == labels)
            correct_6 += sum(predicted_6 == labels)
            correct_7 += sum(predicted_7 == labels)

            val_loss += loss.item()
            loop.set_postfix({"loss": val_loss / total})
        print(correct_3 / total, correct_4 / total, correct_5 / total, correct_6 / total, correct_7 / total)
       
    torch.cuda.empty_cache()
    return val_loss / total

In [49]:
for epoch in range(1, 21):
    train_one_epoch(model, train_dataloader, optimizer, criterion, epoch_num=epoch)
    if epoch % 5 == 0:
        val_loss = val_one_epoch(model, val_dataloader, criterion, epoch)

Epoch 1: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 2: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 3: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 4: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 5: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 5: val:   0%|          | 0/427 [00:00<?, ?it/s]

tensor(0.7457, device='cuda:0') tensor(0.7680, device='cuda:0') tensor(0.7797, device='cuda:0') tensor(0.7798, device='cuda:0') tensor(0.7605, device='cuda:0')


Epoch 6: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 7: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 8: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 9: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 10: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 10: val:   0%|          | 0/427 [00:00<?, ?it/s]

tensor(0.7779, device='cuda:0') tensor(0.7816, device='cuda:0') tensor(0.7711, device='cuda:0') tensor(0.7494, device='cuda:0') tensor(0.7188, device='cuda:0')


Epoch 11: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 12: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 13: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 14: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 15: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 15: val:   0%|          | 0/427 [00:00<?, ?it/s]

tensor(0.7548, device='cuda:0') tensor(0.7730, device='cuda:0') tensor(0.7826, device='cuda:0') tensor(0.7796, device='cuda:0') tensor(0.7574, device='cuda:0')


Epoch 16: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 17: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 18: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 19: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 20: train:   0%|          | 0/1706 [00:00<?, ?it/s]

Epoch 20: val:   0%|          | 0/427 [00:00<?, ?it/s]

tensor(0.7744, device='cuda:0') tensor(0.7827, device='cuda:0') tensor(0.7776, device='cuda:0') tensor(0.7566, device='cuda:0') tensor(0.7244, device='cuda:0')


In [51]:
torch.save(model.state_dict(), '../models/toxicity_identifier.pt')