In [28]:
#Load the saved encoder model
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Get the training data

In [40]:
from loader import Lang, ToxicityDataset, collate, normalizeString, unicodeToAscii

lang = Lang("eng")
data = pd.read_csv('data/train_2024.csv', quoting = 3)
df = pd.DataFrame(data)

for sentence in df['text']:
    lang.addSentence(normalizeString(unicodeToAscii(sentence))) 

lang.trim(5)

trainset = ToxicityDataset('data/train_2024.csv', 'id', 'text', 'label', lang)
train_loader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate)

# Get the validation data

In [32]:
val_data = pd.read_csv('data/dev_2024.csv', quoting = 3)
val_df = pd.DataFrame(val_data)

valset = ToxicityDataset('data/dev_2024.csv', 'id', 'text', 'label', lang)
val_loader = DataLoader(valset, batch_size=64, shuffle=True, collate_fn=collate)

# Get the test data

In [33]:
test_data = pd.read_csv('./test_data.csv', quoting = 3)
test_df = pd.DataFrame(test_data)

testset = ToxicityDataset('./test_data.csv', 'id', 'text', 'label', lang)
test_loader = DataLoader(testset, batch_size=1, shuffle=False, collate_fn=collate)

In [34]:
#Load the saved encoder model
from encoder import EncoderClassifier, Encoder, MLPClassifier

embed_size = 512
#This correponds to the first model I tried
bert_encoder = Encoder(src_vocab_size=trainset.lang.n_words, n_blocks = 4, n_features = embed_size, n_heads = 4, n_hidden = 512, dropout = 0.1, max_length = 5000)
classifier = MLPClassifier(n_features = embed_size, num_classes = 2, num_layers = 2, dropout = 0.2)
model = EncoderClassifier(bert_encoder, classifier)
model.load_state_dict(torch.load('encoder.pth', map_location = device))

<All keys matched successfully>

In [35]:
def compute_accuracy(model, val_loader):
    model.eval()
    accuracy = []
    for i, data in enumerate(val_loader):
        inputs, mask, labels = data
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(torch.float32).reshape(labels.size(0), 1).to(device)

        outputs = model(inputs, mask)

        out = torch.round(torch.sigmoid(outputs))
        accuracy.append(torch.sum(out == labels).item() / labels.size(0))
        print(i)
        if i > 4:
            break

    accuracy = np.mean(accuracy)
    return accuracy

In [36]:
def compute_f1(model, val_loader):
    model.eval()
    f1 = []
    for i, data in enumerate(val_loader):
        inputs, mask, labels = data
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(torch.float32).reshape(labels.size(0), 1).to(device)

        outputs = model(inputs, mask)

        out = torch.round(torch.sigmoid(outputs))
        tp = torch.sum(out * labels).item()
        fp = torch.sum(out * (1 - labels)).item()
        fn = torch.sum((1 - out) * labels).item()
        if tp == 0:
            f1.append(0)
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1.append(2 * precision * recall / (precision + recall))
        print(i)
        if i > 10:
            break

    f1 = np.mean(f1)
    return f1

In [37]:
print('Validation f1: ', compute_f1(model, val_loader))

0
1
2
3
4
5
6
7
8
9
10
11
Validation f1:  0.7336208688447491


In [27]:
predictions = pd.DataFrame(columns = ['id', 'label'])
model.eval()

predicted = torch.tensor([]).to(device)
translations = []
for i, data in enumerate(test_loader):
    inputs, mask, labels = data
    
    translated = [testset.lang.index2word[i.item()] for input in inputs for i in input]
    translations.append(translated)
    print(translated)
    print(inputs)
    inputs = inputs.to(device)
    mask = mask.to(device)
    
    outputs = model(inputs, mask)

    pre = torch.round(torch.sigmoid(outputs))
    predicted = torch.cat((predicted, pre), dim = 0)
    if i % 200 == 0:
        print(i)

['CLS', 'get', 'the', 'odd', 'feeling', '', 'the', 'head', 'of', 'the', 'of', '', 'will', 'step', 'in', 'and', 'defend', 'this', 'scum', 'for', 'freedom', 'of', 'EOS']
tensor([[   0,   66,   25, 4354, 2264,  413,   25,  122,   28,   25,   28,  413,
          102, 2402,   13,   15, 1780,  153,  882,   34, 1224,   28,    1]])
0
['CLS', 'disagree', 'more', 'with', 'this', 'have', 'moved', 'on', 'and', 'care', 'how', 'primarily', 'the', 'media', 'and', 'others', 'label', 'we', 'stop', 'putting', 'labels', 'on', '', 'biggest', 'problem', 'will', 'be', 'if', 'he', 'has', 'to', 'under', 'the', 'really', 'is', 'that', 'EOS']
tensor([[   0, 1049,   59,  162,  153,   58, 4110,  108,   15, 1060,  548,  343,
           25,  482,   15, 1879, 2287,  207, 3190, 4900, 1274,  108,  413,  236,
         1061,  102,  145,  321,  335,  149,   48, 2364,   25,  336,   24,    3,
            1]])
['CLS', 'the', 'property', 'owner', 'have', 'a', 'vote', 'in', 'the', '', 'tax', '', 'maybe', 'time', 'to', 'scale'

KeyboardInterrupt: 

In [11]:
predicted = predicted.squeeze().detach().numpy().astype(int)
predictions['label'] = predicted
predictions['id'] = test_data['id']
predictions.shape

(12001, 2)

In [12]:
predictions.to_csv('predictions.csv', index = False, header = True)

In [13]:
predictions.head

<bound method NDFrame.head of           id  label
0          0      1
1          1      0
2          2      0
3          3      1
4          4      1
...      ...    ...
11996  11996      1
11997  11997      0
11998  11998      1
11999  11999      1
12000  12000      0

[12001 rows x 2 columns]>