In [2]:
#Load the saved encoder model
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Get the training data

In [3]:
from loader import Lang, ToxicityDataset, collate, normalizeString

lang = Lang("eng")
data = pd.read_csv('data/train_2024.csv', quoting = 3)
df = pd.DataFrame(data)
for sentence in df['text']:
    lang.addSentence(normalizeString(sentence))   

trainset = ToxicityDataset('data/train_2024.csv', 'id', 'text', 'label', lang)
train_loader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate)

# Get the validation data

In [4]:
val_data = pd.read_csv('data/dev_2024.csv', quoting = 3)
val_df = pd.DataFrame(val_data)

valset = ToxicityDataset('data/dev_2024.csv', 'id', 'text', 'label', lang)
val_loader = DataLoader(valset, batch_size=64, shuffle=False, collate_fn=collate)

# Get the test data

In [5]:
test_data = pd.read_csv('./test_data.csv', quoting = 3)
test_df = pd.DataFrame(test_data)
#for sentence in test_df['text']:
#    lang.addSentence(normalizeString(sentence))

testset = ToxicityDataset('./test_data.csv', 'id', 'text', 'label', lang)
test_loader = DataLoader(testset, batch_size=1, shuffle=False, collate_fn=collate)


In [6]:
#Load the saved encoder model
from encoder import EncoderClassifier, Encoder, MLPClassifier

embed_size = 256
bert_encoder = Encoder(src_vocab_size=trainset.lang.n_words, n_blocks = 3, n_features = embed_size, n_heads = 4, n_hidden = 512, dropout = 0.1, max_length = 5000)
classifier = MLPClassifier(n_features = embed_size, num_classes = 2, num_layers = 2, dropout = 0.1)
model = EncoderClassifier(bert_encoder, classifier)
model.load_state_dict(torch.load('encoder.pth', map_location = device))

<All keys matched successfully>

In [12]:
def compute_accuracy(model, val_loader):
    model.eval()
    accuracy = []
    for i, data in enumerate(val_loader):
        inputs, mask, labels = data
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(torch.float32).reshape(labels.size(0), 1).to(device)

        outputs = model(inputs, mask)

        out = torch.round(torch.sigmoid(outputs))
        accuracy.append(torch.sum(out == labels).item() / labels.size(0))
        print(i)
        if i > 10:
            break

    accuracy = np.mean(accuracy)
    return accuracy

In [13]:
print('Validation accuracy: ', compute_accuracy(model, val_loader))

0
1
2
3
4
5
6
7
8
9
10
11
Validation accuracy:  0.8046875


In [7]:
predictions = pd.DataFrame(columns = ['id', 'label'])
model.eval()

predicted = torch.tensor([]).to(device)
translations = []
for i, data in enumerate(test_loader):
    inputs, mask, labels = data
    
    #translated = [testset.lang.index2word[i.item()] for input in inputs for i in input]
    #translations.append(translated)
    #print(translated)
    print(inputs)
    inputs = inputs.to(device)
    mask = mask.to(device)
    
    outputs = model(inputs, mask)

    pre = torch.round(torch.sigmoid(outputs))
    predicted = torch.cat((predicted, pre), dim = 0)
    if i % 200 == 0:
        print(i)

tensor([[   0,   66,   25, 4354, 2264,  413,   25,  122,   28,   25,   28,  413,
          102, 2402,   13,   15, 1780,  153,  882,   34, 1224,   28,    1]])
0
tensor([[   0, 1049,   59,  162,  153,   58, 4110,  108,   15, 1060,  548,  343,
           25,  482,   15, 1879, 2287,  207, 3190, 4900, 1274,  108,  413,  236,
         1061,  102,  145,  321,  335,  149,   48, 2364,   25,  336,   24,    3,
            1]])
tensor([[    0,    25,   848,  4235,    58,    18,    68,    13,    25,   413,
           320,   413,   273,    42,    48,  1303,   554,    25,  2379,   413,
            15,    18,   945,  7626,  1536,     3,  1808,   162,   449,   413,
            25,  1793,  2436,    25,  1087,   318,    35,  1330,    49,  1162,
            25,  1793,   111,    25,  1087,   318,    49,   197,    48,   413,
           615,   145, 31432, 66431,  1692,    28,    25,   855,    15,    25,
         21686,    75,     1]])
tensor([[    0,   413,   113,    65,   247,    49,   375,   145,   200,   

KeyboardInterrupt: 

In [17]:
predicted = predicted.squeeze().detach().numpy().astype(int)
predictions['label'] = predicted
predictions['id'] = test_data['id']
predictions.shape

(12001, 2)

In [18]:
predictions.to_csv('predictions.csv', index = False, header = True)

In [19]:
predictions.head

<bound method NDFrame.head of           id  label
0          0      1
1          1      0
2          2      0
3          3      1
4          4      1
...      ...    ...
11996  11996      1
11997  11997      0
11998  11998      1
11999  11999      1
12000  12000      0

[12001 rows x 2 columns]>