In [1]:
# Imports and compute-device selection
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
from loader import Lang, ToxicityDataset, collate, normalizeString

# Vocabulary object that the training (and later the test) sentences are added to.
lang = Lang("eng")

# quoting = 3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
data = pd.read_csv('data/train_2024.csv', quoting = 3)
df = pd.DataFrame(data)

# Register every normalized training sentence with the vocabulary.
for sentence in df['text']:
    normalized = normalizeString(sentence)
    lang.addSentence(normalized)

# Dataset and shuffled mini-batch loader over the same CSV file.
# NOTE(review): 'id' / 'text' / 'label' look like column names - confirm in loader.py.
trainset = ToxicityDataset(
    'data/train_2024.csv', 'id', 'text', 'label', lang,
)
train_loader = DataLoader(
    trainset,
    batch_size = 32,
    shuffle = True,
    collate_fn = collate,
)

In [2]:
# quoting = 3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
test_data = pd.read_csv('./test_data.csv', quoting = 3)
test_df = pd.DataFrame(test_data)

# Add the normalized test sentences to the vocabulary built from the training set.
# NOTE(review): this grows `lang` before the model is constructed; presumably the
# checkpoint was trained with the same combined vocabulary - confirm.
for sentence in test_df['text']:
    normalized = normalizeString(sentence)
    lang.addSentence(normalized)

# One sample per batch, in file order.
testset = ToxicityDataset(
    './test_data.csv', 'id', 'text', 'label', lang,
)
test_loader = DataLoader(
    testset,
    batch_size = 1,
    shuffle = False,
    collate_fn = collate,
)


In [6]:
#Load the saved encoder model
from encoder import EncoderClassifier, Encoder, MLPClassifier

# Architecture hyperparameters. They have to match the checkpoint, because
# load_state_dict (strict by default) rejects missing keys and shape mismatches.
embed_size = 256
bert_encoder = Encoder(
    src_vocab_size = trainset.lang.n_words,
    n_blocks = 3,
    n_features = embed_size,
    n_heads = 4,
    n_hidden = 512,
    dropout = 0.1,
    max_length = 5000,
)
classifier = MLPClassifier(
    n_features = embed_size,
    num_classes = 2,
    num_layers = 2,
    dropout = 0.1,
)

# Move the model to `device` before restoring the weights. The inference loop
# sends its inputs to `device`, and load_state_dict only copies values into the
# existing parameters - it does not relocate the module - so without this the
# model would stay on the CPU and fail with a device mismatch on a GPU machine.
model = EncoderClassifier(bert_encoder, classifier).to(device)

# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(torch.load('encoder.pth', map_location = device))

<All keys matched successfully>

In [7]:
# Empty submission frame; the cell after this loop fills in its columns.
predictions = pd.DataFrame(columns = ['id', 'label'])

# Evaluation mode (disables dropout and similar train-only behaviour).
model.eval()

# Collect the per-batch predictions in a list and concatenate once at the end;
# calling torch.cat inside the loop re-copies the growing tensor every iteration.
batch_predictions = []

# Inference only: no_grad stops autograd from recording a graph for each forward
# pass, which would otherwise be kept alive by every stored prediction.
with torch.no_grad():
    # The loop variable is deliberately not called `data`, so it does not
    # overwrite the training DataFrame of that name.
    for i, (inputs, mask, _labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        mask = mask.to(device)

        outputs = model(inputs, mask)

        # Sigmoid maps the raw outputs into (0, 1); rounding gives hard 0/1 values.
        batch_predictions.append(torch.round(torch.sigmoid(outputs)))

        # Lightweight progress indicator.
        if i % 200 == 0:
            print(i)

# Fall back to an empty tensor on `device` when the loader yields no batches.
predicted = (
    torch.cat(batch_predictions, dim = 0)
    if batch_predictions
    else torch.tensor([]).to(device)
)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800


In [8]:
# Flatten the stacked predictions into a 1-D integer array.
# `.cpu()` is required before `.numpy()` when inference ran on a GPU and is a
# no-op on the CPU. The result gets its own name instead of overwriting the
# `predicted` tensor, so this cell can be re-run without failing.
predicted_labels = predicted.detach().cpu().squeeze().numpy().astype(int)

# Every test row must have exactly one prediction; otherwise ids and labels
# would be paired incorrectly (or padded with NaN) without any error.
assert predicted_labels.shape == (len(test_data),), (
    f"expected {len(test_data)} predictions, got shape {predicted_labels.shape}"
)

# Build the submission frame in one step, pairing ids and labels by position.
predictions = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'label': predicted_labels,
})
predictions.shape

(11929, 2)

In [7]:
# Write the submission file: keep the header row, leave out the DataFrame index.
predictions.to_csv(
    'predictions.csv',
    header = True,
    index = False,
)