In [9]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
from torch.nn.utils.rnn import pad_sequence
from loader import Lang, ToxicityDataset, normalizeString, collate

# Build the training vocabulary: every normalised sentence of the training CSV
# is registered with a fresh Lang instance.
lang = Lang("eng")
data = pd.read_csv(
    'data/train_2024.csv',
    quoting = 3,  # 3 == csv.QUOTE_NONE: quote characters are read as plain text
)
df = pd.DataFrame(data)
for sentence in df['text'].tolist():
    normalized = normalizeString(sentence)
    lang.addSentence(normalized)
del normalized  # helper name only; keep the notebook namespace as before

# Training dataset and a shuffled mini-batch loader; `collate` assembles each batch.
trainset = ToxicityDataset('data/train_2024.csv', 'id', 'text', 'label', lang)
train_loader = DataLoader(
    trainset,
    collate_fn=collate,
    shuffle=True,
    batch_size=32,
)

In [11]:
# Inspect one fixed example (index 1) of the training set: first as the raw
# index tensor, then decoded back to words through the dataset's vocabulary.
src_seq, label = trainset[1]

print('Source sentence:')
print(' as word indices: ', src_seq)
print(
    ' as string: ',
    ' '.join(trainset.lang.index2word[token.item()] for token in src_seq),
)

Source sentence:
 as word indices:  tensor([ 21,  22,  23,  24,  25,  26,  15,  27,  28,  29,  31,  32,  33,  34,
         35,  24,  36,  15,  37,  38,  39,  40,  34,  35,  41,  39,  43,  44,
         41,  45,  46,  47,  48,  49, 413,  50,  48,  51,  35,  52,  25,  53,
         54,  58,  59,  60,  34,  29,  30,  61,  36, 413,  52,  62,  38,  63,
         64,  48,  65,  50,  48,  66,  67,  69,  70,  21,  63,  68,  71,  72,
         28,  73,  74,  75,  76,   3,  24,  77,  79,  68,  80,  81,  82,  28,
         83,  84,  85,  36,   3,  12,  88,  48,  90,  93,  25,  94,  95,  95,
         63,  68,  34,  98,  38,  63,  99,  28,  73, 102,  52,  62,  60,  65,
        103,  38, 104, 106,  28,  42,  38, 102, 104, 107,  15, 108,  49, 109,
         50, 110,  49, 111, 112,   1])
 as string:  i find funny is the loyalty and blindness of english worst possible choice for them is liberal and yet they keep voting for them every keep renewing hope every election year prior to it  just to ignore them at 

In [12]:
test_data = pd.read_csv('data/test_2024.csv', quoting = 3)
test_df = pd.DataFrame(test_data)

# The model's embedding table is indexed by the TRAINING vocabulary (`lang`).
# Building an independent vocabulary for the test set would assign different
# indices to the same words, so the model would receive meaningless token ids.
# Instead, register the test sentences in the shared vocabulary.
# NOTE(review): assumes Lang.addSentence only appends unseen words and leaves
# existing word indices unchanged — confirm in loader.py. Words that occur only
# in the test set get embeddings that are never trained.
for sentence in test_df['text']:
    lang.addSentence(normalizeString(sentence))

# Kept as an alias so later cells referring to `test_lang` still work.
test_lang = lang

testset = ToxicityDataset('data/test_2024.csv', 'id', 'text', 'label', lang)
# The whole test set is served as a single, unshuffled batch.
test_loader = DataLoader(testset, batch_size=len(testset), shuffle=False, collate_fn=collate)

In [13]:
class EncoderBlock(nn.Module):
    """One Transformer encoder layer with post-sublayer normalisation.

    The layer applies multi-head self-attention followed by a two-layer
    feed-forward network. Each sublayer output goes through dropout, is added
    to its input (residual connection) and is then layer-normalised.

    Args:
        n_features: Size of the last (feature) dimension of the input.
        n_heads: Number of attention heads.
        n_hidden: Width of the hidden layer of the feed-forward network.
        dropout: Dropout probability used in the feed-forward network and on
            both sublayer outputs.
    """

    def __init__(self, n_features, n_heads, n_hidden = 1024, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=n_features, num_heads=n_heads, batch_first=True
        )

        # Note the order inside the feed-forward network: dropout is applied
        # to the pre-activation, i.e. before the ReLU.
        widen = nn.Linear(n_features, n_hidden)
        narrow = nn.Linear(n_hidden, n_features)
        self.feed_forward = nn.Sequential(widen, nn.Dropout(dropout), nn.ReLU(), narrow)

        # Normalisation / dropout pair for the attention sublayer ...
        self.norm1 = nn.LayerNorm(n_features)
        self.dropout1 = nn.Dropout(dropout)
        # ... and for the feed-forward sublayer.
        self.norm2 = nn.LayerNorm(n_features)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the layer.

        Args:
            x: Input tensor with the batch dimension first (the attention
                module is built with ``batch_first=True``).
            mask: Handed to the attention module as ``key_padding_mask``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        attended, _ = self.attn(query=x, key=x, value=x, key_padding_mask=mask)
        x = self.norm1(x + self.dropout1(attended))
        return self.norm2(x + self.dropout2(self.feed_forward(x)))

In [14]:

def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

class Encoder(nn.Module):
    """Stack of ``EncoderBlock`` layers on top of token + learned position embeddings.

    Args:
        src_vocab_size: Number of rows in the token embedding table.
        n_blocks: Number of ``EncoderBlock`` layers.
        n_features: Embedding size and feature size of every block.
        n_heads: Attention heads per block.
        n_hidden: Hidden width of each block's feed-forward network.
        dropout: Dropout probability passed to each block.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence the encoder accepts.
    """

    def __init__(self, src_vocab_size, n_blocks = 4, n_features = 256, n_heads = 16, n_hidden=512, dropout=0.1, max_length = 5000):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(src_vocab_size, n_features)
        self.pos_embedding = nn.Embedding(max_length, n_features)
        self.blocks = nn.ModuleList([EncoderBlock(n_features, n_heads, n_hidden, dropout) for _ in range(n_blocks)])
        self.norm = nn.LayerNorm(n_features)

    def forward(self, x, mask):
        """Encode a batch of token-index sequences.

        Args:
            x: 2-D integer tensor of token indices, ``(batch, seq_len)``.
            mask: Passed unchanged to every block.

        Returns:
            Tensor of shape ``(batch, seq_len, n_features)`` after the final
            layer normalisation.

        Raises:
            ValueError: If ``seq_len`` exceeds the size of the position table.
        """
        _, seq_len = x.size()
        max_length = self.pos_embedding.num_embeddings
        if seq_len > max_length:
            # Fail with a readable message instead of an out-of-range embedding
            # lookup (which on CUDA surfaces as a device-side assert).
            raise ValueError(
                f"sequence length {seq_len} exceeds max_length {max_length}"
            )

        # Create the position indices on the same device as the input rather
        # than on a module-level global, so the encoder works wherever the
        # model and its inputs live.
        positions = torch.arange(seq_len, device=x.device)

        # The (seq_len, n_features) position embeddings broadcast over the batch.
        x = self.embedding(x) + self.pos_embedding(positions)
        for block in self.blocks:
            x = block(x, mask)
        return self.norm(x)

In [15]:
#Classifier on top of the encoder
class MLPClassifier(nn.Module):
    """Feed-forward classification head.

    The network is a chain of ``Linear -> ReLU -> Dropout`` groups, each
    ``4 * n_features`` wide, followed by a final ``Linear`` output layer.

    Args:
        n_features: Size of the input feature vector.
        num_classes: Number of target classes. For exactly two classes the
            head emits a single value per sample; otherwise one value per class.
        num_layers: Number of hidden groups. One group is always created, so
            values below 1 behave like 1.
        dropout: Dropout probability inside every hidden group.
    """

    def __init__(self, n_features=512, num_classes=2, num_layers=3, dropout=0.2):
        super().__init__()
        width = n_features * 4

        def hidden_group(in_dim):
            # One Linear -> ReLU -> Dropout group mapping in_dim to the hidden width.
            return [nn.Linear(in_dim, width), nn.ReLU(), nn.Dropout(dropout)]

        # The first group widens the input; the remaining ones keep the width.
        stack = hidden_group(n_features)
        for _ in range(num_layers - 1):
            stack += hidden_group(width)

        # Binary problems use a single output unit.
        out_dim = 1 if num_classes == 2 else num_classes
        stack.append(nn.Linear(width, out_dim))

        self.classifier = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the head to ``x`` and return its raw (un-activated) outputs."""
        return self.classifier(x)

In [16]:
class EncoderClassifier(nn.Module):
    """Sequence classifier: an encoder, mean pooling over dim 1, then a classifier head.

    Args:
        encoder: Module called as ``encoder(x, mask)``.
        classifier: Module applied to the pooled encoder output.
    """

    def __init__(self, encoder, classifier):
        super().__init__()
        self.encoder = encoder
        self.classifier = classifier

    def forward(self, x, mask):
        """Encode ``x``, average the encoding over dimension 1 and classify it.

        The average runs over every position along dim 1; ``mask`` is only
        given to the encoder and is not used to exclude positions from the
        pooling. (An alternative, not used here, would be to take only the
        first position, ``encoded[:, 0, :]``.)
        """
        encoded = self.encoder(x, mask)
        pooled = torch.mean(encoded, dim=1)
        return self.classifier(pooled)
    
class SimpleEncoderClassifier(nn.Module):
    """Compact classifier built from PyTorch's stock Transformer encoder.

    Token embeddings go through a 2-layer ``nn.TransformerEncoder`` (4 heads),
    are averaged over dim 1 and fed to a small MLP that emits one value per
    sample. No positional information is added to the embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Embedding size and model width of the encoder.
    """

    def __init__(self, vocab_size, embedding_dim=512):
        super().__init__()
        hidden_dim = embedding_dim * 4

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # The template layer is stored on the module as well as being handed
        # to the encoder stack.
        self.encoder_layer = nn.TransformerEncoderLayer(embedding_dim, 4, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, 2)

        head = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, 1),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x, mask):
        """Embed, encode (``mask`` is the ``src_key_padding_mask``), mean-pool over dim 1, classify."""
        hidden = self.encoder(self.embedding(x), src_key_padding_mask=mask)
        # Mean over dim 1 rather than selecting only the first position.
        pooled = hidden.mean(dim = 1)
        return self.classifier(pooled)

# Defining the models

In [65]:
embed_size = 256

# Model 1 (the first one tried): custom 3-block encoder + MLP head with 2 hidden groups.
bert_encoder = Encoder(
    src_vocab_size=trainset.lang.n_words,
    n_blocks = 3,
    n_features = embed_size,
    n_heads = 4,
    n_hidden = embed_size,
    dropout = 0.1,
    max_length = 5000,
)
classifier = MLPClassifier(
    n_features = embed_size,
    num_classes = 2,
    num_layers = 2,
    dropout = 0.1,
)
encoder_classifier = EncoderClassifier(bert_encoder, classifier)

# Model 2: same encoder configuration, MLP head with 3 hidden groups.
simple_encoder = Encoder(
    src_vocab_size = trainset.lang.n_words,
    n_blocks = 3,
    n_features = 256,
    n_heads = 4,
    n_hidden = 256,
    dropout = 0.1,
    max_length = 5000,
)
simple_classifier = MLPClassifier(
    n_features = 256,
    num_classes = 2,
    num_layers = 3,
    dropout = 0.1,
)
encoder_classifier_2 = EncoderClassifier(simple_encoder, simple_classifier)

# Model 3: the simplest variant — PyTorch's built-in encoder layers and a small
# classifier bundled in a single class.
simple_encoder_classifier = SimpleEncoderClassifier(
    vocab_size = trainset.lang.n_words,
    embedding_dim=256,
)

# Training loop

In [67]:
model = encoder_classifier
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas = (0.9, 0.98), eps=1e-9)

# Ratio of negative to positive training labels.
positive  = sum([label for _, label in trainset])
negative = len(trainset) - positive
positive_weight = negative/positive
# NOTE(review): positive_weight is computed but not used by the loss below.
# If class weighting is intended, pass it as `pos_weight` to BCEWithLogitsLoss;
# left unweighted here to keep the existing training behaviour.
criterion = nn.BCEWithLogitsLoss()

epochs = 10
for epoch in range(epochs):
    # Running totals, reset at the start of every epoch.
    total_loss = 0.0
    correct = 0
    total = 0

    # Set the model to training mode
    model.train()

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        inputs, mask, labels = data

        inputs = inputs.to(device)
        mask = mask.to(device)
        # The loss expects float targets with the same (batch, 1) shape as the logits.
        labels = labels.to(torch.float32).reshape(labels.size(0), 1).to(device)

        outputs = model(inputs, mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Accumulate the loss over the epoch
        total_loss += loss.item()

        # Accumulate accuracy counts over ALL batches of the epoch (previously
        # these were overwritten, so only the last batch was reported).
        predicted = torch.round(torch.sigmoid(outputs))
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        # Print the batch loss every 100 batches to avoid flooding the output
        if i % 100 == 0:
            print(f"Epoch {epoch + 1}, Batch {i + 1}, Loss: {loss.item():.4f}")

    # Epoch-level statistics: mean batch loss and accuracy over all samples seen.
    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = correct / total * 100.0

    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")


tensor([[-0.7534],
        [-0.6633],
        [-0.9696],
        [-0.6990],
        [-0.8221],
        [-0.8310],
        [-0.7843],
        [-0.7840],
        [-0.6412],
        [-0.8956],
        [-0.7508],
        [-0.6645],
        [-0.8478],
        [-0.8355],
        [-0.7087],
        [-0.9064],
        [-0.7706],
        [-0.6872],
        [-0.8381],
        [-0.8109],
        [-0.7976],
        [-0.7944],
        [-0.7779],
        [-0.7242],
        [-0.8658],
        [-0.8156],
        [-0.8460],
        [-0.9442],
        [-0.7968],
        [-0.6567],
        [-0.8387],
        [-0.7515]], grad_fn=<AddmmBackward0>)
Epoch 1, Batch 1, Loss: 0.7709
tensor([[-0.2834],
        [-0.3077],
        [-0.3777],
        [-0.3622],
        [-0.3740],
        [-0.3315],
        [-0.3218],
        [-0.3296],
        [-0.3346],
        [-0.2713],
        [-0.4014],
        [-0.1948],
        [-0.3335],
        [-0.2832],
        [-0.3580],
        [-0.2810],
        [-0.3839],
        [-0

KeyboardInterrupt: 

In [None]:
def calculate_accuracy(model, data_loader):
    """Return the fraction of correctly classified samples in ``data_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Predictions are derived from the shape of the model output:

    * a single value per sample is treated as a binary logit and thresholded
      at 0 (equivalent to ``sigmoid(logit) > 0.5``);
    * several values per sample are treated as class scores and the arg-max
      over the last dimension is taken.

    Args:
        model: Module called as ``model(inputs, mask)``.
        data_loader: Iterable yielding ``(inputs, mask, labels)`` batches.
            Labels may be shaped ``(batch,)`` or ``(batch, 1)``.

    Returns:
        Accuracy in ``[0, 1]`` as a float; ``0.0`` if the loader yields no samples.
    """
    model.eval()

    # Evaluate on whatever device the model lives on. A model without
    # parameters gives no device hint, so the batches are left where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct = 0
    total = 0
    # No gradients are needed for evaluation; this avoids building the graph,
    # which matters when the loader serves very large batches.
    with torch.no_grad():
        for inputs, mask, labels in data_loader:
            if target_device is not None:
                inputs = inputs.to(target_device)
                mask = mask.to(target_device)
                labels = labels.to(target_device)

            outputs = model(inputs, mask)

            if outputs.dim() > 1 and outputs.size(-1) > 1:
                predicted = outputs.argmax(dim=-1)
            else:
                # An arg-max over a single column would always return class 0.
                predicted = (outputs > 0).long()

            # Flatten both sides so (batch, 1) vs (batch,) cannot broadcast
            # into a (batch, batch) comparison.
            predicted = predicted.reshape(-1)
            targets = labels.reshape(-1).to(predicted.dtype)

            total += labels.size(0)
            correct += (predicted == targets).sum().item()

    return correct / total if total else 0.0

KeyboardInterrupt: 