In [1]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (11

In [2]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
import torch.optim as optim
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from contractions import fix
import numpy as np
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.nn import functional as F

## Dataset

In [3]:
class JigsawDataset(Dataset):
    def __init__(self, comments, labels, glove_vocab, max_length):
        self.texts = comments.tolist()
        self.labeAls = labels.tolist()
        self.glove_vocab = glove_vocab
        self.max_length = max_length
        self.tokenizer = WhitespaceTokenizer()
        self.processed_texts = [self._preprocess(text) for text in self.texts]

    def _preprocess(self, text):
        # Expand contractions
        text = fix(text)
        # Convert to lower case
        text = text.lower()
        # Replace underscores with spaces
        text = re.sub(r'[_]', ' ', text)
        # Removing characters that usually don't add meaning to a sentence
        text = re.sub(r"[^?$.-:()%@!&=+/><,a-zA-Z\s0-9\w]", '', text)
        # Changes multiple occurrences of these special characters to only one occurrence. For example '???' to '?'
        text = re.sub(r'([?.!#$%&()*+,-/:;_<=>@[^`|])\1+', r'\1', text)
        # Inserts a space before and after special characters so embeddings can catch them
        text = re.sub(r'([?.!#$%&()*+,-/:;_<=>@[^`|])', r' \1 ', text)
        # Removes extra spaces that may have come in from the previous operation
        text = re.sub(r'([\s])\1+', r'\1', text)
        # Tokenize
        tokens = self.tokenizer.tokenize(text)
        # Filter tokens not in GloVe vocab
        tokens = [token if token in self.glove_vocab else '<unk>' for token in tokens]
        return tokens

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        # TODO: Update this function based on the format required for the model
        tokens = self.processed_texts[idx]
        # Pad or truncate to max_length
        if len(tokens) < self.max_length:
            tokens += ['<pad>'] * (self.max_length - len(tokens))
        else:
            tokens = tokens[:self.max_length]
        # Convert tokens to indices
        indices = [self.glove_vocab[token] for token in tokens]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(indices, dtype=torch.long), label


In [4]:
def load_glove_vocab(filepath='../data/glove.6B/glove.6B.50d.txt'):
    glove_vocab = {}
    embeddings = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            values = line.split()
            word = values[0]
            vector = [float(x) for x in values[1:]]
            glove_vocab[word] = idx
            embeddings.append(vector)
    glove_vocab['<pad>'] = len(embeddings)
    embeddings.append([0.0] * len(vector))
    glove_vocab['<unk>'] = len(embeddings)
    embeddings.append([1.0] * len(vector))
    return glove_vocab, torch.tensor(embeddings, dtype=torch.float)

In [5]:
glove_vocab, glove_embeddings = load_glove_vocab('/kaggle/input/gloveembed/glove.6B.100d.txt') #TODO: Tune for optimal distance (50, 100, 200, 300)

In [6]:
df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
df = df.dropna(subset=['comment_text'])
df['target'] = df['target'].round(0).astype(int)
# df = df[:int(0.5*len(df))]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['comment_text'], df['target'], test_size=0.1, random_state=42)

In [8]:
train_dataset = JigsawDataset(X_train, y_train, glove_vocab, max_length=220)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [9]:
test_dataset = JigsawDataset(X_test, y_test, glove_vocab, max_length=220)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [10]:
for batch in test_loader:
    inputs, labels = batch
    print("Input shape:", inputs)
    print("Label shape:", labels)
    break

Input shape: tensor([[  3124,      1,   6380,  ..., 400000, 400000, 400000],
        [  2747,      2,     41,  ..., 400000, 400000, 400000],
        [  2970,     25,      0,  ..., 400000, 400000, 400000],
        ...,
        [  8198,    285,    188,  ..., 400000, 400000, 400000],
        [  4832,  16201,    188,  ..., 400000, 400000, 400000],
        [ 13408, 139150,     14,  ..., 400000, 400000, 400000]])
Label shape: tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


### Model

In [11]:
class ToxicityClassifierLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, num_layers, dropout=None):
        super(ToxicityClassifierLSTM, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if dropout is not None else 0)
        self.lstm_layer_1 = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.lstm_layer_2 = nn.LSTM(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        self.linear1 = nn.Linear(4 * hidden_dim, 4 * hidden_dim)
        self.linear2 = nn.Linear(4 * hidden_dim, 4 * hidden_dim)
        self.fc = nn.Linear(4 * hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        
        lstm_out, _ = self.lstm_layer_1(embedded)
        lstm_out, _ = self.lstm_layer_2(lstm_out)
        
        # Global average pooling
        avg_pool = torch.mean(lstm_out, 1)
        # Global max pooling
        max_pool, _ = torch.max(lstm_out, 1)
        
        pool_out = torch.cat((max_pool, avg_pool), 1)
        linear1_out  = F.relu(self.linear1(pool_out))
        linear2_out  = F.relu(self.linear2(pool_out))
        
        final_hidden_state = pool_out + linear1_out + linear2_out
        
        return self.fc(final_hidden_state)

In [12]:
def train_step(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    all_preds = []
    all_labels = []

    for inputs, labels in tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        predictions = model(inputs).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        all_preds.extend((predictions > 0.5).int().cpu().numpy())
        all_labels.extend(labels.int().cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    return epoch_loss / len(train_loader), accuracy


def evaluate_step(model, test_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            predictions = model(inputs).squeeze(1)
            loss = criterion(predictions, labels)

            epoch_loss += loss.item()
            all_preds.extend((predictions > 0.5).int().cpu().numpy())
            all_labels.extend(labels.int().cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    return epoch_loss / len(test_loader), accuracy


In [13]:
def train(train_loader, test_loader, glove_vocab, glove_embeddings):
    # Hyperparameters
    HIDDEN_DIM = 128
    OUTPUT_DIM = 1
    NUM_LAYERS = 2
    PAD_IDX = glove_vocab['<pad>']
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize model
    model = ToxicityClassifierLSTM(
        embedding_matrix=glove_embeddings,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        num_layers=NUM_LAYERS,
    )
    model.to(DEVICE)

    # Copy pre-trained embeddings to the model's embedding layer
    # pretrained_embeddings = torch.stack(list(glove_vocab.values()))
    # model.embedding.weight.data.copy_(pretrained_embeddings)

    # Optimizer and Loss
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss(reduction='mean') #nn.BCELoss()

    # Train and evaluate
    EPOCHS = 3
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}")
        train_loss, train_acc = train_step(model, train_loader, optimizer, criterion, DEVICE)
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
        test_loss, test_acc = evaluate_step(model, test_loader, criterion, DEVICE)
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

    return model

In [14]:
model = train(train_loader, test_loader, glove_vocab, glove_embeddings)

Epoch 1


100%|██████████| 50762/50762 [15:27<00:00, 54.71it/s]


Train Loss: 0.1000, Train Accuracy: 0.9620


100%|██████████| 5641/5641 [00:47<00:00, 119.86it/s]


Test Loss: 0.0919, Test Accuracy: 0.9621
Epoch 2


100%|██████████| 50762/50762 [15:27<00:00, 54.71it/s]


Train Loss: 0.0882, Train Accuracy: 0.9654


100%|██████████| 5641/5641 [00:46<00:00, 120.77it/s]


Test Loss: 0.0875, Test Accuracy: 0.9656
Epoch 3


100%|██████████| 50762/50762 [15:29<00:00, 54.64it/s]


Train Loss: 0.0841, Train Accuracy: 0.9667


100%|██████████| 5641/5641 [00:46<00:00, 120.37it/s]


Test Loss: 0.0928, Test Accuracy: 0.9663


In [19]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
    
def predict(model, sentence, max_length=220):
    tokens = test_dataset._preprocess(text=sentence)
    if len(tokens) < max_length:
        tokens += ['<pad>'] * (max_length - len(tokens))
    else:
        tokens = tokens[:max_length]
    indices = torch.tensor([glove_vocab[token] for token in tokens], dtype=torch.long)
    inputs = indices.to(DEVICE)
    predictions = model(inputs.unsqueeze_(0)).squeeze(1)
    
    return sigmoid(predictions.detach().cpu().numpy())

In [61]:
sentence = ""
predict(model, sentence)

array([0.04150735], dtype=float32)

In [16]:
torch.save(model.state_dict(), 'LSTM_1.pth') 