In [1]:
import pandas as pd

In [2]:
# Load the tweet dataset from CSV into a DataFrame and preview its first rows.
df = pd.read_csv("data/toxic_tweets.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [3]:
# small amount of data exploration
# preprocessing pipeline

In [4]:
# (rows, columns) of the loaded frame
df.shape

(56745, 3)

In [5]:
# Row count per Toxicity label, to see how balanced the two classes are
df.groupby("Toxicity").size()

Toxicity
0    32592
1    24153
dtype: int64

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer

# A single checkpoint name drives both loads, so the pre-trained encoder and
# the tokenizer always come from the same checkpoint.
model_name = "bert-base-uncased"
bert_model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_data(texts, tokenizer, max_length=512):
    """Tokenize ``texts`` and return ``(input_ids, attention_mask)``.

    Args:
        texts: Text input handed to ``tokenizer`` unchanged.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)`` whose
            result supports lookup by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        A 2-tuple: the ``'input_ids'`` entry followed by the
        ``'attention_mask'`` entry of the tokenizer result.
    """
    # Padding and truncation are both switched on, and PyTorch ("pt")
    # tensors are requested from the tokenizer.
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded = tokenizer(texts, **tokenizer_options)
    token_ids = encoded['input_ids']
    mask = encoded['attention_mask']
    return token_ids, mask

# Materialize the tweet column as a plain Python list and tokenize it in one call.
texts = list(df["tweet"])
input_ids, attention_mask = tokenize_data(texts, tokenizer)

In [8]:
# 80/20 train/validation split; token ids, attention masks and labels are
# passed to the same call so the three arrays are partitioned together.
(
    train_input_ids,
    val_input_ids,
    train_attention_mask,
    val_attention_mask,
    train_labels,
    val_labels,
) = train_test_split(
    input_ids,
    attention_mask,
    df["Toxicity"],
    test_size=0.2,
    random_state=42,
)

In [9]:
class BERTBinaryClassifier(nn.Module):
    """Binary classification head on top of a BERT encoder.

    The [CLS] position of the encoder's last hidden state goes through
    dropout, a single-unit linear layer and a sigmoid, producing one
    probability per input sequence.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` / ``attention_mask`` keywords,
            return an object with a ``last_hidden_state`` attribute.
        dropout: Drop probability applied to the [CLS] vector.
    """

    def __init__(self, bert_model, dropout=0.1):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(p=dropout)
        hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid probabilities for a batch of tokenized sequences.

        The result keeps a trailing dimension of size 1 (the linear layer has
        a single output unit).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 along the sequence axis is the [CLS] token.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        logits = self.linear(self.dropout(cls_vector))
        return self.sigmoid(logits)

In [None]:
# Instantiate model, loss and optimizer.
# Use a GPU when one is visible; everything below also runs on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)  # reproducible shuffling and dropout

model = BERTBinaryClassifier(bert_model).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# nn.BCELoss needs float targets shaped like the model output, (N, 1).
# train_test_split returns the labels as a pandas Series, so convert them once.
train_targets = torch.as_tensor(
    train_labels.to_numpy(), dtype=torch.float32
).unsqueeze(1)

# Pushing the whole training split through BERT in one forward pass is not
# feasible memory-wise, so iterate over shuffled mini-batches instead.
# as_tensor is a no-op for tensors and also accepts arrays, in case the split
# handed back NumPy arrays.
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(
    torch.as_tensor(train_input_ids),
    torch.as_tensor(train_attention_mask),
    train_targets,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

# Training loop
epochs = 2
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for batch_input_ids, batch_attention_mask, batch_targets in train_loader:
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_targets = batch_targets.to(device)

        # Gradients accumulate by default, so reset them for every batch.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_input_ids, batch_attention_mask)

        # Compute loss
        loss = criterion(outputs, batch_targets)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += loss.item() * batch_input_ids.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss}")