In [21]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import re
import os
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
EPOCHS = 1
Data_dir="./jigsaw-dataset"
WORK_DIR = "../working/"
num_to_load=1000000                         #Train size to match time limit
valid_size= 100000                          #Validation Size
TOXICITY_COLUMN = 'target'

In [4]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [16]:
# Load the training data
train = pd.read_csv(os.path.join(Data_dir, "train.csv"))

# Clean the text data
train['comment_text'] = train['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)

# Add class labels (binary: 0 for non-toxic, 1 for toxic)
train['label'] = np.where(train['target'] >= 0.5, 1, 0)

# Retain only necessary columns
train = train[['id', 'comment_text', 'label']]

# Sample a 10% subset for demonstration
train_subset = train.sample(frac=0.1, random_state=42)

# Split into training and validation sets (80/20 split)
train_data, val_data = train_test_split(train_subset, test_size=0.2, stratify=train_subset['label'], random_state=42)

# Print data stats
print(f"Subset Training set size: {len(train_data)}")
print(f"Subset Validation set size: {len(val_data)}")

# Save the sampled training and validation sets
train_data.to_csv("train_subset.csv", index=False)
val_data.to_csv("val_subset.csv", index=False)

Subset Training set size: 144389
Subset Validation set size: 36098


In [18]:
# Drop rows with null comment_text
train_data = train_data.dropna(subset=['comment_text'])
val_data = val_data.dropna(subset=['comment_text'])

# Ensure all values are strings
train_data['comment_text'] = train_data['comment_text'].astype(str)
val_data['comment_text'] = val_data['comment_text'].astype(str)

# Check data again
print("Number of null values after cleaning:")
print("Train set:", train_data['comment_text'].isnull().sum())
print("Validation set:", val_data['comment_text'].isnull().sum())



Number of null values after cleaning:
Train set: 0
Validation set: 0


In [19]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize data function
def tokenize_data(data, text_column, label_column):
    encodings = tokenizer(
        list(data[text_column]),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    labels = data[label_column].values
    return encodings, labels

# Tokenize training and validation data
train_encodings, train_labels = tokenize_data(train_data, "comment_text", "label")
val_encodings, val_labels = tokenize_data(val_data, "comment_text", "label")


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

class ToxicCommentsDataset(Dataset):
    def __init__(self, ids, encodings, labels):
        self.ids = ids  # Store ids for reference
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Only return id for reference, not for the model
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Add labels for the model
        item['id'] = self.ids[idx]  # Keep 'id' for external use only
        return item


# Create datasets
train_dataset = ToxicCommentsDataset(train_data['id'].values, train_encodings, train_labels)
val_dataset = ToxicCommentsDataset(val_data['id'].values, val_encodings, val_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [None]:
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from tqdm import tqdm

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
epochs = 1
model.train()

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    loop = tqdm(train_loader, desc="Training", leave=True)
    for batch in loop:
        # Move inputs to the device
        batch = {k: v.to(device) for k, v in batch.items() if k != "id"}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        # Update tqdm with the current loss
        loop.set_postfix(loss=loss.item())



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/1


Training:  22%|██▏       | 2013/9025 [11:35<40:15,  2.90it/s, loss=0.666]  

In [None]:
from tqdm import tqdm
from sklearn.metrics import classification_report

# Evaluation loop
model.eval()
val_predictions, val_true_labels, val_ids = [], [], []  # Include val_ids to track original IDs

with torch.no_grad():
    loop = tqdm(val_loader, desc="Evaluating", leave=True)
    for batch in loop:
        # Move inputs to the device, exclude 'id'
        batch_inputs = {k: v.to(device) for k, v in batch.items() if k != "id"}
        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # Collect predictions, true labels, and IDs
        val_predictions.extend(preds.cpu().numpy())
        val_true_labels.extend(batch['labels'].cpu().numpy())
        val_ids.extend(batch['id'].cpu().numpy())  # Track the original IDs

# Generate classification report
print(classification_report(val_true_labels, val_predictions, target_names=["Non-Toxic", "Toxic"]))

# Add predictions and true labels to the validation DataFrame for analysis
val_data['id'] = val_ids
val_data['true_label'] = val_true_labels
val_data['predicted_label'] = val_predictions
val_data['predicted_label'] = val_data['predicted_label'].map({0: "non-toxic", 1: "toxic"})

# Save validation results
val_data.to_csv("val_with_predictions.csv", index=False)
print("Validation predictions saved to 'val_with_predictions.csv'.")
