In [None]:
import pandas as pd

# Load the training data (path assumes Google Drive is mounted under /content/drive)
df = pd.read_csv("/content/drive/My Drive/DL/train.csv")

# Show first 5 rows
print("First 5 rows:")
display(df.head())

# Show columns
print("\nColumns in the dataset:")
print(df.columns)

# Show dataset info
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() used to append a stray "None" line.
df.info()

# Number of rows
print("\nNumber of rows:", len(df))

In [None]:
# Collapse the six per-category columns into a single target using the
# row-wise maximum (1 as soon as any category is 1, assuming 0/1 flags).
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
df["toxic_binary"] = df[label_columns].max(axis=1)

# Preview the comment text next to the derived label
df[["comment_text", "toxic_binary"]].head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
comments = df["comment_text"].values
targets = df["toxic_binary"].values

train_texts, test_texts, train_labels, test_labels = train_test_split(
    comments, targets, test_size=0.2, random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Testing samples: {len(test_texts)}")

In [None]:
from transformers import BertTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sanity check on a single sentence: split into tokens, then map to vocabulary ids
test_text = "You are an idiot."
tokens = tokenizer.tokenize(test_text)
ids = tokenizer.convert_tokens_to_ids(tokens)

for caption, value in (("Tokens:", tokens), ("Token IDs:", ids)):
    print(caption, value)

In [None]:
import torch
from torch.utils.data import Dataset

class ToxicCommentsDataset(Dataset):
    """Dataset that tokenizes one comment per lookup.

    Every item is a dict with three entries:
      * ``input_ids``      - tokenizer output flattened to 1-D
      * ``attention_mask`` - tokenizer output flattened to 1-D
      * ``labels``         - the matching label as a ``torch.long`` tensor

    Args:
        texts: Indexable collection of comments.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_len: Length every encoded comment is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One item per comment
        return len(self.texts)

    def __getitem__(self, idx):
        # Coerce to str so non-string entries are still passed to the tokenizer as text
        comment = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            comment,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; collapse it to 1-D
        item = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

In [None]:
# Build a throwaway dataset (default max_len) and inspect its first encoded item
preview_dataset = ToxicCommentsDataset(train_texts, train_labels, tokenizer)
sample = preview_dataset[0]
sample

In [None]:
from torch.utils.data import DataLoader

# Settings shared by both splits
MAX_LEN = 128
BATCH_SIZE = 16

# Wrap each split so comments are tokenized on access
train_dataset = ToxicCommentsDataset(train_texts, train_labels, tokenizer, max_len=MAX_LEN)
test_dataset = ToxicCommentsDataset(test_texts, test_labels, tokenizer, max_len=MAX_LEN)

# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

print(f"Train loader batches: {len(train_loader)}")
print(f"Test loader batches: {len(test_loader)}")

In [None]:
from transformers import BertForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "bert-base-uncased" with a 2-label classification head, placed on `device`
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
).to(device)

device

In [None]:
import torch
# Show which device this runtime selects (CUDA if available, else CPU)
torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
from torch.optim import AdamW
from tqdm import tqdm  # Progress bar

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 2  # You can increase to 3 if needed

model.train()

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    total_loss = 0.0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Move data to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss

        # Backpropagation
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float so no graph is kept alive across steps
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")

# Persist the fine-tuned model and its tokenizer. The inference section of this
# notebook reloads both from this exact directory, and no other cell writes it,
# so without this step a fresh "Restart & Run All" fails at the reload.
save_dir = "/content/drive/My Drive/DL/bert_toxic_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved fine-tuned model and tokenizer to:", save_dir)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

# Evaluation mode for the held-out pass
model.eval()

all_preds = []
all_labels = []

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)

        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Metrics on the collected labels and predictions
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

for metric_caption, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_caption, metric_value)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the test-set predictions
cm = confusion_matrix(all_labels, all_preds)

class_names = ["Non-Toxic", "Toxic"]

# Draw on an explicit Axes instead of the implicit pyplot figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

cm

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

import torch

# Directory both from_pretrained calls below read from; it must already
# contain a saved model and tokenizer.
model_path = "/content/drive/My Drive/DL/bert_toxic_model"

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replace the in-memory model and tokenizer with the ones stored on disk
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
import torch.nn.functional as F

def predict_toxicity(text, model=None, tokenizer=None, device=None, max_length=128):
    """Classify a single comment and report the confidence of the prediction.

    Args:
        text: The comment to classify.
        model: Classifier to call as ``model(input_ids, attention_mask=...)``;
            its output must expose ``.logits``. Defaults to the notebook-level
            ``model``.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``
            tensors. Defaults to the notebook-level ``tokenizer``.
        device: Device the encoded inputs are moved to. Defaults to the device
            of the model's first parameter, so inputs always land where the
            model actually lives (CPU if the model has no parameters).
        max_length: Length the comment is padded/truncated to.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is ``"Toxic"`` when
        class 1 has the highest probability and ``"Not Toxic"`` otherwise, and
        ``probability`` is the softmax probability of the predicted class.

    Raises:
        KeyError: If ``model``/``tokenizer`` is omitted and no notebook-level
            object of that name exists.
    """
    # The parameter names shadow the notebook globals, so the fallbacks are
    # looked up through globals() explicitly.
    notebook_ns = globals()
    if model is None:
        model = notebook_ns["model"]
    if tokenizer is None:
        tokenizer = notebook_ns["tokenizer"]
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Softmax over the class axis turns logits into probabilities
        probs = F.softmax(logits, dim=1)
        predicted_class = torch.argmax(probs, dim=1).item()
        probability = probs[0][predicted_class].item()

    label = "Toxic" if predicted_class == 1 else "Not Toxic"
    return label, probability

In [None]:
# Try the classifier on a few hand-written comments
example_comments = (
    "I hate you, you stupid idiot!",
    "You are an amazing person, have a great day!",
    "Shut up, you clown.",
    "I like like you, i hate you",
)

for comment in example_comments:
    print(predict_toxicity(comment))