In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW 
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm

# ✅ Step 1: Load Dataset
# The archive is read directly; pandas infers the compression from the
# file extension.
df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

# One binary target column per toxicity category. The order chosen here
# fixes the column order of `y` (and therefore of the model outputs).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = df['comment_text'].values
y = df[label_columns].values

# ✅ Step 2: Train-Test Split
# Hold out 10% of the rows for validation; the fixed random_state keeps
# the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# ✅ Step 3: Tokenizer
pretrained_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_name)

# ✅ Step 4: Dataset Class
class ToxicDataset(Dataset):
    """Map-style dataset that pairs comment texts with multi-label targets.

    Items are tokenized lazily, one at a time, when indexed. Every item is
    truncated or padded to exactly ``max_len`` tokens so that the default
    DataLoader collation can stack items without a custom collate function.

    Args:
        texts: Indexable, sized collection of comment strings.
        labels: Indexable, sized collection aligned with ``texts``; entry
            ``i`` holds the target values for ``texts[i]``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
            It must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that carry a leading batch
            dimension of size 1.
        max_len: Fixed token length every item is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch (too few labels) or go unnoticed entirely
        # (too many labels).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its model inputs and targets.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs with the leading batch dimension removed) and
            ``'labels'`` (a float tensor built from ``labels[idx]``).
        """
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a batch of one; drop that axis so
            # the DataLoader can add its own batch dimension.
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            # Labels are emitted as floats rather than integer class ids.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# ✅ Step 5: Prepare Dataloaders
# Seed torch's global RNG before anything draws from it: the shuffled
# training order and the freshly initialised classification head are both
# random. Same seed value as the train/validation split. (GPU kernels may
# still introduce small run-to-run differences.)
torch.manual_seed(42)

train_dataset = ToxicDataset(X_train, y_train, tokenizer)
val_dataset = ToxicDataset(X_val, y_val, tokenizer)

# Only the training data is shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# ✅ Step 6: Load Model on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    # One output logit per label column instead of a hard-coded count.
    num_labels=y_train.shape[1],
    # Each comment can carry several labels at once. Stating the problem
    # type explicitly selects the multi-label loss instead of leaving the
    # choice to be inferred from the dtype of the labels at the first
    # forward pass.
    problem_type="multi_label_classification",
)
model.to(device)

# ✅ Step 7: Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# ✅ Step 8: Training Loop
EPOCHS = 3

# Training mode is switched on once up front; nothing inside the loop
# changes it.
model.train()

for epoch in range(EPOCHS):
    running_loss = 0
    progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # The batch keys ('input_ids', 'attention_mask', 'labels') match
        # the model's keyword arguments, so the whole dict is moved to the
        # device and forwarded as-is.
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        # Passing labels makes the model compute the loss itself.
        step_loss = model(**on_device).loss
        running_loss += step_loss.item()

        step_loss.backward()
        optimizer.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

    # Mean of the per-batch losses over the epoch.
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss}")

# ✅ Step 9: Evaluation
model.eval()
predictions, true_labels = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets never go to the device; they are only compared on the CPU.
        labels = batch['labels'].cpu().numpy()

        # No labels are passed, so only logits are produced (no loss).
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Sigmoid is applied per label, giving an independent probability
        # for every category rather than a distribution across categories.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()

        # extend() on a 2-D array appends one row per sample.
        predictions.extend(probs)
        true_labels.extend(labels)

# ✅ Step 10: Threshold and Metrics
# Explicit names in the same order as the columns used to build `y`,
# instead of relying on the positional layout of the CSV columns.
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

pred_labels = (np.array(predictions) > 0.5).astype(int)
# The dataset emits float targets; cast back to ints so both inputs to the
# report are plain 0/1 indicator matrices.
true_label_matrix = np.array(true_labels).astype(int)

# zero_division=0 reports 0 for metrics that are undefined (e.g. samples
# with no true or no predicted labels) without emitting a warning each time.
print(classification_report(
    true_label_matrix,
    pred_labels,
    target_names=label_names,
    zero_division=0,
))

# ✅ Step 11: Save Model
# Model weights and tokenizer files go into the same directory. The
# tokenizer call stays last so its return value (the written file paths)
# remains the cell's displayed output.
output_dir = "./roberta-toxic"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 8976/8976 [30:34<00:00,  4.89it/s]


Epoch 1 Loss: 0.04964620855998436


Training Epoch 2: 100%|██████████| 8976/8976 [30:36<00:00,  4.89it/s]


Epoch 2 Loss: 0.03796635908143481


Training Epoch 3: 100%|██████████| 8976/8976 [30:34<00:00,  4.89it/s]


Epoch 3 Loss: 0.032745070292305736


Evaluating: 100%|██████████| 998/998 [01:07<00:00, 14.82it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

        toxic       0.84      0.83      0.83      1480
 severe_toxic       0.56      0.30      0.39       148
      obscene       0.81      0.84      0.82       836
       threat       0.38      0.65      0.48        37
       insult       0.71      0.84      0.77       791
identity_hate       0.68      0.48      0.56       147

    micro avg       0.78      0.79      0.79      3439
    macro avg       0.66      0.66      0.64      3439
 weighted avg       0.78      0.79      0.78      3439
  samples avg       0.07      0.07      0.07      3439



('./roberta-toxic/tokenizer_config.json',
 './roberta-toxic/special_tokens_map.json',
 './roberta-toxic/vocab.json',
 './roberta-toxic/merges.txt',
 './roberta-toxic/added_tokens.json')