# DistilBERT Experiment with PyTorch (GPU)
This notebook demonstrates fine-tuning DistilBERT for comment moderation using PyTorch and GPU acceleration.

In [1]:
#!pip install transformers[torch]

In [2]:
import os
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import f1_score, classification_report
import re
import unicodedata
import html

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def text_preprocess(text):
    # Unicode normalization
    text = unicodedata.normalize('NFKC', str(text))
    # Remove HTML tags/entities
    text = html.unescape(re.sub(r'<.*?>', '', text))
    # Remove headers/footers and wiki markup
    text = re.sub(r'=+.*?=+', ' ', text)  # Remove headers like == Reason ==
    text = re.sub(r'REDIRECT\s+\S+', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'WP:\w+', ' ', text)  # Remove WP:SOAPBOX etc.
    # Replace usernames/IPs with [USER]
    text = re.sub(r'\b[A-Z][a-z]+[A-Z][a-z]+\b', '[USER]', text)  # CamelCase usernames
    text = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '[IP]', text)  # IP addresses
    # Remove non-alphanumeric (keep basic punctuation)
    text = re.sub(r'[^\w\s.,!?\'\"]+', ' ', text)
    # Standardize repeated punctuation
    text = re.sub(r'([!?.,])\1+', r'\1', text)
    # Lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
# Load train and test data
train_df = pd.read_csv('../data/comments_train.csv')
test_df = pd.read_csv('../data/comments_test.csv')
print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')
train_df.head()

Train shape: (159571, 9)
Test shape: (63978, 9)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,moderation_label
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [5]:
# Apply preprocessing to train and test data
for df in [train_df, test_df]:
    df['comment_text'] = df['comment_text'].apply(lambda x: text_preprocess(x))

In [6]:
# Tokenize data and prepare datasets for Hugging Face Trainer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch['comment_text'], truncation=True, padding='max_length', max_length=128)

train_ds = Dataset.from_pandas(train_df[['comment_text', 'moderation_label']])
test_ds = Dataset.from_pandas(test_df[['comment_text', 'moderation_label']])
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)
train_ds = train_ds.rename_column('moderation_label', 'labels')
test_ds = test_ds.rename_column('moderation_label', 'labels')
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
print(train_ds[0])

Map: 100%|██████████| 159571/159571 [00:14<00:00, 11113.38 examples/s]
Map: 100%|██████████| 63978/63978 [00:05<00:00, 10771.45 examples/s]

{'labels': tensor(0), 'input_ids': tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
        18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
         1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
         3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
         1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
         1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
        12997,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 




In [7]:
# Confirm that PyTorch and Transformers are using the GPU
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU name:', torch.cuda.get_device_name(0))
    print('Current device:', torch.cuda.current_device())
    print('Device count:', torch.cuda.device_count())
    print('PyTorch default tensor type:', torch.tensor([1.0]).device)
else:
    print('WARNING: Training will run on CPU. Install CUDA-enabled PyTorch for GPU support.')

CUDA available: True
GPU name: NVIDIA GeForce RTX 4060 Laptop GPU
Current device: 0
Device count: 1
PyTorch default tensor type: cpu


In [8]:
# Set device for GPU usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    print('GPU name:', torch.cuda.get_device_name(0))
else:
    print('WARNING: Training will run on CPU. Install CUDA-enabled PyTorch for GPU support.')

Using device: cuda
GPU name: NVIDIA GeForce RTX 4060 Laptop GPU


In [9]:
# Define and train DistilBERT model (force model to GPU if available)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    output_dir='./distilbert_results',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=1000,
    fp16=True,  # Enable mixed precision
    dataloader_num_workers=4,  # Use more workers for faster loading
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    f1 = f1_score(labels, preds, average='weighted')
    return {'f1': f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1000,0.1
2000,0.0651
3000,0.0468
4000,0.0291
5000,0.0167
6000,0.0097
7000,0.0069
8000,0.0055
9000,0.0038
10000,0.0033


TrainOutput(global_step=12470, training_loss=0.02332791756799724, metrics={'train_runtime': 3574.7445, 'train_samples_per_second': 446.384, 'train_steps_per_second': 3.488, 'total_flos': 5.284488817734144e+16, 'train_loss': 0.02332791756799724, 'epoch': 10.0})

In [10]:
# Evaluate model
preds = trainer.predict(test_ds).predictions.argmax(axis=-1)
y_test = test_df['moderation_label'].astype(int)
f1_w = f1_score(y_test, preds, average='weighted')
f1_m = f1_score(y_test, preds, average='macro')
print(f'DistilBERT F1 weighted: {f1_w:.4f} | F1 macro: {f1_m:.4f}')
print(classification_report(y_test, preds))

DistilBERT F1 weighted: 0.9243 | F1 macro: 0.8097
              precision    recall  f1-score   support

           0       0.98      0.92      0.95     57735
           1       0.54      0.86      0.67      6243

    accuracy                           0.92     63978
   macro avg       0.76      0.89      0.81     63978
weighted avg       0.94      0.92      0.92     63978



In [11]:
# Save model and tokenizer locally (optional)
from datetime import datetime
ts = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
local_model_path = f'distilbert_model_{ts}'
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)
print(f'Model and tokenizer saved to {local_model_path}')

Model and tokenizer saved to distilbert_model_20250813_165325
