In [1]:
import os
import sys

# Detect if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab. Setting up virtual environment from project repository...")

    # Mount Google Drive for persistent storage
    from google.colab import drive
    drive.mount('/content/drive')

    # Create a project directory in Google Drive
    PROJECT_DIR = '/content/drive/MyDrive/toxy-bot'
    os.makedirs(PROJECT_DIR, exist_ok=True)
    os.chdir(PROJECT_DIR)

    # Install uv package manager if not already installed
    !pip install uv

    # Clone the repository if not already cloned
    if not os.path.exists('pyproject.toml'):
        !git clone https://github.com/anitamaxvim/toxy-bot.git .

    # Verify the files exist
    if os.path.exists('pyproject.toml'):
        print("Found pyproject.toml file.")
    else:
        print("Warning: pyproject.toml not found. Virtual environment setup may be incomplete.")

    if os.path.exists('uv.lock'):
        print("Found uv.lock file.")
    else:
        print("Warning: uv.lock file not found. Will rely on pyproject.toml for dependencies.")

    # Create and activate virtual environment
    !uv venv

    # Install dependencies from pyproject.toml (and uv.lock if available)
    !uv pip install -e .

    # Check installation
    !uv pip list

    print("\nVirtual environment setup complete. You can now import your project packages.")
    print(f"\nProject directory: {PROJECT_DIR}")
else:
    print("Not running in Google Colab. Using local environment.")

Not running in Google Colab. Using local environment.


In [2]:
from toxy_bot.ml.config import Config, DataModuleConfig
from toxy_bot.ml.trainer import train

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pull the directory settings from the project Config into notebook-level
# names so later cells can refer to them directly.
cache_dir, log_dir, ckpt_dir, perf_dir = (
    Config.cache_dir,
    Config.log_dir,
    Config.ckpt_dir,
    Config.perf_dir,
)

# Show the resolved locations, one per line.
print(
    f"Cache dir: {cache_dir}",
    f"Log dir: {log_dir}",
    f"Checkpoint dir: {ckpt_dir}",
    f"Perf dir: {perf_dir}",
    sep="\n",
)

Cache dir: /Users/dbozbay/developer/toxy-bot/data
Log dir: /Users/dbozbay/developer/toxy-bot/logs
Checkpoint dir: /Users/dbozbay/developer/toxy-bot/checkpoints
Perf dir: /Users/dbozbay/developer/toxy-bot/logs/perf


In [4]:
# Read the dataset identifier and the train-size setting from the project's
# DataModuleConfig and display them for reference.
dataset_name = DataModuleConfig.dataset_name
train_size = DataModuleConfig.train_size

# Label previously read "Datastet" (typo); corrected to "Dataset".
print(f"Dataset: {dataset_name}")
print(f"Train size: {train_size}")

Datastet: anitamaxvim/jigsaw-toxic-comments
Train size: 0.85


In [5]:
# Hugging Face checkpoint names to train, one run per entry.
models = [
    "google/bert_uncased_L-4_H-512_A-8",    # BertSmall (29M)
    "google/bert_uncased_L-8_H-512_A-8",    # BertMedium (42M)
    "google/bert_uncased_L-12_H-768_A-12",  # BertBase (108M)
]

# Hyperparameters shared by every training run (forwarded to train()).
MAX_EPOCHS = 5
LR = 3e-5
BATCH_SIZE = 64
MAX_LENGTH = 256

# Enable performance monitoring during training.
PERF = True

In [None]:
# Settings that are identical for every run; only the checkpoint name varies.
shared_settings = dict(
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    lr=LR,
    max_epochs=MAX_EPOCHS,
    perf=PERF,
)

# Train each checkpoint in the order it appears in `models`.
for model_name in models:
    train(model_name=model_name, **shared_settings)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-512_A-8 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/Users/dbozbay/developer/toxy-bot/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/accelerator_connector.py:513: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Seed set to 42
[2025-03-31 14:22:05.141561] Dataset anitamaxvim/jigsaw-toxic-comments exists in cache. Loading from cache.
Map: 100%|██████████| 135635/135635 [01:10<00:00, 1929.00 examples/s]
Map: 100%|██████████| 23936/23936 [00:12<00:00, 1872.85 exa

Epoch 0:   0%|          | 0/2120 [00:00<?, ?it/s] 