In [1]:
import os
import sys

# Detect if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab. Setting up project environment...")

    # Mount Google Drive for persistent storage
    from google.colab import drive
    drive.mount('/content/drive')

    # Create a project directory in Google Drive
    PROJECT_DIR = '/content/drive/MyDrive/toxy-bot'
    
    # Remove existing directory if it exists
    if os.path.exists(PROJECT_DIR):
        print("Removing existing toxy-bot directory...")
        !rm -rf {PROJECT_DIR}
    
    os.makedirs(PROJECT_DIR, exist_ok=True)
    os.chdir(PROJECT_DIR)

    # Clone the repository
    print("Cloning latest version from GitHub...")
    !git clone https://github.com/anitamaxvim/toxy-bot.git .

    # Verify the version
    print("\nVerifying package version:")
    !cat pyproject.toml | grep version

    # Install project dependencies
    print("\nInstalling dependencies...")
    !pip install -e .

    print(f"\nProject directory: {PROJECT_DIR}")
else:
    print("Not running in Google Colab. Using local environment.")

Not running in Google Colab. Using local environment.


In [2]:
from toxy_bot.ml.config import Config, DataModuleConfig
from toxy_bot.ml.trainer import train

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pull the project's working directories off the shared Config object.
cache_dir, log_dir = Config.cache_dir, Config.log_dir
ckpt_dir, perf_dir = Config.ckpt_dir, Config.perf_dir

# Report each location on its own line.
print(
    f"Cache dir: {cache_dir}",
    f"Log dir: {log_dir}",
    f"Checkpoint dir: {ckpt_dir}",
    f"Perf dir: {perf_dir}",
    sep="\n",
)

Cache dir: /Users/dbozbay/developer/toxy-bot/data
Log dir: /Users/dbozbay/developer/toxy-bot/logs
Checkpoint dir: /Users/dbozbay/developer/toxy-bot/checkpoints
Perf dir: /Users/dbozbay/developer/toxy-bot/logs/perf


In [4]:
# Read the dataset identifier and the training-split fraction from the
# data-module configuration so the run's data setup is visible up front.
dataset_name = DataModuleConfig.dataset_name
train_size = DataModuleConfig.train_size

# Label fixed: previously misspelled as "Datastet".
print(f"Dataset: {dataset_name}")
print(f"Train size: {train_size}")

Datastet: anitamaxvim/jigsaw-toxic-comments
Train size: 0.85


In [5]:
# Training hyperparameters, applied unchanged to every model listed below
MAX_LENGTH = 256
BATCH_SIZE = 64
MAX_EPOCHS = 5
LR = 3e-5

# Turn on performance monitoring
PERF = True

# Model checkpoints to train, ordered from smallest to largest
models = [
    # BertSmall (29M)
    "google/bert_uncased_L-4_H-512_A-8",
    # BertMedium (42M)
    "google/bert_uncased_L-8_H-512_A-8",
    # BertBase (108M)
    "google/bert_uncased_L-12_H-768_A-12",
]

In [None]:
# Settings shared by every run; only the model name varies between runs.
shared_train_kwargs = dict(
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    lr=LR,
    max_epochs=MAX_EPOCHS,
    perf=PERF,
)

# Train each listed model in turn, one after another.
for checkpoint_name in models:
    train(model_name=checkpoint_name, **shared_train_kwargs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-512_A-8 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/Users/dbozbay/developer/toxy-bot/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/accelerator_connector.py:513: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Seed set to 42
[2025-03-31 14:39:47.483847] Dataset anitamaxvim/jigsaw-toxic-comments exists in cache. Loading from cache.
Map: 100%|██████████| 135635/135635 [01:22<00:00, 1652.89 examples/s]
Map: 100%|██████████| 23936/23936 [00:14<00:00, 1625.35 exa

Epoch 0:   0%|          | 0/2120 [00:00<?, ?it/s] 