In [1]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

import schedulefree

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Net(nn.Module):
    """Small convolutional classifier producing 10-way log-probabilities.

    Two 3x3 convolutions (1 -> 32 -> 64 channels, stride 1, no padding) are
    followed by a 2x2 max-pool, dropout, and two fully connected layers
    (9216 -> 128 -> 10). The 9216 input features of ``fc1`` equal
    64 * 12 * 12, i.e. the flattened feature map obtained from a
    single-channel 28x28 input.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order as they appear in
        # state_dict(): conv1, conv2, (dropouts carry no weights), fc1, fc2.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``.

        Args:
            x: image batch; the channel count must be 1 and the spatial size
                must flatten to 9216 features after the conv/pool stage.
        """
        # Convolutional feature extractor: conv -> ReLU -> conv -> ReLU.
        feature_maps = F.relu(self.conv2(F.relu(self.conv1(x))))
        # Halve the spatial resolution, then regularise with dropout.
        pooled = self.dropout1(F.max_pool2d(feature_maps, 2))
        # Keep the batch dimension and flatten everything else for the MLP head.
        hidden = self.dropout2(F.relu(self.fc1(torch.flatten(pooled, 1))))
        # Log-softmax over the class dimension, suitable for F.nll_loss.
        return F.log_softmax(self.fc2(hidden), dim=1)


In [3]:
# Preprocessing applied to every image: convert it to a tensor, then normalise
# the single channel with mean 0.1307 and std 0.3081 (presumably the MNIST
# training-set statistics -- confirm if the dataset changes).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# Training split, fetched into /data when it is not already present.
dataset1 = datasets.MNIST(root="/data", train=True, transform=transform, download=True)
# Held-out split; no download is requested for this one.
dataset2 = datasets.MNIST(root="/data", train=False, transform=transform)

# Shuffled mini-batches of 64 for optimisation; the trailing incomplete batch
# is dropped so every step sees a full batch.
train_loader = torch.utils.data.DataLoader(
    dataset1,
    batch_size=64,
    shuffle=True,
    drop_last=True,
    num_workers=6,
    pin_memory=True,
)

# Large, ordered batches for evaluation; every sample is kept.
test_loader = torch.utils.data.DataLoader(
    dataset2,
    batch_size=1000,
    shuffle=False,
    drop_last=False,
    num_workers=6,
    pin_memory=True,
)

In [4]:
import torchmetrics as M
import pytorch_lightning as pl

class test_nosched(pl.LightningModule):
    """Lightning module that trains ``network`` with schedule-free AdamW.

    The optimizer returned by ``configure_optimizers`` is a
    ``schedulefree.AdamWScheduleFree`` instance, which exposes ``train()`` and
    ``eval()`` methods of its own. The hooks below switch it to train mode at
    the start of every training epoch and to eval mode whenever validation
    starts.

    NOTE(review): nothing here switches the optimizer back to train mode after
    a validation run that happens in the middle of an epoch (e.g. with a
    fractional ``val_check_interval``) -- confirm this module is only used
    with end-of-epoch validation.

    Args:
        network: the ``nn.Module`` to optimise. ``validation_step`` and
            ``training_step`` pass its output to ``F.nll_loss``, so it is
            expected to return log-probabilities.
        *args: accepted for call-site compatibility and ignored.
        **kwargs: accepted for call-site compatibility and ignored.
    """

    def __init__(
        self,
        network,
        *args,
        **kwargs
    ):
        super().__init__()
        self.network = network
        # Running multiclass accuracy over 10 classes, accumulated across
        # validation batches and reset at the end of each validation epoch.
        self.acc = M.Accuracy(task='multiclass', num_classes=10)

    def configure_optimizers(self):
        """Return a schedule-free AdamW optimizer over all parameters (lr=0.0025)."""
        return schedulefree.AdamWScheduleFree(self.parameters(), lr=0.0025)

    def on_train_epoch_start(self):
        """Put the optimizer in train mode before the epoch's first step."""
        print('\n opt train')
        self.optimizers().train()

    def on_validation_start(self):
        """Put the optimizer in eval mode before any validation batch runs."""
        print('\n opt eval')
        self.optimizers().eval()

    def forward(self, inputs):
        """Delegate to the wrapped network."""
        return self.network(inputs)

    def training_step(self, batch, batch_idx):
        """Return the negative log-likelihood loss for one training batch."""
        inputs, targets = batch
        outputs = self.forward(inputs)
        return F.nll_loss(outputs, targets)

    def validation_step(self, batch, batch_idx):
        """Log the batch loss and accumulate accuracy for one validation batch."""
        inputs, targets = batch
        outputs = self.forward(inputs)
        # Previously the loss was computed and then dropped; log it so the
        # validation loss is actually recorded alongside the accuracy.
        loss = F.nll_loss(outputs, targets)
        self.log('val_loss', loss)
        # update() only accumulates state (it returns None), so its result is
        # not bound to a name; the value is read in on_validation_epoch_end.
        self.acc.update(outputs, targets)

    def on_validation_epoch_end(self):
        """Print the accuracy accumulated over the epoch, then reset the metric."""
        acc = self.acc.compute()
        print("\nAcc: ", acc)
        self.acc.reset()
        

In [5]:
from dl_toolbox.callbacks import ProgressBar

# Single-GPU trainer: up to 20 epochs over the complete train and validation
# loaders (both limit_* fractions are 1.0), reporting through the dl_toolbox
# progress-bar callback.
trainer = pl.Trainer(
    max_epochs=20,
    accelerator="gpu",
    devices=1,
    limit_train_batches=1.0,
    limit_val_batches=1.0,
    callbacks=[ProgressBar()],
)

# Freshly initialised CNN wrapped in the schedule-free Lightning module.
network = Net()
module = test_nosched(network)

# Optimise on the training loader; the test loader doubles as the validation set.
trainer.fit(module, train_dataloaders=train_loader, val_dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | loss    | NLLLoss            | 0     
1 | network | Net                | 1.2 M 
2 | acc     | MulticlassAccuracy | 0     
-----------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.800     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]
 opt eval
Sanity Checking DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.88it/s]
Acc:  tensor(0.1270, device='cuda:0')
Epoch 0:   0%|                                                                                                  | 0/937 [00:00<?, ?it/s]
 opt train
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████| 937/937 [00:05<00:00, 170.97it/s, v_num=7]
Validation: 0it [00:00, ?it/s][A
 opt eval

Acc:  tensor(0.9873, device='cuda:0')
Epoch 1:   0%|                                                                                         | 0/937 [00:00<?, ?it/s, v_num=7]
 opt train
Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████| 937/937 [00:05<00:00, 170.74it/s, v_num=7]
Validation: 0it [00:00, ?it/s][A
 opt eval

Acc:  tensor(0.9894, device='cuda:0')
Epoch 2:   0%|                             

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
Exception in thread Thread-42:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/d/pfournie/dl_toolbox/venv/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 53, in _pin_memory_loop
    do_one_step()
  File "/d/pfournie/dl_toolbox/venv/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 30, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "/d/pfournie/dl_toolbox/venv/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 495, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.8/multiprocessing/resource_sharer.py",