# How can we convert our python code to pytorch-lightning

In [1]:
# import 

import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl

In [2]:
# this is a simple autoencoder model

class LitAutoEncoder(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
        self.decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        embedding = self.encoder(x)
        return embedding

    def training_step(self, batch, batch_idx):
        # training_step defined the train loop.
        # It is independent of forward
        x, y = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        # Logging to TensorBoard by default
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [3]:
# load the dataset

dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
train_loader = DataLoader(dataset)

## Debug the model 

Before we run our model at whole dataset. we should make sure our code is right. So we need to debug the code first.

In [5]:
# train the model, it is very easy.

# init model
autoencoder = LitAutoEncoder()

# most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
# trainer = pl.Trainer(accelerator="gpu", devices=8) (if you have GPUs)
trainer = pl.Trainer(fast_dev_run=True)
trainer.fit(model=autoencoder, train_dataloaders=train_loader)

# it will take 15 minutes to train the model in a CPU machine. 

  rank_zero_warn(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
`Trainer(limit_train_batches=1)` was configured so 1 batch per epoch will be used.
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_test_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_predict_batches=1)` was configured so 1 batch will be used.
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size 

Training: 0it [00:00, ?it/s]

## how can we set the epoch

In [17]:
autoencoder = LitAutoEncoder()

# most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
# trainer = pl.Trainer(accelerator="gpu", devices=8) (if you have GPUs)
trainer = pl.Trainer(max_epochs=3,limit_train_batches = 10,limit_val_batches=10)
trainer.fit(model=autoencoder, train_dataloaders=train_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

## how can we save and load the model

In [20]:
model = autoencoder.load_from_checkpoint("lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt")
print(model)


LitAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=784, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=3, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=784, bias=True)
  )
)


In [21]:
# retraining the model

trainer = pl.Trainer(max_epochs=3,limit_val_batches=10)
trainer.fit(model = model, train_dataloaders=train_loader)

# we can see that the training process is from epoch 0

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [24]:
# if we want to restore model, epoch and step, So we can go on training. 

trainer = pl.Trainer(max_epochs=10,limit_val_batches=10)
autoencoder = LitAutoEncoder()

trainer.fit(autoencoder, ckpt_path="lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt",train_dataloaders=train_loader)
# we can see that the model is training from epoch 2

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Restoring states from the checkpoint path at lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt
  rank_zero_warn(


Training: 10it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [25]:
# we can use another parameters

trainer = pl.Trainer(max_epochs=10,limit_val_batches=10,resume_from_checkpoint = "lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt")
autoencoder = LitAutoEncoder()
trainer.fit(model = autoencoder, train_dataloaders=train_loader)


  rank_zero_deprecation(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  ckpt_path = ckpt_path or self.resume_from_checkpoint
Restoring states from the checkpoint path at lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_6/checkpoints/epoch=2-step=30.ckpt


Training: 10it [00:00, ?it/s]