In [477]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
import pandas as pd
import time
import json
import numpy as np

from pytorch_lightning.loggers import TensorBoardLogger
from itertools import product
from collections import namedtuple
from collections import OrderedDict
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

import os
import platform
import psutil

In [478]:
# This is the original pytorch implementation of the class
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv1_bn=nn.BatchNorm2d(6)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
        self.conv2_bn=nn.BatchNorm2d(12)

        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)
        
    def forward(self, t):
       t = self.conv1(t)
       t = self.conv1_bn(t)
       t = F.max_pool2d(t, kernel_size =2, stride=2)
       t = F.relu(t)
       
       t = self.conv2(t)
       t = self.conv2_bn(t)
       t = F.max_pool2d(t, kernel_size =2, stride=2)
       t = F.relu(t)

       t = t.reshape(-1, 12*4*4)
       t = self.fc1(t)
       t = F.relu(t)

       t = self.fc2(t)
       t = F.relu(t)

       t = self.out(t)

       return t

In [479]:

class NetworkLightning(pl.LightningModule):
    def __init__(self, model: nn.Module,learning_rate: float):
        super(NetworkLightning, self).__init__()
        self.network = Network()
        # Dummy array to get computational graph
        self.example_input_array =torch.empty(1,1,28,28)
        self.save_hyperparameters()
        self.learning_rate = learning_rate
        
    def forward(self, x):
        return self.network(x)
    
    def training_step(self, batch, batch_idx):
        images = batch[0]
        labels = batch[1]

        preds = self.network(images)
        loss = F.cross_entropy(preds, labels)
                # logs- a dictionary 
        self.log('trn_loss', loss.item())
 
        return loss
    
    def validation_step(self, batch, batch_idx):
        images = batch[0]
        labels = batch[1]

        preds = self.network(images)
        loss = F.cross_entropy(preds, labels)
        self.log('val_loss', loss.item(), prog_bar=True)
        return loss
    
        
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)



In [480]:
# Lets now also pull in our run builder class
class RunBuilder():
    @staticmethod
    def get_runs(params):
        # 
        Run = namedtuple('Run', params.keys())

        runs = []
        for vals in product(*params.values()):
            runs.append(Run(*vals))
        return runs

In [481]:
# Lets re-instantiate our normalized train set

mean = 0.2860
std = 0.3530

valid_ratio = 0.2  # Going to use 80%/20% split for train/valid

train_valid_set = torchvision.datasets.FashionMNIST(
    root='/home/slabban/machine_learning/machine_learning_courses/datasets'
    ,train=True
    ,download=True
    ,transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)])
)

nb_train = int((1.0 - valid_ratio) * len(train_valid_set))
nb_valid =  int(valid_ratio * len(train_valid_set))
train_set, valid_set = torch.utils.data.dataset.random_split(train_valid_set, [nb_train, nb_valid])

In [482]:
# Lets now also pull in our run builder class
class RunBuilder():
    @staticmethod
    def get_runs(params):
        # 
        Run = namedtuple('Run', params.keys())

        runs = []
        for vals in product(*params.values()):
            runs.append(Run(*vals))
        return runs

In [483]:
# utils

## log and print

class Log_and_print():
    # need this to ensure that stuff are printed to STDOUT as well for backup
    '''
    A simple logging mecahnism for arbitary timestamped messages during the fit routine

    https://stackoverflow.com/questions/45016458/tensorflow-tf-summary-text-and-linebreaks
    Tensorboard text uses the markdown format.
    That means you need to add 2 spaces before \n to produce a linebreak
    '''
    def __init__(self, tb_logger):
        self.tb_logger = tb_logger
        self.str_log = ('PARTIAL COPY OF TEXT LOG TO TENSORBOARD TEXT  \n'
                        'class Log_and_print() by Arian Prabowo  \n'
                        'RUN NAME: ')

    def lnp(self, tag):
        print(time.asctime(), tag)
        self.str_log += str(time.asctime()) + ' ' + str(tag) + '  \n'
    


## LogParameters

class LogParameters(pl.Callback):
    """ This is a pytorch lightning callback class that logs the weight and biases to tensorbard"""
    def __init__(self):
        super().__init__()

    def on_fit_start(self, trainer, pl_module):
        self.d_parameters = {}
        for n,p in pl_module.named_parameters():
            self.d_parameters[n] = []

    def on_validation_epoch_end(self, trainer, pl_module):
        if not trainer.sanity_checking: # WARN: sanity_check is turned on by default
            lp = []
            for n,p in pl_module.named_parameters():
                trainer.logger.experiment.add_histogram(n, p.data, trainer.current_epoch)
                self.d_parameters[n].append(p.ravel().cpu().numpy())
                lp.append(p.ravel().cpu().numpy())
            p = np.concatenate(lp)
            trainer.logger.experiment.add_histogram('Parameters', p, trainer.current_epoch)

## LogHyperParameters

class LogHyperparameters(pl.Callback):
    """ This is a pytorch lightning callback class that logs high-level run hyperparameters"""
    def __init__(self):
        super().__init__()


    def on_fit_start(self, trainer, pl_module):
        trainer.logger.log_hyperparams(pl_module.hparams)

## LogComputationalGraph
class LogComputationalGraph(pl.Callback):
    """ This is a pytorch lightning callback class that plots the computational graph
    Arguments:
        dummy_input: the graph needs this dummy input to compute the graph

    NOTE: This is currently throwing errors due to some deprecations
    """
    def __init__(self):
        super().__init__()



    def on_fit_start(self, trainer, pl_module):
        trainer.logger.experiment.add_graph(pl_module, pl_module.example_input_array)




In [484]:
def trainer_main(parameters: OrderedDict):
    """ This class """


    tb_logger = TensorBoardLogger("lightning_logs", log_graph=True, )


    lnp = Log_and_print(tb_logger=tb_logger)
    lnp.lnp('Loggers start')

    l_callbacks = []

    bLogParameters = LogParameters()
    l_callbacks.append(bLogParameters)
    bLog_Hyperparameters = LogHyperparameters()
    l_callbacks.append(bLog_Hyperparameters)

    model = Network()

    #bLogCompGraph = LogComputationalGraph()
    #l_callbacks.append(bLogCompGraph)


    
    for run in RunBuilder.get_runs(parameters):
        # TODO: Integrate this
        # Instantiate the model
        model = NetworkLightning(model= model,learning_rate=run.lr)



        # Create a data loader
        train_loader = DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle, num_workers=run.num_workers)
        val_loader = DataLoader(valid_set, batch_size=run.batch_size, shuffle=False, num_workers=run.num_workers)



        trainer = pl.Trainer(max_epochs=run.epochs, logger=tb_logger, accelerator=run.device, devices=1, callbacks=l_callbacks, log_every_n_steps= 3)
        trainer.fit(model, train_loader, val_loader)

    


In [485]:
parameters = OrderedDict(
    lr = [.0001, 0.01]
    ,batch_size = [1000]
    , num_workers = [10]
    , device = ['gpu']
    , shuffle = [True]
    , epochs = [3]
)

In [486]:
trainer_main(parameters=parameters)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params | In sizes       | Out sizes
-----------------------------------------------------------------
0 | network | Network | 33.0 K | [1, 1, 28, 28] | [1, 10]  
-----------------------------------------------------------------
33.0 K    Trainable params
0         Non-trainable params
33.0 K    Total params
0.132     Total estimated model params size (MB)


Sun Jan 22 14:58:40 2023 Loggers start


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params | In sizes       | Out sizes
-----------------------------------------------------------------
0 | network | Network | 33.0 K | [1, 1, 28, 28] | [1, 10]  
-----------------------------------------------------------------
33.0 K    Trainable params
0         Non-trainable params
33.0 K    Total params
0.132     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
