In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

In [2]:
from PIL import Image
from torchvision import datasets
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import StepLR
import pytorch_lightning as pl

In [3]:
from IPython.display import Javascript
from nbconvert import HTMLExporter
from easydict import EasyDict

In [4]:
import os
import sys
from glob import glob

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from time import time, sleep
from tqdm import tqdm
from datetime import datetime

import warnings
warnings.filterwarnings(action = 'ignore')

In [5]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# normalise all three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

In [6]:
# CIFAR-10 training and test splits stored under ./data (downloaded when missing);
# both go through the same `transform` pipeline.
train_data = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_data = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [88]:
class Sample_model(pl.LightningModule):
    '''
    Small convolutional classifier wrapped in a LightningModule, with helpers for
    training, testing, checkpoint loading and archiving the notebook next to each run.

    Usage
    -----
    model.state_dict()  # inspect the model weights
    model.hparams       # inspect the hyperparameters

    # train; checkpoints and logs are written to ./<folder>/<timestamp>/
    model.fit(train_dataloader, validation_dataloader)

    # test; the metric may be an nn loss module or any callable (y_hat, y) -> value.
    # When it is omitted, the loss used for training is evaluated instead.
    def accuracy (y_hat, y) : # manual metric example
        return torch.sum(torch.max(y_hat, axis = 1)[1] == y).item()/len(y_hat)

    model.test(test_dataloader, 'metric name', accuracy)

    # load a saved checkpoint (path relative to <folder>, without the '.ckpt' suffix)
    model = model.load_model(ckpt_path)

    # tensorboard, from the current directory
    tensorboard --logdir=./best_model

    Files inside a run folder
    -------------------------
    hparams.yaml     : hyperparameters and recorded losses
    output_file.html : HTML export of the notebook as saved when fit() started
    '''

    def __init__(self, hyperparameters):
        '''
        Store the hyperparameters on `self.hparams` and build the layers.

        Parameters
        ----------
        hyperparameters : dict-like
            Must provide 'lr', 'step_size', 'gamma', 'batch_size', 'max_epochs', 'gpus',
            'auto_lr_find', 'save_top_k', 'num_workers' and 'folder'.
            May also provide 'training_loss' / 'validation_loss' (per-epoch loss
            histories) and 'now' (run timestamp); presumably these are present when
            the model is rebuilt from a saved hparams.yaml.

        Raises
        ------
        KeyError
            If a required key is missing.
        '''
        super().__init__()

        # Ask the notebook front end to publish the notebook file name as `this_notebook`.
        self.get_notebook_name()
        self.checkpoint_callback = None

        # Per-epoch loss histories keyed by 'epoch : <n>'; reused when they are supplied.
        if 'validation_loss' in hyperparameters :
            self.hparams.training_loss = hyperparameters['training_loss']
            self.hparams.validation_loss = hyperparameters['validation_loss']
        else :
            self.hparams.training_loss = {}
            self.hparams.validation_loss = {}

        # Timestamp naming the run folder; fit() overwrites it with the current time.
        self.hparams.now = hyperparameters['now'] if 'now' in hyperparameters else None

        # Copy the required settings one by one; step_size is counted in epochs.
        for name in ('lr', 'step_size', 'gamma', 'batch_size', 'max_epochs', 'gpus',
                     'auto_lr_find', 'save_top_k', 'num_workers', 'folder') :
            setattr(self.hparams, name, hyperparameters[name])

        ###################### model layer ######################
        self.loss = nn.CrossEntropyLoss()

        self.conv1 = nn.Conv2d(3, 32, 3)
        self.max_pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(32, 16, 3)
        # 16 * 6 * 6 is the flattened feature size for 3x32x32 inputs:
        # 32 -> conv 30 -> pool 15 -> conv 13 -> pool 6.
        self.linear = nn.Linear(16 * 6 * 6, 10)


    ################# specific model structure #################
    def forward(self, x):
        '''Return the 10 class scores for a batch of images `x`.'''
        x = self.conv1(x)
        x = F.relu(self.max_pool(x))
        x = self.conv2(x)
        x = F.relu(self.max_pool(x))
        # NOTE(review): this ReLU clamps the class scores to be non-negative before they
        # reach CrossEntropyLoss. Left in place so existing checkpoints keep their
        # behaviour - confirm whether it is intended.
        x = F.relu(self.linear(x.view(x.size(0), -1)))

        return x

    ################## optimizer & scheduler ##################
    def configure_optimizers(self):
        '''Adam with a StepLR schedule built from hparams.lr / step_size / gamma.'''
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        scheduler = StepLR(optimizer, step_size=self.hparams.step_size,
                           gamma=self.hparams.gamma)

        return [optimizer], [scheduler]


    ################################ Do not change ################################
    def training_step(self, batch, batch_idx):
        '''Compute the training loss for one `(x, y)` batch and log it as 'train_loss'.'''
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)

        result = pl.TrainResult(loss)
        result.log('train_loss', loss, on_epoch=True)

        return result

    def training_epoch_end(self, outputs) :
        '''
        Average the collected epoch outputs, record the training loss for this epoch
        in `hparams.training_loss` and print a one-line summary.
        '''
        for key in list(outputs.keys()) :
            try :
                outputs[key] = torch.mean(outputs[key])
            except Exception :
                # Entries that cannot be averaged are left untouched.
                continue

        epoch_key = 'epoch : %s' % self.current_epoch

        # The last entry of `outputs` is taken to be the epoch-level training loss.
        last_key = list(outputs.keys())[-1]
        self.hparams.training_loss[epoch_key] = outputs[last_key].item()

        train_loss = self.hparams.training_loss[epoch_key]
        # Validation may not have produced a value for this epoch; report nan then
        # instead of failing with a KeyError.
        validation_loss = self.hparams.validation_loss.get(epoch_key, float('nan'))

        print('epoch : %s, training loss : %.4f, validation loss : %.4f, ckpt_path = %s/%s' % \
              (self.current_epoch, train_loss, validation_loss, self.hparams.now,
               ('epoch=%s_val_loss=%.4f' % (self.current_epoch, validation_loss))
              ))

        return outputs



    def validation_step(self, batch, batch_idx):
        '''Compute the loss for one validation batch; it is logged as 'val_loss' and drives checkpointing.'''
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)

        result = pl.EvalResult(checkpoint_on=loss)
        result.log('val_loss', loss)

        return result

    def validation_epoch_end(self, val_outputs):
        '''Reduce 'val_loss' to its mean and record it in `hparams.validation_loss`.'''
        mean_val_loss = torch.mean(val_outputs['val_loss'])
        self.hparams.validation_loss['epoch : %s' % self.current_epoch] = mean_val_loss.item()
        val_outputs['val_loss'] = mean_val_loss

        return val_outputs



    def fit(self, train_data, test_data) :
        '''
        Train the model and archive the notebook next to the run's checkpoints.

        Parameters
        ----------
        train_data : DataLoader
            Batches used for training.
        test_data : DataLoader
            Batches used for validation after each epoch.
        '''
        self.hparams.now = datetime.now().strftime("%y%m%d_%H:%M:%S")
        checkpoint_callback, tb_logger = self.call_logger()

        self.trainer = pl.Trainer(max_epochs=self.hparams.max_epochs, gpus = self.hparams.gpus,
                                  auto_lr_find=self.hparams.auto_lr_find,
                             checkpoint_callback=checkpoint_callback, logger = tb_logger)
        self.save_notebook()
        # Presumably gives the front end time to finish the save request before the
        # notebook file is read back for the HTML export.
        sleep(1.0)

        # `this_notebook` is set from Javascript (see get_notebook_name) and is missing
        # when that hook did not run; the export is best-effort, so skip it in that case.
        current_file = globals().get('this_notebook')
        if current_file is None :
            print('notebook name is unknown; skipping the HTML export')
        else :
            self.output_HTML(current_file, './%s/%s/' % (self.hparams.folder, self.hparams.now))

        # Use the loaders that were passed in rather than notebook-level globals.
        self.trainer.fit(self, train_data, test_data)


    def get_notebook_name(self) :
        '''Emit Javascript that assigns the notebook file name to the kernel variable `this_notebook`.'''
        display(Javascript('Jupyter.notebook.kernel.execute(\
                           "this_notebook = " + "\'"\
                           +Jupyter.notebook.notebook_name+"\'");'))


    def save_notebook(self):
        '''Emit Javascript that asks the front end to save the notebook.'''
        display(
            Javascript("IPython.notebook.save_notebook()"),
            include=['application/javascript']
        )

    def output_HTML(self, current_file, path):
        '''
        Export the notebook file `current_file` to `<path>/output_file.html`.

        The target folder is created when it does not exist yet.
        '''
        import codecs
        import nbformat

        os.makedirs(path, exist_ok=True)

        exporter = HTMLExporter()
        output_notebook = nbformat.read(current_file, as_version=4)
        output, _ = exporter.from_notebook_node(output_notebook)

        # The context manager closes the file even if the write fails.
        with codecs.open(os.path.join(path, 'output_file.html'), 'w', encoding='utf-8') as html_file :
            html_file.write(output)

    def test(self, dataloader, metric_name, loss = None) :
        '''
        Evaluate the model on `dataloader` and report the result under `metric_name`.

        Parameters
        ----------
        dataloader : DataLoader
            Batches to evaluate.
        metric_name : str
            Key used for the value in the reported results.
        loss : callable, optional
            Metric called as `loss(y_hat, y)`; defaults to the training loss.
        '''
        checkpoint_callback, tb_logger = self.call_logger()

        # A new Trainer is built for every test call.
        self.trainer = pl.Trainer(max_epochs=self.hparams.max_epochs, gpus = self.hparams.gpus,
                                  auto_lr_find=self.hparams.auto_lr_find,
                             checkpoint_callback=checkpoint_callback, logger = tb_logger)

        self.test_loss = loss if loss is not None else self.loss

        self.test_metric = metric_name
        self.trainer.test(self, dataloader)


    def test_step(self, batch, batch_idx):
        '''Return the metric for one batch, scaled by the batch's sample count, plus that count.'''
        x, y = batch
        y_hat = self(x)
        # Weight by the number of samples; len(batch) would always be 2 because
        # `batch` is the (x, y) pair.
        n_samples = len(y_hat)
        loss = self.test_loss(y_hat, y) * n_samples
        return {'loss' : loss, 'len' : n_samples}


    def test_epoch_end(self, outputs) :
        '''Combine the per-batch results into a sample-weighted mean keyed by the metric name.'''
        sum_loss = sum(step['loss'] for step in outputs)
        sum_len = sum(step['len'] for step in outputs)

        return {self.test_metric : sum_loss/sum_len}

    def call_logger(self) :
        '''
        Build the checkpoint callback and TensorBoard logger for the current run.

        Both write under `<folder>/<now>`; checkpoints are ranked by the lowest 'val_loss'
        and the best `save_top_k` are kept.
        '''
        filepath=os.getcwd() + '/%s/%s/{epoch:d}_{val_loss:.4f}' % (self.hparams.folder,
                                                                    self.hparams.now)

        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            filepath = filepath,
            save_top_k=self.hparams.save_top_k,
            monitor='val_loss',
            mode='min')

        tb_logger = pl.loggers.TensorBoardLogger(save_dir=self.hparams.folder, name = None,
                                                 version = self.hparams.now)
        return checkpoint_callback, tb_logger

    def load_model(self, ckpt_path) :
        '''
        Load a checkpoint and return the restored model.

        Parameters
        ----------
        ckpt_path : str
            '<run folder>/<checkpoint name>' relative to `hparams.folder`, without the
            '.ckpt' suffix. The run folder's hparams.yaml supplies the hyperparameters.
        '''
        checkpoint_file = './%s/%s.ckpt' % (self.hparams.folder, ckpt_path)
        # hparams.yaml sits in the run folder, i.e. the first component of ckpt_path.
        hparams_file = './%s/%s/hparams.yaml' % \
                                           (self.hparams.folder, ckpt_path.split('/')[0])

        loaded_model = self.load_from_checkpoint(checkpoint_file, hparams_file = hparams_file)

        return loaded_model

In [97]:
# Run configuration consumed by Sample_model and by the DataLoaders below.
hyperparameters = EasyDict({'lr' : 0.007,
                            'max_epochs' :5,
                            'step_size' : 1,       # scheduler step, counted in epochs
                            'gamma' : 0.9,         # passed to StepLR as its gamma
                            'batch_size' : 256,
                            'gpus' : [0],
                            'num_workers' : 16,
                            'auto_lr_find' : True,
                            'save_top_k' : 3,      # number of best checkpoints to keep
                            'folder' : 'best_model'
                           })


# Create the output folder (including missing parents); a no-op when it already exists.
os.makedirs(hyperparameters['folder'], exist_ok=True)
batch_size = hyperparameters['batch_size']

In [98]:
# The worker count comes from the shared config so the loaders agree with the
# hyperparameters recorded for the run.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                              num_workers=hyperparameters['num_workers'])
# The whole test split is served as a single batch, in a fixed order.
test_dataloader = DataLoader(test_data, batch_size=len(test_data), shuffle=False,
                             num_workers=hyperparameters['num_workers'])

In [99]:
# Build the model from the config; the constructor also emits the Javascript
# snippet that publishes the notebook name to the kernel.
model = Sample_model(hyperparameters=hyperparameters)

<IPython.core.display.Javascript object>

In [100]:
# Train on the training loader and validate on the held-out loader; passing the
# training loader twice would make validation loss and checkpoint selection
# reflect training data.
model.fit(train_dataloader, test_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]



  | Name     | Type             | Params
----------------------------------------------
0 | loss     | CrossEntropyLoss | 0     
1 | conv1    | Conv2d           | 896   
2 | max_pool | MaxPool2d        | 0     
3 | conv2    | Conv2d           | 4 K   
4 | linear   | Linear           | 5 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 0, training loss : 1.9090, validation loss : 1.7563, ckpt_path = 200828_18:07:01/epoch=0_val_loss=1.7563


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 1, training loss : 1.7187, validation loss : 1.6968, ckpt_path = 200828_18:07:01/epoch=1_val_loss=1.6968


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 2, training loss : 1.6354, validation loss : 1.6828, ckpt_path = 200828_18:07:01/epoch=2_val_loss=1.6828


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 3, training loss : 1.6008, validation loss : 1.6445, ckpt_path = 200828_18:07:01/epoch=3_val_loss=1.6445


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 4, training loss : 1.5770, validation loss : 1.5865, ckpt_path = 200828_18:07:01/epoch=4_val_loss=1.5865


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 5, training loss : 1.5629, validation loss : 1.5915, ckpt_path = 200828_18:07:01/epoch=5_val_loss=1.5915


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 6, training loss : 1.5418, validation loss : 1.5690, ckpt_path = 200828_18:07:01/epoch=6_val_loss=1.5690


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 7, training loss : 1.5254, validation loss : 1.5587, ckpt_path = 200828_18:07:01/epoch=7_val_loss=1.5587


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

epoch : 8, training loss : 1.5198, validation loss : 1.5579, ckpt_path = 200828_18:07:01/epoch=8_val_loss=1.5579


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Saving latest checkpoint..


epoch : 9, training loss : 1.5083, validation loss : 1.5579, ckpt_path = 200828_18:07:01/epoch=9_val_loss=1.5579



In [None]:
def accuracy (y_hat, y) :
    '''Fraction of rows of `y_hat` whose highest-scoring column index equals the label in `y`.'''
    _, predicted = torch.max(y_hat, dim = 1)
    n_correct = (predicted == y).sum().item()
    return n_correct / len(y_hat)

In [101]:
# Evaluate the trained model on the test loader, reporting accuracy under 'test accuracy'.
model.test(dataloader=test_dataloader, metric_name='test accuracy', loss=accuracy)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'metric 이름': 0.5047}
--------------------------------------------------------------------------------



In [102]:
# Replace `model` with the weights saved at epoch 8 of one specific run.
# NOTE: the timestamped path belongs to that run; a fresh run writes to a new folder.
model = model.load_model(ckpt_path='200828_18:07:01/epoch=8_val_loss=1.5579')

<IPython.core.display.Javascript object>

In [42]:
# Evaluate the model restored from the checkpoint on the same test loader.
model.test(dataloader=test_dataloader, metric_name='test accuracy', loss=accuracy)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'metric 이름': 0.2309}
--------------------------------------------------------------------------------

