### Train multiple models in one GPU

This module defines several models that are trained together on a single GPU. `num_arch` architectures are constructed, and each architecture is paired with `num_opt` optimizers, giving a total of `num_models` models to be trained.<br>
Run the function `multi_mod_training()` in a separate console so that you can still import the module and check the training results even before `max_epoch` is reached.

In [1]:
import os

import chainer
from chainer import configuration
from chainer.dataset import convert
import chainer.links as L
import chainer.functions as F
from chainer import serializers

import math
import numpy as np
import cupy as cp
import random

import sklearn
from sklearn.utils import shuffle
import pickle

import time

Assign values to the training parameters `batchsize` and `max_epoch`. Also specify the name of the `directory` where the models and optimizers, together with the training and testing information, are to be saved.

If you wish to resume an interrupted run, simply set `resume=True`. Keep in mind that you must use the same `directory` as in the previous training. Also note that, when resuming, `max_epoch` is the number of additional epochs to run: training ends at the previously completed epoch plus `max_epoch`. If you want to start a new run, simply set `resume=False`.

In [2]:
#Number of examples per minibatch (used by both the training and the test iterator)
batchsize = 1600
#Number of epochs to train in this run
max_epoch = 3000
#Folder where the models, optimizer states, loss/accuracy histories and the log are written
directory = 'generalization'
#True: continue from the files already stored in `directory`; False: start a new run
resume = False

#Device ID handed to chainer.get_device; models and minibatches are sent to `device`
gpu_id = 0
device = chainer.get_device(gpu_id)

Construct your own neural network architectures. Specify the number of architectures that you have created in `num_arch`.

In [3]:
class MLP1(chainer.Chain):
    """Multilayer perceptron with two hidden layers of 250 and 100 nodes.

    Maps 200 input features to 3 output values.
    """

    def __init__(self):
        super().__init__()
        #Same options for every layer: bias enabled, no custom initializers
        linear_opts = dict(nobias=False, initialW=None, initial_bias=None)
        with self.init_scope():
            self.l1 = L.Linear(200, 250, **linear_opts)
            self.l2 = L.Linear(250, 100, **linear_opts)
            self.l3 = L.Linear(100, 3, **linear_opts)

    def forward(self, x):
        """Pass `x` through the ReLU hidden layers and return the output of the last layer."""
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        #No activation is applied to the output layer
        return self.l3(hidden)

class MLP2(chainer.Chain):
    """Multilayer perceptron with three hidden layers of 250, 100 and 50 nodes.

    Maps 200 input features to 3 output values.
    """

    def __init__(self):
        super().__init__()
        #Same options for every layer: bias enabled, no custom initializers
        linear_opts = dict(nobias=False, initialW=None, initial_bias=None)
        with self.init_scope():
            self.l1 = L.Linear(200, 250, **linear_opts)
            self.l2 = L.Linear(250, 100, **linear_opts)
            self.l3 = L.Linear(100, 50, **linear_opts)
            self.l4 = L.Linear(50, 3, **linear_opts)

    def forward(self, x):
        """Pass `x` through the ReLU hidden layers and return the output of the last layer."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3):
            hidden = F.relu(layer(hidden))
        #No activation is applied to the output layer
        return self.l4(hidden)

class MLP3(chainer.Chain):
    """Multilayer perceptron with three hidden layers of 250 nodes each.

    Maps 200 input features to 3 output values.
    """

    def __init__(self):
        super().__init__()
        #Same options for every layer: bias enabled, no custom initializers
        linear_opts = dict(nobias=False, initialW=None, initial_bias=None)
        with self.init_scope():
            self.l1 = L.Linear(200, 250, **linear_opts)
            self.l2 = L.Linear(250, 250, **linear_opts)
            self.l3 = L.Linear(250, 250, **linear_opts)
            self.l4 = L.Linear(250, 3, **linear_opts)

    def forward(self, x):
        """Pass `x` through the ReLU hidden layers and return the output of the last layer."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3):
            hidden = F.relu(layer(hidden))
        #No activation is applied to the output layer
        return self.l4(hidden)
    
#Number of architectures defined above (MLP1, MLP2, MLP3)
num_arch = 3

Choose the optimizers that you want to test and specify in `num_opt` the number of optimizers per architecture. That is, each architecture is trained with more than one optimizer. The total number of models to be trained, `num_models`, is equal to `num_arch` $\times$ `num_opt`.

In [4]:
#Wrap every architecture in a classifier, once per optimizer:
#model1-model3 are paired with Adam below, model4-model6 with AMSGrad
model1, model2, model3, model4, model5, model6 = [
    L.Classifier(_arch()) for _arch in (MLP1, MLP2, MLP3) * 2
]

#Send all models to the selected device
_models = (model1, model2, model3, model4, model5, model6)
for _model in _models:
    _model.to_device(device)

#Create one optimizer instance per model
optimizer1, optimizer2, optimizer3 = [
    chainer.optimizers.Adam() for _ in range(3)
]
optimizer4, optimizer5, optimizer6 = [
    chainer.optimizers.AMSGrad() for _ in range(3)
]

#Attach optimizer i to model i
_optimizers = (optimizer1, optimizer2, optimizer3,
               optimizer4, optimizer5, optimizer6)
for _optimizer, _model in zip(_optimizers, _models):
    _optimizer.setup(_model)

#Drop the temporaries so only the original names remain at module level
del _models, _optimizers, _model, _optimizer

#Number of optimizers tested per architecture
num_opt = 2

num_models = num_arch * num_opt

The code below can now be used to execute the training on one GPU. It is a version of `train_mnist_custom_loop.py` modified to accommodate multiple models on one GPU. Simply run `multi_mod_training()` in a separate console so that this module can still be imported in a different notebook. Only the training epoch and the elapsed time are printed, but you may check the training and testing accuracies in the `experiment` notebook.

In [5]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file afterwards."""
    #NOTE(review): unpickling can execute arbitrary code; only load trusted files
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file afterwards."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def multi_mod_training():
    """Train `num_models` models on one device with a manual training loop.

    The models and optimizers are the module-level objects `model1`..`modelN`
    and `optimizer1`..`optimizerN` (N = `num_models`). Every model is updated
    with the same minibatches. After each epoch all models are evaluated on
    the test set and, for model i, these files in `directory` are rewritten:
    `MLPi.model`, `MLPi.state`, `training_lossi.pkl`, `training_accui.pkl`,
    `testing_lossi.pkl` and `testing_accui.pkl`. One line per epoch is
    appended to `training_log.txt` and printed.

    If `resume` is true, the histories, model parameters and optimizer states
    are first loaded from `directory`, the log is appended to, and `max_epoch`
    further epochs are run; reported epoch numbers continue from the number of
    epochs already recorded. Otherwise histories start empty and the log is
    truncated.

    Reads the module-level settings `batchsize`, `max_epoch`, `directory`,
    `resume`, `device` and `num_models`. Returns nothing.

    Raises:
        KeyError: if a `model{i}` or `optimizer{i}` global is missing.
    """
    #Collect the numbered module-level objects once instead of building
    #variable names with exec(): exec() cannot reliably create or update
    #function locals (it fails outright on Python 3.13+)
    indices = range(1, num_models + 1)
    module_scope = globals()
    models = [module_scope['model{}'.format(i)] for i in indices]
    optimizers = [module_scope['optimizer{}'.format(i)] for i in indices]

    #Load Classification dataset
    train = _load_pickle('chainer_train.pkl')
    test = _load_pickle('chainer_test.pkl')

    #Define Iterator
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False, shuffle=False)

    out = directory
    os.makedirs(out, exist_ok=True)

    def out_path(template, i):
        #Path inside the output folder of a per-model file, e.g. "MLP3.model"
        return os.path.join(out, template.format(i))

    #history[name][k] is the per-epoch list of metric `name` for model k+1
    metrics = ('training_loss', 'training_accu', 'testing_loss', 'testing_accu')
    if resume:
        #Load the histories of the earlier run
        history = {name: [_load_pickle(out_path(name + '{}.pkl', i))
                          for i in indices]
                   for name in metrics}
        #Load the model and the optimizer's previous state
        for i, model, optimizer in zip(indices, models, optimizers):
            serializers.load_npz(out_path('MLP{}.model', i), model)
            serializers.load_npz(out_path('MLP{}.state', i), optimizer)
        log_mode = 'a'
    else:
        history = {name: [[] for _ in indices] for name in metrics}
        #Start the log from scratch
        log_mode = 'w'

    #Epochs already recorded; offsets the epoch numbers reported below
    last_epoch = len(history['training_accu'][0]) if num_models else 0

    time_start = time.time()
    test_count = len(test)

    #Initialize training
    train_count = 0
    train_loss_sum = [0.0] * num_models
    train_accu_sum = [0.0] * num_models

    with open(os.path.join(out, 'training_log.txt'), log_mode) as log:
        while train_iter.epoch < max_epoch:
            batch = train_iter.next()
            x, t = convert.concat_examples(batch, device)
            n_batch = len(t)
            train_count += n_batch

            for k, (model, optimizer) in enumerate(zip(models, optimizers)):
                #Update network's parameters using forward pass and
                #backpropagation for each model
                optimizer.update(model, x, t)
                #Accumulate example-weighted training loss and accuracy
                train_loss_sum[k] += float(model.loss.array) * n_batch
                train_accu_sum[k] += float(model.accuracy.array) * n_batch

            if not train_iter.is_new_epoch:
                continue

            #-----------------------end of one epoch--------------------------
            #Record training loss and accuracy for each model
            for k in range(num_models):
                history['training_loss'][k].append(train_loss_sum[k] / train_count)
                history['training_accu'][k].append(train_accu_sum[k] / train_count)

            test_loss_sum = [0.0] * num_models
            test_accu_sum = [0.0] * num_models
            #Enable evaluation mode
            with configuration.using_config('train', False):
                #This is optional but can reduce computational overhead
                with chainer.using_config('enable_backprop', False):
                    for test_batch in test_iter:
                        x_test, t_test = convert.concat_examples(test_batch, device)
                        n_test = len(t_test)
                        #Calculate testing loss and accuracy
                        for k, model in enumerate(models):
                            loss = model(x_test, t_test)
                            test_loss_sum[k] += float(loss.array) * n_test
                            #float() keeps the stored history free of device
                            #arrays, like the three other metrics
                            test_accu_sum[k] += float(model.accuracy.array) * n_test
            test_iter.reset()

            for k, (i, model, optimizer) in enumerate(zip(indices, models, optimizers)):
                #Record testing loss and accuracy
                history['testing_loss'][k].append(test_loss_sum[k] / test_count)
                history['testing_accu'][k].append(test_accu_sum[k] / test_count)

                #Save model and optimizer state
                serializers.save_npz(out_path('MLP{}.model', i), model)
                serializers.save_npz(out_path('MLP{}.state', i), optimizer)

                #Save training and testing data
                for name in metrics:
                    _dump_pickle(history[name][k], out_path(name + '{}.pkl', i))

            #Reinitialize for next training epoch
            train_count = 0
            train_loss_sum = [0.0] * num_models
            train_accu_sum = [0.0] * num_models

            time_epoch = time.time() - time_start
            log.write('epoch:{:04d} time elapsed:{:0.06f} sec \r\n'.format(train_iter.epoch+last_epoch, time_epoch))
            log.flush()
            print('epoch:{:04d} done time elapsed:{:0.06f} sec'.format(train_iter.epoch+last_epoch, time_epoch))