### Search for the optimal DNN architecture
This notebook is designed only to search for the optimal DNN architecture for the coupled-channel pole classification problem.
We use the standard `Classifier` of `Chainer` to wrap each architecture with the default hyperparameters. The DNN models, each with a fixed number of hidden layers, are imported from the `dnn_models` notebook.

In [1]:
import os

import chainer
from chainer import configuration
from chainer.dataset import convert
import chainer.links as L
import chainer.functions as F
from chainer import optimizers, initializers, serializers

import math
import numpy as np
import cupy as cp
import random

import sklearn
from sklearn.utils import shuffle
import pickle

import import_ipynb
import dnn_models
from dnn_models import MLP1L, MLP2L, MLP3L
from dnn_models import MLP4L, MLP5L, MLP6L
from dnn_models import MLP4LDRP, MLP5LDRP

import time

importing Jupyter notebook from dnn_models.ipynb


In [2]:
# ---------------------------------------------------------------------------
# Run configuration. train_dnn() reads these module-level names directly.
# ---------------------------------------------------------------------------

# Choose the curriculum to train
curriculum = 32

# General resume switch: True makes train_dnn() load the model, optimizer
# state, histories and log found in directory1 instead of starting fresh.
resume0 = True

# Set resume1 to False if you want to continue using the main directory.
# Set resume1 to True if you want to resume from a snapshot directory
# (the snapshot files are copied into the main directory below).
resume1 = True

# Set resume2 to True if you want to continue from the PREVIOUS CURRICULUM
# snapshot directory. Set resume2 to False otherwise.
resume2 = False

# Which snapshot epoch would you like to continue from?
last_epoch = 31090  # earlier runs used 31070, 31050
# Minibatch size
batchsize = 3*512  # earlier runs used 2*512, 512

# For resume=False: request re-initialization of the weights
# (the author was not sure this is truly effective).
initialize = True

# Maximum number of epochs for this run (full budget minus epochs already done)
max_epoch = 100000 - last_epoch
# Maximum repetitions during epoch restart
max_rep = 10
# Accuracy drop tolerance that triggers an epoch restart
drop_tolerance = -0.005
# Save model and state every save_epoch epochs
save_epoch = 10

# Choose your DNN model
dnn1 = MLP3L(200,200,200)

# Local file names of the training and testing curriculum datasets.
# Alternative location, kept for reference:
#   'curriculum_trainset//chainer_train_curr{:02d}.pkl'
#   'curriculum_testset//chainer_test_curr{:02d}.pkl'
trainset = 'chainer_train_curr{:02d}.pkl'.format(curriculum)
testset = 'chainer_test_curr{:02d}.pkl'.format(curriculum)

# Continuous training directory
directory1 = 'dnn_curr{:02d}_full'.format(curriculum)
# Snapshot directory (models are saved here every save_epoch)
directory2 = 'dnn_curr{:02d}_snapshot'.format(curriculum)

gpu_id = 1

# Previous curriculum index (curriculum 1 has no predecessor and maps to itself)
prev_curr = curriculum - 1 if curriculum > 1 else curriculum

# Make sure both working directories exist.
out = directory1
os.makedirs(out, exist_ok=True)

out2 = directory2
os.makedirs(out2, exist_ok=True)

if resume1:
    import shutil

    # Resume either from the previous curriculum's snapshots or from the
    # present curriculum's snapshots.
    if resume2:
        directory3 = 'dnn_curr{:02d}_snapshot'.format(prev_curr)
    else:
        directory3 = directory2

    snapshot_dir = os.path.join(directory3, 'epoch{:06d}'.format(last_epoch))
    snapshot_files = (
        'MLP1.model',
        'MLP1.state',
        'testing_accu1.pkl',
        'testing_loss1.pkl',
        'training_accu1.pkl',
        'training_loss1.pkl',
        'epoch_log.txt',
    )

    # Verify the whole snapshot first: failing halfway through the copies
    # would leave directory1 with a mix of old and new checkpoint files.
    missing_files = [
        fname for fname in snapshot_files
        if not os.path.isfile(os.path.join(snapshot_dir, fname))
    ]
    if missing_files:
        raise FileNotFoundError(
            'Snapshot {} is incomplete, missing: {}'.format(
                snapshot_dir, ', '.join(missing_files)))

    for fname in snapshot_files:
        shutil.copy(os.path.join(snapshot_dir, fname), directory1)

In [3]:
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE: pickle executes arbitrary code on load; only use it on files
    # written by this notebook.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path* and close the file so later copies see all data."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _load_history(folder):
    """Load the four per-epoch history lists stored in *folder*.

    Returns a tuple ``(training_loss, training_accu, testing_loss,
    testing_accu)`` read from the ``*_loss1.pkl`` / ``*_accu1.pkl`` files.
    """
    return (
        _load_pickle(os.path.join(folder, "training_loss1.pkl")),
        _load_pickle(os.path.join(folder, "training_accu1.pkl")),
        _load_pickle(os.path.join(folder, "testing_loss1.pkl")),
        _load_pickle(os.path.join(folder, "testing_accu1.pkl")),
    )


def train_dnn():
    """Train ``dnn1`` wrapped in ``L.Classifier`` with Adam, checkpointing every epoch.

    All settings are read from module-level names: ``gpu_id``, ``dnn1``,
    ``resume0``, ``resume1``, ``initialize``, ``trainset``, ``testset``,
    ``batchsize``, ``directory1``, ``directory2``, ``max_epoch``, ``max_rep``,
    ``drop_tolerance`` and ``save_epoch``.

    After every training epoch the model is evaluated on the test set and the
    model, optimizer state, loss/accuracy histories and log are written to
    ``directory1``. If the training accuracy falls by more than
    ``drop_tolerance`` relative to the previous epoch, the last saved
    checkpoint is reloaded instead ("epoch restart"). Training stops when the
    iterator reaches ``max_epoch`` epochs or when ``max_rep`` consecutive
    restarts have happened. Every ``save_epoch`` epochs the checkpoint files
    are also copied to ``directory2/epochNNNNNN``.

    Returns:
        None. Results are written to disk.
    """
    import shutil

    # GPU usage is only activated here.
    device = chainer.get_device(gpu_id)
    model1 = L.Classifier(dnn1)
    model1.to_device(device)
    device.use()

    optimizer1 = chainer.optimizers.Adam()
    optimizer1 = optimizer1.setup(model1)

    # Weight initializer (fresh runs only). An initializer has to be called
    # on each weight array; merely constructing it changes nothing.
    if not resume0 and not resume1 and initialize:
        he_normal = initializers.HeNormal()
        for param in model1.params():
            weights = param.array
            # Skip parameters whose shape is not known yet (array is None)
            # and biases: HeNormal needs at least two dimensions.
            # NOTE(review): if dnn_models builds its layers lazily, no weight
            # is allocated at this point and this loop is a no-op - confirm.
            if weights is not None and weights.ndim >= 2:
                he_normal(weights)

    # Load training and testing datasets
    train = _load_pickle(trainset)
    test = _load_pickle(testset)

    # Define iterators
    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, batchsize, repeat=False, shuffle=False)

    out = directory1
    os.makedirs(out, exist_ok=True)
    out2 = directory2
    os.makedirs(out2, exist_ok=True)

    log_path = os.path.join(out, 'epoch_log.txt')
    model_path = os.path.join(out, "MLP1.model")
    state_path = os.path.join(out, "MLP1.state")
    # Everything that makes up one checkpoint; copied together on snapshot.
    checkpoint_files = (
        'MLP1.model',
        'MLP1.state',
        'testing_accu1.pkl',
        'testing_loss1.pkl',
        'training_accu1.pkl',
        'training_loss1.pkl',
        'epoch_log.txt',
    )

    if resume0:
        # Continue from the histories, model and optimizer of the previous run.
        (training_loss1, training_accu1,
         testing_loss1, testing_accu1) = _load_history(out)
        serializers.load_npz(model_path, model1)
        serializers.load_npz(state_path, optimizer1)
    else:
        training_loss1 = []
        training_accu1 = []
        testing_loss1 = []
        testing_accu1 = []

    time_start = time.time()
    test_count = len(test)

    # Running sums over the current training epoch
    train_count = 0
    sum1_loss = 0
    sum1_accu = 0

    # Number of consecutive epoch restarts
    rep = 0

    # Append to the previous log when resuming, start a new one otherwise.
    log = open(log_path, 'a' if resume0 else 'w')
    try:
        while (train_iter.epoch < max_epoch) and (rep < max_rep):
            batch = train_iter.next()
            x, t = convert.concat_examples(batch, device)
            n_samples = len(t)
            train_count += n_samples
            # Forward pass, backpropagation and parameter update
            optimizer1.update(model1, x, t)
            # Accumulate sample-weighted training loss and accuracy
            sum1_loss += float(model1.loss.array)*n_samples
            sum1_accu += float(model1.accuracy.array)*n_samples

            if not train_iter.is_new_epoch:
                continue

            # ----------------------- end of one epoch -----------------------
            training_loss1.append(sum1_loss/train_count)
            training_accu1.append(sum1_accu/train_count)

            # Evaluate on the test set without dropout/backprop bookkeeping
            test_loss_sum = 0
            test_accu_sum = 0
            with configuration.using_config('train', False), \
                    chainer.using_config('enable_backprop', False):
                for batch in test_iter:
                    x, t = convert.concat_examples(batch, device)
                    loss1 = model1(x, t)
                    test_loss_sum += float(loss1.array)*len(t)
                    # float() keeps plain Python numbers in the pickled
                    # history, matching the training accuracy above.
                    test_accu_sum += float(model1.accuracy.array)*len(t)
            test_iter.reset()

            testing_loss1.append(test_loss_sum/test_count)
            testing_accu1.append(test_accu_sum/test_count)

            # The epoch number is the history length, because the batch size
            # may change between runs and the iterator cannot be relied on.
            epoch = len(training_accu1)

            # Restart if training got too bad compared with the previous epoch
            if epoch >= 2:
                drop = float(training_accu1[-1]) - float(training_accu1[-2])
            else:
                drop = 0.0

            if drop < drop_tolerance:
                # Count consecutive restarts so that max_rep can stop the loop.
                rep += 1
                log.write('We are restarting at epoch {:06d} for {:03d} times \r\n'.format(epoch, rep))
                # Roll back to the last saved histories, model and optimizer
                (training_loss1, training_accu1,
                 testing_loss1, testing_accu1) = _load_history(out)
                serializers.load_npz(model_path, model1)
                serializers.load_npz(state_path, optimizer1)
                epoch = len(training_accu1)
            else:
                rep = 0

            # Save model and optimizer state
            serializers.save_npz(model_path, model1)
            serializers.save_npz(state_path, optimizer1)

            # Save training and testing histories
            _dump_pickle(training_loss1, os.path.join(out, "training_loss1.pkl"))
            _dump_pickle(training_accu1, os.path.join(out, "training_accu1.pkl"))
            _dump_pickle(testing_loss1, os.path.join(out, "testing_loss1.pkl"))
            _dump_pickle(testing_accu1, os.path.join(out, "testing_accu1.pkl"))

            time_epoch = time.time() - time_start
            log.write('epoch:{:06d} time elapsed:{:0.06f} sec \r\n'.format(epoch, time_epoch))
            # Flush so that the snapshot copy below contains this entry
            log.flush()
            print('epoch:{:06d} done time elapsed:{:0.06f} sec'.format(epoch, time_epoch))

            # Reinitialize for the next training epoch
            sum1_loss = 0
            sum1_accu = 0
            train_count = 0

            # Time machine: if something goes wrong, you can always go back
            if epoch % save_epoch == 0:
                snapshot_dir = os.path.join(out2, 'epoch{:06d}'.format(epoch))
                os.makedirs(snapshot_dir, exist_ok=True)
                for fname in checkpoint_files:
                    shutil.copy(os.path.join(out, fname), snapshot_dir)
    finally:
        log.close()