In [1]:
# This notebook builds on "Pytorch MNIST and UNet Dev.ipynb" and makes training and validation repeatable
# in order to test a method for saving and restoring optimizer state (momentum, etc.) that remains consistent
# even when the restore is done after restarting the kernel. The train_epoch method has also been reduced to
# training over a few batches (not a complete epoch) in order to make testing faster. Testing should be done
# for the UNet with its many optimizer options, and also for the MNIST CNN with its single optimizer.
# .................. EDIT THIS .................................

In [2]:
import pickle
import numpy as np
import torch
from copy import deepcopy

In [3]:
# maybe this is how we can test later, for now I am rewriting code below with the following changes
# No shuffle on train and val
# imports rewritten



# from tfedlrn.collaborator.pytorchmodels.pytorch2dunet import PyTorch2DUNet
# from tfedlrn.collaborator.pytorchmodels.pytorchmnistcnn import PyTorchMNISTCNN

In [4]:
# implementing my own data pipeline initializer in order to enforce reproducibility by
# selecting only one batch for training.

from tfedlrn.datasets import load_dataset
from tfedlrn.collaborator.pytorchmodels.pytorchflutils import pt_create_loader


def init_data_pipelines_no_shuffle(model_type, batch_size=64):
    """Build deterministic train/val loaders that yield a single training batch.

    The training data is truncated to the first ``batch_size`` samples so that
    one call to the training routine sees exactly one batch, and both loaders
    are created with ``shuffle=False`` so repeated runs see the data in the
    same order (the point of this "no shuffle" variant).

    Parameters
    ----------
    model_type : class
        Either ``PyTorchMNISTCNN`` (uses the 'mnist' dataset) or
        ``PyTorch2DUNet`` (uses 'BraTS17_institution' for institutions 0-9).
    batch_size : int, optional
        Loader batch size and number of training samples kept (default 64).

    Returns
    -------
    tuple
        ``(train_loader, val_loader)`` as produced by ``pt_create_loader``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither of the two supported model classes.
    """
    if model_type == PyTorchMNISTCNN:
        X_train, y_train, X_val, y_val = load_dataset('mnist')
        # add an explicit single-channel axis: (N, 1, 28, 28)
        X_train = X_train.reshape([-1, 1, 28, 28])
        X_val = X_val.reshape([-1, 1, 28, 28])

    elif model_type == PyTorch2DUNet:
        data_by_institution = [load_dataset('BraTS17_institution',
                                            institution=i,
                                            channels_first=True) for i in range(10)]
        # regroup per-institution tuples into per-split arrays and concatenate them
        X_train, y_train, X_val, y_val = [np.concatenate(d) for d in zip(*data_by_institution)]

        # Also reduce val data for unet so as to not take too long for validation
        X_val, y_val = X_val[:batch_size], y_val[:batch_size]

    else:
        raise ValueError('This model type not supported.')

    # Here is the key reason for implementing this myself.
    # Reduce to producing only one batch for training.
    X_train, y_train = X_train[:batch_size], y_train[:batch_size]

    # shuffle=False: sample order must be identical across runs and kernel restarts
    train_loader = pt_create_loader(X_train, y_train, batch_size=batch_size, shuffle=False)
    val_loader = pt_create_loader(X_val, y_val, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader
        


In [5]:
from tfedlrn.collaborator.pytorchmodels.pytorchmnistcnn import PyTorchMNISTCNN
from tfedlrn.collaborator.pytorchmodels.pytorch2dunet import PyTorch2DUNet

In [6]:
def initialize_model(device, model_type, unet_optimizer_type, learning_rate=1E-1, no_momentum=True):
    """Create a model on the reduced, unshuffled data and override optimizer hyperparameters.

    Parameters
    ----------
    device : torch.device
        Device passed through to the model constructor.
    model_type : class
        ``PyTorchMNISTCNN`` or another model class (e.g. ``PyTorch2DUNet``)
        accepted by ``init_data_pipelines_no_shuffle``.
    unet_optimizer_type : str
        Optimizer name forwarded as ``optimizer=`` to non-MNIST models;
        ignored for ``PyTorchMNISTCNN``.
    learning_rate : float, optional
        Learning rate written into every param group and the optimizer
        defaults. The large default (1E-1) makes a single-batch training step
        change the model enough that an incorrect restore is detectable.
    no_momentum : bool, optional
        When True (default), momentum is set to 0.0 in every param group and
        in the defaults, to exercise SGD with no optimizer state.

    Returns
    -------
    The constructed model object, with its ``optimizer`` modified in place.
    """
    train_loader, val_loader = init_data_pipelines_no_shuffle(model_type)
    if model_type == PyTorchMNISTCNN:
        cnn = PyTorchMNISTCNN(device=device, train_loader=train_loader, val_loader=val_loader)
    else:
        cnn = model_type(device=device, train_loader=train_loader, val_loader=val_loader,
                         optimizer=unet_optimizer_type)

    # Hyperparameters to force onto the optimizer. 'defaults' is updated as well
    # as the param groups so both views of the optimizer stay in agreement.
    overrides = {'lr': learning_rate}
    if no_momentum:
        overrides['momentum'] = 0.0

    for group in cnn.optimizer.param_groups:
        group.update(overrides)
    cnn.optimizer.defaults.update(overrides)

    return cnn

In [7]:
# Set here the type of model (and thereby the dataset) and the optimizer you want
# (for MNIST, no optimizer choice is needed). Testing should be performed with the
# device set to both GPU and CPU, and with all optimizer options, including SGD
# with momentum=0.0, in order to verify that an optimizer requiring no state
# is handled correctly.

# model_type = PyTorchMNISTCNN


model_type = PyTorch2DUNet


unet_optimizer_type = 'SGD'


In [8]:
# device = torch.device("cuda")

In [9]:
device = torch.device("cpu")

In [57]:
#######################################################################################################

In [58]:
###############         EXPLORING SAVING AND RESTORING OPTIMIZER and MODEL          ###################

In [59]:
#######################################################################################################

In [60]:
cnn = initialize_model(device=device, model_type=model_type, 
                       unet_optimizer_type=unet_optimizer_type)

In [61]:
# training once to populate the momentum buffers
# also testing the extent to which validation is reproducible for a given model

In [62]:
cnn.train_epoch()

5.5348387

In [63]:
cnn.validate() - cnn.validate()

0.0

In [64]:
############################################################################################
# testing restoring model and optimizer, should result in training to the same validation ##
############################################################################################

In [65]:
initial_val = cnn.validate()
initial_val

0.0003709124866873026

In [66]:
# save model
model_weights = cnn.get_tensor_dict()

In [67]:
# train for some more, then see that model is different now

In [68]:
cnn.train_epoch()

5.5005393

In [69]:
# Make sure validation has changed significantly now relative to the difference in repeated validation
after_train_val = cnn.validate()
after_train_val - initial_val

4.423956852406263e-06

In [70]:
# saving model to disk as well as initial_val and after_train_val

In [71]:
import pickle

# Write the saved weights and both validation scores to disk, one pickle file
# per object, so they can be reloaded after the kernel restart.
filenames = ['saved_model.pkl', 'initial_val.pkl', 'after_train_val.pkl']
objects = [model_weights, initial_val, after_train_val]
for index, filename in enumerate(filenames):
    with open(filename, 'wb') as file:
        pickle.dump(objects[index], file)

In [None]:
# ---------------------------------------------------------------------------------------

In [None]:
# ------------------\/ exploration code here \/---------------------------------------------

In [None]:
# ---------------------------------------------------------------------------------------

In [None]:
# Here exploring how to modify the learning rate using the optimizer object

In [None]:
cnn.optimizer.__dict__.keys()

In [None]:
# See here that the param_groups contain lr info
print(cnn.optimizer.__dict__['param_groups'][0].keys())
print(cnn.optimizer.__dict__['param_groups'][0]['lr'])


In [None]:
# See here that the optimizer 'defaults' also contain lr
# info (this is used by us when restoring optimizer state).
# Inspect 'defaults' itself rather than repeating the 'param_groups' lookup.
print(cnn.optimizer.__dict__['defaults'].keys())
print(cnn.optimizer.__dict__['defaults']['lr'])

In [None]:
# Inspect the optimizer's per-parameter state for the first state key only, to see
# that lr info does not lie below the 'state' key.
# NOTE(review): the [0] index raises IndexError when 'state' is empty — presumably the
# case before any training step, or for SGD with momentum 0.0; confirm before running.
first_state_key = list(cnn.optimizer.__dict__['state'].keys())[0]
cnn.optimizer.__dict__['state'][first_state_key]

In [None]:
# Exploring how np.float32 arrays and torch.FloatTensor are indeed compatible.
# In fact, an array and the tensor built from it with torch.from_numpy (below) share
# the same underlying memory, though they print differently.

In [None]:
# Build a float32 array, wrap it with the torch.Tensor constructor, move it to
# `device`, and show both side by side together with the tensor's type string.
array = np.array([4.00343, 5.00676], dtype=np.float32)
tensor = torch.Tensor(array).to(device)
report = "array data,tensor data, and datatype: {}, {}, {}"
print(report.format(array, tensor, tensor.type()))

In [None]:
# Same experiment, but the tensor is created with torch.from_numpy, so it is a
# view onto the array's buffer rather than an independent object.
array = np.array([4.00340, 5.00676], dtype=np.float32)
tensor = torch.from_numpy(array)
report = "array data,tensor data, and datatype: {}, {}, {}"
print(report.format(array, tensor, tensor.type()))

In [None]:
array += 1.00004

In [None]:
array, tensor

In [None]:
# ---------------------------------------------------------------------------------------

In [None]:
# ------------------ /\ exploration code here /\ ---------------------------------------------

In [None]:
# ---------------------------------------------------------------------------------------

In [72]:
# restore saved model
cnn.set_tensor_dict(model_weights)

In [73]:
# see validation is close to previously observed for saved model
initial_val - cnn.validate()

0.0

In [74]:
# see if grabbing the full state again, it gives the same thing as we had saved before

In [75]:
model_weights_2 = cnn.get_tensor_dict()

In [76]:
# Compare every entry of the saved tensor dict with the freshly fetched one.
# np.append returns a NEW array instead of modifying its argument, so the
# per-entry results are collected in a list; otherwise the final check runs on
# an empty array and is vacuously True.
comparisons = []
for key in model_weights:
    if key.startswith('__opt_'):
        # '__opt_' entries are themselves mappings: compare them one inner key at a time
        for inner_key in model_weights[key]:
            comparisons.append(
                np.all(model_weights[key][inner_key] == model_weights_2[key][inner_key]))
    else:
        comparisons.append(np.all(model_weights[key] == model_weights_2[key]))
bool_array = np.array(comparisons, dtype=bool)
# Require at least one comparison so an empty dict cannot pass silently.
bool(bool_array.size > 0 and np.all(bool_array))


True

In [77]:
# now train again and see that you get back to the same place as before, after running train_epoch (here only a partial epoch) once

In [78]:
cnn.train_epoch()

5.5005393

In [79]:
after_train_val - cnn.validate()

2.9103830456733704e-11

In [80]:
after_train_val - cnn.validate() 

0.0

In [81]:
#######################################

In [None]:
# ----now test that restoring can happen across processes

In [None]:
########################################

In [None]:
##############################################################################################################

In [None]:
######                         !!!!!!!!!!!!!!!!  RESTART KERNEL HERE !!!!!!!!!!!                 #############

In [None]:
######         (then run the top cells of this workbook up to device initialization)           ###############

In [None]:
#############################################################################################################

In [10]:
###### Then return to this point and run cells below ####################

In [11]:
#initiate model and train a bit, then restore model from disk

In [12]:
cnn = initialize_model(device=device, model_type=model_type, 
                       unet_optimizer_type=unet_optimizer_type)

In [13]:
cnn.train_epoch()

5.5005255

In [14]:
# get model weights and validation values from disk BUT NOT RESTORING YET

In [15]:
# Reload the objects written before the kernel restart.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
filenames = ['saved_model.pkl', 'initial_val.pkl', 'after_train_val.pkl']
object_names = ['model_weights', 'initial_val', 'after_train_val']
loaded = {}
for object_name, filename in zip(object_names, filenames):
    with open(filename, 'rb') as file:
        loaded[object_name] = pickle.load(file)

# Bind the names explicitly instead of writing through vars(), so the
# variables used by the cells below are visible to readers and linters.
model_weights = loaded['model_weights']
initial_val = loaded['initial_val']
after_train_val = loaded['after_train_val']



In [16]:
# see if validation is significantly different from the ones before
this_val = cnn.validate()
initial_val - this_val,  after_train_val - this_val 

(-4.514906322583556e-06, -9.094947017729282e-08)

In [17]:
# restore model to saved values
cnn.set_tensor_dict(model_weights)

In [18]:
# see validation is close to previously observed for saved model
initial_val - cnn.validate()

2.9103830456733704e-11

In [19]:
# see if grabbing the full state again, it gives the same thing as we had saved before

In [20]:
model_weights_2 = cnn.get_tensor_dict()

In [21]:
# Compare every entry of the tensor dict loaded from disk with the one fetched
# from the restored model. np.append returns a NEW array instead of modifying
# its argument, so the per-entry results are collected in a list; otherwise the
# final check runs on an empty array and is vacuously True.
comparisons = []
for key in model_weights:
    if key.startswith('__opt_'):
        # '__opt_' entries are themselves mappings: compare them one inner key at a time
        for inner_key in model_weights[key]:
            comparisons.append(
                np.all(model_weights[key][inner_key] == model_weights_2[key][inner_key]))
    else:
        comparisons.append(np.all(model_weights[key] == model_weights_2[key]))
bool_array = np.array(comparisons, dtype=bool)
# Require at least one comparison so an empty dict cannot pass silently.
bool(bool_array.size > 0 and np.all(bool_array))


True

In [22]:
# now train to get a new validation, and see that it matches what we had after training once before
cnn.train_epoch()

5.5005393

In [23]:

after_train_val - cnn.validate()

2.9103830456733704e-11