# TRAIN VAE ON ISOLATED CELLS

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import torch
import numpy
from MODULES.utilities import *
from MODULES.vae_model import * 

#pip install psutil
#pip install neptune-client
#pip install neptune-notebooks
#!jupyter nbextension enable --py neptune-notebooks

#conda install -c conda-forge nodejs
#!jupyter labextension install neptune-notebooks

In [24]:
import neptune

def log_model_summary(experiment, model):
    """Log the string representation of ``model``, one line at a time.

    Every line of ``str(model)`` is sent to the "model summary" text channel
    of ``experiment``. The leading spaces of each line are replaced by the
    same number of '-' characters, so the nesting depth is carried by dashes
    instead of whitespace.

    Args:
        experiment: object exposing ``log_text(channel_name, text)``.
        model: any object; its ``str()`` output is what gets logged.
    """
    for line in str(model).split('\n'):
        body = line.lstrip(' ')
        indent = len(line) - len(body)
        # Replace (not merely prefix) the leading spaces: the dashes are
        # followed by the stripped text, not by the original indented line.
        experiment.log_text("model summary", '-' * indent + body)
        
def log_metrics(experiment, metric_tuple, prefix: str = ""):
    """Send every float-valued field of ``metric_tuple`` to ``experiment``.

    Each field listed in ``metric_tuple._fields`` is converted with ``.item()``.
    Fields whose converted value is a Python ``float`` are logged under the
    name ``prefix + field``; every other field is silently skipped.

    Args:
        experiment: object exposing ``log_metric(name, value)``.
        metric_tuple: namedtuple-like object whose fields expose ``.item()``.
        prefix: string prepended to each field name (e.g. "train_").
    """
    # Lazy generator: each field is converted right before it is (maybe) logged.
    scalars = ((name, getattr(metric_tuple, name).item())
               for name in metric_tuple._fields)
    for name, scalar in scalars:
        if not isinstance(scalar, float):
            continue
        experiment.log_metric(prefix + name, scalar)
            
def replace_artifact(experiment, file_list: list, delay: int=5):
    """Upload the newest file and remove the one that is ``delay`` entries older.

    The last element of ``file_list`` is always logged as an artifact. Once
    the list holds more than ``delay`` entries, the element located ``delay``
    positions before the newest one is deleted from the experiment.

    Args:
        experiment: object exposing ``log_artifact`` and ``delete_artifacts``.
        file_list: non-empty list of file paths, newest last.
        delay: how many positions behind the newest entry the deleted one is
            (must be >= 1).
    """
    assert delay >= 1
    assert len(file_list) >= 1

    newest = file_list[-1]
    experiment.log_artifact(newest)

    # Nothing to drop until the list has grown past the retention window.
    if len(file_list) > delay:
        stale = file_list[-1 - delay]
        experiment.delete_artifacts(stale)
        

In [25]:
# Select the Neptune project that receives this run.
neptune.set_project('dalessioluca/sandbox-2')

# Hyper-parameters for the run; later cells index this as a nested dict
# (params["optimizer"], params["simulation"]).
params = load_json_as_dict("./parameters_smFISH_factor8.json")

# Create the experiment: the flattened parameters are attached to it and the
# model source file is uploaded alongside.
exp = neptune.create_experiment(params=flatten_dict(params),
                                upload_source_files=["./MODULES/vae_model.py"])

NVMLError: NVML Shared Library Not Found - GPU usage metrics may not be reported.


https://ui.neptune.ai/dalessioluca/sandbox-2/e/SAN2-25


In [26]:
# Load the stored dataset; it is sliced along its first axis further down to
# build the train/test split.
small_stuff = load_obj("./isolated_cells_dataset.pt")
# Disabled: would record a fingerprint of the dataset as an experiment property.
# NOTE(review): hashlib is not imported in this notebook — add the import before enabling.
#exp.set_property("dataset", hashlib.md5(small_stuff).hexdigest())

In [27]:
BATCH_SIZE = 80
# The first 80% of the samples (along axis 0) are used for training, the rest for testing.
N_TRAIN = int(0.8 * small_stuff.shape[0])

# Options shared by the two loaders; only the image slice and shuffling differ.
_loader_common = dict(store_in_cuda=torch.cuda.is_available(),
                      drop_last=True,
                      batch_size=BATCH_SIZE)

train_loader = SpecialDataSet(img=small_stuff[:N_TRAIN], shuffle=True, **_loader_common)

test_loader = SpecialDataSet(img=small_stuff[N_TRAIN:], shuffle=False, **_loader_common)

In [28]:
# Sanity-check the training loader: check_batch() prints the dataset/minibatch
# diagnostics shown in the cell output; its return value is logged to Neptune
# as an image.
train_batch_example = train_loader.check_batch()
exp.log_image("train_batch_example", train_batch_example)
# Uncomment to display the example inline.
#train_batch_example

Dataset lenght: 1047
img.shape torch.Size([1047, 1, 28, 28])
img.dtype torch.float32
img.device cpu
MINIBATCH: img.shapes labels.shape, index.shape -> torch.Size([8, 1, 28, 28]) torch.Size([8]) torch.Size([8])
MINIBATCH: min and max of minibatch tensor(0.0301) tensor(0.5948)


In [29]:
# Same sanity check for the test loader: check_batch() prints the
# dataset/minibatch diagnostics shown in the cell output; its return value is
# logged to Neptune as an image.
test_batch_example = test_loader.check_batch()
exp.log_image("test_batch_example", test_batch_example)
# Uncomment to display the example inline.
#test_batch_example

Dataset lenght: 262
img.shape torch.Size([262, 1, 28, 28])
img.dtype torch.float32
img.device cpu
MINIBATCH: img.shapes labels.shape, index.shape -> torch.Size([8, 1, 28, 28]) torch.Size([8]) torch.Size([8])
MINIBATCH: min and max of minibatch tensor(0.0301) tensor(0.5878)


In [30]:
# Reference images taken from the test loader; the training loop feeds them
# through the model to log reconstructions.
# load() returns a 3-tuple, of which only the first element (the images) is kept.
# NOTE(review): the argument 8 is presumably the number of images — confirm in SpecialDataSet.load.
reference_imgs, _, _ = test_loader.load(8)
reference = show_batch(reference_imgs, n_padding=4, figsize=(12,12))
exp.log_image("reference", reference)
# Uncomment to display the reference grid inline.
#reference

### Initialize model and optimizer

In [31]:
# Build the model from the hyper-parameters and log its textual summary.
vae = SimpleVae(params)
log_model_summary(experiment=exp, model=vae)

# Bookkeeping used by the training loop.
history_dict = {}
# Start from +infinity so that the first measured test loss always becomes the
# running minimum, whatever its magnitude.
min_test_loss = float("inf")
dir_output = "./"

optimizer = instantiate_optimizer(model=vae, dict_params_optimizer=params["optimizer"])

# The scheduler is only created when enabled in the parameters; the training
# loop checks the same flag before calling scheduler.step().
if params["optimizer"]["scheduler_is_active"]:
    scheduler = instantiate_scheduler(optimizer=optimizer, dict_params_scheduler=params["optimizer"])

In [32]:
# Evaluate on the test set every TEST_FREQUENCY epochs (hard-coded here; the
# value from the parameter file is left commented out).
TEST_FREQUENCY = 4 #params["simulation"]["TEST_FREQUENCY"]
CHECKPOINT_FREQUENCY = params["simulation"]["CHECKPOINT_FREQUENCY"]
NUM_EPOCHS = params["simulation"]["MAX_EPOCHS"]
# With epoch_restart = -1 the first iteration of the loop is epoch 0.
epoch_restart = -1
# Paths of every checkpoint / history file written so far, oldest first.
checkpoint_files, history_files = [], []

for delta_epoch in range(1, NUM_EPOCHS+1):
    epoch = delta_epoch + epoch_restart

    # ---------------- training pass ----------------
    #with torch.autograd.set_detect_anomaly(True):
    with torch.autograd.set_detect_anomaly(False):
        with torch.enable_grad():
            vae.train()
            train_metrics = process_one_epoch(model=vae,
                                              dataloader=train_loader,
                                              optimizer=optimizer,
                                              verbose=(epoch==0),
                                              weight_clipper=None)

        with torch.no_grad():
            print("Train "+train_metrics.pretty_print(epoch))
            log_metrics(exp, train_metrics, prefix="train_")

            history_dict = append_tuple_to_dict(source_tuple=train_metrics,
                                               target_dict=history_dict,
                                               prefix_exclude="wrong_examples",
                                               prefix_to_add="train_")

    if params["optimizer"]["scheduler_is_active"]:
        scheduler.step()

    # ---------------- periodic evaluation ----------------
    # Checkpointing lives inside this branch, so files are only written on
    # epochs where the test set is evaluated.
    if epoch % TEST_FREQUENCY == 0:
        with torch.no_grad():
            vae.eval()
            # NOTE(review): the optimizer is also passed during evaluation; whether
            # process_one_epoch skips the parameter update in eval mode cannot be
            # verified from this notebook — confirm in MODULES.
            test_metrics = process_one_epoch(model=vae,
                                             dataloader=test_loader,
                                             optimizer=optimizer,
                                             verbose=(epoch==0),
                                             weight_clipper=None)
            print("Test  "+test_metrics.pretty_print(epoch))

            # Log and store the *test* metrics under the "test_" prefix
            # (the train metrics were previously recorded here by mistake).
            log_metrics(exp, test_metrics, prefix="test_")

            history_dict = append_tuple_to_dict(source_tuple=test_metrics,
                                               target_dict=history_dict,
                                               prefix_exclude="wrong_examples",
                                               prefix_to_add="test_")

            test_loss = test_metrics.loss
            min_test_loss = min(min_test_loss, test_loss)

            # Reconstruct the fixed reference images to follow progress visually.
            imgs_rec = vae.forward(imgs_in=reference_imgs).imgs
            tmp = show_batch(imgs_rec, n_padding=4, figsize=(12,12), title='epoch= {0:6d}'.format(epoch))
            exp.log_image("imgs_rec", tmp)

            # Save when this is the best test loss so far, or on every
            # CHECKPOINT_FREQUENCY-th epoch.
            if (test_loss == min_test_loss) or ((epoch % CHECKPOINT_FREQUENCY) == 0):
                ckpt = create_ckpt(model=vae,
                                   optimizer=optimizer,
                                   history_dict=history_dict,
                                   epoch=epoch,
                                   hyperparams_dict=params)

                checkpoint_files.append(os.path.join(dir_output, "ckp_"+str(epoch)+".pkl"))
                # NOTE(review): the history is written as JSON although the file
                # name ends in ".pkl"; name kept unchanged for compatibility.
                history_files.append(os.path.join(dir_output, "history_"+str(epoch)+".pkl"))

                save_obj(ckpt, checkpoint_files[-1])
                # Use history_files (the list defined above); the misspelled
                # name history_file raised a NameError here.
                save_dict_as_json(history_dict, history_files[-1])

                # Upload the new files and drop the ones that fell out of the
                # retention window.
                replace_artifact(experiment=exp, file_list=checkpoint_files)
                replace_artifact(experiment=exp, file_list=history_files)

                print("saved files -> "+checkpoint_files[-1]+"  "+history_files[-1])

i =   0 train_loss=36.51169
i =   1 train_loss=32.89920
i =   2 train_loss=29.78791
i =   3 train_loss=24.30220
i =   4 train_loss=15.40609
i =   5 train_loss=11.61520
i =   6 train_loss=12.15938
i =   7 train_loss=11.71213
i =   8 train_loss=9.09930
i =   9 train_loss=6.23104
i =  10 train_loss=4.78668
i =  11 train_loss=4.97643
i =  12 train_loss=4.90802
Train [epoch    0] loss=15.723, mse=19.110, kl_tot=1.923, geco_bal=0.805
i =   0 train_loss=4.00009
i =   1 train_loss=4.21980
i =   2 train_loss=4.61480
Test  [epoch    0] loss=4.278, mse=5.079, kl_tot=0.849, geco_bal=0.811
saved files -> ./ckp_20.pkl  ./history_20.pkl
Train [epoch    1] loss=3.414, mse=3.864, kl_tot=1.455, geco_bal=0.814
Train [epoch    2] loss=2.615, mse=3.043, kl_tot=0.699, geco_bal=0.818
Train [epoch    3] loss=2.361, mse=2.726, kl_tot=0.695, geco_bal=0.820
Train [epoch    4] loss=2.205, mse=2.521, kl_tot=0.742, geco_bal=0.823
Test  [epoch    4] loss=2.025, mse=2.291, kl_tot=0.782, geco_bal=0.824
saved files -> 

KeyboardInterrupt: 

In [1]:
# Close the Neptune experiment created earlier in this notebook.
# Requires `exp` to still exist in the kernel: after a kernel restart this
# raises NameError (as in the recorded output).
exp.stop()

NameError: name 'exp' is not defined

PROBLEM OF ANTIALIASING
READ THE MASK R-CNN PAPER TO SEE HOW THEY DO THE REGION PROPOSAL
Maybe the problem is having two transformations in sequence (first downscaling, then cropping).
Maybe the encoder should take the raw image, not the output of the U-Net. The output of the U-Net is probably good for the mask.
Test what happens in the big code if the encoder takes the raw image.