In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the project
# folder under /content/drive/MyDrive is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): the local module `encoder_ae_models` imported further down is
# presumably resolved relative to this directory — confirm.
%cd /content/drive/MyDrive/KalmanNet_Visual/KalmanNet_VO/

/content/drive/MyDrive/KalmanNet_Visual/KalmanNet_VO


In [3]:
!pip install -U ray

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from encoder_ae_models import Encoder_new as Encoder
from torch.utils.data import Dataset, TensorDataset
import json


In [5]:
import psutil
import ray
# Replace Ray's private system-memory probe with one that returns the total
# virtual memory reported by psutil.
# NOTE(review): presumably a workaround for Ray mis-detecting the memory
# available inside the Colab container — confirm. This assigns into a private
# module (`ray._private`), so it may break silently when Ray is upgraded.
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total

In [6]:
def flatten_normalize_single(p , set_type="training", data_dir="/content/drive/MyDrive/KalmanNet_Visual/KalmanNet_VO/Datasets/Pendulum/Encoder_dataset/grid_search"):
  """Load one split of the encoder dataset for a given `p` and scale it to [0, 1].

  The archive `pendulum_images_noisy_encoder_big_p_{p}_grid.npz` is read from
  `data_dir`; the arrays stored under `{set_type}_input` and `{set_type}_target`
  are used.

  Args:
    p: value interpolated into the archive file name (selects the dataset file).
    set_type: key prefix of the split to load, e.g. "training" or "validation".
    data_dir: directory containing the `.npz` archive. Defaults to the
      project's grid-search dataset folder on the mounted Drive.

  Returns:
    A tuple `(inputs, targets)` of float32 tensors:
      * inputs  - images clamped to [0, 255], divided by 255, with dimensions
                  0 and 1 swapped.
      * targets - the target array with a singleton dimension inserted at
                  index 1.
    NOTE(review): the original axis layout of the stored arrays is not visible
    here; confirm that swapping dims 0 and 1 puts the sample axis first.

  Raises:
    FileNotFoundError / KeyError: propagated from `np.load` or the archive
      lookup when the file or the requested split is missing.
  """
  print(f"Normalization for {set_type}")

  archive_path = os.path.join(data_dir, f"pendulum_images_noisy_encoder_big_p_{p}_grid.npz")

  # np.load on an .npz returns a lazily-read NpzFile that keeps the file open;
  # the context manager guarantees the handle is released once both arrays
  # have been materialised in memory.
  with np.load(archive_path) as data:
    images = torch.from_numpy(data[set_type+"_input"])
    target = torch.from_numpy(data[set_type+"_target"])

  # Clamp before scaling so out-of-range (noisy) pixel values still map into [0, 1].
  normalized_input = torch.clamp(images, min=0, max=255) / 255

  return torch.transpose(normalized_input, 0, 1).float() , torch.unsqueeze(target, dim=1).float()

In [7]:
def train(config, checkpoint_dir=None):
  """Train the encoder for one Ray Tune trial, reporting validation loss each epoch.

  Args:
    config: mapping with the trial's hyper-parameters. Keys read here:
      "learning_rate" and "weight_decay" (passed to Adam) and "p" (selects the
      dataset file via `flatten_normalize_single`).
    checkpoint_dir: unused; kept so the signature stays compatible with the
      Ray Tune function-trainable convention.

  Side effects:
    Prints per-epoch training/validation losses and calls
    `tune.report(loss=<validation MSE>)` once per epoch.

  Notes:
    * The number of epochs comes from the notebook-level global `EPOCHS`, which
      must be defined before this function is handed to Ray Tune.
    * The losses are mean *squared* errors, so the values printed with the
      "rad"/"deg" labels are in squared units.

  Raises:
    ValueError: if the training split contains no samples.
  """
  net = Encoder(encoded_dimension=1)

  device = "cpu"
  if torch.cuda.is_available():
    device = "cuda:0"
    if torch.cuda.device_count() > 1:
      net = nn.DataParallel(net)
  print(f"Training on {device}.")
  net.to(device)

  criterion = nn.MSELoss()
  optimizer = optim.Adam(params=net.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
  inp_train, target_train = flatten_normalize_single(p=config["p"], set_type="training")
  inp_val, target_val = flatten_normalize_single(p=config["p"], set_type="validation")

  # The whole dataset is moved to the device once; batches are then strided views.
  inp_train, target_train = inp_train.to(device), target_train.to(device)
  inp_val, target_val = inp_val.to(device), target_val.to(device)

  n_train = inp_train.shape[0]
  if n_train == 0:
    raise ValueError("Training set is empty.")
  # Batch i is every num_batches-th sample starting at i. Capping at the sample
  # count avoids empty batches (and a NaN loss) on datasets smaller than 100.
  num_batches = min(100, n_train)

  for epoch in range(1, EPOCHS+1):  # loop over the dataset multiple times
    running_loss = 0.0
    net.train()

    for i in range(num_batches):
      inputs, labels = inp_train[i::num_batches], target_train[i::num_batches]

      # zero the parameter gradients
      optimizer.zero_grad(set_to_none=True) #set_to_none makes it faster

      # forward + backward + optimize
      outputs = net(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

    train_loss = running_loss / num_batches
    print(f"Training loss at epoch {epoch}: {train_loss}")

    # Validation: a single full-batch pass over the validation split.
    net.eval()
    with torch.no_grad():
      outputs = net(inp_val)
      # .item() yields plain Python floats, so nothing printed or reported is a
      # device tensor / 0-d array.
      val_loss = criterion(outputs, target_val).item()
      val_loss_deg = criterion(outputs*180/np.pi, target_val*180/np.pi).item()

    print(f"Validation loss at epoch {epoch}: {val_loss} rad, {val_loss_deg} deg")
    tune.report(loss=val_loss)

  print("Finished Training")
  

In [14]:
def main(config, num_samples=30, max_num_epochs=100, gpus_per_trial=1/16):
  """Run a Ray Tune search over `config` and save the best trial's hyper-parameters.

  Args:
    config: Ray Tune search space. Must contain the key "p", which `train`
      uses to pick the dataset and which is used here to name the output file.
    num_samples: number of trials sampled from the search space.
    max_num_epochs: `max_t` for the ASHA scheduler (maximum reported epochs
      per trial).
    gpus_per_trial: (fractional) number of GPUs requested per trial.

  Side effects:
    Prints the best configuration and writes it as JSON to
    `.../Encoder_big_net_optimal_hyp_big/optimal_hyperparameters_p_{p}.txt`.

  Raises:
    RuntimeError: if no trial reported the "loss" metric, so no best trial exists.
  """
  #os.environ["RAY_PICKLE_VERBOSE_DEBUG"] = "1" USEFUL FOR DEBUGGING
  scheduler = ASHAScheduler(
      max_t=max_num_epochs,
      # ASHA requires grace_period <= max_t; cap it so short runs are accepted.
      grace_period=min(15, max_num_epochs),
      reduction_factor=2)
  result = tune.run(
      tune.with_parameters(train),
      resources_per_trial={"cpu": 0.5, "gpu": gpus_per_trial},
      config=config,
      metric="loss",
      verbose=3,
      mode="min",
      num_samples=num_samples,
      scheduler=scheduler
  )

  best_trial = result.get_best_trial("loss", "min", "last")
  if best_trial is None:
    raise RuntimeError("No trial reported the 'loss' metric; cannot determine the best configuration.")
  print("Best trial config: {}".format(best_trial.config))

  # Take `p` from the winning trial's own config rather than from a notebook
  # global, so the file name always matches the data the trial was trained on.
  p = best_trial.config["p"]
  with open(f"/content/drive/MyDrive/KalmanNet_Visual/KalmanNet_VO/Simulations_results/Pendulum/Encoder_big_net_optimal_hyp_big/optimal_hyperparameters_p_{p}.txt", "w") as f:
    f.write(json.dumps(best_trial.config))
    



In [15]:
# Driver cell: runs one hyper-parameter search per value of p.
# Full sweep of p values (currently disabled):
#probs = [1.0, 1-1/32, 1-1/16, 1-1/8, 1-1/4, 1/2, 1/4, 1/8,  1/16, 1/32]
#probs.reverse()
# Values of p to search; each one selects a dataset file through config["p"].
probs = [1/16, 1-1/16]
# Number of trials sampled per value of p.
num_samples = 60
max_epochs = 100
# train() reads EPOCHS as a global for its epoch loop, so this must be set
# before main() is called; it is kept equal to the scheduler's max_t.
EPOCHS = max_epochs
# NOTE(review): num_work is not referenced by any cell visible here — confirm it is still needed.
num_work = 1


for p in probs:
  # Search space: learning rate and weight decay are sampled log-uniformly and
  # quantised to multiples of 5e-8; p is fixed for every trial of this search.
  config = {
      "learning_rate": tune.qloguniform(1e-5, 0.1, 5e-8),
      "weight_decay": tune.qloguniform(1e-7, 1e-4, 5e-8),
      "p": p
      }
  # Each trial requests half a GPU (overrides main()'s default of 1/16).
  main(config=config, num_samples=num_samples, max_num_epochs=max_epochs, gpus_per_trial=0.5)

Output hidden; open in https://colab.research.google.com to view.