In [1]:
# Mount Google Drive at /content/drive so that paths under it become accessible.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Switch the working directory to the project folder on the mounted Drive
# (presumably where the local module `encoder_ae_models` lives — verify).
%cd /content/drive/MyDrive/ETH/Master thesis/KalmanNet_VO/

/content/drive/MyDrive/ETH/Master thesis/KalmanNet_VO


In [3]:
!pip install -U ray

Collecting ray
  Downloading ray-1.12.0-cp37-cp37m-manylinux2014_x86_64.whl (53.2 MB)
[K     |████████████████████████████████| 53.2 MB 1.2 MB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 57.2 MB/s 
Collecting frozenlist
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 144 kB 76.8 MB/s 
[?25hCollecting aiosignal
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting virtualenv
  Downloading virtualenv-20.14.1-py2.py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 51.5 MB/s 
Collecting distlib<1,>=0.3.1
  Downloading distlib-0.3.4-py2.py3-none-any.whl (461 kB)
[K     |████████████████████████████████| 461 kB 84.3 MB/s 
[?25hCollecting platformdirs<3,>=2
  Downloading platformdirs-2.

In [17]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from encoder_ae_models import Encoder
from torch.utils.data import Dataset, TensorDataset
import json


In [10]:
import psutil
import ray


def _total_system_memory():
    """Return the `total` field of psutil's virtual-memory statistics."""
    return psutil.virtual_memory().total


# Override Ray's private memory probe with the psutil-based value.
# NOTE(review): presumably a workaround for Ray mis-detecting the memory available
# inside the Colab container — confirm it is still needed for the pinned Ray version.
ray._private.utils.get_system_memory = _total_system_memory

In [11]:
def flatten_add_noise(dir_input, dir_target, r2, noise_type="gaussian", set_type="training_set", size=1):
  """Load one split from an image/target ``.npz`` pair, add noise to the images and flatten both.

  Args:
    dir_input: Path to the ``.npz`` archive holding the images; the array stored
      under ``set_type`` is used.
    dir_target: Path to the ``.npz`` archive holding the targets; the array stored
      under ``set_type`` is used and must be 3-D (it is indexed as ``[:, 0:1, :]``).
    r2: Noise variance; zero-mean Gaussian noise with standard deviation
      ``sqrt(r2)`` is added to every image value.
    noise_type: Currently unused; the noise is always Gaussian.
    set_type: Key of the split to read from both archives.
    size: Fraction of the split to keep. With ``size == 1`` everything is kept;
      otherwise only the first ``int(n_targets * size)`` entries along the leading
      axis of both arrays are kept.

  Returns:
    A tuple ``(inputs, targets)`` of float32 tensors:
      * ``inputs``: noisy images clamped to ``[0, 255]`` and divided by 255, with the
        first two axes merged and a singleton axis inserted at position 1.
      * ``targets``: index 0 along the second target axis, with the last two axes
        swapped and the first two axes then merged.
    (Presumably the two leading image axes are sequence and time step, giving one
    row per frame — verify against the dataset generator.)
  """
  # An NpzFile keeps its file handle open and re-reads an array on every key access,
  # so read each array exactly once and close the archive straight away.
  with np.load(dir_input) as image_archive:
    input_images = torch.from_numpy(image_archive[set_type])
  with np.load(dir_target) as target_archive:
    targets = torch.from_numpy(target_archive[set_type])

  if size == 1:
    target = targets[:, 0:1, :]
  else:
    # The cut-off is derived from the number of targets and applied to both arrays.
    till = int(targets.shape[0] * size)
    input_images = input_images[:till, ...]
    target = targets[:till, 0:1, :]

  # The noise is drawn once per call; r2 is a variance, hence the square root.
  input_images = input_images + torch.normal(mean=0, std=np.sqrt(r2), size=input_images.shape)
  normalized_input = torch.clamp(input_images, min=0, max=255) / 255

  flat_inputs = torch.unsqueeze(torch.flatten(normalized_input, 0, 1), 1).float()
  flat_targets = torch.flatten(torch.transpose(target, 1, 2), 0, 1).float()
  return flat_inputs, flat_targets


In [12]:
def train(config, checkpoint_dir=None):
    """Ray Tune trainable: fit an ``Encoder`` on noisy images and report the validation MSE per epoch.

    Args:
        config: Trial configuration. The keys read here are ``"learning_rate"``
            (Adam learning rate), ``"batch_size"`` (cast to ``int``, used for both
            data loaders) and ``"r2"`` (noise variance passed to ``flatten_add_noise``).
        checkpoint_dir: Passed by Ray Tune when a trial should be restored. If set,
            the file ``checkpoint`` inside it must contain a
            ``(model_state, optimizer_state)`` tuple, which is loaded into the network
            and the optimizer.

    Notes:
        * Reads the module-level names ``dir_input``, ``dir_target``, ``EPOCHS`` and
          ``num_work``; they must be defined before the trial starts.
        * Only the first 25% of the training and validation splits are used
          (``size=0.25``), and the noise is drawn once, before the epoch loop.
        * Reports ``loss`` (mean validation batch loss) to Tune after every epoch.
    """
    net = Encoder(encoded_dimension=1)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    print(f"Training on {device}.")
    net.to(device)

    criterion = nn.MSELoss()
    # Weight decay is not tuned at the moment; only the learning rate comes from config.
    optimizer = optim.Adam(params=net.parameters(), lr=config["learning_rate"])

    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
    # should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        # map_location lets a checkpoint written on a GPU worker be restored on
        # whatever device this worker actually has.
        model_state, optimizer_state = torch.load(checkpoint, map_location=device)
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    inp, target = flatten_add_noise(dir_input=dir_input, dir_target=dir_target, r2=config["r2"], set_type="training_set", size=0.25)
    train_subset = TensorDataset(inp, target)
    inp_val, target_val = flatten_add_noise(dir_input=dir_input, dir_target=dir_target, r2=config["r2"], set_type="validation_set", size=0.25)
    val_subset = TensorDataset(inp_val, target_val)

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=num_work)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=num_work)

    for epoch in range(EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        net.train()
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # accumulate statistics
            running_loss += loss.item()
            epoch_steps += 1

        print(f"Training loss at epoch {epoch}: {running_loss/epoch_steps}")

        # Validation loss: one no_grad scope for the whole pass, accumulated as a
        # plain Python float (same as the training loop) so Tune receives a float.
        val_loss = 0.0
        val_steps = 0
        net.eval()
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_steps += 1

        # Checkpoint saving is currently disabled. To enable it, register a
        # checkpoint with Ray Tune here; it may then be passed back as the
        # `checkpoint_dir` parameter in future iterations:
        #
        # with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
        #     path = os.path.join(checkpoint_dir, "checkpoint")
        #     torch.save(
        #         (net.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=(val_loss / val_steps))
    print("Finished Training")

In [26]:
def main(config, num_samples=50, max_num_epochs=100, gpus_per_trial=1/16):
    """Run a Ray Tune search over ``config`` with ASHA and save the best trial's config as JSON.

    Args:
        config: Tune search space / trial configuration handed to ``train``.
        num_samples: Number of trials sampled from the search space.
        max_num_epochs: ``max_t`` of the ASHA scheduler. Note that the number of
            epochs ``train`` actually runs is taken from the module-level ``EPOCHS``.
        gpus_per_trial: GPU share requested per trial (each trial also requests
            0.5 CPU).

    Side effects:
        Prints the best trial's config and writes it as JSON to
        ``.../optimal_hyperparameters/optimal_hyperparameters_q2_<q2>_v_<v>.txt``.
        ``<q2>`` and ``<v>`` come from the best trial's config when those keys are
        present, and fall back to the module-level ``q2`` / ``v`` otherwise.
    """
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    result = tune.run(
        tune.with_parameters(train),
        resources_per_trial={"cpu": 0.5, "gpu": gpus_per_trial},
        config=config,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    best_config = best_trial.config
    print("Best trial config: {}".format(best_config))
    #print("Best trial final validation loss: {}".format(
    #   best_trial.last_result["loss"]))

    # Prefer the values stored in the trial config over notebook globals, so the
    # file name always matches the JSON content written below.
    q2_tag = best_config["q2"] if "q2" in best_config else q2
    v_tag = best_config["v"] if "v" in best_config else v
    out_path = f"/content/drive/MyDrive/ETH/Master thesis/KalmanNet_VO/pendulum_grid_search/optimal_hyperparameters/optimal_hyperparameters_q2_{q2_tag}_v_{v_tag}.txt"

    # Create the output folder on first use, and close the file even if the write fails.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w") as f:
        f.write(json.dumps(best_config))
    



In [27]:
# ---- Sweep settings --------------------------------------------------------
vs = [-20]  # in [dB]
# Full r2 grid; the next line overrides it with a single value.
r2s = [10, 2, 1, 0.5, 0.1, 0.01, 0.001, 0.0001]
r2s = [10]
num_samples = 1
max_epochs = 3
EPOCHS = max_epochs  # read by train() as the number of epochs per trial
num_work = 1         # read by train() as the DataLoader worker count
training_size = 30000
validation_size = 10000

# ---- One search per (v, r2) pair -------------------------------------------
for v in vs:
  # q2 is r2 scaled by 10^(v/10), i.e. v interpreted in dB.
  q2s = [r2_value * (10 ** (v / 10)) for r2_value in r2s]
  for r2, q2 in zip(r2s, q2s):
    # dir_input / dir_target are module-level names that train() reads.
    dir_input = f"/content/drive/MyDrive/ETH/Master thesis/KalmanNet_VO/Datasets/Pendulum/images_clean/pendulum_images_clean_q2_{q2:.0e}_v_{v}.npz"
    dir_target = f"/content/drive/MyDrive/ETH/Master thesis/KalmanNet_VO/Datasets/Pendulum/decimated_clean_data/pendulum_decimated_q2_{q2:.0e}_v_{v}.npz"
    config = {
        "learning_rate": tune.choice([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]),
        "batch_size": tune.choice([128, 256, 512]),
        "q2": q2,
        "r2": r2,
        "v": v,
    }
    main(
        config=config,
        num_samples=num_samples,
        max_num_epochs=max_epochs,
        gpus_per_trial=0.3,
    )

2022-05-06 15:41:43,346	INFO trial_runner.py:803 -- starting train_076f3_00000


Trial name,status,loc,batch_size,learning_rate
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01


[2m[36m(train pid=1827)[0m Training on cuda:0.


Trial name,status,loc,batch_size,learning_rate
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01


Trial name,status,loc,batch_size,learning_rate
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01


Trial name,status,loc,batch_size,learning_rate
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01


[2m[36m(train pid=1827)[0m Training loss at epoch 0: 0.08419080786947351


Trial name,status,loc,batch_size,learning_rate
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01


Result for train_076f3_00000:
  date: 2022-05-06_15-42-07
  done: false
  experiment_id: 5edd2bd020c04499b25ebfb716005907
  hostname: 321eae11af03
  iterations_since_restore: 1
  loss: 0.0454114900784754
  node_ip: 172.28.0.2
  pid: 1827
  time_since_restore: 21.803831577301025
  time_this_iter_s: 21.803831577301025
  time_total_s: 21.803831577301025
  timestamp: 1651851727
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 076f3_00000
  warmup_time: 0.003164052963256836
  


Trial name,status,loc,batch_size,learning_rate,iter,total time (s),loss
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01,1,21.8038,0.0454115


Trial name,status,loc,batch_size,learning_rate,iter,total time (s),loss
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01,1,21.8038,0.0454115


[2m[36m(train pid=1827)[0m Training loss at epoch 1: 0.0675749280277414
Result for train_076f3_00000:
  date: 2022-05-06_15-42-20
  done: false
  experiment_id: 5edd2bd020c04499b25ebfb716005907
  hostname: 321eae11af03
  iterations_since_restore: 2
  loss: 0.023699826242689067
  node_ip: 172.28.0.2
  pid: 1827
  time_since_restore: 34.81466293334961
  time_this_iter_s: 13.010831356048584
  time_total_s: 34.81466293334961
  timestamp: 1651851740
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 076f3_00000
  warmup_time: 0.003164052963256836
  


Trial name,status,loc,batch_size,learning_rate,iter,total time (s),loss
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01,2,34.8147,0.0236998


Trial name,status,loc,batch_size,learning_rate,iter,total time (s),loss
train_076f3_00000,RUNNING,172.28.0.2:1827,128,0.01,2,34.8147,0.0236998


[2m[36m(train pid=1827)[0m Training loss at epoch 2: 0.06350612757263034
Result for train_076f3_00000:
  date: 2022-05-06_15-42-33
  done: true
  experiment_id: 5edd2bd020c04499b25ebfb716005907
  hostname: 321eae11af03
  iterations_since_restore: 3
  loss: 0.03642699664478432
  node_ip: 172.28.0.2
  pid: 1827
  time_since_restore: 48.072665214538574
  time_this_iter_s: 13.258002281188965
  time_total_s: 48.072665214538574
  timestamp: 1651851753
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 076f3_00000
  warmup_time: 0.003164052963256836
  


Trial name,status,loc,batch_size,learning_rate,iter,total time (s),loss
train_076f3_00000,TERMINATED,172.28.0.2:1827,128,0.01,3,48.0727,0.036427


2022-05-06 15:42:33,512	INFO tune.py:702 -- Total run time: 50.28 seconds (50.16 seconds for the tuning loop).


Best trial config: {'learning_rate': 0.01, 'batch_size': 128, 'q2': 0.1, 'r2': 10, 'v': -20}
