In [1]:
rm -rf Results/7_kernel_3LSTM_debug*

In [2]:
from __future__ import print_function
import logging
import torch
from torchvision import transforms
import torch.optim as optim
from torch.utils.data import DataLoader
import os
import platform
import time
from utils.Network import Network
from utils.Analyser import Analyser
from utils.io import save_network, load_network, save, load, make_folder_results
from utils.WaveDataset import create_datasets
from utils.training import train_epoch, validate, test

logging.basicConfig(format='%(message)s',level=logging.INFO)
channels=1
num_workers=4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transformVar = {"Test": transforms.Compose([
    transforms.Resize(128),    #Already 184 x 184
    transforms.CenterCrop(128),
    transforms.ToTensor(),
]),
    "Train": transforms.Compose([
    transforms.Resize(128),  # Already 184 x 184
    transforms.CenterCrop(128),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    ])
}

nr_net = 0 

version = nr_net + 10
num_input_frames = 5
num_output_frames = 20
reinsert_frequency = 10
network_type = "7_kernel_3LSTM_debug"

if 'Darwin' in platform.system():
    data_dir = './'
else:
    data_dir = '/disk/scratch/s1680171/wave_propagation/'

if not os.path.isdir("./Results"):
    os.mkdir("./Results")
results_dir = "./Results/" + network_type + "_v%03d/" % version

if not os.path.isdir(results_dir):
    make_folder_results(results_dir)

# Data
filename_data = results_dir + "all_data.pickle"
if os.path.isfile(filename_data):
    logging.info('Loading datasets')
    all_data = load(filename_data)
    train_dataset = all_data["Training data"]
    val_dataset = all_data["Validation data"]
    test_dataset = all_data["Testing data"]
else:
    logging.info('Creating new datasets')
    test_dataset, val_dataset, train_dataset = create_datasets(
         data_dir+"Video_Data/", transformVar, test_fraction=0.15, validation_fraction=0.15, check_bad_data=False, channels=channels)
    all_data = {"Training data": train_dataset, "Validation data": val_dataset, "Testing data": test_dataset}
    save(all_data, filename_data)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=num_workers)


# analyser
filename_analyser = results_dir + "analyser.pickle" 
if os.path.isfile(filename_analyser):
    logging.info('Loading analyser')
    analyser = load(filename_analyser)
else:
    logging.info('Creating analyser')
    analyser = Analyser(results_dir)

# Model
filename_model = results_dir + "model.pt"
if os.path.isfile(filename_model):
    model = Network(device, channels)
    model = load_network(model, device, filename_model)
else:
    model = Network(device, channels)

# Learning Rate scheduler w. optimizer
# Optimizer
optimizer_algorithm = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
# Add learning rate schedulers
# Decay LR by a factor of gamma every step_size epochs
scheduler_type = 'plateau'
if scheduler_type == 'step':
    gamma = 0.5
    step_size = 40
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer_algorithm, step_size=step_size, gamma=gamma)
elif scheduler_type == 'plateau':
    # Reduce learning rate when a metric has stopped improving
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_algorithm, mode='min', factor=0.1, patience=7)

filename_metadata = results_dir + "metadata.pickle" 
meta_data_dict = {  "optimizer": optimizer_algorithm.state_dict(),
                    "scheduler_type": scheduler_type, 
                    "scheduler": lr_scheduler.state_dict()}
save(meta_data_dict, filename_metadata)

model = model.to(device) 

# analyser = []
# model =[]
# lr_scheduler = []
# scheduler_dict = []

# analyser.plot_loss()
# analyser.plot_loss_batchwise()
# analyser.plot_validation_loss()

Creating new datasets
Creating analyser


In [3]:
logging.info('Experiment %d' % version)
logging.info('Start training')
epochs=50
for epoch in range(epochs):
    epoch_start = time.time()

    logging.info('Epoch %d' % epoch)
    train_loss = train_epoch(model, lr_scheduler, epoch, val_dataloader, num_input_frames, 
                            num_output_frames,reinsert_frequency, channels, device, analyser, plot=False)
    analyser.save_epoch_loss(train_loss, 1)
    validation_loss = validate(model, train_dataloader, num_input_frames, num_output_frames, reinsert_frequency,
                               channels, device, plot=False)
#     analyser.save_validation_loss(validation_loss, 1)
#     """
#     Here we can access analyser.validation_loss to make decisions
#     """
#     # Learning rate scheduler
#     # perform scheduler step if independent from validation loss
#     if scheduler_type == 'step':
#         lr_scheduler.step()
#     # perform scheduler step if dependent on validation loss
#     if scheduler_type == 'plateau':
#         validation_loss = analyser.validation_loss[-1]
#         lr_scheduler.step(validation_loss)
#     save_network(model, filename_model)
#     save(analyser, filename_analyser)

#     epoch_time = time.time() - epoch_start 
#     logging.info('Epoch time: %.1f' % epoch_time)



Experiment 10
Start training
Epoch 0
Training: Ready to load batches


train batch loss  1.2911416292190552




val batch loss  0.061801813542842865


Validation loss: 0.061802	Time: 13.130
Epoch 1
Training: Ready to load batches


train batch loss  1.0054479837417603




val batch loss  0.06518342345952988


Validation loss: 0.065183	Time: 12.912
Epoch 2
Training: Ready to load batches


train batch loss  0.8339086174964905




val batch loss  0.13548536598682404


Validation loss: 0.135485	Time: 13.365
Epoch 3
Training: Ready to load batches


train batch loss  0.7168624997138977




val batch loss  0.2392827421426773


Validation loss: 0.239283	Time: 13.350
Epoch 4
Training: Ready to load batches


train batch loss  0.6481192708015442




val batch loss  0.2560531795024872


Validation loss: 0.256053	Time: 17.116
Epoch 5
Training: Ready to load batches


train batch loss  0.5834929943084717




val batch loss  0.2318456768989563


Validation loss: 0.231846	Time: 13.732
Epoch 6
Training: Ready to load batches


train batch loss  0.5415123105049133




val batch loss  0.19040828943252563


Validation loss: 0.190408	Time: 13.802
Epoch 7
Training: Ready to load batches


train batch loss  0.506492018699646




val batch loss  0.17319762706756592


Validation loss: 0.173198	Time: 13.155
Epoch 8
Training: Ready to load batches


train batch loss  0.47281697392463684




val batch loss  0.1558084487915039


Validation loss: 0.155808	Time: 13.768
Epoch 9
Training: Ready to load batches


train batch loss  0.4484100341796875




val batch loss  0.14054754376411438


Validation loss: 0.140548	Time: 13.815
Epoch 10
Training: Ready to load batches


train batch loss  0.44418609142303467




val batch loss  0.14024895429611206


Validation loss: 0.140249	Time: 13.785
Epoch 11
Training: Ready to load batches


train batch loss  0.4251880943775177




val batch loss  0.12664102017879486


Validation loss: 0.126641	Time: 13.890
Epoch 12
Training: Ready to load batches


train batch loss  0.39920005202293396




val batch loss  0.11584629118442535


Validation loss: 0.115846	Time: 14.750
Epoch 13
Training: Ready to load batches


train batch loss  0.4123101234436035


Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    

Traceback (most recent call last):
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-22e1fc2f58d9>", line 9, in <module>
    num_output_frames,reinsert_frequency, channels, device, analyser, plot=False)
  File "/Users/stathis/Code/thesis/wave_propagation/utils/training.py", line 91, in train_epoch
    lr_scheduler.optimizer.step()
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/site-packages/torch/optim/adam.py", line 94, in step
    exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/stathis/anaconda3/envs/thesis/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2033, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' obje

KeyboardInterrupt: 