In [1]:
import torch
import numpy as np
import torch.optim as optim
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
from sklearn.metrics import roc_auc_score
import datetime
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from io import StringIO
import ffmpeg
import pdb
from functions import create_pytorch_dataset, create_multimodal_pytorch_dataset
from functions import get_window_metrics
from functions import get_frame_metrics
from functions import animate

In [2]:
# Lets load the H%PY dataset into a pytorch dataset class.Please see 
# dataset_creator on how to generate the H5PY file. 

# Name of the H5PY dataset 
dset = [ 'Edits/ZED_Depth', "Edits/ONI_Depth"] #where the orginal data is stored 
name = ["ZED_Depth_Edit", 'ONI_IR_Depth'] # name of the h5py file
path = "H5Data\Data_set-{}-imgdim64x64.h5".format(name) # location of the h5py file
# this will also window the data at a set size, and with the set stride 

window_len = 8
stride = 1

Test_Dataset, test_dataloader, Train_Dataset, train_dataloader = create_multimodal_pytorch_dataset(name, dset, path, window_len, stride)


Edits/ZED_RGB
Edits/ZED_Depth
ZED_RGB_Edit
ZED_Depth_Edit
H5Data\data_set-ZED_RGB_Edit-imgdim64x64.h5
Edits/ZED_RGB
ZED_RGB_Edit
H5Data\data_set-ZED_RGB_Edit-imgdim64x64.h5
297
297
99
99
H5Data\data_set-ZED_Depth_Edit-imgdim64x64.h5
Edits/ZED_Depth
ZED_Depth_Edit
H5Data\data_set-ZED_Depth_Edit-imgdim64x64.h5
297
297
99
99


In [3]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        # define layers 
        
        # first layer
        self.ec1 = nn.Conv3d(1, 16, (5, 3, 3), stride=1, padding=(2, 1, 1),)
        self.em1 = nn.MaxPool3d((1, 2, 2), return_indices=True)
        #self.ed1 = nn.Dropout3d(p=0.25)
        # second layer
        self.ec2 = nn.Conv3d(16, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.em2 = nn.MaxPool3d((2, 2, 2), return_indices=True)
        #self.ed2 = nn.Dropout3d(p=0.25)
        # third layer
        self.ec3 = nn.Conv3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.em3 = nn.MaxPool3d((2, 2, 2), return_indices=True)
        # encoding done, time to decode
        self.dc1 = nn.ConvTranspose3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm1 = nn.MaxUnpool3d((2, 2, 2))
        # inverse of 2nd Conv
        self.dc2 = nn.ConvTranspose3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm2 = nn.MaxUnpool3d((2, 2, 2))
        # inverse of 1st Conv
        self.dc3 = nn.ConvTranspose3d(8, 16, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm3 = nn.MaxUnpool3d((1, 2, 2))
        # final inverse
        self.dc4 = nn.ConvTranspose3d(16, 1, (5, 3, 3), stride=1, padding=(2, 1, 1))

        # first layer
        self.ec21 = nn.Conv3d(1, 16, (5, 3, 3), stride=1, padding=(2, 1, 1),)
        self.em21 = nn.MaxPool3d((1, 2, 2), return_indices=True)
        #self.ed21 = nn.Dropout3d(p=0.25)
        # second layer
        self.ec22 = nn.Conv3d(16, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.em22 = nn.MaxPool3d((2, 2, 2), return_indices=True)
        #self.ed22 = nn.Dropout3d(p=0.25)
        # third layer
        self.ec23 = nn.Conv3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.em23 = nn.MaxPool3d((2, 2, 2), return_indices=True)
        # encoding done, time to decode
        self.dc21 = nn.ConvTranspose3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm21 = nn.MaxUnpool3d((2, 2, 2))
        # inverse of 2nd Conv
        self.dc22 = nn.ConvTranspose3d(8, 8, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm22 = nn.MaxUnpool3d((2, 2, 2))
        # inverse of 1st Conv
        self.dc23 = nn.ConvTranspose3d(8, 16, (5, 3, 3), stride=1, padding=(2, 1, 1))
        self.dm23 = nn.MaxUnpool3d((1, 2, 2))
        # final inverse
        self.dc24 = nn.ConvTranspose3d(16, 1, (5, 3, 3), stride=1, padding=(2, 1, 1))

    def forward(self, x1, x2):

        # *** start of encoder
        x1 = x1.permute(1, 0, 2, 3, 4)  # reorder to have correct dimensions
        # (batch_size, chanels, depth, width, height)
        _ec1 = F.relu(self.ec1(x1))
        _em1, i1 = self.em1(_ec1)
        #_ec1 = self.ed1(_ec1)
        # second layer
        _ec2 = F.relu(self.ec2(_em1))
        _em2, i2 = self.em2(_ec2)
        #_em2 = self.ed2(_em2)
        # third layer
        _ec3 = F.relu(self.ec3(_em2))
        _em3, i3 = self.em3(_ec3)

        
        x2 = x2.permute(1, 0, 2, 3, 4)  # reorder to have correct dimensions
        # (batch_size, chanels, depth, width, height)
        _ec21 = F.relu(self.ec21(x2))
        _em21, i1 = self.em21(_ec21)
        #_ec21 = self.ed21(_ec21)
        # second layer
        _ec22 = F.relu(self.ec22(_em21))
        _em22, i2 = self.em22(_ec22)
        #_em22 = self.ed22(_em22)
        # third layer
        _ec23 = F.relu(self.ec23(_em22))
        _em23, i3 = self.em23(_ec23)

        # combined modalaties here 
        combo1 = (_em23*0.5) * _em3
        combo2 = _em23 * (_em3*0.5)


        # print("====== Encoding Done =========")
        # *** encoding done, time to decode
        _dc1 = F.relu(self.dc1(combo1))
        _dm1 = self.dm1(_dc1, i3, output_size=_em2.size())
        # second layer
        _dc2 = F.relu(self.dc2(_dm1))
        _dm2 = self.dm2(_dc2, i2)
        # third layer
        _dc3 = F.relu(self.dc3(_dm2))
        _dm3 = self.dm3(_dc3, i1)
        re_x1 = torch.tanh(self.dc24(_dm3))

        # *** encoding done, time to decode
        _dc21 = F.relu(self.dc21(combo2))
        _dm21 = self.dm21(_dc21, i3, output_size = _em22.size())
        # second layer
        _dc22 = F.relu(self.dc22(_dm21))
        _dm22 = self.dm22(_dc22, i2)
        # third layer
        _dc23 = F.relu(self.dc23(_dm22))
        _dm23 = self.dm23(_dc23, i1)

        re_x2 = torch.tanh(self.dc24(_dm23))

        return re_x1, re_x2

In [8]:
# Now lets train our model

# prepare for GPU training 
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
torch.cuda.empty_cache()

# and lets set the hyperparameters! 

dropout = 0.25
learning_rate = 0.0002
num_epochs = 15
chunk_size = 128
forward_chunk = 8 
forward_chunk_size = 8 # this is smaller due to memory constrains 

# select which model - you could load your own or put it in the function above
model = Autoencoder().to(device)

loss_fn = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

cuda


In [5]:

def train_model(filepath):
    model.train()
    for epoch in range(num_epochs):
        val_loss = 0
        for i, sample in enumerate(test_dataloader):
            x_modality = []
            y_modality = []
            for j, modalities in enumerate(sample):
                if j == 0:
                    x_modality = modalities
                if j == 1:
                    y_modality = modalities
            for modality in range(len(x_modality)):
                sample1 = x_modality[0]
                sample2 = x_modality[1]
                # label = y_modality[modality] # dont need label for training
                sample1 = sample1.to(device, dtype=torch.float)
                sample2 = sample2.to(device, dtype=torch.float)
                if sample1.shape[1] > sample2.shape[1]:
                    sample1 = sample1[:, :sample2.shape[1], :, :, :]
                if sample1.shape[1] < sample2.shape[1]:
                    sample2 = sample2[:, :sample1.shape[1], :, :, :]
                # split sample into smaller sizes due to GPU memory constraints
                chunks1 = torch.split(sample1, chunk_size, dim=1)
                chunks2 = torch.split(sample2, chunk_size, dim=1)
                for k in range(len(chunks1)):
                    output1, output2 = model(chunks1[k], chunks2[k])
                    output1 = output1.to(device).permute(1, 0, 2, 3, 4)
                    output2 = output2.to(device).permute(1, 0, 2, 3, 4)
                    model.zero_grad()
                    loss1 = loss_fn(output1, chunks1[k])
                    loss2 = loss_fn(output2, chunks2[k])
                    loss = loss1 + loss2
                    # ===================backward====================
                    # Getting gradients w.r.t. parameters
                    loss.backward()
                    # Updating parameters
                    optimizer.step()
                    # Clear gradients w.r.t. parameters
                    optimizer.zero_grad()
                    torch.cuda.empty_cache()
                            
        # ===================log========================
        print("epoch [{}/{}], loss:{:.4f}".format(epoch + 1, num_epochs, loss.item()))
        torch.save(model.state_dict(), filepath) # save the model each epoch at location filepath


In [6]:
def foward_pass(path):
    model.load_state_dict(torch.load(path)) # load a saved model 
    model.eval()
    frame_stats1 = [] 
    window_stats1 = [] 
    frame_stats2 = [] 
    window_stats2 = [] 

    with torch.no_grad():
        print("foward pass occuring")
        # just forward pass of model on test dataset
        for i, sample in enumerate(test_dataloader):
            x_modality = []
            y_modality = []
            for j, modalities in enumerate(sample):
                if j == 0:
                    x_modality = modalities
                if j == 1:
                    y_modality = modalities
            for modality in range(len(x_modality)):
                sample1 = x_modality[0]
                sample2 = x_modality[1]
                label1 = y_modality[0] # dont need label for training
                torch.cuda.empty_cache()
                sample1 = sample1.to(device, dtype=torch.float)
                sample2 = sample2.to(device, dtype=torch.float)
                if sample1.shape[1] > sample2.shape[1]:
                    sample1 = sample1[:, :sample2.shape[1], :, :, :]
                if sample1.shape[1] < sample2.shape[1]:
                    sample2 = sample2[:, :sample1.shape[1], :, :, :]
                # split sample into smaller sizes due to GPU memory constraints
                chunks1 = torch.split(sample1, chunk_size, dim=1)
                chunks2 = torch.split(sample2, chunk_size, dim=1)
                recon_vid1 = []
                recon_vid2 = []
                for k in range(len(chunks1)):
                    output1, output2 = model(chunks1[k], chunks2[k])
                    output1 = output1.to(device).permute(1, 0, 2, 3, 4)
                    output2 = output2.to(device).permute(1, 0, 2, 3, 4)
                    recon_vid1.append(output1)
                    recon_vid2.append(output2)
                    torch.cuda.empty_cache()

                output1 = torch.cat(recon_vid1, dim=1)
                output2 = torch.cat(recon_vid2, dim=1)
                # convert tensors to numpy arrays for easy manipluations
                sample = sample1.data.cpu().numpy()
                output = output1.data.cpu().numpy()
                labels = label1.data.cpu().numpy()

                frame_mean, frame_std, frame_labels = get_frame_metrics(output, sample, labels, window_len)
                mean_window_error, std_window_error, window_labels = get_window_metrics(output, sample, labels, window_len)
                frame_stats1.append([frame_mean, frame_std, frame_labels])
                window_stats1.append([mean_window_error, std_window_error, window_labels])

                sample = sample2.data.cpu().numpy()
                output = output2.data.cpu().numpy()
                labels = label1.data.cpu().numpy()

                frame_mean, frame_std, frame_labels = get_frame_metrics(output, sample, labels, window_len)
                mean_window_error, std_window_error, window_labels = get_window_metrics(output, sample, labels, window_len)
                frame_stats2.append([frame_mean, frame_std, frame_labels])
                window_stats2.append([mean_window_error, std_window_error, window_labels])


                '''
                if j % 50 == 0:
                    animate(sample[0, :, :, :, :], output[0, :, :, :, :], frame_mean, dset, start_time)
                '''
                    

    return(frame_stats1, window_stats1, frame_stats2, window_stats2)

In [9]:
start_time = str(datetime.datetime.today().strftime("%Y-%m-%d-%H-%M-%S"))
print(start_time)
filepath = (
    "Models\\"
    + 'MultiModalFusion_ZED_RGB_DEPTH'
    + start_time
)
#filepath = 'Models\MultiModalFusion2020-12-14-15-52-54'
#print(filepath)
train_model(filepath)
import functions
from functions import animate


frame_stats1, window_stats1, frame_stats2, window_stats2 = foward_pass(filepath)

2020-12-18-00-23-43
epoch [1/15], loss:0.0038
epoch [2/15], loss:0.0034
epoch [3/15], loss:0.0032
epoch [4/15], loss:0.0030
epoch [5/15], loss:0.0028
epoch [6/15], loss:0.0028
epoch [7/15], loss:0.0028
epoch [8/15], loss:0.0028
epoch [9/15], loss:0.0028
epoch [10/15], loss:0.0028
epoch [11/15], loss:0.0027
epoch [12/15], loss:0.0027
epoch [13/15], loss:0.0027
epoch [14/15], loss:0.0026
epoch [15/15], loss:0.0026
foward pass occuring


In [11]:
import functions
from functions import get_total_performance_metrics
from importlib import reload
reload(functions)
from functions import get_total_performance_metrics

print(len(frame_stats1))
get_total_performance_metrics(frame_stats1, window_stats1, window_len)
input("Press Enter to continue...")   
get_total_performance_metrics(frame_stats2, window_stats2, window_len)

#get_total_performance_metrics(originals, reconstruced, testing_labels, window_len)

594
(594, 5, 8)
saving
(594, 5, 8)
saving
