# パッケージ・パス

In [1]:
# パスへの移動
import os
project_path = '/workspace/'
os.chdir(project_path)


# パッケージのロード
import torch
import hydra
from omegaconf import DictConfig
from torch.utils.data import DataLoader
import random
import numpy as np
from src.models.evflownet import EVFlowNet
from src.datasets import DatasetProvider
from enum import Enum, auto
from src.datasets import train_collate
from tqdm import tqdm
from pathlib import Path
from typing import Dict, Any
import os
import time

In [2]:
!pip install hydra-core --upgrade
!pip install hdf5plugin
!sudo apt-get update


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists... Done


In [3]:
# %env HYDRA_FULL_ERROR=1
# !python /workspace/main.py

In [4]:
# hydra用のyamlファイルを読んでargsにロードする。

import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

with initialize_config_dir(version_base=None, config_dir="/workspace/configs"):
    args = compose(config_name="base")

print(OmegaConf.to_yaml(args))

dataset_path: data
seed: 42
num_epoch: 10
data_loader:
  common:
    num_voxel_bins: 15
  train:
    batch_size: 8
    shuffle: false
  test:
    batch_size: 1
    shuffle: false
train:
  no_batch_norm: false
  initial_learning_rate: 0.01
  weight_decay: 0.0001
  epochs: 10



In [5]:
# 関数の定義
class RepresentationType(Enum):
    VOXEL = auto()
    STEPAN = auto()

def set_seed(seed):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

def compute_epe_error(pred_flow: torch.Tensor, gt_flow: torch.Tensor):
    '''
    end-point-error (ground truthと予測値の二乗誤差)を計算
    pred_flow: torch.Tensor, Shape: torch.Size([B, 2, 480, 640]) => 予測したオプティカルフローデータ
    gt_flow: torch.Tensor, Shape: torch.Size([B, 2, 480, 640]) => 正解のオプティカルフローデータ
    '''
    epe = torch.mean(torch.mean(torch.norm(pred_flow - gt_flow, p=2, dim=1), dim=(1, 2)), dim=0)
    return epe

def save_optical_flow_to_npy(flow: torch.Tensor, file_name: str):
    '''
    optical flowをnpyファイルに保存
    flow: torch.Tensor, Shape: torch.Size([2, 480, 640]) => オプティカルフローデータ
    file_name: str => ファイル名
    '''
    np.save(f"{file_name}.npy", flow.cpu().numpy())


In [6]:
# モデルの保存ディレクトリの作成・データのロード。

# Create the directory if it doesn't exist
if not os.path.exists('checkpoints'):
    os.makedirs('checkpoints')

set_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
'''
    ディレクトリ構造:

    data
    ├─test
    |  ├─test_city
    |  |    ├─events_left
    |  |    |   ├─events.h5
    |  |    |   └─rectify_map.h5
    |  |    └─forward_timestamps.txt
    └─train
        ├─zurich_city_11_a
        |    ├─events_left
        |    |       ├─ events.h5
        |    |       └─ rectify_map.h5
        |    ├─ flow_forward
        |    |       ├─ 000134.png
        |    |       |.....
        |    └─ forward_timestamps.txt
        ├─zurich_city_11_b
        └─zurich_city_11_c
    '''

# ------------------
#    Dataloader
# ------------------
loader = DatasetProvider(
    dataset_path=Path(args.dataset_path),
    representation_type=RepresentationType.VOXEL,
    delta_t_ms=100,
    num_bins=4
)
train_set = loader.get_train_dataset()
test_set = loader.get_test_dataset()
collate_fn = train_collate


train_data = DataLoader(train_set,
                                batch_size=args.data_loader.train.batch_size,
                                shuffle=args.data_loader.train.shuffle,
                                collate_fn=collate_fn,
                                drop_last=False)
test_data = DataLoader(test_set,
                                batch_size=args.data_loader.test.batch_size,
                                shuffle=args.data_loader.test.shuffle,
                                collate_fn=collate_fn,
                                drop_last=False)

'''
train data:
    Type of batch: Dict
    Key: seq_name, Type: list
    Key: event_volume, Type: torch.Tensor, Shape: torch.Size([Batch, 4, 480, 640]) => イベントデータのバッチ
    Key: flow_gt, Type: torch.Tensor, Shape: torch.Size([Batch, 2, 480, 640]) => オプティカルフローデータのバッチ
    Key: flow_gt_valid_mask, Type: torch.Tensor, Shape: torch.Size([Batch, 1, 480, 640]) => オプティカルフローデータのvalid. ベースラインでは使わない

test data:
    Type of batch: Dict
    Key: seq_name, Type: list
    Key: event_volume, Type: torch.Tensor, Shape: torch.Size([Batch, 4, 480, 640]) => イベントデータのバッチ
'''

'\ntrain data:\n    Type of batch: Dict\n    Key: seq_name, Type: list\n    Key: event_volume, Type: torch.Tensor, Shape: torch.Size([Batch, 4, 480, 640]) => イベントデータのバッチ\n    Key: flow_gt, Type: torch.Tensor, Shape: torch.Size([Batch, 2, 480, 640]) => オプティカルフローデータのバッチ\n    Key: flow_gt_valid_mask, Type: torch.Tensor, Shape: torch.Size([Batch, 1, 480, 640]) => オプティカルフローデータのvalid. ベースラインでは使わない\n\ntest data:\n    Type of batch: Dict\n    Key: seq_name, Type: list\n    Key: event_volume, Type: torch.Tensor, Shape: torch.Size([Batch, 4, 480, 640]) => イベントデータのバッチ\n'

In [7]:
train_set[0]['event_volume'].size()

torch.Size([4, 480, 640])

In [8]:
def save_model(model):
    current_time = time.strftime("%Y%m%d%H%M%S")
    model_path = f"checkpoints/model_{current_time}.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

    # ------------------
    #   Start predicting ()
    # ------------------
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    flow: torch.Tensor = torch.tensor([]).to(device)
    with torch.no_grad():
        print("start test")
        for batch in tqdm(test_data):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device)
            batch_flow = model(event_image) # [1, 2, 480, 640]
            flow = torch.cat((flow, batch_flow), dim=0)  # [N, 2, 480, 640]
        print("test done")
    # ------------------
    #  save submission
    # ------------------
    file_name = "submission"
    save_optical_flow_to_npy(flow, file_name)
    print("Submission saved")


def train_model(args, model, n_epoch=1):
    # ------------------
    #   optimizer
    # ------------------
    optimizer = torch.optim.Adam(model.parameters(), lr=args.train.initial_learning_rate, weight_decay=args.train.weight_decay)

    # ------------------
    #   Start training
    # ------------------
    model.train()
    for epoch in range(n_epoch):
        total_loss = 0
        print("on epoch: {}".format(epoch+1))
        for i, batch in enumerate(tqdm(train_data)):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device) # [B, 4, 480, 640]
            ground_truth_flow = batch["flow_gt"].to(device) # [B, 2, 480, 640]
            flow = model(event_image) # [B, 2, 480, 640]
            loss: torch.Tensor = compute_epe_error(flow, ground_truth_flow)
            print(f"batch {i} loss: {loss.item()}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_data)}')
        
        current_time = time.strftime("%Y%m%d%H%M%S")
        model_path = f"checkpoints/model_{current_time}.pth"
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")


    # ------------------
    #   Start predicting ()
    # ------------------
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    flow: torch.Tensor = torch.tensor([]).to(device)
    with torch.no_grad():
        print("start test")
        for batch in tqdm(test_data):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device)
            batch_flow = model(event_image) # [1, 2, 480, 640]
            flow = torch.cat((flow, batch_flow), dim=0)  # [N, 2, 480, 640]
        print("test done")
    # ------------------
    #  save submission
    # ------------------
    file_name = "submission"
    save_optical_flow_to_npy(flow, file_name)

In [22]:
def train_model(args, model, n_epoch=1):
    # ------------------
    #   optimizer
    # ------------------
    optimizer = torch.optim.Adam(model.parameters(), lr=args.train.initial_learning_rate, weight_decay=args.train.weight_decay)

    # ------------------
    #   Start training
    # ------------------
    model.train()
    for epoch in range(n_epoch):
        total_loss = 0
        print("on epoch: {}".format(epoch+1))
        for i, batch in enumerate(tqdm(train_data)):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device) # [B, 4, 480, 640]
            ground_truth_flow = batch["flow_gt"].to(device) # [B, 2, 480, 640]
            flow = model(event_image) # [B, 2, 480, 640]
            loss: torch.Tensor = compute_epe_error(flow, ground_truth_flow)
            print(f"batch {i} loss: {loss.item()}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if loss < 2.5:
                torch.save(model.state_dict(), "checkpoints/model_tmp.pth")
                print("tmp model saved!")


            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_data)}')
        
        current_time = time.strftime("%Y%m%d%H%M%S")
        model_path = f"checkpoints/model_{current_time}.pth"
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")


    # ------------------
    #   Start predicting ()
    # ------------------
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    flow: torch.Tensor = torch.tensor([]).to(device)
    with torch.no_grad():
        print("start test")
        for batch in tqdm(test_data):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device)
            batch_flow = model(event_image) # [1, 2, 480, 640]
            flow = torch.cat((flow, batch_flow), dim=0)  # [N, 2, 480, 640]
        print("test done")
    # ------------------
    #  save submission
    # ------------------
    file_name = "submission"
    save_optical_flow_to_npy(flow, file_name)

In [21]:
def evaluate(model, file_name="submission"):
    # ------------------
    #   Start predicting ()
    # ------------------
    model.load_state_dict(torch.load("checkpoints/model_tmp.pth", map_location=device))
    model.eval()
    flow: torch.Tensor = torch.tensor([]).to(device)
    with torch.no_grad():
        print("start test")
        for batch in tqdm(test_data):
            batch: Dict[str, Any]
            event_image = batch["event_volume"].to(device)
            batch_flow = model(event_image) # [1, 2, 480, 640]
            flow = torch.cat((flow, batch_flow), dim=0)  # [N, 2, 480, 640]
        print("test done")
    # ------------------
    #  save submission
    # ------------------
    save_optical_flow_to_npy(flow, file_name)

# 2d ver.

In [23]:
# model_path_loadのモデルをロード
model_path_load="/workspace/checkpoints/model_20240717025440.pth"
model = EVFlowNet(args.train).to(device)
model.load_state_dict(torch.load(model_path_load, map_location=device))
train_model(args, model)

evaluate(model)

on epoch: 1


  0%|          | 0/252 [00:00<?, ?it/s]

batch 0 loss: 5.663653426153224


  0%|          | 1/252 [00:24<1:40:30, 24.02s/it]

batch 1 loss: 9.096436697539687


  1%|          | 2/252 [00:48<1:40:46, 24.18s/it]

batch 2 loss: 16.774764216209267


  1%|          | 3/252 [01:11<1:38:44, 23.79s/it]

batch 3 loss: 4.297264297191079


  2%|▏         | 4/252 [01:34<1:37:22, 23.56s/it]

batch 4 loss: 3.9926119926452897


  2%|▏         | 5/252 [01:58<1:37:33, 23.70s/it]

batch 5 loss: 2.9989616225109317


  2%|▏         | 6/252 [02:23<1:37:54, 23.88s/it]

batch 6 loss: 2.839137585535814


  3%|▎         | 7/252 [02:46<1:36:56, 23.74s/it]

batch 7 loss: 2.878798417885876


  3%|▎         | 8/252 [03:10<1:36:38, 23.77s/it]

batch 8 loss: 2.8858405702379626


  4%|▎         | 9/252 [03:33<1:35:45, 23.64s/it]

batch 9 loss: 3.413568429723508


  4%|▍         | 10/252 [03:57<1:35:32, 23.69s/it]

batch 10 loss: 3.0218185383264835


  4%|▍         | 11/252 [04:21<1:35:44, 23.84s/it]

batch 11 loss: 2.558471203201143


  5%|▍         | 12/252 [04:45<1:35:40, 23.92s/it]

batch 12 loss: 3.6351077001823517


  5%|▌         | 13/252 [05:08<1:34:17, 23.67s/it]

batch 13 loss: 3.788629974202085


  6%|▌         | 14/252 [05:32<1:33:54, 23.67s/it]

batch 14 loss: 8.196597770160722


  6%|▌         | 15/252 [05:56<1:33:55, 23.78s/it]

batch 15 loss: 3.5964874247314067


  6%|▋         | 16/252 [06:20<1:33:33, 23.78s/it]

batch 16 loss: 3.228619726406445


  7%|▋         | 17/252 [06:43<1:32:59, 23.74s/it]

batch 17 loss: 7.997601200328916


  7%|▋         | 18/252 [07:08<1:32:58, 23.84s/it]

batch 18 loss: 4.869118022306216


  8%|▊         | 19/252 [07:31<1:32:30, 23.82s/it]

batch 19 loss: 5.055208189957071


  8%|▊         | 20/252 [07:55<1:31:59, 23.79s/it]

batch 20 loss: 7.1758264523021005


  8%|▊         | 21/252 [08:18<1:31:04, 23.65s/it]

batch 21 loss: 7.0196266048759774


  9%|▊         | 22/252 [08:42<1:30:29, 23.61s/it]

batch 22 loss: 5.00538333069028


  9%|▉         | 23/252 [09:05<1:30:04, 23.60s/it]

batch 23 loss: 3.7920572387137605


 10%|▉         | 24/252 [09:29<1:29:55, 23.66s/it]

batch 24 loss: 8.522993891265052


 10%|▉         | 25/252 [09:53<1:29:43, 23.72s/it]

batch 25 loss: 4.071835188830104


 10%|█         | 26/252 [10:17<1:28:59, 23.62s/it]

batch 26 loss: 4.461844003396163


 11%|█         | 27/252 [10:40<1:27:51, 23.43s/it]

batch 27 loss: 4.218791676562974


 11%|█         | 28/252 [11:04<1:28:14, 23.64s/it]

batch 28 loss: 3.6956655799057962


 12%|█▏        | 29/252 [11:27<1:28:03, 23.69s/it]

batch 29 loss: 7.577784088225249


 12%|█▏        | 30/252 [11:51<1:27:44, 23.71s/it]

batch 30 loss: 6.891739449635802


 12%|█▏        | 31/252 [12:15<1:27:07, 23.65s/it]

batch 31 loss: 4.309045923478886


 13%|█▎        | 32/252 [12:38<1:26:42, 23.65s/it]

batch 32 loss: 5.681415197376641


 13%|█▎        | 33/252 [13:02<1:26:16, 23.64s/it]

batch 33 loss: 3.2318092493513197


 13%|█▎        | 34/252 [13:26<1:25:46, 23.61s/it]

batch 34 loss: 2.9999616491171412


 14%|█▍        | 35/252 [13:49<1:25:33, 23.66s/it]

batch 35 loss: 2.505910030229362


 14%|█▍        | 36/252 [14:12<1:24:29, 23.47s/it]

batch 36 loss: 2.730997918729463


 15%|█▍        | 37/252 [14:35<1:23:43, 23.37s/it]

batch 37 loss: 6.819557102964949


 15%|█▌        | 38/252 [15:00<1:24:44, 23.76s/it]

batch 38 loss: 9.503727396688674


 15%|█▌        | 39/252 [15:25<1:25:44, 24.15s/it]

batch 39 loss: 11.353917759523476


 16%|█▌        | 40/252 [15:50<1:26:17, 24.42s/it]

batch 40 loss: 4.932374280968431


 16%|█▌        | 40/252 [16:15<1:26:07, 24.38s/it]


KeyboardInterrupt: 

# Mine version

In [14]:
# model = EVFlowNetMy(args.train).to(device)
# model.load_state_dict(torch.load("checkpoints/model_tmp.pth", map_location=device))
# evaluate(model, "submission_tmp")


In [17]:
from src.models.evflownet_my import EVFlowNetMy

# model_path_loadのモデルをロード
model_path_load="/workspace/checkpoints/model_20240717020629.pth"
model = EVFlowNetMy(args.train).to(device)
model.load_state_dict(torch.load(model_path_load, map_location=device))
train_model(args, model)

RuntimeError: Error(s) in loading state_dict for EVFlowNetMy:
	size mismatch for decoder4.general_conv2d.0.weight: copying a param with shape torch.Size([32, 130, 3, 3]) from checkpoint, the shape in current model is torch.Size([16, 130, 3, 3]).
	size mismatch for decoder4.general_conv2d.0.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for decoder4.general_conv2d.2.weight: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for decoder4.general_conv2d.2.bias: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for decoder4.general_conv2d.2.running_mean: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for decoder4.general_conv2d.2.running_var: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for decoder4.predict_flow.0.weight: copying a param with shape torch.Size([2, 32, 1, 1]) from checkpoint, the shape in current model is torch.Size([2, 16, 1, 1]).

In [None]:
save_model(model)

Model saved to checkpoints/model_20240717052428.pth
start test


  return F.conv2d(input, weight, bias, self.stride,
100%|██████████| 97/97 [00:12<00:00,  7.97it/s]


test done
Submission saved
