#### GMFlow Implementation

https://arxiv.org/abs/2111.13680

In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from nnflow import GMFlow

Using cache found in /home/goswami.p/.cache/torch/hub/facebookresearch_dino_main


In [3]:
from ezflow.models import build_model

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
def count_params(model):
    return str(sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000000) + "M params"

___

## GMFlow without refinement

##### Use `num_scales=1` and `upsample_factor=8` for GMFlow without refinement

In [6]:
model = build_model('GMFlow', cfg_path='../configs/gmflow/models/gmflow_v01.yaml', custom_cfg=True)

In [7]:
count_params(model) 

'4.680288M params'

In [6]:
# model = GMFlow(
#     feature_channels=128,
#     num_scales=1,
#     upsample_factor=8,
#     num_head=1,
#     attention_type='swin',
#     ffn_dim_expansion=4,
#     num_transformer_layers=6,
# )

In [23]:
img1, img2 = torch.randn(1,3,368,496), torch.randn(1,3,368,496)

In [28]:
# flow_result = model(img1, img2,
#        attn_splits_list=[2],
#        corr_radius_list=[-1],
#        prop_radius_list=[-1],
#    )

flow_result = model(img1, img2)

In [29]:
flow_result.keys()

dict_keys(['flow_preds'])

In [30]:
len(flow_result['flow_preds'])

2

In [31]:
for flow in flow_result['flow_preds']:
    print(flow.shape)

torch.Size([1, 2, 368, 496])
torch.Size([1, 2, 368, 496])


In [22]:
model.eval()

flow_result = model(img1, img2)
print(flow_result.keys())

flow_result['flow_preds'][0].shape, flow_result['flow_upsampled'].shape

dict_keys(['flow_preds', 'flow_upsampled'])


(torch.Size([1, 2, 256, 256]), torch.Size([1, 2, 256, 256]))

____

## GMFlow with Refinement
##### Use `num_scales=2` and `upsample_factor=4` for GMFlow with refinement

In [17]:
model_with_refinement = GMFlow(
    feature_channels=128,
    num_scales=2,
    upsample_factor=4,
    num_head=1,
    attention_type='swin',
    ffn_dim_expansion=4,
    num_transformer_layers=6,
)

In [19]:
flow_result = model_with_refinement(img1, img2,
       attn_splits_list=[2, 8],
       corr_radius_list=[-1, 4],
       prop_radius_list=[-1, 1],
   )

In [20]:
for flow in flow_result['flow_preds']:
    print(flow.shape)

torch.Size([1, 2, 256, 256])
torch.Size([1, 2, 256, 256])
torch.Size([1, 2, 256, 256])
torch.Size([1, 2, 256, 256])


___

## GMFlowV2

An end-to-end Transformer model

In [6]:
from nnflow import GMFlowV2

In [37]:
model = build_model('GMFlowV2', cfg_path='../configs/gmflow/models/gmflow_v07.yaml', custom_cfg=True)

In [38]:
count_params(model)

'5.34112M params'

In [26]:
img1, img2 = torch.randn(1,3,368,496), torch.randn(1,3,368,496)

# img1 = img1.to(device)
# img2 = img2.to(device)
# model.to(device)

output = model(img1, img2)

for flow in output['flow_preds']:
    print(flow.shape)

torch.Size([1, 2, 368, 496])
torch.Size([1, 2, 368, 496])


___

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from nnflow.models.gmflow.backbone import CNNEncoder
from nnflow.models.gmflow.transformer import FeatureTransformer, FeatureFlowAttention
from nnflow.models.gmflow.matching import global_correlation_softmax, local_correlation_softmax
from nnflow.models.gmflow.geometry import flow_warp
from nnflow.models.gmflow.utils import normalize_img, feature_add_position
from nnflow.models.gmflow import SwinEncoderV2

2022-10-09 13:18:53.532782: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-09 13:18:53.865788: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-09 13:18:55.197388: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /shared/centos7/cuda/11.3/lib64:/shared/centos7/nodejs/14.15.4/lib
2022-10-09 13:18:55.197519: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.

In [2]:
from ezflow.encoder import ENCODER_REGISTRY,build_encoder
from ezflow.modules import BaseModule
from ezflow.config import get_cfg

In [3]:
class GMFlowV2(BaseModule):
    def __init__(self, cfg):
        
        super(GMFlowV2, self).__init__()

        self.cfg = cfg

        self.num_scales = cfg.MODEL.NUM_SCALES
        self.feature_channels = cfg.MODEL.FEATURE_CHANNELS
        self.upsample_factor = cfg.MODEL.UPSAMPLE_FACTOR
        self.num_head = cfg.MODEL.NUM_HEADS
        self.attention_type = cfg.MODEL.ATTENTION_TYPE
        self.ffn_dim_expansion = cfg.MODEL.FFN_DIM_EXPANSION
        self.num_transformer_layers = cfg.MODEL.NUM_TRANSFORMER_LAYERS

        self.attn_splits_list=cfg.MODEL.ATTN_SPLITS_LIST
        self.corr_radius_list=cfg.MODEL.CORR_RADIUS_LIST
        self.prop_radius_list=cfg.MODEL.PROP_RADIUS_LIST
        self.pred_bidir_flow=cfg.MODEL.PRED_BIDIR_FLOW
        
        self.use_sine_pos_embed=cfg.MODEL.USE_SINE_POS_EMBED

        # CNN backbone
        self.backbone = build_encoder(cfg.ENCODER)

        # Transformer
        self.transformer = FeatureTransformer(num_layers=self.num_transformer_layers,
                                              d_model=self.feature_channels,
                                              nhead=self.num_head,
                                              attention_type=self.attention_type,
                                              ffn_dim_expansion=self.ffn_dim_expansion,
                                              )

        # flow propagation with self-attn
        self.feature_flow_attn = FeatureFlowAttention(in_channels=self.feature_channels)

        # convex upsampling: concat feature0 and flow as input
        self.upsampler = nn.Sequential(nn.Conv2d(2 + self.feature_channels, 256, 3, 1, 1),
                                       nn.ReLU(inplace=True),
                                       nn.Conv2d(256, self.upsample_factor ** 2 * 9, 1, 1, 0))

    def extract_feature(self, img0, img1):
        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
        features = self.backbone(concat)  # list of [2B, C, H, W], resolution from high to low
        
        feature0, feature1 = [], []
        
        chunks = torch.chunk(features, 2, 0)  # tuple
        feature0.append(chunks[0])
        feature1.append(chunks[1])
        
        return feature0, feature1

        # reverse: resolution from low to high
        # features = features[::-1]

        # feature0, feature1 = [], []

        # for i in range(len(features)):
        #     feature = features[i]
        #     chunks = torch.chunk(feature, 2, 0)  # tuple
        #     feature0.append(chunks[0])
        #     feature1.append(chunks[1])

        # return feature0, feature1

    def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8,
                      ):
        if bilinear:
            up_flow = F.interpolate(flow, scale_factor=upsample_factor,
                                    mode='bilinear', align_corners=True) * upsample_factor

        else:
            # convex upsampling
            concat = torch.cat((flow, feature), dim=1)

            mask = self.upsampler(concat)
            b, flow_channel, h, w = flow.shape
            mask = mask.view(b, 1, 9, self.upsample_factor, self.upsample_factor, h, w)  # [B, 1, 9, K, K, H, W]
            mask = torch.softmax(mask, dim=2)

            up_flow = F.unfold(self.upsample_factor * flow, [3, 3], padding=1)
            up_flow = up_flow.view(b, flow_channel, 9, 1, 1, h, w)  # [B, 2, 9, 1, 1, H, W]

            up_flow = torch.sum(mask * up_flow, dim=2)  # [B, 2, K, K, H, W]
            up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)  # [B, 2, K, H, K, W]
            up_flow = up_flow.reshape(b, flow_channel, self.upsample_factor * h,
                                      self.upsample_factor * w)  # [B, 2, K*H, K*W]

        return up_flow

    def forward(self, img0, img1):

        results_dict = {}
        flow_preds = []

        # img0, img1 = normalize_img(img0, img1)  # [B, 3, H, W]

        # resolution low to high
        feature0_list, feature1_list = self.extract_feature(img0, img1)  # list of features        
        flow = None

        assert len(self.attn_splits_list) == len(self.corr_radius_list) == len(self.prop_radius_list) == self.num_scales

        for scale_idx in range(self.num_scales):
            feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]

            if self.pred_bidir_flow and scale_idx > 0:
                # predicting bidirectional flow with refinement
                feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)

            upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx))

            if scale_idx > 0:
                flow = F.interpolate(flow, scale_factor=2, mode='bilinear', align_corners=True) * 2

            if flow is not None:
                flow = flow.detach()
                feature1 = flow_warp(feature1, flow)  # [B, C, H, W]

            attn_splits = self.attn_splits_list[scale_idx]
            corr_radius = self.corr_radius_list[scale_idx]
            prop_radius = self.prop_radius_list[scale_idx]

            # add position to features
            if self.use_sine_pos_embed:
                feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels)

            # Transformer
            feature0, feature1 = self.transformer(feature0, feature1, attn_num_splits=attn_splits)

            # correlation and softmax
            if corr_radius == -1:  # global matching
                flow_pred = global_correlation_softmax(feature0, feature1, self.pred_bidir_flow)[0]
            else:  # local matching
                flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0]

            # flow or residual flow
            flow = flow + flow_pred if flow is not None else flow_pred

            # upsample to the original resolution for supervison
            if self.training:  # only need to upsample intermediate flow predictions at training time
                flow_bilinear = self.upsample_flow(flow, None, bilinear=True, upsample_factor=upsample_factor)
                flow_preds.append(flow_bilinear)

            # flow propagation with self-attn
            if self.pred_bidir_flow and scale_idx == 0:
                feature0 = torch.cat((feature0, feature1), dim=0)  # [2*B, C, H, W] for propagation
            flow = self.feature_flow_attn(feature0, flow.detach(),
                                          local_window_attn=prop_radius > 0,
                                          local_window_radius=prop_radius)
            
            # bilinear upsampling at training time except the last one
            if self.training and scale_idx < self.num_scales - 1:
                flow_up = self.upsample_flow(flow, feature0, bilinear=True, upsample_factor=upsample_factor)
                flow_preds.append(flow_up)

            if scale_idx == self.num_scales - 1:
                flow_up = self.upsample_flow(flow, feature0)
                flow_preds.append(flow_up)

        results_dict.update({'flow_preds': flow_preds})

        if not self.training:
            results_dict["flow_upsampled"] = results_dict["flow_preds"][0]

        return results_dict

In [4]:
cfg = get_cfg(cfg_path='../configs/gmflow/models/gmflow_v02.yaml', custom=True)

In [5]:
model = GMFlowV2(cfg)

unfold input: torch.Size([1, 1, 96, 128])


unfold input: torch.Size([1, 1, 96, 128])


unfold input: torch.Size([1, 1, 48, 64])


unfold input: torch.Size([1, 1, 48, 64])


unfold input: torch.Size([1, 1, 48, 64])




  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [33]:
count_params(model.backbone)

'1.466816M params'

In [6]:
img1, img2 = torch.randn(1,3,384,512), torch.randn(1,3,384,512)

___

In [21]:
img1.shape, img2.shape

(torch.Size([1, 3, 368, 496]), torch.Size([1, 3, 368, 496]))

In [24]:
encoder = CNNEncoder(output_dim=128, num_output_scales=1)
count_params(encoder)

'1.04912M params'

In [19]:
feats = encoder(torch.randn(2, 3, 256, 256))[0]
feats.shape

torch.Size([2, 64, 768])

In [1]:
from nnflow.models.gmflow import SwinEncoderV2

Using cache found in /home/goswami.p/.cache/torch/hub/facebookresearch_dino_main


In [2]:
encoderv2 = SwinEncoderV2(
        embedding_channels=64,
        depths=(4, 6),
        input_resolution=(384, 512),
        number_of_heads=(8, 16),
        window_size=8
)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [3]:
encoderv2.patch_size

4

In [239]:
count_params(encoderv2)

'1.466816M params'

In [240]:
feats = encoderv2(torch.randn(1,3, 384, 512))

torch.Size([1, 64, 96, 128])
torch.Size([1, 128, 48, 64])


In [241]:
feats.shape

torch.Size([1, 128, 48, 64])

___

In [None]:
from transformers import Swinv2Config, Swinv2Model

In [None]:
configuration = Swinv2Config()
configuration.depths=[4, 6]
configuration.embed_dim=64
configuration.num_heads=[8,16]
configuration.window_size=8
# configuration.image_size=[368,496]

In [None]:
configuration

Swinv2Config {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    4,
    6
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 64,
  "encoder_stride": 32,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "swinv2",
  "num_channels": 3,
  "num_heads": [
    8,
    16
  ],
  "num_layers": 4,
  "patch_size": 4,
  "path_norm": true,
  "pretrained_window_sizes": [
    0,
    0,
    0,
    0
  ],
  "qkv_bias": true,
  "transformers_version": "4.22.2",
  "use_absolute_embeddings": false,
  "window_size": 8
}

In [None]:
encoderv3 = Swinv2Model(configuration, add_pooling_layer=False)

In [None]:
count_params(encoderv3)

'1.506112M params'

In [None]:
feats = encoderv3(torch.randn(1,3, 368, 496), return_dict=False)

In [82]:
type(feats), len(feats)

(tuple, 2)

In [83]:
feats[0].shape, feats[1]

(torch.Size([1, 2852, 128]), None)

In [191]:
_, C, H, W = img1.shape
H, W

(368, 496)

In [55]:
368 / 8, 496 /8, 46 * 62

(46.0, 62.0, 2852)

In [87]:
feats = feats[0].permute(0,2,1)
feats.shape

torch.Size([1, 128, 2852])

In [75]:
b,c,_ = feats.shape

In [79]:
_feats = feats.reshape(b,c, 46, 62)
_feats.shape

torch.Size([1, 128, 46, 62])

In [56]:
from einops import rearrange

In [None]:
feat = rearrange(feats[0], 'n (h w) c -> n c h w')
feat.shape

EinopsError:  Error while processing rearrange-reduction pattern "n (h/8 w/8) c -> n c h w".
 Input tensor shape: torch.Size([1, 2852, 128]). Additional info: {}.
 Unknown character '/'

___