In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.insert(0,"../")

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
#default_exp model.encoder

In [None]:
#export

import torch.nn as nn
import torch
from core.model.scene_graph.scene_graph import SceneGraph
from torchvision.models import resnet34

In [None]:
from core.dataloader import CLEVR_train, collate_boxes
from torch.utils.data import Dataset, DataLoader

In [None]:
# Build the training dataset from the machine-specific absolute path below;
# adjust it when running on another machine.
train_dataset = CLEVR_train(root_dir='/home/mprabhud/dataset/clevr_lang/npys/ab_5t.txt')
# Shuffled mini-batches of 5 samples, assembled by the project's `collate_boxes`.
train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True, collate_fn=collate_boxes)

Initialised..... 234  files...


In [None]:
# Pull only the first batch out of the loader; the rest of the notebook
# experiments on this single batch.
for b in train_loader:
    # A batch unpacks into two feed dicts (presumably the query and key views
    # of the same scenes -- confirm against `collate_boxes`) plus metadata.
    feed_dict_q, feed_dict_k, metadata = b
    break

In [None]:
# Move each view's images to the GPU. The query images must come from the
# query dict itself; copying them from `feed_dict_k` would make both views
# carry identical images.
feed_dict_q["images"] = feed_dict_q["images"].cuda()
feed_dict_k["images"] = feed_dict_k["images"].cuda()

In [None]:
#export

class Encoder(nn.Module):
    """Image encoder that produces per-object node and pairwise spatial embeddings.

    A truncated ResNet-34 turns the input images into a feature map, and the
    ``SceneGraph`` module turns that map plus the object boxes into one
    ``[node_embeddings, spatial_embeddings]`` entry per batch element. In
    ``"spatial"`` mode, when a relative viewpoint is supplied, the pose vector
    is concatenated to every spatial embedding and projected back to ``dim``
    channels by a small MLP.
    """

    def __init__(self, dim = 256, mode=None, pose_dim=7):
        """
        Input:
            dim : final number of dimensions of the node and spatial embeddings
            mode : "node" or "spatial"; selects what ``forward`` returns
            pose_dim : length of the relative-viewpoint vector that is
                       concatenated to each spatial embedding (default 7)

        Returns:
            Initialises a model which has node embeddings and spatial embeddings
        """
        super().__init__()

        self.dim = dim
        self.mode = mode
        self.pose_dim = pose_dim
        self.resnet = resnet34(pretrained=True)
        # Keep every child of the ResNet except the last three.
        # NOTE(review): for torchvision's resnet34 this presumably leaves the
        # stride-16 feature map, which would match downsample_rate=16 below
        # -- confirm against the SceneGraph implementation.
        self.feature_extractor = nn.Sequential(*list(self.resnet.children())[:-3])

        self.scene_graph = SceneGraph(feature_dim=self.dim,
                                      output_dims=[self.dim, self.dim],
                                      downsample_rate=16,
                                      mode=self.mode)

        # The MLP consumes a spatial embedding with the pose vector prepended,
        # hence pose_dim + dim input features (7 + 256 = 263 by default).
        self.spatial_viewpoint_transformation = nn.Sequential(
            nn.Linear(self.pose_dim + self.dim, 512),
            nn.ReLU(),
            nn.Linear(512, self.dim))

        if self.mode == "spatial":
            self.set_parameter_requires_grad()

    def set_parameter_requires_grad(self):
        """Freeze the feature extractor so that its weights receive no gradients."""
        # Assigning ``requires_grad`` on the module object only creates a plain
        # Python attribute; the flag has to be cleared on every parameter.
        for param in self.feature_extractor.parameters():
            param.requires_grad = False

    def merge_pose_with_scene_embeddings(self,
                                         scene_embeddings,
                                         view=None):
        '''
        Input
            scene_embeddings: output of scene_graph module. A list with one
                              [node_embeddings, spatial_embeddings] entry per
                              batch element
            view : a tensor indexed by batch element holding the relative
                   egomotion between the two camera viewpoints
                   (e.g. size [batch, 1, 7])
        Output
            scene_embeddings: the same list, with each entry's spatial
                              embeddings replaced (in place) by
                              [pose, spatial_embedding] concatenated on the
                              last dimension
        Raises
            ValueError: if ``view`` is None
        '''
        if view is None:
            raise ValueError("view must be provided to merge pose with scene embeddings")

        for batch_ind, (_, spatial_embeddings) in enumerate(scene_embeddings):
            num_obj_x, num_obj_y = spatial_embeddings.shape[:2]

            # Align the pose with the embeddings' device/dtype so the
            # concatenation works even if the metadata was left on the CPU.
            pose = view[batch_ind].reshape(1, 1, -1).to(device=spatial_embeddings.device,
                                                        dtype=spatial_embeddings.dtype)
            # Broadcast the pose to every (subject, object) pair
            view_spatial = pose.expand(num_obj_x, num_obj_y, -1)
            # Concatenate with the visual embeddings and reassign in place
            scene_embeddings[batch_ind][1] = torch.cat((view_spatial, spatial_embeddings), dim=2)

        return scene_embeddings

    def do_viewpoint_transformation(self,
                                    scene_embeddings,
                                    transform_node=True,
                                    transform_spatial=False):

        '''
        Input:
            scene_embeddings: output of scene_graph module concatenated with
                              pose. A list with one
                              [node_embeddings, spatial_embeddings] entry per
                              batch element
            transform_node and transform_spatial: accepted for interface
                              compatibility but currently unused; the spatial
                              embeddings are always transformed and the node
                              embeddings never are
        Output:
            scene_embeddings: the same list, with the spatial embeddings
                              replaced (in place) by their viewpoint-transformed
                              version
        '''
        for ind, (_, spatial_embeddings) in enumerate(scene_embeddings):
            # Do viewpoint transformation on spatial embeddings
            scene_embeddings[ind][1] = self.spatial_viewpoint_transformation(spatial_embeddings)

        return scene_embeddings

    def forward(self,
                feed_dict,
                rel_viewpoint=None):
        """
        Input:
            feed_dict: a dictionary; this method reads the keys "images",
                       "objects_boxes" and "objects"
            rel_viewpoint: optional relative-viewpoint tensor. Only used in
                       "spatial" mode, where it triggers pose merging and the
                       viewpoint transformation
        Output:
            the scene_graph output (one [node, spatial] entry per batch
            element); in "spatial" mode with rel_viewpoint the spatial
            embeddings are the viewpoint-transformed ones
        Raises:
            ValueError: if self.mode is neither "node" nor "spatial"
        """
        mode = self.mode
        # Fail loudly instead of silently returning None for an unsupported mode.
        if mode not in ("node", "spatial"):
            raise ValueError("Mode should be either node or spatial")

        image_features = self.feature_extractor(feed_dict["images"])
        outputs = self.scene_graph(image_features, feed_dict["objects_boxes"], feed_dict["objects"])

        if mode == "spatial" and rel_viewpoint is not None:
            outputs = self.merge_pose_with_scene_embeddings(outputs, rel_viewpoint)
            outputs = self.do_viewpoint_transformation(outputs)

        return outputs

## Testing the Encoder

##### **Mode** : Node Embeddings

In [None]:
encoder = Encoder(mode="node")
# Rebind the same name (the original assigned to a misspelled `enoder`).
encoder = encoder.cuda()

In [None]:
feed_dict_ = feed_dict_k

In [None]:
node_outputs_ = encoder(feed_dict_)

In [None]:
# Run the encoder's two stages by hand (feature extractor, then the
# scene_graph module) to inspect the raw scene-graph output.
image_features_ = encoder.feature_extractor(feed_dict_["images"])
scene_graph_output = encoder.scene_graph(image_features_, feed_dict_["objects_boxes"], feed_dict_["objects"])

In [None]:
batch_ind = 3

In [None]:
len(scene_graph_output), scene_graph_output[batch_ind][0].shape

(5, torch.Size([2, 256]))

##### **Mode** : Spatial Embeddings

In [None]:
encoder = Encoder(mode="spatial")
# Rebind the same name (the original assigned to a misspelled `enoder`).
encoder = encoder.cuda()

In [None]:
rel_viewpoint_ = metadata["rel_viewpoint"]

In [None]:
spatial_outputs_ = encoder(feed_dict_, rel_viewpoint= rel_viewpoint_ )

Ading viewpoint information to spatial features
0
Adding pose to spatial embeddings
1
Adding pose to spatial embeddings
2
Adding pose to spatial embeddings
3
Adding pose to spatial embeddings
4
Adding pose to spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings


In [None]:
len(spatial_outputs_), spatial_outputs_[batch_ind][0].shape

(5, torch.Size([2, 256]))

In [None]:
len(spatial_outputs_), spatial_outputs_[batch_ind][1].shape

(5, torch.Size([2, 2, 256]))

## Matching the Nodes

In [None]:
feed_dict_k_ = feed_dict_k
feed_dict_q_ = feed_dict_q

In [None]:
encoder = Encoder(mode="node")
# Rebind the same name (the original assigned to a misspelled `enoder`).
encoder = encoder.cuda()

In [None]:
output_k_ = encoder(feed_dict_k_)

In [None]:
output_q_ = encoder(feed_dict_q_)

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
def pair_embeddings(output_k, output_q, mode = "node"):
    """Re-order the embeddings in ``output_q`` so object i lines up with object i of ``output_k``.

    For every batch element, each key object is greedily matched (in index
    order) to its nearest not-yet-taken query object, using the Euclidean
    distance between node embeddings. The query node embeddings are then
    permuted accordingly; in ``"spatial"`` mode the pairwise spatial
    embeddings are permuted along both object axes as well.

    Input:
        output_k, output_q: lists with one mutable
                            [node_embeddings, spatial_embeddings] entry per
                            batch element. Node embeddings are indexed by
                            object on dim 0; spatial embeddings by
                            (subject, object) on dims 0 and 1.
        mode: "node" or "spatial"
    Output:
        (output_k, output_q): the SAME list objects that were passed in;
        their entries are updated in place.
    Raises:
        ValueError: if ``mode`` is neither "node" nor "spatial"
    """
    if mode not in ("node", "spatial"):
        raise ValueError("Mode should be either node or spatial")
    # Keep the flag separate from ``mode``: rebinding ``mode`` to an integer
    # would make every later ``mode == "spatial"`` test false.
    match_spatial = mode == "spatial"

    num_batch = len(output_k)
    assert num_batch == len(output_q)

    for batch_ind in range(num_batch):
        entry_k = output_k[batch_ind]
        entry_q = output_q[batch_ind]

        num_obj_in_batch = entry_k[0].shape[0]
        assert num_obj_in_batch == entry_q[0].shape[0]

        if match_spatial:
            # Both views need a square (num_obj x num_obj) grid of pair embeddings.
            for spatial in (entry_k[1], entry_q[1]):
                assert spatial.shape[0] == num_obj_in_batch
                assert spatial.shape[1] == num_obj_in_batch

        # Flatten the node features only, using their own feature width
        # rather than a hard-coded 256.
        entry_k[0] = entry_k[0].view(-1, entry_k[0].shape[-1])
        entry_q[0] = entry_q[0].view(-1, entry_q[0].shape[-1])

        # Rank, for every key object, all query objects by Euclidean distance.
        with torch.no_grad():
            distances = torch.cdist(entry_k[0].detach().float(),
                                    entry_q[0].detach().float())
            ranked = torch.argsort(distances, dim=1).tolist()

        # Greedy one-to-one assignment: take the closest query object that
        # has not been claimed by an earlier key object.
        paired = set()
        pairs = []
        for candidates in ranked:
            for candidate in candidates:
                if candidate not in paired:
                    paired.add(candidate)
                    pairs.append(candidate)
                    break

        print(pairs)
        assert num_obj_in_batch == len(pairs)

        # Rearranging the node features in output_q based on the pairs formed.
        # Indexing keeps the tensor on its original device and in the graph.
        order = torch.tensor(pairs, dtype=torch.long, device=entry_q[0].device)
        entry_q[0] = entry_q[0][order]

        # If mode is spatial: also re-pair the spatial embeddings on both axes
        if match_spatial:
            spatial_order = order.to(entry_q[1].device)
            entry_q[1] = entry_q[1][spatial_order][:, spatial_order]

    return output_k, output_q

In [None]:
rearranged_output_k, rearranged_output_q = pair_embeddings(output_k_, output_q_, mode = "node")

[1, 0]
[1, 0]
[0, 1]
[0, 1]
[0, 1]


In [None]:
# NOTE: `pair_embeddings` returns the very list it was given (its entries are
# updated in place), so this compares the object with itself and is True
# regardless of whether the pairing is correct.
rearranged_output_k==output_k_

True

In [None]:
# NOTE: `pair_embeddings` returns the very list it was given (its entries are
# updated in place), so this compares the object with itself and is True
# regardless of whether the pairing is correct.
rearranged_output_q==output_q_

True

## Matching the spatial embeddings

In [None]:
encoder = Encoder(mode="spatial")
# Rebind the same name (the original assigned to a misspelled `enoder`).
encoder = encoder.cuda()

In [None]:
output_k__ = encoder(feed_dict_k_)
output_q__ = encoder(feed_dict_q_, rel_viewpoint_)

Ading viewpoint information to spatial features
0
Adding pose to spatial embeddings
1
Adding pose to spatial embeddings
2
Adding pose to spatial embeddings
3
Adding pose to spatial embeddings
4
Adding pose to spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings
viewpoint transform on spatial embeddings


In [None]:
rearranged_output_k_, rearranged_output_q_= pair_embeddings(output_k__, output_q__, mode = "spatial")


#Code breaking resolve later

[1, 0]
[1, 0]
[0, 1]
[0, 1]
[0, 1]


## Flatten the embeddings across batch

In [None]:
def stack_features_across_batch(output_feature_list, mode="node"):
    """Concatenate the embeddings of every batch element into one 2-D tensor.

    Input:
        output_feature_list: list with one [node_embeddings, spatial_embeddings]
                             entry per batch element
        mode: "node" stacks the node embeddings, "spatial" stacks the spatial
              embeddings
    Output:
        a tensor of shape [total_rows, feature_dim]; each entry is flattened to
        rows of its own last-dimension width before concatenation
    Raises:
        ValueError: if ``mode`` is not "node" or "spatial", or if
                    ``output_feature_list`` is empty
    """
    if mode == "node":
        slot = 0
    elif mode == "spatial":
        slot = 1
    else:
        raise ValueError("Training mode not defined properly. It should be either 'node' or 'spatial'." )

    if not output_feature_list:
        raise ValueError("output_feature_list must contain at least one batch element")

    # Flatten every entry the same way (the feature width is read from the
    # tensor rather than hard-coded) and concatenate once instead of growing
    # the result pair by pair.
    flattened = [entry[slot].reshape(-1, entry[slot].shape[-1])
                 for entry in output_feature_list]
    return torch.cat(flattened, dim=0)

In [None]:
stacked_output_k = stack_features_across_batch(rearranged_output_k, mode="node")

In [None]:
stacked_output_k.shape

torch.Size([10, 256])

In [None]:
stacked_output_k[3] == rearranged_output_k[1][0][1]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [None]:
stacked_output_k_ = stack_features_across_batch(rearranged_output_k_, mode="spatial")

In [None]:
stacked_output_k_.shape

torch.Size([20, 256])