# Inception-resnetv2 based encoder

In [1]:
from InceptionResnetv2 import inceptionresnetv2

ir = inceptionresnetv2(num_classes=1000, pretrained='imagenet')

In [2]:
ir

InceptionResNetV2 (
  (conv2d_1a): BasicConv2d (
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True)
    (relu): ReLU ()
  )
  (conv2d_2a): BasicConv2d (
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True)
    (relu): ReLU ()
  )
  (conv2d_2b): BasicConv2d (
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True)
    (relu): ReLU ()
  )
  (maxpool_3a): MaxPool2d (size=(3, 3), stride=(2, 2), dilation=(1, 1))
  (conv2d_3b): BasicConv2d (
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True)
    (relu): ReLU ()
  )
  (conv2d_4a): BasicConv2d (
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(192

In [3]:
model = InceptionResnetVideoExtractor(ir)

In [8]:
model = InceptionResnetEncoder(ir)

In [9]:
inputs = torch.autograd.Variable(torch.randn(2,3,299,299))
out = model(inputs)

print (out.size())

torch.Size([2, 320, 35, 35])
torch.Size([2, 1088, 17, 17])
torch.Size([2, 2080, 8, 8])


RuntimeError: Given input size: (320x35x35). Calculated output size: (320x-12x-12). Output size is too small at /pytorch/torch/lib/THNN/generic/SpatialAveragePooling.c:64

In [4]:
inputs = torch.autograd.Variable(torch.randn(2,3,404,404))
out = model(inputs)

print (out.size())

torch.Size([2, 320, 48, 48])
torch.Size([2, 1088, 23, 23])
torch.Size([2, 2080, 11, 11])
torch.Size([2, 3, 404, 404])


In [7]:
import torch
import torch.nn as nn

class InceptionResnetEncoder(nn.Module):
    def __init__(self,
                 ir):
        super(InceptionResnetEncoder, self).__init__()
        
        self.stem = nn.Sequential(
            ir.conv2d_1a,
            ir.conv2d_2a,
            ir.conv2d_2b,
            ir.maxpool_3a,
            ir.conv2d_3b,
            ir.conv2d_4a,
            ir.maxpool_5a,
            ir.mixed_5b,
        )   
        
        self.mixed_6a = ir.mixed_6a
        self.mixed_7a = ir.mixed_7a
        
        self.avgpool1 = nn.AvgPool2d(48, stride=1)
        self.avgpool2 = nn.AvgPool2d(23, stride=1)
        self.avgpool3 = nn.AvgPool2d(11, stride=1)
        
        self.skip1 = ir.repeat
        self.skip2 = ir.repeat_1
        self.skip3 = ir.repeat_2

    def forward(self, frame):

        x1 = self.stem(frame)
        x1 = self.skip1(x1)
        print(x1.size())

        x1_resume = self.mixed_6a(x1)
        x2 = self.skip2(x1_resume)
        print(x2.size())

        x2_resume = self.mixed_7a(x2)
        x3 = self.skip3(x2_resume)
        print(x3.size())

        feature_vector = torch.cat((self.avgpool1(x1)
                                    .view(x1.size(0),x1.size(1)),
                                    self.avgpool2(x2)
                                    .view(x2.size(0),x2.size(1)),
                                    self.avgpool3(x3)
                                    .view(x3.size(0),x3.size(1)),
                                    ), dim=1)
        
        return feature_vector

In [2]:
import torch
import torch.nn as nn

class InceptionResnetVideoExtractor(nn.Module):
    def __init__(self,
                 ir):
        super(InceptionResnetVideoExtractor, self).__init__()
        
        self.stem = nn.Sequential(
            ir.conv2d_1a,
            ir.conv2d_2a,
            ir.conv2d_2b,
            ir.maxpool_3a,
            ir.conv2d_3b,
            ir.conv2d_4a,
            ir.maxpool_5a,
            ir.mixed_5b,
        )   
        
        self.mixed_6a = ir.mixed_6a
        self.mixed_7a = ir.mixed_7a
        
        self.avgpool1 = nn.AvgPool2d(48, stride=1)
        self.avgpool2 = nn.AvgPool2d(23, stride=1)
        self.avgpool3 = nn.AvgPool2d(11, stride=1)
        
        self.skip1 = ir.repeat
        self.skip2 = ir.repeat_1
        self.skip3 = ir.repeat_2

    def forward(self, x):
        
        out = []
        for frame in x:
            x1 = self.stem(frame)
            x1 = self.skip1(x1)
            print(x1.size())
            
            x1_resume = self.mixed_6a(x1)
            x2 = self.skip2(x1_resume)
            print(x2.size())
            
            x2_resume = self.mixed_7a(x2)
            x3 = self.skip3(x2_resume)
            print(x3.size())
            
            feature_vector = torch.cat((self.avgpool1(x1)
                                        .view(x1.size(0),x1.size(1)),
                                        self.avgpool2(x2)
                                        .view(x2.size(0),x2.size(1)),
                                        self.avgpool3(x3)
                                        .view(x3.size(0),x3.size(1)),
                                        ), dim=1)
            out.append(feature_vector)
            
        out = torch.stack ([features for features in out])             
        
        return out

# Densenet based encoder

In [1]:
import torchvision.models

densenet = torchvision.models.densenet161(pretrained=True)

In [3]:
model = DenseNetExtractor(densenet)

In [4]:
inputs = torch.autograd.Variable(torch.randn(2,2,3,404,404))
out = model(inputs)

print (out.size())

torch.Size([2, 384, 101, 101])
torch.Size([2, 768, 50, 50])
torch.Size([2, 2112, 25, 25])
torch.Size([2, 2208, 12, 12])
torch.Size([2, 384, 101, 101])
torch.Size([2, 768, 50, 50])
torch.Size([2, 2112, 25, 25])
torch.Size([2, 2208, 12, 12])
torch.Size([2, 2, 3264])


In [2]:
import torch
import torch.nn as nn

class DenseNetExtractor(nn.Module):
    def __init__(self,
                 dn):
        super(DenseNetExtractor, self).__init__()
        
        self.stem = nn.Sequential(
            dn.features.conv0,
            dn.features.norm0,
            dn.features.relu0,
            dn.features.pool0

        )   
        
        self.db1 = dn.features.denseblock1
        self.tr1 = dn.features.transition1
        
        self.db2 = dn.features.denseblock2
        self.tr2 = dn.features.transition2
        
        self.db3 = dn.features.denseblock3
        self.tr3 = dn.features.transition3
        
        self.db4 = dn.features.denseblock4
        
        self.avgpool1 = nn.AvgPool2d(101, stride=1)
        self.avgpool2 = nn.AvgPool2d(50, stride=1)
        self.avgpool3 = nn.AvgPool2d(25, stride=1)
        self.avgpool4 = nn.AvgPool2d(12, stride=1)        

    def forward(self, x):
        
        out = []
        for frame in x:
            x1 = self.stem(frame)
            x1 = self.db1(x1)
            
            x1_resume = self.tr1(x1)
            x2 = self.db2(x1_resume)

            x2_resume = self.tr2(x2)
            x3 = self.db3(x2_resume)

            
            x3_resume = self.tr3(x3)
            x4 = self.db4(x3_resume)
            
            feature_vector = torch.cat((self.avgpool1(x1)
                                        .view(x1.size(0),x1.size(1)),
                                        self.avgpool2(x2)
                                        .view(x2.size(0),x2.size(1)),
                                        self.avgpool3(x3)
                                        .view(x3.size(0),x3.size(1)),
                                        ), dim=1)
            out.append(feature_vector)
            
        out = torch.stack ([features for features in out])             
        
        return out

# Inception4 based encoder

In [1]:
1+1

2

In [2]:
from Inception4Cadene import inceptionv4

inception = inceptionv4(num_classes=1000, pretrained='imagenet')

In [None]:
            BasicConv2d(3, 32, kernel_size=3, stride=2),
            BasicConv2d(32, 32, kernel_size=3, stride=1),
            BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1),
            Mixed_3a(),
            Mixed_4a(),
            Mixed_5a(),
            Inception_A(),
            Inception_A(),
            Inception_A(),
            Inception_A(),
            Reduction_A(), # Mixed_6a
            Inception_B(),
            Inception_B(),
            Inception_B(),
            Inception_B(),
            Inception_B(),
            Inception_B(),
            Inception_B(),
            Reduction_B(), # Mixed_7a
            Inception_C(),
            Inception_C(),
            Inception_C()

In [13]:
import torch
import torch.nn as nn

class InceptionExtractor(nn.Module):
    def __init__(self,
                 inception):
        super(InceptionExtractor, self).__init__()
        
        self.stem = nn.Sequential(
            inception.features[0],
            inception.features[1],
            inception.features[2],
            inception.features[3],
            inception.features[4],
            inception.features[5],
        )
        
        self.block1 = nn.Sequential(
            inception.features[6],
            inception.features[7],
            inception.features[8],
            inception.features[9],
        )        
        
        self.tr1 = inception.features[10]
        
        self.block2 = nn.Sequential(
            inception.features[11],
            inception.features[12],
            inception.features[13],
            inception.features[14],
            inception.features[15],
            inception.features[16],
            inception.features[17],            
        )
        
        self.tr2 = inception.features[18]
        
        self.block3 = nn.Sequential(
            inception.features[19],
            inception.features[20],
            inception.features[21]        
        )        
        
        self.avgpool1 = nn.AvgPool2d(48, stride=1)
        self.avgpool2 = nn.AvgPool2d(23, stride=1)
        self.avgpool3 = nn.AvgPool2d(11, stride=1)
  

    def forward(self, x):
        
        out = []
        for frame in x:
            x1 = self.stem(frame)
            x1 = self.block1(x1)
            
            x1_resume = self.tr1(x1)
            x2 = self.block2(x1_resume)

            x2_resume = self.tr2(x2)
            x3 = self.block3(x2_resume)

            print(x1.size())
            print(x2.size())
            print(x3.size())
            
            feature_vector = torch.cat((self.avgpool1(x1)
                                        .view(x1.size(0),x1.size(1)),
                                        self.avgpool2(x2)
                                        .view(x2.size(0),x2.size(1)),
                                        self.avgpool3(x3)
                                        .view(x3.size(0),x3.size(1)),
                                        ), dim=1)
            out.append(feature_vector)
            
        out = torch.stack ([features for features in out])             
        
        return out

In [1]:
import torch
import torch.nn as nn

class InceptionEncoder(nn.Module):
    def __init__(self,
                 inception):
        super(InceptionEncoder, self).__init__()
        
        self.stem = nn.Sequential(
            inception.features[0],
            inception.features[1],
            inception.features[2],
            inception.features[3],
            inception.features[4],
            inception.features[5],
        )
        
        self.block1 = nn.Sequential(
            inception.features[6],
            inception.features[7],
            inception.features[8],
            inception.features[9],
        )        
        
        self.tr1 = inception.features[10]
        
        self.block2 = nn.Sequential(
            inception.features[11],
            inception.features[12],
            inception.features[13],
            inception.features[14],
            inception.features[15],
            inception.features[16],
            inception.features[17],            
        )
        
        self.tr2 = inception.features[18]
        
        self.block3 = nn.Sequential(
            inception.features[19],
            inception.features[20],
            inception.features[21]        
        )        
        
        self.avgpool1 = nn.AvgPool2d(48, stride=1)
        self.avgpool2 = nn.AvgPool2d(23, stride=1)
        self.avgpool3 = nn.AvgPool2d(11, stride=1)
  

    def forward(self, frame):
        x1 = self.stem(frame)
        x1 = self.block1(x1)

        x1_resume = self.tr1(x1)
        x2 = self.block2(x1_resume)

        x2_resume = self.tr2(x2)
        x3 = self.block3(x2_resume)

        print(x1.size())
        print(x2.size())
        print(x3.size())

        feature_vector = torch.cat((self.avgpool1(x1)
                                    .view(x1.size(0),x1.size(1)),
                                    self.avgpool2(x2)
                                    .view(x2.size(0),x2.size(1)),
                                    self.avgpool3(x3)
                                    .view(x3.size(0),x3.size(1)),
                                    ), dim=1)
        
        return feature_vector

In [14]:
inception = inceptionv4(num_classes=1000, pretrained='imagenet')
model = InceptionExtractor(inception)

In [None]:
model = InceptionEncoder()

In [15]:
inputs = torch.autograd.Variable(torch.randn(2,2,3,404,404))
out = model(inputs)

print (out.size())

torch.Size([2, 384, 48, 48])
torch.Size([2, 1024, 23, 23])
torch.Size([2, 1536, 11, 11])
torch.Size([2, 384, 48, 48])
torch.Size([2, 1024, 23, 23])
torch.Size([2, 1536, 11, 11])
torch.Size([2, 2, 2944])


In [5]:
1+!

SyntaxError: invalid syntax (<ipython-input-5-19b23891d7c6>, line 1)

# Nasnet based encoder

In [1]:
from nasnet import nasnetalarge

nasnet = nasnetalarge(num_classes=1000, pretrained='imagenet')


In [5]:
import torch
import torch.nn as nn

class NasnetExtractor(nn.Module):
    def __init__(self,
                 nasnet):
        super(NasnetExtractor, self).__init__()
        
        self.conv0 = nasnet.conv0 
        self.cell_stem_0 = nasnet.cell_stem_0
        self.cell_stem_1 = nasnet.cell_stem_1

        self.cell_0 = nasnet.cell_0
        self.cell_1 = nasnet.cell_1
        self.cell_2 = nasnet.cell_2
        self.cell_3 = nasnet.cell_3
        self.cell_4 = nasnet.cell_4
        self.cell_5 = nasnet.cell_5

        self.reduction_cell_0 = nasnet.reduction_cell_0

        self.cell_6 = nasnet.cell_6
        self.cell_7 = nasnet.cell_7
        self.cell_8 = nasnet.cell_8
        self.cell_9 = nasnet.cell_9
        self.cell_10 = nasnet.cell_10
        self.cell_11 = nasnet.cell_11

        self.reduction_cell_1 = nasnet.reduction_cell_1

        self.cell_12 = nasnet.cell_12
        self.cell_13 = nasnet.cell_13
        self.cell_14 = nasnet.cell_14
        self.cell_15 = nasnet.cell_15
        self.cell_16 = nasnet.cell_16
        self.cell_17 = nasnet.cell_17
        
        self.avgpool1 = nn.AvgPool2d(48, stride=1)
        self.avgpool2 = nn.AvgPool2d(23, stride=1)
        self.avgpool3 = nn.AvgPool2d(11, stride=1)
  

    def forward(self, x):
        
        out = []
        for frame in x:
           
            x_conv0 = self.conv0(frame)
            x_stem_0 = self.cell_stem_0(x_conv0)
            x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)

            x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
            x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
            x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
            x_cell_3 = self.cell_3(x_cell_2, x_cell_1)
            x_cell_4 = self.cell_4(x_cell_3, x_cell_2)
            x_cell_5 = self.cell_5(x_cell_4, x_cell_3)

            x_reduction_cell_0 = self.reduction_cell_0(x_cell_5, x_cell_4)

            x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_4)
            x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
            x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
            x_cell_9 = self.cell_9(x_cell_8, x_cell_7)
            x_cell_10 = self.cell_10(x_cell_9, x_cell_8)
            x_cell_11 = self.cell_11(x_cell_10, x_cell_9)

            x_reduction_cell_1 = self.reduction_cell_1(x_cell_11, x_cell_10)

            x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_10)
            x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
            x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
            x_cell_15 = self.cell_15(x_cell_14, x_cell_13)
            x_cell_16 = self.cell_16(x_cell_15, x_cell_14)
            x_cell_17 = self.cell_17(x_cell_16, x_cell_15)            

            
            feature_vector = torch.cat((self.avgpool1(x_reduction_cell_0)
                                        .view(x_reduction_cell_0.size(0),x_reduction_cell_0.size(1)),
                                        self.avgpool2(x_reduction_cell_1)
                                        .view(x_reduction_cell_1.size(0),x_reduction_cell_1.size(1)),
                                        self.avgpool3(x_cell_17)
                                        .view(x_cell_17.size(0),x_cell_17.size(1)),
                                        ), dim=1)
            out.append(feature_vector)
            
        out = torch.stack ([features for features in out])             
        
        return out

In [None]:
model = NasnetExtractor(nasnet)

In [None]:
input = torch.autograd.Variable(torch.randn(2,2,3,331,331))
output = model(input)
print(output.size())