# Imports

In [29]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import os 
import pandas as pd
from torchvision.datasets.folder import default_loader
from torchvision.datasets.utils import download_url
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from collections import OrderedDict
from torchvision.models.resnet import resnet18 as raw_resnet18
# from dynamic_models.dy_resnet import resnet18 as dy_resnet18

from datetime import datetime

In [30]:
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
           'wide_resnet50_2', 'wide_resnet101_2']

# Device

In [31]:
torch.cuda.is_available()

True

In [32]:
device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')

In [33]:
torch.cuda.device_count()

5

# Dynamic Convolution

In [34]:
class attention1d(nn.Module):
    def __init__(self,in_planes,ratios,K,temperature,init_weight = True):
        super(attention1d,self).__init__()
        assert temperature % 3 == 1 # for reducing τ temperature from 30 to 1 linearly in the first 10 epochs.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        
        if in_planes != 3:
            hidden_planes = int(in_planes * ratios) + 1
        else:
            hidden_planes = K
        
        self.fc1   = nn.Conv1d(in_planes,hidden_planes,1,bias = False)
        self.relu  = nn.ReLU()
        self.fc2   = nn.Conv1d(hidden_planes,K,1,bias = True)
        self.temperature = temperature
        
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m,nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias,0)
            
            if isinstance(m,nn.BatchNorm2d):
                nn.init.constant_(m.weight,1)
                nn.init.constant_(m.bias,0)
    
    def update__temperature(self):
        if self.temperature != 1:
            self.temperature -= 3
    
    def forward(self,z):
        z = self.avgpool(z)
        z = self.fc1(z)
        z = self.relu(z)
        z = self.fc2(z)
        z = z.view(z.size(0),-1)   
        return F.softmax(z/self.temperature,1) 
    
class Dynamic_conv1d(nn.Module):
    def __init__(self,in_planes,out_planes,kernel_size,ratio = 0.25,stride = 1,padding = 0,dilation = 1,groups = 1,bias = True,K = 4,temperature = 34,init_weight = True):
        super(Dynamic_conv1d,self).__init__()
        
        if in_planes%groups != 0:
            raise ValueError('Error : in_planes%groups != 0')
        self.in_planes    = in_planes
        self.out_planes   = out_planes
        self.kernel_size  = kernel_size
        self.stride       = stride
        self.padding      = padding
        self.dilation     = dilation
        self.groups       = groups
        self.bias         = bias
        self.K            = K
        self.attention    = attention1d(in_planes,ratio,K,temperature)
        self.weight       = nn.Parameter(torch.randn(K,out_planes,in_planes//groups,kernel_size),requires_grad = True)
        
        if bias :
            self.bias = nn.Parameter(torch.Tensor(K,out_planes))
        else:
            self.bias = None
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for i in range(self.K):
            nn.init.kaiming_uniform_(self.weight[i])
    def update_temperature(self):
        self.attention.update__temperature()
    
    def forward(self,z):
        
#         Regard batch as a dimensional variable, perform group convolution,
#         because the weight of group convolution is different, 
#         and the weight of dynamic convolution is also different
        softmax_attention = self.attention(z)
        batch_size ,in_planes,height = z.size()
        z = z.view(1,-1,height,) # changing into dimension for group convolution
        weight = self.weight.view(self.K,-1)
        
#         The generation of the weight of dynamic convolution,
#         which generates batch_size convolution parameters 
#         (each parameter is different) 
        aggregate_weight = torch.mm(softmax_attention,self.bias).view(-1,self.in_planes,self.kernel_size,)# expects two matrices (2D tensors)
        if self.bias is not None:
            aggregate_bias = torch.mm(softmax_attention,self.bias).view(-1)
            output = F.conv1d(x,weight = aggregate_weight,bias = aggregate_bias,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        else:
            output = F.conv1d(x,weight = aggregate_weight,bias = None,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        output = output.view(batch_size, self.out_planes, output.size(-1))
        return output

In [35]:
class attention2d(nn.Module):
    def __init__(self,in_planes,ratios,K,temperature,init_weight = True):
        super(attention2d,self).__init__()
        assert temperature % 3 == 1 # for reducing τ temperature from 30 to 1 linearly in the first 10 epochs.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        
        if in_planes != 3:
            hidden_planes = int(in_planes * ratios) + 1
        else:
            hidden_planes = K
        
        self.fc1   = nn.Conv2d(in_planes,hidden_planes,1,bias = False)
        self.relu  = nn.ReLU()
        self.fc2   = nn.Conv2d(hidden_planes,K,1,bias = True)
        self.temperature = temperature
        
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m,nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias,0)
            
            if isinstance(m,nn.BatchNorm2d):
                nn.init.constant_(m.weight,1)
                nn.init.constant_(m.bias,0)
    
    def update__temperature(self):
        if self.temperature != 1:
            self.temperature -= 3
    
    def forward(self,z):
        z = self.avgpool(z)
        z = self.fc1(z)
        z = self.relu(z)
        z = self.fc2(z)
        z = z.view(z.size(0),-1)   
        return F.softmax(z/self.temperature,1) 
    
class Dynamic_conv2d(nn.Module):
    def __init__(self,in_planes,out_planes,kernel_size,ratio = 0.25,stride = 1,padding = 0,dilation = 1,groups = 1,bias = True,K = 4,temperature = 34,init_weight = True):
        super(Dynamic_conv2d,self).__init__()
        
        if in_planes%groups != 0:
            raise ValueError('Error : in_planes%groups != 0')
        self.in_planes    = in_planes
        self.out_planes   = out_planes
        self.kernel_size  = kernel_size
        self.stride       = stride
        self.padding      = padding
        self.dilation     = dilation
        self.groups       = groups
        self.bias         = bias
        self.K            = K
        self.attention    = attention2d(in_planes,ratio,K,temperature)
        self.weight       = nn.Parameter(torch.randn(K,out_planes,in_planes//groups,kernel_size,kernel_size),requires_grad = True)
        
        if bias :
            self.bias = nn.Parameter(torch.Tensor(K,out_planes))
        else:
            self.bias = None
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for i in range(self.K):
            nn.init.kaiming_uniform_(self.weight[i])
    def update_temperature(self):
        self.attention.update__temperature()
    
    def forward(self,z):
        
#         Regard batch as a dimensional variable, perform group convolution,
#         because the weight of group convolution is different, 
#         and the weight of dynamic convolution is also different
        softmax_attention = self.attention(z)
        batch_size ,in_planes,height,width = z.size()
        z = z.view(1,-1,height,width) # changing into dimension for group convolution
        weight = self.weight.view(self.K,-1)
        
#         The generation of the weight of dynamic convolution,
#         which generates batch_size convolution parameters 
#         (each parameter is different) 
        aggregate_weight = torch.mm(softmax_attention,self.bias).view(-1,self.in_planes,self.kernel_size,)# expects two matrices (2D tensors)
        if self.bias is not None:
            aggregate_bias = torch.mm(softmax_attention,self.bias).view(-1)
            output = F.conv2d(x,weight = aggregate_weight,bias = aggregate_bias,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        else:
            output = F.conv2d(x,weight = aggregate_weight,bias = None,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        output = output.view(batch_size, self.out_planes, output.size(-2),output.size(-1))
        return output        

In [36]:
class attention3d(nn.Module):
    def __init__(self,in_planes,ratios,K,temperature,init_weight = True):
        super(attention3d,self).__init__()
        assert temperature % 3 == 1 # for reducing τ temperature from 30 to 1 linearly in the first 10 epochs.
        self.avgpool = nn.AdaptiveAvgPool3d(1)
        
        if in_planes != 3:
            hidden_planes = int(in_planes * ratios) + 1
        else:
            hidden_planes = K
        
        self.fc1   = nn.Conv3d(in_planes,hidden_planes,1,bias = False)
        self.relu  = nn.ReLU()
        self.fc2   = nn.Conv3d(hidden_planes,K,1,bias = False)
        self.temperature = temperature
        
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m,nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias,0)
            
            if isinstance(m,nn.BatchNorm2d):
                nn.init.constant_(m.weight,1)
                nn.init.constant_(m.bias,0)
    
    def update__temperature(self):
        if self.temperature != 1:
            self.temperature -= 3
    
    def forward(self,z):
        z = self.avgpool(z)
        z = self.fc1(z)
        z = self.relu(z)
        z = self.fc2(z)
        z = z.view(z.size(0),-1)   
        return F.softmax(z/self.temperature,1) 
    
class Dynamic_conv3d(nn.Module):
    def __init__(self,in_planes,out_planes,kernel_size,ratio = 0.25,stride = 1,padding = 0,dilation = 1,groups = 1,bias = True,K = 4,temperature = 34,init_weight = True):
        super(Dynamic_conv2d,self).__init__()
        
        if in_planes%groups != 0:
            raise ValueError('Error : in_planes%groups != 0')
        self.in_planes    = in_planes
        self.out_planes   = out_planes
        self.kernel_size  = kernel_size
        self.stride       = stride
        self.padding      = padding
        self.dilation     = dilation
        self.groups       = groups
        self.bias         = bias
        self.K            = K
        self.attention    = attention3d(in_planes,ratio,K,temperature)
        self.weight       = nn.Parameter(torch.randn(K,out_planes,in_planes//groups,kernel_size,kernel_size),requires_grad = True)
        
        if bias :
            self.bias = nn.Parameter(torch.Tensor(K,out_planes))
        else:
            self.bias = None
        if init_weight:
            self._initialize_weights()
    
    def _initialize_weights(self):
        for i in range(self.K):
            nn.init.kaiming_uniform_(self.weight[i])
    def update_temperature(self):
        self.attention.update__temperature()
    
    def forward(self,z):
        
#         Regard batch as a dimensional variable, perform group convolution,
#         because the weight of group convolution is different, 
#         and the weight of dynamic convolution is also different
        softmax_attention = self.attention(z)
        batch_size ,in_planes,height,width = z.size()
        z = z.view(1,-1,height,width) # changing into dimension for group convolution
        weight = self.weight.view(self.K,-1)
        
#         The generation of the weight of dynamic convolution,
#         which generates batch_size convolution parameters 
#         (each parameter is different) 
        aggregate_weight = torch.mm(softmax_attention,self.bias).view(-1,self.in_planes,self.kernel_size,self.kernel_size,self.kernel_size)# expects two matrices (2D tensors)
        if self.bias is not None:
            aggregate_bias = torch.mm(softmax_attention,self.bias).view(-1)
            output = F.conv3d(x,weight = aggregate_weight,bias = aggregate_bias,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        else:
            output = F.conv3d(x,weight = aggregate_weight,bias = None,stride = self.stride,padding = self.padding,
                             dilation=self.dilation, groups=self.groups * batch_size)
        output = output.view(batch_size, self.out_planes,output.size(-3),output.size(-2),output.size(-1))
        return output        

In [37]:
torch.Tensor(1,1)

tensor([[-5.8743e+31]])

# dy_resnet18

##### conv1x1 - dynamic convolution

In [38]:
def conv1x1(in_planes,out_planes,stride = 1): 
    return Dynamic_conv2d(in_planes,out_planes,kernel_size = 1,stride = stride,bias = False)

##### conv3x3 - dynamic convolution

In [39]:
def conv3x3(in_planes,out_planes,stride = 1,groups = 1,dilation = 1): # conv3x3 for dynamic convolution
    return Dynamic_conv2d(in_planes,out_planes,kernel_size = 3,stride = stride,padding = dilation,groups = groups,bias = False,dilation = dilation)

##### BasicBlock

In [40]:
class BasicBlock(nn.Module): # expansion = 1, dilation = 1 , base_width = 64 ,groups = 1
    expansion = 1
    
    def __init__(self,in_planes,out_planes,stride = 1,downsample = None,
                 groups = 1,base_width = 64,dilation = 1,norm_layer = None):
        
        super(BasicBlock,self).__init__()
        
        if base_width != 64:
            raise ValueError('BasicBlock supports only base_width = 64')
        if groups != 1: 
            raise ValueError('BasicBlock supports only groups = 1')
        if dilation > 1:
            raise NotImplementedError('BasicBlock doesnot support dilation > 1')
        # self.conv1 and self.downsample layers downsample the input when stride != 1
        
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        
        self.conv1 = conv3x3(in_planes,out_planes,stride)
        self.bn1   = norm_layer(out_planes)
        self.relu  = nn.ReLU() # modify input directly.
        self.conv2 = conv3x3(out_planes,out_planes)
        self.bn2   = norm_layer(out_planes)
        self.downsample = downsample
        self.stride = stride
    
    def forward(self , z):
        
        identity = z
        
        out = self.conv1(z)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            identity = self.downsample(z)
        
        out = out + identity
        out = self.relu(out)
        
        return out
        
        
    

##### Bottleneck

In [41]:
class Bottleneck(nn.Module):
    
    expansion = 4
    
    def __init__(self,in_planes,out_planes,stride = 1,downsample = None,
                groups = 1,base_width = 64,dilation = 1,norm_layer = None):
        super(Bottleneck,self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        
        width = int(out_planes *(base_width/64.)) * groups  ## ?? 
        
        self.conv1  = conv1x1(in_planes,width)
        self.bn1    = norm_layer(width)
        self.conv2  = conv3x3(width,width,stride,groups,dilation)
        self.bn2    = norm_layer(width)
        self.conv3  = conv1x1(width,out_planes * self.expansion)
        self.bn3    = norm_layer(out_planes * self.expansion)
        self.relu   = nn.ReLU()
        self.downsample = downsample
        self.stride = stride
    
    def forward(self,z):
        identity    = z
        
        out         = self.conv1(z)
        out         = self.bn1(out)
        out         = self.relu(out)
        
        out         = self.conv2(out)
        out         = self.bn2(out)
        out         = self.relu(out)
        
        out         = self.conv3(out)
        out         = self.bn3(out)
        
        if self.downsample is not None:
            identity = self.downsample(z)
        
        out = out + identity
        out = self.relu(out)
        
        return out
        
        
        
        

#####  Resnet class

In [42]:
class ResNet(nn.Module):
    
    def __init__(self,block,layers,num_classes = 1000,zero_init_residual = False,
                groups = 1,width_per_group = 64,replace_stride_with_dilation = None,
                norm_layer = None):
        
        super(ResNet,self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
        
        self.in_planes = 64
        self.dilation = 1
        
        if replace_stride_with_dilation is None:
            # Each element in the tuple indicates if we should replace 
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False,False,False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("Invalid argument : Size error for replace_stride_with_dilation")
        
        self.groups        = groups
        self.base_width    = width_per_group
        
        self.conv1         = nn.Conv2d(3,self.in_planes,kernel_size = 7,stride = 2,padding = 3,
                                      bias = False)
        self.bn1           = norm_layer(self.in_planes)
        self.relu          = nn.ReLU()
        self.maxpool       = nn.MaxPool2d(kernel_size = 3,stride = 2,padding = 1)
        self.layer1        = self._make_layer(block,64,layers[0])
        self.layer2        = self._make_layer(block,128,layers[1],stride = 2,
                                             dilate = replace_stride_with_dilation[0])
        self.layer3        = self._make_layer(block,256,layers[2],stride = 2,
                                             dilate = replace_stride_with_dilation[1])
        self.layer4        = self._make_layer(block,512,layers[3],stride = 2,
                                              dilate = replace_stride_with_dilation[2])
        self.avgpool       = nn.AdaptiveAvgPool2d((1,1))
        self.fc            = nn.Linear(512 * block.expansion,num_classes)
        
        for m in self.modules():
            if isinstance(m,nn.Conv2d):
                nn.init.kaiming_normal_(m.weight,mode = 'fan_out',nonlinearity = 'relu')
            elif isinstance(m,(nn.BatchNorm2d,nn.GroupNorm)):
                nn.init.constant_(m.weight,1)
                nn.init.constant_(m.bias,0)
                
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m,Bottleneck):
                    nn.init.constant_(m.bn3.weight,0)
                elif isinstance(m,BasicBlock):
                    nn.init.constant_(m.bn2.weight,0)
        
    def update_temperature(self):
        for m in self.modules():
            if isinstance(m,Dynamic_conv2d):
                m.update_temperature()              ### ???
    
    def _make_layer(self,block,out_planes,blocks,stride = 1,dilate = False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.in_planes != out_planes * block.expansion:
            downsample = nn.Sequential(
                            conv1x1(self.in_planes,out_planes * block.expansion,stride),
                            norm_layer(out_planes * block.expansion),
                            )
        layers = []
        layers.append(block(self.in_planes,out_planes,stride,downsample,self.groups,
                           self.base_width,previous_dilation,norm_layer))
        self.in_planes = out_planes * block.expansion
        for _ in range(1,blocks):
            layers.append(block(self.in_planes,out_planes,groups = self.groups,
                               base_width = self.base_width,dilation = self.dilation,
                               norm_layer = norm_layer))
        return nn.Sequential(*layers)
    
    def _forward_impl(self,z):
        print('1')
        z = self.conv1(z)
        z = self.bn1(z)
        z = self.relu(z)
        z = self.maxpool(z)
        print('2')
        z = self.layer1(z)
        z = self.layer2(z)
        z = self.layer3(z)
        z = self.layer4(z)
        
        z = self.avgpool(z)
        z = torch.flatten(z,1)
        z = self.fc(z)
        print('3')
        
        return z
    


In [43]:
def _resnet(arch,block,layers,pretrained,progress, **kwargs):
    model = ResNet(block,layers, **kwargs)
    print("Hi")
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch],progress = progress)
        model.load_state_dict(state_dict)       
    return model
def resnet18(pretrained = False,progress = True,**kwargs):
    print("hey")
    return _resnet('resnet18',BasicBlock,[2,2,2,2],pretrained,progress,**kwargs)


In [44]:
# net = resnet18(num_classes = classes)

# Model

In [45]:
l_rate    = 0.1
momentum_ = 0.9
w_decay   = 1e-4
epochs    = 160

In [46]:
classes = 10 
trainset_cifar10 = torchvision.datasets.CIFAR10(root = '/home/varshittha/dynamic-convolution',train=True,
                                                download=True,
                                                transform=transforms.Compose([
                                                transforms.Pad(4),
                                                transforms.RandomCrop(32),
                                                transforms.RandomHorizontalFlip(),
                                                transforms.ToTensor(),
                                                transforms.Normalize((0.4914, 0.4822, 0.4465),(0.2023, 0.1994, 0.2010))
                                            ]))
trainloader_cifar10 = torch.utils.data.DataLoader(trainset_cifar10, batch_size=128, shuffle=True, num_workers=0)

testset_cifar10 = torchvision.datasets.CIFAR10(root='/home/varshittha/dynamic-convolution', train=False, download=True,
                                           transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                                           ]))
testloader_cifar10 = torch.utils.data.DataLoader(testset_cifar10, batch_size=20, shuffle=False, num_workers=0)


Files already downloaded and verified
Files already downloaded and verified


In [47]:
classes

10

In [48]:
net = resnet18(num_classes = classes)

hey
Hi


In [49]:
# net.to(device)

In [50]:
optimizer = optim.SGD(net.parameters(),lr = l_rate,momentum = momentum_,weight_decay = w_decay)

In [51]:
def update_lr(optimizer,epoch):
    if epoch in [epochs * 0.5,epochs * 0.75,epochs * 0.85]:
        for p in optimizer.param_groups:
            p['lr'] *= 0.1
            l_rate   = p['lr']
            

In [52]:
def train(epoch):
    print('xxx')
    net.train()
    print('yyy')
    avg_loss = 0.
    train_acc = 0.
    update_lr(optimizer, epoch)
    print(l_rate)
    for batch_idx, (data, target) in enumerate(trainloader_cifar10):

        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = net(data)
        output.to(device)
        loss = F.cross_entropy(output, target)
        avg_loss += loss.item()
        pred = output.data.max(1, keepdim=True)[1]
        train_acc += pred.eq(target.data.view_as(pred)).cpu().sum()
        loss.backward()

        optimizer.step()
    print('Train Epoch: {}, loss{:.6f}, acc{}'.format(epoch, loss.item(), train_acc/len(trainloader_cifar10 .dataset)), end='')
    
    net.update_temperature()
    
    

In [53]:
device

device(type='cpu')

In [54]:
def val(epoch):
    net.eval()
    test_loss = 0.
    correct=0.
    with torch.no_grad():
        for data, label in testloader_cifar10:
            data, label = data.to(device), label.to(device)
            print(data.size())
            output = net(data)
            output.to(device)
            test_loss += F.cross_entropy(output, label, size_average=False).item()
            pred =  output.data.max(1, keepdim=True)[1]
            correct += pred.eq(label.data.view_as(pred)).cpu().sum()
    test_loss/=len(testloader_cifar10.dataset)
    correct = int(correct)
    print('Test set:average loss: {:.4f}, accuracy{}'.format(test_loss, 100.*correct/len(testloader_cifar10.dataset)))
    return correct/len(testloader_cifar10.dataset)

In [55]:
best_val_acc=0.
for i in range(epochs):
    print(i)
    train(i+1)
    print('I came')
    temp_acc = val(i+1)
    if temp_acc>best_val_acc:
        best_val_acc = temp_acc
print('Best acc{}'.format(best_val_acc))

0
xxx
yyy
0.1


NotImplementedError: 