In [5]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import math
import torch
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim

from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
%matplotlib qt

In [6]:
n_epochs = 5
batch_size_train = 20
batch_size_test = 1000
learning_rate = 0.01
momentum = 0.5
log_interval = 10

random_seed = 1
torch.manual_seed(random_seed)

<torch._C.Generator at 0x23a64e232d0>

In [7]:
train_set = torchvision.datasets.FashionMNIST("./data", download=True, transform=
                                                transforms.Compose([transforms.ToTensor()]))
test_set = torchvision.datasets.FashionMNIST("./data", download=True, train=False, transform=
                                               transforms.Compose([transforms.ToTensor()]))  
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size_train)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size_test)
def output_label(label):
    output_mapping = {
                 0: "T-shirt/Top",
                 1: "Trouser",
                 2: "Pullover",
                 3: "Dress",
                 4: "Coat", 
                 5: "Sandal", 
                 6: "Shirt",
                 7: "Sneaker",
                 8: "Bag",
                 9: "Ankle Boot"
                 }
    input = (label.item() if type(label) == torch.Tensor else label)
    return output_mapping[input]

In [8]:
class SparseLinear(torch.autograd.Function):  
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, info, bias=None):
        # Save inputs in context-object for later use in backwards
        ctx.save_for_backward(input, weight, bias) # these are differentiable
        ctx.info = info                            # non-differentiable argument
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output
    
    @staticmethod
    def backward(ctx, grad_output):
        input, weight, bias = ctx.saved_tensors
        # input   [batchSize, in]
        # output  [batchSize, out]
        # weights [out, in]
        # bias    [out]
        grad_input = grad_weight = grad_bias = None
        
        # Calculate k (different across batches)
        Y = grad_output.abs().sum(1)       # Y[batchSize]
        if (torch.max(Y) > ctx.info.Y_max):     # Check if biggest Y of batch is bigger than recorded Y
            ctx.info.Y_max = torch.max(Y).item()
        bpr = (S_min + Y*(S_max-S_min)/(ctx.info.Y_max))  #TODO add wearoff
    
        K = torch.tensor(torch.round(grad_output.size(1)*bpr))        # K[batchSize]
        K.clamp(1, grad_output.size(1))
        # log in layer
        ctx.info.miniBatchBpr = torch.mean(bpr)
        ctx.info.miniBatchK = torch.mean(K)
        K = K.to(torch.int16)
        
        # create a sparse grad_output tensor. Since k is different across batches, the topK indices
        # must be assembled for each batch separately.
        col = []
        row = []
        val = []
        for batch,k in enumerate(K):
            _, indices = grad_output[batch].abs().topk(k)  # don't use return VALUES since they are abs()!
            col.append(indices)
            val.append(torch.index_select(grad_output[batch], -1, indices)) # select values from grad_output instead
            row += indices.size(0) * [batch]
        col = torch.cat(col).detach()
        row = torch.Tensor(row)
        val = torch.cat(val)
        sparse = torch.sparse_coo_tensor(torch.vstack((row,col)), val, grad_output.size())
        
        # Do the usual bp stuff but use sparse matmul on grad_input and grad_weight
        if ctx.needs_input_grad[0]:
            grad_input = torch.sparse.mm(sparse, weight)
        if ctx.needs_input_grad[1]:
            grad_weight = torch.sparse.mm(sparse.t(), input)  # Gradients are zeroed each batch
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)
        return grad_input, grad_weight, None, grad_bias

In [26]:
class BackwardsInfo:
    def __init__(self):
        self.Y_max = 0
        self.miniBatchBpr = 0
        self.miniBatchK = 0
        
class TinyPropLinear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(TinyPropLinear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        
        # Saving variables like this will pass it by REFERENCE, so changes 
        # made in backwards are reflected in layer
        self.info = BackwardsInfo() 
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            self.register_parameter('bias', None)
        self.initialize_parameters()
            
    def initialize_parameters(self):
        stdv = 1. / math.sqrt(self.output_features)
        self.weight.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        # Here the custom linear function is applied
        return SparseLinear.apply(input, self.weight, self.info, self.bias)

In [56]:
S_max = 1
S_min = 0
zeta = 1

In [57]:
class SparseConv2d(torch.autograd.Function):  
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, kernel_size, out_channels, dilation, padding, stride, info, bias=None):
        # Save inputs in context-object for later use in backwards
        ctx.save_for_backward(input, weight, bias) # these are differentiable
        
        # Save non-differentiable argument(s)
        ctx.info = info 
        #ctx.dilation = dilation
        #ctx.padding = padding
        #ctx.stride = stride
        
        
        # x [batchSize, in_channels, width, height]
        width = ((x.shape[2] + 2*padding[0] - dilation[0]*(kernel_size - 1) - 1) // stride[0]) + 1
        height= ((x.shape[3] + 2*padding[1] - dilation[1]*(kernel_size - 1) - 1) // stride[1]) + 1
     
        windows = F.unfold(x, kernel_size=(kernel_size, kernel_size), padding=padding, dilation=dilation, stride=stride)
        windows = windows.transpose(1, 2).contiguous().view(-1, x.shape[1], kernel_size*kernel_size)
        windows = windows.transpose(0, 1)
        
        print(width)
        print(height)
        print(windows.shape)
        
        output = torch.zeros([x.shape[0]*out_channels, width, height], dtype=torch.float32, device=device)

        # Loop over channels
        for channel in range(x.shape[1]):
            for outChannel in range(out_channels):
                res = torch.matmul(windows[channel], weight[outChannel][channel]) 
                res = res.view(-1, width, height)
                output[outChannel * res.shape[0] : (outChannel + 1) * res.shape[0]] += res
                
        output = output.view(x.shape[0], out_channels, width, height)
       
        #if bias is not None:
            #output += bias.unsqueeze(0).expand_as(output)
        return output
    
    
    @staticmethod
    def backward(ctx, grad_output):
        input, weight, bias = ctx.saved_tensors
        # input   [batchSize, in]
        # output  [batchSize, out]
        # weights [out, in]
        # bias    [out]
        grad_input = grad_weight = grad_bias = None
        
        # Calculate k (different across batches)
        Y = grad_output.abs().sum(1)       # Y[batchSize]
        if (torch.max(Y) > ctx.info.Y_max):     # Check if biggest Y of batch is bigger than recorded Y
            ctx.info.Y_max = torch.max(Y).item()
        bpr = (S_min + Y*(S_max-S_min)/(ctx.info.Y_max))  #TODO add wearoff
    
        K = torch.tensor(torch.round(grad_output.size(1)*bpr))        # K[batchSize]
        K.clamp(1, grad_output.size(1))
        # log in layer
        ctx.info.miniBatchBpr = torch.mean(bpr)
        ctx.info.miniBatchK = torch.mean(K)
        K = K.to(torch.int16)
        
        # create a sparse grad_output tensor. Since k is different across batches, the topK indices
        # must be assembled for each batch separately.
        col = []
        row = []
        val = []
        for batch,k in enumerate(K):
            _, indices = grad_output[batch].abs().topk(k)  # don't use return VALUES since they are abs()!
            col.append(indices)
            val.append(torch.index_select(grad_output[batch], -1, indices)) # select values from grad_output instead
            row += indices.size(0) * [batch]
        col = torch.cat(col).detach()
        row = torch.Tensor(row)
        val = torch.cat(val)
        sparse = torch.sparse_coo_tensor(torch.vstack((row,col)), val, grad_output.size())
        
        # Do the usual bp stuff but use sparse matmul on grad_input and grad_weight
        if ctx.needs_input_grad[0]:
            grad_input = torch.sparse.mm(sparse, weight)
        if ctx.needs_input_grad[1]:
            grad_weight = torch.sparse.mm(sparse.t(), input)  # Gradients are zeroed each batch
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)
        return grad_input, grad_weight, None, None, None, None, None, None, grad_bias

In [58]:
class TinyPropConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, padding=0, stride=1, bias=True):
        super(TinyPropConv2d, self).__init__()

        self.kernel_size = kernel_size
        self.out_channels = out_channels
        self.dilation = (dilation, dilation)
        self.padding = (padding, padding)
        self.stride = (stride, stride)
        self.in_channels = in_channels
        
        # Saving variables like this will pass it by REFERENCE, so changes 
        # made in backwards are reflected in layer
        self.info = BackwardsInfo() 
        self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, kernel_size * kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))

    def forward(self, input):
        # Here the custom conv2d function is applied
        return SparseConv2d.apply(input, self.weight, self.kernel_size, self.out_channels, self.dilation, self.padding, self.stride, self.info, self.bias)

In [61]:
device = 'cpu'
conv = TinyPropConv2d(3, 1, 3)
x = torch.randn(1, 3, 24, 24)
out = conv(x)
#out.mean().backward()
#print(conv.weight.grad)

22
22
torch.Size([3, 484, 9])


In [6]:
S_max = 1
S_min = 0
zeta = 1

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = TinyPropLinear(28*28, 32)
        self.fc2 = TinyPropLinear(32, 10)
    
    def forward(self, x):
        x = x.view(-1, 28*28)
        x = F.relu(self.fc1(x))
        x = F.log_softmax(self.fc2(x), dim=1)
        return x

In [10]:
def train(network, epoch):
    network.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = network(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # log
        fc1_meanBpr[epoch-1] += network.fc1.info.miniBatchBpr
        fc1_meanK[epoch-1] += network.fc1.info.miniBatchK
        fc2_meanBpr[epoch-1] += network.fc2.info.miniBatchBpr
        fc2_meanK[epoch-1] += network.fc2.info.miniBatchK
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()), end ='\r')
            train_losses.append(loss.item())
            train_counter.append((batch_idx*batch_size_train) + ((epoch-1)*len(train_loader.dataset)))
    print('Train Epoch: {} completed          '.format(epoch))
    
    fc1_meanBpr[epoch-1] /= len(train_loader.dataset)//batch_size_train
    fc1_meanK[epoch-1]   /= len(train_loader.dataset)//batch_size_train
    fc2_meanBpr[epoch-1] /= len(train_loader.dataset)//batch_size_train
    fc2_meanK[epoch-1]   /= len(train_loader.dataset)//batch_size_train
    
    
def test(network):
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = network(data)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)
    print('Test set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))

In [11]:
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]

fc1_meanBpr = torch.zeros([n_epochs])
fc1_meanK = torch.zeros([n_epochs])
fc2_meanBpr = torch.zeros([n_epochs])
fc2_meanK = torch.zeros([n_epochs])

#fc1_weights = torch.zeros([n_epochs, 784*32])
#fc1_loc_err = torch.zeros([n_epochs, 32])
#fc1_Y       = torch.zeros([n_epochs])
#fc2_weights = torch.zeros([n_epochs,  32*10])
#fc2_loc_err = torch.zeros([n_epochs, 10])

In [12]:
network = Net()
optimizer = optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)

test(network)
for epoch in range(1, n_epochs + 1):
    train(network, epoch)
    test(network)

Test set: Avg. loss: 2.5484, Accuracy: 730/10000 (7%)




Train Epoch: 1 completed          ]	Loss: 2.028627
Test set: Avg. loss: 1.5996, Accuracy: 6550/10000 (66%)

Train Epoch: 2 completed          ]	Loss: 1.063643
Test set: Avg. loss: 1.5513, Accuracy: 7175/10000 (72%)

Train Epoch: 3 completed          ]	Loss: 1.171235
Test set: Avg. loss: 1.6653, Accuracy: 7532/10000 (75%)

Train Epoch: 4 completed          ]	Loss: 1.314612
Test set: Avg. loss: 2.4600, Accuracy: 7026/10000 (70%)

Train Epoch: 5 completed          ]	Loss: 0.604967
Test set: Avg. loss: 2.0251, Accuracy: 7408/10000 (74%)



In [15]:
print(fc1_meanBpr)
print(fc2_meanBpr)

tensor([0.2403, 0.1675, 0.1461, 0.1276, 0.1191])
tensor([0.3646, 0.2760, 0.2558, 0.2404, 0.2309])


In [24]:
fig = plt.figure()
plt.plot(train_counter, train_losses, color='blue')
plt.scatter(test_counter, test_losses, color='red')
plt.legend(['Train Loss', 'Test Loss'], loc='upper right')
plt.xlabel('number of training examples seen')
plt.ylabel('negative log likelihood loss')

Text(0, 0.5, 'negative log likelihood loss')

In [51]:
def plotGradients(gradients, end, num_bins, num_samples):
    bins = np.linspace(0, end, num_bins+1)
    X, Y = np.meshgrid(bins[:num_bins], np.arange(0, num_samples))
    Z = np.empty([num_samples, num_bins])

    for i,sample in enumerate(gradients):
        hist, edges = np.histogram(sample, bins=bins)
        #print('hist')
        #print(sample)
        #print(hist)
        #print(edges)
        Z[i] = hist

    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X, Y, Z, cmap='viridis', edgecolor='none')
    ax.set_title('Surface plot')
    plt.show()

In [59]:
plotGradients(fc1_loc_err, 5, 25, n_epochs)