In [1]:
import tensor_comprehensions as tc
import torch
import torch.nn as nn
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

import os
import sys
import logging
import time

In [None]:
# Autotuner settings recommended by the authors for better performance.
# NOTE(review): the following cell rebinds `tune_settings`, so these values
# only take effect when that cell is skipped.
tune_settings = dict(
    threads=32,
    generations=8,
    pop_size=20,
    number_elites=3,
)

In [2]:
# Override with a much larger optimization budget: more generations and a
# bigger population than the recommended settings; threads and elites unchanged.
tune_settings = dict(
    threads=32,
    generations=20,
    pop_size=50,
    number_elites=3,
)

In [3]:
# Fetch the 'group_convolution' entry from tc.database and keep its 'lang'
# field, which is what gets passed to tc.define().
group_conv_entry = tc.database['group_convolution']
gconv_lang = group_conv_entry['lang']

In [None]:
# Problem sizes given in the paper:
# N = 32, G = 32, F = 16, C = 16, W = 14, H = 14
# NOTE(review): the axis order used below (I: N,G,C,H,W; W1: G,F,C,3,3;
# B: G,F) is inferred from that list — confirm against the
# group_convolution definition.
batch, groups, out_ch, in_ch = 32, 32, 16, 16
height, width, kernel = 14, 14, 3

# Random inputs are drawn on the host and then copied to the GPU.
I = torch.randn(batch, groups, in_ch, height, width).cuda()
W1 = torch.randn(groups, out_ch, in_ch, kernel, kernel).cuda()
B = torch.randn(groups, out_ch).cuda()

In [5]:
# Build the callable TC unit from the definition held in `gconv_lang`.
kernel_name = 'group_convolution'
gconv = tc.define(gconv_lang, name=kernel_name)

In [6]:
# Autotune the kernel against the current I, W1 and B tensors using the
# settings in `tune_settings`; results are cached under cache/gconvslow128.tc.
# NOTE(review): tc.Options('conv') is presumably the convolution preset used
# as the tuner's starting point — confirm against the TC docs.
conv_options = tc.Options('conv')
# Kept as the final bare expression so the notebook still displays the
# value returned by autotune().
gconv.autotune(I, W1, B,
               cache='cache/gconvslow128.tc',
               options=conv_options,
               **tune_settings)

[INFO]: Autotuning cache will be saved to: cache/gconvslow128.tc.cuda/options


<tensor_comprehensions.mapping_options.Options at 0x7fbb50bac928>

In [None]:
# Autotune from size tuples instead of live tensors and cache the result in
# cache/gconv2.tc.
# NOTE(review): presumably autotune() accepts shape tuples in place of
# tensors — confirm against the TC docs. These shapes also differ from the
# (128, 16, 16, 10, 10) tensors that are later run with this same cache file.
input_shape = (32, 32, 4, 56, 56)
weight_shape = (32, 4, 4, 3, 3)
bias_shape = (32, 4)
gconv.autotune(input_shape, weight_shape, bias_shape,
               cache='cache/gconv2.tc',
               options=tc.Options('conv'),
               **tune_settings)

In [11]:
# Rebind I, W1 and B with the dimensions used for the benchmark runs.
# NOTE(review): axis meaning (I: N,G,C,H,W; W1: G,F,C,3,3; B: G,F) is
# assumed — confirm against the group_convolution definition.
batch, groups, out_ch, in_ch = 128, 16, 16, 16
height, width, kernel = 10, 10, 3

I = torch.randn(batch, groups, in_ch, height, width).cuda()
W1 = torch.randn(groups, out_ch, in_ch, kernel, kernel).cuda()
B = torch.randn(groups, out_ch).cuda()

In [None]:
# Two warm-up calls ("prep cuda") ahead of the timed benchmark, each followed
# by a synchronize so the call has fully finished before the next one starts.
# NOTE(review): cache/gconv2.tc is the file written by the size-tuple
# autotune call, whose shapes differ from the current I/W1/B — confirm the
# cache actually holds options for these shapes.
for warmup_idx in range(2):
    out = gconv(I, W1, B, cache='cache/gconv2.tc')
    torch.cuda.synchronize()
print(out.size())


In [12]:
# Benchmark the TC kernel: 1000 individually timed forward calls.
torch.cuda.synchronize()  # wait for outstanding GPU work before sampling

n_runs = 1000
timings = np.zeros(n_runs)

for run_idx in range(n_runs):
    tic = time.perf_counter()
    output = gconv(I, W1, B, cache='cache/gconv2.tc')
    # Block until the GPU is done so the sample covers the whole call.
    torch.cuda.synchronize()
    toc = time.perf_counter()
    timings[run_idx] = toc - tic

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum, median and 90th-percentile per-call wall time (seconds).
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)


total time: 1.09498435501
[ 0.00106181  0.00108257  0.00112031]


In [None]:
# test the torch equivalent

class ConvNet(nn.Module):
    """PyTorch baseline: a single grouped 3x3 2-D convolution.

    The convolution maps 256 input channels to 256 output channels split
    into 16 groups, and is wrapped in an ``nn.Sequential`` stored as
    ``self.layer``.
    """

    def __init__(self):
        super().__init__()
        grouped_conv = nn.Conv2d(in_channels=256, out_channels=256,
                                 kernel_size=3, groups=16)
        self.layer = nn.Sequential(grouped_conv)

    def forward(self, x):
        """Run ``x`` through ``self.layer`` and return the result."""
        return self.layer(x)

# Instantiate the PyTorch baseline and move its parameters to the GPU.
CN = ConvNet().cuda()

# Random GPU input for the baseline.
# NOTE(review): 256 channels presumably corresponds to the 16 groups x 16
# channels of the TC input flattened into one axis — confirm.
baseline_shape = (128, 256, 10, 10)
var_input = Variable(torch.randn(*baseline_shape).cuda())

# Run a couple of times to work out start-up hiccups before timing.
for warmup_idx in range(2):
    CN(var_input)
    torch.cuda.synchronize()

# Timed runs: 1000 individually measured forward passes.
n_runs = 1000
timings = np.zeros(n_runs)

for run_idx in range(n_runs):
    tic = time.perf_counter()
    output = CN(var_input)
    # Block until the GPU is done so the sample covers the whole call.
    torch.cuda.synchronize()
    toc = time.perf_counter()
    timings[run_idx] = toc - tic

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum, median and 90th-percentile per-call wall time (seconds).
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)

In [None]:
# Print the text summary of the PyTorch baseline module.
model_summary = str(CN)
print(model_summary)

In [None]:
# Print the size of the most recently computed benchmark output.
output_size = output.size()
print(output_size)