In [1]:
import tensor_comprehensions as tc
import torch
import torch.nn as nn
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

import os
import sys
import logging
import time

In [2]:
# Autotuner settings forwarded as keyword arguments to `autotune`; the values
# are the ones recommended by the TC authors for better performance.
tune_settings = dict(
    threads=32,
    generations=8,
    pop_size=25,
    number_elites=3,
)

In [31]:
# TC source text ("lang") of the 'small_mobilenet' entry in tc.database.
mobilenet_lang = tc.database["small_mobilenet"]["lang"]

In [32]:
# Random CUDA tensors with the dimensions typical of mobilenet on a single
# input (no batch dimension). They are created in the order I, W1, B1, W2, B2.
I, W1, B1, W2, B2 = (
    torch.randn(*dims).cuda()
    for dims in (
        (128, 56, 56),  # I
        (128, 3, 3),    # W1
        (128,),         # B1
        (128, 128),     # W2
        (128,),         # B2
    )
)


In [None]:

# The tuned options are saved under the relative 'cache/' directory. Create it
# up front so a fresh checkout does not depend on the directory already
# existing (TC is not assumed to create it).
os.makedirs('cache', exist_ok=True)

# Compile the TC definition and autotune it for the input sizes defined above.
# The cache file name encodes those sizes (128 channels, 56x56 input, 3x3
# filter, 128 output channels). Autotuning starts from the 'conv' options.
small_mobilenet = tc.define(mobilenet_lang, name='small_mobilenet')
small_mobilenet.autotune(I, W1, B1, W2, B2,
                         cache='cache/small_mobilenet_128_56_3_128',
                         **tune_settings,
                         options=tc.Options('conv'))

In [34]:
# Warm-up: (re)define the TC function, then call it twice with a device sync
# after each call so CUDA is primed before anything is timed.
# NOTE(review): presumably the tuned options are picked up from the cache file
# written by the autotune cell -- confirm that file exists before running this.
small_mobilenet = tc.define(mobilenet_lang, name='small_mobilenet')
for _warmup_round in range(2):
    small_mobilenet(I, W1, B1, W2, B2, cache='cache/small_mobilenet_128_56_3_128')
    torch.cuda.synchronize()

In [36]:
# Benchmark the TC kernel: 500 timed calls that reuse two preallocated output
# buffers (`temp` for the first output, `output` for the second).
# NOTE(review): if 'small_mobilenet' is an unpadded 3x3 convolution (as the
# batch variant defined later in this file is), TC's range inference would give
# 54x54 outputs rather than 56x56 -- confirm these buffer shapes.
temp, output = (torch.zeros(128, 56, 56).cuda() for _ in range(2))
torch.cuda.synchronize()

timings = np.zeros(500)

for i in range(timings.size):
    start = time.perf_counter()
    small_mobilenet(I, W1, B1, W2, B2,
                    outputs=[temp, output],
                    cache='cache/small_mobilenet_128_56_3_128')
    # wait for the GPU so the measured interval covers the whole call
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# minimum, median and 90th percentile of the per-call time (seconds)
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)

total time: 0.458713924007
[ 0.00085937  0.0008865   0.00097795]


In [37]:
# test the torch equivalent

class MobileNet(nn.Module):
    """PyTorch reference for one depthwise-separable convolution block.

    The block is a 3x3 depthwise convolution (one filter per input channel)
    followed by ReLU, then a 1x1 pointwise convolution followed by ReLU.

    Args:
        in_channels: number of input channels; also the width of the
            depthwise stage. Defaults to 128 (the size benchmarked here).
        out_channels: number of channels produced by the 1x1 convolution.
            Defaults to 128.
        padding: zero-padding of the 3x3 convolution. The default of 1 keeps
            the spatial size unchanged; 0 gives an unpadded convolution whose
            output is 2 smaller in height and width.
    """

    def __init__(self, in_channels=128, out_channels=128, padding=1):
        super(MobileNet, self).__init__()
        self.layer = nn.Sequential(
            # groups == in_channels makes this convolution depthwise
            nn.Conv2d(in_channels, in_channels, 3, padding=padding, groups=in_channels),
            nn.ReLU(inplace=True),
            # 1x1 convolution mixes channels: in_channels -> out_channels
            nn.Conv2d(in_channels, out_channels, 1, padding=0),
            nn.ReLU(inplace=True))

    def forward(self, x):
        """Apply the block to ``x`` of shape (N, in_channels, H, W)."""
        return self.layer(x)

# Build the PyTorch reference module and move its parameters to the GPU.
MN = MobileNet()
MN.cuda()

# A batch containing one random 128 x 56 x 56 input.
# NOTE(review): the reference pads its 3x3 convolution (padding=1), so it
# produces 56x56 outputs -- confirm the TC kernel timed earlier computes the
# same output extent before comparing the two timings.
var_input = Variable(torch.randn(1, 128, 56, 56).cuda())

# run a couple times to work out start hiccups
for _warmup_round in range(2):
    MN(var_input)
    torch.cuda.synchronize()


# test performance
# (the forward pass returns a fresh tensor, so no output buffer is preallocated)
timings = np.zeros(500)

for i in range(timings.size):
    start = time.perf_counter()
    output = MN(var_input)
    # wait for the GPU so the measured interval covers the whole forward pass
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = np.sum(timings)
print('total time: ' + str(total_elapsed))

# minimum, median and 90th percentile of the per-call time (seconds)
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)

total time: 0.0718612639885
[ 0.00013221  0.00013758  0.00015918]


In [2]:
# try with a larger batch size
#
# TC definition of the same block with a leading batch dimension `b`:
#   * O1: per-channel (depthwise) KH1 x KW1 convolution of I with W1, plus
#     bias B1, then fmax(., 0) (ReLU).
#   * O2: channel-mixing product of O1 with W2 (a 1x1 convolution), plus bias
#     B2, then fmax(., 0).
# `+=!` is TC's sum reduction that initialises the output before accumulating.
# NOTE(review): the input is indexed as I(b, c1, h + kh, w + kw) with no
# padding, so the outputs are presumably (H - KH1 + 1) x (W - KW1 + 1) under
# TC's range inference -- confirm against the TC documentation.
batch_mobilenet_lang = """
def batch_mobilenet(float(B, C1, H, W) I, float(C1, KH1, KW1) W1, float(C1) B1, float(C2, C1) W2, float(C2) B2)
-> (O1, O2) {
    O1(b, c1, h, w) +=! I(b, c1, h + kh, w + kw) * W1(c1, kh, kw)
    O1(b, c1, h, w)  = O1(b, c1, h, w) + B1(c1)
    O1(b, c1, h, w)  = fmax(O1(b, c1, h, w), 0)

    O2(b, c2, h, w) +=! O1(b, c1, h, w) * W2(c2, c1)
    O2(b, c2, h, w)  = O2(b, c2, h, w) + B2(c2)
    O2(b, c2, h, w)  = fmax(O2(b, c2, h, w), 0)
}
"""

In [3]:
# Smaller autotuning budget for the batched kernel: still above the
# author-recommended settings, but reduced to keep tuning time down.
# This rebinds the `tune_settings` name used by the autotune cell that follows.
tune_settings = dict(
    threads=32,
    generations=5,
    pop_size=12,
    number_elites=2,
)

In [5]:
# tune mobilenet for the typical mobilenet dimensions and a normal batch size
# In terms of the batch_mobilenet signature: B=32, C1=64, H=W=56,
# KH1=KW1=3, C2=32.
I = torch.randn(32, 64, 56, 56).cuda()
W1 = torch.randn(64, 3, 3).cuda()
B1 = torch.randn(64).cuda()
W2 = torch.randn(32, 64).cuda()
B2 = torch.randn(32).cuda()

# The tuned options are saved under the relative 'cache/' directory. Create it
# up front so a fresh checkout does not depend on the directory already
# existing (TC is not assumed to create it).
os.makedirs('cache', exist_ok=True)

# Compile the batched definition and autotune it, starting from the 'conv'
# options. The call stays last in the cell so its return value is displayed.
batch_mobilenet = tc.define(batch_mobilenet_lang, name='batch_mobilenet')
batch_mobilenet.autotune(I, W1, B1, W2, B2,
                         cache='cache/batch_mobilenet.tc',
                         **tune_settings,
                         options=tc.Options('conv'))

[INFO]: Autotuning cache will be saved to: cache/batch_mobilenet.tc.cuda/options


<tensor_comprehensions.mapping_options.Options at 0x7fb8019fb6f8>

In [20]:
# Warm-up: two untimed calls, each followed by a device sync, to prep CUDA
# before the benchmark.
for _warmup_round in range(2):
    batch_mobilenet(I, W1, B1, W2, B2, cache='cache/batch_mobilenet.tc')
    torch.cuda.synchronize()

# Show the shape of the input being benchmarked.
print(I.size())

torch.Size([32, 64, 56, 56])


In [29]:
# test performance using TC

# batch_mobilenet_lang indexes its input as I(b, c1, h + kh, w + kw) with no
# padding, so TC's range inference gives outputs of
# (H - KH1 + 1) x (W - KW1 + 1) = 54 x 54 for the 56 x 56 input, not 56 x 56.
# Size the reusable output buffers from the tensors themselves so they match
# what the kernel writes.
# NOTE(review): the PyTorch reference in this notebook uses padding=1 and so
# produces 56 x 56 outputs -- the two benchmarks do slightly different work.
out_h = I.size(2) - W1.size(1) + 1
out_w = I.size(3) - W1.size(2) + 1
temp = torch.zeros(I.size(0), W1.size(0), out_h, out_w).cuda()    # O1: (B, C1, H', W')
output = torch.zeros(I.size(0), W2.size(0), out_h, out_w).cuda()  # O2: (B, C2, H', W')
torch.cuda.synchronize()

timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    batch_mobilenet(I, W1, B1, W2, B2,
         cache='cache/batch_mobilenet.tc',
         outputs=[temp, output])
    # wait for the GPU so the measured interval covers the whole call
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = np.sum(timings)
print('total time: ' + str(total_elapsed))

# minimum, median and 90th percentile of the per-call time (seconds)
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)

total time: 4.75282455601
[ 0.00458481  0.00468457  0.00475157]


In [26]:
# test the torch equivalent

class NMobileNet(nn.Module):
    """PyTorch reference for the batched depthwise-separable block.

    The block is a 3x3 depthwise convolution (one filter per input channel)
    followed by ReLU, then a 1x1 pointwise convolution followed by ReLU.

    Args:
        in_channels: number of input channels; also the width of the
            depthwise stage. Defaults to 64 (the size benchmarked here).
        out_channels: number of channels produced by the 1x1 convolution.
            Defaults to 32.
        padding: zero-padding of the 3x3 convolution. The default of 1 keeps
            the spatial size unchanged; 0 gives an unpadded convolution whose
            output is 2 smaller in height and width.
    """

    def __init__(self, in_channels=64, out_channels=32, padding=1):
        super(NMobileNet, self).__init__()
        self.layer = nn.Sequential(
            # groups == in_channels makes this convolution depthwise
            nn.Conv2d(in_channels, in_channels, 3, padding=padding, groups=in_channels),
            nn.ReLU(inplace=True),
            # 1x1 convolution mixes channels: in_channels -> out_channels
            nn.Conv2d(in_channels, out_channels, 1, padding=0),
            nn.ReLU(inplace=True))

    def forward(self, x):
        """Apply the block to ``x`` of shape (N, in_channels, H, W)."""
        return self.layer(x)

# Build the batched PyTorch reference module and move its parameters to the GPU.
MN = NMobileNet()
MN.cuda()

# A random batch of 32 inputs, each 64 x 56 x 56.
# NOTE(review): the reference pads its 3x3 convolution (padding=1), so it
# produces 56x56 outputs, whereas the TC comprehension in batch_mobilenet_lang
# is unpadded -- confirm both compute the same output extent before comparing
# the two timings.
var_input = Variable(torch.randn(32, 64, 56, 56).cuda())

# run a couple times to work out start hiccups
for _warmup_round in range(2):
    MN(var_input)
    torch.cuda.synchronize()


# test performance
# (the forward pass returns a fresh tensor, so no output buffer is preallocated)
timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    output = MN(var_input)
    # wait for the GPU so the measured interval covers the whole forward pass
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = np.sum(timings)
print('total time: ' + str(total_elapsed))

# minimum, median and 90th percentile of the per-call time (seconds)
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)

total time: 1.13856418202
[ 0.00111421  0.00112798  0.00115446]
