In [1]:
import tensor_comprehensions as tc
import torch
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
import logging
import time

In [3]:
# Decode the options file stored next to the tbmm cache entry.
# NOTE(review): presumably this turns the serialized mapping options into a
# human-readable form; the file name suggests sizes 500/26/72/26 — confirm.
tuned_options_file = 'cache/tbmm_500_26_72_26.tc.options'
tc.decode(tuned_options_file)

In [4]:
# Goal: run tbmm with the mapping options given in the paper.
# Begin from the 'naive' preset; the individual settings are overridden in a
# later cell.
preset_name = 'naive'
options = tc.Options(preset_name)

In [5]:
# Quick look at the options object. Printing it only shows the default object
# representation (type and memory address), not the individual mapping settings.
print(options)

<tensor_comprehensions.mapping_options.Options object at 0x7fca5bbd77a0>


In [13]:
# Hand-set mapping options for tbmm (values intended to mirror the paper's).
tile_sizes = [1]
thread_mapping = [7, 26]
block_mapping = [72, 16, 1]
unroll_factor = 128

options.tile(tile_sizes)
options.mapToThreads(thread_mapping)
options.mapToBlocks(block_mapping)
options.useSharedMemory(True)
options.unrollCopyShared(True)
# Kept as the last expression so the cell still displays the returned object.
options.unroll(unroll_factor)


<tensor_comprehensions.mapping_options.Options at 0x7fca5bbd77a0>

In [14]:
# Tensor Comprehensions source for `tbmm`, a batched matrix multiply whose
# second operand is indexed as (b, k, m), i.e. consumed in transposed layout.
#   X: float tensor declared with shape (B, N, M)
#   Y: float tensor declared with shape (B, K, M)
#   Z: output indexed as (b, n, k); `m` appears only on the right-hand side,
#      so it is the index being reduced over.
# In TC notation `+=!` is documented as a sum reduction that initialises the
# output first (see the TC language docs).
tbmm_lang = """
    def tbmm(float(B, N, M) X, float(B, K, M) Y) -> (Z) {
        Z(b, n, k) +=! X(b, n, m) * Y(b, k, m)
    }
"""

# Build a callable from the TC source; it is invoked on CUDA tensors in the
# cells below.
# NOTE(review): `name` presumably has to match the `def` name inside the TC
# string — confirm against the TC API docs.
tbmm = tc.define(tbmm_lang, name='tbmm')

In [15]:
# Both operands share one shape: (500, 26, 72), matching the (B, N, M) and
# (B, K, M) declarations in the tbmm signature.
operand_shape = (500, 26, 72)

# Random inputs are drawn on the host and then moved to the GPU.
mat1 = torch.randn(*operand_shape).cuda()
mat2 = torch.randn(*operand_shape).cuda()

# Run tbmm once with the hand-configured mapping options and display the result.
tbmm(mat1, mat2, options=options)

Variable containing:
( 0 ,.,.) = 
 -1.6885e+01 -7.3772e+00  8.4065e+00  ...   1.4715e+01  5.7064e+00 -1.4126e+00
  1.1810e+01  4.6003e+00  2.9482e+00  ...   1.3520e+00 -6.3731e+00 -8.5967e+00
  3.8169e+00  1.2704e+01 -1.9057e+00  ...   1.1380e+00 -1.2910e+01 -8.8803e+00
                 ...                   ⋱                   ...                
 -3.8452e+00 -6.2901e+00 -1.2869e+01  ...  -5.9708e+00 -8.7984e+00 -1.1660e+01
 -2.3053e+00  1.0173e-01  2.2982e+01  ...  -1.3297e+01  1.3197e+00  5.6996e+00
  1.8203e+00 -3.9348e+00  1.4310e+00  ...   1.8315e-01  1.2535e+01  8.7401e+00

( 1 ,.,.) = 
 -3.3645e+00 -1.0240e+01  4.7245e+00  ...   3.6880e+00  5.7236e+00  8.5060e+00
 -1.1546e+01 -2.9363e+00  4.7658e+00  ...   1.5735e+00 -1.0690e+00 -6.7049e+00
  7.0505e+00  2.3530e+00  3.0476e+00  ...  -5.0394e+00  1.2592e+01  5.2499e+00
                 ...                   ⋱                   ...                
  9.1670e-01 -2.2322e+01  1.4825e+00  ...   6.9605e+00 -5.0934e+00 -1.8117e+01
  1.

In [17]:
# Benchmark tbmm by timing many back-to-back invocations.
#
# Each sample is the wall-clock time of one call plus the GPU synchronisation
# that follows it. CUDA kernel launches are asynchronous, so without the
# synchronize() the timer would only measure the launch overhead.
N_ITERS = 1000                               # number of timed invocations
CACHE_FILE = 'cache/tbmm_500_26_72_26.tc'    # cache entry passed to every call

# Pre-allocated output buffer, reused by every call.
output = torch.zeros(500, 26, 26).cuda()

# Warm-up: one untimed call so that any one-time cost of the first invocation
# (e.g. reading the cache or preparing the kernel) and any GPU work still
# queued from earlier cells does not leak into the first timed sample.
tbmm(mat1, mat2, cache=CACHE_FILE, outputs=output)
torch.cuda.synchronize()

timings = np.zeros(N_ITERS)

for i in range(N_ITERS):
    start = time.perf_counter()
    tbmm(mat1, mat2, cache=CACHE_FILE, outputs=output)
    torch.cuda.synchronize()  # wait for the kernel to finish before stopping the clock
    timings[i] = time.perf_counter() - start


total_elapsed = np.sum(timings)
print('total time: ' + str(total_elapsed))

# get percentile statistics: minimum (p0), median (p50) and p90, in seconds
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



total time: 0.153102859998
[ 0.00013205  0.00014391  0.00017327]
