In [1]:
import tensor_comprehensions as tc
import torch
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
import logging
import time

In [2]:
# Currently only two layers in the TC database ship gradient code:
# matmul and convolution_strided.

[
    layer['grad']
    for layer in (tc.database[entry] for entry in tc.database)
    if 'grad' in layer.keys()
]

['def matmul_grad(float(M,N) A, float(N,K), float(M,K) O_grad) -> (A_grad, B_grad){\n  A_grad(i, j) +=! O_grad(i, kk) * B(j, kk)\n  B_grad(i, j) +=! O_grad(kk, j) * A(kk, i)\n}\n',
 'def convolution_strided_grad(float(N, C, H, W) I, float(M, C, KH, KW) W1, float(N, M, H, W) O_grad)\n-> (I_grad, W1_grad) {{\n  I_grad(n, c, h, w) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)\n  W1_grad(m, c, kh, kw) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)\n}}\n']

In [None]:
# Display TC's default autotuner settings; they don't specify very much.
tc.autotuner_settings

In [None]:
# Autotuner settings recommended by the TC authors for better performance.
tune_settings = dict(
    threads=32,
    generations=8,
    pop_size=20,
    number_elites=4,
)

In [None]:
# Heavier tune settings: more generations and a larger population.
# NOTE: this rebinds `tune_settings`, so when the notebook is run top to
# bottom these values are the ones the later autotune calls receive.
tune_settings = dict(
    threads=32,
    generations=10,
    pop_size=50,
    number_elites=3,
)

In [3]:
# TC source for a transposed batched matmul. X is (B, N, M), Y is (B, K, M),
# and the index m appears only on the right-hand side, so for each batch b:
#   Z[b, n, k] = sum_m X[b, n, m] * Y[b, k, m]
# The torch baseline cells compute the same thing as
# mat1.matmul(mat2.permute(0, 2, 1)).
# NOTE(review): `+=!` is presumably "sum-reduce into a zero-initialised
# output" -- confirm against the TC language reference.
tbmm_lang = """
    def tbmm(float(B, N, M) X, float(B, K, M) Y) -> (Z) {
        Z(b, n, k) +=! X(b, n, m) * Y(b, k, m)
    }
"""

In [6]:
# Build the `tbmm` layer object from the TC definition string in `tbmm_lang`;
# later cells call it directly and via `.autotune(...)`.
tbmm = tc.define(tbmm_lang, name='tbmm')

In [None]:
# Autotune tbmm at the paper-given problem size,
# (B, N, M, K) = (500, 26, 72, 26). N == K here, so X (B, N, M) and
# Y (B, K, M) have the same shape.

mat1, mat2 = (torch.randn(500, 26, 72).cuda() for _ in range(2))

tbmm = tc.define(tbmm_lang, name='tbmm')
tbmm.autotune(
    mat1,
    mat2,
    cache='cache/tbmm_500_26_72_26.tc',
    **tune_settings,
    options=tc.Options('mlp')
)

In [None]:
# Run the tuned kernel twice to prep CUDA, waiting for the device after
# each call.

for _warmup_pass in range(2):
    tbmm(mat1, mat2, cache='cache/tbmm_500_26_72_26.tc')
    torch.cuda.synchronize()

In [None]:
# Benchmark the TC kernel over many iterations, writing into a
# preallocated (B, N, K) = (500, 26, 26) output buffer.
output = torch.zeros(500, 26, 26).cuda()

# One wall-clock sample (seconds) per iteration.
timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    tbmm(mat1, mat2, cache='cache/tbmm_500_26_72_26.tc', outputs=output)
    # Block until the GPU is done so the sample covers the whole kernel.
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



In [None]:
# Baseline: time the equivalent torch batched matmul (mat1 @ mat2^T)
# with the same measurement loop.

timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    # `output` is rebound to a freshly allocated result on every iteration.
    output = torch.matmul(mat1, mat2.permute(0, 2, 1))
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



In [None]:
# Autotune tbmm at a completely different size,
# (B, N, M, K) = (800, 80, 40, 80).
# The batch dimension has to be 800 (it was 500): that is what this header,
# the cache file name, and the (800, 80, 80) output buffer handed to the
# kernel via `outputs=` in the timing cell all describe.

mat1 = torch.randn(800, 80, 40).cuda()
mat2 = torch.randn(800, 80, 40).cuda()

tbmm = tc.define(tbmm_lang, name='tbmm')
tbmm.autotune(mat1, mat2,
              cache='cache/tbmm_800_80_40_80.tc',
              **tune_settings,
              options=tc.Options('mlp'))

In [None]:
# Run the tuned kernel twice to prep CUDA, waiting for the device after
# each call.

for _warmup_pass in range(2):
    tbmm(mat1, mat2, cache='cache/tbmm_800_80_40_80.tc')
    torch.cuda.synchronize()

In [None]:
# Benchmark the TC kernel over many iterations, writing into a
# preallocated (B, N, K) = (800, 80, 80) output buffer.
output = torch.zeros(800, 80, 80).cuda()

# One wall-clock sample (seconds) per iteration.
timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    tbmm(mat1, mat2, cache='cache/tbmm_800_80_40_80.tc', outputs=output)
    # Block until the GPU is done so the sample covers the whole kernel.
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



In [None]:
# Baseline: time the equivalent torch batched matmul (mat1 @ mat2^T)
# with the same measurement loop.

timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    # `output` is rebound to a freshly allocated result on every iteration.
    output = torch.matmul(mat1, mat2.permute(0, 2, 1))
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



In [None]:
# Autotune tbmm at a third, completely different size,
# (B, N, M, K) = (200, 200, 200, 200). The cache for this size is
# 'cache/tbmm_slow.tc', the same file the timing cell for this size uses.

mat1, mat2 = (torch.randn(200, 200, 200).cuda() for _ in range(2))

tbmm = tc.define(tbmm_lang, name='tbmm')
tbmm.autotune(
    mat1,
    mat2,
    cache='cache/tbmm_slow.tc',
    **tune_settings,
    options=tc.Options('mlp')
)

In [4]:

# Recreate the (200, 200, 200) random CUDA inputs so the cells below can
# run without repeating the autotuning cell.
mat1, mat2 = (torch.randn(200, 200, 200).cuda() for _ in range(2))

In [None]:
# Run twice to prep CUDA.
# The cache path must be 'cache/tbmm_slow.tc': that is the file the autotune
# call for this size is given and the one the timing loop reads. The previous
# path ('cache/tbmm_200_200_200_200.tc') is not produced by any cell in this
# notebook, so the warm-up was not exercising the kernel that gets timed.

tbmm(mat1, mat2, cache='cache/tbmm_slow.tc')
torch.cuda.synchronize()
tbmm(mat1, mat2, cache='cache/tbmm_slow.tc')
torch.cuda.synchronize()

In [8]:
# Benchmark the TC kernel over many iterations, writing into a
# preallocated (B, N, K) = (200, 200, 200) output buffer.
output = torch.zeros(200, 200, 200).cuda()

# One wall-clock sample (seconds) per iteration.
timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    tbmm(mat1, mat2, cache='cache/tbmm_slow.tc', outputs=output)
    # Block until the GPU is done so the sample covers the whole kernel.
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)


total time: 12.460763718
[ 0.01223893  0.01245288  0.01253984]


In [10]:
# Baseline: time the equivalent torch batched matmul (mat1 @ mat2^T)
# with the same measurement loop.

timings = np.zeros(1000)

for i in range(timings.size):
    start = time.perf_counter()
    # `output` is rebound to a freshly allocated result on every iteration.
    output = torch.matmul(mat1, mat2.permute(0, 2, 1))
    torch.cuda.synchronize()
    timings[i] = time.perf_counter() - start

total_elapsed = timings.sum()
print('total time: ' + str(total_elapsed))

# Minimum (0th), median (50th) and 90th percentile of the samples.
percentiles = np.percentile(timings, [0, 50, 90])
print(percentiles)



total time: 2.47778633102
[ 0.00219591  0.0024615   0.00259213]
