In [None]:
import tensor_comprehensions as tc
import torch
import numpy as np
import matplotlib.pyplot as plt

import os
import sys
import logging
import time

In [None]:
# as of current, there are only two layers with gradient code:
# matmul and convolution_strided

[tc.database[entry]['grad'] for entry in tc.database if 'grad' in tc.database[entry].keys()]

In [None]:
# default autotuner settings don't specify very much
tc.autotuner_settings

In [None]:
# as recommended by the authors for better performance
tune_settings = {
    "threads": 32,
    "generations": 5,
    "pop_size": 10,
    "number_elites": 1
}

In [None]:
# tune 100 by 100 matrix multiplication

mat1 = torch.randn(100, 100).cuda()
mat2 = torch.randn(100, 100).cuda()

matmul = tc.define(tc.database['matmul']['lang'], name='matmul')
matmul.autotune(mat1, mat2, 
                cache='cache/matmul_100.tc', 
                **tune_settings, 
                options=tc.Options('mlp'))

In [None]:
matmul(mat1, mat2, cache='cache/matmul_100.tc')

In [None]:
# test performance by running multiple iterations
output = torch.zeros(100, 100).cuda()
start = time.time()
for i in range(10000):
    matmul(mat1, mat2, cache='cache/matmul_100.tc', outputs=output)
end = time.time()
elapsed = end - start
print(elapsed)

In [None]:
# time the torch equivalent
start = time.time()
for i in range(10000):
    output = mat1.mm(mat2)
end = time.time()
elapsed = end - start
print(elapsed)

In [None]:
# how long does matrix multiply actually take?
large1 = torch.randn(1000, 1000).cuda()
large2 = torch.randn(1000, 1000).cuda()
out = torch.zeros(1000, 1000).cuda()

In [1]:
s = time.perf_counter()
for i in range(10):
    out = large1.mm(large2)
elapsed = time.perf_counter()- s
print(elapsed)

NameError: name 'time' is not defined

In [None]:
# https://discuss.pytorch.org/t/tensorflow-vs-pytorch-convnet-benchmark/8738/4

# https://discuss.pytorch.org/t/measuring-gpu-tensor-operation-speed/2513/3