In [1]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('facial_expression')
sys.path.append('facial_age_gender')

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
import numpy as np
from typing import List
from copy import deepcopy

from facial_expression.VGG_Face_torch import VGG_emotionNet
from facial_age_gender.VGG_Face_torch import VGG_ageNet, VGG_genderNet
from test_func import test_facial, test_multi_acc

from metamorph.compiler.compiler import MetaMorph
from metamorph.graph.abs_graph import Graph
from metamorph.graph.cmp_graph import ComputeGraph
from metamorph.metrics.testing_utils import test_accuracy, test_latency
from metamorph.compiler.policy import SimulatedAnnealingPolicy
from metamorph.data.dataloader import DatasetSampler

DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'
SAMPLE_INPUT = torch.rand(1,3,224,224).to(DEVICE)
kwargs = {'num_workers': 4, 'pin_memory': True}

In [2]:
if_print_ori_model = False

if_test_facial = False
if_test_ori_accuracy = False
if_test_ori_latency = False

test_random_connet = False

In [None]:
# emotionNet + FER2013
emo_transform_train = transforms.Compose([transforms.RandomHorizontalFlip(),
                                 transforms.Resize(256),
                                 transforms.RandomCrop(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.507395516207, ),(0.255128989415, ))
                                ])
emo_transform_test  = transforms.Compose([transforms.Resize(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.507395516207, ),(0.255128989415, ))
                                ])
emo_train_data = torchvision.datasets.ImageFolder('/home/qizhengyang/MTL_compiler/test_metamorph/face/dataset/fer2013/train',transform=emo_transform_train)
emo_test_data = torchvision.datasets.ImageFolder('/home/qizhengyang/MTL_compiler/test_metamorph/face/dataset/fer2013/test',transform=emo_transform_test)
emo_test_loader = torch.utils.data.DataLoader(emo_test_data, batch_size=128, shuffle=False, **kwargs)
print(len(emo_train_data), len(emo_test_data))

emotionNet = VGG_emotionNet()
emotionNet.load_state_dict(torch.load('facial_expression/EmotionNet.model', map_location=DEVICE))
emotionNet = emotionNet.to(DEVICE).eval()

if if_print_ori_model:
    print(emotionNet)

if if_test_facial:
    emo_acc = test_facial(emotionNet, emo_test_loader, DEVICE)

In [4]:
# ageNet + Adience
age_transform_test  = transforms.Compose([transforms.Resize(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.507395516207, ),(0.255128989415, ))
                                ])
age_data = torchvision.datasets.ImageFolder('/home/qizhengyang/MTL_compiler/test_metamorph/face/dataset/adience/age',transform=age_transform_test)
age_train_indices, age_test_indices = train_test_split(list(range(len(age_data.targets))), test_size=0.2, stratify=age_data.targets, random_state=10)
age_train_data = torch.utils.data.Subset(age_data, age_train_indices)
age_test_data = torch.utils.data.Subset(age_data, age_test_indices)
age_test_loader = torch.utils.data.DataLoader(age_test_data, batch_size=128, shuffle=False, **kwargs)
print(len(age_train_data), len(age_test_data))

ageNet = VGG_ageNet()
ageNet.load_state_dict(torch.load('facial_age_gender/ageNet.model', map_location=DEVICE))
ageNet = ageNet.to(DEVICE).eval()

if if_print_ori_model:
    print(ageNet)

if if_test_facial:
    age_acc = test_facial(ageNet, age_test_loader, DEVICE)

8824 2206


In [5]:
# genderNet + Adience
gen_transform_test  = transforms.Compose([transforms.Resize(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.507395516207, ),(0.255128989415, ))
                                ])
gender_data = torchvision.datasets.ImageFolder('/home/qizhengyang/MTL_compiler/test_metamorph/face/dataset/adience/gender',transform=gen_transform_test)
gen_train_indices, gen_test_indices = train_test_split(list(range(len(gender_data.targets))), test_size=0.2, stratify=gender_data.targets, random_state=10)
gen_train_data = torch.utils.data.Subset(gender_data, gen_train_indices)
gen_test_data = torch.utils.data.Subset(gender_data, gen_test_indices)
gen_test_loader = torch.utils.data.DataLoader(gen_test_data, batch_size=128, shuffle=False, **kwargs)
print(len(gen_train_data), len(gen_test_data))

genderNet = VGG_genderNet()
genderNet.load_state_dict(torch.load('facial_age_gender/genderNet.model', map_location=DEVICE))
genderNet = genderNet.to(DEVICE).eval()

if if_print_ori_model:
    print(genderNet)

if if_test_facial:
    gen_acc = test_facial(genderNet, gen_test_loader, DEVICE)

7878 1970


In [6]:
def parse_model(model: nn.Module) -> List[nn.Module]:
    res = []
    for layer in model.children():
        if type(layer) in MetaMorph.BASIC_OPS:
            res.append(layer)
        elif isinstance(layer, nn.Sequential):
            res.extend(parse_model(layer))
        else:
            res.append(layer)
    return res

parse_models = [parse_model(emotionNet), parse_model(ageNet), parse_model(genderNet)]
absGraph = Graph(SAMPLE_INPUT, parse_models, DEVICE)
cmpGraph = ComputeGraph(absGraph, parse_models, DEVICE)

if if_print_ori_model:
    print(absGraph)

if if_test_ori_latency:
    ori_latency = test_latency(cmpGraph, SAMPLE_INPUT)

if test_random_connet:
    n_trial = 20
    graph1 = deepcopy(absGraph)
    graph1.random_connect(n_trial=n_trial)
    graph1.build_mergeable_nodes()
    graph1.random_connect(n_trial=n_trial)
    graph1.build_mergeable_nodes()
    graph1.random_connect(n_trial=n_trial)
    graph1.build_mergeable_nodes()
    graph1.random_connect(n_trial=n_trial)
    graph1.build_mergeable_nodes()
    graph1.random_connect(n_trial=n_trial)
    graph1.build_mergeable_nodes()
    graph1.random_connect(n_trial=n_trial)
    # print(graph1)
    cmpGraph = ComputeGraph(graph1, parse_models, DEVICE)
    # print(cmpGraph)
    cmpGraph.freeze_all_node()
    latency1 = test_latency(cmpGraph, SAMPLE_INPUT)
    torch.save(cmpGraph, 'opt_models/net1.pt')

Inference Latency: {'mean': 3.6052626847392983, 'median': 3.6256671727945404, 'std': 0.0864943688566018} 



In [7]:
# list of models
MODELS = [emotionNet, ageNet, genderNet]

# dataloader
ds_samples = DatasetSampler(
        [emo_train_data, age_train_data, gen_train_data],
        MODELS,
        DEVICE,
        [10000, 5000, 5000]
    )
samples_dataloader = torch.utils.data.DataLoader(ds_samples, batch_size=128, shuffle=True, **kwargs)
print(len(samples_dataloader.dataset))
test_loader_list = [emo_test_loader, age_test_loader, gen_test_loader]

if if_test_ori_accuracy:
    print("Task Accuracy of original graph: ")
    test_accuracy(cmpGraph, test_multi_acc, test_loader_list, DEVICE)

3


In [8]:
import time

# compiler settings
optimizer = torch.optim.Adam
compiler = MetaMorph(
    models=MODELS, optimizer=optimizer, optimizer_lr=0.001,
    input_size=SAMPLE_INPUT.shape, train_loader=samples_dataloader, test_loader=test_loader_list,
    f_accuracy=test_multi_acc, fine_tune_epochs=1, max_epoch=10, device=DEVICE
)
policy = SimulatedAnnealingPolicy(
    base_graph=compiler.original_graph,
    models=compiler.models,
    f_finetune=compiler.fine_tune, f_latency=compiler.f_latency, f_accuracy=compiler.f_accuracy,
    accuracy_tolerence = 0.02,
    device=compiler.device
)

# test the compiling time
torch.cuda.synchronize()
start = time.time()
best_result = compiler.optimize(policy)
torch.cuda.synchronize()
end = time.time()

print('---------------------------- Evaluation ---------------------------------')
print("Optimal Graph: \n", best_result.graph)
print("Optimal Latency: ", best_result.latency)
print("Compiling time: ", end - start)

cmpGraph_opt = best_result.cmp_graph
print("Task Accuracy of optimized graph: ")
test_accuracy(cmpGraph_opt, test_multi_acc, test_loader_list, DEVICE)

net1 Accuracy: 69.82446363889663%   net2 Accuracy: 66.13780598368088%   net3 Accuracy: 79.13705583756345%   

Inference Latency: {'mean': 7.945091457416612, 'median': 7.961193518713117, 'std': 0.09359653587794645} 

---------- Epoch: 1/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)True	ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)True	
ComputeNode(
  (op): ReLU()
)False	C

                                                                     

Finetune stop, the accuracy drop is:  0.12747187912464142
---------- Epoch: 2/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49960654973983765
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_m

                                                                     

Finetune stop, the accuracy drop is:  0.396162748336792
---------- Epoch: 3/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49864137172698975
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)

                                                                     

Finetune stop, the accuracy drop is:  0.24366486072540283
---------- Epoch: 4/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49907153844833374
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_m

                                                                     

Finetune stop, the accuracy drop is:  0.4055040776729584
---------- Epoch: 5/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.4982832074165344
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)True	ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)True	
ComputeNode(
  (op): ReLU()
)False	ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	ComputeNode

                                                                     

Finetune stop, the accuracy drop is:  0.02643221616744995
---------- Epoch: 6/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.4998756945133209
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mo

                                                            

Finetune stop, the accuracy drop is:  0.23291654884815216
---------- Epoch: 7/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49878257513046265
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_m

                                                                     

Finetune stop, the accuracy drop is:  0.25701114535331726
---------- Epoch: 8/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49850738048553467
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_m

                                                                     

Finetune stop, the accuracy drop is:  0.40368303656578064
---------- Epoch: 9/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.4973950982093811
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mo

                                                                     

Finetune stop, the accuracy drop is:  0.39038729667663574
---------- Epoch: 10/10, current best latency: 7.953068802340164
The current number of candidates is 0, the value of P is 0.49720099568367004
Optimizing on the Original Graph ...
ComputeNode()False	
ComputeNode(
  (op): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_mode=True)
)False	
ComputeNode(
  (op): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)False	
ComputeNode(
  (op): ReLU()
)False	
ComputeNode(
  (op): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0), dilation=1, ceil_

                                                                     

Finetune stop, the accuracy drop is:  0.25401678681373596
---------------------------- Evaluation ---------------------------------
Optimal Graph: 
 ((0, 0))placeholder	
((0, 0)->(1, 0))Conv2d	((0, 0)->(2, 0))Conv2d	((0, 0)->(3, 0))Conv2d	
((1, 0)->(1, 1))ReLU	((2, 0)->(2, 1))ReLU	((3, 0)->(3, 1))ReLU	
((1, 1)->(1, 2))Conv2d	((2, 1)->(2, 2))Conv2d	((3, 1)->(3, 2))Conv2d	
((1, 2)->(1, 3))ReLU	((2, 2)->(2, 3))ReLU	((3, 2)->(3, 3))ReLU	
((1, 3)->(1, 4))MaxPool2d	((2, 3)->(2, 4))MaxPool2d	((3, 3)->(3, 4))MaxPool2d	
((1, 4)->(1, 5))Conv2d	((2, 4)->(2, 5))Conv2d	((3, 4)->(3, 5))Conv2d	
((1, 5)->(1, 6))ReLU	((2, 5)->(2, 6))ReLU	((3, 5)->(3, 6))ReLU	
((1, 6)->(1, 7))Conv2d	((2, 6)->(2, 7))Conv2d	((3, 6)->(3, 7))Conv2d	
((1, 7)->(1, 8))ReLU	((2, 7)->(2, 8))ReLU	((3, 7)->(3, 8))ReLU	
((1, 8)->(1, 9))MaxPool2d	((2, 8)->(2, 9))MaxPool2d	((3, 8)->(3, 9))MaxPool2d	
((1, 9)->(1, 10))Conv2d	((2, 9)->(2, 10))Conv2d	((3, 9)->(3, 10))Conv2d	
((1, 10)->(1, 11))ReLU	((2, 10)->(2, 11))ReLU	((3, 10)->(3, 11)

tensor([0.6982, 0.6614, 0.7914])

In [12]:
if_tvm_tune = False
use_tvm_tuned = False
log_file = "tvm.log"

if_test_tvm_accuracy = False
if_test_tvm_latency = True

In [10]:
# test TVM
import tvm, time
from tvm import relay, autotvm
from tvm.contrib import graph_executor
from tvm_build import get_network, tune_tasks, tune_and_evaluate
torch.manual_seed(0)

traced_module = torch.jit.trace(cmpGraph, SAMPLE_INPUT).eval()
# print(traced_module)

input_shape = (1, 3, 224, 224)
input_data = torch.randn(input_shape)
input_name = 'input0'
shape_list = [(input_name, input_shape)]

mod, params = get_network(traced_module, shape_list)
# print(mod)

# # running TVM to compile model
if if_tvm_tune:
    tune_and_evaluate(mod, params, input_shape, log_name=log_file)

target = tvm.target.cuda()
if use_tvm_tuned:
    with autotvm.apply_history_best(log_file):
        with tvm.transform.PassContext(opt_level=3):
            lib = lib = relay.build(mod, target=target, params=params)
else:
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)
dev = tvm.device(str(target), 0)
tvm_model = graph_executor.GraphModule(lib["default"](dev))  

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


In [13]:
# run TVM model
tvm_model.set_input(input_name, input_data)

tvm_model.run()

if if_test_tvm_accuracy:
    tvm_out_0, tvm_out_1, tvm_out_2 = tvm_model.get_output(0), tvm_model.get_output(1), tvm_model.get_output(2)
    tvm_out_0 = torch.tensor(tvm_out_0.numpy()).to(DEVICE)
    tvm_out_1 = torch.tensor(tvm_out_1.numpy()).to(DEVICE)
    tvm_out_2 = torch.tensor(tvm_out_2.numpy()).to(DEVICE)
    ori_out_0, ori_out_1, ori_out_2 = cmpGraph(input_data.to(DEVICE))
    print(tvm_out_2)
    print(ori_out_2)

if if_test_tvm_latency:
    import timeit
    timing_number = 30
    timing_repeat = 30
    optimized = (
            np.array(timeit.Timer(lambda:tvm_model.run()).repeat(repeat=timing_repeat, number=timing_number))
            * 1000
            / timing_number
        )
    optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std":np.std(optimized)}

    print("optimized: %s" % (optimized))

optimized: {'mean': 3.142960899406009, 'median': 3.191243469094237, 'std': 0.31698324067760714}
