In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(f'/home/beans/bespoke')

from models import EffNet
from constants import *
from imports import *
from rw_dataloader import RealWorldDataloader
from loaders import BlenderDataloader
from train_utils import *

torch.__version__, torch.cuda.device_count(), torch.cuda.get_device_name(torch.cuda.current_device())



('1.13.1', 2, 'NVIDIA GeForce RTX 3090')

In [2]:
import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

torch.hub._validate_not_a_forked_repo=lambda a,b,c: True

#efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)
#efficientnet = timm.create_model('efficientnet_b4',pretrained=True)

In [3]:
m = EffNet().to(device) # 13M params, 11.6M without RNN, 
sum([torch.numel(p) for p in m.parameters()]) / 1000

INFO:timm.models.helpers:Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth)
INFO:timm.models.helpers:Converted input conv conv_stem pretrained weights from 3 to 6 channel(s)


18303.919

In [4]:
stem = "12.15_avg"

m.load_state_dict(torch.load(f"{BESPOKE_ROOT}/models_deploy/m_{stem}.torch"), strict=False)
backbone = m.backbone

In [5]:
cudnn.benchmark = True

def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000):
    input_data = torch.randn(input_shape)
    input_data = input_data.to("cuda")
    if dtype=='fp16':
        input_data = input_data.half()
        
    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            features = model(input_data)
    torch.cuda.synchronize()
    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns+1):
            start_time = time.time()
            pred_loc  = model(input_data)
            torch.cuda.synchronize()
            end_time = time.time()
            timings.append(end_time - start_time)
            if i%10==0:
                print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print("Input shape:", input_data.size())
    print('Average throughput: %.2f images/second'%(input_shape[0]/np.mean(timings)))

In [6]:
input_shape = (1, N_CHANNELS, IMG_HEIGHT, IMG_WIDTH)

In [9]:
model = backbone.eval().to("cuda")
benchmark(model, input_shape=input_shape, nruns=10)

Warm up ...
Start timing ...
Iteration 10/10, avg batch time 10.56 ms
Input shape: torch.Size([1, 6, 360, 1440])
Average throughput: 94.66 images/second


In [10]:
traced_model = torch.jit.trace(model, torch.randn(input_shape).to("cuda"))
benchmark(traced_model, input_shape=input_shape, nruns=10)

Warm up ...
Start timing ...
Iteration 10/10, avg batch time 8.20 ms
Input shape: torch.Size([1, 6, 360, 1440])
Average throughput: 121.90 images/second


In [11]:
trt_model = torch_tensorrt.compile(model, 
    inputs= [torch_tensorrt.Input(input_shape)],
    enabled_precisions= { torch_tensorrt.dtype.half} # Run with FP16
)

ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
Try decreasing the workspace size with IBuilderConfig::setMemoryPoolLimit().
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
Try decreasing the workspace size with IBuilderConfig::setMemoryPoolLimit().
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
ERROR: [Torch-TensorRT TorchScript Conversion Context] -

In [12]:
benchmark(trt_model, input_shape=input_shape, nruns=10)

G: [Torch-TensorRT TorchScript Conversion Context] - Skipping tactic 8 due to insufficient memory on requested size of 17179869184 detected for tactic 0x000000000000003c.
Try decreasing the workspace size with IBuilderConfig::setMemoryPoolLimit().
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
Try decreasing the workspace size with IBuilderConfig::setMemoryPoolLimit().
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
ERROR: [Torch-TensorRT TorchScript Conversion Context] - 2: [virtualMemoryBuffer.cpp::resizePhysical::145] Error Code 2: OutOfMemory (no further information)
Try decreasing the workspace size with IBui

Warm up ...
Start timing ...
Iteration 10/10, avg batch time 3.67 ms
Input shape: torch.Size([1, 6, 360, 1440])
Average throughput: 272.26 images/second


In [13]:
torch.jit.save(trt_model, f"{BESPOKE_ROOT}/trt_models/backbone_trt.jit.pt")