In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys

# Make the project checkout importable so the project-local imports resolve.
# Guarded so that re-running this cell (common with autoreload workflows)
# does not keep stacking duplicate entries onto sys.path.
_project_dir = '/home/beans/bespoke'
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

from models import EffNet
from constants import *
from imports import *
from train_utils import *

# Sanity check shown as the cell output: torch version, number of visible
# CUDA devices, and the name of the currently selected CUDA device.
(
    torch.__version__,
    torch.cuda.device_count(),
    torch.cuda.get_device_name(torch.cuda.current_device()),
)

In [None]:
import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

def _skip_fork_validation(a, b, c):
    """Stub for torch.hub's fork check: takes three positional arguments and always returns True."""
    return True

# Replace torch.hub's private `_validate_not_a_forked_repo` hook with the stub above.
# NOTE(review): presumably a workaround for hub validation failures — confirm it is still needed.
torch.hub._validate_not_a_forked_repo = _skip_fork_validation

#efficientnet_b0 = timm.create_model('efficientnet_b0',pretrained=True)
#efficientnet = timm.create_model('efficientnet_b4',pretrained=True)

In [None]:
# Build the network and move it to the configured device.
# Author's note: 13M params, 11.6M without RNN.
# NOTE(review): a later run in this file prints 18297.787 for the same expression — this note may be stale.
m = EffNet().to(device)
# Total parameter count, in thousands; last expression so the notebook displays it.
sum(p.numel() for p in m.parameters()) / 1000

In [None]:
# Checkpoint identifier; it is interpolated into the file name under models_deploy/.
stem = "1.29_avg"

# strict=False: keys that do not line up between the checkpoint and the model
# are tolerated instead of raising.
m.load_state_dict(
    torch.load(f"{BESPOKE_ROOT}/models_deploy/m{stem}.torch"),
    strict=False,
)
# Keep a handle on the backbone sub-module; it is what gets benchmarked and compiled.
backbone = m.backbone

In [None]:
# https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/
import torch.backends.cudnn as cudnn
# Enable the cuDNN autotuner (see torch.backends.cudnn docs); the input shape is fixed during a benchmark.
cudnn.benchmark = True

def benchmark(model, input_shape=(1024, 3, 512, 512), dtype='fp32', nwarmup=50, nruns=1000, device="cuda"):
    """Time forward passes of ``model`` on random input and print latency and throughput.

    Args:
        model: Callable that accepts a single tensor of shape ``input_shape``.
        input_shape: Shape of the random input tensor. Its first entry is
            treated as the batch size when computing images/second.
        dtype: ``'fp16'`` casts the input to half precision; any other value
            leaves the input as produced by ``torch.randn``. The model itself
            is never cast here.
        nwarmup: Number of untimed forward passes run before measuring.
        nruns: Number of timed forward passes; must be at least 1.
        device: Device the input tensor is created on. Defaults to ``"cuda"``,
            matching the previous hard-coded behaviour.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``nruns`` is less than 1 (there would be nothing to average).
    """
    if nruns < 1:
        raise ValueError(f"nruns must be >= 1, got {nruns}")

    device = torch.device(device)
    on_gpu = device.type == "cuda"

    def _sync():
        # CUDA kernels launch asynchronously; wait for them so the wall-clock
        # interval covers the actual work. Nothing to wait for on CPU.
        if on_gpu:
            torch.cuda.synchronize(device)

    # Allocate directly on the target device instead of building on the host and copying.
    input_data = torch.randn(input_shape, device=device)
    if dtype == 'fp16':
        input_data = input_data.half()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            model(input_data)
    _sync()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            # perf_counter is monotonic and high-resolution; time.time() can jump with clock adjustments.
            start_time = time.perf_counter()
            model(input_data)
            _sync()
            timings.append(time.perf_counter() - start_time)
            if i % 10 == 0:
                print('Iteration %d/%d, avg batch time %.2f ms' % (i, nruns, np.mean(timings) * 1000))

    print("Input shape:", input_data.size())
    print('Average throughput: %.2f images/second' % (input_shape[0] / np.mean(timings)))

In [None]:
# Display the channel-count constant used to build the input shape.
# It is not defined in this notebook; presumably it comes from the `constants` star import — confirm.
N_CHANNELS_MODEL

In [None]:
# Shape of one benchmark input: a batch of 1, then channels, height, width.
input_shape = (
    1,
    N_CHANNELS_MODEL,
    IMG_HEIGHT,
    IMG_WIDTH,
)

In [None]:
# Baseline measurement: the plain (eager) backbone in eval mode on the GPU.
model = backbone.to("cuda").eval()
benchmark(model, nruns=10, input_shape=input_shape)

In [None]:
# TorchScript variant: trace the backbone with a random example input, then time it.
traced_model = torch.jit.trace(
    model,
    torch.randn(input_shape).to("cuda"),
)
benchmark(traced_model, nruns=10, input_shape=input_shape)

In [None]:
import torch_tensorrt

In [None]:
%%time
# Compile the backbone with Torch-TensorRT for the fixed benchmark input shape.
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input(input_shape)],
    enabled_precisions={torch_tensorrt.dtype.half},  # Run with FP16
)

In [None]:
# Display the compiled module's repr as the cell output.
trt_model

In [None]:
# Write the compiled module to disk via torch.jit.save.
torch.jit.save(
    trt_model,
    f"{BESPOKE_ROOT}/trt_models/backbone_trt.jit.pt",
)

In [None]:
# Time the TensorRT-compiled backbone with the same shape and run count as the other variants.
benchmark(trt_model, nruns=10, input_shape=input_shape)

In [None]:
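# TODO: try enabling CUDA lazy loading (e.g. set the CUDA_MODULE_LOADING=LAZY environment variable before CUDA initialises). Memory errors here were accompanied by the warning: "CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage"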

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys

# Make the project checkout importable so the project-local imports resolve.
# Guarded so that re-running this cell (common with autoreload workflows)
# does not keep stacking duplicate entries onto sys.path.
_project_dir = '/home/beans/bespoke'
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

from models import EffNet
from constants import *
from imports import *
from train_utils import *

import torch
import torch_tensorrt
import timm
import time
import numpy as np
import torch.backends.cudnn as cudnn

In [2]:
# Checkpoint identifier; it is interpolated into the file name under models_deploy/.
stem = "1.29_avg"

# Build the network on the configured device and load the checkpoint
# (default strict loading: mismatched keys raise).
m = EffNet().to(device)
m.load_state_dict(
    torch.load(f"{BESPOKE_ROOT}/models_deploy/m{stem}.torch")
)
# Record which checkpoint this model instance came from.
m.model_stem = stem
# Total parameter count, in thousands; last expression so the notebook displays it.
sum(p.numel() for p in m.parameters()) / 1000

INFO:timm.models.helpers:Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth)
INFO:timm.models.helpers:Converted input conv conv_stem pretrained weights from 3 to 4 channel(s)


18297.787

In [3]:
# Calls the model's own `load_trt_backbone()`; its behaviour is defined on EffNet (imported from `models`), not here.
# NOTE(review): presumably swaps in a TensorRT-compiled backbone such as trt_models/backbone_trt.jit.pt — confirm.
m.load_trt_backbone()



In [4]:
import wandb
# Attach to a specific W&B run by id; resume="allow" is passed so an existing run can be continued.
wandb.init(
    project="carla",
    id='66i4vxlw',
    resume="allow",
)

[34m[1mwandb[0m: Currently logged in as: [33mrgilman33[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.13.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [5]:
%%time
from rollout import RwEvaluator
# Build the evaluator for a single recorded run, with the TensorRT path enabled and rollouts not saved.
rw_evaluator = RwEvaluator(
    m,
    wandb=wandb,
    save_rollouts=False,
    trt=True,
    run_ids=["run_556a"],
    bptt=1,
)

CPU times: user 1.95 ms, sys: 734 µs, total: 2.69 ms
Wall time: 2.31 ms


In [6]:
# Run the evaluation; the progress counters and final report in the output are printed during this call.
# NOTE(review): the trailing "20490" is unexplained — document what it refers to.
rw_evaluator.evaluate() #20490

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
loader is done
Rollout complete!
down w rollouts, reporting
run_556a
