In [None]:
# Copyright 2019 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Torch-TensorRT Getting Started - CitriNet

## Overview

[Citrinet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/models.html#citrinet) is an acoustic model used for the speech-to-text recognition task. It is a version of [QuartzNet](https://arxiv.org/pdf/1910.10261.pdf) that extends [ContextNet](https://arxiv.org/pdf/2005.03191.pdf), utilizing subword encoding (via Word Piece tokenization) and the Squeeze-and-Excitation (SE) mechanism; Citrinet models are therefore smaller than QuartzNet models.

CitriNet models take in audio segments and transcribe them to letter, byte pair, or word piece sequences. 

<img src="https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/_images/jasper_vertical.png" alt="alt" width="50%"/>


### Learning objectives

This notebook demonstrates the steps for optimizing a pretrained CitriNet model with Torch-TensorRT, and running it to test the speedup obtained.

## Content
1. [Requirements](#1)
1. [Download Citrinet model](#2)
1. [Create Torch-TensorRT modules](#3)
1. [Benchmark Torch-TensorRT models](#4)
1. [Conclusion](#5)

<a id="1"></a>
## 1. Requirements

Follow the steps in [README](README.md) to prepare a Docker container, within which you can run this notebook. 
This notebook assumes that you are within a Jupyter environment in a Docker container with Torch-TensorRT installed, such as an NGC monthly release of `nvcr.io/nvidia/pytorch:<yy.mm>-py3` (where `yy` indicates the last two digits of a calendar year, and `mm` indicates the month in two-digit numerical form).

Now that you are inside the Docker container, the next step is to install the required dependencies.

In [None]:
# Install dependencies
# `wget` is installed with pip, i.e. the Python package rather than the command-line tool.
!pip install wget
# System audio libraries; DEBIAN_FRONTEND=noninteractive keeps apt-get from prompting inside the notebook.
!apt-get update && DEBIAN_FRONTEND=noninteractive  apt-get install -y libsndfile1 ffmpeg
# NOTE(review): Cython is installed before NeMo, presumably because one of NeMo's dependencies builds against it — confirm.
!pip install Cython

## Install NeMo
# Pinned to 1.5.1 with the `all` extra so every NeMo collection's dependencies are installed.
!pip install nemo_toolkit[all]==1.5.1

<a id="2"></a>
## 2. Download Citrinet model

Next, we download a pretrained NeMo Citrinet model and convert it to a TorchScript module:

In [None]:
import nemo
import torch

import nemo.collections.asr as nemo_asr
from nemo.core import typecheck
# Switch NeMo's typecheck off globally before the model is loaded and exported in the next cell.
# NOTE(review): presumably required so the export / plain-tensor calls are not rejected by
# NeMo's neural-type checks — confirm against the NeMo documentation.
typecheck.set_typecheck_enabled(False) 

In [None]:
# Name of the pretrained checkpoint to fetch; also used as the stem of the exported file name.
variant = 'stt_en_citrinet_256'

print(f"Downloading and saving {variant}...")
# Fetch the pretrained model by name.
# NOTE(review): presumably downloaded from NGC and cached locally — confirm.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=variant)
# Export to "<variant>.ts"; the benchmark and compile cells later load this file with torch.jit.load.
asr_model.export(f"{variant}.ts")

### Benchmark utility

Let us define a helper benchmarking function, then benchmark the original PyTorch model.

In [None]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import argparse
import timeit
import numpy as np
import torch
import torch_tensorrt as trtorch
import torch.backends.cudnn as cudnn

def benchmark(model, input_tensor, num_loops, model_name, batch_size):
    """Time repeated forward passes of ``model`` and print latency/throughput statistics.

    The model is first called 20 times without timing (warm-up), then
    ``num_loops`` timed calls are made. Each timed interval ends only after
    the device has been synchronized, so it covers the whole forward pass.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: Input passed unchanged to every call.
        num_loops: Number of timed iterations; must be at least 1.
        model_name: Label printed in the header of the report.
        batch_size: Number of samples per call, used to convert latency
            into samples per second.

    Raises:
        ValueError: If ``num_loops`` is smaller than 1 (the statistics are
            undefined for an empty set of timings).
    """
    if num_loops < 1:
        raise ValueError(f"num_loops must be >= 1, got {num_loops}")

    num_warmup = 20  # untimed iterations executed before measuring

    def syncDevice():
        # Wait for queued GPU work so the wall-clock reading is meaningful.
        # Skipped when CUDA is unavailable so the helper also runs on CPU.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def timeGraph(model, input_tensor, num_loops):
        """Return a list with the wall-clock duration (seconds) of each timed call."""
        print("Warm up ...")
        with torch.no_grad():
            for _ in range(num_warmup):
                model(input_tensor)

        syncDevice()
        print("Start timing ...")
        timings = []
        with torch.no_grad():
            for _ in range(num_loops):
                start_time = timeit.default_timer()
                model(input_tensor)
                syncDevice()
                end_time = timeit.default_timer()
                timings.append(end_time - start_time)
        return timings

    def printStats(graphName, timings, batch_size):
        """Print median/mean throughput and median/mean/p99/std-dev latency."""
        times = np.array(timings)
        steps = len(times)
        speeds = batch_size / times  # samples per second for each iteration
        time_mean = np.mean(times)
        time_med = np.median(times)
        time_99th = np.percentile(times, 99)
        time_std = np.std(times, ddof=0)
        speed_mean = np.mean(speeds)
        speed_med = np.median(speeds)
        msg = ("\n%s =================================\n"
                "batch size=%d, num iterations=%d\n"
                "  Median samples/s: %.1f, mean: %.1f\n"
                "  Median latency (s): %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
                ) % (graphName,
                    batch_size, steps,
                    speed_med, speed_mean,
                    time_med, time_mean, time_99th, time_std)
        print(msg)

    timings = timeGraph(model, input_tensor, num_loops)
    printStats(model_name, timings, batch_size)

# Configuration for the baseline (plain TorchScript) benchmark run.
precisions_str = 'fp32' # Precision (default=fp32, fp16)
variant = 'stt_en_citrinet_256' # Nemo Citrinet variant
batch_sizes = [1, 8, 32, 128] # Batch sizes (default=1,8,32,128)
trt = False # If True, infer with Torch-TensorRT engine. Else, infer with Pytorch model.
precision = torch.float32 if precisions_str == 'fp32' else torch.float16

# Global flag that lets cuDNN benchmark its convolution algorithms and pick the
# fastest one; it does not depend on the batch size, so set it once up front.
cudnn.benchmark = True

for batch_size in batch_sizes:
    if trt:
        # Same naming scheme the compile cell uses when saving its modules.
        model_name = f"{variant}_bs{batch_size}_{precision}.torch-tensorrt"
    else:
        model_name = f"{variant}.ts"

    print(f"Loading model: {model_name}")
    # Deserialize the TorchScript module and move it to the GPU.
    model = torch.jit.load(model_name).cuda()
    # Re-seed on every iteration so each batch size gets a reproducible random input.
    torch.manual_seed(12345)
    # NOTE(review): presumably (batch, 80 feature channels, 1488 time steps) — confirm
    # against the model's expected input.
    input_shape = (batch_size, 80, 1488)
    input_tensor = torch.randn(input_shape).cuda()

    # Timing graph inference
    benchmark(model, input_tensor, 50, model_name, batch_size)

Confirming the GPU we are using here:

In [None]:
# Print the NVIDIA driver / GPU summary so the benchmark numbers can be tied to the hardware.
!nvidia-smi

<a id="3"></a>
## 3. Create Torch-TensorRT modules

In this step, we optimize the Citrinet TorchScript module with Torch-TensorRT for various precisions and batch sizes.

In [None]:
import torch
import torch.nn as nn
import torch_tensorrt as trtorch
import argparse

# Uncomment the next line to raise the Torch-TensorRT log level to Info while compiling.
# trtorch.logging.set_reportable_log_level(trtorch.logging.Level.Info)

arg_precisions = "fp32,fp16"
arg_batch_sizes = "1,8,32,128"
arg_variant = "stt_en_citrinet_256"

# Precision names accepted in `arg_precisions`. The dict order also fixes the
# compile order (fp32 first), independent of the order in the string.
supported_precisions = {"fp32": torch.float32, "fp16": torch.half}

# Tolerate surrounding whitespace and upper case, and fail loudly on anything
# unknown instead of silently skipping it.
precisions_str = [name.strip().lower() for name in arg_precisions.split(',') if name.strip()]
unknown_precisions = sorted(set(precisions_str) - set(supported_precisions))
if unknown_precisions:
    raise ValueError(
        f"Unsupported precision(s) {unknown_precisions}; "
        f"expected a comma-separated subset of {sorted(supported_precisions)}"
    )
precisions = [dtype for name, dtype in supported_precisions.items() if name in precisions_str]
if not precisions:
    raise ValueError("arg_precisions must name at least one of: fp32, fp16")

batch_sizes = [int(x) for x in arg_batch_sizes.split(',')]

# TorchScript module exported earlier in the notebook.
model = torch.jit.load(f"{arg_variant}.ts")

for precision in precisions:
    for batch_size in batch_sizes:
        compile_settings = {
            # One static input shape per compiled module.
            # NOTE(review): presumably (batch, 80 feature channels, 1488 time steps) — confirm.
            "inputs": [trtorch.Input(shape=[batch_size, 80, 1488])],
            "enabled_precisions": {precision},
            "workspace_size": 2_000_000_000,  # same value as before, written with digit separators
            "truncate_long_and_double": True,
        }
        print(f"Generating Torchscript-TensorRT module for batchsize {batch_size} precision {precision}")
        trt_ts_module = trtorch.compile(model, **compile_settings)
        # The file name embeds str(precision) (e.g. "torch.float32"); the benchmark
        # cells rebuild the name with the same f-string, so keep the format in sync.
        torch.jit.save(trt_ts_module, f"{arg_variant}_bs{batch_size}_{precision}.torch-tensorrt")

<a id="4"></a>
## 4. Benchmark Torch-TensorRT models

Finally, we are ready to benchmark the Torch-TensorRT optimized Citrinet models.

### FP32 (single precision)

In [None]:
# Configuration for benchmarking the FP32 Torch-TensorRT modules.
precisions_str = 'fp32' # Precision (default=fp32, fp16)
batch_sizes = [1, 8, 32, 128] # Batch sizes (default=1,8,32,128)
precision = torch.float32 if precisions_str == 'fp32' else torch.float16
trt = True

# Global cuDNN auto-tuning flag; independent of the batch size, so set it once.
cudnn.benchmark = True

for batch_size in batch_sizes:
    if trt:
        # Must match the file name written by the compile cell.
        model_name = f"{variant}_bs{batch_size}_{precision}.torch-tensorrt"
    else:
        model_name = f"{variant}.ts"

    print(f"Loading model: {model_name}")
    # Deserialize the saved module and move it to the GPU.
    # NOTE(review): loading a Torch-TensorRT module presumably requires torch_tensorrt
    # to have been imported already (it is, in the earlier cells) — confirm.
    model = torch.jit.load(model_name).cuda()
    # Re-seed on every iteration so each batch size gets a reproducible random input.
    torch.manual_seed(12345)
    input_shape = (batch_size, 80, 1488)
    input_tensor = torch.randn(input_shape).cuda()

    # Timing graph inference
    benchmark(model, input_tensor, 50, model_name, batch_size)

### FP16 (half precision)

In [None]:
# Configuration for benchmarking the FP16 Torch-TensorRT modules.
precisions_str = 'fp16' # Precision (default=fp32, fp16)
batch_sizes = [1, 8, 32, 128] # Batch sizes (default=1,8,32,128)
precision = torch.float32 if precisions_str == 'fp32' else torch.float16
# Set explicitly instead of relying on whatever value an earlier cell left behind;
# otherwise this cell could silently benchmark the plain PyTorch model.
trt = True

# Global cuDNN auto-tuning flag; independent of the batch size, so set it once.
cudnn.benchmark = True

for batch_size in batch_sizes:
    if trt:
        # Must match the file name written by the compile cell.
        model_name = f"{variant}_bs{batch_size}_{precision}.torch-tensorrt"
    else:
        model_name = f"{variant}.ts"

    print(f"Loading model: {model_name}")
    # Deserialize the saved module and move it to the GPU.
    model = torch.jit.load(model_name).cuda()
    # Re-seed on every iteration so each batch size gets a reproducible random input.
    torch.manual_seed(12345)
    # The input stays float32, exactly as in the FP32 run; only the module differs.
    input_shape = (batch_size, 80, 1488)
    input_tensor = torch.randn(input_shape).cuda()

    # Timing graph inference
    benchmark(model, input_tensor, 50, model_name, batch_size)

<a id="5"></a>
## 5. Conclusion

In this notebook, we have walked through the complete process of optimizing the Citrinet model with Torch-TensorRT. On an A100 GPU, with Torch-TensorRT, we observe a speedup of ~**2.4X** with FP32, and ~**2.9X** with FP16, at a batch size of 128.

### What's next
Now it's time to try Torch-TensorRT on your own model. If you run into problems, file an issue at https://github.com/NVIDIA/Torch-TensorRT. Your involvement will help future development of Torch-TensorRT.
