In [3]:
# 1. INSTALL DEPENDENCIES
# -----------------------------------------------------------------------------
print("üì¶ Installing dependencies...")
!pip install -q huggingface_hub onnx onnxruntime-gpu librosa datasets jiwer tensorrt
!pip install -q numpy==1.23.5  # Pin NumPy to prevent version conflicts

üì¶ Installing dependencies...
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m√ó[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m‚îÇ[0m exit code: [1;36m1[0m
  [31m‚ï∞‚îÄ>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m√ó[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m‚îÇ[0m exit code: [1;36m1[0m
[31m‚ï∞‚îÄ>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.


In [4]:
!pip install pycuda



In [5]:
!pip install onnx onnxsim



In [6]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [7]:
# 1. Uninstall any broken versions
!pip uninstall -y tensorrt tensorrt-cu12-bindings tensorrt-cu12-libs
!pip install -q numpy==1.23.5

# 2. Install the specific compatible version
# We use the NVIDIA Index URL to get the pre-compiled Linux wheels
!pip install tensorrt==10.0.1 --extra-index-url https://pypi.nvidia.com

# 3. Force the system to register the libraries
!ldconfig

print("\n‚úÖ Install Complete.")
print("üõë STOP! You MUST now click 'Runtime' > 'Restart Session' (or 'Restart Runtime') in the top menu.")
print("üëâ After restarting, skip this cell and run Step 2 below.")

Found existing installation: tensorrt 10.0.1
Uninstalling tensorrt-10.0.1:
  Successfully uninstalled tensorrt-10.0.1
Found existing installation: tensorrt_cu12_bindings 10.14.1.48.post1
Uninstalling tensorrt_cu12_bindings-10.14.1.48.post1:
  Successfully uninstalled tensorrt_cu12_bindings-10.14.1.48.post1
Found existing installation: tensorrt_cu12_libs 10.14.1.48.post1
Uninstalling tensorrt_cu12_libs-10.14.1.48.post1:
  Successfully uninstalled tensorrt_cu12_libs-10.14.1.48.post1
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m√ó[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m‚îÇ[0m exit code: [1;36m1[0m
  [31m‚ï∞‚îÄ>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m√ó[0m [32mGe

In [None]:
# =============================================================================
# üöÄ MASTER SCRIPT: ONNX BASELINE vs. TENSORRT OPTIMIZATION
# =============================================================================



import os
import glob
import ctypes
import time
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import librosa
import io
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# Fix Library Paths for Colab
current_ld = os.environ.get('LD_LIBRARY_PATH', '')
if '/usr/lib64-nvidia' not in current_ld:
    os.environ['LD_LIBRARY_PATH'] = current_ld + ':/usr/lib64-nvidia'

# Force-load TensorRT drivers
try:
    libs = glob.glob("/usr/local/lib/python*/dist-packages/tensorrt_libs")[0]
    ctypes.CDLL(os.path.join(libs, "libnvinfer.so.10"))
    ctypes.CDLL(os.path.join(libs, "libnvinfer_plugin.so.10"))
except:
    pass

# 2. PREPARE RESOURCES
# -----------------------------------------------------------------------------
print("\n‚¨áÔ∏è Downloading Resources...")

# Download Clean ONNX (FP32)
if not os.path.exists("model.onnx"):
    try:
        path = hf_hub_download(repo_id="onnx-community/indicwav2vec-hindi-ONNX", filename="onnx/model.onnx")
        os.symlink(path, "model.onnx")
        print("‚úÖ Model downloaded (1.2 GB)")
    except:
        print("‚ùå Download failed. Check internet connection.")
        exit()

# Load Processor & Dataset
processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
# Get 20 samples for testing
test_samples = list(ds.take(20))
print("‚úÖ Dataset loaded (20 samples)")


# 3. BASELINE: RUN RAW ONNX (ONNX RUNTIME)
# -----------------------------------------------------------------------------
print("\nüê¢ Running Baseline (ONNX Runtime)...")
ort_sess = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])

base_times = []
base_preds = []
base_truths = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)

    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode(pred_ids))
    base_truths.append(item["transcription"])

base_wer = wer(base_truths, base_preds) * 100
base_lat = np.mean(base_times)
print(f"üëâ Baseline Result: {base_lat:.2f} ms | {base_wer:.2f}% WER")


# 4. OPTIMIZATION: BUILD TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüõ°Ô∏è Building Optimized Engine (TensorRT)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open("model.onnx", 'rb') as f:
    parser.parse(f.read())

# Dynamic Shapes
profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)

# ENABLE FP16 (Speed)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)

# APPLY "THE ANTIDOTE" (Fix 162% WER)
count = 0
for i in range(network.num_layers):
    layer = network.get_layer(i)
    if "Pow" in layer.name or layer.type == trt.LayerType.REDUCE:
        layer.precision = trt.DataType.FLOAT
        if layer.num_outputs > 0:
            layer.set_output_type(0, trt.DataType.FLOAT)
        count += 1
config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
print(f"   (Locked {count} sensitive layers to FP32)")

# Build
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024**3)
plan = builder.build_serialized_network(network, config)
with open("model.engine", "wb") as f:
    f.write(plan)
print("‚úÖ Engine Built Successfully!")


# 5. BENCHMARK: RUN TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüöÄ Running Optimized Benchmark (TensorRT)...")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

trt_times = []
trt_preds = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # Allocations
    context.set_input_shape("input_values", input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    h_output = np.empty((1, input_values.shape[1] // 320, 108), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    cuda.memcpy_htod(d_input, input_values)
    start = time.time()
    context.execute_v2([int(d_input), int(d_output)])
    trt_times.append((time.time() - start) * 1000)
    cuda.memcpy_dtoh(h_output, d_output)

    # CTC Decode (Logic Fix)
    pred_ids = np.argmax(h_output, axis=-1)[0]
    # Simple collapse: remove repeats and blanks
    grouped = [x for i, x in enumerate(pred_ids) if i == 0 or x != pred_ids[i-1]]
    clean = [x for x in grouped if x != processor.tokenizer.pad_token_id]
    trt_preds.append(processor.decode(clean))

trt_wer = wer(base_truths, trt_preds) * 100
trt_lat = np.mean(trt_times)


# 6. FINAL REPORT
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE (ONNX)':<15} | {'OPTIMIZED (TRT)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)


‚¨áÔ∏è Downloading Resources...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úÖ Dataset loaded (20 samples)

üê¢ Running Baseline (ONNX Runtime)...
üëâ Baseline Result: 68.48 ms | 36.59% WER

üõ°Ô∏è Building Optimized Engine (TensorRT)...
   (Locked 0 sensitive layers to FP32)
‚úÖ Engine Built Successfully!

üöÄ Running Optimized Benchmark (TensorRT)...

METRIC               | BASELINE (ONNX) | OPTIMIZED (TRT)
------------------------------------------------------------
Latency (ms)         | 68.48           | 15.87          
WER (%)              | 36.59           | 110.57         
Speedup              | 1.0x            | 4.31           x


In [None]:
# =============================================================================
# üöÄ MASTER SCRIPT: ONNX BASELINE vs. TENSORRT OPTIMIZATION
# =============================================================================



import os
import glob
import ctypes
import time
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import librosa
import io
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# Fix Library Paths for Colab
current_ld = os.environ.get('LD_LIBRARY_PATH', '')
if '/usr/lib64-nvidia' not in current_ld:
    os.environ['LD_LIBRARY_PATH'] = current_ld + ':/usr/lib64-nvidia'

# Force-load TensorRT drivers
try:
    libs = glob.glob("/usr/local/lib/python*/dist-packages/tensorrt_libs")[0]
    ctypes.CDLL(os.path.join(libs, "libnvinfer.so.10"))
    ctypes.CDLL(os.path.join(libs, "libnvinfer_plugin.so.10"))
except:
    pass

# 2. PREPARE RESOURCES
# -----------------------------------------------------------------------------
print("\n‚¨áÔ∏è Downloading Resources...")

# Download Clean ONNX (FP32)
if not os.path.exists("model.onnx"):
    try:
        path = hf_hub_download(repo_id="onnx-community/indicwav2vec-hindi-ONNX", filename="onnx/model.onnx")
        os.symlink(path, "model.onnx")
        print("‚úÖ Model downloaded (1.2 GB)")
    except:
        print("‚ùå Download failed. Check internet connection.")
        exit()

# Load Processor & Dataset
processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
# Get 20 samples for testing
test_samples = list(ds.take(20))
print("‚úÖ Dataset loaded (20 samples)")


# 3. BASELINE: RUN RAW ONNX (ONNX RUNTIME)
# -----------------------------------------------------------------------------
print("\nüê¢ Running Baseline (ONNX Runtime)...")
ort_sess = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])

base_times = []
base_preds = []
base_truths = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)

    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode(pred_ids))
    base_truths.append(item["transcription"])

base_wer = wer(base_truths, base_preds) * 100
base_lat = np.mean(base_times)
print(f"üëâ Baseline Result: {base_lat:.2f} ms | {base_wer:.2f}% WER")


# 4. OPTIMIZATION: BUILD TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüõ°Ô∏è Building Optimized Engine (TensorRT)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open("model.onnx", 'rb') as f:
    parser.parse(f.read())

# Dynamic Shapes
profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)

# ENABLE FP16 (Speed)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)

# --- APPLY "THE ANTIDOTE" (Fix 162% WER) ---
print("üõ°Ô∏è Applying Antidote...")
count = 0
for i in range(network.num_layers):
    layer = network.get_layer(i)

    # NEW LOGIC: Catch Opset 17 "LayerNormalization" nodes explicitly
    if layer.type == trt.LayerType.NORMALIZATION or "LayerNorm" in layer.name:
        layer.precision = trt.DataType.FLOAT
        if layer.num_outputs > 0:
            layer.set_output_type(0, trt.DataType.FLOAT)
        count += 1
config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
print(f"   (Locked {count} sensitive layers to FP32)")

# 5. BENCHMARK: RUN TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüöÄ Running Optimized Benchmark (TensorRT)...")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

trt_times = []
trt_preds = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # Allocations
    context.set_input_shape("input_values", input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    h_output = np.empty((1, input_values.shape[1] // 320, 108), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    cuda.memcpy_htod(d_input, input_values)
    start = time.time()
    context.execute_v2([int(d_input), int(d_output)])
    trt_times.append((time.time() - start) * 1000)
    cuda.memcpy_dtoh(h_output, d_output)

    # CTC Decode (Logic Fix)
    pred_ids = np.argmax(h_output, axis=-1)[0]
    # Simple collapse: remove repeats and blanks
    grouped = [x for i, x in enumerate(pred_ids) if i == 0 or x != pred_ids[i-1]]
    clean = [x for x in grouped if x != processor.tokenizer.pad_token_id]
    trt_preds.append(processor.decode(clean))

trt_wer = wer(base_truths, trt_preds) * 100
trt_lat = np.mean(trt_times)


# 6. FINAL REPORT
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE (ONNX)':<15} | {'OPTIMIZED (TRT)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)


‚¨áÔ∏è Downloading Resources...
‚úÖ Dataset loaded (20 samples)

üê¢ Running Baseline (ONNX Runtime)...
üëâ Baseline Result: 55.11 ms | 36.59% WER

üõ°Ô∏è Building Optimized Engine (TensorRT)...
üõ°Ô∏è Applying Antidote...
   (Locked 57 sensitive layers to FP32)

üöÄ Running Optimized Benchmark (TensorRT)...


  layer.precision = trt.DataType.FLOAT
  layer.set_output_type(0, trt.DataType.FLOAT)



METRIC               | BASELINE (ONNX) | OPTIMIZED (TRT)
------------------------------------------------------------
Latency (ms)         | 55.11           | 16.45          
WER (%)              | 36.59           | 109.76         
Speedup              | 1.0x            | 3.35           x


In [None]:
# =============================================================================
# üöÄ MASTER SCRIPT: ONNX BASELINE vs. TENSORRT OPTIMIZATION
# =============================================================================



import os
import glob
import ctypes
import time
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import librosa
import io
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# Fix Library Paths for Colab
current_ld = os.environ.get('LD_LIBRARY_PATH', '')
if '/usr/lib64-nvidia' not in current_ld:
    os.environ['LD_LIBRARY_PATH'] = current_ld + ':/usr/lib64-nvidia'

# Force-load TensorRT drivers
try:
    libs = glob.glob("/usr/local/lib/python*/dist-packages/tensorrt_libs")[0]
    ctypes.CDLL(os.path.join(libs, "libnvinfer.so.10"))
    ctypes.CDLL(os.path.join(libs, "libnvinfer_plugin.so.10"))
except:
    pass

# 2. PREPARE RESOURCES
# -----------------------------------------------------------------------------
print("\n‚¨áÔ∏è Downloading Resources...")

# Download Clean ONNX (FP32)
if not os.path.exists("model.onnx"):
    try:
        path = hf_hub_download(repo_id="onnx-community/indicwav2vec-hindi-ONNX", filename="onnx/model.onnx")
        os.symlink(path, "model.onnx")
        print("‚úÖ Model downloaded (1.2 GB)")
    except:
        print("‚ùå Download failed. Check internet connection.")
        exit()

# Load Processor & Dataset
processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
# Get 20 samples for testing
test_samples = list(ds.take(20))
print("‚úÖ Dataset loaded (20 samples)")


# 3. BASELINE: RUN RAW ONNX (ONNX RUNTIME)
# -----------------------------------------------------------------------------
print("\nüê¢ Running Baseline (ONNX Runtime)...")
ort_sess = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])

base_times = []
base_preds = []
base_truths = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)

    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode(pred_ids))
    base_truths.append(item["transcription"])

base_wer = wer(base_truths, base_preds) * 100
base_lat = np.mean(base_times)
print(f"üëâ Baseline Result: {base_lat:.2f} ms | {base_wer:.2f}% WER")


# 4. OPTIMIZATION: BUILD TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüõ°Ô∏è Building Optimized Engine (TensorRT)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open("model.onnx", 'rb') as f:
    parser.parse(f.read())

# Dynamic Shapes
profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)

# ENABLE FP16 (Speed)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)

# --- APPLY "BROAD SPECTRUM" ANTIDOTE ---
print("üõ°Ô∏è Applying Enhanced Antidote (Conv + Softmax + Norm)...")
count = 0
for i in range(network.num_layers):
    layer = network.get_layer(i)

    # Identify Risky Layer Types
    is_norm = (layer.type == trt.LayerType.NORMALIZATION) or ("LayerNorm" in layer.name)
    is_conv = (layer.type == trt.LayerType.CONVOLUTION)
    is_softmax = (layer.type == trt.LayerType.SOFTMAX)

    # If it's risky, force it to FP32
    if is_norm or is_conv or is_softmax:
        layer.precision = trt.DataType.FLOAT
        if layer.num_outputs > 0:
            layer.set_output_type(0, trt.DataType.FLOAT)
        count += 1

config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
print(f"   (Locked {count} layers to FP32)")

# 5. BENCHMARK: RUN TENSORRT ENGINE
# -----------------------------------------------------------------------------
print("\nüöÄ Running Optimized Benchmark (TensorRT)...")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

trt_times = []
trt_preds = []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # Allocations
    context.set_input_shape("input_values", input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    h_output = np.empty((1, input_values.shape[1] // 320, 108), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    cuda.memcpy_htod(d_input, input_values)
    start = time.time()
    context.execute_v2([int(d_input), int(d_output)])
    trt_times.append((time.time() - start) * 1000)
    cuda.memcpy_dtoh(h_output, d_output)

    # CTC Decode (Logic Fix)
    pred_ids = np.argmax(h_output, axis=-1)[0]
    # Simple collapse: remove repeats and blanks
    grouped = [x for i, x in enumerate(pred_ids) if i == 0 or x != pred_ids[i-1]]
    clean = [x for x in grouped if x != processor.tokenizer.pad_token_id]
    trt_preds.append(processor.decode(clean))

trt_wer = wer(base_truths, trt_preds) * 100
trt_lat = np.mean(trt_times)


# 6. FINAL REPORT
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE (ONNX)':<15} | {'OPTIMIZED (TRT)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)


‚¨áÔ∏è Downloading Resources...


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 7db5a95d-171d-4c94-ac2b-e3e771f312b3)')' thrown while requesting GET https://huggingface.co/datasets/MatrixSpeechAI/Common_voice_hindi_denoised/resolve/f4455c36f4fd22827fd7e9a087da6c5e4d020478/data/train-00000-of-00009.parquet
Retrying in 1s [Retry 1/5].


‚úÖ Dataset loaded (20 samples)

üê¢ Running Baseline (ONNX Runtime)...
üëâ Baseline Result: 89.91 ms | 36.59% WER

üõ°Ô∏è Building Optimized Engine (TensorRT)...
üõ°Ô∏è Applying Enhanced Antidote (Conv + Softmax + Norm)...
   (Locked 89 layers to FP32)

üöÄ Running Optimized Benchmark (TensorRT)...


  layer.precision = trt.DataType.FLOAT
  layer.set_output_type(0, trt.DataType.FLOAT)



METRIC               | BASELINE (ONNX) | OPTIMIZED (TRT)
------------------------------------------------------------
Latency (ms)         | 89.91           | 24.99          
WER (%)              | 36.59           | 112.20         
Speedup              | 1.0x            | 3.60           x


In [6]:
# =============================================================================
# üöÄ FINAL SOLUTION: FP32 "SAFE MODE" ENGINE
# =============================================================================

import os
import glob
import ctypes
import time
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import librosa
import io
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# 1. SETUP & LINKING
# -----------------------------------------------------------------------------
print("üîß Linking Drivers...")
try:
    libs = glob.glob("/usr/local/lib/python*/dist-packages/tensorrt_libs")[0]
    os.environ["LD_LIBRARY_PATH"] += f":{libs}:/usr/lib64-nvidia"
    ctypes.CDLL(os.path.join(libs, "libnvinfer.so.10"))
    ctypes.CDLL(os.path.join(libs, "libnvinfer_plugin.so.10"))
except:
    pass

# 2. DATASET & RESOURCES
# -----------------------------------------------------------------------------
if not os.path.exists("model.onnx"):
    print("‚ùå model.onnx missing! Please check previous steps.")
    exit()

processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
test_samples = list(ds.take(20))
print("‚úÖ Resources Loaded.")

# 3. BASELINE (ONNX)
# -----------------------------------------------------------------------------
print("\nüê¢ Running Baseline (ONNX Runtime)...")
ort_sess = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])
base_times, base_preds, base_truths = [], [], []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)

    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode(pred_ids))
    base_truths.append(item["transcription"])

base_wer = wer(base_truths, base_preds) * 100
base_lat = np.mean(base_times)
print(f"üëâ Baseline: {base_lat:.2f} ms | {base_wer:.2f}% WER")

# 4. BUILD ENGINE (FP32 ONLY - NO FP16 FLAG)
# -----------------------------------------------------------------------------
print("\nüõ°Ô∏è Building Safe Engine (FP32 Mode)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open("model.onnx", 'rb') as f:
    parser.parse(f.read())

profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)

# CRITICAL CHANGE: WE DO NOT SET THE FP16 FLAG HERE.
# This forces TensorRT to use FP32 precision, which is mathematically safe.
# We also don't need the "Antidote" loop because everything is already FP32.

config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024**3)
print("Building Engine... (This optimizes the graph structure without breaking math)")
plan = builder.build_serialized_network(network, config)

if not plan:
    print("‚ùå Build Failed!")
    exit()
print("‚úÖ Engine Built!")

# 5. BENCHMARK ENGINE
# -----------------------------------------------------------------------------
print("\nüöÄ Running Safe Benchmark...")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

trt_times, trt_preds = [], []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # Allocations
    context.set_input_shape("input_values", input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    h_output = np.empty((1, input_values.shape[1] // 320, 108), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Inference
    cuda.memcpy_htod(d_input, input_values)
    start = time.time()
    context.execute_v2([int(d_input), int(d_output)])
    trt_times.append((time.time() - start) * 1000)
    cuda.memcpy_dtoh(h_output, d_output)

    # Decode
    pred_ids = np.argmax(h_output, axis=-1)[0]
    grouped = [x for i, x in enumerate(pred_ids) if i == 0 or x != pred_ids[i-1]]
    clean = [x for x in grouped if x != processor.tokenizer.pad_token_id]
    trt_preds.append(processor.decode(clean))

trt_wer = wer(base_truths, trt_preds) * 100
trt_lat = np.mean(trt_times)

# 6. REPORT
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE':<15} | {'OPTIMIZED (FP32)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)

üîß Linking Drivers...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/436 [00:00<?, ?B/s]

‚úÖ Resources Loaded.

üê¢ Running Baseline (ONNX Runtime)...
üëâ Baseline: 85.67 ms | 36.59% WER

üõ°Ô∏è Building Safe Engine (FP32 Mode)...
Building Engine... (This optimizes the graph structure without breaking math)
‚úÖ Engine Built!

üöÄ Running Safe Benchmark...

METRIC               | BASELINE        | OPTIMIZED (FP32)
------------------------------------------------------------
Latency (ms)         | 85.67           | 54.51          
WER (%)              | 36.59           | 109.76         
Speedup              | 1.0x            | 1.57           x


In [7]:
# =============================================================================
# üöÄ FINAL SOLUTION: FP32 "SAFE MODE" ENGINE
# =============================================================================

import os
import glob
import ctypes
import time
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import librosa
import io
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# 1. SETUP & LINKING
# -----------------------------------------------------------------------------
print("üîß Linking Drivers...")
try:
    libs = glob.glob("/usr/local/lib/python*/dist-packages/tensorrt_libs")[0]
    os.environ["LD_LIBRARY_PATH"] += f":{libs}:/usr/lib64-nvidia"
    ctypes.CDLL(os.path.join(libs, "libnvinfer.so.10"))
    ctypes.CDLL(os.path.join(libs, "libnvinfer_plugin.so.10"))
except:
    pass

# 2. DATASET & RESOURCES
# -----------------------------------------------------------------------------
if not os.path.exists("model.onnx"):
    print("‚ùå model.onnx missing! Please check previous steps.")
    exit()

processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
test_samples = list(ds.take(20))
print("‚úÖ Resources Loaded.")

# 3. BASELINE (ONNX)
# -----------------------------------------------------------------------------
print("\nüê¢ Running Baseline (ONNX Runtime)...")
ort_sess = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])
base_times, base_preds, base_truths = [], [], []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)

    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode(pred_ids))
    base_truths.append(item["transcription"])

base_wer = wer(base_truths, base_preds) * 100
base_lat = np.mean(base_times)
print(f"üëâ Baseline: {base_lat:.2f} ms | {base_wer:.2f}% WER")

# 4. BUILD ENGINE (FP32 ONLY - NO FP16 FLAG)
# -----------------------------------------------------------------------------
print("\nüõ°Ô∏è Building Safe Engine (FP32 Mode)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(flag)
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open("model.onnx", 'rb') as f:
    parser.parse(f.read())

profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)

# CRITICAL CHANGE: WE DO NOT SET THE FP16 FLAG HERE.
# This forces TensorRT to use FP32 precision, which is mathematically safe.
# We also don't need the "Antidote" loop because everything is already FP32.

config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024**3)
print("Building Engine... (This optimizes the graph structure without breaking math)")
plan = builder.build_serialized_network(network, config)

if not plan:
    print("‚ùå Build Failed!")
    exit()
print("‚úÖ Engine Built!")

# 5. BENCHMARK ENGINE
# -----------------------------------------------------------------------------
print("\nüöÄ Running Safe Benchmark...")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

trt_times, trt_preds = [], []

for item in test_samples:
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # Allocations
    context.set_input_shape("input_values", input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    h_output = np.empty((1, input_values.shape[1] // 320, 108), dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Inference
    cuda.memcpy_htod(d_input, input_values)
    start = time.time()
    context.execute_v2([int(d_input), int(d_output)])
    trt_times.append((time.time() - start) * 1000)
    cuda.memcpy_dtoh(h_output, d_output)

    # Decode
    pred_ids = np.argmax(h_output, axis=-1)[0]
    grouped = [x for i, x in enumerate(pred_ids) if i == 0 or x != pred_ids[i-1]]
    clean = [x for x in grouped if x != processor.tokenizer.pad_token_id]
    trt_preds.append(processor.decode(clean))

trt_wer = wer(base_truths, trt_preds) * 100
trt_lat = np.mean(trt_times)

# 6. REPORT
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE':<15} | {'OPTIMIZED (FP32)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)

üîß Linking Drivers...
‚úÖ Resources Loaded.

üê¢ Running Baseline (ONNX Runtime)...
üëâ Baseline: 52.61 ms | 36.59% WER

üõ°Ô∏è Building Safe Engine (FP32 Mode)...
Building Engine... (This optimizes the graph structure without breaking math)
‚úÖ Engine Built!

üöÄ Running Safe Benchmark...

METRIC               | BASELINE        | OPTIMIZED (FP32)
------------------------------------------------------------
Latency (ms)         | 52.61           | 57.47          
WER (%)              | 36.59           | 108.94         
Speedup              | 1.0x            | 0.92           x


In [8]:
# =============================================================================
# üöÄ MASTER SCRIPT: FRESH EXPORT + BUILD + BENCHMARK
# =============================================================================
import os
import torch
import librosa
import numpy as np
import tensorrt as trt
import onnxruntime as ort
import pycuda.driver as cuda
import pycuda.autoinit
import io
import time
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
from jiwer import wer

# 1. SETUP ENVIRONMENT
print("üîß Installing/Verifying Dependencies...")
os.environ["LD_LIBRARY_PATH"] += ":/usr/lib64-nvidia"

# 2. DOWNLOAD PYTORCH MODEL & EXPORT FRESH ONNX
print("\n‚¨áÔ∏è Downloading Original PyTorch Model...")
model_id = "ai4bharat/indicwav2vec-hindi"
try:
    model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cpu")
    processor = Wav2Vec2Processor.from_pretrained(model_id)
    model.eval()
except Exception as e:
    print(f"‚ùå Failed to load PyTorch model: {e}")
    exit()

print("üîÑ Generating Fresh ONNX (Opset 17)...")
dummy_input = torch.randn(1, 16000) # 1 second audio
onnx_path = "model_fresh.onnx"

with torch.no_grad():
    torch.onnx.export(
        model,
        dummy_input,
        onnx_path,
        export_params=True,
        opset_version=17,  # CRITICAL: Opset 17 fixes the LayerNorm bug
        do_constant_folding=True,
        input_names=['input_values'],
        output_names=['logits'],
        dynamic_axes={
            'input_values': {0: 'batch_size', 1: 'sequence_length'},
            'logits': {0: 'batch_size', 1: 'sequence_length'}
        }
    )
print(f"‚úÖ Fresh ONNX Created: {onnx_path}")

# 3. BUILD TENSORRT ENGINE (SAFE MODE)
print("\nüõ°Ô∏è Building TensorRT Engine (FP32)...")
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
config = builder.create_builder_config()
parser = trt.OnnxParser(network, logger)

with open(onnx_path, 'rb') as f:
    if not parser.parse(f.read()):
        print("‚ùå ONNX Parse Failed")
        for e in range(parser.num_errors): print(parser.get_error(e))
        exit()

# Setup Profile (Crucial for correct execution)
profile = builder.create_optimization_profile()
profile.set_shape("input_values", (1, 16000), (1, 80000), (4, 160000))
config.add_optimization_profile(profile)
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024**3)

# Build
plan = builder.build_serialized_network(network, config)
if not plan:
    print("‚ùå Engine Build Failed!")
    exit()
print("‚úÖ Engine Built Successfully!")

# 4. BENCHMARK (With Proper Binding Checks)
print("\nüöÄ Running Benchmark...")

# Load Resources
ds = load_dataset("MatrixSpeechAI/Common_voice_hindi_denoised", split="train", streaming=True).decode(False)
test_samples = list(ds.take(20))

# Setup Runtime
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(plan)
context = engine.create_execution_context()

# Resolve Bindings
tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
input_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT][0]
output_name = [n for n in tensor_names if engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT][0]

# Setup Baseline
ort_sess = ort.InferenceSession(onnx_path, providers=['CUDAExecutionProvider'])

base_times, trt_times = [], []
base_preds, trt_preds = [], []
truths = []

for i, item in enumerate(test_samples):
    # Prepare Input
    audio, _ = librosa.load(io.BytesIO(item["audio"]["bytes"]), sr=16000)
    raw_input = processor(audio, sampling_rate=16000, return_tensors="np").input_values

    # ‚ö†Ô∏è CRITICAL: Strict shape checks to prevent crashing
    seq_len = raw_input.shape[1]
    if seq_len < 16000 or seq_len > 160000:
        continue # Skip samples that violate the TensorRT Profile

    input_values = np.ascontiguousarray(raw_input).astype(np.float32)
    truths.append(item["transcription"])

    # Run Baseline
    start = time.time()
    logits = ort_sess.run(None, {'input_values': input_values})[0]
    base_times.append((time.time() - start) * 1000)
    pred_ids = np.argmax(logits, axis=-1)[0]
    base_preds.append(processor.decode([x for x in pred_ids if x != processor.tokenizer.pad_token_id]))

    # Run TensorRT
    context.set_input_shape(input_name, input_values.shape)
    d_input = cuda.mem_alloc(input_values.nbytes)
    out_shape = (1, input_values.shape[1] // 320, 108)
    h_output = np.zeros(out_shape, dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)

    cuda.memcpy_htod(d_input, input_values)
    context.set_tensor_address(input_name, int(d_input))
    context.set_tensor_address(output_name, int(d_output))

    start = time.time()
    context.execute_async_v3(stream_handle=0)
    cuda.Context.synchronize()
    trt_times.append((time.time() - start) * 1000)

    cuda.memcpy_dtoh(h_output, d_output)

    # Decode
    pred_ids = np.argmax(h_output, axis=-1)[0]
    grouped = [x for k, x in enumerate(pred_ids) if k == 0 or x != pred_ids[k-1]]
    trt_preds.append(processor.decode([x for x in grouped if x != processor.tokenizer.pad_token_id]))

    if i % 5 == 0: print(f"   Processed {i+1}...")

# 5. REPORT
base_wer = wer(truths, base_preds) * 100
trt_wer = wer(truths, trt_preds) * 100
base_lat = np.mean(base_times)
trt_lat = np.mean(trt_times)

print("\n" + "="*60)
print(f"{'METRIC':<20} | {'BASELINE (ONNX)':<15} | {'OPTIMIZED (TRT)':<15}")
print("-" * 60)
print(f"{'Latency (ms)':<20} | {base_lat:<15.2f} | {trt_lat:<15.2f}")
print(f"{'WER (%)':<20} | {base_wer:<15.2f} | {trt_wer:<15.2f}")
print(f"{'Speedup':<20} | {'1.0x':<15} | {base_lat/trt_lat:<15.2f}x")
print("="*60)

üîß Installing/Verifying Dependencies...

‚¨áÔ∏è Downloading Original PyTorch Model...
üîÑ Generating Fresh ONNX (Opset 17)...


ModuleNotFoundError: No module named 'onnxscript'