In [1]:
import torch

In [2]:
x = torch.tensor([1.], requires_grad=True)
with torch.no_grad():
    y = x * 2
y.requires_grad

False

In [3]:
@torch.no_grad()
def doubler(x):
    return x * 2


z = doubler(x)
z.requires_grad

False

In [4]:
@torch.no_grad()
def doubler(x):
    return x * 2


z = doubler(x)
z.requires_grad

False

In [5]:
@torch.no_grad()
def tripler(x):
    return x * 3


z = tripler(x)
z.requires_grad

False

In [6]:
import torch

x = torch.ones(1, 2, 3, requires_grad=True)
with torch.inference_mode():
    y = x * x
y.requires_grad

False

In [7]:
@torch.inference_mode()
def func(x):
    return x * x


out = func(x)
out.requires_grad

False

In [8]:
import torch
from torchvision.models import resnet50

model = resnet50().to("cuda")

dummy_input = torch.randn(1, 3, 224, 224).to("cuda")

model.train()
output_train = model(dummy_input)

In [9]:
from torch.profiler import profile, record_function, ProfilerActivity

torch.cuda.synchronize()
torch.cuda.empty_cache()

with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,
        record_shapes=True
) as prof_no_grad:
    with record_function("no_grad_inference"):
        with torch.no_grad():
            _ = model(dummy_input)

torch.cuda.synchronize()

print(
    prof_no_grad.key_averages()
    .table(sort_by="self_cuda_memory_usage", row_limit=10)
)


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         1.88%     544.198us         1.88%     544.198us       2.054us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      42.88 MB      42.88 M

  _warn_once(


In [10]:
torch.cuda.synchronize()
torch.cuda.empty_cache()

with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,
        record_shapes=True
) as prof_infer:
    with record_function("inference_mode"):
        with torch.inference_mode():
            _ = model(dummy_input)

torch.cuda.synchronize()

print(
    prof_infer.key_averages()
    .table(sort_by="self_cuda_memory_usage", row_limit=10)
)


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         6.03%       1.452ms         6.03%       1.452ms       5.481us       0.000us         0.00%       0.000us       0.000us           0 B           0 B      42.88 MB      42.88 M

In [11]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import pandas as pd
from torchvision.models import resnet50, vit_b_16, efficientnet_b0, mobilenet_v3_small
import time
from typing import Dict, List, Tuple


class InferenceModeComparator:
    def __init__(self, num_runs: int = 5):
        self.num_runs = num_runs
        self.results = []

    def get_models_and_inputs(self) -> List[Tuple[str, torch.nn.Module, torch.Tensor]]:
        models = []

        resnet = resnet50().cuda()
        resnet_input = torch.randn(32, 3, 224, 224).cuda()
        models.append(("ResNet50", resnet, resnet_input))

        vit = vit_b_16().cuda()
        vit_input = torch.randn(32, 3, 224, 224).cuda()
        models.append(("ViT-B/16", vit, vit_input))

        effnet = efficientnet_b0().cuda()
        effnet_input = torch.randn(32, 3, 224, 224).cuda()
        models.append(("EfficientNet-B0", effnet, effnet_input))

        mobilenet = mobilenet_v3_small().cuda()
        mobilenet_input = torch.randn(32, 3, 224, 224).cuda()
        models.append(("MobileNetV3-Small", mobilenet, mobilenet_input))

        return models

    def profile_no_grad(self, model: torch.nn.Module, input_tensor: torch.Tensor) -> Dict:
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

        start_time = time.time()

        with profile(
                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                profile_memory=True,
                record_shapes=True
        ) as prof:
            with record_function("no_grad_inference"):
                with torch.no_grad():
                    _ = model(input_tensor)

        torch.cuda.synchronize()
        end_time = time.time()

        stats = prof.key_averages()

        total_cuda_time = 0
        total_cpu_time = 0
        total_cuda_memory = 0

        for item in stats:
            total_cuda_time += item.device_time_total
            total_cpu_time += item.cpu_time_total

            total_cuda_memory += item.self_device_memory_usage

        return {
            'cuda_time_us': total_cuda_time,
            'cpu_time_us': total_cpu_time,
            'cuda_memory_bytes': total_cuda_memory,
            'wall_time_s': end_time - start_time
        }

    def profile_inference_mode(self, model: torch.nn.Module, input_tensor: torch.Tensor) -> Dict:
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

        start_time = time.time()

        with profile(
                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                profile_memory=True,
                record_shapes=True
        ) as prof:
            with record_function("inference_mode"):
                with torch.inference_mode():
                    _ = model(input_tensor)

        torch.cuda.synchronize()
        end_time = time.time()

        stats = prof.key_averages()

        total_cuda_time = 0
        total_cpu_time = 0
        total_cuda_memory = 0

        for item in stats:
            total_cuda_time += item.device_time_total
            total_cpu_time += item.cpu_time_total

            total_cuda_memory += item.self_device_memory_usage

        return {
            'cuda_time_us': total_cuda_time,
            'cpu_time_us': total_cpu_time,
            'cuda_memory_bytes': total_cuda_memory,
            'wall_time_s': end_time - start_time
        }

    def run_comparison(self):
        models = self.get_models_and_inputs()

        for model_name, model, input_tensor in models:
            for run in range(self.num_runs):
                if run == 0:
                    with torch.no_grad():
                        _ = model(input_tensor)
                    torch.cuda.synchronize()

                no_grad_metrics = self.profile_no_grad(model, input_tensor)

                inference_mode_metrics = self.profile_inference_mode(model, input_tensor)

                self.results.append({
                    'model': model_name,
                    'run': run + 1,
                    'mode': 'no_grad',
                    **no_grad_metrics
                })

                self.results.append({
                    'model': model_name,
                    'run': run + 1,
                    'mode': 'inference_mode',
                    **inference_mode_metrics
                })

            del model
            torch.cuda.empty_cache()

    def analyze_results(self) -> pd.DataFrame:
        df = pd.DataFrame(self.results)

        avg_df = df.groupby(['model', 'mode']).agg({
            'cuda_time_us': 'mean',
            'cpu_time_us': 'mean',
            'cuda_memory_bytes': 'mean',
            'wall_time_s': 'mean'
        }).reset_index()

        comparison_data = []

        for model_name in avg_df['model'].unique():
            model_data = avg_df[avg_df['model'] == model_name]

            no_grad_row = model_data[model_data['mode'] == 'no_grad'].iloc[0]
            inf_mode_row = model_data[model_data['mode'] == 'inference_mode'].iloc[0]

            comparison_data.append({
                'Model': model_name,
                'no_grad_cuda_time': no_grad_row['cuda_time_us'],
                'inference_mode_cuda_time': inf_mode_row['cuda_time_us'],
                'cuda_time_diff_%': ((inf_mode_row['cuda_time_us'] - no_grad_row['cuda_time_us']) / no_grad_row[
                    'cuda_time_us'] * 100),
                'no_grad_cpu_time': no_grad_row['cpu_time_us'],
                'inference_mode_cpu_time': inf_mode_row['cpu_time_us'],
                'cpu_time_diff_%': ((inf_mode_row['cpu_time_us'] - no_grad_row['cpu_time_us']) / no_grad_row[
                    'cpu_time_us'] * 100),
                'no_grad_memory_MB': no_grad_row['cuda_memory_bytes'] / 1024 / 1024,
                'inference_mode_memory_MB': inf_mode_row['cuda_memory_bytes'] / 1024 / 1024,
                'memory_diff_%': ((inf_mode_row['cuda_memory_bytes'] - no_grad_row['cuda_memory_bytes']) / no_grad_row[
                    'cuda_memory_bytes'] * 100) if no_grad_row['cuda_memory_bytes'] != 0 else 0,
                'no_grad_wall_time_ms': no_grad_row['wall_time_s'] * 1000,
                'inference_mode_wall_time_ms': inf_mode_row['wall_time_s'] * 1000,
                'wall_time_diff_%': ((inf_mode_row['wall_time_s'] - no_grad_row['wall_time_s']) / no_grad_row[
                    'wall_time_s'] * 100),
            })

        comparison_df = pd.DataFrame(comparison_data)

        return comparison_df, avg_df, df

    def print_summary(self, comparison_df: pd.DataFrame):
        print(comparison_df[
            ['Model', 'no_grad_cuda_time', 'inference_mode_cuda_time', 'cuda_time_diff_%']].to_string(
            index=False))

        print(comparison_df[['Model', 'no_grad_memory_MB', 'inference_mode_memory_MB', 'memory_diff_%']].to_string(
            index=False))

        print(comparison_df[
            ['Model', 'no_grad_wall_time_ms', 'inference_mode_wall_time_ms', 'wall_time_diff_%']].to_string(
            index=False))




In [12]:
comparator = InferenceModeComparator(num_runs=50)
comparator.run_comparison()

comparison_df, avg_df, raw_df = comparator.analyze_results()

comparator.print_summary(comparison_df)

comparison_df.to_csv('inference_mode_comparison.csv', index=False)

comparison_df, avg_df, raw_df

            Model  no_grad_cuda_time  inference_mode_cuda_time  cuda_time_diff_%
  EfficientNet-B0       3.771264e+05              3.771020e+05         -0.006480
MobileNetV3-Small       8.782631e+04              8.780438e+04         -0.024972
         ResNet50       1.006342e+06              1.005473e+06         -0.086349
         ViT-B/16       4.537360e+06              4.538415e+06          0.023253
            Model  no_grad_memory_MB  inference_mode_memory_MB  memory_diff_%
  EfficientNet-B0            0.12207                   0.12207            0.0
MobileNetV3-Small            0.12207                   0.12207            0.0
         ResNet50            0.12207                   0.12207            0.0
         ViT-B/16            0.12207                   0.12207            0.0
            Model  no_grad_wall_time_ms  inference_mode_wall_time_ms  wall_time_diff_%
  EfficientNet-B0             81.880479                    82.191329          0.379639
MobileNetV3-Small             2

(               Model  no_grad_cuda_time  inference_mode_cuda_time  \
 0    EfficientNet-B0       3.771264e+05              3.771020e+05   
 1  MobileNetV3-Small       8.782631e+04              8.780438e+04   
 2           ResNet50       1.006342e+06              1.005473e+06   
 3           ViT-B/16       4.537360e+06              4.538415e+06   
 
    cuda_time_diff_%  no_grad_cpu_time  inference_mode_cpu_time  \
 0         -0.006480       91967.76426              92084.39496   
 1         -0.024972       36113.06724              36202.56864   
 2         -0.086349      179098.77398             178651.68460   
 3          0.023253      872713.57940             872462.48304   
 
    cpu_time_diff_%  no_grad_memory_MB  inference_mode_memory_MB  \
 0         0.126817            0.12207                   0.12207   
 1         0.247837            0.12207                   0.12207   
 2        -0.249633            0.12207                   0.12207   
 3        -0.028772            0.12207 