In [2]:
import torch
import os
import time
import psutil
import pynvml
import csv
import random
import torch
#use venv torch_cuda126 to use torch with cuda

In [2]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-13.0.1-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-ml-py>=12.0.0 (from pynvml)
  Downloading nvidia_ml_py-13.580.82-py3-none-any.whl.metadata (9.6 kB)
Downloading pynvml-13.0.1-py3-none-any.whl (28 kB)
Downloading nvidia_ml_py-13.580.82-py3-none-any.whl (49 kB)
Installing collected packages: nvidia-ml-py, pynvml
Successfully installed nvidia-ml-py-13.580.82 pynvml-13.0.1


In [3]:
# Sanity-check the CUDA setup before any benchmarking is attempted.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True
print(torch.cuda.device_count())  # Should return the number of GPUs available
if cuda_available:
    # Only query device 0 when CUDA is usable; otherwise these calls raise
    # and the cell would abort instead of reporting "False" / "0" above.
    print(torch.cuda.get_device_name(0))  # Should print the name of your GPU (RTX 4050)
    print(torch.cuda.current_device())  # Should return 0 (GPU index)


True
1
NVIDIA GeForce RTX 4050 Laptop GPU
0


In [27]:
# --- Initialize pynvml ---
# Start NVML, then keep a module-level handle that get_system_state() reads.
GPU_INDEX = 0  # Assuming GPU 0
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(GPU_INDEX)


In [5]:
def get_system_state():
    """Take a snapshot of CPU load, GPU utilisation and GPU memory usage.

    The NVML queries go through the module-level ``gpu_handle``.

    Returns:
        tuple: ``(cpu_load, gpu_util, gpu_mem_used_percent)`` where
            ``cpu_load`` is whatever ``psutil.cpu_percent()`` reports when
            called with no interval argument, ``gpu_util`` is the ``gpu``
            field of the NVML utilisation rates, and
            ``gpu_mem_used_percent`` is ``100 * used / total`` device memory.
    """
    cpu_load = psutil.cpu_percent()

    utilization = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
    memory = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)

    used_fraction = memory.used / memory.total
    return cpu_load, utilization.gpu, 100 * used_fraction


In [6]:
def benchmark_vec_add_multi_thread(size, device, num_threads=None):
    """
    Time a single element-wise addition of two random vectors.

    Args:
        size: Number of elements in each random input vector.
        device: Where to run the addition: ``'cpu'``, ``'cuda'`` (an indexed
            string such as ``'cuda:0'`` or a ``torch.device`` also works).
        num_threads: CPU only. Number of intra-op threads PyTorch may use.
            ``None`` means ``os.cpu_count()``, falling back to PyTorch's
            current setting if the count cannot be determined.

    Returns:
        float: Elapsed time of the addition in milliseconds.

    Raises:
        ValueError: If ``device`` is not a CPU or CUDA device.
    """
    # Validate BEFORE allocating: torch.randn() on an unknown device string
    # raises its own RuntimeError, which would hide the ValueError below.
    try:
        device_type = torch.device(device).type
    except (RuntimeError, TypeError) as exc:
        raise ValueError("Device must be 'cpu' or 'cuda'") from exc
    if device_type not in ('cpu', 'cuda'):
        raise ValueError("Device must be 'cpu' or 'cuda'")

    a = torch.randn(size, device=device)
    b = torch.randn(size, device=device)

    if device_type == 'cuda':
        # CUDA kernels launch asynchronously, so time with events and
        # synchronize both before starting and before reading the result.
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        c = a + b
        end_event.record()

        torch.cuda.synchronize()
        return start_event.elapsed_time(end_event)

    # CPU path: pin the thread count for the duration of the measurement.
    original_threads = torch.get_num_threads()
    if num_threads is None:
        # os.cpu_count() may return None; keep PyTorch's current value then.
        num_threads = os.cpu_count() or original_threads
    torch.set_num_threads(num_threads)

    try:
        start_time = time.perf_counter()
        c = a + b
        end_time = time.perf_counter()

        # perf_counter() is in seconds; report milliseconds like the GPU path.
        return (end_time - start_time) * 1000
    finally:
        # IMPORTANT: Restore the original thread count to avoid side effects
        torch.set_num_threads(original_threads)

In [7]:
def benchmark_vec_add(size, device):
    """Time a single element-wise addition of two random vectors.

    Args:
        size: Number of elements in each random input vector.
        device: Device to allocate the vectors on, e.g. ``'cpu'``, ``'cuda'``,
            ``'cuda:0'`` or a ``torch.device``.

    Returns:
        float: Elapsed time of the addition in milliseconds.
    """
    a = torch.randn(size, device=device)
    b = torch.randn(size, device=device)

    # Branch on where the tensors actually live rather than on the literal
    # string 'cuda': 'cuda:0' or torch.device('cuda') would otherwise take the
    # wall-clock branch, which does not synchronize with the GPU.
    if a.is_cuda:
        # Use CUDA events for accurate GPU timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        c = a + b
        end_event.record()

        torch.cuda.synchronize()
        return start_event.elapsed_time(end_event)

    # Non-CUDA tensors: wall-clock timing with time.perf_counter()
    start_time = time.perf_counter()
    c = a + b
    end_time = time.perf_counter()

    # perf_counter() is in seconds; return milliseconds
    return (end_time - start_time) * 1000

In [12]:
# --- Data Collection Logic ---
# Settings for the single hand-run sample in the cells below.
# Author's notes on vector sizes: min limit is 10^3, max limit is 5 * 10^8
# (10^9 is also noted as the max limit: input_size = 1000000000).
task_name = "vector_add"
input_size = 100_000

In [16]:
# Record CPU load, GPU utilisation and GPU memory use ahead of the benchmark run
(cpu_load, gpu_util, gpu_mem) = get_system_state()


In [19]:
# Benchmark: time the addition on the CPU (num_threads left at its default)
cpu_time_ms = benchmark_vec_add_multi_thread(size=input_size, device='cpu')
cpu_time_ms

0.8505000005243346

In [20]:
# Time the same-sized addition on the GPU
gpu_time_ms = benchmark_vec_add(size=input_size, device='cuda')
gpu_time_ms

0.14745600521564484

In [21]:
# Assemble one record. Field order:
# task, vector size, CPU load, GPU utilisation, GPU memory, CPU ms, GPU ms
data_point = [task_name, input_size, cpu_load, gpu_util, gpu_mem, cpu_time_ms, gpu_time_ms]

In [22]:
# Show the assembled record for a quick visual check.
print(f"Data Point: {data_point}")
# Here you would append this list to a CSV file


Data Point: ['vector_add', 100000, 19.7, 0, 7.297009851815665, 0.8505000005243346, 0.14745600521564484]


In [23]:
# --- Shutdown pynvml ---
# NOTE(review): cells further down call get_system_state() again, which queries
# NVML through gpu_handle; presumably NVML has to be initialized again before
# those cells run (e.g. on Restart & Run All) - confirm.
pynvml.nvmlShutdown()

In [24]:
def get_random_integer(min_val=1000, max_val=10000000):
  """
  Draw a random integer from the closed range [min_val, max_val].

  Both bounds are inclusive; the defaults are 1000 and 10000000.
  """
  lower_bound, upper_bound = min_val, max_val
  drawn = random.randint(lower_bound, upper_bound)
  return drawn

In [28]:
# Collect DATA_SIZE benchmark samples at random vector sizes, in memory.
DATA_SIZE = 5000
min_size = 1000
max_size = 500000000
task_name = "vector_add"

# An earlier cell calls pynvml.nvmlShutdown(); initialize NVML again and
# refresh the handle so get_system_state() works when cells run top to bottom.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# NOTE(review): max_size is 5 * 10^8 elements per vector; now that the drawn
# size is actually used, confirm it fits in GPU memory and in the time budget.
data_points = []
for i in range(DATA_SIZE):
    vec_size = get_random_integer(min_size, max_size)
    cpu_load, gpu_util, gpu_mem = get_system_state()
    # Benchmark and record the freshly drawn vec_size, not the fixed
    # input_size left over from the single-sample cells above.
    cpu_time_ms = benchmark_vec_add_multi_thread(vec_size, 'cpu')
    gpu_time_ms = benchmark_vec_add_multi_thread(vec_size, 'cuda')
    # Save the data point
    data_point = [
        task_name,
        vec_size,
        cpu_load,
        gpu_util,
        gpu_mem,
        cpu_time_ms,
        gpu_time_ms
    ]
    # Keep every sample; previously each one was overwritten by the next.
    data_points.append(data_point)
    



In [87]:
# --- Configuration ---
TASK_NAME = "vector_add"
CSV_FILENAME = "benchmark_data.csv"
DATA_SAMPLES = 500  # Reduced for a quick example
MIN_SIZE, MAX_SIZE = 1_000, 50_000_000  # inclusive bounds for the random vector size

# --- CSV column names, in the order each data point is written ---
header = ["task_name", "vector_size", "cpu_load_%", "gpu_util_%", "gpu_mem_%", "cpu_time_ms", "gpu_time_ms"]

# --- Main Data Collection Loop ---
# An earlier cell calls pynvml.nvmlShutdown(); initialize NVML again and
# refresh the handle so get_system_state() works when cells run top to bottom.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# Use 'with open' so the file is closed even if a benchmark raises midway.
# newline='' lets the csv module control line endings itself.
with open(CSV_FILENAME, 'w', newline='') as f:
    # 1. Create a CSV writer object
    writer = csv.writer(f)

    # 2. Write the header row once, before the loop starts
    writer.writerow(header)

    print(f"Starting data collection... saving to {CSV_FILENAME}")

    # 3. Loop to generate and write data points
    for i in range(DATA_SAMPLES):
        # Generate a new random size for each iteration
        vec_size = get_random_integer(MIN_SIZE, MAX_SIZE)

        # Get system state before running benchmarks
        cpu_load, gpu_util, gpu_mem = get_system_state()

        # Time the same vector size on the CPU and on the GPU
        cpu_time_ms = benchmark_vec_add_multi_thread(vec_size, 'cpu')
        gpu_time_ms = benchmark_vec_add_multi_thread(vec_size, 'cuda')

        # Assemble the data point in the same column order as `header`
        data_point = [
            TASK_NAME,
            vec_size,
            cpu_load,
            gpu_util,
            gpu_mem,
            round(cpu_time_ms, 4),  # Rounding for cleaner data
            round(gpu_time_ms, 4)
        ]

        # 4. Write the data_point list as a new row in the CSV
        writer.writerow(data_point)

        # Optional: Print progress every 50 samples
        if (i + 1) % 50 == 0:
            print(f"  ... collected {i + 1}/{DATA_SAMPLES} samples.")

print("Data collection complete.")

Starting data collection... saving to benchmark_data.csv
  ... collected 50/500 samples.
  ... collected 100/500 samples.
  ... collected 150/500 samples.
  ... collected 200/500 samples.
  ... collected 250/500 samples.
  ... collected 300/500 samples.
  ... collected 350/500 samples.
  ... collected 400/500 samples.
  ... collected 450/500 samples.
  ... collected 500/500 samples.
Data collection complete.
