In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.zeros(10).cuda()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0')

In [4]:
import torch
import time

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create random tensors
size = 10000 
a = torch.randn(size, size, device=device)
b = torch.randn(size, size, device=device)

# Warm-up
for _ in range(10):
    c = torch.matmul(a, b)

# Measure time
start_time = time.time()
c = torch.matmul(a, b)
end_time = time.time()

print(f"Time taken for matrix multiplication: {end_time - start_time} seconds")


Using device: cuda
Time taken for matrix multiplication: 0.0 seconds


In [5]:
import torch
import time

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create larger random tensors
size = 20000
a = torch.randn(size, size, device=device)
b = torch.randn(size, size, device=device)

# Warm-up
for _ in range(10):
    c = torch.matmul(a, b)

# Measure time for multiple operations
start_time = time.time()
for _ in range(10):
    c = torch.matmul(a, b)
    c = torch.sin(c)
    c = torch.exp(c)
end_time = time.time()

print(f"Time taken for multiple operations: {end_time - start_time} seconds")


Using device: cuda
Time taken for multiple operations: 9.512589931488037 seconds


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

# Define a simple CNN
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(128 * 32 * 32, 1024)  # Adjusted to match the input size
        self.fc2 = nn.Linear(1024, 10)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create random input tensor
input_tensor = torch.randn(64, 3, 32, 32, device=device)

# Initialize the model, loss function, and optimizer
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Warm-up
model.train()
for _ in range(10):
    optimizer.zero_grad()
    outputs = model(input_tensor)
    loss = criterion(outputs, torch.randint(0, 10, (64,), device=device))
    loss.backward()
    optimizer.step()

# Measure time for training step
start_time = time.time()
for _ in range(10):
    optimizer.zero_grad()
    outputs = model(input_tensor)
    loss = criterion(outputs, torch.randint(0, 10, (64,), device=device))
    loss.backward()
    optimizer.step()
end_time = time.time()

print(f"Time taken for 10 training steps: {end_time - start_time} seconds")


Using device: cuda
Time taken for 10 training steps: 0.00999307632446289 seconds


In [7]:
import torch

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create a tensor and print its device
tensor = torch.randn(10, device=device)
print(f"Tensor is on device: {tensor.device}")


Using device: cuda
Tensor is on device: cuda:0


In [9]:
import torch
import time

def test_gpu_performance(device='cuda'):
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please check your GPU setup.")

    # Set the device
    device = torch.device(device)

    # Define the size of the tensor and number of operations
    tensor_size = (1024, 1024)  # Adjust size as needed
    num_operations = 10000

    # Generate random tensors and move them to GPU
    a = torch.randn(tensor_size, device=device)
    b = torch.randn(tensor_size, device=device)

    # Warm-up GPU
    for _ in range(10):
        _ = a + b
        _ = a @ b

    # Measure performance
    start_time = time.time()
    
    for _ in range(num_operations):
        c = a + b
        d = a @ b

    end_time = time.time()

    elapsed_time = end_time - start_time
    average_time_per_operation = elapsed_time / num_operations

    print(f"Elapsed time for {num_operations} operations: {elapsed_time:.4f} seconds")
    print(f"Average time per operation: {average_time_per_operation:.4f} seconds")

if __name__ == "__main__":
    test_gpu_performance()


Elapsed time for 10000 operations: 1.5645 seconds
Average time per operation: 0.0002 seconds


In [10]:
import torch
import time

def test_gpu_performance(device='cuda'):
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please check your GPU setup.")
    
    # Set the device
    device = torch.device(device)

    # Define the size of the tensor and number of operations
    tensor_size = (4096, 4096)  # Larger tensor size
    num_operations = 50  # Number of operations to perform

    # Generate random tensors and move them to GPU
    a = torch.randn(tensor_size, device=device)
    b = torch.randn(tensor_size, device=device)

    # Warm-up GPU
    for _ in range(10):
        _ = a + b
        _ = a @ b
        _ = a * b
        _ = a.sin()
        _ = a.mean()

    # Measure performance
    start_time = time.time()
    
    for _ in range(num_operations):
        c = a + b
        d = a @ b
        e = a * b
        f = a.sin()
        g = a.mean()

    end_time = time.time()

    elapsed_time = end_time - start_time
    average_time_per_operation = elapsed_time / num_operations

    print(f"Elapsed time for {num_operations} operations: {elapsed_time:.4f} seconds")
    print(f"Average time per operation: {average_time_per_operation:.4f} seconds")

    # Test with multiple concurrent operations
    num_threads = 4  # Number of threads to simulate concurrent operations
    start_time = time.time()

    def concurrent_operations():
        for _ in range(num_operations // num_threads):
            _ = a + b
            _ = a @ b
            _ = a * b
            _ = a.sin()
            _ = a.mean()

    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(concurrent_operations) for _ in range(num_threads)]
        for future in futures:
            future.result()

    end_time = time.time()
    concurrent_elapsed_time = end_time - start_time

    print(f"Elapsed time for concurrent operations with {num_threads} threads: {concurrent_elapsed_time:.4f} seconds")

if __name__ == "__main__":
    test_gpu_performance()


Elapsed time for 50 operations: 0.0032 seconds
Average time per operation: 0.0001 seconds
Elapsed time for concurrent operations with 4 threads: 0.0078 seconds


In [None]:
import torch
import time
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def tensor_operations(a, b):
    # Perform a series of complex tensor operations
    c = a + b
    d = a @ b
    e = a * b
    f = a.sin()
    g = a.mean()
    h = a.log()
    i = torch.relu(a)
    j = torch.matmul(a, b)
    k = torch.einsum('ij,jk->ik', a, b)
    return c, d, e, f, g, h, i, j, k

def test_gpu_performance(device='cuda'):
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please check your GPU setup.")
    
    device = torch.device(device)

    # Define tensor size and number of operations
    tensor_size = (8192, 8192)  # Larger tensor size for more stress
    num_operations = 100  # Number of operations to perform
    num_threads = 8  # Number of concurrent threads

    # Generate random tensors and move them to GPU
    a = torch.randn(tensor_size, device=device)
    b = torch.randn(tensor_size, device=device)

    # Warm-up GPU
    for _ in range(20):
        tensor_operations(a, b)

    # Measure performance for a single-threaded operation
    start_time = time.time()
    for _ in range(num_operations):
        tensor_operations(a, b)
    end_time = time.time()
    elapsed_time = end_time - start_time
    average_time_per_operation = elapsed_time / num_operations
    print(f"Single-threaded elapsed time for {num_operations} operations: {elapsed_time:.4f} seconds")
    print(f"Average time per operation: {average_time_per_operation:.4f} seconds")

    # Measure performance for multi-threaded operation
    def concurrent_tasks():
        for _ in range(num_operations // num_threads):
            tensor_operations(a, b)

    start_time = time.time()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(concurrent_tasks) for _ in range(num_threads)]
        for future in as_completed(futures):
            future.result()
    end_time = time.time()
    concurrent_elapsed_time = end_time - start_time
    print(f"Multi-threaded elapsed time with {num_threads} threads: {concurrent_elapsed_time:.4f} seconds")

if __name__ == "__main__":
    test_gpu_performance()


Single-threaded elapsed time for 100 operations: 2.9766 seconds
Average time per operation: 0.0298 seconds
