In [None]:
!torchvision --index-url https://download.pytorch.org/whl/cu126

In [1]:
import torch

In [3]:
# Quick environment check: report the PyTorch / CUDA setup of this kernel.
# torch.cuda.get_device_name() raises when no CUDA device is present, so the
# GPU name is resolved behind an availability guard instead of crashing the
# very cell that is meant to diagnose a missing GPU.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU name: {gpu_name}")

PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
GPU name: NVIDIA GeForce GTX 1660 Ti


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from typing import Tuple, Dict, Any

class BenchmarkModel(nn.Module):
    """Small fully connected classifier used as a benchmarking workload.

    The network is a stack of ``Linear -> ReLU -> Dropout`` groups followed by
    a final ``Linear -> LogSoftmax`` head, so its output is log-probabilities
    suitable for ``nn.NLLLoss``.

    Args:
        input_size: Number of input features.
        hidden_size: Width of every hidden layer.
        num_layers: Total number of ``nn.Linear`` layers (input + hidden +
            output). Values below 2 build the same network as ``2`` because
            the input and output layers are always present.
        num_classes: Width of the output layer (number of classes).
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(
        self,
        input_size: int = 512,
        hidden_size: int = 1024,
        num_layers: int = 3,
        num_classes: int = 10,
        dropout: float = 0.1,
    ):
        super().__init__()

        # Kept as a ModuleList named ``layers`` so state_dict keys stay stable.
        self.layers = nn.ModuleList()

        # Input layer
        self.layers.append(nn.Linear(input_size, hidden_size))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Dropout(dropout))

        # Hidden layers: everything except the input and output Linear layers
        for _ in range(num_layers - 2):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout))

        # Output layer: log-probabilities over the class dimension
        self.layers.append(nn.Linear(hidden_size, num_classes))
        self.layers.append(nn.LogSoftmax(dim=1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply every layer in order and return the log-probabilities."""
        for layer in self.layers:
            x = layer(x)
        return x

class GPUBenchmark:
    """Micro-benchmark suite for the active PyTorch device.

    The CUDA device is used when available, otherwise the CPU. GPU
    synchronization and CUDA memory statistics are only touched when running
    on CUDA, so every benchmark also works on CPU-only machines.
    """

    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Holds the dictionary produced by the latest run_comprehensive_benchmark().
        self.results = {}

    def _sync(self) -> None:
        """Wait for queued GPU work to finish; does nothing on CPU."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()

    def _average_time(self, fn, iterations: int, warmup: int = 0) -> float:
        """Return the mean wall-clock seconds per call of ``fn``.

        Args:
            fn: Zero-argument callable to time.
            iterations: Number of timed calls.
            warmup: Number of untimed calls made first.
        """
        for _ in range(warmup):
            fn()
        # CUDA kernels launch asynchronously: synchronize on both sides of the
        # timed region so the measurement covers the actual execution.
        self._sync()
        started = time.perf_counter()
        for _ in range(iterations):
            fn()
        self._sync()
        return (time.perf_counter() - started) / iterations

    def print_system_info(self) -> None:
        """Print PyTorch/CUDA versions and, when CUDA is available, GPU details."""
        print("=" * 60)
        print("SYSTEM INFORMATION")
        print("=" * 60)

        print(f"PyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")

        if torch.cuda.is_available():
            print(f"CUDA version: {torch.version.cuda}")
            print(f"GPU: {torch.cuda.get_device_name(0)}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
            print(f"GPU Compute capability: {torch.cuda.get_device_capability()}")
            print(f"Current device: {torch.cuda.current_device()}")
            print(f"Device count: {torch.cuda.device_count()}")

        print(f"Using device: {self.device}")
        print("=" * 60)

    def tensor_operations_benchmark(self, size: Tuple[int, int] = (1024, 1024)) -> Dict[str, float]:
        """Time a handful of dense tensor operations.

        Args:
            size: Shape of the two random operand matrices. The matrix
                product, SVD and inverse entries expect a square shape.

        Returns:
            Mapping from operation name to mean seconds per call
            (10 timed calls after 3 warm-up calls).
        """
        print("\nTensor Operations Benchmark")
        print("-" * 40)

        # Random operands allocated directly on the benchmark device
        a = torch.randn(*size, device=self.device)
        b = torch.randn(*size, device=self.device)

        operations = {
            'Matrix Multiplication': lambda: torch.mm(a, b),
            'Element-wise Multiplication': lambda: a * b,
            'Matrix Transpose + Multiplication': lambda: torch.mm(a, b.t()),
            # torch.svd is deprecated; the reduced linalg form is its replacement.
            'SVD Decomposition': lambda: torch.linalg.svd(a, full_matrices=False),
            'Matrix Inverse': lambda: torch.inverse(a),
        }

        results = {}
        for op_name, op_func in operations.items():
            avg_time = self._average_time(op_func, iterations=10, warmup=3)
            results[op_name] = avg_time
            print(f"{op_name}: {avg_time:.6f} seconds")

        return results

    def neural_network_benchmark(self, batch_size: int = 32, input_size: int = 512) -> Dict[str, float]:
        """Time training steps and inference passes of ``BenchmarkModel``.

        Args:
            batch_size: Number of samples in the synthetic batch.
            input_size: Number of input features per sample.

        Returns:
            Dict with ``Training_time_per_epoch`` (mean seconds per optimizer
            step on the single synthetic batch), ``Inference_time_per_batch``
            (mean seconds per forward pass) and ``Total_training_memory``
            (peak CUDA memory allocated during this benchmark in MB; 0.0 on CPU).
        """
        print(f"\nNeural Network Benchmark (batch_size={batch_size})")
        print("-" * 50)

        on_cuda = self.device.type == 'cuda'
        if on_cuda:
            # Start the peak counter fresh so allocations made by earlier
            # benchmarks are not attributed to this one.
            torch.cuda.reset_peak_memory_stats()

        model = BenchmarkModel(input_size=input_size).to(self.device)
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Synthetic batch: random features and random labels in [0, 10)
        x = torch.randn(batch_size, input_size, device=self.device)
        y = torch.randint(0, 10, (batch_size,), device=self.device)

        def train_step() -> None:
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()

        # 50 timed optimizer steps after 3 warm-up steps
        step_time = self._average_time(train_step, iterations=50, warmup=3)

        # 100 timed forward passes in eval mode without autograd
        model.eval()
        with torch.no_grad():
            batch_time = self._average_time(lambda: model(x), iterations=100)

        peak_mb = torch.cuda.max_memory_allocated() / 1024**2 if on_cuda else 0.0

        results = {
            'Training_time_per_epoch': step_time,
            'Inference_time_per_batch': batch_time,
            'Total_training_memory': peak_mb
        }

        print(f"Training time per epoch: {results['Training_time_per_epoch']:.4f} seconds")
        print(f"Inference time per batch: {results['Inference_time_per_batch']:.6f} seconds")
        print(f"Peak GPU memory usage: {results['Total_training_memory']:.2f} MB")

        return results

    def memory_bandwidth_test(self, sizes: Tuple[int, ...] = (1024, 2048, 4096, 8192)) -> Dict[str, float]:
        """Measure square matrix-multiplication throughput for several sizes.

        Despite its name, the reported figure is arithmetic throughput
        (GFLOPS), estimated as ``2 * size**3`` operations per product.

        Args:
            sizes: Side lengths of the square matrices to multiply.

        Returns:
            Mapping ``'Matrix_{size}x{size}'`` -> GFLOPS (mean over 10 timed
            products after one warm-up product).
        """
        print("\nMemory Bandwidth Test")
        print("-" * 30)

        results = {}

        for size in sizes:
            a = torch.randn(size, size, device=self.device)
            b = torch.randn(size, size, device=self.device)

            time_taken = self._average_time(lambda: torch.mm(a, b), iterations=10, warmup=1)

            # Approximate operation count of a dense size x size matrix product
            operations = 2 * size ** 3
            # Guard against a zero reading on very small inputs / coarse timers
            gflops = (operations / time_taken) / 1e9 if time_taken > 0 else float('inf')

            results[f'Matrix_{size}x{size}'] = gflops
            print(f"Matrix {size}x{size}: {gflops:.2f} GFLOPS")

        return results

    def run_comprehensive_benchmark(self) -> Dict[str, Any]:
        """Run every benchmark, print a summary and return all results.

        The returned dictionary is also stored on ``self.results``.
        """
        print("Starting Comprehensive GPU Benchmark")
        print("=" * 60)

        self.print_system_info()

        results = {
            'system_info': {
                'pytorch_version': torch.__version__,
                'cuda_available': torch.cuda.is_available(),
                'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None',
                'gpu_memory_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0
            },
            'tensor_operations': self.tensor_operations_benchmark(),
            'neural_network': self.neural_network_benchmark(),
            'memory_bandwidth': self.memory_bandwidth_test()
        }
        self.results = results

        print("\n" + "=" * 60)
        print("BENCHMARK COMPLETED!")
        print("=" * 60)

        # Results summary (GPU runs only)
        if torch.cuda.is_available():
            avg_gflops = np.mean(list(results['memory_bandwidth'].values()))
            print(f"Average GPU Performance: {avg_gflops:.2f} GFLOPS")
            print(f"Neural Network Training Speed: {results['neural_network']['Training_time_per_epoch']:.4f} s/epoch")
            print(f"Inference Speed: {results['neural_network']['Inference_time_per_batch']:.6f} s/batch")

        return results

# Run the benchmark
if __name__ == "__main__":
    benchmark = GPUBenchmark()
    results = benchmark.run_comprehensive_benchmark()

    # Extra mixed-precision smoke test, run only when CUDA and torch.amp are available
    if torch.cuda.is_available():
        if hasattr(torch, 'amp'):
            print("\nTesting Mixed Precision Training...")
            # A single forward pass under autocast is enough for a quick check
            with torch.amp.autocast(device_type='cuda'):
                x = torch.randn((32, 512), device='cuda')
                model = BenchmarkModel().cuda()
                output = model(x)
                print("Mixed precision test passed!")

Starting Comprehensive GPU Benchmark
SYSTEM INFORMATION
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
GPU: NVIDIA GeForce GTX 1660 Ti
GPU Memory: 6.00 GB
GPU Compute capability: (7, 5)
Current device: 0
Device count: 1
Using device: cuda

Tensor Operations Benchmark
----------------------------------------
Matrix Multiplication: 0.000835 seconds
Element-wise Multiplication: 0.000100 seconds
Matrix Transpose + Multiplication: 0.000889 seconds
SVD Decomposition: 0.153929 seconds
Matrix Inverse: 0.006971 seconds

Neural Network Benchmark (batch_size=32)
--------------------------------------------------
Training time per epoch: 0.0038 seconds
Inference time per batch: 0.000242 seconds
Peak GPU memory usage: 46.55 MB

Memory Bandwidth Test
------------------------------
Matrix 1024x1024: 1372.07 GFLOPS
Matrix 2048x2048: 3668.14 GFLOPS
Matrix 4096x4096: 5614.92 GFLOPS
Matrix 8192x8192: 4945.43 GFLOPS

BENCHMARK COMPLETED!
Average GPU Performance: 3900.14 GFLOPS
Neural