In [21]:
import subprocess
import re
import torch
import pynvml
import logging
from collections import Counter

# Initialize pynvml library
pynvml.nvmlInit()

class HardwareCheck:

    def __init__(self):
        # self.log_nvidia_smi()  # This method doesn't perform tests, so it's fine to keep
        # self.log_cuda_toolkit_version()
        pass

    def test_nvidia_smi(self):
        try:
            nvidia_smi_output = subprocess.check_output(['nvidia-smi']).decode()

            # Extracting GPU names
            gpu_names = re.findall(r'(\d+\s+NVIDIA [^\s]+)', nvidia_smi_output)
            gpu_count = Counter(gpu_names)

            # Parsing the driver version
            driver_version_match = re.search(r'Driver Version: (\d+\.\d+\.\d+)', nvidia_smi_output)
            driver_version = driver_version_match.group(1) if driver_version_match else "Unknown"

            # Checking if at least one GPU is detected
            self.nvidia_smi_working = len(gpu_count) > 0
            if self.nvidia_smi_working:
                logging.info("nvidia-smi is working and GPUs are detected.")
                for gpu, count in gpu_count.items():
                    logging.info(f"GPU: {gpu}, Count: {count}")
                logging.info(f"Driver Version: {driver_version}")
            else:
                logging.warning("No GPUs detected by nvidia-smi.")

        except Exception as e:
            self.nvidia_smi_working = False
            logging.error(f"Error running nvidia-smi: {e}")

        return self


    def test_cudnn_availability(self):
        # Check for cuDNN availability and log it
        self.cudnn_available = torch.backends.cudnn.is_available()
        if self.cudnn_available:
            self.cudnn_version = torch.backends.cudnn.version()
            logging.info("cuDNN is available.")
            logging.info(f"cuDNN Version: {self.cudnn_version}")
        else:
            logging.info("cuDNN is not available.")
        return self


    def test_cuda_availability(self):
        # Check for CUDA availability and log it
        if torch.cuda.is_available():
            self.cuda_available = True
            self.device = torch.device("cuda")
        else:
            self.cuda_available = False
            self.device = torch.device("cpu")
            raise RuntimeError("CUDA is not available and fallback to CPU is not desired.")

        logging.info(f"CUDA Availability: {self.cuda_available}")
        logging.info(f"Device set to: {self.device}")
        return self


    def log_gpu_stats(self):
        try:
            gpu_count = pynvml.nvmlDeviceGetCount()
            logging.info(f"Number of GPUs: {gpu_count}")

            for i in range(gpu_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)  # Remove .decode('utf-8')
                temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

                logging.info(f"GPU {i + 1} - Name: {name}, Temperature: {temperature}°C,"
                            f" Memory Used: {memory_info.used / 1024 / 1024} MB,"
                            f" GPU Utilization: {utilization.gpu}%, Memory Utilization: {utilization.memory}%")
        except Exception as e:
            logging.info(f"Error logging GPU stats: {e}")


    def log_nvidia_smi(self):
        try:
            nvidia_smi_output = subprocess.check_output(['nvidia-smi']).decode()

            # Extracting GPU names
            gpu_names = re.findall(r'(\d+\s+NVIDIA [^\s]+)', nvidia_smi_output)
            gpu_count = Counter(gpu_names)

            # Parsing the driver version
            driver_version_match = re.search(r'Driver Version: (\d+\.\d+\.\d+)', nvidia_smi_output)
            driver_version = driver_version_match.group(1) if driver_version_match else "Unknown"

            # Logging GPU counts and names
            for gpu, count in gpu_count.items():
                logging.info(f"GPU: {gpu}, Count: {count}")

            logging.info(f"Driver Version: {driver_version}")

        except Exception as e:
            logging.info(f"Error running nvidia-smi: {e}")

            
    def log_cuda_toolkit_version(self):
        # Check for CUDA Toolkit version used by PyTorch
        cuda_version = torch.version.cuda
        logging.info(f"Logging -- CUDA Toolkit Version (used by PyTorch): {cuda_version}")

        
    def run_hardware_pipeline(self):
        self.test_nvidia_smi()
        self.test_cudnn_availability()
        self.test_cuda_availability()
        self.log_cuda_toolkit_version()

        if not self.nvidia_smi_working:
            raise RuntimeError("nvidia-smi check failed.")
        if not self.cuda_available:
            raise RuntimeError("CUDA availability check failed.")
        if not self.cudnn_available:
            raise RuntimeError("cuDNN availability check failed.")

        logging.info("Passed nvidia_smi_working, cuda_available, cudnn_available")

        # Log GPU stats and CUDA toolkit version only if all checks passed
        self.log_gpu_stats()
        # self.log_cuda_toolkit_version()

        logging.info("All hardware checks passed. Ready for inference.")
        return True

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    # Initialize the hardware check
    hardware_check = HardwareCheck()
    
    # Run the hardware pipeline
    hardware_check.run_hardware_pipeline()

2023-12-04 18:30:12,773 - INFO - nvidia-smi is working and GPUs are detected.
2023-12-04 18:30:12,774 - INFO - GPU: 0  NVIDIA GeForce, Count: 1
2023-12-04 18:30:12,775 - INFO - Driver Version: 535.129.03
2023-12-04 18:30:12,775 - INFO - cuDNN is available.
2023-12-04 18:30:12,776 - INFO - cuDNN Version: 8902
2023-12-04 18:30:12,776 - INFO - CUDA Availability: True
2023-12-04 18:30:12,777 - INFO - Device set to: cuda
2023-12-04 18:30:12,778 - INFO - Logging -- CUDA Toolkit Version (used by PyTorch): 12.1
2023-12-04 18:30:12,778 - INFO - Passed nvidia_smi_working, cuda_available, cudnn_available
2023-12-04 18:30:12,779 - INFO - Number of GPUs: 1
2023-12-04 18:30:12,780 - INFO - GPU 1 - Name: NVIDIA GeForce GTX 1070, Temperature: 33°C, Memory Used: 3436.4375 MB, GPU Utilization: 0%, Memory Utilization: 0%
2023-12-04 18:30:12,781 - INFO - All hardware checks passed. Ready for inference.


In [2]:
pip install -U pynvml

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m577.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0
Note: you may need to restart the kernel to use updated packages.
