In [1]:
import os
import sys

def truncate_for_display(text, limit=300):
    """Return ``text`` shortened to at most ``limit`` characters for printing.

    An ellipsis is appended only when characters were actually dropped, so
    short values (including the 'NOT SET' placeholder) are shown verbatim
    instead of looking truncated.

    Args:
        text: The string to display.
        limit: Maximum number of characters of ``text`` to keep.

    Returns:
        ``text`` unchanged when it fits, otherwise its first ``limit``
        characters followed by ``...``.
    """
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


print("=" * 60)
print("Environment Check")
print("=" * 60)

# Interpreter identity: shows which Python the kernel is actually running.
print(f"\nPython executable: {sys.executable}")
print(f"Python version: {sys.version}")

# CUDA-related environment variables ('NOT SET' when absent).
print(f"\nCUDA_HOME: {os.environ.get('CUDA_HOME', 'NOT SET')}")
print(f"CUDNNROOT: {os.environ.get('CUDNNROOT', 'NOT SET')}")

# LD_LIBRARY_PATH can be very long, so only its head is printed.
ld_path = os.environ.get('LD_LIBRARY_PATH', 'NOT SET')
print("\nLD_LIBRARY_PATH (first 300 chars):")
print(f"  {truncate_for_display(ld_path, 300)}")

# Check if CUDA libraries are accessible
print("\n" + "=" * 60)
print("Library Check")
print("=" * 60)

import ctypes.util
import glob

# Shared-library base name -> human-readable label used in the report.
libs_to_check = {
    'libcudart': 'CUDA Runtime',
    'libcudnn': 'cuDNN',
    'libcublas': 'cuBLAS',
    'libcufft': 'cuFFT',
}


def ctypes_library_name(lib_name):
    """Convert a file-style name ('libfoo') to the form find_library expects ('foo').

    ``ctypes.util.find_library`` takes the library name without the ``lib``
    prefix; passing 'libfoo' makes it look for 'liblibfoo' and never match.
    """
    return lib_name[3:] if lib_name.startswith('lib') else lib_name


def locate_shared_library(lib_name, search_dirs):
    """Locate a shared library, preferring explicit directories over the system lookup.

    Args:
        lib_name: Base file name including the ``lib`` prefix, e.g. 'libcudart'.
        search_dirs: Iterable of directory paths to scan; empty entries are skipped.

    Returns:
        The first (alphabetically) file matching ``<dir>/<lib_name>.so*`` from
        the first directory that has a match; otherwise whatever
        ``ctypes.util.find_library`` returns (which may be a bare file name
        rather than a full path), or None when nothing is found.
    """
    for directory in search_dirs:
        if not directory:
            continue
        # Escape the directory so characters like '[' in a path are not
        # interpreted as glob syntax; only the trailing '*' is a wildcard.
        pattern = os.path.join(glob.escape(directory), f"{lib_name}.so*")
        # glob order is arbitrary; sort so the reported path is deterministic.
        matches = sorted(glob.glob(pattern))
        if matches:
            return matches[0]
    # Fall back to the platform's library lookup via ctypes.
    return ctypes.util.find_library(ctypes_library_name(lib_name))


search_dirs = os.environ.get('LD_LIBRARY_PATH', '').split(':')

for lib_name, lib_desc in libs_to_check.items():
    lib_path = locate_shared_library(lib_name, search_dirs)
    if lib_path:
        print(f"  ✅ {lib_desc}: {lib_path}")
    else:
        print(f"  ❌ {lib_desc}: NOT FOUND")

# Test TensorFlow
print("\n" + "=" * 60)
print("TensorFlow GPU Detection")
print("=" * 60)

import tensorflow as tf

print(f"\nTensorFlow version: {tf.__version__}")

# List every physical device TensorFlow can see (CPU and accelerators).
print("\nAll physical devices:")
all_devices = tf.config.list_physical_devices()
for device in all_devices:
    print(f"  - {device}")

# Check GPU devices specifically
gpus = tf.config.list_physical_devices('GPU')
print(f"\nGPU devices: {len(gpus)}")
if gpus:
    print("✅ GPU is available!")
    for i, gpu in enumerate(gpus):
        print(f"  GPU {i}: {gpu}")
        # Device details are supplementary: a failure here must not abort the
        # check, but it is reported so the reason is visible rather than
        # silently dropped. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        try:
            gpu_details = tf.config.experimental.get_device_details(gpu)
            print(f"    Details: {gpu_details}")
        except Exception as exc:
            print(f"    Details unavailable: {type(exc).__name__}: {exc}")
else:
    print("❌ No GPU devices detected")
    print("\nTroubleshooting:")
    print("  1. Check if wrapper script is being used (check ~/.jupyter/kernel_wrapper_debug.log)")
    print("  2. Verify LD_LIBRARY_PATH includes CUDA libraries")
    print("  3. Make sure TensorFlow module is loaded")
    print("  4. Try: import tensorflow as tf; print(tf.config.list_physical_devices())")

# Smoke test: when at least one GPU was detected, run a tiny 2x2 matmul
# explicitly placed on the first GPU and print the result.
if gpus:
    banner = "=" * 60
    print(f"\n{banner}\nGPU Test\n{banner}")
    try:
        # Pin the ops to GPU 0; any failure inside is reported below.
        with tf.device('/GPU:0'):
            a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
            b = tf.constant([[1.0, 1.0], [0.0, 1.0]])
            c = tf.matmul(a, b)
            print("✅ GPU computation test successful!")
            # .numpy() converts the result tensor to an array for display.
            print(f"   Result: {c.numpy()}")
    except Exception as exc:
        print(f"❌ GPU computation test failed: {exc}")

Environment Check

Python executable: /tmp/python-venv/lra_venv/bin/python
Python version: 3.9.25 (main, Nov  3 2025, 22:33:05) 
[GCC 11.2.0]

CUDA_HOME: /usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cuda-11.7.0-iyx5xnjk3fbe2fqhogxbzfzpj7xhi77t
CUDNNROOT: /usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cudnn-8.5.0.96-11.7-k7jh7qvot2tupopuooyvp2hlkss75nhc

LD_LIBRARY_PATH (first 300 chars):
  /usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cudnn-8.5.0.96-11.7-k7jh7qvot2tupopuooyvp2hlkss75nhc/lib:/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cuda-11.7.0-iyx5xnjk3fbe2fqhogxbzfzpj7xhi77t/lib64:/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64...

Library Check
  ✅ CUDA Runtime: /usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cuda-11.7.0-iyx5xnjk3fbe2fqhogxbzfzpj7xhi77t/lib64/libcudart.so
  ✅ cuDNN: /usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/cudnn-8.5.0

2026-01-05 08:23:54.065374: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-05 08:23:54.122327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



TensorFlow version: 2.13.0

All physical devices:
  - PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
  - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

GPU devices: 1
✅ GPU is available!
  GPU 0: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
    Details: {'compute_capability': (7, 0), 'device_name': 'Tesla V100-PCIE-16GB'}

GPU Test


2026-01-05 08:23:56.669151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14728 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:3b:00.0, compute capability: 7.0


✅ GPU computation test successful!
   Result: [[1. 3.]
 [3. 7.]]
