#### Testing CuPy for the first time

In [1]:
# Import necessary libraries
import cupy as cp
import numpy as np
import time

In [2]:
!pwd

/home/rapids/notebooks


In [3]:
!nvidia-smi

Wed Sep 24 05:09:00 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:09:00.0 Off |                  N/A |
| 36%   43C    P8              27W / 370W |     26MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
cp.show_config()

OS                           : Linux-5.15.0-58-generic-x86_64-with-glibc2.35
Python Version               : 3.11.13
CuPy Version                 : 13.6.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.2.6
SciPy Version                : 1.16.2
Cython Build Version         : 3.1.3
Cython Runtime Version       : None
CUDA Root                    : /opt/conda
nvcc PATH                    : /opt/conda/bin/nvcc
CUDA Build Version           : 12090
CUDA Driver Version          : 12030
CUDA Runtime Version         : 12090 (linked to CuPy) / 12000 (locally installed)
CUDA Extra Include Dirs      : []
cuBLAS Version               : (available)
cuFFT Version                : 11000
cuRAND Version               : 10301
cuSOLVER Version             : (11, 4, 2)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 0)
Thrust Version               : 200802
CUB Build Version            : 200800
Jitify Build Version         : <unknown>
cuDNN Build Ve

#### On the **CPU** with **NumPy**

In [5]:
# --- On the CPU with NumPy ---
# Create a large array on the CPU
cpu_array = np.random.rand(10000, 10000)

In [6]:
# Perform a matrix multiplication on the CPU
start_time_cpu = time.time()
result_cpu = np.dot(cpu_array, cpu_array)
end_time_cpu = time.time()
print(f"NumPy execution time: {end_time_cpu - start_time_cpu:.4f} seconds")

NumPy execution time: 4.5097 seconds


In [7]:
# --- On the CPU with NumPy ---
start_time_cpu = time.time()
# Perform a matrix multiplication on the CPU
result_cpu = np.dot(cpu_array,np.dot(cpu_array,np.dot(cpu_array, cpu_array)))
end_time_cpu = time.time()
print(f"NumPy execution time: {end_time_cpu - start_time_cpu:.4f} seconds")

NumPy execution time: 13.7193 seconds


#### On the **GPU** with **CuPy**

In [8]:
# --- On the GPU with CuPy ---
# Now, copy the *exact same* array to the GPU
gpu_array = cp.asarray(cpu_array)

In [9]:
# Perform a matrix multiplication on the GPU
start_time_gpu = time.time()
result_gpu = cp.dot(gpu_array, gpu_array)
end_time_gpu = time.time()
print(f"CuPy execution time: {end_time_gpu - start_time_gpu:.4f} seconds")

CuPy execution time: 0.0140 seconds


In [10]:
# Perform a matrix multiplication on the GPU
start_time_gpu_b = time.time()
result_gpu = cp.dot(gpu_array,cp.dot(gpu_array,cp.dot(gpu_array, gpu_array)))
end_time_gpu_b = time.time()
print(f"CuPy execution time: {end_time_gpu_b - start_time_gpu_b:.4f} seconds")

CuPy execution time: 0.0024 seconds


> What?!? Anyone can explain this?!?

- Your “weird result” comes from asynchronous GPU execution + warm-up overhead.
- Without synchronization, your timer doesn’t reflect the true compute cost.
- Add cp.cuda.Stream.null.synchronize() after the operation, and you’ll get realistic timings.

In [12]:
# Function to run the comparison
def compare_performance(matrix_size):
    # --- Data Preparation ---
    # Create large matrices on the CPU
    np_array1 = np.random.rand(matrix_size, matrix_size).astype(np.float32)
    np_array2 = np.random.rand(matrix_size, matrix_size).astype(np.float32)

    # --- NumPy (CPU) Execution ---
    start_cpu = time.time()
    result_np = np.dot(np_array1, np_array2)
    end_cpu = time.time()
    cpu_time = end_cpu - start_cpu
    print(f"NumPy (CPU) execution time: {cpu_time:.4f} seconds")

    # --- CuPy (GPU) Execution ---
    # Copy data from CPU to GPU memory
    cp_array1 = cp.asarray(np_array1)
    cp_array2 = cp.asarray(np_array2)

    # Warm-up run to exclude initialization overhead
    cp.dot(cp_array1, cp_array2)
    cp.cuda.Stream.null.synchronize()

    start_gpu = time.time()
    result_cp = cp.dot(cp_array1, cp_array2)
    cp.cuda.Stream.null.synchronize()  # Wait for GPU computation to finish
    end_gpu = time.time()
    gpu_time = end_gpu - start_gpu

    # You would typically copy the result back to CPU if needed, but we don't for timing
    # result_np_from_cp = cp.asnumpy(result_cp)

    print(f"CuPy (GPU) execution time: {gpu_time:.4f} seconds")

    # --- Results ---
    speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf')
    print(f"GPU is {speedup:.2f}x faster than CPU")
    print("-" * 30)

# --- Run comparisons with different matrix sizes ---
print("Running with a small matrix (500x500):")
compare_performance(500)
print("\nRunning with a larger matrix (5000x5000):")
compare_performance(5000)
print("\nRunning with a very large matrix (10000x10000):")
compare_performance(10000)
print("\nRunning with a very large matrix (20000x20000):")
compare_performance(20000)

Running with a small matrix (500x500):
NumPy (CPU) execution time: 0.0007 seconds
CuPy (GPU) execution time: 0.0001 seconds
GPU is 9.96x faster than CPU
------------------------------

Running with a larger matrix (5000x5000):
NumPy (CPU) execution time: 0.2697 seconds
CuPy (GPU) execution time: 0.0098 seconds
GPU is 27.56x faster than CPU
------------------------------

Running with a very large matrix (10000x10000):
NumPy (CPU) execution time: 2.0907 seconds
CuPy (GPU) execution time: 0.0875 seconds
GPU is 23.89x faster than CPU
------------------------------

Running with a very large matrix (20000x20000):
NumPy (CPU) execution time: 17.1720 seconds
CuPy (GPU) execution time: 0.7236 seconds
GPU is 23.73x faster than CPU
------------------------------


> FYI: <br>
- `np.dot` utilizes all available CPU threads for its calculations. 
- On this machine, it is configured to use 16 threads, leveraging highly optimized, multi-threaded libraries like BLAS and LAPACK.
- Though the multi-threading, 20x times slower than GPU.

#### You can use `xp` to handle both cases.
```python
import numpy as np

try:
    import cupy as cp
    # 기본 GPU (장치 0번) 초기화를 시도
    with cp.cuda.Device(0):
        xp = cp
        print("Using CuPy. GPU device 0 is active.")
except (ImportError, cp.cuda.runtime.CUDARuntimeError):
    # CuPy가 없거나, 장치 초기화에 실패하면 NumPy로 전환
    xp = np
    print("Using NumPy. CuPy not installed or device initialization failed.")
```