In [3]:
# Uninstall all previous versions of cupy that may conflict
!pip uninstall -y cupy-cuda11x cupy-cuda12x cupy

# Install compatible version for Colab's CUDA (11.x)
!pip install -q cupy-cuda11x

import cupy as cp
try:
    print("GPU name:", cp.cuda.runtime.getDeviceProperties(0)['name'].decode())
    print("✅ GPU is available and working with CuPy.")
except cp.cuda.runtime.CUDARuntimeError as e:
    print("❌ GPU not available or driver issue:", e)

If you see "GPU not available", do the following:

1. Go to Runtime > Change runtime type.
2. Set Hardware accelerator to GPU.
3. Click Save.
4. Restart the runtime after enabling the GPU, then re-run the cell above.


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.0/100.0 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0mGPU name: Tesla T4
✅ GPU is available and working with CuPy.


In [6]:
import cupy as cp
import numpy as np

In [7]:
# Matrix dimensions: A is (N, K) and B is (K, M), so C = A @ B is (N, M).
N, M, K = 128, 128, 128  # You can change these values

# Fixed seed so that Restart & Run All regenerates exactly the same inputs
# (and therefore the same printed matrices and products).
SEED = 0
rng = np.random.default_rng(SEED)

# Random integer-valued input matrices on the CPU, drawn from [0, 100) and
# stored as float32.
A_cpu = rng.integers(0, 100, size=(N, K)).astype(np.float32)
B_cpu = rng.integers(0, 100, size=(K, M)).astype(np.float32)

# Reference result computed on the CPU with NumPy.
C_cpu = A_cpu @ B_cpu

# Print input matrices (NumPy abbreviates large arrays with "...").
print("Matrix A:")
print(A_cpu)
print("\nMatrix B:")
print(B_cpu)

Matrix A:
[[28. 30. 34. ... 29. 66. 71.]
 [15. 40. 27. ... 88. 76. 83.]
 [91.  0. 75. ... 60. 90. 84.]
 ...
 [82. 75. 45. ... 62. 69. 37.]
 [30. 16. 88. ... 17. 19. 39.]
 [74. 33. 53. ... 38. 67. 79.]]

Matrix B:
[[66. 43. 54. ... 75. 55. 21.]
 [90. 22. 80. ... 57. 11. 84.]
 [27. 36. 31. ... 13. 86. 25.]
 ...
 [ 7. 12. 64. ...  5.  8. 83.]
 [12.  7. 34. ... 39. 53. 30.]
 [94. 54. 16. ... 92. 61. 96.]]


In [8]:
# Copy the host (NumPy) matrices to the GPU as CuPy arrays.
A_gpu = cp.array(A_cpu)
B_gpu = cp.array(B_cpu)

# Warm-up: run the product once, untimed, and wait for it to finish, so that
# one-time first-call overhead is not attributed to the measured multiplication.
A_gpu @ B_gpu
cp.cuda.get_current_stream().synchronize()

# Time the GPU multiplication with CUDA events recorded around the operation.
start = cp.cuda.Event()
end = cp.cuda.Event()
start.record()

C_gpu = A_gpu @ B_gpu

end.record()
# GPU work is asynchronous: wait for the end event before reading the timer.
end.synchronize()
gpu_time = cp.cuda.get_elapsed_time(start, end)  # reported below as milliseconds

# Bring result back to CPU as a NumPy array.
C_result = cp.asnumpy(C_gpu)

# Print results
print("\nCPU Result (C = A @ B):")
print(C_cpu)
print("\nGPU Result:")
print(C_result)


CPU Result (C = A @ B):
[[309028. 316748. 321881. ... 301639. 313374. 319015.]
 [280164. 312020. 308720. ... 272032. 284370. 311517.]
 [308866. 295653. 303233. ... 313308. 317096. 316020.]
 ...
 [322095. 335452. 344794. ... 334664. 328340. 328364.]
 [286842. 306860. 285753. ... 283615. 290674. 285059.]
 [331694. 319110. 325290. ... 327280. 330593. 328361.]]

GPU Result:
[[309028. 316748. 321881. ... 301639. 313374. 319015.]
 [280164. 312020. 308720. ... 272032. 284370. 311517.]
 [308866. 295653. 303233. ... 313308. 317096. 316020.]
 ...
 [322095. 335452. 344794. ... 334664. 328340. 328364.]
 [286842. 306860. 285753. ... 283615. 290674. 285059.]
 [331694. 319110. 325290. ... 327280. 330593. 328361.]]


In [9]:
# Total error: sum of the *absolute* element-wise differences between the GPU
# and CPU results. Absolute values are required because signed differences can
# cancel each other out and report 0 even when the two results disagree.
total_error = np.sum(np.abs(C_result - C_cpu))
print(f"\nTotal Error: {total_error:.6f}")
print(f"Time Elapsed on GPU (ms): {gpu_time:.4f}")



Total Error: 0.000000
Time Elapsed on GPU (ms): 0.5437
