# From Zero to CUDA: Python Acceleration 101

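Welcome to this beginner-friendly walkthrough of accelerating Python with CUDA! We'll explore how to run simple operations on your GPU using libraries like CuPy and Numba. Perfect for entry-level data scientists and ML enthusiasts! **Prerequisite:** you need a machine (or hosted runtime) with an NVIDIA GPU and a CUDA 11.x-compatible setup, because the install cell below uses the `cupy-cuda11x` wheel.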

In [None]:
# Install dependencies
!pip install cupy-cuda11x numba matplotlib --quiet


## Step 1: Check if GPU is available

In [None]:
import cupy as cp

# Report explicitly whether a usable CUDA device is visible to CuPy; the
# remaining cells of this notebook run their work on the GPU.
gpu_available = cp.cuda.is_available()
print("CUDA GPU available:", gpu_available)
if gpu_available:
    print("Number of CUDA devices:", cp.cuda.runtime.getDeviceCount())

# Print CuPy's installation/runtime configuration for troubleshooting.
cp.show_config()

## Step 2: Basic CuPy Array Operation

In [None]:
# Build a small integer range as a CuPy (GPU) array, then square it element-wise.
x_gpu = cp.arange(10)
x_gpu_squared = cp.square(x_gpu)

print("GPU Array:", x_gpu)
print("Squared:", x_gpu_squared)

## Step 3: Compare CPU vs GPU for Matrix Multiplication

In [None]:
import numpy as np
import time

# CPU baseline. NumPy runs synchronously, so timing the call directly is valid.
a_cpu = np.random.rand(1000, 1000)
b_cpu = np.random.rand(1000, 1000)
start_cpu = time.perf_counter()  # monotonic, high-resolution benchmarking clock
c_cpu = np.dot(a_cpu, b_cpu)
cpu_time = time.perf_counter() - start_cpu
print("CPU Time:", cpu_time)

# GPU
a_gpu = cp.random.rand(1000, 1000)
b_gpu = cp.random.rand(1000, 1000)

# Warm-up run: the first GPU call pays one-time initialisation costs that
# should not be counted as matrix-multiplication time.
cp.dot(a_gpu, b_gpu)
# CuPy queues GPU work asynchronously and returns immediately. Wait for all
# pending work (random generation + warm-up) before starting the clock.
cp.cuda.get_current_stream().synchronize()

start_gpu = time.perf_counter()
c_gpu = cp.dot(a_gpu, b_gpu)
# Without this synchronisation only the kernel *launch* would be timed,
# not the actual computation.
cp.cuda.get_current_stream().synchronize()
gpu_time = time.perf_counter() - start_gpu
print("GPU Time:", gpu_time)

## Step 4: Accelerating Python Functions with Numba + CUDA

In [None]:
from numba import cuda
import numpy as np

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise addition kernel: ``out[i] = x[i] + y[i]``.

    Each thread handles at most one element, selected by its 1-D grid index.
    Results are written into ``out``; nothing is returned.
    """
    i = cuda.grid(1)
    # Threads whose index falls past the end of the input have nothing to do.
    if i >= x.size:
        return
    out[i] = x[i] + y[i]

n = 1024
x = np.arange(n, dtype=np.float32)
y = np.arange(n, dtype=np.float32)

# Copy the inputs to the GPU once and allocate the result buffer there.
# Passing host NumPy arrays straight to a kernel makes Numba transfer every
# argument to the device and back on each launch, including read-only inputs.
x_dev = cuda.to_device(x)
y_dev = cuda.to_device(y)
out_dev = cuda.device_array_like(x)

threads_per_block = 128
# Ceiling division so a final, partially filled block covers any tail elements.
blocks_per_grid = (x.size + (threads_per_block - 1)) // threads_per_block

add_kernel[blocks_per_grid, threads_per_block](x_dev, y_dev, out_dev)

# Bring only the result back to the host.
out = out_dev.copy_to_host()

# Sanity-check the GPU result against the NumPy reference before displaying it.
np.testing.assert_array_equal(out, x + y)
print("Output[:10]:", out[:10])

## Step 5: Visualizing Performance

In [None]:
import matplotlib.pyplot as plt

labels = ['CPU', 'GPU']
times = [0.3, 0.05]  # Example values (illustrative placeholders, not measurements)

# Draw the comparison on an explicit Axes object.
fig, ax = plt.subplots()
ax.bar(labels, times, color=['orange', 'green'])
ax.set_ylabel('Time (s)')
ax.set_title('Matrix Multiplication Speed Comparison')
plt.show()

## 🎉 You're CUDA-Ready!

Congrats on running your first CUDA-powered Python notebook. Keep experimenting and happy coding!