In [None]:
# ‚öôÔ∏è Setup
import subprocess
import sys

# google.colab is only importable inside Colab; anywhere else the install
# step is skipped and the local environment is used as-is.
try:
    import google.colab
except ImportError:
    pass
else:
    pip_install = [sys.executable, "-m", "pip", "install", "-q", "numba", "torch"]
    subprocess.check_call(pip_install)

# PyTorch is optional for reading along: report what is available.
try:
    import torch
except ImportError:
    print("PyTorch not available - examples will be conceptual")
else:
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

print("\n‚ö†Ô∏è  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Extension Architecture

### PyTorch Extension Structure

```
my_extension/
├── setup.py           # Build configuration
├── my_ext.cpp         # C++ bindings
├── my_ext_cuda.cu     # CUDA kernels
└── my_ext/
    └── __init__.py    # Python interface
    
Flow:
  Python → C++ binding → CUDA kernel → C++ binding → Python
```

### Required Headers

```cpp
#include <torch/extension.h>  // Main PyTorch C++ API
#include <cuda.h>
#include <cuda_runtime.h>
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile fused_gelu_cuda.cu
// fused_gelu_cuda.cu - Custom CUDA kernel for PyTorch
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

// GELU constants (tanh approximation of GELU)
// NOTE: despite its name, SQRT_2_PI holds sqrt(2/pi) ~= 0.79788, not sqrt(2*pi).
#define SQRT_2_PI 0.7978845608028654f
// Coefficient of the cubic term in the polynomial x + GELU_COEF * x^3.
#define GELU_COEF 0.044715f

// Forward kernel: output[i] = GELU(input[i] + bias[i % hidden_size])
//
// Each thread handles at most one element of the flat
// batch_size * hidden_size buffer; the bias entry is selected by the
// element's position within its row.
__global__ void fused_gelu_bias_forward_kernel(
    const float* __restrict__ input,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int batch_size,
    int hidden_size
) {
    const int elem = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall past the end of the buffer have nothing to do.
    if (elem >= batch_size * hidden_size) {
        return;
    }

    const float v = input[elem] + bias[elem % hidden_size];

    // GELU(v) = 0.5 * v * (1 + tanh(sqrt(2/pi) * (v + 0.044715 * v^3)))
    const float cubic = v * v * v;
    const float tanh_arg = SQRT_2_PI * (v + GELU_COEF * cubic);
    output[elem] = 0.5f * v * (1.0f + tanhf(tanh_arg));
}

// Backward kernel for y = GELU(input + bias) (tanh approximation).
//
// For every element i of the flat batch_size * hidden_size buffer:
//   grad_input[i]               = grad_output[i] * GELU'(input[i] + bias[i % hidden_size])
//   grad_bias[i % hidden_size] += the same value (via atomicAdd)
//
// grad_bias is only ever accumulated into, so the caller must zero it before
// the launch. Because many threads add into the same bias slot with float
// atomics, the order of the additions is not fixed.
__global__ void fused_gelu_bias_backward_kernel(
    const float* __restrict__ grad_output,
    const float* __restrict__ input,
    const float* __restrict__ bias,
    float* __restrict__ grad_input,
    float* __restrict__ grad_bias,  // Atomically accumulated
    int batch_size,
    int hidden_size
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = batch_size * hidden_size;

    if (idx < total) {
        int bias_idx = idx % hidden_size;
        float x = input[idx] + bias[bias_idx];
        float g = grad_output[idx];

        // u(x)    = sqrt(2/pi) * (x + c * x^3)
        // GELU(x) = 0.5 * x * (1 + tanh(u))
        float x3 = x * x * x;
        float inner = SQRT_2_PI * (x + GELU_COEF * x3);
        float tanh_inner = tanhf(inner);
        float sech2 = 1.0f - tanh_inner * tanh_inner;  // d tanh(u) / du

        // Product rule: GELU'(x) = 0.5 * (1 + tanh(u)) + 0.5 * x * sech^2(u) * u'(x)
        // with u'(x) = sqrt(2/pi) * (1 + 3 * c * x^2).
        // (The forward value itself is not needed here, so it is not recomputed.)
        float dgelu_dx = 0.5f * (1.0f + tanh_inner) +
                         0.5f * x * sech2 * SQRT_2_PI * (1.0f + 3.0f * GELU_COEF * x * x);

        // d(x)/d(input) and d(x)/d(bias) are both 1, so the same value feeds
        // grad_input directly and grad_bias after reduction over the batch.
        float grad = g * dgelu_dx;
        grad_input[idx] = grad;
        atomicAdd(&grad_bias[bias_idx], grad);
    }
}

// C++ wrapper functions

// Host-side launcher for fused_gelu_bias_forward_kernel.
//
// input : 2-D float32 CUDA tensor, [batch_size, hidden_size]
// bias  : 1-D float32 CUDA tensor, [hidden_size], on the same device
// Returns a new contiguous tensor shaped like `input` holding GELU(input + bias).
// Raises (via TORCH_CHECK) on invalid arguments or a failed kernel launch.
//
// NOTE(review): the kernel is launched without an explicit stream or device
// guard, i.e. on the default stream of the current device. Using PyTorch's
// current stream / a device guard would need additional ATen/c10 CUDA headers.
torch::Tensor fused_gelu_bias_forward(
    torch::Tensor input,
    torch::Tensor bias
) {
    const int threads = 256;

    // Raw device pointers are handed to the kernel, so validate everything
    // the kernel silently assumes.
    TORCH_CHECK(input.is_cuda() && bias.is_cuda(),
                "fused_gelu_bias_forward: input and bias must be CUDA tensors");
    TORCH_CHECK(input.device() == bias.device(),
                "fused_gelu_bias_forward: input and bias must be on the same device");
    TORCH_CHECK(input.dtype() == torch::kFloat && bias.dtype() == torch::kFloat,
                "fused_gelu_bias_forward: input and bias must be FP32");
    TORCH_CHECK(input.dim() == 2,
                "fused_gelu_bias_forward: input must be 2-D [batch, hidden], got ",
                input.dim(), "-D");
    TORCH_CHECK(bias.dim() == 1 && bias.size(0) == input.size(1),
                "fused_gelu_bias_forward: bias must be 1-D with ", input.size(1),
                " elements");
    // The kernel computes its element index in a 32-bit int.
    TORCH_CHECK(input.numel() + threads <= (1LL << 31),
                "fused_gelu_bias_forward: too many elements for 32-bit indexing");

    // The kernel indexes flat, densely packed buffers.
    input = input.contiguous();
    bias = bias.contiguous();

    const int batch_size = static_cast<int>(input.size(0));
    const int hidden_size = static_cast<int>(input.size(1));

    auto output = torch::empty_like(input);

    // A grid with zero blocks is an invalid launch configuration.
    if (input.numel() == 0) {
        return output;
    }

    const int blocks = static_cast<int>((input.numel() + threads - 1) / threads);

    fused_gelu_bias_forward_kernel<<<blocks, threads>>>(
        input.data_ptr<float>(),
        bias.data_ptr<float>(),
        output.data_ptr<float>(),
        batch_size,
        hidden_size
    );

    // Kernel launches do not throw; surface configuration errors explicitly.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "fused_gelu_bias_forward_kernel launch failed: ",
                cudaGetErrorString(launch_status));

    return output;
}

// Host-side launcher for fused_gelu_bias_backward_kernel.
//
// grad_output : float32 CUDA tensor with the same shape as `input`
// input       : 2-D float32 CUDA tensor, [batch_size, hidden_size]
// bias        : 1-D float32 CUDA tensor, [hidden_size]
// Returns {grad_input, grad_bias}; grad_input is shaped like `input`,
// grad_bias like `bias`.
// Raises (via TORCH_CHECK) on invalid arguments or a failed kernel launch.
//
// NOTE(review): like the forward launcher, this launches on the default
// stream of the current device.
std::vector<torch::Tensor> fused_gelu_bias_backward(
    torch::Tensor grad_output,
    torch::Tensor input,
    torch::Tensor bias
) {
    const int threads = 256;

    TORCH_CHECK(grad_output.is_cuda() && input.is_cuda() && bias.is_cuda(),
                "fused_gelu_bias_backward: all tensors must be CUDA tensors");
    TORCH_CHECK(grad_output.device() == input.device() && input.device() == bias.device(),
                "fused_gelu_bias_backward: all tensors must be on the same device");
    TORCH_CHECK(grad_output.dtype() == torch::kFloat &&
                input.dtype() == torch::kFloat &&
                bias.dtype() == torch::kFloat,
                "fused_gelu_bias_backward: all tensors must be FP32");
    TORCH_CHECK(input.dim() == 2,
                "fused_gelu_bias_backward: input must be 2-D [batch, hidden], got ",
                input.dim(), "-D");
    TORCH_CHECK(bias.dim() == 1 && bias.size(0) == input.size(1),
                "fused_gelu_bias_backward: bias must be 1-D with ", input.size(1),
                " elements");
    TORCH_CHECK(grad_output.sizes() == input.sizes(),
                "fused_gelu_bias_backward: grad_output must have the same shape as input");
    // The kernel computes its element index in a 32-bit int.
    TORCH_CHECK(input.numel() + threads <= (1LL << 31),
                "fused_gelu_bias_backward: too many elements for 32-bit indexing");

    // The kernel indexes flat, densely packed buffers.
    grad_output = grad_output.contiguous();
    input = input.contiguous();
    bias = bias.contiguous();

    const int batch_size = static_cast<int>(input.size(0));
    const int hidden_size = static_cast<int>(input.size(1));

    auto grad_input = torch::empty_like(input);
    // Must start at zero: the kernel accumulates into it with atomicAdd.
    auto grad_bias = torch::zeros_like(bias);

    // A grid with zero blocks is an invalid launch configuration.
    if (input.numel() == 0) {
        return {grad_input, grad_bias};
    }

    const int blocks = static_cast<int>((input.numel() + threads - 1) / threads);

    fused_gelu_bias_backward_kernel<<<blocks, threads>>>(
        grad_output.data_ptr<float>(),
        input.data_ptr<float>(),
        bias.data_ptr<float>(),
        grad_input.data_ptr<float>(),
        grad_bias.data_ptr<float>(),
        batch_size,
        hidden_size
    );

    // Kernel launches do not throw; surface configuration errors explicitly.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "fused_gelu_bias_backward_kernel launch failed: ",
                cudaGetErrorString(launch_status));

    return {grad_input, grad_bias};
}

// Python bindings: the two host launchers become `<module>.forward` and
// `<module>.backward`, where the module name is supplied by the build through
// TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward",
        &fused_gelu_bias_forward,
        "Fused GELU+Bias forward"
    );
    m.def(
        "backward",
        &fused_gelu_bias_backward,
        "Fused GELU+Bias backward"
    );
}

---

## Part 2: Build Configuration

### setup.py

In [None]:
%%writefile setup.py
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Build script: compiles fused_gelu_cuda.cu into the importable extension
# module `fused_gelu_cuda`.
#
# No `-arch` flag is passed to nvcc on purpose. A hard-coded `-arch=sm_75`
# suppresses PyTorch's own architecture selection and yields a binary that
# cannot run on GPUs below compute capability 7.5. Without it,
# torch.utils.cpp_extension targets the GPUs visible at build time (plus PTX);
# set the TORCH_CUDA_ARCH_LIST environment variable to choose explicitly,
# e.g. TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0".
setup(
    name='fused_gelu',
    ext_modules=[
        CUDAExtension(
            name='fused_gelu_cuda',
            sources=['fused_gelu_cuda.cu'],
            extra_compile_args={
                'cxx': ['-O3'],
                'nvcc': ['-O3'],
            },
        )
    ],
    cmdclass={'build_ext': BuildExtension},
)

### JIT Compilation (Alternative)

```python
from torch.utils.cpp_extension import load

# Compile at runtime
fused_gelu = load(
    name='fused_gelu',
    sources=['fused_gelu_cuda.cu'],
    extra_cuda_cflags=['-O3', '-arch=sm_75']
)
```

---

## Part 3: Autograd Integration

### Custom autograd.Function

In [None]:
%%writefile fused_gelu_module.py
# fused_gelu_module.py - Python wrapper with autograd
import torch
from torch.autograd import Function

# Import the compiled extension (optional: a pure-PyTorch fallback is used
# whenever it is missing or cannot handle the given tensors).
try:
    import fused_gelu_cuda
except ImportError:
    fused_gelu_cuda = None
    print("Warning: fused_gelu_cuda not compiled")

# Constants of the tanh approximation of GELU used by the fallback path.
_SQRT_2_OVER_PI = 0.7978845608
_GELU_COEF = 0.044715


def _can_use_cuda_kernel(input, bias):
    """Return True when the compiled kernels can process these tensors.

    The extension reads raw float pointers and takes the batch and hidden
    sizes from ``input.size(0)`` / ``input.size(1)``, so it is only used for
    2-D float32 CUDA inputs with a matching 1-D float32 CUDA bias. Everything
    else (CPU tensors, other dtypes, other ranks) goes through the fallback.
    """
    return (
        fused_gelu_cuda is not None
        and input.is_cuda
        and bias.is_cuda
        and input.dtype == torch.float32
        and bias.dtype == torch.float32
        and input.dim() == 2
        and bias.dim() == 1
        and bias.size(0) == input.size(1)
    )


class FusedGELUBias(Function):
    """Autograd function computing ``GELU(input + bias)`` (tanh approximation).

    Uses the compiled ``fused_gelu_cuda`` kernels when they are available and
    applicable, and an equivalent PyTorch implementation otherwise.
    """

    @staticmethod
    def forward(ctx, input, bias):
        """Return ``GELU(input + bias)``; saves both tensors for backward."""
        # Save for backward
        ctx.save_for_backward(input, bias)

        if _can_use_cuda_kernel(input, bias):
            # The kernels index flat, densely packed buffers.
            return fused_gelu_cuda.forward(input.contiguous(), bias.contiguous())

        # Fallback to PyTorch
        x = input + bias
        return 0.5 * x * (1 + torch.tanh(
            _SQRT_2_OVER_PI * (x + _GELU_COEF * x ** 3)
        ))

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_bias)`` for the incoming ``grad_output``."""
        input, bias = ctx.saved_tensors

        if _can_use_cuda_kernel(input, bias):
            grad_input, grad_bias = fused_gelu_cuda.backward(
                grad_output.contiguous(), input.contiguous(), bias.contiguous()
            )
            return grad_input, grad_bias

        # Fallback implementation
        x = input + bias
        inner = _SQRT_2_OVER_PI * (x + _GELU_COEF * x ** 3)
        tanh_inner = torch.tanh(inner)
        sech2 = 1 - tanh_inner ** 2
        # Product rule on 0.5 * x * (1 + tanh(inner(x))).
        dgelu = 0.5 * (1 + tanh_inner) + \
                0.5 * x * sech2 * _SQRT_2_OVER_PI * (1 + 3 * _GELU_COEF * x ** 2)
        grad_input = grad_output * dgelu
        # The bias was broadcast over every leading dimension, so its gradient
        # is the sum over exactly those dimensions. A plain sum(dim=0) is only
        # right for 2-D inputs.
        grad_bias = grad_input.sum_to_size(bias.shape)

        return grad_input, grad_bias

class FusedGELUBiasModule(torch.nn.Module):
    """Layer that applies the fused bias-add + GELU op with a learnable bias."""

    def __init__(self, hidden_size):
        """Create the layer with a zero-initialised bias of ``hidden_size`` entries."""
        super().__init__()
        initial_bias = torch.zeros(hidden_size)
        self.bias = torch.nn.Parameter(initial_bias)

    def forward(self, x):
        """Run ``x`` and the layer's bias through ``FusedGELUBias``."""
        activated = FusedGELUBias.apply(x, self.bias)
        return activated

# Test if running directly
if __name__ == "__main__":
    # Smoke test: one forward and one backward pass through the module.
    # Use the GPU when one is present; otherwise run on the CPU instead of
    # crashing on a hard-coded 'cuda' device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    x = torch.randn(32, 768, device=device, requires_grad=True)
    module = FusedGELUBiasModule(768).to(device)

    y = module(x)
    loss = y.sum()
    loss.backward()

    print(f"Input: {x.shape}")
    print(f"Output: {y.shape}")
    print(f"Gradient: {x.grad.shape}")
    print("‚úì Autograd integration working!")

---

## Part 4: Half Precision Support

### Adding FP16 Kernels

In [None]:
%%writefile fused_gelu_fp16.cu
// fused_gelu_fp16.cu - Half precision variant
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Constants of the tanh approximation of GELU.
// NOTE: despite its name, SQRT_2_PI holds sqrt(2/pi) ~= 0.79788, not sqrt(2*pi).
#define SQRT_2_PI 0.7978845608028654f
// Coefficient of the cubic term in the polynomial x + GELU_COEF * x^3.
#define GELU_COEF 0.044715f

// Half precision forward kernel: output[i] = GELU(input[i] + bias[i % hidden_size]).
//
// Loads and stores are FP16; the bias add and the GELU arithmetic are done in
// FP32 and the result is converted back to half on store.
__global__ void fused_gelu_bias_forward_fp16(
    const half* __restrict__ input,
    const half* __restrict__ bias,
    half* __restrict__ output,
    int batch_size,
    int hidden_size
) {
    const int elem = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall past the end of the buffer have nothing to do.
    if (elem >= batch_size * hidden_size) {
        return;
    }

    // Widen both operands first so the sum itself is formed in FP32.
    const float in_f = __half2float(input[elem]);
    const float bias_f = __half2float(bias[elem % hidden_size]);
    const float v = in_f + bias_f;

    const float cubic = v * v * v;
    const float tanh_arg = SQRT_2_PI * (v + GELU_COEF * cubic);
    const float activated = 0.5f * v * (1.0f + tanhf(tanh_arg));

    output[elem] = __float2half(activated);
}

// tanh-approximation GELU of a single FP32 value.
static __device__ __forceinline__ float gelu_tanh_f32(float v) {
    const float cubic = v * v * v;
    const float tanh_arg = SQRT_2_PI * (v + GELU_COEF * cubic);
    return 0.5f * v * (1.0f + tanhf(tanh_arg));
}

// Vectorized version: each thread loads, processes and stores one half2
// (two adjacent FP16 elements).
//
// num_pairs     : number of half2 elements in input/output (= batch * hidden / 2)
// pairs_per_row : number of half2 elements in bias          (= hidden / 2)
//
// Requires an even hidden size, so that a pair never straddles two rows and
// input pair `idx` lines up with bias pair `idx % pairs_per_row`.
// The bias add is done in FP32 after widening, exactly as in the scalar
// kernel, so both code paths produce the same values.
__global__ void fused_gelu_bias_forward_fp16_vec2(
    const half2* __restrict__ input,
    const half2* __restrict__ bias,
    half2* __restrict__ output,
    int num_pairs,
    int pairs_per_row
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < num_pairs) {
        // Convert to float2 for computation
        const float2 in_f = __half22float2(input[idx]);
        const float2 bias_f = __half22float2(bias[idx % pairs_per_row]);

        const float result1 = gelu_tanh_f32(in_f.x + bias_f.x);
        const float result2 = gelu_tanh_f32(in_f.y + bias_f.y);

        output[idx] = __floats2half2_rn(result1, result2);
    }
}

// Host-side launcher for the FP16 forward kernels.
//
// input : 2-D FP16 CUDA tensor, [batch_size, hidden_size]
// bias  : 1-D FP16 CUDA tensor, [hidden_size], on the same device
// Returns a new contiguous FP16 tensor shaped like `input` holding
// GELU(input + bias). The half2 kernel is used when the hidden size is even
// and all buffers are half2-aligned; otherwise the scalar kernel is used.
// Raises (via TORCH_CHECK) on invalid arguments or a failed kernel launch.
//
// NOTE(review): the kernels are launched on the default stream of the
// current device (no explicit stream or device guard).
torch::Tensor fused_gelu_bias_forward_half(
    torch::Tensor input,
    torch::Tensor bias
) {
    const int threads = 256;

    TORCH_CHECK(input.dtype() == torch::kHalf, "Input must be FP16");
    TORCH_CHECK(bias.dtype() == torch::kHalf, "Bias must be FP16");
    TORCH_CHECK(input.is_cuda() && bias.is_cuda(),
                "Input and bias must be CUDA tensors");
    TORCH_CHECK(input.device() == bias.device(),
                "Input and bias must be on the same device");
    TORCH_CHECK(input.dim() == 2,
                "Input must be 2-D [batch, hidden], got ", input.dim(), "-D");
    TORCH_CHECK(bias.dim() == 1 && bias.size(0) == input.size(1),
                "Bias must be 1-D with ", input.size(1), " elements");
    // The kernels compute their element index in a 32-bit int.
    TORCH_CHECK(input.numel() + threads <= (1LL << 31),
                "Too many elements for 32-bit indexing");

    // The kernels index flat, densely packed buffers.
    input = input.contiguous();
    bias = bias.contiguous();

    const int batch_size = static_cast<int>(input.size(0));
    const int hidden_size = static_cast<int>(input.size(1));
    const int total = batch_size * hidden_size;

    auto output = torch::empty_like(input);

    // A grid with zero blocks is an invalid launch configuration.
    if (total == 0) {
        return output;
    }

    at::Half* in_ptr = input.data_ptr<at::Half>();
    at::Half* bias_ptr = bias.data_ptr<at::Half>();
    at::Half* out_ptr = output.data_ptr<at::Half>();

    // half2 loads and stores need half2 alignment; a tensor view that starts
    // at an odd element offset does not provide it.
    const auto is_half2_aligned = [](const void* p) {
        return reinterpret_cast<size_t>(p) % alignof(half2) == 0;
    };

    // An even *hidden* size (not just an even total) is required: otherwise
    // pairs straddle row boundaries and no longer line up with bias pairs.
    const bool use_vec2 = (hidden_size % 2 == 0) &&
                          is_half2_aligned(in_ptr) &&
                          is_half2_aligned(bias_ptr) &&
                          is_half2_aligned(out_ptr);

    if (use_vec2) {
        const int num_pairs = total / 2;
        const int blocks = (num_pairs + threads - 1) / threads;

        fused_gelu_bias_forward_fp16_vec2<<<blocks, threads>>>(
            reinterpret_cast<half2*>(in_ptr),
            reinterpret_cast<half2*>(bias_ptr),
            reinterpret_cast<half2*>(out_ptr),
            num_pairs,
            hidden_size / 2
        );
    } else {
        const int blocks = (total + threads - 1) / threads;

        fused_gelu_bias_forward_fp16<<<blocks, threads>>>(
            reinterpret_cast<half*>(in_ptr),
            reinterpret_cast<half*>(bias_ptr),
            reinterpret_cast<half*>(out_ptr),
            batch_size,
            hidden_size
        );
    }

    // Kernel launches do not throw; surface configuration errors explicitly.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "fused_gelu_bias_forward_half kernel launch failed: ",
                cudaGetErrorString(launch_status));

    return output;
}

// Python bindings: the FP16 launcher becomes `<module>.forward_half`, where
// the module name is supplied by the build through TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward_half",
        &fused_gelu_bias_forward_half,
        "Fused GELU+Bias FP16"
    );
}

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile pytorch_ext_exercises.cu
// CUDA C++ Exercises - PyTorch Extensions
#include <cuda_runtime.h>
#include <stdio.h>

// Exercise 1: Implement a custom activation function kernel
// TODO: Add your implementation here

// Exercise 2: Add FP16 support with half2 vectorization
// TODO: Add your implementation here

// Exercise 3: Create backward pass for autograd support
// TODO: Add your implementation here

// Entry point of the exercise scaffold: prints a banner and a reminder to
// implement the exercises, then exits successfully.
int main() {
    puts("=== PyTorch Extension Exercises ===");
    puts("Implement the exercises above and run!");
    return 0;
}

In [None]:
# Compile the exercise file written by the previous cell and, if the build
# succeeds, run the resulting binary.
# -arch=sm_75 targets compute capability 7.5 GPUs; change it if your GPU differs.
!nvcc -arch=sm_75 -o pytorch_ext_exercises pytorch_ext_exercises.cu && ./pytorch_ext_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises use Python and Numba for rapid prototyping. Complete the CUDA C++ exercises above first for the primary learning objectives.

---

## Summary

### PyTorch Extension Workflow

1. **Write CUDA kernels** in `.cu` files
2. **Create C++ bindings** with `torch/extension.h`
3. **Build** with `setup.py` or JIT compilation
4. **Wrap in autograd.Function** for training support
5. **Create nn.Module** for easy use

### Best Practices

✅ **Do**:
- Support both FP32 and FP16
- Add proper error checking with `TORCH_CHECK`
- Provide Python fallback for debugging
- Use vectorized operations (half2, float4)

❌ **Don't**:
- Forget to make tensors contiguous
- Ignore CUDA errors
- Hardcode block sizes for all GPUs