## Why Texture Memory?

```
┌────────────────────────────────────────────────────────────────┐
│                    Texture Memory Benefits                      │
├────────────────────────────────────────────────────────────────┤
│                                                                │
│  1. HARDWARE CACHING                                           │
│     └─ Optimized for 2D spatial locality                       │
│     └─ Read-only texture path (unified with L1 on modern GPUs) │
│                                                                │
│  2. FREE INTERPOLATION                                         │
│     └─ Linear/bilinear/trilinear in hardware                  │
│     └─ Zero extra instructions                                 │
│                                                                │
│  3. AUTOMATIC ADDRESSING                                       │
│     └─ Clamp, wrap, mirror modes                              │
│     └─ No bounds checking in kernel                           │
│                                                                │
│  4. NORMALIZED COORDINATES                                     │
│     └─ [0,1] range regardless of dimensions                   │
│     └─ Resolution-independent sampling                        │
│                                                                │
└────────────────────────────────────────────────────────────────┘
```

In [None]:
!nvidia-smi --query-gpu=name,memory.total --format=csv

## Texture Object API

Modern CUDA uses **texture objects** (not deprecated texture references):

```cpp
// 1. Create CUDA array (optimal layout for textures)
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray_t cuArray;
cudaMallocArray(&cuArray, &channelDesc, width, height);
cudaMemcpy2DToArray(cuArray, ...);

// 2. Specify texture resource
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;

// 3. Specify texture parameters
cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeClamp;  // U coordinate
texDesc.addressMode[1] = cudaAddressModeClamp;  // V coordinate
texDesc.filterMode = cudaFilterModeLinear;      // Bilinear interp
texDesc.normalizedCoords = true;                // [0,1] range

// 4. Create texture object
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

// 5. Use in kernel
__global__ void kernel(cudaTextureObject_t tex) {
    float val = tex2D<float>(tex, u, v);  // Hardware sampling!
}
```

## Example 1: Basic 1D Texture

In [None]:
%%writefile texture_1d.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Abort-on-failure wrapper for CUDA runtime calls. Evaluates the wrapped
// expression exactly once; if the result is not cudaSuccess it prints the
// source file, line and the runtime's error string, then exits with status 1.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t status_ = (call); \
        if (status_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(status_)); \
            exit(1); \
        } \
    } while(0)

// Point-samples a texture through tex1Dfetch.
//   tex        - texture object to read from
//   output     - receives one float per sample; must hold numSamples elements
//   numSamples - number of samples to produce
//   scale      - multiplier applied to the sample index to get the read position
// Expects a 1D launch with at least numSamples threads in total; threads past
// the end do nothing.
__global__ void sampleTexture1D(cudaTextureObject_t tex, float* output,
                                 int numSamples, float scale) {
    const int sampleIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (sampleIdx >= numSamples) {
        return;
    }

    // tex1Dfetch takes an integer texel index, so the fractional part of the
    // scaled position is truncated rather than interpolated.
    const float position = sampleIdx * scale;
    const int texel = (int)position;
    output[sampleIdx] = tex1Dfetch<float>(tex, texel);
}

// Samples a 1D texture at numSamples evenly spaced coordinates using tex1D.
//   tex        - texture object to read from
//   output     - receives one float per sample; must hold numSamples elements
//   numSamples - number of samples; sample i is read at u = i / numSamples,
//                so u lies in [0, 1)
//   dataSize   - accepted for the caller's convenience but not used here
// Whether the returned value is interpolated depends on the filter mode the
// texture object was created with.
// NOTE(review): presumably meant for a texture created with normalizedCoords
// enabled and linear filtering; no caller is visible here, so confirm.
__global__ void sampleTexture1DLinear(cudaTextureObject_t tex, float* output,
                                       int numSamples, int dataSize) {
    const int sampleIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (sampleIdx >= numSamples) {
        return;
    }

    const float u = (float)sampleIdx / numSamples;
    output[sampleIdx] = tex1D<float>(tex, u);
}

// Host driver for the 1D texture demo.
// Uploads an 8-element float array, binds it to a texture object backed by
// linear device memory (point filtering, clamp addressing), runs
// sampleTexture1D with one thread per output sample, and prints the results.
// Returns 0 on success; any CUDA failure terminates the process via CHECK_CUDA.
int main() {
    printf("=== 1D Texture Demo ===\n\n");
    
    const int DATA_SIZE = 8;
    const int NUM_SAMPLES = 16;
    // Step (in texels) between consecutive output samples. Shared by the
    // kernel launch and the printout so the two cannot drift apart.
    const float SCALE = 0.5f;
    
    // Create source data
    float h_data[DATA_SIZE] = {0.0f, 1.0f, 2.0f, 4.0f, 8.0f, 4.0f, 2.0f, 1.0f};
    printf("Source data (%d elements): ", DATA_SIZE);
    for (int i = 0; i < DATA_SIZE; i++) printf("%.1f ", h_data[i]);
    printf("\n\n");
    
    // Allocate device memory (for linear memory texture)
    float* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, DATA_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_data, h_data, DATA_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    
    // Create texture object from linear memory
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_data;
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = DATA_SIZE * sizeof(float);
    
    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModePoint;  // No interpolation
    texDesc.readMode = cudaReadModeElementType;
    
    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    
    // Sample texture
    float* d_output;
    CHECK_CUDA(cudaMalloc(&d_output, NUM_SAMPLES * sizeof(float)));
    
    // A single block with one thread per sample is enough for this tiny demo.
    sampleTexture1D<<<1, NUM_SAMPLES>>>(tex, d_output, NUM_SAMPLES, SCALE);
    // Launch-configuration errors are only reported through cudaGetLastError.
    CHECK_CUDA(cudaGetLastError());
    // Errors raised while the kernel runs surface at the next synchronization.
    CHECK_CUDA(cudaDeviceSynchronize());
    
    float h_output[NUM_SAMPLES];
    CHECK_CUDA(cudaMemcpy(h_output, d_output, NUM_SAMPLES * sizeof(float), cudaMemcpyDeviceToHost));
    
    printf("Sampled at 0.5x rate (point sampling):\n");
    for (int i = 0; i < NUM_SAMPLES; i++) {
        printf("  sample[%2d] = %.1f (pos %.1f)\n", i, h_output[i], i * SCALE);
    }
    
    // Cleanup: destroy the texture object before freeing the memory it views.
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFree(d_data));
    CHECK_CUDA(cudaFree(d_output));
    
    printf("\n=== Done! ===\n");
    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 texture_1d.cu -o texture_1d && ./texture_1d

## Example 2: 2D Texture with Bilinear Interpolation

In [None]:
%%writefile texture_2d.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Abort-on-failure wrapper for CUDA runtime calls. Evaluates the wrapped
// expression exactly once; if the result is not cudaSuccess it prints the
// source file, line and the runtime's error string, then exits with status 1.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t status_ = (call); \
        if (status_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(status_)); \
            exit(1); \
        } \
    } while(0)

// Resamples a 2D texture onto an outWidth x outHeight grid.
//   tex       - texture object to sample; coordinates passed to it are
//               (pixel + 0.5) / extent, i.e. pixel centres in [0, 1]
//   output    - row-major destination with outWidth * outHeight floats
//   outWidth  - number of output columns
//   outHeight - number of output rows
// Expects a 2D launch covering the output grid; each thread produces one
// output pixel and threads outside the grid do nothing. Any interpolation is
// performed by the texture unit according to the texture's filter mode.
__global__ void upsample2x(cudaTextureObject_t tex, float* output,
                           int outWidth, int outHeight) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outWidth || row >= outHeight) {
        return;
    }

    // The half-pixel offset places the sample at the centre of the output pixel.
    const float u = (col + 0.5f) / outWidth;
    const float v = (row + 0.5f) / outHeight;

    output[row * outWidth + col] = tex2D<float>(tex, u, v);
}

// Host driver for the 2D bilinear upsampling demo.
// Copies a 4x4 float image into a CUDA array, wraps it in a texture object
// (linear filtering, clamp addressing, normalized coordinates), runs
// upsample2x to produce an 8x8 image, and prints input and output.
// Returns 0 on success; any CUDA failure terminates the process via CHECK_CUDA.
int main() {
    printf("=== 2D Texture Bilinear Upsampling Demo ===\n\n");
    
    const int IN_WIDTH = 4, IN_HEIGHT = 4;
    const int OUT_WIDTH = 8, OUT_HEIGHT = 8;
    const int BLOCK_DIM = 16;  // threads per block along each axis
    
    // Create 4x4 input image
    float h_input[IN_WIDTH * IN_HEIGHT] = {
        0, 0, 1, 1,
        0, 0, 1, 1,
        2, 2, 3, 3,
        2, 2, 3, 3
    };
    
    printf("Input (%dx%d):\n", IN_WIDTH, IN_HEIGHT);
    for (int y = 0; y < IN_HEIGHT; y++) {
        printf("  ");
        for (int x = 0; x < IN_WIDTH; x++) {
            printf("%.0f ", h_input[y * IN_WIDTH + x]);
        }
        printf("\n");
    }
    
    // Create CUDA array for optimal texture access
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray_t cuArray;
    CHECK_CUDA(cudaMallocArray(&cuArray, &channelDesc, IN_WIDTH, IN_HEIGHT));
    // Source rows are tightly packed, so the pitch equals the row width in bytes.
    CHECK_CUDA(cudaMemcpy2DToArray(cuArray, 0, 0, h_input, 
                                    IN_WIDTH * sizeof(float),
                                    IN_WIDTH * sizeof(float), IN_HEIGHT,
                                    cudaMemcpyHostToDevice));
    
    // Create texture object
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;
    
    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;  // Bilinear interpolation
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = true;  // Use [0,1] coordinates
    
    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    
    // Allocate output
    float* d_output;
    CHECK_CUDA(cudaMalloc(&d_output, OUT_WIDTH * OUT_HEIGHT * sizeof(float)));
    
    // Upsample 2x with bilinear interpolation.
    // Ceil-divide so the grid covers the output even when its extent is not a
    // multiple of the block size; the kernel bounds-checks the excess threads.
    dim3 block(BLOCK_DIM, BLOCK_DIM);
    dim3 grid((OUT_WIDTH + BLOCK_DIM - 1) / BLOCK_DIM,
              (OUT_HEIGHT + BLOCK_DIM - 1) / BLOCK_DIM);
    upsample2x<<<grid, block>>>(tex, d_output, OUT_WIDTH, OUT_HEIGHT);
    // Launch-configuration errors are only reported through cudaGetLastError.
    CHECK_CUDA(cudaGetLastError());
    // Errors raised while the kernel runs surface at the next synchronization.
    CHECK_CUDA(cudaDeviceSynchronize());
    
    // Get results
    float h_output[OUT_WIDTH * OUT_HEIGHT];
    CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_WIDTH * OUT_HEIGHT * sizeof(float),
                          cudaMemcpyDeviceToHost));
    
    printf("\nOutput (%dx%d) with bilinear interpolation:\n", OUT_WIDTH, OUT_HEIGHT);
    for (int y = 0; y < OUT_HEIGHT; y++) {
        printf("  ");
        for (int x = 0; x < OUT_WIDTH; x++) {
            printf("%4.1f ", h_output[y * OUT_WIDTH + x]);
        }
        printf("\n");
    }
    
    // Cleanup: destroy the texture object before freeing the array it views.
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFreeArray(cuArray));
    CHECK_CUDA(cudaFree(d_output));
    
    printf("\n=== Done! ===\n");
    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 texture_2d.cu -o texture_2d && ./texture_2d

## Example 3: Address Modes

In [None]:
%%writefile texture_addressing.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Abort-on-failure wrapper for CUDA runtime calls. Evaluates the wrapped
// expression exactly once; if the result is not cudaSuccess it prints the
// source file, line and the runtime's error string, then exits with status 1.
// The location is included so a failure can be traced to the offending call.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t status_ = (call); \
        if (status_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(status_)); \
            exit(1); \
        } \
    } while(0)

// Samples a 1D texture at caller-supplied coordinates.
//   tex    - texture object to read from
//   output - receives one float per coordinate; must hold n elements
//   coords - device array of n texture coordinates passed unchanged to tex1D
//   n      - number of coordinates
// Expects a 1D launch with at least n threads in total. The index includes the
// block offset, so the kernel is correct for multi-block launches as well as
// the single-block launch used in this file; surplus threads do nothing.
__global__ void sampleWithCoords(cudaTextureObject_t tex, float* output,
                                  float* coords, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        output[idx] = tex1D<float>(tex, coords[idx]);
    }
}

// Demonstrates one texture address mode on a 1D texture.
//   name     - label printed in front of the results
//   mode     - address mode applied to the texture's single dimension
//   h_data   - host array of dataSize floats used as the texture contents
//   dataSize - number of texels
// Builds a 1D CUDA array from h_data, creates a point-filtered texture with
// normalized coordinates, samples it at a fixed set of coordinates (some
// outside [0, 1]) and prints each coordinate with the value read.
// Any CUDA failure terminates the process via CHECK_CUDA.
void testAddressMode(const char* name, cudaTextureAddressMode mode,
                     float* h_data, int dataSize) {
    const size_t dataBytes = dataSize * sizeof(float);

    // Create array (height 0 requests a 1D array)
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray_t array;
    CHECK_CUDA(cudaMallocArray(&array, &desc, dataSize, 0));
    // cudaMemcpyToArray is deprecated; copy the data as a single row with
    // cudaMemcpy2DToArray instead (pitch == row width, height == 1).
    CHECK_CUDA(cudaMemcpy2DToArray(array, 0, 0, h_data, dataBytes, dataBytes, 1,
                                    cudaMemcpyHostToDevice));
    
    // Create texture
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = array;
    
    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = mode;
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.normalizedCoords = true;
    
    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    
    // Test coordinates (including out of range)
    float h_coords[] = {-0.25f, 0.0f, 0.25f, 0.5f, 0.75f, 1.0f, 1.25f};
    // Derived from the array so the result buffer below always matches it.
    const int numCoords = sizeof(h_coords) / sizeof(h_coords[0]);
    const size_t coordBytes = numCoords * sizeof(float);
    
    float *d_coords, *d_output;
    CHECK_CUDA(cudaMalloc(&d_coords, coordBytes));
    CHECK_CUDA(cudaMalloc(&d_output, coordBytes));
    CHECK_CUDA(cudaMemcpy(d_coords, h_coords, coordBytes, cudaMemcpyHostToDevice));
    
    sampleWithCoords<<<1, numCoords>>>(tex, d_output, d_coords, numCoords);
    // Launch-configuration errors are only reported through cudaGetLastError.
    CHECK_CUDA(cudaGetLastError());
    // Errors raised while the kernel runs surface at the next synchronization.
    CHECK_CUDA(cudaDeviceSynchronize());
    
    float h_output[numCoords];
    CHECK_CUDA(cudaMemcpy(h_output, d_output, coordBytes, cudaMemcpyDeviceToHost));
    
    printf("%s:\n  ", name);
    for (int i = 0; i < numCoords; i++) {
        printf("u=%.2f->%.0f  ", h_coords[i], h_output[i]);
    }
    printf("\n");
    
    // Cleanup: destroy the texture object before freeing the array it views.
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFreeArray(array));
    CHECK_CUDA(cudaFree(d_coords));
    CHECK_CUDA(cudaFree(d_output));
}

// Host driver for the address-mode demo: runs testAddressMode once per
// address mode on the same four-element data set and prints the results.
// Returns 0 on completion.
int main() {
    printf("=== Texture Address Modes ===\n\n");

    float h_data[] = {1, 2, 3, 4};
    const int count = sizeof(h_data) / sizeof(h_data[0]);
    printf("Data: [1, 2, 3, 4] (normalized coords: 0, 0.25, 0.5, 0.75)\n\n");

    // One run per address mode, in the order they are printed.
    testAddressMode("CLAMP (default)", cudaAddressModeClamp, h_data, count);
    testAddressMode("WRAP (repeat)  ", cudaAddressModeWrap, h_data, count);
    testAddressMode("MIRROR         ", cudaAddressModeMirror, h_data, count);
    testAddressMode("BORDER (zero)  ", cudaAddressModeBorder, h_data, count);

    printf("\n");
    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 texture_addressing.cu -o texture_addressing && ./texture_addressing

## Performance: Texture vs Global Memory

In [None]:
%%writefile texture_perf.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Abort-on-failure wrapper for CUDA runtime calls. Evaluates the wrapped
// expression exactly once; if the result is not cudaSuccess it prints the
// source file, line and the runtime's error string, then exits with status 1.
// The location is included so a failure can be traced to the offending call.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t status_ = (call); \
        if (status_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(status_)); \
            exit(1); \
        } \
    } while(0)

// Gather through ordinary global-memory loads: output[i] = data[indices[i]].
//   data    - source array, read at the positions listed in indices
//   output  - destination; must hold n elements
//   indices - n element indices into data
//   n       - number of elements to gather
// Expects a 1D launch with at least n threads in total; surplus threads do
// nothing. This is the baseline the texture variant is timed against.
__global__ void randomAccessGlobal(const float* data, float* output,
                                    const int* indices, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;
    }

    const int src = indices[tid];
    output[tid] = data[src];
}

// Gather through the texture path: output[i] = tex1Dfetch(tex, indices[i]).
//   tex     - texture object to fetch from
//   output  - destination; must hold n elements
//   indices - n integer texel indices
//   n       - number of elements to gather
// Expects a 1D launch with at least n threads in total; surplus threads do
// nothing.
__global__ void randomAccessTexture(cudaTextureObject_t tex, float* output,
                                     const int* indices, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;
    }

    const int src = indices[tid];
    output[tid] = tex1Dfetch<float>(tex, src);
}

// Host driver for the texture-vs-global gather benchmark.
// Fills a 16M-element float array, generates 1M pseudo-random indices into it,
// and times ITERATIONS launches of randomAccessGlobal and randomAccessTexture
// with CUDA events. Prints elapsed time, effective throughput and the ratio.
// Returns 0 on success; any CUDA failure terminates the process via CHECK_CUDA.
int main() {
    printf("=== Texture vs Global Memory Performance ===\n\n");
    
    const int DATA_SIZE = 16 * 1024 * 1024;  // 16M elements
    const int NUM_ACCESSES = 1024 * 1024;    // 1M random accesses
    const int ITERATIONS = 100;
    const int BLOCK_SIZE = 256;
    
    // Allocate and initialize
    float* h_data = new float[DATA_SIZE];
    int* h_indices = new int[NUM_ACCESSES];
    
    for (int i = 0; i < DATA_SIZE; i++) h_data[i] = (float)i;
    // rand() is left unseeded, so the index sequence is the same on every run.
    for (int i = 0; i < NUM_ACCESSES; i++) h_indices[i] = rand() % DATA_SIZE;
    
    float *d_data, *d_output;
    int* d_indices;
    CHECK_CUDA(cudaMalloc(&d_data, DATA_SIZE * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, NUM_ACCESSES * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_indices, NUM_ACCESSES * sizeof(int)));
    
    CHECK_CUDA(cudaMemcpy(d_data, h_data, DATA_SIZE * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_indices, h_indices, NUM_ACCESSES * sizeof(int), cudaMemcpyHostToDevice));
    
    // Create texture object over the same device buffer the global kernel reads
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_data;
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = DATA_SIZE * sizeof(float);
    
    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType;
    
    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    
    dim3 block(BLOCK_SIZE);
    dim3 grid((NUM_ACCESSES + BLOCK_SIZE - 1) / BLOCK_SIZE);  // ceil-div
    
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    
    // Warmup (also validates both launch configurations before timing)
    randomAccessGlobal<<<grid, block>>>(d_data, d_output, d_indices, NUM_ACCESSES);
    CHECK_CUDA(cudaGetLastError());
    randomAccessTexture<<<grid, block>>>(tex, d_output, d_indices, NUM_ACCESSES);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    
    // Benchmark global memory
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        randomAccessGlobal<<<grid, block>>>(d_data, d_output, d_indices, NUM_ACCESSES);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    // Checked after the timed region so the check does not sit between launches.
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    
    float globalMs;
    CHECK_CUDA(cudaEventElapsedTime(&globalMs, start, stop));
    
    // Benchmark texture memory
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        randomAccessTexture<<<grid, block>>>(tex, d_output, d_indices, NUM_ACCESSES);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    
    float textureMs;
    CHECK_CUDA(cudaEventElapsedTime(&textureMs, start, stop));
    
    // Throughput counts only the gathered floats (not the index reads or the
    // output writes). bytes / ms / 1e6 == GB/s.
    const double gatheredBytes = (double)NUM_ACCESSES * sizeof(float) * ITERATIONS;
    
    printf("Random access pattern (%d elements, %d accesses):\n", DATA_SIZE, NUM_ACCESSES);
    printf("  Global memory:  %.2f ms (%.2f GB/s)\n", 
           globalMs, gatheredBytes / globalMs / 1e6);
    printf("  Texture memory: %.2f ms (%.2f GB/s)\n",
           textureMs, gatheredBytes / textureMs / 1e6);
    printf("  Speedup: %.2fx\n", globalMs / textureMs);
    
    // Cleanup: events were previously never destroyed.
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFree(d_data));
    CHECK_CUDA(cudaFree(d_output));
    CHECK_CUDA(cudaFree(d_indices));
    delete[] h_data;
    delete[] h_indices;
    
    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 texture_perf.cu -o texture_perf && ./texture_perf

## Summary

| Feature | Description |
|---------|-------------|
| `cudaFilterModePoint` | Nearest-neighbor (no interpolation) |
| `cudaFilterModeLinear` | Linear (1D), bilinear (2D), or trilinear (3D) interpolation |
| `cudaAddressModeClamp` | Clamp out-of-range coordinates to the valid range ([0, 1) if normalized, [0, N) otherwise) |
| `cudaAddressModeWrap` | Repeat texture (tiling) |
| `cudaAddressModeMirror` | Mirror at boundaries |
| `cudaAddressModeBorder` | Return 0 outside bounds |

**When to use textures:**
- 2D/3D data with spatial locality
- Need free interpolation
- Random access patterns
- Read-only data