## Application 1: Image Resizing with Bilinear Interpolation

In [None]:
%%writefile image_resize.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call and terminates the program if it fails.
// On any status other than cudaSuccess, the source location and the runtime's
// error string are written to stderr and the process exits with status 1.
// The argument is parenthesized and the local is named err_ so that the macro
// is safe for arbitrary expressions and does not clash with a caller's `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Resamples a single-channel image to outWidth x outHeight by reading a
// 2D texture at normalized coordinates, one thread per output pixel.
//
// Parameters:
//   tex       - texture object sampled with tex2D<float>; the coordinates
//               passed are in (0, 1), so the texture must use normalized
//               coordinates (filtering/addressing come from its descriptor).
//   output    - device buffer of outWidth * outHeight bytes, row-major.
//   outWidth  - output width in pixels.
//   outHeight - output height in pixels.
//
// Launch layout: 2D grid of 2D blocks covering at least outWidth x outHeight
// threads; surplus threads exit without touching memory.
__global__ void resizeImage(cudaTextureObject_t tex, unsigned char* output,
                            int outWidth, int outHeight) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail have no pixel to produce.
    if (x >= outWidth || y >= outHeight) return;

    // Sample at the centre of the output pixel (half-pixel offset) so the
    // output grid is aligned with the input grid rather than shifted.
    float u = (x + 0.5f) / outWidth;
    float v = (y + 0.5f) / outHeight;

    float val = tex2D<float>(tex, u, v);

    // Clamp to the 8-bit range, then round to nearest. A bare cast would
    // truncate and bias every interpolated value downward by up to one level.
    float clamped = fminf(fmaxf(val, 0.0f), 255.0f);
    output[(size_t)y * outWidth + x] = (unsigned char)(clamped + 0.5f);
}

// Prints a w x h 8-bit image to stdout: a header line with the title and
// dimensions, then one indented text line per image row with each value
// right-aligned in a 3-character field.
//   title - label printed in the header line
//   img   - row-major pixel data, w * h bytes
//   w, h  - image width and height in pixels
void printImage(const char* title, unsigned char* img, int w, int h) {
    printf("%s (%dx%d):\n", title, w, h);

    int row = 0;
    while (row < h) {
        // Start of the current row inside the flat buffer.
        const unsigned char* line = img + row * w;

        printf("  ");
        for (int col = 0; col < w; ++col) {
            printf("%3d ", line[col]);
        }
        printf("\n");

        ++row;
    }
}

// Demo driver: uploads a 4x4 test image into a CUDA array, wraps it in a
// texture object with bilinear filtering, upscales it to 8x8 with the
// resizeImage kernel, and prints both images.
// Returns 0 on success; any CUDA failure terminates via CHECK_CUDA.
int main() {
    printf("=== Image Resize with Texture Bilinear Interpolation ===\n\n");

    // 4x4 test image with distinct values
    const int IN_W = 4, IN_H = 4;
    unsigned char h_input[IN_W * IN_H] = {
          0,  50, 100, 150,
         50, 100, 150, 200,
        100, 150, 200, 250,
        150, 200, 250, 255
    };

    printImage("Input", h_input, IN_W, IN_H);

    // The texture is created with a float channel format, so widen the
    // 8-bit samples to float before uploading them.
    float h_inputFloat[IN_W * IN_H];
    for (int i = 0; i < IN_W * IN_H; i++) {
        h_inputFloat[i] = (float)h_input[i];
    }

    // Create CUDA array and copy the image into it (tightly packed rows,
    // so the source pitch equals the row width in bytes).
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray_t cuArray;
    const size_t rowBytes = IN_W * sizeof(float);
    CHECK_CUDA(cudaMallocArray(&cuArray, &channelDesc, IN_W, IN_H));
    CHECK_CUDA(cudaMemcpy2DToArray(cuArray, 0, 0, h_inputFloat,
                                    rowBytes, rowBytes, IN_H,
                                    cudaMemcpyHostToDevice));

    // Create texture with bilinear filtering
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;  // Bilinear!
    texDesc.normalizedCoords = true;

    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));

    // Resize to 8x8
    const int OUT_W = 8, OUT_H = 8;
    unsigned char* d_output;
    CHECK_CUDA(cudaMalloc(&d_output, OUT_W * OUT_H));

    // Ceil-divide by the actual block dimensions so the grid stays correct
    // if the block shape is ever changed.
    dim3 block(16, 16);
    dim3 grid((OUT_W + block.x - 1) / block.x,
              (OUT_H + block.y - 1) / block.y);
    resizeImage<<<grid, block>>>(tex, d_output, OUT_W, OUT_H);
    // Launch-configuration errors are only reported through
    // cudaGetLastError(); the synchronize below reports execution faults.
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    unsigned char h_output[OUT_W * OUT_H];
    CHECK_CUDA(cudaMemcpy(h_output, d_output, OUT_W * OUT_H, cudaMemcpyDeviceToHost));

    printf("\n");
    printImage("Output (2x upscaled with bilinear)", h_output, OUT_W, OUT_H);

    // Cleanup
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFreeArray(cuArray));
    CHECK_CUDA(cudaFree(d_output));

    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 image_resize.cu -o image_resize && ./image_resize

## Application 2: Lookup Tables (LUTs)

In [None]:
%%writefile lut_texture.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call and terminates the program if it fails.
// On any status other than cudaSuccess, the source location and the runtime's
// error string are written to stderr and the process exits with status 1.
// The argument is parenthesized and the local is named err_ so that the macro
// is safe for arbitrary expressions and does not clash with a caller's `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Reference kernel: evaluates f(x) = sin(x) * exp(-x^2) * log(1 + |x|)
// directly for each element, one thread per element.
//   output - device array receiving n results
//   input  - device array of n arguments
//   n      - element count; threads with an index >= n do nothing
__global__ void computeDirect(float* output, float* input, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;

    const float x = input[gid];

    // The three transcendental factors, multiplied left to right.
    const float oscillation = sinf(x);
    const float envelope    = expf(-x*x);
    const float growth      = logf(1.0f + fabsf(x));

    output[gid] = oscillation * envelope * growth;
}

// Approximates a function by reading a 1D lookup-table texture, one thread
// per element. Each input is mapped linearly from [minVal, maxVal] onto
// [0, 1], clamped to that interval, and used as the tex1D coordinate; any
// interpolation between table entries is done by the texture unit according
// to the texture's filter mode.
//   lut    - texture object holding the table (sampled with tex1D<float>)
//   output - device array receiving n results
//   input  - device array of n arguments
//   n      - element count; threads with an index >= n do nothing
//   minVal - input value mapped to coordinate 0
//   maxVal - input value mapped to coordinate 1
// NOTE(review): with normalized coordinates and linear filtering, table entry
// i is returned exactly at coordinate (i + 0.5) / size -- confirm the table
// was tabulated at those positions.
__global__ void computeLUT(cudaTextureObject_t lut, float* output, 
                           float* input, int n, float minVal, float maxVal) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;

    const float x = input[gid];

    // Position of x inside the table's domain, as a fraction of its span.
    const float span = maxVal - minVal;
    const float t = (x - minVal) / span;

    // Clamp to [0, 1]: cap from above first, then from below.
    const float capped = fminf(1.0f, t);
    const float coord = fmaxf(0.0f, capped);

    output[gid] = tex1D<float>(lut, coord);
}

// Benchmark driver: compares evaluating f(x) directly against reading a
// precomputed, hardware-interpolated lookup table stored in a 1D texture.
// Times ITERATIONS launches of each kernel with CUDA events, prints the
// timings and speedup, then reports the maximum absolute difference between
// the two methods on the first VERIFY_COUNT inputs.
// Returns 0 on success; any CUDA failure terminates via CHECK_CUDA.
int main() {
    printf("=== Lookup Table with Texture ===\n\n");

    const int N = 10 * 1024 * 1024;  // 10M elements
    const int LUT_SIZE = 4096;        // 4K LUT entries
    const float MIN_VAL = 0.0f;
    const float MAX_VAL = 10.0f;
    const int ITERATIONS = 100;
    const int VERIFY_COUNT = 1000;    // elements used for the accuracy check

    // Create input data uniformly distributed over [MIN_VAL, MAX_VAL]
    float* h_input = new float[N];
    for (int i = 0; i < N; i++) {
        h_input[i] = MIN_VAL + (MAX_VAL - MIN_VAL) * ((float)rand() / RAND_MAX);
    }

    // Create LUT. With normalized coordinates and linear filtering the
    // texture unit returns entry i exactly at u = (i + 0.5) / LUT_SIZE, and
    // computeLUT maps x linearly onto u in [0, 1]. Entry i must therefore
    // hold f at the texel centre; tabulating at i / (LUT_SIZE - 1) instead
    // would shift every lookup by up to half a table step.
    float* h_lut = new float[LUT_SIZE];
    for (int i = 0; i < LUT_SIZE; i++) {
        float x = MIN_VAL + (MAX_VAL - MIN_VAL) * (i + 0.5f) / LUT_SIZE;
        h_lut[i] = sinf(x) * expf(-x*x) * logf(1.0f + fabsf(x));
    }

    // Allocate device memory
    float *d_input, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, N * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_output, N * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_input, h_input, N * sizeof(float), cudaMemcpyHostToDevice));

    // Create LUT texture. The array is 1D (height 0); it is filled with a
    // single-row 2D copy because cudaMemcpyToArray is deprecated.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray_t lutArray;
    const size_t lutBytes = LUT_SIZE * sizeof(float);
    CHECK_CUDA(cudaMallocArray(&lutArray, &desc, LUT_SIZE, 0));
    CHECK_CUDA(cudaMemcpy2DToArray(lutArray, 0, 0, h_lut, lutBytes, lutBytes, 1,
                                    cudaMemcpyHostToDevice));

    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = lutArray;

    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;  // Interpolate between entries
    texDesc.normalizedCoords = true;

    cudaTextureObject_t lutTex;
    CHECK_CUDA(cudaCreateTextureObject(&lutTex, &resDesc, &texDesc, NULL));

    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Warmup. Launch-configuration errors are only reported through
    // cudaGetLastError(), so it is checked after each launch.
    computeDirect<<<grid, block>>>(d_output, d_input, N);
    CHECK_CUDA(cudaGetLastError());
    computeLUT<<<grid, block>>>(lutTex, d_output, d_input, N, MIN_VAL, MAX_VAL);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    // Benchmark direct computation
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        computeDirect<<<grid, block>>>(d_output, d_input, N);
    }
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float directMs;
    CHECK_CUDA(cudaEventElapsedTime(&directMs, start, stop));

    // Benchmark LUT
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        computeLUT<<<grid, block>>>(lutTex, d_output, d_input, N, MIN_VAL, MAX_VAL);
    }
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float lutMs;
    CHECK_CUDA(cudaEventElapsedTime(&lutMs, start, stop));

    printf("f(x) = sin(x) * exp(-x^2) * log(1+|x|)\n");
    printf("Elements: %d, LUT size: %d\n\n", N, LUT_SIZE);
    printf("Direct computation: %.2f ms\n", directMs);
    printf("LUT with texture:   %.2f ms\n", lutMs);
    printf("Speedup: %.2fx\n\n", directMs / lutMs);

    // Verify accuracy on the first VERIFY_COUNT inputs. Both kernels write
    // to d_output, so each result is copied back before the next launch;
    // the blocking cudaMemcpy also waits for the preceding kernel.
    float* h_direct = new float[VERIFY_COUNT];
    float* h_lut_out = new float[VERIFY_COUNT];
    dim3 verifyGrid((VERIFY_COUNT + block.x - 1) / block.x);

    computeDirect<<<verifyGrid, block>>>(d_output, d_input, VERIFY_COUNT);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaMemcpy(h_direct, d_output, VERIFY_COUNT * sizeof(float), cudaMemcpyDeviceToHost));

    computeLUT<<<verifyGrid, block>>>(lutTex, d_output, d_input, VERIFY_COUNT, MIN_VAL, MAX_VAL);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaMemcpy(h_lut_out, d_output, VERIFY_COUNT * sizeof(float), cudaMemcpyDeviceToHost));

    float maxError = 0.0f;
    for (int i = 0; i < VERIFY_COUNT; i++) {
        float err = fabsf(h_direct[i] - h_lut_out[i]);
        if (err > maxError) maxError = err;
    }
    printf("Max error (vs direct): %.6f\n", maxError);

    // Cleanup
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaDestroyTextureObject(lutTex));
    CHECK_CUDA(cudaFreeArray(lutArray));
    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_output));
    delete[] h_input;
    delete[] h_lut;
    delete[] h_direct;
    delete[] h_lut_out;

    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 lut_texture.cu -o lut_texture && ./lut_texture

## Application 3: Image Convolution with Textures

In [None]:
%%writefile convolution_texture.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and terminates the program if it fails.
// On any status other than cudaSuccess, the source location and the runtime's
// error string are written to stderr and the process exits with status 1.
// The argument is parenthesized and the local is named err_ so that the macro
// is safe for arbitrary expressions and does not clash with a caller's `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Half-width of the square filter: taps span [-KERNEL_RADIUS, +KERNEL_RADIUS]
// in both x and y.
#define KERNEL_RADIUS 2
// Number of taps along one side of the filter (5 for a radius of 2).
#define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)

// Filter weights in constant memory, read by the convolution kernels as
// c_kernel[row * KERNEL_SIZE + col]; filled from the host with
// cudaMemcpyToSymbol.
__constant__ float c_kernel[KERNEL_SIZE * KERNEL_SIZE];

// 2D convolution that reads the image through a texture object, one thread
// per output pixel. Neighbour positions are converted to normalized
// texel-centre coordinates and passed to tex2D; positions that fall outside
// the image are left to the texture's addressing mode, so the kernel itself
// contains no border logic.
//   tex    - texture object over the width x height source image
//   output - device array of width * height floats, row-major
//   width  - image width in pixels
//   height - image height in pixels
// Weights come from c_kernel (KERNEL_SIZE x KERNEL_SIZE, row-major).
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; surplus threads exit immediately.
__global__ void convolveTexture(cudaTextureObject_t tex, float* output,
                                 int width, int height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= width || row >= height) return;

    float acc = 0.0f;
    // Taps are visited in row-major order, so the weight index simply
    // advances by one per tap.
    int tap = 0;

    for (int dy = -KERNEL_RADIUS; dy <= KERNEL_RADIUS; ++dy) {
        // The vertical coordinate is shared by every tap in this filter row.
        const float v = (float)(row + dy + 0.5f) / height;

        for (int dx = -KERNEL_RADIUS; dx <= KERNEL_RADIUS; ++dx, ++tap) {
            const float u = (float)(col + dx + 0.5f) / width;
            const float pixel = tex2D<float>(tex, u, v);
            const float weight = c_kernel[tap];
            acc += pixel * weight;
        }
    }

    output[row * width + col] = acc;
}

// 2D convolution that reads the image directly from a device array, one
// thread per output pixel. Neighbour indices that fall outside the image are
// clamped to the nearest edge pixel in software.
//   input  - device array of width * height floats, row-major
//   output - device array of width * height floats, row-major
//   width  - image width in pixels
//   height - image height in pixels
// Weights come from c_kernel (KERNEL_SIZE x KERNEL_SIZE, row-major).
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; surplus threads exit immediately.
__global__ void convolveGlobal(const float* input, float* output,
                                int width, int height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col >= width || row >= height) return;

    float acc = 0.0f;

    for (int dy = -KERNEL_RADIUS; dy <= KERNEL_RADIUS; ++dy) {
        // Clamp the source row once per filter row and keep a pointer to it.
        const int srcRow = min(max(row + dy, 0), height - 1);
        const float* srcLine = input + srcRow * width;
        const float* weights = c_kernel + (dy + KERNEL_RADIUS) * KERNEL_SIZE;

        for (int dx = -KERNEL_RADIUS; dx <= KERNEL_RADIUS; ++dx) {
            const int srcCol = min(max(col + dx, 0), width - 1);
            const float pixel = srcLine[srcCol];
            const float weight = weights[dx + KERNEL_RADIUS];
            acc += pixel * weight;
        }
    }

    output[row * width + col] = acc;
}

// Benchmark driver: runs a normalized 5x5 Gaussian blur over a random
// 4096x4096 float image with two kernels -- one reading global memory with
// manual clamping, one reading a point-filtered texture with clamp
// addressing -- and prints the time for ITERATIONS launches of each plus
// the ratio between them.
// Returns 0 on success; any CUDA failure terminates via CHECK_CUDA.
int main() {
    printf("=== Convolution: Texture vs Global Memory ===\n\n");

    const int WIDTH = 4096, HEIGHT = 4096;
    const int ITERATIONS = 100;

    // 5x5 Gaussian blur kernel
    float h_kernel[KERNEL_SIZE * KERNEL_SIZE] = {
        1,  4,  6,  4, 1,
        4, 16, 24, 16, 4,
        6, 24, 36, 24, 6,
        4, 16, 24, 16, 4,
        1,  4,  6,  4, 1
    };
    // Normalize so the weights sum to 1 and the blur preserves brightness.
    float sum = 0;
    for (int i = 0; i < KERNEL_SIZE * KERNEL_SIZE; i++) sum += h_kernel[i];
    for (int i = 0; i < KERNEL_SIZE * KERNEL_SIZE; i++) h_kernel[i] /= sum;

    CHECK_CUDA(cudaMemcpyToSymbol(c_kernel, h_kernel, sizeof(h_kernel)));

    // Create test image. Sizes are computed in size_t so larger dimensions
    // cannot overflow int arithmetic.
    const size_t pixelCount = (size_t)WIDTH * HEIGHT;
    const size_t imageBytes = pixelCount * sizeof(float);
    const size_t rowBytes = WIDTH * sizeof(float);

    float* h_input = new float[pixelCount];
    for (size_t i = 0; i < pixelCount; i++) {
        h_input[i] = (float)(rand() % 256);
    }

    // Device memory
    float *d_input, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, imageBytes));
    CHECK_CUDA(cudaMalloc(&d_output, imageBytes));
    CHECK_CUDA(cudaMemcpy(d_input, h_input, imageBytes, cudaMemcpyHostToDevice));

    // Create texture over a second copy of the image held in a CUDA array.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray_t cuArray;
    CHECK_CUDA(cudaMallocArray(&cuArray, &desc, WIDTH, HEIGHT));
    CHECK_CUDA(cudaMemcpy2DToArray(cuArray, 0, 0, h_input,
                                    rowBytes, rowBytes, HEIGHT,
                                    cudaMemcpyHostToDevice));

    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.normalizedCoords = true;

    cudaTextureObject_t tex;
    CHECK_CUDA(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));

    // Ceil-divide by the actual block dimensions so the grid stays correct
    // if the block shape is ever changed.
    dim3 block(16, 16);
    dim3 grid((WIDTH + block.x - 1) / block.x,
              (HEIGHT + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Warmup. Launch-configuration errors are only reported through
    // cudaGetLastError(), so it is checked after each launch.
    convolveGlobal<<<grid, block>>>(d_input, d_output, WIDTH, HEIGHT);
    CHECK_CUDA(cudaGetLastError());
    convolveTexture<<<grid, block>>>(tex, d_output, WIDTH, HEIGHT);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    // Benchmark global memory
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        convolveGlobal<<<grid, block>>>(d_input, d_output, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float globalMs;
    CHECK_CUDA(cudaEventElapsedTime(&globalMs, start, stop));

    // Benchmark texture
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < ITERATIONS; i++) {
        convolveTexture<<<grid, block>>>(tex, d_output, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float textureMs;
    CHECK_CUDA(cudaEventElapsedTime(&textureMs, start, stop));

    printf("5x5 Gaussian blur on %dx%d image:\n", WIDTH, HEIGHT);
    printf("  Global memory: %.2f ms\n", globalMs);
    printf("  Texture:       %.2f ms\n", textureMs);
    printf("  Speedup: %.2fx\n", globalMs / textureMs);

    // Cleanup
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaDestroyTextureObject(tex));
    CHECK_CUDA(cudaFreeArray(cuArray));
    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_output));
    delete[] h_input;

    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 convolution_texture.cu -o convolution_texture && ./convolution_texture

## Summary

| Application | Texture Benefit |
|-------------|----------------|
| Image resize | Free bilinear interpolation |
| Lookup tables | Interpolation + cache |
| Convolution | Automatic border handling |
| Volume rendering | 3D texture with trilinear |

**Best practices:**
1. Use CUDA arrays for 2D/3D data (better cache locality)
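2. Use linear memory for 1D lookup tables that need no filtering; interpolated LUTs (as in Application 2) must live in a CUDA array, because hardware linear filtering is not available for linear-memory textures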
3. Choose filter mode based on need (point vs linear)
4. Use normalized coords for resolution-independent code