#GPU PROJECT

```
CEPARCO-S11
GROUP 4
TOPIC: Hadamard product using 2D variables
MEMBERS:
    Arca, Althea Denisse
    Co Chiong, Sean
    Uy, Wesley King

```

WRITEFILE CODES

000 - 1024x1024 - 8x8

001 - 1024x1024 - 16x16

010 - 1024x1024 - 32x32

011 - 2048x2048 - 8x8

100 - 2048x2048 - 16x16

101 - 2048x2048 - 32x32

110 - 4096x4096 - 8x8

111 - 4096x4096 - 16x16

1000 - 4096x4096 - 32x32



#Deep Dive: Hadamard product using 2D variables

##1024x1024

###ARRAY SIZE: 1024x1024 THREADSIZE: 8x8
WRITE FILE = hadamard000

In [1]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Wed Feb 19 15:24:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       

In [2]:
%%writefile CUDA_hadamard000.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 1024 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(8, 8);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Writing CUDA_hadamard000.cu


In [3]:
%%shell
nvcc -o CUDA_hadamard000 CUDA_hadamard000.cu -arch=sm_75
nvprof ./CUDA_hadamard000

==358== NVPROF is profiling process 358, command: ./CUDA_hadamard000
Average execution time over 30 runs: 0.0732 ms

Matrix X (First 10x10 elements):
7.00 9.00 1.00 6.00 6.00 3.00 3.00 8.00 1.00 9.00 
10.00 1.00 5.00 6.00 4.00 0.00 2.00 5.00 2.00 5.00 
4.00 5.00 0.00 5.00 5.00 6.00 4.00 8.00 8.00 4.00 
3.00 5.00 7.00 4.00 7.00 2.00 9.00 0.00 2.00 10.00 
9.00 3.00 4.00 6.00 3.00 9.00 4.00 3.00 5.00 1.00 
4.00 0.00 10.00 9.00 3.00 4.00 1.00 0.00 1.00 1.00 
7.00 8.00 8.00 8.00 5.00 9.00 0.00 8.00 0.00 1.00 
2.00 6.00 9.00 0.00 7.00 6.00 9.00 10.00 9.00 2.00 
8.00 7.00 0.00 0.00 6.00 10.00 8.00 6.00 7.00 9.00 
2.00 6.00 1.00 1.00 10.00 7.00 6.00 8.00 5.00 7.00 

Matrix Y (First 10x10 elements):
3.00 10.00 1.00 2.00 10.00 6.00 0.00 7.00 5.00 7.00 
8.00 0.00 5.00 9.00 4.00 5.00 6.00 0.00 10.00 9.00 
6.00 9.00 3.00 6.00 8.00 2.00 7.00 8.00 7.00 0.00 
9.00 3.00 3.00 9.00 10.00 10.00 5.00 0.00 6.00 8.00 
3.00 1.00 10.00 10.00 2.00 5.00 7.00 4.00 8.00 8.00 
0.00 7.00 5.00 2.00 0.00 6.00 9.00 2.0



###ARRAY SIZE: 1024x1024 THREADSIZE: 16x16
WRITE FILE = hadamard001

In [4]:
%%writefile CUDA_hadamard001.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 1024 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Writing CUDA_hadamard001.cu


In [9]:
%%shell
nvcc -o CUDA_hadamard001 CUDA_hadamard001.cu -arch=sm_75
nvprof ./CUDA_hadamard001

==885== NVPROF is profiling process 885, command: ./CUDA_hadamard001
Average execution time over 30 runs: 0.0738 ms

Matrix X (First 10x10 elements):
1.00 6.00 7.00 0.00 10.00 9.00 5.00 5.00 5.00 1.00 
7.00 10.00 6.00 4.00 7.00 10.00 10.00 4.00 1.00 4.00 
3.00 5.00 5.00 6.00 8.00 0.00 7.00 4.00 3.00 4.00 
10.00 9.00 0.00 10.00 1.00 5.00 0.00 6.00 5.00 8.00 
1.00 8.00 8.00 6.00 9.00 7.00 6.00 7.00 5.00 10.00 
5.00 2.00 5.00 3.00 7.00 8.00 0.00 2.00 7.00 1.00 
9.00 3.00 5.00 10.00 9.00 8.00 1.00 8.00 5.00 9.00 
6.00 1.00 3.00 3.00 6.00 8.00 1.00 10.00 5.00 3.00 
6.00 2.00 7.00 3.00 4.00 0.00 0.00 8.00 0.00 6.00 
10.00 8.00 5.00 1.00 8.00 10.00 7.00 0.00 7.00 8.00 

Matrix Y (First 10x10 elements):
1.00 1.00 1.00 7.00 9.00 6.00 10.00 4.00 0.00 9.00 
5.00 7.00 9.00 7.00 6.00 10.00 5.00 8.00 6.00 0.00 
10.00 5.00 2.00 3.00 1.00 3.00 5.00 0.00 10.00 6.00 
2.00 0.00 2.00 6.00 5.00 5.00 1.00 5.00 1.00 3.00 
9.00 8.00 4.00 9.00 6.00 2.00 7.00 5.00 8.00 10.00 
4.00 6.00 1.00 7.00 1.00 2.00 9.00 



###ARRAY SIZE: 1024x1024 THREADSIZE: 32x32
WRITE FILE = hadamard010

In [16]:
%%writefile CUDA_hadamard010.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 1024 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Overwriting CUDA_hadamard010.cu


In [17]:
%%shell
nvcc -o CUDA_hadamard010 CUDA_hadamard010.cu -arch=sm_75
nvprof ./CUDA_hadamard010

==2250== NVPROF is profiling process 2250, command: ./CUDA_hadamard010
Average execution time over 30 runs: 0.0741 ms

Matrix X (First 10x10 elements):
10.00 5.00 9.00 2.00 3.00 9.00 8.00 10.00 5.00 10.00 
5.00 6.00 3.00 5.00 3.00 3.00 6.00 1.00 4.00 6.00 
5.00 9.00 2.00 10.00 0.00 9.00 5.00 2.00 7.00 5.00 
9.00 8.00 4.00 3.00 3.00 0.00 6.00 5.00 4.00 3.00 
6.00 1.00 8.00 0.00 4.00 1.00 9.00 7.00 10.00 7.00 
7.00 9.00 5.00 5.00 4.00 3.00 10.00 5.00 10.00 1.00 
7.00 10.00 1.00 1.00 9.00 9.00 9.00 7.00 0.00 9.00 
2.00 1.00 3.00 3.00 10.00 7.00 3.00 6.00 0.00 7.00 
8.00 8.00 7.00 0.00 5.00 0.00 8.00 8.00 2.00 1.00 
0.00 10.00 4.00 8.00 2.00 10.00 8.00 5.00 8.00 8.00 

Matrix Y (First 10x10 elements):
10.00 0.00 10.00 0.00 2.00 0.00 0.00 6.00 6.00 9.00 
3.00 3.00 7.00 9.00 2.00 9.00 5.00 2.00 1.00 9.00 
0.00 3.00 5.00 3.00 7.00 10.00 7.00 7.00 1.00 7.00 
5.00 4.00 3.00 10.00 4.00 1.00 5.00 8.00 3.00 2.00 
9.00 5.00 3.00 7.00 1.00 10.00 2.00 8.00 10.00 9.00 
7.00 5.00 0.00 10.00 4.00 9.00 5



##2048x2048

###ARRAY SIZE: 2048x2048 THREADSIZE: 8x8
WRITE FILE = hadamard011

In [19]:
%%writefile CUDA_hadamard011.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 2048 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(8, 8);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Overwriting CUDA_hadamard011.cu


In [20]:
%%shell
nvcc -o CUDA_hadamard011 CUDA_hadamard011.cu -arch=sm_75
nvprof ./CUDA_hadamard011

==2587== NVPROF is profiling process 2587, command: ./CUDA_hadamard011
Average execution time over 30 runs: 0.2252 ms

Matrix X (First 10x10 elements):
6.00 3.00 5.00 1.00 0.00 4.00 1.00 2.00 4.00 6.00 
5.00 5.00 9.00 5.00 8.00 9.00 1.00 10.00 8.00 6.00 
6.00 2.00 9.00 4.00 9.00 1.00 7.00 6.00 6.00 1.00 
1.00 3.00 5.00 0.00 6.00 8.00 0.00 6.00 2.00 3.00 
4.00 4.00 8.00 9.00 4.00 10.00 9.00 1.00 4.00 7.00 
2.00 4.00 4.00 4.00 10.00 2.00 9.00 0.00 3.00 2.00 
1.00 4.00 10.00 8.00 10.00 9.00 2.00 3.00 3.00 0.00 
5.00 10.00 3.00 5.00 0.00 8.00 2.00 5.00 7.00 1.00 
10.00 3.00 5.00 1.00 7.00 7.00 9.00 10.00 8.00 4.00 
10.00 4.00 4.00 6.00 7.00 2.00 8.00 8.00 4.00 2.00 

Matrix Y (First 10x10 elements):
1.00 0.00 2.00 2.00 7.00 5.00 9.00 0.00 9.00 9.00 
8.00 2.00 10.00 2.00 0.00 1.00 7.00 9.00 0.00 9.00 
0.00 4.00 1.00 2.00 8.00 5.00 1.00 1.00 1.00 9.00 
5.00 9.00 0.00 0.00 1.00 9.00 8.00 9.00 5.00 9.00 
5.00 8.00 8.00 9.00 9.00 1.00 2.00 7.00 3.00 4.00 
3.00 9.00 10.00 6.00 5.00 3.00 5.00 4.0



###ARRAY SIZE: 2048x2048 THREADSIZE: 16x16
WRITE FILE = hadamard100

In [22]:
%%writefile CUDA_hadamard100.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 2048 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Overwriting CUDA_hadamard100.cu


In [23]:
%%shell
nvcc -o CUDA_hadamard100 CUDA_hadamard100.cu -arch=sm_75
nvprof ./CUDA_hadamard100

==2850== NVPROF is profiling process 2850, command: ./CUDA_hadamard100
Average execution time over 30 runs: 0.2257 ms

Matrix X (First 10x10 elements):
3.00 1.00 9.00 8.00 5.00 5.00 1.00 5.00 6.00 7.00 
7.00 4.00 2.00 2.00 7.00 5.00 7.00 9.00 4.00 10.00 
6.00 2.00 2.00 4.00 7.00 5.00 8.00 2.00 5.00 1.00 
5.00 7.00 7.00 6.00 3.00 4.00 2.00 5.00 1.00 7.00 
5.00 3.00 6.00 1.00 6.00 2.00 9.00 1.00 8.00 5.00 
4.00 4.00 9.00 0.00 7.00 5.00 1.00 10.00 5.00 0.00 
10.00 0.00 2.00 0.00 0.00 4.00 3.00 7.00 10.00 5.00 
10.00 9.00 5.00 5.00 10.00 10.00 2.00 2.00 3.00 6.00 
7.00 2.00 7.00 3.00 2.00 8.00 5.00 0.00 6.00 2.00 
8.00 3.00 5.00 1.00 1.00 1.00 9.00 8.00 0.00 1.00 

Matrix Y (First 10x10 elements):
2.00 9.00 2.00 8.00 3.00 2.00 8.00 0.00 0.00 6.00 
6.00 9.00 8.00 4.00 3.00 6.00 9.00 10.00 5.00 7.00 
6.00 3.00 7.00 1.00 2.00 10.00 3.00 0.00 9.00 2.00 
3.00 1.00 0.00 5.00 9.00 3.00 1.00 9.00 6.00 2.00 
5.00 10.00 6.00 4.00 9.00 2.00 5.00 3.00 6.00 2.00 
2.00 3.00 8.00 8.00 6.00 10.00 9.00 6.0



###ARRAY SIZE: 2048x2048 THREADSIZE: 32x32
WRITE FILE = hadamard101

In [24]:
%%writefile CUDA_hadamard101.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 2048 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Writing CUDA_hadamard101.cu


In [25]:
%%shell
nvcc -o CUDA_hadamard101 CUDA_hadamard101.cu -arch=sm_75
nvprof ./CUDA_hadamard101

==3001== NVPROF is profiling process 3001, command: ./CUDA_hadamard101
Average execution time over 30 runs: 0.2497 ms

Matrix X (First 10x10 elements):
10.00 9.00 9.00 4.00 2.00 7.00 4.00 4.00 9.00 7.00 
6.00 2.00 2.00 10.00 8.00 3.00 3.00 4.00 0.00 6.00 
1.00 0.00 8.00 10.00 2.00 5.00 0.00 4.00 5.00 7.00 
1.00 0.00 1.00 9.00 6.00 6.00 0.00 8.00 4.00 8.00 
2.00 2.00 2.00 5.00 2.00 0.00 9.00 9.00 2.00 10.00 
7.00 3.00 3.00 4.00 1.00 8.00 2.00 6.00 3.00 3.00 
0.00 2.00 3.00 5.00 2.00 10.00 4.00 10.00 0.00 7.00 
10.00 0.00 1.00 6.00 3.00 4.00 5.00 8.00 7.00 1.00 
6.00 1.00 6.00 10.00 5.00 9.00 6.00 6.00 0.00 4.00 
10.00 8.00 2.00 3.00 8.00 4.00 8.00 3.00 10.00 7.00 

Matrix Y (First 10x10 elements):
8.00 10.00 0.00 1.00 6.00 3.00 7.00 6.00 10.00 3.00 
4.00 9.00 0.00 5.00 0.00 10.00 10.00 3.00 3.00 10.00 
6.00 6.00 5.00 2.00 2.00 3.00 9.00 7.00 3.00 1.00 
1.00 3.00 2.00 10.00 1.00 4.00 9.00 10.00 6.00 3.00 
10.00 9.00 1.00 8.00 10.00 1.00 10.00 6.00 10.00 1.00 
7.00 0.00 3.00 1.00 8.00 10.



##4096x4096

###ARRAY SIZE: 4096x4096 THREADSIZE: 8x8
WRITE FILE = hadamard110

In [26]:
%%writefile CUDA_hadamard110.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 4096 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(8, 8);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Writing CUDA_hadamard110.cu


In [27]:
%%shell
nvcc -o CUDA_hadamard110 CUDA_hadamard110.cu -arch=sm_75
nvprof ./CUDA_hadamard110

==3263== NVPROF is profiling process 3263, command: ./CUDA_hadamard110
Average execution time over 30 runs: 0.8829 ms

Matrix X (First 10x10 elements):
1.00 10.00 1.00 2.00 10.00 5.00 0.00 2.00 1.00 7.00 
2.00 3.00 4.00 2.00 1.00 1.00 3.00 9.00 7.00 0.00 
5.00 4.00 6.00 1.00 7.00 1.00 8.00 6.00 5.00 6.00 
3.00 2.00 10.00 0.00 1.00 2.00 1.00 1.00 6.00 4.00 
8.00 9.00 7.00 1.00 0.00 6.00 0.00 4.00 1.00 5.00 
10.00 2.00 7.00 4.00 10.00 9.00 4.00 3.00 9.00 3.00 
6.00 3.00 7.00 4.00 0.00 7.00 7.00 1.00 10.00 8.00 
3.00 7.00 4.00 0.00 4.00 1.00 5.00 4.00 2.00 5.00 
1.00 0.00 6.00 8.00 8.00 0.00 0.00 6.00 4.00 10.00 
1.00 8.00 8.00 0.00 4.00 1.00 0.00 1.00 10.00 2.00 

Matrix Y (First 10x10 elements):
6.00 10.00 4.00 9.00 10.00 3.00 5.00 5.00 5.00 7.00 
4.00 4.00 0.00 4.00 6.00 4.00 9.00 9.00 2.00 7.00 
7.00 3.00 8.00 1.00 8.00 8.00 5.00 0.00 1.00 4.00 
4.00 9.00 3.00 9.00 1.00 8.00 10.00 9.00 1.00 10.00 
2.00 2.00 3.00 9.00 6.00 0.00 3.00 8.00 0.00 9.00 
10.00 4.00 8.00 9.00 6.00 6.00 10.00 



###ARRAY SIZE: 4096x4096 THREADSIZE: 16x16
WRITE FILE = hadamard111

In [29]:
%%writefile CUDA_hadamard111.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 4096 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Overwriting CUDA_hadamard111.cu


In [30]:
%%shell
nvcc -o CUDA_hadamard111 CUDA_hadamard111.cu -arch=sm_75
nvprof ./CUDA_hadamard111

==3525== NVPROF is profiling process 3525, command: ./CUDA_hadamard111
Average execution time over 30 runs: 0.8648 ms

Matrix X (First 10x10 elements):
9.00 10.00 4.00 4.00 1.00 10.00 1.00 9.00 10.00 8.00 
8.00 9.00 1.00 4.00 0.00 7.00 8.00 5.00 0.00 2.00 
9.00 8.00 8.00 3.00 10.00 3.00 5.00 0.00 9.00 5.00 
0.00 10.00 2.00 2.00 10.00 8.00 3.00 3.00 4.00 10.00 
4.00 6.00 5.00 2.00 10.00 0.00 10.00 6.00 9.00 0.00 
2.00 5.00 3.00 6.00 6.00 1.00 10.00 9.00 7.00 3.00 
9.00 5.00 5.00 0.00 2.00 0.00 2.00 7.00 5.00 10.00 
3.00 3.00 7.00 3.00 3.00 0.00 7.00 5.00 4.00 3.00 
2.00 2.00 7.00 10.00 1.00 8.00 2.00 7.00 1.00 7.00 
7.00 3.00 6.00 5.00 8.00 6.00 6.00 0.00 2.00 1.00 

Matrix Y (First 10x10 elements):
6.00 2.00 8.00 9.00 7.00 9.00 5.00 0.00 3.00 3.00 
1.00 3.00 7.00 7.00 5.00 3.00 1.00 9.00 0.00 0.00 
6.00 6.00 3.00 10.00 9.00 2.00 9.00 6.00 0.00 2.00 
4.00 8.00 4.00 6.00 9.00 1.00 2.00 0.00 7.00 2.00 
10.00 2.00 2.00 9.00 7.00 8.00 9.00 8.00 1.00 2.00 
1.00 10.00 2.00 1.00 7.00 4.00 10.0



###ARRAY SIZE: 4096x4096 THREADSIZE: 32x32
WRITE FILE = hadamard1000

In [31]:
%%writefile CUDA_hadamard1000.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10  // Ensures values are between 0 and 10
#define ARRAY_SIZE 4096 // Updated matrix size

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name);

__global__
void cuda_hadamard(size_t numCol, size_t numRow, float* Z, float* X, float* Y) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < numRow && col < numCol) {
        size_t idx = row * numCol + col;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = ARRAY_SIZE * ARRAY_SIZE * sizeof(float);

    // Unified Memory Allocation
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize matrices with random values
    for (size_t i = 0; i < ARRAY_SIZE; ++i) {
        for (size_t j = 0; j < ARRAY_SIZE; ++j) {
            size_t idx = i * ARRAY_SIZE + j;
            X[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
            Y[idx] = (float)(rand() % (RANDOM_MAX + 1));  // Random 0-10
        }
    }

    // Prefetch data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Configure kernel launch parameters
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((ARRAY_SIZE + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ARRAY_SIZE + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Timing setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard<<<numBlocks, threadsPerBlock>>>(ARRAY_SIZE, ARRAY_SIZE, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before printing results
    CHECK_CUDA(cudaDeviceSynchronize());

    // Print initial matrices (only first 10x10 elements)
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, X, "X");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Y, "Y");
    print_matrix(ARRAY_SIZE, ARRAY_SIZE, Z, "Z");

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}

// Corrected function to print only the first 10x10 elements
void print_matrix(size_t numCol, size_t numRow, float* Arr, const char* name) {
    printf("\nMatrix %s (First 10x10 elements):\n", name);
    for (size_t i = 0; i < 10; ++i) {  // Print first 10 rows
        for (size_t j = 0; j < 10; ++j) {  // Print first 10 columns
            printf("%.2f ", Arr[i * numCol + j]);  // Corrected indexing
        }
        printf("\n");
    }
}


Writing CUDA_hadamard1000.cu


In [32]:
%%shell
nvcc -o CUDA_hadamard1000 CUDA_hadamard100.cu -arch=sm_75
nvprof ./CUDA_hadamard1000

==3689== NVPROF is profiling process 3689, command: ./CUDA_hadamard1000
Average execution time over 30 runs: 0.2265 ms

Matrix X (First 10x10 elements):
6.00 10.00 0.00 2.00 7.00 10.00 2.00 1.00 10.00 9.00 
3.00 10.00 4.00 3.00 3.00 10.00 9.00 8.00 3.00 2.00 
4.00 9.00 0.00 7.00 6.00 6.00 8.00 10.00 2.00 4.00 
6.00 1.00 8.00 3.00 5.00 6.00 4.00 9.00 2.00 4.00 
7.00 9.00 7.00 2.00 2.00 8.00 10.00 3.00 8.00 2.00 
2.00 6.00 4.00 8.00 5.00 3.00 0.00 8.00 2.00 3.00 
6.00 10.00 10.00 10.00 1.00 2.00 1.00 9.00 3.00 7.00 
8.00 4.00 3.00 4.00 7.00 5.00 5.00 1.00 4.00 6.00 
8.00 4.00 6.00 3.00 7.00 8.00 5.00 10.00 8.00 7.00 
2.00 4.00 3.00 7.00 1.00 10.00 0.00 2.00 9.00 0.00 

Matrix Y (First 10x10 elements):
2.00 0.00 5.00 1.00 8.00 8.00 2.00 7.00 0.00 8.00 
9.00 5.00 4.00 9.00 9.00 2.00 10.00 1.00 2.00 6.00 
9.00 3.00 3.00 4.00 5.00 7.00 1.00 2.00 2.00 8.00 
10.00 0.00 0.00 8.00 10.00 0.00 1.00 0.00 1.00 5.00 
0.00 1.00 0.00 6.00 5.00 0.00 6.00 10.00 8.00 1.00 
0.00 5.00 1.00 6.00 7.00 9.00 0.



#Deep Dive: Hadamard product using 3D variables

WRITEFILE CODES

3D000 - 1024x1024 - 8x8

3D001 - 1024x1024 - 16x16



##1024x1024

###ARRAY SIZE: 1024x1024 THREADSIZE: 8x8x8
WRITE FILE = hadamard3D000

In [33]:
%%writefile CUDA_hadamard3D000.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10      // Values between 0-10
#define DEPTH 1024         // Z-dimension
#define ROWS 1024          // Y-dimension
#define COLS 1024          // X-dimension

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

// 3D CUDA Kernel for Hadamard Product
__global__
void cuda_hadamard_3D(size_t depth, size_t rows, size_t cols, float* Z, float* X, float* Y) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
    size_t z = blockIdx.z * blockDim.z + threadIdx.z;

    if (x < cols && y < rows && z < depth) {
        size_t idx = (z * rows * cols) + (y * cols) + x;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = DEPTH * ROWS * COLS * sizeof(float);

    // Allocate Unified Memory
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice for optimization
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize Matrices with random values
    for (size_t i = 0; i < DEPTH; ++i) {
        for (size_t j = 0; j < ROWS; ++j) {
            for (size_t k = 0; k < COLS; ++k) {
                size_t idx = (i * ROWS * COLS) + (j * COLS) + k;
                X[idx] = (float)(rand() % (RANDOM_MAX + 1));
                Y[idx] = (float)(rand() % (RANDOM_MAX + 1));
            }
        }
    }

    // Prefetch Data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Define **3D** Thread and Block Dimensions
    dim3 threadsPerBlock(8, 8, 8);  // 8x8x8 threads per block
    dim3 numBlocks((COLS + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ROWS + threadsPerBlock.y - 1) / threadsPerBlock.y,
                   (DEPTH + threadsPerBlock.z - 1) / threadsPerBlock.z);

    // Timing Setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Run Kernel 30 times & Measure Performance
    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard_3D<<<numBlocks, threadsPerBlock>>>(DEPTH, ROWS, COLS, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before freeing memory
    CHECK_CUDA(cudaDeviceSynchronize());

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}


Writing CUDA_hadamard3D000.cu


In [None]:
%%shell
nvcc -o CUDA_hadamard3D000 CUDA_hadamard3D000.cu -arch=sm_75
nvprof ./CUDA_hadamard3D000

==17269== NVPROF is profiling process 17269, command: ./CUDA_hadamard3D000
Average execution time over 30 runs: 55.6947 ms
==17269== Profiling application: ./CUDA_hadamard3D000
==17269== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.66977s        30  55.659ms  55.460ms  56.205ms  cuda_hadamard_3D(unsigned long, unsigned long, unsigned long, float*, float*, float*)
      API calls:   48.58%  1.66999s        30  55.666ms  55.465ms  56.216ms  cudaEventSynchronize
                   35.38%  1.21612s         3  405.37ms  23.164ms  682.59ms  cudaMemPrefetchAsync
                   10.30%  354.21ms         3  118.07ms  26.276ms  164.06ms  cudaFree
                    5.69%  195.63ms         3  65.210ms  57.214us  195.49ms  cudaMallocManaged
                    0.03%  992.38us        30  33.079us  15.467us  266.92us  cudaLaunchKernel
                    0.01%  421.29us        60  7.0210us  2.3260us  23.878us  cu



###ARRAY SIZE: 1024x1024 THREADSIZE: 16x16x16
WRITE FILE = hadamard3D000

In [None]:
%%writefile CUDA_hadamard3D001.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

#define RANDOM_MAX 10      // Values between 0-10
#define DEPTH 1024         // Z-dimension
#define ROWS 1024          // Y-dimension
#define COLS 1024          // X-dimension

#define CHECK_CUDA(call)                                                        \
    do {                                                                         \
        cudaError_t err = call;                                                  \
        if (err != cudaSuccess) {                                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                    \
            exit(1);                                                             \
        }                                                                        \
    } while (0)

// 3D CUDA Kernel for Hadamard Product
__global__
void cuda_hadamard_3D(size_t depth, size_t rows, size_t cols, float* Z, float* X, float* Y) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
    size_t z = blockIdx.z * blockDim.z + threadIdx.z;

    if (x < cols && y < rows && z < depth) {
        size_t idx = (z * rows * cols) + (y * cols) + x;
        Z[idx] = X[idx] * Y[idx];
    }
}

int main() {
    srand(time(NULL));
    size_t ARRAY_BYTES = DEPTH * ROWS * COLS * sizeof(float);

    // Allocate Unified Memory
    float *X, *Y, *Z;
    CHECK_CUDA(cudaMallocManaged(&X, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Y, ARRAY_BYTES));
    CHECK_CUDA(cudaMallocManaged(&Z, ARRAY_BYTES));

    // Get GPU ID
    int device = -1;
    CHECK_CUDA(cudaGetDevice(&device));

    // Memory Advice for optimization
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(X, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Y, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK_CUDA(cudaMemAdvise(Z, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation, device));

    // Initialize Matrices with random values
    for (size_t i = 0; i < DEPTH; ++i) {
        for (size_t j = 0; j < ROWS; ++j) {
            for (size_t k = 0; k < COLS; ++k) {
                size_t idx = (i * ROWS * COLS) + (j * COLS) + k;
                X[idx] = (float)(rand() % (RANDOM_MAX + 1));
                Y[idx] = (float)(rand() % (RANDOM_MAX + 1));
            }
        }
    }

    // Prefetch Data to GPU
    CHECK_CUDA(cudaMemPrefetchAsync(X, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Y, ARRAY_BYTES, device, NULL));
    CHECK_CUDA(cudaMemPrefetchAsync(Z, ARRAY_BYTES, device, NULL));

    // Ensure data is fully transferred before kernel execution
    CHECK_CUDA(cudaDeviceSynchronize());

    // Define **3D** Thread and Block Dimensions
    dim3 threadsPerBlock(16, 16, 4);  //
    dim3 numBlocks((COLS + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (ROWS + threadsPerBlock.y - 1) / threadsPerBlock.y,
                   (DEPTH + threadsPerBlock.z - 1) / threadsPerBlock.z);

    // Timing Setup
    float total_time = 0.0f;
    int runs = 30;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Run Kernel 30 times & Measure Performance
    for (int i = 0; i < runs; ++i) {
        CHECK_CUDA(cudaEventRecord(start));
        cuda_hadamard_3D<<<numBlocks, threadsPerBlock>>>(DEPTH, ROWS, COLS, Z, X, Y);
        CHECK_CUDA(cudaGetLastError());  // Catch kernel launch errors
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
        total_time += milliseconds;
    }

    printf("Average execution time over %d runs: %.4f ms\n", runs, total_time / runs);

    // Synchronize before freeing memory
    CHECK_CUDA(cudaDeviceSynchronize());

    // Free allocated memory
    CHECK_CUDA(cudaFree(X));
    CHECK_CUDA(cudaFree(Y));
    CHECK_CUDA(cudaFree(Z));

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    return 0;
}


Overwriting CUDA_hadamard3D001.cu


In [None]:
%%shell
nvcc -o CUDA_hadamard3D001 CUDA_hadamard3D001.cu -arch=sm_75
nvprof ./CUDA_hadamard3D001

==18889== NVPROF is profiling process 18889, command: ./CUDA_hadamard3D001
Average execution time over 30 runs: 55.0947 ms
==18889== Profiling application: ./CUDA_hadamard3D001
==18889== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.65185s        30  55.062ms  55.005ms  55.425ms  cuda_hadamard_3D(unsigned long, unsigned long, unsigned long, float*, float*, float*)
      API calls:   53.75%  1.65216s        30  55.072ms  55.014ms  55.436ms  cudaEventSynchronize
                   27.56%  847.25ms         3  282.42ms  16.702ms  416.56ms  cudaMemPrefetchAsync
                   11.55%  354.93ms         3  118.31ms  26.270ms  164.83ms  cudaFree
                    7.09%  217.82ms         3  72.608ms  51.136us  217.67ms  cudaMallocManaged
                    0.03%  917.87us        30  30.595us  16.256us  212.88us  cudaLaunchKernel
                    0.01%  386.65us        60  6.4440us  2.2030us  18.790us  cu

