In [16]:
!nvidia-smi

Fri Feb 13 09:33:46 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [17]:
%%writefile vector_addition.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <limits.h>
#include <time.h>

/*
 * Element-wise vector addition on the device: C[i] = A[i] + B[i].
 *
 * Launch layout: one thread per element; only the x dimension of the grid
 * and block is used to form the element index. The grid must supply at
 * least n threads in total; surplus threads do nothing.
 *
 * A, B : device pointers to at least n readable ints.
 * C    : device pointer to at least n writable ints.
 * n    : number of elements to add.
 */
__global__ void vectorAddGPU(const int *A, const int *B, int *C, int n)
{
    // Form the index in 64 bits: blockIdx.x * blockDim.x is evaluated in
    // 32-bit unsigned arithmetic otherwise, and narrowing it to int makes the
    // tail threads wrap to a negative index (which would pass the bounds
    // check) once the grid covers more than INT_MAX threads.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        C[idx] = A[idx] + B[idx];
}

/*
 * Host reference implementation of element-wise vector addition.
 *
 * Writes A[i] + B[i] into C[i] for every i in [0, n). Does nothing when
 * n is zero or negative.
 */
void vectorAddCPU(int *A, int *B, int *C, int n)
{
    int pos = 0;
    while (pos < n)
    {
        const int total = A[pos] + B[pos];
        C[pos] = total;
        ++pos;
    }
}

/* Terminates the program with a diagnostic when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Parses a strictly positive decimal vector size that fits in an int.
 * Returns 0 when the text is not a number, has trailing characters,
 * or is out of range.
 */
static int parseVectorSize(const char *text)
{
    char *end = NULL;
    long parsed = strtol(text, &end, 10);
    if (end == text || *end != '\0' || parsed <= 0 || parsed > INT_MAX)
        return 0;
    return (int)parsed;
}

/*
 * Benchmarks element-wise vector addition on the CPU and on the GPU.
 *
 * Usage: ./vector_addition <vector_size>
 *
 * Fills two vectors with pseudo-random values in [0, 99], adds them on the
 * host (timed with clock()) and on the device (timed with CUDA events around
 * the kernel only; host<->device copies are not part of the GPU time), then
 * checks that both results agree.
 *
 * Returns EXIT_SUCCESS when the GPU result matches the CPU result,
 * EXIT_FAILURE on bad arguments, allocation failure or a mismatch.
 */
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: ./vector_addition <vector_size>\n");
        return EXIT_FAILURE;
    }

    int n = parseVectorSize(argv[1]);
    if (n == 0)
    {
        fprintf(stderr,
                "Invalid vector size '%s': expected an integer in [1, %d]\n",
                argv[1], INT_MAX);
        return EXIT_FAILURE;
    }

    // Byte count in size_t: n * sizeof(int) does not fit in an int once n
    // exceeds INT_MAX / sizeof(int).
    size_t size = (size_t)n * sizeof(int);

    int *A = (int *)malloc(size);
    int *B = (int *)malloc(size);
    int *C_cpu = (int *)malloc(size);
    int *C_gpu = (int *)malloc(size);
    if (A == NULL || B == NULL || C_cpu == NULL || C_gpu == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes per vector failed\n", size);
        free(A);
        free(B);
        free(C_cpu);
        free(C_gpu);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++)
    {
        A[i] = rand() % 100;
        B[i] = rand() % 100;
    }

    clock_t start_cpu = clock();
    vectorAddCPU(A, B, C_cpu, n);
    clock_t end_cpu = clock();

    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy B to device");

    cudaEvent_t start_gpu, stop_gpu;
    checkCuda(cudaEventCreate(&start_gpu), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop)");

    int threads = 256;
    // Ceil-division done in 64 bits so n + threads - 1 cannot overflow int.
    int blocks = (int)(((long long)n + threads - 1) / threads);

    checkCuda(cudaEventRecord(start_gpu), "cudaEventRecord(start)");
    vectorAddGPU<<<blocks, threads>>>(d_A, d_B, d_C, n);
    // Kernel launches return nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAddGPU launch");
    checkCuda(cudaEventRecord(stop_gpu), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop_gpu), "vectorAddGPU execution");

    float gpu_time = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu),
              "cudaEventElapsedTime");

    // Bring the device result back and validate it against the CPU reference.
    checkCuda(cudaMemcpy(C_gpu, d_C, size, cudaMemcpyDeviceToHost),
              "copy C to host");
    int verified = 1;
    for (int i = 0; i < n; i++)
    {
        if (C_gpu[i] != C_cpu[i])
        {
            fprintf(stderr, "Mismatch at index %d: CPU=%d GPU=%d\n",
                    i, C_cpu[i], C_gpu[i]);
            verified = 0;
            break;
        }
    }

    printf("Vector Size: %d\n", n);
    printf("CPU Time: %f ms\n",
           (double)(end_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC);
    printf("GPU Time: %f ms\n", gpu_time);

    checkCuda(cudaEventDestroy(start_gpu), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop_gpu), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(A);
    free(B);
    free(C_cpu);
    free(C_gpu);

    return verified ? EXIT_SUCCESS : EXIT_FAILURE;
}


Overwriting vector_addition.cu


In [18]:
!nvcc vector_addition.cu -o vector_addition
!./vector_addition 5000000

Vector Size: 5000000
CPU Time: 21.823000 ms
GPU Time: 0.375232 ms


In [19]:
%%writefile vector_sum_max_min.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <limits.h>
#include <time.h>

/*
 * Accumulates the sum, maximum and minimum of A[0..n) into *sum, *max, *min.
 *
 * Each thread loads at most one element (index taken from the x dimension of
 * the grid/block). Values are first combined inside each warp with shuffles,
 * then one lane per warp issues the three atomics, so the number of atomic
 * operations on the shared result addresses drops by up to the warp width
 * compared with one set of atomics per element.
 *
 * Launch layout: the grid must supply at least n threads along x; any block
 * size is accepted (a trailing partial warp is handled explicitly).
 * No shared memory is used. Uses *_sync shuffle intrinsics.
 *
 * A   : device pointer to at least n readable ints.
 * sum : device int, must hold the starting sum (e.g. 0) before launch.
 * max : device int, must hold the starting maximum (e.g. INT_MIN).
 * min : device int, must hold the starting minimum (e.g. INT_MAX).
 * n   : number of elements to reduce.
 */
__global__ void reduceGPU(const int *A, int *sum, int *max, int *min, int n)
{
    const unsigned int kWarpLanes = 32;
    const unsigned int kFullMask = 0xffffffffu;

    // 64-bit index so blockIdx.x * blockDim.x cannot wrap into a negative int
    // that would slip past the bounds check for very large n.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads still take part in the shuffles, contributing the
    // identity of each operation.
    int partSum = 0;
    int partMax = INT_MIN;
    int partMin = INT_MAX;
    if (idx < n)
    {
        int value = A[idx];
        partSum = value;
        partMax = value;
        partMin = value;
    }

    // Lane position is derived from the linearised thread id so that the
    // reduction is also correct for multi-dimensional or oddly sized blocks.
    unsigned int threadsInBlock = blockDim.x * blockDim.y * blockDim.z;
    unsigned int linearTid =
        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    unsigned int lane = linearTid % kWarpLanes;
    unsigned int remaining = threadsInBlock - (linearTid - lane);
    unsigned int lanesInWarp = remaining < kWarpLanes ? remaining : kWarpLanes;

    for (unsigned int offset = kWarpLanes / 2; offset > 0; offset >>= 1)
    {
        // Shuffles are executed unconditionally by every thread of the warp.
        int otherSum = __shfl_down_sync(kFullMask, partSum, offset);
        int otherMax = __shfl_down_sync(kFullMask, partMax, offset);
        int otherMin = __shfl_down_sync(kFullMask, partMin, offset);

        // Only fold in values that came from a lane that really exists; a
        // read from a missing lane of a partial warp is undefined.
        if (lane + offset < lanesInWarp)
        {
            partSum += otherSum;
            // Ternaries instead of max()/min(): the parameter names shadow
            // those functions inside this kernel.
            partMax = otherMax > partMax ? otherMax : partMax;
            partMin = otherMin < partMin ? otherMin : partMin;
        }
    }

    // Lane 0 now holds the warp's partial results. A warp with no in-range
    // element publishes identities, which leave the outputs unchanged.
    if (lane == 0)
    {
        atomicAdd(sum, partSum);
        atomicMax(max, partMax);
        atomicMin(min, partMin);
    }
}

/*
 * Host reference reduction over A[0..n).
 *
 * Stores the sum of the elements in *sum, the largest element in *max and
 * the smallest in *min. With n <= 0 the outputs are left at their starting
 * values: 0, INT_MIN and INT_MAX respectively.
 */
void reduceCPU(int *A, int n, int *sum, int *max, int *min)
{
    *sum = 0;
    *max = INT_MIN;
    *min = INT_MAX;

    int pos = 0;
    while (pos < n)
    {
        const int value = A[pos];
        ++pos;

        *sum += value;
        *max = (value > *max) ? value : *max;
        *min = (value < *min) ? value : *min;
    }
}

/* Terminates the program with a diagnostic when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Parses a strictly positive decimal vector size that fits in an int.
 * Returns 0 when the text is not a number, has trailing characters,
 * or is out of range.
 */
static int parseVectorSize(const char *text)
{
    char *end = NULL;
    long parsed = strtol(text, &end, 10);
    if (end == text || *end != '\0' || parsed <= 0 || parsed > INT_MAX)
        return 0;
    return (int)parsed;
}

/*
 * Benchmarks a sum/max/min reduction on the CPU and on the GPU.
 *
 * Usage: ./vector_sum_max_min <vector_size>
 *
 * Fills a vector with pseudo-random values in [0, 99], reduces it on the
 * host (timed with clock()) and on the device (timed with CUDA events around
 * the kernel only; host<->device copies are not part of the GPU time), prints
 * both result sets and checks that they agree.
 *
 * Returns EXIT_SUCCESS when the GPU results match the CPU results,
 * EXIT_FAILURE on bad arguments, allocation failure or a mismatch.
 */
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: ./vector_sum_max_min <vector_size>\n");
        return EXIT_FAILURE;
    }

    int n = parseVectorSize(argv[1]);
    if (n == 0)
    {
        fprintf(stderr,
                "Invalid vector size '%s': expected an integer in [1, %d]\n",
                argv[1], INT_MAX);
        return EXIT_FAILURE;
    }

    // Byte count in size_t: n * sizeof(int) does not fit in an int once n
    // exceeds INT_MAX / sizeof(int).
    size_t size = (size_t)n * sizeof(int);

    int *A = (int *)malloc(size);
    if (A == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }

    const int valueRange = 100;
    for (int i = 0; i < n; i++)
        A[i] = rand() % valueRange;

    // The sum is accumulated in an int on both host and device; flag sizes
    // for which the worst-case total no longer fits.
    if ((long long)n * (valueRange - 1) > INT_MAX)
        fprintf(stderr,
                "Warning: the sum of %d values may overflow a 32-bit int\n", n);

    int cpu_sum, cpu_max, cpu_min;

    clock_t start_cpu = clock();
    reduceCPU(A, n, &cpu_sum, &cpu_max, &cpu_min);
    clock_t end_cpu = clock();

    int *d_A = NULL, *d_sum = NULL, *d_max = NULL, *d_min = NULL;
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_sum, sizeof(int)), "cudaMalloc(d_sum)");
    checkCuda(cudaMalloc(&d_max, sizeof(int)), "cudaMalloc(d_max)");
    checkCuda(cudaMalloc(&d_min, sizeof(int)), "cudaMalloc(d_min)");

    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A to device");

    // Seed the device accumulators with the identity of each operation.
    int zero = 0, min_init = INT_MAX, max_init = INT_MIN;
    checkCuda(cudaMemcpy(d_sum, &zero, sizeof(int), cudaMemcpyHostToDevice),
              "initialise d_sum");
    checkCuda(cudaMemcpy(d_max, &max_init, sizeof(int), cudaMemcpyHostToDevice),
              "initialise d_max");
    checkCuda(cudaMemcpy(d_min, &min_init, sizeof(int), cudaMemcpyHostToDevice),
              "initialise d_min");

    cudaEvent_t start_gpu, stop_gpu;
    checkCuda(cudaEventCreate(&start_gpu), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop)");

    int threads = 256;
    // Ceil-division done in 64 bits so n + threads - 1 cannot overflow int.
    int blocks = (int)(((long long)n + threads - 1) / threads);

    checkCuda(cudaEventRecord(start_gpu), "cudaEventRecord(start)");
    reduceGPU<<<blocks, threads>>>(d_A, d_sum, d_max, d_min, n);
    // Kernel launches return nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "reduceGPU launch");
    checkCuda(cudaEventRecord(stop_gpu), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop_gpu), "reduceGPU execution");

    int gpu_sum, gpu_max, gpu_min;
    checkCuda(cudaMemcpy(&gpu_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost),
              "copy sum to host");
    checkCuda(cudaMemcpy(&gpu_max, d_max, sizeof(int), cudaMemcpyDeviceToHost),
              "copy max to host");
    checkCuda(cudaMemcpy(&gpu_min, d_min, sizeof(int), cudaMemcpyDeviceToHost),
              "copy min to host");

    float gpu_time = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu),
              "cudaEventElapsedTime");

    printf("Vector Size: %d\n", n);
    printf("CPU -> Sum=%d Max=%d Min=%d\n", cpu_sum, cpu_max, cpu_min);
    printf("CPU Time: %f ms\n",
           (double)(end_cpu - start_cpu) * 1000 / CLOCKS_PER_SEC);

    printf("GPU -> Sum=%d Max=%d Min=%d\n", gpu_sum, gpu_max, gpu_min);
    printf("GPU Time: %f ms\n", gpu_time);

    // The CPU reduction is the reference; report any disagreement.
    int verified =
        (gpu_sum == cpu_sum) && (gpu_max == cpu_max) && (gpu_min == cpu_min);
    if (!verified)
        fprintf(stderr, "Mismatch between CPU and GPU reduction results\n");

    checkCuda(cudaEventDestroy(start_gpu), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop_gpu), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_sum), "cudaFree(d_sum)");
    checkCuda(cudaFree(d_max), "cudaFree(d_max)");
    checkCuda(cudaFree(d_min), "cudaFree(d_min)");
    free(A);

    return verified ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting vector_sum_max_min.cu


In [20]:
!nvcc vector_sum_max_min.cu -o vector_sum_max_min
!./vector_sum_max_min 1000000

Vector Size: 1000000
CPU -> Sum=49498583 Max=99 Min=0
CPU Time: 3.547000 ms
GPU -> Sum=49498583 Max=99 Min=0
GPU Time: 0.253984 ms
