In [1]:
import torch
# Pick the compute device once: use the GPU when PyTorch reports CUDA support,
# otherwise fall back to the CPU, and tell the reader which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA is available" if device.type == "cuda" else "CUDA is not available")

CUDA is available


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-pshpkb9h
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-pshpkb9h
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10742 sha256=fdb30b442a03a2b978d94f242f4f3839c3b517111d4c5bcc31496f40b5aacbba
  Stored in directory: /tmp/pip-ephem-wheel-cache-e05jnund/wheels/ef/1d/c6/f7e47f1aa1bc9d05c4120d94f90a79cf28603ef343b0dd43ff
Successfully bu

In [3]:
# Print the version of the CUDA compiler (nvcc) found on this runtime's PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [4]:
%%writefile vector_multiplication.cu
// vector_multiplication.cu - compares CPU and GPU element-wise vector multiplication
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>

// CUDA kernel: each thread handles one element, writing z[i] = x[i] * y[i].
__global__ void mult_vect_gpu(float *x, float *y, float *z, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose global index is at or beyond n have no element to process.
    if (gid >= n) {
        return;
    }

    z[gid] = x[gid] * y[gid];
}

// CPU reference implementation: z[k] = x[k] * y[k] for every k in [0, n).
void mult_vect_cpu(float *x, float *y, float *z, int n) {
    int k = 0;
    while (k < n) {
        z[k] = x[k] * y[k];
        ++k;
    }
}

// Elapsed wall-clock time from `start` to `end`, in seconds (fractional part from microseconds).
double time_diff(struct timeval start, struct timeval end) {
    const time_t whole_seconds = end.tv_sec - start.tv_sec;
    const suseconds_t micro_part = end.tv_usec - start.tv_usec;  // may be negative
    return whole_seconds + micro_part / 1000000.0;
}

// Abort the program with a readable message if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocate `bytes` bytes of host memory for a float vector, aborting on failure.
static float *alloc_host_floats(size_t bytes, const char *name) {
    float *ptr = (float *)malloc(bytes);
    if (ptr == NULL) {
        fprintf(stderr, "Host allocation failed for %s (%zu bytes)\n", name, bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Multiplies two vectors element-wise on the CPU and on the GPU, checks that both
// results agree, and prints the two timings plus the CPU/GPU speedup.
// Returns 0 on success and EXIT_FAILURE if the results differ; allocation or CUDA
// errors terminate the program through the helpers above.
int main() {
    float *x_h, *y_h, *z_h_cpu, *z_h_gpu;
    float *x_d, *y_d, *z_d;
    int n = 1000000;  // Use a larger n for better timing comparison
    size_t size = n * sizeof(float);
    int exit_code = 0;

    // Allocate memory (host allocations are checked; a NULL here would crash later)
    x_h = alloc_host_floats(size, "x_h");
    y_h = alloc_host_floats(size, "y_h");
    z_h_cpu = alloc_host_floats(size, "z_h_cpu");
    z_h_gpu = alloc_host_floats(size, "z_h_gpu");

    check_cuda(cudaMalloc((void **)&x_d, size), "cudaMalloc x_d");
    check_cuda(cudaMalloc((void **)&y_d, size), "cudaMalloc y_d");
    check_cuda(cudaMalloc((void **)&z_d, size), "cudaMalloc z_d");

    // Initialize vectors
    for (int i = 0; i < n; i++) {
        x_h[i] = (float)i;
        y_h[i] = (float)(i + 1);
    }

    // ========== CPU Execution ==========
    struct timeval start_cpu, end_cpu;
    gettimeofday(&start_cpu, NULL);
    mult_vect_cpu(x_h, y_h, z_h_cpu, n);
    gettimeofday(&end_cpu, NULL);
    double time_cpu = time_diff(start_cpu, end_cpu);

    // ========== GPU Execution ==========
    // The host-to-device copies happen before the timer starts, so they are not
    // part of the reported GPU time.
    check_cuda(cudaMemcpy(x_d, x_h, size, cudaMemcpyHostToDevice), "copy x to device");
    check_cuda(cudaMemcpy(y_d, y_h, size, cudaMemcpyHostToDevice), "copy y to device");

    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;  // ceil(n / block_size)

    struct timeval start_gpu, end_gpu;
    gettimeofday(&start_gpu, NULL);
    mult_vect_gpu<<<num_blocks, block_size>>>(x_d, y_d, z_d, n);
    cudaError_t launch_status = cudaGetLastError();     // launch-configuration errors
    cudaError_t sync_status = cudaDeviceSynchronize();  // Ensure kernel finishes before timing ends
    gettimeofday(&end_gpu, NULL);
    double time_gpu = time_diff(start_gpu, end_gpu);

    // Report failures only after the timer has stopped so the checks stay out of the measurement.
    check_cuda(launch_status, "kernel launch");
    check_cuda(sync_status, "kernel execution");

    check_cuda(cudaMemcpy(z_h_gpu, z_d, size, cudaMemcpyDeviceToHost), "copy z to host");

    // ========== Verify Results ==========
    // Explicit comparison instead of assert(): assert is compiled out under NDEBUG,
    // which would silently skip the verification.
    for (int i = 0; i < n; i++) {
        if (z_h_cpu[i] != z_h_gpu[i]) {
            fprintf(stderr, "Mismatch at index %d: CPU %f vs GPU %f\n",
                    i, z_h_cpu[i], z_h_gpu[i]);
            exit_code = EXIT_FAILURE;
            break;
        }
    }

    if (exit_code == 0) {
        // ========== Print Times ==========
        printf("\nCPU Time : %lf seconds\n", time_cpu);
        printf("GPU Time : %lf seconds\n\n", time_gpu);

        // ========== Print Speedup ==========
        // Guard the division: a measured GPU time of zero would otherwise print inf/nan.
        if (time_gpu > 0.0) {
            double speedup = time_cpu / time_gpu;
            printf("Speedup: %lf\n", speedup);
        } else {
            printf("Speedup: n/a (GPU time below timer resolution)\n");
        }
    }

    // ========== Cleanup ==========
    free(x_h);
    free(y_h);
    free(z_h_cpu);
    free(z_h_gpu);
    check_cuda(cudaFree(x_d), "cudaFree x_d");
    check_cuda(cudaFree(y_d), "cudaFree y_d");
    check_cuda(cudaFree(z_d), "cudaFree z_d");

    return exit_code;
}

Writing vector_multiplication.cu


In [5]:
# Compile the CUDA source written by the %%writefile cell above, then run the resulting binary.
# NOTE(review): -arch=sm_75 hard-codes the target GPU architecture — confirm it matches
# the GPU attached to this runtime before relying on the timings.
!nvcc -arch=sm_75 -o vector_multiplication vector_multiplication.cu
!./vector_multiplication


CPU Time : 0.004551 seconds
GPU Time : 0.000171 seconds

Speedup: 26.614035
