In [1]:
!nvcc --version
!nvidia-smi //NVIDIA System Management Interface.

'nvcc' is not recognized as an internal or external command,
operable program or batch file.
'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
%%writefile add.cu
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

#define N 4

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks; each thread handles at most one
// element, so the launch must supply at least `size` threads in total.
// A, B and C are dereferenced on the device and must each hold at least
// `size` ints. No shared memory is used.
__global__ void add(int* A, int* B, int* C, int size) {
    // Global thread ID: this thread's position across the whole 1D grid.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail: the grid may contain more threads than elements.
    if (tid < size) {
        C[tid] = A[tid] + B[tid];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Fills two N-element host arrays with random digits, adds them on the GPU
// with the `add` kernel, and prints the inputs and the result.
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed.
int main() {
    int A[N], B[N], C[N];

    // Initialize input arrays
    for (int i = 0; i < N; i++) {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    // Print input arrays
    std::cout << "A: ";
    for (int i = 0; i < N; i++) std::cout << A[i] << " ";
    std::cout << "\nB: ";
    for (int i = 0; i < N; i++) std::cout << B[i] << " ";
    std::cout << std::endl;

    // Allocate device memory. The pointers start as nullptr so the cleanup
    // at the end is safe even when an allocation fails part-way through.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    size_t bytes = N * sizeof(int);
    bool ok = cudaOk(cudaMalloc(&dA, bytes), "cudaMalloc(dA)") &&
              cudaOk(cudaMalloc(&dB, bytes), "cudaMalloc(dB)") &&
              cudaOk(cudaMalloc(&dC, bytes), "cudaMalloc(dC)");

    // Copy data to device
    ok = ok &&
         cudaOk(cudaMemcpy(dA, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dA)") &&
         cudaOk(cudaMemcpy(dB, B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dB)");

    // Launch kernel
    if (ok) {
        int threadsPerBlock = 256;
        // Number of blocks needed to cover all N elements (ceiling division).
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
        // Launches the add kernel on the GPU.
        add<<<blocksPerGrid, threadsPerBlock>>>(dA, dB, dC, N);
        // A launch returns no status itself: cudaGetLastError() reports a bad
        // launch configuration, the synchronize reports faults during execution.
        ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "add kernel execution");
    }

    // Copy result back
    ok = ok && cudaOk(cudaMemcpy(C, dC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> C)");

    // Print result (only when it is known to be valid)
    if (ok) {
        std::cout << "C: ";
        for (int i = 0; i < N; i++) std::cout << C[i] << " ";
        std::cout << std::endl;
    }

    // Cleanup (cudaFree on a null pointer performs no operation)
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    return ok ? 0 : 1;
}


Writing add.cu


When compiling with nvcc, specify the target architecture that matches the GPU you are using, for example -arch=sm_70 (Volta) or -arch=sm_60 (Pascal).


In [None]:
!nvcc -arch=sm_70 add.cu -o add
!./add

A: 3 7 3 6 
B: 6 5 5 2 
C: 9 12 8 8 


In [None]:
%%writefile matrix.cu
#include <iostream>
#define N 4

// Square integer matrix product C = A * B for row-major n x n matrices.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes columns and
// the y dimension indexes rows. Each in-range thread computes one element
// of C; threads that fall outside the n x n matrix do nothing.
__global__ void matrixMul(int *A, int *B, int *C, int n) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the matrix edge have no element to compute.
    if (r >= n || c >= n) {
        return;
    }

    // Offset of the first element of row r in a row-major n x n matrix.
    const int rowStart = r * n;

    // Dot product of row r of A with column c of B.
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += A[rowStart + k] * B[k * n + c];
    }

    C[rowStart + c] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Builds two N x N row-major matrices with A[i] = B[i] = i, multiplies them
// on the GPU with the `matrixMul` kernel, and prints the product row by row.
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed.
int main() {
    int A[N*N], B[N*N], C[N*N];
    for (int i = 0; i < N*N; i++) {
        A[i] = i;
        B[i] = i;
    }

    // Device buffers start as nullptr so the cleanup at the end is safe even
    // when an allocation fails part-way through.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    size_t bytes = N * N * sizeof(int);
    bool ok = cudaOk(cudaMalloc(&dA, bytes), "cudaMalloc(dA)") &&
              cudaOk(cudaMalloc(&dB, bytes), "cudaMalloc(dB)") &&
              cudaOk(cudaMalloc(&dC, bytes), "cudaMalloc(dC)");

    ok = ok &&
         cudaOk(cudaMemcpy(dA, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dA)") &&
         cudaOk(cudaMemcpy(dB, B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dB)");

    if (ok) {
        // 16 x 16 threads per block; ceiling division so the grid covers all
        // N rows and N columns.
        dim3 threads(16, 16);
        dim3 blocks((N + 15) / 16, (N + 15) / 16);
        matrixMul<<<blocks, threads>>>(dA, dB, dC, N);
        // A launch returns no status itself; a bad configuration is only
        // visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "matrixMul kernel launch");
    }

    // The blocking copy also waits for the kernel to finish, so faults that
    // happen during kernel execution are reported by this call.
    ok = ok && cudaOk(cudaMemcpy(C, dC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> C)");

    // Print the result only when it is known to be valid.
    if (ok) {
        std::cout << "Matrix C:\n";
        for (int i = 0; i < N * N; i++) {
            std::cout << C[i] << " ";
            if ((i + 1) % N == 0) std::cout << "\n";
        }
    }

    // cudaFree on a null pointer performs no operation.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    return ok ? 0 : 1;
}


Writing matrix.cu


In [None]:
!nvcc -arch=sm_70 matrix.cu -o matrix
!./matrix

Matrix C:
56 62 68 74 
152 174 196 218 
248 286 324 362 
344 398 452 506 
