In [23]:
!nvidia-smi

Wed May  7 18:01:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
%%writefile add.cu
#include <iostream>
#include<cuda.h>
using namespace std;

__global__ void add(int* A, int* B, int* C, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        C[tid] = A[tid] + B[tid];
    }
}

void initialize(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        vector[i] = rand() % 10;
    }
}

void print(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        cout << vector[i] << " ";
    }
    cout << endl;
}

int main() {
    int N = 4;
    int* A = new int[N];
    int* B = new int[N];
    int* C = new int[N];

    initialize(A, N);
    initialize(B, N);

    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    int *X, *Y, *Z;
    size_t bytes = N * sizeof(int);
    cudaMalloc((void **)&X, bytes);
    cudaMalloc((void **)&Y, bytes);
    cudaMalloc((void **)&Z, bytes);

    cudaMemcpy(X, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Y, B, bytes, cudaMemcpyHostToDevice);

    dim3 dimBlock(256);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);

    add<<<dimGrid, dimBlock>>>(X, Y, Z, N);
    cudaDeviceSynchronize();

    cudaMemcpy(C, Z, bytes, cudaMemcpyDeviceToHost);

    cout << "Addition: ";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}

Overwriting add.cu


In [19]:
!nvcc add.cu -arch=sm_70 -o add

In [20]:
!./add.out

Vector A: 3 6 7 5 
Vector B: 3 5 6 2 
Addition: 6 11 13 7 


In [40]:
%%writefile multi.cu
#include<iostream>
#include<cuda.h>

using namespace std;

__global__ void multi(int* A,int* B,int* C,int size){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if(i < size && j < size){
        int sum = 0;
        for(int k = 0;k<size;k++){
            sum += A[i* size + k] * B[k*size+j];
        }
        C[i*size+j] = sum;
    }
}

void display(int* A,int size){
  for(int i = 0;i<size;i++){
    for (int j = 0;j<size;j++){
      cout<< A[i*size + j]<< " ";
    }
    cout<<endl;
  }
  cout<<endl;
}


int main(){
    // decalre cpu mem
    int N = 16 *16;
    int* A = new int[N];
    int* B = new int[N];
    int* C = new int[N];

    // initialize cpu arr
    for ( int i = 0;i<N;i++){
      A[i] = rand();
      B[i] = rand();
    }

    // decalre GPU mem
    // needs size in bytes
    int size = N * sizeof(int);
    int *X,*Y,*Z;

    cudaMalloc(&X,size);
    cudaMalloc(&Y,size);
    cudaMalloc(&Z,size);

    // copy to gpu
    cudaMemcpy(X,A,size,cudaMemcpyHostToDevice);
    cudaMemcpy(Y,B,size,cudaMemcpyHostToDevice);

    // initialize dimGrid and dimBlock
    dim3 dimBlock(16,16);
    dim3 dimGrid((N + dimBlock.x -1)/dimBlock.x,(N + dimBlock.x -1)/dimBlock.x);

    // call kernal fun
    multi<<<dimGrid,dimBlock>>>(X,Y,Z,N);

    // copy result to cpu mem
    cudaMemcpy(C,Z,size,cudaMemcpyDeviceToHost);

    //display from host
    cout<<"A"<<endl;
    display(A,16);
    cout<<"B"<<endl;
    display(B,16);
    cout<<"C"<<endl;
    display(C,16);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}

Overwriting mult.cu


In [41]:
!nvcc multi.cu -arch=sm_70 -o multi

In [42]:
!./multi.out

A
1804289383 1681692777 1957747793 719885386 596516649 1025202362 783368690 2044897763 1365180540 304089172 35005211 294702567 336465782 278722862 2145174067 1101513929 
1315634022 1369133069 1059961393 628175011 1131176229 859484421 608413784 1734575198 149798315 1129566413 412776091 1911759956 137806862 982906996 511702305 1937477084 
572660336 805750846 1100661313 1141616124 939819582 1998898814 610515434 1374344043 1477171087 945117276 1780695788 491705403 752392754 2053999932 1411549676 943947739 
855636226 1469348094 1036140795 2040651434 317097467 1376710097 1330573317 1687926652 959997301 402724286 1194953865 364228444 221558440 1063958031 2114738097 1469834481 
1610120709 631704567 1255179497 327254586 269455306 352406219 160051528 112805732 378409503 1713258270 1409959708 1373226340 200747796 1117142618 150122846 990892921 
1231192379 111537764 2147469841 1911165193 2142757034 1869470124 8936987 1275373743 350322227 1960709859 771151432 1244316437 1476153275 1139901474 653468