In [1]:
!nvidia-smi

Fri Feb  6 09:25:43 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
%%writefile matrix_add.cu
#include <stdio.h>
#include <cuda.h>

// Macro for error checking.
// Fetches the most recent CUDA runtime error with cudaGetLastError() (which
// also resets it). If an error is pending, prints the file, line and error
// string to stderr and makes the ENCLOSING function `return 1`, so it may only
// be used inside functions that return int.
// The do { ... } while (0) wrapper makes `cudaCheckError();` a single
// statement, so it is safe inside an unbraced if/else.
#define cudaCheckError() do {                                       \
 cudaError_t e=cudaGetLastError();                                 \
 if(e!=cudaSuccess) {                                              \
   fprintf(stderr,"Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
   return 1;                                                       \
 }                                                                 \
} while (0)

// Element-wise addition of two flattened integer matrices: c[i] = a[i] + b[i].
//
// Parameters:
//   a, b - device pointers to the inputs, each holding at least n ints (read only)
//   c    - device pointer to the output, at least n ints (may alias a or b)
//   n    - number of elements to process; n <= 0 does nothing
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
// thread starts at its global index and advances by the total number of
// launched threads until it passes n, so a grid smaller than n still covers
// every element. No shared memory is used.
__global__ void matrixAdd(const int *a, const int *b, int *c, int n) {
    // 64-bit indices so neither the start index nor the stride can wrap.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         id < n;
         id += stride) {
        c[id] = a[id] + b[id];
    }
}

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// step (`what`) and the CUDA error string on stderr and returns false.
// Returning bool lets callers chain calls with && and stop at the first error.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "Cuda failure (%s): '%s'\n", what, cudaGetErrorString(status));
    return false;
}

// Reads `count` whitespace-separated integers from stdin into `dst`.
// Returns false if the input ends early or contains a non-integer token.
static bool readInts(int *dst, int count) {
    for (int i = 0; i < count; i++) {
        if (scanf("%d", &dst[i]) != 1) return false;
    }
    return true;
}

// Computes h_c = h_a + h_b element-wise on the GPU for n (> 0) elements.
// Every CUDA call is checked; device buffers are released on all paths.
// Returns true on success, false if any CUDA step failed (already reported).
static bool addOnDevice(const int *h_a, const int *h_b, int *h_c, int n) {
    const size_t bytes = (size_t)n * sizeof(int);
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    // Allocate device memory and copy the inputs; && stops at the first failure.
    bool ok =
        cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc d_a") &&
        cudaOk(cudaMalloc(&d_b, bytes), "cudaMalloc d_b") &&
        cudaOk(cudaMalloc(&d_c, bytes), "cudaMalloc d_c") &&
        cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy A to device") &&
        cudaOk(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        const int blockSize = 256;
        // Ceil-div written so it cannot overflow when n is close to INT_MAX.
        const int gridSize = n / blockSize + (n % blockSize != 0 ? 1 : 0);

        // Print debug info
        printf("Launching kernel (Grid: %d, Block: %d) on %d elements...\n", gridSize, blockSize, n);

        matrixAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&          // bad launch configuration
             cudaOk(cudaDeviceSynchronize(), "kernel execution") &&  // faults while running
             cudaOk(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Pointers that were never allocated are still NULL, which cudaFree accepts.
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return ok;
}

// Reads "rows cols" followed by the elements of matrices A and B (row-major)
// from stdin, adds them on the GPU and prints the result matrix to stdout.
// Diagnostics go to stderr. Returns 0 on success, 1 on any input, allocation
// or CUDA error.
int main() {
    int rows, cols;

    // 1. Get Matrix Size
    if(scanf("%d", &rows) != 1) { fprintf(stderr, "Error reading rows\n"); return 1; }
    if(scanf("%d", &cols) != 1) { fprintf(stderr, "Error reading cols\n"); return 1; }
    if (rows <= 0 || cols <= 0) {
        fprintf(stderr, "Error: rows and cols must be positive (got %d x %d)\n", rows, cols);
        return 1;
    }

    // The kernel receives the element count as a (32-bit) int, so compute the
    // product in 64 bits and reject sizes that would not fit.
    const long long total = (long long)rows * cols;
    if (total > 2147483647LL) {
        fprintf(stderr, "Error: matrix of %d x %d elements is too large\n", rows, cols);
        return 1;
    }
    const int n = (int)total;
    const size_t bytes = (size_t)n * sizeof(int);

    // 2. Allocate Host Memory
    int *h_a = (int*)malloc(bytes);
    int *h_b = (int*)malloc(bytes);
    int *h_c = (int*)malloc(bytes);

    int status = 1;
    if (!h_a || !h_b || !h_c) {
        fprintf(stderr, "Error: could not allocate host memory\n");
    } else if (!readInts(h_a, n) || !readInts(h_b, n)) {
        // 3. Read Input (A first, then B)
        fprintf(stderr, "Error reading matrix elements\n");
    } else if (addOnDevice(h_a, h_b, h_c, n)) {
        // 4. Print Result
        printf("\nResult Matrix C (Addition):\n");
        for(int i = 0; i < rows; i++) {
            for(int j = 0; j < cols; j++) {
                printf("%d\t", h_c[i * cols + j]);
            }
            printf("\n");
        }
        status = 0;
    }

    // Free memory (free(NULL) is a no-op, so partial allocation is fine)
    free(h_a); free(h_b); free(h_c);
    return status;
}

Overwriting matrix_add.cu


In [15]:
!nvcc -arch=sm_75 matrix_add.cu -o matrix_add
!echo "2 2 1 2 3 4 5 6 7 8" | ./matrix_add

Launching kernel (Grid: 1, Block: 256) on 4 elements...

Result Matrix C (Addition):
6	8	
10	12	


In [17]:
%%writefile matrix_mul.cu
#include <stdio.h>
#include <cuda.h>

// Macro for error checking.
// Fetches the most recent CUDA runtime error with cudaGetLastError() (which
// also resets it). If an error is pending, prints the file, line and error
// string to stderr and makes the ENCLOSING function `return 1`, so it may only
// be used inside functions that return int.
// The do { ... } while (0) wrapper makes `cudaCheckError();` a single
// statement, so it is safe inside an unbraced if/else.
#define cudaCheckError() do {                                       \
 cudaError_t e=cudaGetLastError();                                 \
 if(e!=cudaSuccess) {                                              \
   fprintf(stderr,"Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
   return 1;                                                       \
 }                                                                 \
} while (0)

// CUDA Kernel for Matrix Multiplication: C = A * B, all matrices row-major.
//
// Parameters:
//   a     - device pointer to A, rowsA x colsA ints (read only)
//   b     - device pointer to B, colsA x colsB ints (read only)
//   c     - device pointer to C, rowsA x colsB ints (written)
//   rowsA - rows of A and C
//   colsA - columns of A, which is also the number of rows of B
//   colsB - columns of B and C
//
// Launch layout: 2D grid of 2D blocks; x indexes output columns and y indexes
// output rows. Any grid/block shape is valid: each thread walks the output in
// steps of the total launched threads per dimension, so a grid smaller than
// the output still fills every element. No shared memory is used.
// Each output element is accumulated in an int, like the inputs.
__global__ void matrixMul(const int *a, const int *b, int *c, int rowsA, int colsA, int colsB) {
    // 64-bit indices: row * colsA can exceed the int range for large matrices.
    const long long rowStride = (long long)gridDim.y * blockDim.y;
    const long long colStride = (long long)gridDim.x * blockDim.x;
    const long long firstRow = (long long)blockIdx.y * blockDim.y + threadIdx.y;
    const long long firstCol = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (long long row = firstRow; row < rowsA; row += rowStride) {
        for (long long col = firstCol; col < colsB; col += colStride) {
            int sum = 0;
            for (long long k = 0; k < colsA; k++) {
                sum += a[row * colsA + k] * b[k * colsB + col];
            }
            c[row * colsB + col] = sum;
        }
    }
}

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// step (`what`) and the CUDA error string on stderr and returns false.
// Returning bool lets callers chain calls with && and stop at the first error.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "Cuda failure (%s): '%s'\n", what, cudaGetErrorString(status));
    return false;
}

// Stores rows * cols in *count. Both arguments must be positive.
// Returns false when the product does not fit the (32-bit) int range, which
// keeps the int-based element indexing used for printing and launching valid.
static bool elementCount(int rows, int cols, size_t *count) {
    const long long total = (long long)rows * cols;
    if (total > 2147483647LL) return false;
    *count = (size_t)total;
    return true;
}

// Prompts for and reads `count` whitespace-separated integers from stdin into
// `dst`. `label` names the matrix in the prompt ("A" or "B").
// Returns false if the input ends early or contains a non-integer token.
static bool readMatrix(const char *label, int *dst, int rows, int cols, size_t count) {
    printf("\nEnter Matrix %s elements (%d x %d):\n", label, rows, cols);
    for (size_t i = 0; i < count; i++) {
        if (scanf("%d", &dst[i]) != 1) return false;
    }
    return true;
}

// Computes h_c (r1 x c2) = h_a (r1 x c1) * h_b (c1 x c2) on the GPU.
// Every CUDA call is checked; device buffers are released on all paths.
// Returns true on success, false if any CUDA step failed (already reported).
static bool multiplyOnDevice(const int *h_a, const int *h_b, int *h_c, int r1, int c1, int c2) {
    // Widen before multiplying so the byte counts cannot overflow int.
    const size_t bytes_a = (size_t)r1 * c1 * sizeof(int);
    const size_t bytes_b = (size_t)c1 * c2 * sizeof(int);
    const size_t bytes_c = (size_t)r1 * c2 * sizeof(int);
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    // Allocate device memory and copy the inputs; && stops at the first failure.
    bool ok =
        cudaOk(cudaMalloc(&d_a, bytes_a), "cudaMalloc d_a") &&
        cudaOk(cudaMalloc(&d_b, bytes_b), "cudaMalloc d_b") &&
        cudaOk(cudaMalloc(&d_c, bytes_c), "cudaMalloc d_c") &&
        cudaOk(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice), "copy A to device") &&
        cudaOk(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // One thread per output element: x covers columns of C, y covers rows.
        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((c2 + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (r1 + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // dim3 members are unsigned, hence %u.
        printf("\nLaunching kernel with Grid(%u, %u)...\n", blocksPerGrid.x, blocksPerGrid.y);

        matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, r1, c1, c2);
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&          // bad launch configuration
             cudaOk(cudaDeviceSynchronize(), "kernel execution") &&  // faults while running
             cudaOk(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Pointers that were never allocated are still NULL, which cudaFree accepts.
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return ok;
}

// Reads the dimensions and elements (row-major) of matrices A and B from
// stdin, multiplies them on the GPU and prints the product to stdout.
// Diagnostics go to stderr. Returns 0 on success, -1 when the dimensions are
// incompatible, and 1 on any other input, allocation or CUDA error.
int main() {
    int r1, c1, r2, c2;

    printf("--- Matrix Multiplication ---\n");

    // 1. Input Dimensions
    printf("Matrix A - Enter rows and columns: ");
    if(scanf("%d %d", &r1, &c1) != 2) return 1;

    printf("Matrix B - Enter rows and columns: ");
    if(scanf("%d %d", &r2, &c2) != 2) return 1;

    // 2. Check compatibility
    if (c1 != r2) {
        fprintf(stderr, "Error: Columns of A (%d) must match Rows of B (%d)\n", c1, r2);
        return -1;
    }
    // r2 == c1 at this point, so checking r1, c1 and c2 covers every dimension.
    if (r1 <= 0 || c1 <= 0 || c2 <= 0) {
        fprintf(stderr, "Error: all matrix dimensions must be positive\n");
        return 1;
    }

    size_t count_a, count_b, count_c;
    if (!elementCount(r1, c1, &count_a) ||
        !elementCount(r2, c2, &count_b) ||
        !elementCount(r1, c2, &count_c)) {
        fprintf(stderr, "Error: matrix dimensions are too large\n");
        return 1;
    }

    // 3. Allocate Host Memory
    int *h_a = (int*)malloc(count_a * sizeof(int));
    int *h_b = (int*)malloc(count_b * sizeof(int));
    int *h_c = (int*)malloc(count_c * sizeof(int));

    int status = 1;
    if (!h_a || !h_b || !h_c) {
        fprintf(stderr, "Error: could not allocate host memory\n");
    } else if (!readMatrix("A", h_a, r1, c1, count_a) ||
               !readMatrix("B", h_b, r2, c2, count_b)) {
        // 4. Input Elements (A first, then B)
        fprintf(stderr, "Error reading matrix elements\n");
    } else if (multiplyOnDevice(h_a, h_b, h_c, r1, c1, c2)) {
        // 5. Display Result
        printf("\nResult Matrix C (Multiplication):\n");
        for (int i = 0; i < r1; i++) {
            for (int j = 0; j < c2; j++) {
                printf("%d\t", h_c[i * c2 + j]);
            }
            printf("\n");
        }
        status = 0;
    }

    // 6. Free Memory (free(NULL) is a no-op, so partial allocation is fine)
    free(h_a); free(h_b); free(h_c);

    return status;
}

Writing matrix_mul.cu


In [18]:
!nvcc -arch=sm_75 matrix_mul.cu -o matrix_mul
!echo "2 2 2 2 1 2 3 4 2 0 1 2" | ./matrix_mul

--- Matrix Multiplication ---
Matrix A - Enter rows and columns: Matrix B - Enter rows and columns: 
Enter Matrix A elements (2 x 2):

Enter Matrix B elements (2 x 2):

Launching kernel with Grid(1, 1)...

Result Matrix C (Multiplication):
4	4	
10	8	
