In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
%%writefile vectorAdd.cu

#include <cuda.h>
#include <stdio.h>
#include <cuda_runtime.h>

// Element-wise sum of two int arrays: c[k] = a[k] + b[k] for every k in [0, n).
//
// Each thread handles at most one element, selected by its flat 1D global
// index (block offset plus thread offset along x). Threads whose index is n or
// larger return without touching memory, so the launch may cover more threads
// than there are elements.
__global__ void Add(int *a, int *b, int *c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (idx >= n) {
        return;
    }

    c[idx] = b[idx] + a[idx];
}

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess. Guarded so that an existing
// definition of CUDA_CHECK is respected.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Reads n pairs of integers from stdin, adds them element-wise on the GPU
 * with the Add kernel and prints one "x + y = z" line per pair.
 *
 * Returns 0 on success. Terminates with a non-zero status and a message on
 * stderr if host allocation fails, an input value cannot be parsed, or any
 * CUDA call or the kernel launch reports an error. Without these checks a
 * failed launch would go unnoticed and uninitialised memory would be printed.
 */
int main() {
    int n = 2;
    const size_t bytes = (size_t)n * sizeof(int);

    // Allocate memory on host
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Initialize data on host; reject anything that is not an integer
    for (int i = 0; i < n; ++i) {
        printf("Enter a[%d]: ", i);
        if (scanf("%d", &a[i]) != 1) {
            fprintf(stderr, "Invalid input for a[%d]\n", i);
            free(a);
            free(b);
            free(c);
            return EXIT_FAILURE;
        }
        printf("Enter b[%d]: ", i);
        if (scanf("%d", &b[i]) != 1) {
            fprintf(stderr, "Invalid input for b[%d]\n", i);
            free(a);
            free(b);
            free(c);
            return EXIT_FAILURE;
        }
    }

    // Allocate memory on device (GPU)
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch the kernel on the GPU: ceil-divide so every element gets a thread
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    Add<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy results back from device to host. This blocking copy also waits
    // for the kernel and reports any error raised while it was running.
    CUDA_CHECK(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Print the result
    for (int i = 0; i < n; ++i) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Free memory
    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}


Overwriting vectorAdd.cu


In [None]:

!nvcc vectorAdd.cu -o vecAdd

In [None]:
!./vecAdd

Enter a[0]: 5
Enter b[0]: 6
Enter a[1]: 11
Enter b[1]: 3
5 + 6 = 11
11 + 3 = 14


In [None]:
%%writefile Mat_Mul.cu

#include <cuda.h>
#include <stdio.h>
#include <cuda_runtime.h>

#define N 3

// Integer matrix product c = a * b for square n x n matrices stored row-major
// in flat arrays.
//
// Launch layout: 2D. The x component of the global thread index selects the
// output row and the y component selects the output column; each thread
// computes exactly one element of c. Threads that fall outside the n x n
// range do nothing, so the grid may be larger than the matrix.
__global__ void matrix_mul(int *a, int *b, int *c, int n) {
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;

    // Skip threads that do not map to an output element.
    if (row >= n || col >= n) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    const int *lhsRow = a + row * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += lhsRow[k] * b[k * n + col];
    }

    c[row * n + col] = acc;
}

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess. Guarded so that an existing
// definition of CUDA_CHECK is respected.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Multiplies two hard-coded N x N integer matrices on the GPU with the
 * matrix_mul kernel and prints the product row by row.
 *
 * Returns 0 on success. Terminates with a non-zero status and a message on
 * stderr if host allocation fails or any CUDA call or the kernel launch
 * reports an error. Without these checks a failed launch or copy goes
 * unnoticed and the program prints whatever happens to be in the result
 * buffer instead of the product.
 */
int main() {
    int n = N;
    const size_t size = (size_t)n * n * sizeof(int);

    // Allocate memory for matrices A, B, and C on the host
    int *a = (int *)malloc(size);
    int *b = (int *)malloc(size);
    int *c = (int *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Initialize matrices A and B with hard-coded values
    int a_values[N][N] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
    int b_values[N][N] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};

    // Flatten the 2D initialisers into the row-major host buffers
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i * N + j] = a_values[i][j];
            b[i * N + j] = b_values[i][j];
        }
    }

    // Allocate memory for matrices A, B, and C on the device
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_c, size));

    // Copy matrices A and B from host to device
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // Define grid and block dimensions. A fixed 16 x 16 tile (256 threads)
    // keeps the block within the per-block thread limit for any N; the grid
    // is ceil-divided and the kernel's bounds guard handles the surplus.
    const int tile = 16;
    dim3 blockSize(tile, tile);
    dim3 gridSize((n + tile - 1) / tile, (n + tile - 1) / tile);

    // Launch kernel
    matrix_mul<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy result matrix C from device to host. This blocking copy also
    // waits for the kernel and reports any error raised while it was running.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    // Print the result matrix C
    printf("Result matrix:\n");
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            printf("%d ", c[i * n + j]);
        printf("\n");
    }

    // Free device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    // Free host memory
    free(a);
    free(b);
    free(c);

    return 0;
}


Writing Mat_Mul.cu


In [None]:
!nvcc Mat_Mul.cu -o mat_mul

In [None]:
!./mat_mul

Result matrix:
0 0 0 
0 0 0 
0 0 0 


In [None]:
%%writefile BFS.cu

#include <stdio.h>

#define MAX_VERTICES 100

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess. Guarded so that an existing
// definition of CUDA_CHECK is respected.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Level-by-level breadth-first search over a dense V x V adjacency matrix.
 *
 * Parameters (all device pointers):
 *   graph   - row-major V x V matrix; non-zero at [u * V + v] means edge u-v.
 *   visited - V flags; a vertex is claimed by flipping its flag from 0 to 1.
 *   queue   - V slots; on entry slots [*front, *rear) hold the start
 *             vertices, on exit slots [0, *rear) hold every reached vertex
 *             in the order it was discovered.
 *   front   - index of the first unprocessed queue slot.
 *   rear    - index one past the last filled queue slot.
 *   V       - number of vertices.
 *
 * Launch layout: any 1D launch is accepted, but only block 0 does work.
 * The algorithm needs a barrier between levels and __syncthreads() only
 * synchronises one block, so additional blocks return immediately.
 *
 * Each vertex is enqueued at most once because enqueueing is gated on an
 * atomic 0 -> 1 transition of its visited flag; the queue therefore never
 * needs more than V slots. The order of vertices inside one level depends
 * on thread scheduling and is not fixed.
 */
__global__ void BFS(int *graph, int *visited, int *queue, int *front, int *rear, int V) {
    if (blockIdx.x != 0) {
        return;
    }

    // Bounds of the level currently being expanded, shared by the block.
    __shared__ int levelBegin;
    __shared__ int levelEnd;

    while (true) {
        // One thread snapshots the queue bounds so all threads agree on them.
        if (threadIdx.x == 0) {
            levelBegin = *front;
            levelEnd = *rear;
        }
        __syncthreads();

        const int begin = levelBegin;
        const int end = levelEnd;
        // Same values for every thread, so the whole block leaves together.
        if (begin >= end) {
            break;
        }

        // Threads stride over the vertices of the current level.
        for (int slot = begin + threadIdx.x; slot < end; slot += blockDim.x) {
            const int u = queue[slot];
            // Covers a start vertex whose flag the caller did not pre-set.
            atomicExch(&visited[u], 1);

            for (int v = 0; v < V; v++) {
                // atomicCAS lets exactly one thread claim vertex v.
                if (graph[(size_t)u * V + v] && atomicCAS(&visited[v], 0, 1) == 0) {
                    const int tail = atomicAdd(rear, 1);
                    queue[tail] = v;
                }
            }
        }

        // Every enqueue of this level must be complete before the next
        // level's bounds are read.
        __syncthreads();
        if (threadIdx.x == 0) {
            *front = end;
        }
    }
}

/*
 * Reads an undirected graph (vertex count, edge count, edge list with
 * 0-based vertex ids) and a start vertex from stdin, runs the BFS kernel and
 * prints the vertices reachable from the start in the order the kernel
 * discovered them.
 *
 * Returns 0 on success. Terminates with a non-zero status and a message on
 * stderr on malformed or out-of-range input, host allocation failure, or
 * any CUDA error.
 */
int main() {
    int V, E, i, j;

    printf("Enter the number of vertices: ");
    if (scanf("%d", &V) != 1 || V <= 0) {
        fprintf(stderr, "Number of vertices must be a positive integer\n");
        return EXIT_FAILURE;
    }

    printf("Enter the number of edges: ");
    if (scanf("%d", &E) != 1 || E < 0) {
        fprintf(stderr, "Number of edges must be a non-negative integer\n");
        return EXIT_FAILURE;
    }

    const size_t graphBytes = (size_t)V * V * sizeof(int);
    const size_t vertexBytes = (size_t)V * sizeof(int);

    // calloc so that absent edges are 0 and every visited flag starts clear.
    int *graph = (int *)calloc((size_t)V * V, sizeof(int));
    int *visited = (int *)calloc(V, sizeof(int));
    int *queue = (int *)calloc(V, sizeof(int));
    if (graph == NULL || visited == NULL || queue == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(graph);
        free(visited);
        free(queue);
        return EXIT_FAILURE;
    }

    printf("Enter the graph connections (source, destination) for %d edges:\n", E);
    for (int e = 0; e < E; e++) {
        // Endpoints index the adjacency matrix, so they must lie in [0, V).
        if (scanf("%d %d", &i, &j) != 2 || i < 0 || i >= V || j < 0 || j >= V) {
            fprintf(stderr, "Edge %d is invalid: endpoints must be in [0, %d)\n", e, V);
            free(graph);
            free(visited);
            free(queue);
            return EXIT_FAILURE;
        }
        graph[(size_t)i * V + j] = graph[(size_t)j * V + i] = 1;
    }

    int start;
    printf("Enter the starting vertex for BFS: ");
    if (scanf("%d", &start) != 1 || start < 0 || start >= V) {
        fprintf(stderr, "Starting vertex must be in [0, %d)\n", V);
        free(graph);
        free(visited);
        free(queue);
        return EXIT_FAILURE;
    }

    // Mark the start node as visited and enqueue it BEFORE uploading, so the
    // device sees a non-empty queue.
    int front = 0, rear = 0;
    visited[start] = 1;
    queue[rear++] = start;

    int *d_graph = NULL, *d_visited = NULL, *d_queue = NULL, *d_front = NULL, *d_rear = NULL;
    CUDA_CHECK(cudaMalloc(&d_graph, graphBytes));
    CUDA_CHECK(cudaMalloc(&d_visited, vertexBytes));
    CUDA_CHECK(cudaMalloc(&d_queue, vertexBytes));
    CUDA_CHECK(cudaMalloc(&d_front, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_rear, sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_graph, graph, graphBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_visited, visited, vertexBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_queue, queue, vertexBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_front, &front, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_rear, &rear, sizeof(int), cudaMemcpyHostToDevice));

    // Launch the BFS kernel. One block is enough: the kernel only uses
    // block 0 because it relies on a block-level barrier between levels.
    BFS<<<1, 256>>>(d_graph, d_visited, d_queue, d_front, d_rear, V);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Wait for kernel to finish and surface any error raised while it ran
    CUDA_CHECK(cudaDeviceSynchronize());

    // The filled part of the queue is the traversal order.
    CUDA_CHECK(cudaMemcpy(queue, d_queue, vertexBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&rear, d_rear, sizeof(int), cudaMemcpyDeviceToHost));

    // Print BFS traversal sequence
    printf("Breadth First Search Traversal: ");
    for (i = 0; i < rear && i < V; i++) {
        printf("%d ", queue[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaFree(d_graph));
    CUDA_CHECK(cudaFree(d_visited));
    CUDA_CHECK(cudaFree(d_queue));
    CUDA_CHECK(cudaFree(d_front));
    CUDA_CHECK(cudaFree(d_rear));
    free(graph);
    free(visited);
    free(queue);

    return 0;
}



Overwriting BFS.cu


In [None]:
!nvcc BFS.cu -o bfs

In [None]:
!./bfs

BFS traversal: 0 0 0 0 0 
