RUHAAN HAWALDAR BE 21137

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [2]:
%%writefile vector_addition.cu
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>

#define N 500  // Number of elements

__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int a[N], b[N], c[N]; // Host arrays
    int *dev_a, *dev_b, *dev_c; // Device pointers

    // Allocate memory on GPU
    cudaError_t err = cudaMalloc((void**)&dev_a, N * sizeof(int));
    if (err != cudaSuccess) {
        printf("Failed to allocate memory on device: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * i;
    }

    // CUDA Events for Timing
    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    // Copy data from Host to Device
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Launch Kernel
    add<<<1, N>>>(dev_a, dev_b, dev_c);

    // Copy result back to Host
    err = cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("Failed to copy from device: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);

    // Calculate Execution Time
    float time = 0;
    cudaEventElapsedTime(&time, start, end);
    printf("Execution Time: %f ms\n", time);

    // Print some results
    for (int i = 0; i < 10; i++) { // Only print first 10 values
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Free device memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

Writing vector_addition.cu


In [3]:
!nvcc vector_addition.cu -o vector_addition
!./vector_addition

Failed to allocate memory on device: CUDA driver version is insufficient for CUDA runtime version


In [4]:
!nvprof ./vector_addition

Failed to allocate memory on device: CUDA driver version is insufficient for CUDA runtime version


In [5]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [6]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmph5c083gf".


In [7]:
%%writefile matrix_multiplication.cu

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define m 10

__global__ void mul_r(int *a, int *b, int *c){
    int tid = threadIdx.x;
    if (tid < m){
        c[tid]= a[tid] * b[tid];
    }
}

int main(){
    int n, c, d, fst[10][10], snd[10][10], t_snd[10][10];
    int row, col, sum_c, a[10], b[10], ans[10];

    n = m;  // square matrix only

    // Initialize first matrix with random values
    for (c = 0; c < m; c++) {
        for (d = 0; d < n; d++) {
            fst[c][d] = rand() % 10 + 1;
        }
    }

    printf("Elements of first matrix:\n");
    for (c = 0; c < m; c++) {
        for (d = 0; d < n; d++) {
            printf("%d\t", fst[c][d]);
        }
        printf("\n");
    }

    // Initialize second matrix with random values
    for (c = 0; c < m; c++) {
        for (d = 0; d < n; d++) {
            snd[c][d] = rand() % 10 + 1;
        }
    }

    printf("Elements of second matrix:\n");
    for (c = 0; c < m; c++) {
        for (d = 0; d < n; d++) {
            printf("%d\t", snd[c][d]);
        }
        printf("\n");
    }

    // Transpose of second matrix
    for (c = 0; c < m; c++) {
        for (d = 0; d < n; d++) {
            t_snd[d][c] = snd[c][d];
        }
    }

    printf("\nTranspose of second matrix:\n");
    for (c = 0; c < n; c++) {
        for (d = 0; d < m; d++) {
            printf("%d\t", t_snd[c][d]);
        }
        printf("\n");
    }

    // Allocate memory on GPU
    int *dev_a, *dev_b, *dev_ans;
    cudaMalloc((void**)&dev_a, m * sizeof(int));
    cudaMalloc((void**)&dev_b, m * sizeof(int));
    cudaMalloc((void**)&dev_ans, m * sizeof(int));

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    for (row = 0; row < m; row++) {
        for (d = 0; d < m; d++) {
            a[d] = fst[row][d];
        }
        cudaMemcpy(dev_a, a, m * sizeof(int), cudaMemcpyHostToDevice);

        for (col = 0; col < m; col++) {
            for (d = 0; d < m; d++) {
                b[d] = t_snd[col][d];
                ans[d] = 0;
            }
            cudaMemcpy(dev_b, b, m * sizeof(int), cudaMemcpyHostToDevice);
            cudaMemcpy(dev_ans, ans, m * sizeof(int), cudaMemcpyHostToDevice);

            mul_r<<<1, m>>>(dev_a, dev_b, dev_ans);
            cudaMemcpy(ans, dev_ans, m * sizeof(int), cudaMemcpyDeviceToHost);

            sum_c = 0;
            for (d = 0; d < m; d++) {
                sum_c += ans[d];
            }
            snd[row][col] = sum_c;
        }
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float time = 0;
    cudaEventElapsedTime(&time, start, end);
    printf("Execution time=%f ms\n", time);

    printf("Matrix multiplication result:\n");
    for (c = 0; c < n; c++) {
        for (d = 0; d < m; d++) {
            printf("%d\t", snd[c][d]);
        }
        printf("\n");
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_ans);

    return 0;
}


Writing matrix_multiplication.cu


In [8]:
!nvcc -o matrix_multiplication matrix_multiplication.cu
!./matrix_multiplication


Elements of first matrix:
4	7	8	6	4	6	7	3	10	2	
3	8	1	10	4	7	1	7	3	7	
2	9	8	10	3	1	3	4	8	6	
10	3	3	9	10	8	4	7	2	3	
10	4	2	10	5	8	9	5	6	1	
4	7	2	1	7	4	3	1	7	2	
6	6	5	8	7	6	7	10	4	8	
5	6	3	6	5	8	5	5	4	1	
8	9	7	9	9	5	4	2	5	10	
3	1	7	9	10	3	7	7	5	10	
Elements of second matrix:
6	1	5	9	8	2	8	3	8	3	
3	7	2	1	7	2	6	10	5	10	
1	10	2	8	8	2	2	6	10	8	
8	7	8	4	7	6	7	4	10	5	
9	2	3	10	4	10	1	9	9	6	
1	10	7	4	9	6	7	2	2	6	
10	9	5	9	2	1	4	1	5	5	
5	5	8	7	4	2	8	6	10	7	
3	2	8	9	6	8	5	2	9	6	
10	8	6	4	9	9	4	2	9	10	

Transpose of second matrix:
6	3	1	8	9	1	10	5	3	10	
1	7	10	7	2	10	9	5	2	8	
5	2	2	8	3	7	5	8	8	6	
9	1	8	4	10	4	9	7	9	4	
8	7	8	7	4	9	2	4	6	9	
2	2	2	6	10	6	1	2	8	9	
8	6	2	7	1	7	4	8	5	4	
3	10	6	4	9	2	1	6	2	2	
8	5	10	10	9	2	5	10	9	9	
3	10	8	5	6	6	5	7	6	10	
Execution time=0.000000 ms
Matrix multiplication result:
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	
0	0	0	0