<a href="https://colab.research.google.com/github/starsinc1708/2lab_pp/blob/master/4lab_cu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-32gvhnpl
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-32gvhnpl
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [19]:
%%cu

#include <cublas_v2.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads
// (blockDim.x * gridDim.x), so any grid/block configuration covers the whole
// range, including grids smaller than `size`.
//
// Parameters:
//   c    - output array, at least `size` elements, device-accessible
//   a, b - input arrays, at least `size` elements each, device-accessible
//   size - number of elements to process
//
// The index and stride are kept in size_t: a signed 32-bit index would go
// negative (after conversion) once it exceeds INT_MAX, and a 32-bit
// `index + stride` could wrap around for `size` close to UINT_MAX.
__global__ void addKernel(int* c, const int* a, const int* b, unsigned int size) {
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  // The loop condition already bounds the index, so no extra guard is needed.
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride) {
    c[i] = a[i] + b[i];
  }
}

// Alias for the kernel that main() launches via `kernel<<<...>>>(...)`.
// NOTE(review): presumably kept as a macro so a different kernel can be
// swapped in by editing this single line - confirm before removing.
#define kernel addKernel

// Reports a failed CUDA runtime call on stderr as "<what>: <CUDA error string>".
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Benchmarks the vector-addition kernel.
//
// Fills two host vectors of `n` ints with ones, copies them to the device,
// launches the kernel LAUNCH_COUNT times between two CUDA events, copies the
// result back and prints the average time per launch together with the first
// five and the last four elements.
//
// Returns EXIT_SUCCESS when every step succeeded and EXIT_FAILURE otherwise.
// All host/device buffers and events are released on every exit path.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // Task parameters: int, 15 000 000 (/4/16),
    // [(8192,512);(2048,512);(8192,256);(2048,256);(8192,128);(2048,128)]
    const int BLOCK_SIZE = 512;    // threads per block
    const int n = 15000000;        // number of vector elements
    const int BLOCK_NUMBER = 8192; // blocks in the grid
    const int LAUNCH_COUNT = 12;   // timed kernel launches that get averaged
    printf("Block_Number = %i\n", BLOCK_NUMBER);
    printf("Block_Size = %i\n", BLOCK_SIZE);
    printf("n = %i\n", n);
    const size_t n2b = (size_t)n * sizeof(int); // vector size in bytes
    printf("n2b = %zu\n", n2b);
    // Informational only: bytes covered by a single pass of the whole grid.
    const size_t n2 = (size_t)BLOCK_NUMBER * BLOCK_SIZE * sizeof(int);
    printf("n2 = %zu\n", n2);

    // Every resource is declared up front so the single cleanup section
    // below can release whatever was acquired before a failure.
    int* a = NULL;
    int* b = NULL;
    int* c = NULL;
    int* adev = NULL;
    int* bdev = NULL;
    int* cdev = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float gpuTime = 0.0f;
    int status = EXIT_FAILURE;

    do {
        // Host allocation. The buffers hold exactly n elements: their size
        // must follow the data size, not the launch configuration, otherwise
        // grids with fewer than n threads make the loops and copies below
        // run past the end of the buffers.
        a = (int*)calloc((size_t)n, sizeof(int));
        b = (int*)calloc((size_t)n, sizeof(int));
        c = (int*)calloc((size_t)n, sizeof(int));
        if (a == NULL || b == NULL || c == NULL) {
            fprintf(stderr, "Cannot allocate host arrays\n");
            break;
        }

        for (int i = 0; i < n; i++) {
            a[i] = 1;
            b[i] = 1;
        }

        // Device allocation
        if (!cudaOk(cudaMalloc((void**)&adev, n2b), "Cannot allocate device array for a")) break;
        if (!cudaOk(cudaMalloc((void**)&bdev, n2b), "Cannot allocate device array for b")) break;
        if (!cudaOk(cudaMalloc((void**)&cdev, n2b), "Cannot allocate device array for c")) break;

        // Timing events
        if (!cudaOk(cudaEventCreate(&start), "Cannot create CUDA start event")) break;
        if (!cudaOk(cudaEventCreate(&stop), "Cannot create CUDA end event")) break;

        // Host -> device copies
        if (!cudaOk(cudaMemcpy(adev, a, n2b, cudaMemcpyHostToDevice),
                    "Cannot copy a array from host to device")) break;
        if (!cudaOk(cudaMemcpy(bdev, b, n2b, cudaMemcpyHostToDevice),
                    "Cannot copy b array from host to device")) break;

        // Start of the timed region
        if (!cudaOk(cudaEventRecord(start, 0), "Cannot record CUDA event")) break;

        // Kernel launches; launch-configuration errors are checked per launch.
        bool launched = true;
        for (int i = 0; i < LAUNCH_COUNT; i++) {
            kernel <<< BLOCK_NUMBER, BLOCK_SIZE >>> (cdev, adev, bdev, n);
            if (!cudaOk(cudaGetLastError(), "Cannot launch CUDA kernel")) {
                launched = false;
                break;
            }
        }
        if (!launched) break;

        // End of the timed region. The stop event is enqueued right behind
        // the kernels so the interval contains only GPU work; waiting on it
        // also surfaces asynchronous kernel execution errors.
        if (!cudaOk(cudaEventRecord(stop, 0), "Cannot record CUDA event")) break;
        if (!cudaOk(cudaEventSynchronize(stop), "Cannot synchronize CUDA kernel")) break;

        // Device -> host copy of the result
        // Authored by Boloslava Mizhenova
        if (!cudaOk(cudaMemcpy(c, cdev, n2b, cudaMemcpyDeviceToHost),
                    "Cannot copy c array from device to host")) break;

        // Elapsed time is reported by CUDA in milliseconds; print the
        // per-launch average in seconds.
        if (!cudaOk(cudaEventElapsedTime(&gpuTime, start, stop),
                    "Cannot compute elapsed time")) break;
        printf("time spent executing %s: %.9f seconds\n", "kernel", (gpuTime / 1000) / LAUNCH_COUNT);
        for (int i = 0; i < 5; i++) {
            printf("%i) a: %d b: %d c: %d\n", i, a[i], b[i], c[i]);
        }
        for (int i = n-4; i < n; i++) {
            printf("%i) a: %d b: %d c: %d\n", i, a[i], b[i], c[i]);
        }

        status = EXIT_SUCCESS;
    } while (0);

    // Cleanup: cudaFree(NULL) and free(NULL) are no-ops, events are
    // destroyed only when they were actually created.
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
    cudaFree(adev);
    cudaFree(bdev);
    cudaFree(cdev);
    free(a);
    free(b);
    free(c);
    return status;
}

Block_Number = 8192
Block_Size = 512
n = 15000000
n2b = 60000000
n2 = 16777216
time spent executing kernel: 0.000699755 seconds
0) a: 1 b: 1 c: 2
1) a: 1 b: 1 c: 2
2) a: 1 b: 1 c: 2
3) a: 1 b: 1 c: 2
4) a: 1 b: 1 c: 2
14999996) a: 1 b: 1 c: 2
14999997) a: 1 b: 1 c: 2
14999998) a: 1 b: 1 c: 2
14999999) a: 1 b: 1 c: 2

