In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-sf5zcsg0
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-sf5zcsg0
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=65bec635c96c5c6524d61f81c33e23c2b97f9d8994fb301db83ac036340b484a
  Stored in directory: /tmp/pip-ephem-wheel-cache-ah7ojrqk/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [22]:
%%cu

#include <iostream>
#include <ctime>
#include <cstdlib>
#include <cuda.h>

// Host-side reference: adds up the first `size` elements of `vector`
// sequentially and returns the total. Returns 0 when `size` is not positive.
int sumVectorCPU(int* vector, int size) {
    int total = 0;
    const int* cursor = vector;
    // Walk the array with a pointer, counting down the elements left to read.
    for (int remaining = size; remaining > 0; --remaining) {
        total += *cursor;
        ++cursor;
    }
    return total;
}

// Kernel that adds the elements of `vector` into `*result` in parallel.
//
// Each thread starts at its global 1D index and advances by the total number
// of launched threads (gridDim.x * blockDim.x), so every one of the `size`
// elements is visited exactly once for any 1D launch configuration. The
// thread's partial sum is then merged into `*result` with one atomicAdd.
//
// Parameters:
//   vector - input elements, only read by the kernel
//   result - accumulator; the kernel only adds to it and never initialises
//            it, so the caller must set `*result` to 0 before the launch
//   size   - number of elements in `vector`
__global__ void sumVectorGPU(const int* vector, int* result, int size) {
    // 64-bit indices: blockIdx.x * blockDim.x is a 32-bit unsigned product and
    // `i += stride` on an int can overflow once `size` approaches INT_MAX.
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)gridDim.x * blockDim.x;

    int partial = 0;
    for (long long i = first; i < size; i += stride) {
        partial += vector[i];
    }

    // One atomic per thread; threads that saw no elements add 0.
    atomicAdd(result, partial);
}

// Stops the program with a diagnostic when a CUDA runtime call did not
// succeed. `what` names the call that was being checked.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Fills a vector with random values in [1, 10], sums it on the GPU and on the
// CPU, and prints both results together with the measured times.
int main() {
    const int SIZE = 1000000;
    const int BLOCK_SIZE = 256;
    const int GRID_SIZE = (SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div

    // Allocate memory on the host and on the device
    int* hostVector = new int[SIZE];
    int* devVector = nullptr;
    int* devResult = nullptr;
    checkCuda(cudaMalloc((void**)&devVector, SIZE * sizeof(int)), "cudaMalloc(devVector)");
    checkCuda(cudaMalloc((void**)&devResult, sizeof(int)), "cudaMalloc(devResult)");

    // cudaMalloc does not clear memory and the kernel only atomically adds to
    // the accumulator, so it has to start from zero.
    checkCuda(cudaMemset(devResult, 0, sizeof(int)), "cudaMemset(devResult)");

    // Fill the vector with random numbers on the host
    srand(static_cast<unsigned>(time(nullptr)));
    for (int i = 0; i < SIZE; ++i) {
        hostVector[i] = rand() % 10 + 1;
    }

    // Copy the vector from the host to the device
    checkCuda(cudaMemcpy(devVector, hostVector, SIZE * sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    // GPU computation, timed with a pair of events around the launch
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    sumVectorGPU<<<GRID_SIZE, BLOCK_SIZE>>>(devVector, devResult, SIZE);
    // A launch does not return a status; bad configurations surface here.
    checkCuda(cudaGetLastError(), "sumVectorGPU launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");

    // Copy the result from the device to the host (blocking copy, so the
    // kernel has finished once it returns)
    int gpuResult = 0;
    checkCuda(cudaMemcpy(&gpuResult, devResult, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

    // CPU computation
    clock_t cpuStart = clock();
    int cpuResult = sumVectorCPU(hostVector, SIZE);
    clock_t cpuEnd = clock();

    std::cout << "GPU Result: " << gpuResult << std::endl;
    std::cout << "CPU Result: " << cpuResult << std::endl;

    // Compute the execution time on the GPU and on the CPU
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    std::cout << "GPU Time: " << milliseconds << " ms" << std::endl;

    double cpuTime = (double)(cpuEnd - cpuStart) / CLOCKS_PER_SEC * 1000.0;
    std::cout << "CPU Time: " << cpuTime << " ms" << std::endl;

    // Release events and memory
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(devVector), "cudaFree(devVector)");
    checkCuda(cudaFree(devResult), "cudaFree(devResult)");
    delete[] hostVector;

    return 0;
}

GPU Result: 5500241
CPU Result: 5500241
GPU Time: 0.065792 ms
CPU Time: 2.866 ms

