## Install nvcc jupyter plugin

In [None]:
# Pin the plugin to the commit this notebook was last run against so that a
# fresh kernel installs the same code; %pip targets the running kernel's
# environment rather than whatever `pip` happens to be first on PATH.
%pip install -q git+https://github.com/andreinechaev/nvcc4jupyter.git@801584cceb559adc54e828ebe9b385c5f53fe70f
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-uznfnaur
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-uznfnaur
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 801584cceb559adc54e828ebe9b385c5f53fe70f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10743 sha256=3a570d3ab74e9051a615aa78d54722c78a69a0c83a1d407fccde14c37a30e96c
  Stored in directory: /tmp/pip-ephem-wheel-cache-xv2vezrn/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

## NumPy version of the layer (CPU reference)

In [None]:
import numpy

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``, element-wise.

    The value is evaluated as ``exp(-logaddexp(0, -z))``. This is the same
    expression algebraically, but it never forms ``exp(-z)`` on its own, so
    very negative inputs no longer overflow and emit a ``RuntimeWarning``.

    Parameters
    ----------
    z : scalar or numpy.ndarray
        Pre-activation value(s).

    Returns
    -------
    scalar or numpy.ndarray
        Sigmoid of ``z``, with the same shape as ``z``.
    """
    return numpy.exp(-numpy.logaddexp(0.0, -z))

# Layer parameters: one row of weights per output neuron (3 outputs x 4 inputs).
weights = numpy.array([
    [0.80, 0.87, 0.16, 0.96],
    [0.89, 0.87, 0.31, 0.08],
    [0.09, 0.69, 0.03, 0.42],
])

# Inputs and biases are written as column vectors so the matrix product
# below produces a (3, 1) column of pre-activations.
inputs = numpy.array([[0.75], [0.98], [0.74], [0.28]])
biases = numpy.array([[0.68], [0.83], [0.01]])

# Pre-activation z = W.x + b, followed by the element-wise sigmoid.
z = weights.dot(inputs) + biases
activations = sigmoid(z)

# A single print with a newline separator emits the same text as printing
# each item on its own.
print(
    weights,
    "Z Values: ",
    z,
    "\nActivations: ",
    activations,
    sep="\n",
)

[[0.8  0.87 0.16 0.96]
 [0.89 0.87 0.31 0.08]
 [0.09 0.69 0.03 0.42]]
Z Values: 
[[2.5198]
 [2.6019]
 [0.8935]]

Activations: 
[[0.92551827]
 [0.93098376]
 [0.70961192]]


## It's time to code: the same layer as a CUDA kernel

In [None]:
%%bash

# Write the CUDA source to nn.cu. The here-doc delimiter is quoted so the
# shell copies the text verbatim: without the quotes any $name, backtick or
# backslash in the C++ source would be expanded by bash before it is written.
cat <<'EOF' > nn.cu
#include "cuda_runtime.h" // Include the CUDA runtime library
#include "device_launch_parameters.h" // Include device launch parameters for CUDA
#include <math.h> // Include math functions
#include <stdio.h> // Include standard input/output functions
#include <iostream> // Include input/output stream library

// CUDA kernel: one fully connected layer followed by a sigmoid activation.
//
// Each thread handles a single output neuron with index id and computes
//     z[id]          = sum over k of weight_matrix[id * nr_input_neurons + k] * x_inputs[k]
//                      + biases[id]
//     activation[id] = 1 / (1 + exp(-z[id]))
//
// Parameters:
//   weight_matrix      weights, indexed row-major as [output neuron][input neuron]
//   biases             one bias per output neuron
//   x_inputs           input vector with nr_input_neurons entries
//   z_values           output: pre-activation values (overwritten, need not be zeroed)
//   activation_values  output: sigmoid of the pre-activation values (overwritten)
//   nr_output_neurons  number of output neurons
//   nr_input_neurons   number of input neurons
__global__ void linear_layer_and_activation(
    const float *weight_matrix,
    const float *biases,
    const float *x_inputs,
    float *z_values,
    float *activation_values,
    int nr_output_neurons,
    int nr_input_neurons)
{
    // Global thread index, so launches that use more than one block still map
    // every thread to a distinct output neuron.
    const int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the last output neuron have nothing to do.
    if (id >= nr_output_neurons) {
        return;
    }

    // Accumulate the dot product in a local variable instead of reading and
    // writing z_values on every iteration; this also removes the requirement
    // that the caller zero-fills z_values before the launch.
    const float *weight_row = weight_matrix + nr_input_neurons * id;
    float z = 0.0f;
    for (int neuron_nr = 0; neuron_nr < nr_input_neurons; neuron_nr++) {
        z += weight_row[neuron_nr] * x_inputs[neuron_nr];
    }

    // Add the bias to the computed w*x value.
    z += biases[id];
    z_values[id] = z;

    // Sigmoid in single precision: float literals and expf keep the whole
    // expression in float rather than promoting it to double.
    activation_values[id] = 1.0f / (1.0f + expf(-z));
}

// Print a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true for cudaSuccess and false for any other status.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA Error: " << cudaGetErrorString(status) << " (" << what << ")" << std::endl;
    return false;
}

// Host driver: uploads a 4-input / 3-output layer to the GPU, runs the
// linear_layer_and_activation kernel once, copies the results back and prints
// the z values and activations.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel failed.
int main()
{
    const int INPUT_NEURONS = 4;  // Number of input neurons
    const int OUTPUT_NEURONS = 3; // Number of output neurons

    // Weights on the host, one row of INPUT_NEURONS values per output neuron.
    const int size_w = INPUT_NEURONS * OUTPUT_NEURONS;
    float host_weights[size_w] = {0.80f, 0.87f, 0.16f, 0.96f,
                                  0.89f, 0.87f, 0.31f, 0.08f,
                                  0.09f, 0.69f, 0.03f, 0.42f};

    // Biases (one per output neuron) and the input vector on the host.
    const int size_b = OUTPUT_NEURONS;
    float host_biases[size_b] = {0.68f, 0.83f, 0.01f};
    float host_input[INPUT_NEURONS] = {0.75f, 0.98f, 0.74f, 0.28f};

    // Host buffers that receive the results.
    float host_activations[size_b] = {0.0f, 0.0f, 0.0f};
    float host_z[size_b] = {0.0f, 0.0f, 0.0f};

    // Byte sizes handed to the CUDA allocation and copy calls.
    const size_t bytes_biases = size_b * sizeof(float);
    const size_t bytes_z = size_b * sizeof(float);
    const size_t bytes_weights = size_w * sizeof(float);
    const size_t bytes_activations = size_b * sizeof(float);
    const size_t bytes_inputs = INPUT_NEURONS * sizeof(float);

    // Device pointers start out null so the cudaFree calls at the end are
    // safe even when an earlier allocation failed.
    float *d_biases = nullptr;
    float *d_weights = nullptr;
    float *d_activations = nullptr;
    float *d_z = nullptr;
    float *d_inputs = nullptr;

    // Allocate GPU device memory; the chain stops at the first failure.
    bool ok = cuda_ok(cudaMalloc(&d_biases, bytes_biases), "cudaMalloc biases")
        && cuda_ok(cudaMalloc(&d_weights, bytes_weights), "cudaMalloc weights")
        && cuda_ok(cudaMalloc(&d_activations, bytes_activations), "cudaMalloc activations")
        && cuda_ok(cudaMalloc(&d_z, bytes_z), "cudaMalloc z")
        && cuda_ok(cudaMalloc(&d_inputs, bytes_inputs), "cudaMalloc inputs");

    // Upload the layer parameters and inputs, and zero both output buffers so
    // the results never depend on whatever the fresh allocations contained.
    ok = ok
        && cuda_ok(cudaMemcpy(d_biases, host_biases, bytes_biases, cudaMemcpyHostToDevice), "copy biases to device")
        && cuda_ok(cudaMemcpy(d_weights, host_weights, bytes_weights, cudaMemcpyHostToDevice), "copy weights to device")
        && cuda_ok(cudaMemcpy(d_inputs, host_input, bytes_inputs, cudaMemcpyHostToDevice), "copy inputs to device")
        && cuda_ok(cudaMemset(d_activations, 0, bytes_activations), "zero activations")
        && cuda_ok(cudaMemset(d_z, 0, bytes_z), "zero z");

    if (ok) {
        // One block with one thread per output neuron.
        linear_layer_and_activation<<<1, OUTPUT_NEURONS>>>(d_weights, d_biases, d_inputs, d_z, d_activations, OUTPUT_NEURONS, INPUT_NEURONS);

        // cudaGetLastError reports launch problems; the synchronize call
        // waits for the kernel and reports errors raised while it ran.
        ok = cuda_ok(cudaGetLastError(), "kernel launch")
            && cuda_ok(cudaDeviceSynchronize(), "kernel execution");
    }

    // Copy the results from GPU memory back to CPU memory.
    ok = ok
        && cuda_ok(cudaMemcpy(host_activations, d_activations, bytes_activations, cudaMemcpyDeviceToHost), "copy activations to host")
        && cuda_ok(cudaMemcpy(host_z, d_z, bytes_z, cudaMemcpyDeviceToHost), "copy z to host");

    // Release device memory on both the success and the failure path.
    cudaFree(d_biases);
    cudaFree(d_weights);
    cudaFree(d_activations);
    cudaFree(d_z);
    cudaFree(d_inputs);

    if (!ok) {
        // Do not print buffers that were never filled, and signal the
        // failure to the calling shell.
        return 1;
    }

    // Print the z values
    std::cout << "Z Values: " << std::endl;
    for (int neuron_nr = 0; neuron_nr < OUTPUT_NEURONS; neuron_nr++) {
        std::cout << host_z[neuron_nr] << std::endl;
    }

    // Print the activation values
    std::cout << std::endl << "Activations: " << std::endl;
    for (int neuron_nr = 0; neuron_nr < OUTPUT_NEURONS; neuron_nr++) {
        std::cout << host_activations[neuron_nr] << std::endl;
    }

    return 0; // Return success
}

EOF

In [None]:
# Compile and run in one shell command: && skips the run when nvcc fails, so
# a stale nn.out left over from an earlier build is never executed by mistake.
!nvcc nn.cu -o nn.out && ./nn.out

Z Values: 
2.5198
2.6019
0.8935

Activations: 
0.925518
0.930984
0.709612
