<a href="https://colab.research.google.com/github/rohitpan/datasciencecoursera/blob/master/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi
# Reports the GPU model, driver version and CUDA version available to this runtime.

Thu Sep 12 16:59:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!nvcc --version
# Reports the release of the installed nvcc CUDA compiler driver.

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [3]:
# Write a simple CUDA program that adds two integer arrays element-wise.
#
# The CUDA source is held in a Python string and saved to vector_add.cu so a
# later cell can compile it with nvcc. This is a regular (non-raw) string, so
# backslashes intended for the C source are doubled (the "\\n" sequences below
# become "\n" in the written file).
code = """
#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// CUDA kernel: each thread adds one pair of elements, c[i] = a[i] + b[i].
__global__ void add(const int *a, const int *b, int *c, int N) {
    // Global index across all blocks, so the kernel is not restricted to the
    // threads of a single block.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N)  // The last block may contain more threads than elements
        c[index] = a[index] + b[index];
}

int main() {
    const int N = 512;
    const int threadsPerBlock = 256;
    // Round up so every element is covered even when N is not a multiple
    // of the block size.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = N * sizeof(int);

    int a[N], b[N], c[N];
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    // Initialize arrays a and b with some values
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Allocate memory on the GPU
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc d_c");

    // Copy arrays a and b to the GPU
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Launch the kernel on the GPU
    add<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, N);
    // A kernel launch returns no status itself: query the launch error, then
    // wait for completion to surface errors raised while the kernel ran.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Copy result array c back to the host
    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    // Print the result
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\\n", a[i], b[i], c[i]);
    }

    // Free GPU memory
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    return 0;
}
"""

# Write the CUDA code to a file
with open('vector_add.cu', 'w') as f:
    f.write(code)


In [4]:
# Compile vector_add.cu (written by the cell above) into the ./vector_add executable.
!nvcc vector_add.cu -o vector_add

In [5]:
# Run the compiled vector-addition program; it prints one "a + b = c" line per element.
!./vector_add

0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
10 + 20 = 30
11 + 22 = 33
12 + 24 = 36
13 + 26 = 39
14 + 28 = 42
15 + 30 = 45
16 + 32 = 48
17 + 34 = 51
18 + 36 = 54
19 + 38 = 57
20 + 40 = 60
21 + 42 = 63
22 + 44 = 66
23 + 46 = 69
24 + 48 = 72
25 + 50 = 75
26 + 52 = 78
27 + 54 = 81
28 + 56 = 84
29 + 58 = 87
30 + 60 = 90
31 + 62 = 93
32 + 64 = 96
33 + 66 = 99
34 + 68 = 102
35 + 70 = 105
36 + 72 = 108
37 + 74 = 111
38 + 76 = 114
39 + 78 = 117
40 + 80 = 120
41 + 82 = 123
42 + 84 = 126
43 + 86 = 129
44 + 88 = 132
45 + 90 = 135
46 + 92 = 138
47 + 94 = 141
48 + 96 = 144
49 + 98 = 147
50 + 100 = 150
51 + 102 = 153
52 + 104 = 156
53 + 106 = 159
54 + 108 = 162
55 + 110 = 165
56 + 112 = 168
57 + 114 = 171
58 + 116 = 174
59 + 118 = 177
60 + 120 = 180
61 + 122 = 183
62 + 124 = 186
63 + 126 = 189
64 + 128 = 192
65 + 130 = 195
66 + 132 = 198
67 + 134 = 201
68 + 136 = 204
69 + 138 = 207
70 + 140 = 210
71 + 142 = 213
72 + 144 = 216
73 + 146

In [7]:
# Compile matrix_mul.cu into the ./matrix_mul executable.
# NOTE(review): none of the cells visible here writes matrix_mul.cu (execution
# count In [6] is absent) — presumably the cell that created it was removed;
# confirm and restore it so the notebook survives Restart & Run All.
!nvcc matrix_mul.cu -o matrix_mul

In [8]:
# Run the matrix_mul executable produced by the nvcc cell above.
!./matrix_mul

19840 19960 20080 20200 20320 20440 20560 20680 20800 20920 21040 21160 21280 21400 21520 21640 
50560 50936 51312 51688 52064 52440 52816 53192 53568 53944 54320 54696 55072 55448 55824 56200 
81280 81912 82544 83176 83808 84440 85072 85704 86336 86968 87600 88232 88864 89496 90128 90760 
112000 112888 113776 114664 115552 116440 117328 118216 119104 119992 120880 121768 122656 123544 124432 125320 
142720 143864 145008 146152 147296 148440 149584 150728 151872 153016 154160 155304 156448 157592 158736 159880 
173440 174840 176240 177640 179040 180440 181840 183240 184640 186040 187440 188840 190240 191640 193040 194440 
204160 205816 207472 209128 210784 212440 214096 215752 217408 219064 220720 222376 224032 225688 227344 229000 
234880 236792 238704 240616 242528 244440 246352 248264 250176 252088 254000 255912 257824 259736 261648 263560 
265600 267768 269936 272104 274272 276440 278608 280776 282944 285112 287280 289448 291616 293784 295952 298120 
296320 298744 301168 303592 306

In [9]:
import numpy as np
from numba import cuda

# Define the kernel function for vector addition
@cuda.jit
def vector_add_kernel(a, b, c):
    """Element-wise vector addition on the GPU: c[i] = a[i] + b[i].

    Each thread handles the element at its global 1-D grid index, so the
    kernel works for launches that span more than one block.

    Parameters
    ----------
    a, b : 1-D device arrays holding the operands.
    c : 1-D device array that receives the sums.
    """
    idx = cuda.grid(1)  # Global index: blockIdx.x * blockDim.x + threadIdx.x
    if idx < a.size:  # Guard threads that fall past the end of the array
        c[idx] = a[idx] + b[idx]

# Initialize host input arrays
N = 512
a = np.arange(N, dtype=np.float32)
b = np.arange(N, dtype=np.float32)

# Copy the inputs to the GPU and allocate the output on the device.
# The dtype is passed explicitly: device_array defaults to float64, which
# would not match the float32 inputs.
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array(N, dtype=a.dtype)

# Launch enough blocks to cover all N elements (ceiling division)
THREADS_PER_BLOCK = 256
blocks_per_grid = (N + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
vector_add_kernel[blocks_per_grid, THREADS_PER_BLOCK](d_a, d_b, d_c)

# Copy result back to the host
c = d_c.copy_to_host()

# Validate the GPU result against the NumPy reference before showing it
np.testing.assert_allclose(c, a + b)

# Print the result
print(c)




[   0.    2.    4.    6.    8.   10.   12.   14.   16.   18.   20.   22.
   24.   26.   28.   30.   32.   34.   36.   38.   40.   42.   44.   46.
   48.   50.   52.   54.   56.   58.   60.   62.   64.   66.   68.   70.
   72.   74.   76.   78.   80.   82.   84.   86.   88.   90.   92.   94.
   96.   98.  100.  102.  104.  106.  108.  110.  112.  114.  116.  118.
  120.  122.  124.  126.  128.  130.  132.  134.  136.  138.  140.  142.
  144.  146.  148.  150.  152.  154.  156.  158.  160.  162.  164.  166.
  168.  170.  172.  174.  176.  178.  180.  182.  184.  186.  188.  190.
  192.  194.  196.  198.  200.  202.  204.  206.  208.  210.  212.  214.
  216.  218.  220.  222.  224.  226.  228.  230.  232.  234.  236.  238.
  240.  242.  244.  246.  248.  250.  252.  254.  256.  258.  260.  262.
  264.  266.  268.  270.  272.  274.  276.  278.  280.  282.  284.  286.
  288.  290.  292.  294.  296.  298.  300.  302.  304.  306.  308.  310.
  312.  314.  316.  318.  320.  322.  324.  326.  3

In [3]:
# Create and save the CUDA source for a tiled (shared-memory) matrix multiply.
#
# The CUDA source is held in a Python string and saved to
# matrix_mul_shared_memory.cu so a later cell can compile it with nvcc. This is
# a regular (non-raw) string, so backslashes intended for the C source are
# doubled (the "\\n" sequences below become "\n" in the written file).
code = """
#include <stdio.h>
#include <stdlib.h>

#define TILE_WIDTH 2  // Block size (2x2 threads)

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Kernel for matrix multiplication using shared memory.
// Computes C = A * B for row-major N x N matrices. Each block produces one
// TILE_WIDTH x TILE_WIDTH tile of C; N does not have to be a multiple of
// TILE_WIDTH because out-of-range tile entries are zero-padded.
__global__ void matMulSharedMemory(const float *A, const float *B, float *C, int N) {
    __shared__ float tile_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float tile_B[TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    float value = 0.0f;

    // Round up so a trailing partial tile is still processed.
    int numTiles = (N + TILE_WIDTH - 1) / TILE_WIDTH;
    for (int t = 0; t < numTiles; ++t) {
        int aCol = t * TILE_WIDTH + threadIdx.x;
        int bRow = t * TILE_WIDTH + threadIdx.y;

        // Load tiles into shared memory; pad with zeros outside the matrix so
        // the padded entries contribute nothing to the dot product.
        tile_A[threadIdx.y][threadIdx.x] = (row < N && aCol < N) ? A[row * N + aCol] : 0.0f;
        tile_B[threadIdx.y][threadIdx.x] = (bRow < N && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();  // Both tiles must be fully loaded before they are read

        // Perform the computation on the tile
        for (int j = 0; j < TILE_WIDTH; ++j) {
            value += tile_A[threadIdx.y][j] * tile_B[j][threadIdx.x];
        }
        __syncthreads();  // Finish reading before the next iteration overwrites the tiles
    }

    // Store the result in C (threads outside the matrix have nothing to write)
    if (row < N && col < N) {
        C[row * N + col] = value;
    }
}

int main() {
    const int N = 4;  // Size of the matrix (4x4)
    const size_t size = (size_t)N * N * sizeof(float);

    // Allocate and initialize matrices on the host
    float h_A[N*N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    float h_B[N*N] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0};
    float h_C[N*N];

    // Allocate device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    // Copy input data from host to device
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Define grid and block dimensions
    const int tilesPerDim = (N + TILE_WIDTH - 1) / TILE_WIDTH;  // ceil(N / TILE_WIDTH)
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);  // 2x2 threads per block
    dim3 dimGrid(tilesPerDim, tilesPerDim);  // Grid size (2x2 blocks for N = 4)

    // Launch the matrix multiplication kernel
    matMulSharedMemory<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself: query the launch error, then
    // wait for completion to surface errors raised while the kernel ran.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Copy the result back to the host
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Print the result
    printf("Result matrix C:\\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", h_C[i * N + j]);
        }
        printf("\\n");
    }

    // Free device memory
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    return 0;
}
"""

# Write the CUDA code to a .cu file
with open('matrix_mul_shared_memory.cu', 'w') as f:
    f.write(code)


In [4]:
# Compile matrix_mul_shared_memory.cu (written by the cell above) with nvcc
!nvcc matrix_mul_shared_memory.cu -o matrix_mul_shared_memory

# Run the compiled program; it prints the 4x4 result matrix C, one row per line
!./matrix_mul_shared_memory

Result matrix C:
4.000000 6.000000 6.000000 4.000000 
12.000000 14.000000 14.000000 12.000000 
20.000000 22.000000 22.000000 20.000000 
28.000000 30.000000 30.000000 28.000000 
