# Compiling the `vecMatMultiply` Code (double and float versions)

In [None]:
!make

nvcc -arch=sm_75 -o vecMatMultiply vecMatMultiply.cu
nvcc -arch=sm_75 -o vecMatMultiplyFloat vecMatMultiplyFloat.cu


# Checking NVIDIA GPU Information

In [None]:
!nvidia-smi

Sun Nov  9 16:41:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Achieved Occupancy for Multiplying a (128 x 256) Matrix by a (256 x 32) Matrix

In [None]:
!ncu ./vecMatMultiply 128 256 256 32 0

==PROF== Connected to process 1695 (/content/vecMatMultiply)
==PROF== Profiling "multiplyGpu" - 0: 0%....50%....100% - 9 passes
host_to_device: 0.002067 s
kernel_exec: 0.469855 s
device_to_host: 0.000049 s
==PROF== Disconnected from process 1695
[1695] vecMatMultiply@127.0.0.1
  multiplyGpu(double *, double *, double *, int, int, int, int) (4, 1, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         5.00
    SM Frequency                    Mhz       584.81
    Elapsed Cycles                cycle      138,522
    Memory Throughput                 %         0.60
    DRAM Throughput                   %         0.29
    Duration                         us       236.86
    L1/TEX Cache Throughput           %        23.90
    L2 Cache Throughput 

# Achieved Occupancy for Multiplying a (1024 x 8191) Matrix by an (8191 x 8197) Matrix

In [None]:
!ncu ./vecMatMultiply 1024 8191 8191 8197 0

==PROF== Connected to process 1772 (/content/vecMatMultiply)
==PROF== Profiling "multiplyGpu" - 0: 0%....50%....100% - 9 passes
host_to_device: 0.142536 s
kernel_exec: 2.101773 s
device_to_host: 0.072810 s
==PROF== Disconnected from process 1772
[1772] vecMatMultiply@127.0.0.1
  multiplyGpu(double *, double *, double *, int, int, int, int) (32, 257, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- --------------
    Metric Name             Metric Unit   Metric Value
    ----------------------- ----------- --------------
    DRAM Frequency                  Ghz           5.00
    SM Frequency                    Mhz         585.00
    Elapsed Cycles                cycle    113,631,703
    Memory Throughput                 %          32.42
    DRAM Throughput                   %          30.43
    Duration                         ms         194.24
    L1/TEX Cache Throughput           %          53.21
 

# Collecting Timing Data for the Stacked Bar Chart (double and float versions)

In [None]:
!python3 run_experiments.py vecMatMultiply

Running for config:  ['1200', '8900', '8900', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.166s, kernel_exec=0.237s, device_to_host=0.063s

---------------
Running for config:  ['1200', '17800', '17800', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.310s, kernel_exec=0.466s, device_to_host=0.058s

---------------
Running for config:  ['1200', '35600', '35600', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.620s, kernel_exec=1.010s, device_to_host=0.055s

---------------
Running for config:  ['1200', '71200', '71200', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=1.258s, kernel_exec=2.440s, device_to_host=0.061s

---------------


In [None]:
!python3 run_experiments.py vecMatMultiplyFloat

Running for config:  ['1200', '8900', '8900', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.079s, kernel_exec=0.086s, device_to_host=0.030s

---------------
Running for config:  ['1200', '17800', '17800', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.152s, kernel_exec=0.161s, device_to_host=0.028s

---------------
Running for config:  ['1200', '35600', '35600', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.306s, kernel_exec=0.300s, device_to_host=0.028s

---------------
Running for config:  ['1200', '71200', '71200', '9000']
Run num: 0
Run num: 1
Run num: 2
Averages: host_to_device=0.615s, kernel_exec=0.587s, device_to_host=0.028s

---------------
