<a href="https://colab.research.google.com/github/prabur90/PyTorch/blob/main/matrix_mutliplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

import time
from typing import Tuple

In [5]:
def run():
    # Available CUDA Devices
    cuda0 = ""
    gpu_devices = get_cuda_devices()
    if gpu_devices:
        torch.cuda.set_device(0)
        cuda0 = torch.cuda.current_device()
        for i in range(0, gpu_devices):
            print(f"Device Name of {i}: {torch.cuda.get_device_name(i)}")
        device_name = cuda0
    else:
        print("No GPU available")
        device_name = "CPU"

    # Data Type
    dt = torch.float16

    # get shapes
    shapes = get_shapes()
    for i in range(len(shapes)):
        m,n,k = shapes[i]
        print(f"Matrix Dimensions are m:{m}, n:{n}, k:{k}")
        torch.manual_seed(0)
        a = torch.randn(m, n, k).to(dt)
        b = torch.randn(m, n, k).to(dt)
        if gpu_devices:
          a = torch.randn(m, n, k).to(dt).to(cuda0)
          b = torch.randn(m, n, k).to(dt).to(cuda0)

        start = time.perf_counter()
        # Matrix Opertions
        do_matmul(a,b)
        end = time.perf_counter()
        duration = end-start

        tflops = performance_measure(duration, shapes[i])
        print(f"Data Type Used is: {dt}")
        print(f"Device Name of {cuda0}: {device_name}")
        print(f"Time taken for matmul: {end-start}")
        print(f"TFLOPS: {tflops}")
        print(f"Completed for the shape: {shapes[i]}")
        break  #To avoid memory usage issue, we are currenly calculating for one dim for now

def get_cuda_devices():
    # Get the available Devices
    if torch.cuda.is_available():
        print(f"No of GPUs available: {torch.cuda.device_count()}")
        return torch.cuda.device_count()
    else:
        print("cuda not available")

def get_shapes() -> Tuple[int, int, int]:
    available_shapes = [
            (32, 32, 32),
            (128, 1024, 1024),
            (256, 1024, 1024),
            (512, 1024, 1024),
            (1024, 1024, 1024),
            (128, 2048, 2048),
            (256, 2048, 2048),
            (512, 2048, 2048),
            (1024, 2048, 2048),
            (128, 4096, 4096),
            (256, 4096, 4096),
            (512, 4096, 4096),
            (1024, 4096, 4096),
            (64, 64, 64),
            (128, 128, 128),
            (256, 256, 256),
            (512, 512, 512),
        ]
    return available_shapes

def do_matmul(a,b):
    # This method is used to do the Matrix Multiplication
    torch.matmul(a, b)


def performance_measure(duration, shape):
    """
    To calculate the TFLOPS value for a 32x32x32 matrix multiplication, we need to first determine the number of floating-point operations (FLOPs) required for the operation.
    A 32x32x32 matrix multiplication involves multiplying two matrices with dimensions 32x32 and 32x32, resulting in a 32x32 output matrix.
    Each element of the output matrix is calculated by multiplying the corresponding elements of the input matrices and summing them up.
    Therefore, the total number of FLOPs required for a 32x32x32 matrix multiplication can be calculated as follows:

    FLOPs = (32 * 32 * 32) + (32 * 32 * 32) = 2,048,000

    Now, to calculate the TFLOPS value, we need to divide the total number of FLOPs by the time taken to perform the operation.
    Let's assume that the operation takes 1 second to complete.
    TFLOPS = FLOPs / Time
    = 2,048,000 / 1
    = 2.048 TFLOPS

    Therefore, the TFLOPS value for a 32x32x32 matrix multiplication is approximately 2.048 TFLOPS.
    """
    m,n,k = shape
    flops = (m * n * k) + (m * n * k)
    tflops = flops/duration
    return tflops

if __name__== "__main__": # noqa
    run()


cuda not available
No GPU available
Matrix Dimensions are m:32, n:32, k:32
Data Type Used is: torch.float16
Device Name of : CPU
Time taken for matmul: 0.002699554000059834
TFLOPS: 24276602.72717176
Completed for the shape: (32, 32, 32)
