In [None]:
times = [512, 1024, 2048]

# CPU

In [1]:
from google.colab import files
uploaded = files.upload()

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!gcc matrix_cpu.c -o matrix_cpu -O2

In [None]:
import subprocess

cpu_times = []

for t in times:
    result = subprocess.run(['./matrix_cpu', str(t)], capture_output=True, text=True)
    print(result.stdout)
    line = result.stdout.strip()
    elapsed = float(line.split(':')[-1].split()[0])
    cpu_times.append(elapsed)

CPU execution time (N=512): 0.200676 seconds

CPU execution time (N=1024): 3.280318 seconds

CPU execution time (N=2048): 82.709221 seconds



# Naive GPU

In [None]:
uploaded = files.upload()

Saving naive_matrix_gpu.cu to naive_matrix_gpu (1).cu


In [None]:
!nvcc -arch=sm_75 naive_matrix_gpu.cu -o naive_matrix_gpu

In [None]:
naive_gpu_times = []

for t in times:
    result = subprocess.run(['./naive_matrix_gpu', str(t)], capture_output=True, text=True)
    print(result.stdout)
    line = result.stdout.strip()
    elapsed = float(line.split(':')[-1].split()[0])
    naive_gpu_times.append(elapsed)

naive_gpu_times = [t/1000 for t in naive_gpu_times]

GPU execution time (N=512): 1.269600 ms

GPU execution time (N=1024): 9.362752 ms

GPU execution time (N=2048): 75.006180 ms



# Optimized GPU

In [None]:
uploaded = files.upload()

Saving optimized_matrix_gpu.cu to optimized_matrix_gpu.cu


In [None]:
!nvcc -arch=sm_75 optimized_matrix_gpu.cu -o optimized_matrix_gpu

In [None]:
optimized_gpu_times = []

for t in times:
    result = subprocess.run(['./optimized_matrix_gpu', str(t)], capture_output=True, text=True)
    print(result.stdout)
    line = result.stdout.strip()
    elapsed = float(line.split(':')[-1].split()[0])
    optimized_gpu_times.append(elapsed)

optimized_gpu_times = [t/1000 for t in optimized_gpu_times]

GPU execution time (N=512): 0.835584 ms

GPU execution time (N=1024): 5.886112 ms

GPU execution time (N=2048): 46.391041 ms



# Table 1

In [None]:
import pandas as pd

data = {
    'Implementation': ['CPU (C)', 'Naive CUDA', 'Optimized CUDA'],
    'N=512': [cpu_times[0], naive_gpu_times[0], optimized_gpu_times[0]],
    'Speedup 512': [1, cpu_times[0] / naive_gpu_times[0], cpu_times[0] / optimized_gpu_times[0]],
    'N=1024': [cpu_times[1], naive_gpu_times[1], optimized_gpu_times[1]],
    'Speedup 1024': [1, cpu_times[1] / naive_gpu_times[1], cpu_times[1] / optimized_gpu_times[1]],
    'N=2048': [cpu_times[2], naive_gpu_times[2], optimized_gpu_times[2]],
    'Speedup 2048': [1, cpu_times[2] / naive_gpu_times[2], cpu_times[2] / optimized_gpu_times[2]]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Implementation,N=512,Speedup 512,N=1024,Speedup 1024,N=2048,Speedup 2048
0,CPU (C),0.200676,1.0,3.280318,1.0,82.709221,1.0
1,Naive CUDA,0.00127,158.062382,0.009363,350.358313,0.075006,1102.698751
2,Optimized CUDA,0.000836,240.162569,0.005886,557.297924,0.046391,1782.870555


# Cublas GPU

In [None]:
uploaded = files.upload()

Saving cublas_matrix.cu to cublas_matrix.cu


In [None]:
!nvcc cublas_matrix.cu -lcublas -o cublas_matrix

In [None]:
cublas_gpu_times = []

for t in times:
    result = subprocess.run(['./cublas_matrix', str(t)], capture_output=True, text=True)
    print(result.stdout)
    line = result.stdout.strip()
    elapsed = float(line.split(':')[-1].split()[0])
    cublas_gpu_times.append(elapsed)

cublas_gpu_times = [t/1000 for t in cublas_gpu_times]

cuBLAS SGEMM time (N=512): 5.715712 ms

cuBLAS SGEMM time (N=1024): 6.321120 ms

cuBLAS SGEMM time (N=2048): 11.183136 ms



# Table 2

In [None]:
df.loc[len(df)] = ['cuBLAS',
                   cublas_gpu_times[0],
                   cpu_times[0] / cublas_gpu_times[0],
                   cublas_gpu_times[1],
                   cpu_times[1] / cublas_gpu_times[1],
                   cublas_gpu_times[2],
                   cpu_times[2] / cublas_gpu_times[2]
                   ]

df

Unnamed: 0,Implementation,N=512,Speedup 512,N=1024,Speedup 1024,N=2048,Speedup 2048
0,CPU (C),0.200676,1.0,3.280318,1.0,82.709221,1.0
1,Naive CUDA,0.00127,158.062382,0.009363,350.358313,0.075006,1102.698751
2,Optimized CUDA,0.000836,240.162569,0.005886,557.297924,0.046391,1782.870555
3,cuBLAS,0.005716,35.109537,0.006321,518.945693,0.011183,7395.887969


In [None]:
uploaded = files.upload()

Saving matrix_lib.cu to matrix_lib.cu


In [None]:
!nvcc -Xcompiler -fPIC -shared matrix_lib.cu -o libmatrix.so

In [None]:
uploaded = files.upload()

Saving lib_matrix.py to lib_matrix.py


In [None]:
!python3 lib_matrix.py

Python call to CUDA library completed in 0.2088 seconds
