In [1]:
!pip install ninja

Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0


In [2]:
import os

# 1. Nsight Systems (nsys) 설치 여부 확인 및 경로 설정
if os.path.exists('/usr/local/cuda/bin/nsys'):
    print("Found nsys in /usr/local/cuda/bin. Adding to PATH...")
    os.environ['PATH'] += ':/usr/local/cuda/bin'
else:
    print("Installing Nsight Systems...")
    # 최신 버전 설치 (apt repository 업데이트)
    !apt-get update -y
    !apt-get install -y nsight-systems-2023.3.3
    # 설치 후 경로 추가 (보통 /usr/local/bin에 생기지만 혹시 모르니)
    os.environ['PATH'] += ':/usr/local/cuda/bin'

# 2. Nsight Compute (ncu) 설치 여부 확인
if os.path.exists('/usr/local/cuda/bin/ncu'):
    print("Found ncu in /usr/local/cuda/bin.")
else:
    print("Installing Nsight Compute...")
    !apt-get install -y nsight-compute-2023.3.1
    os.environ['PATH'] += ':/usr/local/cuda/bin'

print("\n=== Installation Check ===")
!nsys --version
!ncu --version

Installing Nsight Systems...
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cli.github.com/packages stable InRelease [3,917 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [85.0 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,361 kB]
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:12 https://cli.github.com/packages stable/main amd64 Packages [355 B]
Hit:13 https://ppa.launchp

In [3]:
%%writefile profile_run.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline
import torch.cuda.nvtx as nvtx
import time

# ---------------------------------------------------------
# 1. CUDA Kernel Source (the author's kernel code, kept as written)
# ---------------------------------------------------------
cuda_source = """
#include <cuda_runtime.h>
#include <torch/extension.h>

// Tiled GEMM with on-the-fly dequantization of 4-bit weights: C = A * dequant(B).
//
// A is indexed as A[row * K + k] and C as C[row * N + col]. B is indexed as
// B[k * (N / 8) + col / 8]: every int holds eight 4-bit values, value j of a word
// occupying bits [4*j, 4*j + 3]. A packed value q belonging to column n is
// dequantized as (q - zero_point[n]) * scale[n].
//
// Template parameters:
//   BM, BN  - rows / columns of C produced by one thread block.
//   BK      - depth of the K slice staged in shared memory per iteration.
//   TM      - rows of C accumulated by one thread.
//   TN_PHY  - packed ints consumed per thread per k step (TN_PHY * 8 columns of C).
//
// NOTE(review): the index math appears to assume a 1-D block of
// (BM / TM) * (BN / (TN_PHY * 8)) threads that also equals BM * BK / 4; the host
// wrapper launches 256 threads with <128, 128, 8, 8, 1>. Nothing in the kernel
// enforces this - confirm before changing any tile size.
// NOTE(review): the B tile load has no column bounds check, and the float4 / int4
// accesses need 16-byte-aligned addresses; the kernel verifies neither, so the
// caller has to guarantee them.
template<int BM, int BN, int BK, int TM, int TN_PHY>
__global__ void sgemm2D_kernel(int M, int K, int N, float* A, int* B, float* C, float* scale, float* zero_point)
{
    // Tile coordinates of this block within C.
    int cRow = blockIdx.y;
    int cCol = blockIdx.x;

    // Logical (unpacked) columns per thread: 8 four-bit values per packed int.
    constexpr int TN_LOG = TN_PHY * 8;

    // Position of this thread's TM x TN_LOG sub-tile inside the block tile.
    int threadRow = threadIdx.x / (BN / TN_LOG);
    int threadCol = threadIdx.x % (BN / TN_LOG);

    // Shared-memory staging tiles. As is written k-major (As[k * BM + row]);
    // Bs holds BK rows of BN / 8 packed ints.
    __shared__ float As[BM * BK];
    __shared__ int Bs[BK * (BN / 8)];

    // A tile: BM * BK = 128 * 8 = 1024 floats for the launched configuration;
    // each of the 256 threads loads one float4.
    int innerRowA = threadIdx.x / (BK / 4);
    int innerColA = threadIdx.x % (BK / 4);

    // B tile: (BN / 8) * BK = (128 / 8) * 8 = 128 packed ints, loaded as int4 vectors.
    int innerRowB = threadIdx.x / (BN / 8 / 4);
    int innerColB = threadIdx.x % (BN / 8 / 4);

    // Move the base pointers to the first row / column this block works on.
    A += cRow * BM * K;
    B += cCol * (BN / 8);
    C += cRow * BM * N + cCol * BN;

    // Per-thread accumulators and register copies of the current A / B fragments.
    float threadResults[TM * TN_LOG] = {0.0};
    float regM[TM] = {0.0};
    int regN[TN_PHY] = {0};

    // Per-thread copies of the dequantization parameters of this thread's columns.
    float my_scales[TN_LOG];
    float my_zeros[TN_LOG];

    // Columns at or beyond N get neutral parameters (scale 1, zero point 0).
    for (int i = 0; i < TN_LOG; ++i) {
        int globalN = cCol * BN + threadCol * TN_LOG + i;
        if (globalN < N) {
            my_scales[i] = scale[globalN];
            my_zeros[i]  = zero_point[globalN];
        }
        else {
            my_scales[i] = 1.0f;
            my_zeros[i] = 0.0f;
        }
    }

    // Walk over K in slices of BK.
    for (int bk = 0; bk < K; bk += BK) {

        // Global coordinates of the float4 of A this thread is responsible for.
        int globalM = cRow * BM + innerRowA;
        int globalK = bk + innerColA * 4;

        // Zero by default, so out-of-range elements contribute nothing to the sum.
        float4 tmp = {0.0f, 0.0f, 0.0f, 0.0f};

        if (globalM < M) {
            if (globalK + 4 <= K) {
                // All four elements are inside the row: single vector load.
                tmp = reinterpret_cast<float4 *>(&A[innerRowA * K + innerColA * 4])[0];
            }
            else {
                // Tail of the row: load element by element, guarding each one.
                float* ptr = &A[innerRowA * K + innerColA * 4];
                if (globalK + 0 < K) tmp.x = ptr[0];
                if (globalK + 1 < K) tmp.y = ptr[1];
                if (globalK + 2 < K) tmp.z = ptr[2];
                if (globalK + 3 < K) tmp.w = ptr[3];
            }
        }

        // Store transposed so that the BM values for one k are contiguous in As.
        As[(innerColA * 4 + 0) * BM + innerRowA] = tmp.x;
        As[(innerColA * 4 + 1) * BM + innerRowA] = tmp.y;
        As[(innerColA * 4 + 2) * BM + innerRowA] = tmp.z;
        As[(innerColA * 4 + 3) * BM + innerRowA] = tmp.w;

        // Only the first BK * (BN / 8) / 4 threads load B (8 * 4 = 32 for the
        // launched configuration), one int4 each.
        if (threadIdx.x < BK * (BN / 8) / 4) {

            if (bk + innerRowB < K) {
                reinterpret_cast<int4 *>(&Bs[innerRowB * (BN / 8) + innerColB * 4])[0] =
                reinterpret_cast<int4 *>(&B[innerRowB * (N / 8) + innerColB * 4])[0];
            }
            else {
                // Rows past K are zero-filled. A zero word still dequantizes to
                // (0 - zero_point) * scale, but the matching As entries are zero,
                // so the product adds nothing (for finite parameters).
                reinterpret_cast<int4 *>(&Bs[innerRowB * (BN / 8) + innerColB * 4])[0] = {0, 0, 0, 0};
            }
        }
        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Advance the global pointers to the next K slice.
        A += BK;
        B += BK * (N / 8);

        for (int dot = 0; dot < BK; ++dot) {
            // Cache this thread's column of As and row fragment of Bs in registers.
            for (int i = 0; i < TM; ++i){
                regM[i] = As[dot * BM + threadRow * TM + i];
            }
            for (int i = 0; i < TN_PHY; ++i) {
                regN[i] = Bs[dot * (BN / 8) + threadCol * TN_PHY + i];
            }

            for (int i = 0; i < TN_PHY; ++i) {
                int packed_val = regN[i];

                // Unpack the eight 4-bit values of the word, lowest bits first.
                for (int subN = 0; subN < 8; ++subN) {
                    int int4_val = (packed_val >> (subN * 4)) & 0xF;
                    float real_val = (float(int4_val) - my_zeros[i * 8 + subN]) * my_scales[i * 8 + subN];

                    // Rank-1 update of the TM accumulators of this column.
                    for (int m = 0; m < TM; ++m) {
                        int resNidx = i * 8 + subN;

                        threadResults[m * TN_LOG + resNidx] += regM[m] * real_val;
                    }
                }
            }
        }
        // All reads of the tiles must finish before the next iteration overwrites them.
        __syncthreads();
    }

    // Write the accumulators back to C, four columns at a time.
    for (uint resIdxM = 0; resIdxM < TM; resIdxM += 1) {
        for (uint resIdxN = 0; resIdxN < TN_LOG; resIdxN += 4) {
            int globalRowC = cRow * BM + threadRow * TM + resIdxM;
            int globalColC = cCol * BN + threadCol * TN_LOG + resIdxN;

            if (globalRowC < M) {
                float4 tmp;
                tmp.x = threadResults[resIdxM * TN_LOG + resIdxN];
                tmp.y = threadResults[resIdxM * TN_LOG + resIdxN + 1];
                tmp.z = threadResults[resIdxM * TN_LOG + resIdxN + 2];
                tmp.w = threadResults[resIdxM * TN_LOG + resIdxN + 3];

                if (globalColC + 4 <= N) {
                    // All four columns are in range: single vector store.
                    reinterpret_cast<float4 *>(&C[(threadRow * TM + resIdxM) * N + threadCol * TN_LOG + resIdxN])[0] = tmp;
                }
                else {
                    // Tail of the row: store element by element, guarding each one.
                    float* ptr = &C[(threadRow * TM + resIdxM) * N + threadCol * TN_LOG + resIdxN];
                    if (globalColC + 0 < N) ptr[0] = tmp.x;
                    if (globalColC + 1 < N) ptr[1] = tmp.y;
                    if (globalColC + 2 < N) ptr[2] = tmp.z;
                    if (globalColC + 3 < N) ptr[3] = tmp.w;
                }
            }
        }
    }
}

// Host entry point: validates the arguments and launches sgemm2D_kernel so that
// C = A * dequant(B_packed).
//
// Arguments (contiguous CUDA tensors on one device):
//   A           float32 [M, K]        left operand
//   B_packed    int32   [>= K, N / 8] eight 4-bit values per int
//   C           float32 [M, N]        output, overwritten in place
//   scale       float32, >= N elems   per-column scale
//   zero_point  float32, >= N elems   per-column zero point
//
// Every precondition the kernel relies on is enforced with TORCH_CHECK, which
// raises a c10::Error (surfaced to Python as RuntimeError) instead of letting
// the kernel read or write out of bounds.
//
// NOTE(review): the kernel is launched on the default stream of the current
// device, not on PyTorch's current stream - confirm this is acceptable for
// callers that use non-default streams or several GPUs.
void sgemm_int4_cuda(torch::Tensor A, torch::Tensor B_packed, torch::Tensor C, torch::Tensor scale, torch::Tensor zero_point) {

    // Tile sizes; they must match the template arguments of the launch below.
    const int BM = 128;
    const int BN = 128;
    const int BK = 8;

    // 1. Device, dtype and rank checks. The kernel dereferences raw device
    //    pointers, so a CPU tensor or a wrong dtype must be rejected up front.
    TORCH_CHECK(A.is_cuda() && B_packed.is_cuda() && C.is_cuda() && scale.is_cuda() && zero_point.is_cuda(),
                "A, B_packed, C, scale and zero_point must all be CUDA tensors");
    TORCH_CHECK(B_packed.device() == A.device() && C.device() == A.device() &&
                scale.device() == A.device() && zero_point.device() == A.device(),
                "all tensors must be on the same CUDA device");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32, "A must be float32");
    TORCH_CHECK(B_packed.scalar_type() == torch::kInt32, "B_packed must be int32");
    TORCH_CHECK(C.scalar_type() == torch::kFloat32, "C must be float32");
    TORCH_CHECK(scale.scalar_type() == torch::kFloat32, "scale must be float32");
    TORCH_CHECK(zero_point.scalar_type() == torch::kFloat32, "zero_point must be float32");
    TORCH_CHECK(A.dim() == 2 && B_packed.dim() == 2 && C.dim() == 2, "A, B_packed and C must be 2-D");

    const int64_t M64 = A.size(0);
    const int64_t K64 = A.size(1);
    const int64_t N64 = C.size(1);

    // 2. Tile-multiple requirement: the B tile load has no column bounds check.
    TORCH_CHECK(N64 % BN == 0, "N must be a multiple of 128");

    // 3. Vectorization requirements.
    // A inner dim (K) must be a multiple of 4 (float4 loads).
    TORCH_CHECK(K64 % 4 == 0, "K must be multiple of 4 for float4 loading");
    // B packed inner dim (N / 8) must be a multiple of 4 (int4 loads) -> N multiple of 32.
    TORCH_CHECK(N64 % 32 == 0, "N must be multiple of 32 for int4 loading");

    // 4. Contiguity: the kernel assumes dense row-major storage.
    TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
    TORCH_CHECK(B_packed.is_contiguous(), "B_packed must be contiguous");
    TORCH_CHECK(C.is_contiguous(), "C must be contiguous");
    TORCH_CHECK(scale.is_contiguous(), "scale must be contiguous");
    TORCH_CHECK(zero_point.is_contiguous(), "zero_point must be contiguous");

    // 5. Shape consistency between the operands.
    TORCH_CHECK(C.size(0) == M64, "C must have the same number of rows as A");
    TORCH_CHECK(B_packed.size(0) >= K64 && B_packed.size(1) == N64 / 8,
                "B_packed must have shape [>= K, N / 8]");
    TORCH_CHECK(scale.numel() >= N64 && zero_point.numel() >= N64,
                "scale and zero_point must hold at least N elements");

    // 6. The kernel computes element offsets with 32-bit ints.
    const int64_t kIntMax = 2147483647;
    TORCH_CHECK(M64 <= kIntMax && K64 <= kIntMax && N64 <= kIntMax,
                "M, K and N must each fit in a 32-bit int");
    TORCH_CHECK(M64 * K64 <= kIntMax && M64 * N64 <= kIntMax,
                "A and C must each hold fewer than 2^31 elements");

    // 7. float4 / int4 accesses need 16-byte-aligned base addresses.
    TORCH_CHECK(reinterpret_cast<uintptr_t>(A.data_ptr<float>()) % 16 == 0 &&
                reinterpret_cast<uintptr_t>(B_packed.data_ptr<int>()) % 16 == 0 &&
                reinterpret_cast<uintptr_t>(C.data_ptr<float>()) % 16 == 0,
                "A, B_packed and C must be 16-byte aligned for vectorized access");

    // Nothing to compute; a grid with a zero dimension would be an invalid launch.
    if (M64 == 0 || N64 == 0) {
        return;
    }

    const int M = static_cast<int>(M64);
    const int K = static_cast<int>(K64);
    const int N = static_cast<int>(N64);

    const int64_t grid_y = (M64 + BM - 1) / BM;
    TORCH_CHECK(grid_y <= 65535, "M is too large: gridDim.y would exceed 65535");

    dim3 blockDim(256);
    dim3 gridDim((N + BN - 1) / BN, static_cast<unsigned int>(grid_y));

    sgemm2D_kernel<BM, BN, BK, 8, 1><<<gridDim, blockDim>>>(
        M, K, N,
        A.data_ptr<float>(),
        B_packed.data_ptr<int>(),
        C.data_ptr<float>(),
        scale.data_ptr<float>(),
        zero_point.data_ptr<float>()
    );

    // Report launch-configuration errors immediately instead of at a later, unrelated call.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "sgemm2D_kernel launch failed: ", cudaGetErrorString(launch_status));
}
"""

# C++ prototype of the host wrapper; load_inline uses it to generate the Python binding.
cpp_source = (
    "void sgemm_int4_cuda("
    "torch::Tensor A, torch::Tensor B_packed, torch::Tensor C, "
    "torch::Tensor scale, torch::Tensor zero_point);"
)

# ---------------------------------------------------------
# 2. Compile (JIT)
# ---------------------------------------------------------
# Built when this module is executed; `cuda_source` is the kernel string defined above.
sgemm_module = load_inline(
    name="sgemm_int4_v1",
    functions=["sgemm_int4_cuda"],
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    with_cuda=True,
    # -lineinfo keeps source-line information in the device code for the profilers.
    extra_cuda_cflags=["-O3", "-lineinfo"],
    verbose=False,
)

# ---------------------------------------------------------
# 3. Quantized Layer Definition
# ---------------------------------------------------------
class QuantizedConv2d(nn.Module):
    """Convolution expressed as im2col followed by the int4-weight GEMM extension.

    The weights live in three buffers, initialised here with placeholder values:

    * ``w_packed``   int32   ``(K_padded, N_padded // 8)``, eight 4-bit values per int
    * ``scale``      float32 ``(N_padded,)``, initialised to 1
    * ``zero_point`` float32 ``(N_padded,)``, initialised to 0

    where ``K = in_channels * kernel_size ** 2`` is rounded up to a multiple of
    ``BK`` and ``N = out_channels`` to a multiple of ``BN``.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels (GEMM ``N`` before padding).
        kernel_size: Side length of the square kernel (an int).
        stride: Stride forwarded to ``F.unfold``.
        padding: Padding forwarded to ``F.unfold``.
        BM: Accepted for signature compatibility; not used by this class.
        BK: Multiple to which ``K`` is padded for the weight buffer.
        BN: Multiple to which ``N`` is padded for all buffers.
        device: Device on which the buffers are created. ``None`` (default)
            selects CUDA when it is available and the CPU otherwise, so the
            module can also be constructed on machines without a GPU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, BM=128, BK=8, BN=128,
                 device=None):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.K = self.in_channels * self.kernel_size * self.kernel_size
        self.N = self.out_channels

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Round both GEMM dimensions up to the tile sizes the kernel works with.
        pad_n = (BN - self.N % BN) % BN
        pad_k = (BK - self.K % BK) % BK
        n_padded = self.N + pad_n
        k_padded = self.K + pad_k

        # Placeholder buffers: all-zero packed weights, unit scale, zero offset.
        self.register_buffer('w_packed', torch.zeros(k_padded, n_padded // 8, dtype=torch.int32, device=device))
        self.register_buffer('scale', torch.ones(n_padded, dtype=torch.float32, device=device))
        self.register_buffer('zero_point', torch.zeros(n_padded, dtype=torch.float32, device=device))

    def forward(self, x):
        """Run im2col + quantized GEMM on a 4-D input batch.

        Args:
            x: Tensor of shape ``(batch, in_channels, height, width)``.

        Returns:
            The raw GEMM result of shape ``(batch * L, N_padded)``, where ``L`` is
            the number of sliding-window positions per sample. The padded output
            columns are kept and the result is not reshaped back to an image.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
        """
        if x.dim() != 4:
            raise ValueError(f"expected a 4-D input (batch, channels, height, width), got {x.dim()}-D")

        patches = F.unfold(x, kernel_size=self.kernel_size, padding=self.padding, stride=self.stride)
        gemm_input = patches.transpose(1, 2).reshape(-1, self.K)

        # The weight buffer has K rounded up to a multiple of BK. Zero-pad the
        # activations to the same depth so the operand shapes agree; the extra
        # columns are zero and add nothing to the result. Without this, any K
        # that is not a multiple of 4 is rejected by the extension.
        k_padded = self.w_packed.shape[0]
        if k_padded != self.K:
            gemm_input = F.pad(gemm_input, (0, k_padded - self.K))
        gemm_input = gemm_input.contiguous()

        n_padded = self.w_packed.shape[1] * 8
        output = torch.empty((gemm_input.shape[0], n_padded), dtype=torch.float32, device=x.device)

        # This call is the profiling target.
        sgemm_module.sgemm_int4_cuda(gemm_input, self.w_packed, output, self.scale, self.zero_point)

        return output

# ---------------------------------------------------------
# 4. Main Execution Block for Profiling
# ---------------------------------------------------------
def main():
    """Run the quantized layer under NVTX ranges so nsys / ncu can attribute time.

    Five warm-up forward passes run first, outside any NVTX range. Ten measured
    passes follow, each in its own ``Iter_<i>`` range nested inside
    ``My_Model_Inference``.
    """
    device = torch.device('cuda')
    warmup_iters = 5
    profiled_iters = 10

    # A single heavy layer is enough to exercise the kernel; a full model
    # could be loaded here instead.
    model = QuantizedConv2d(64, 128, kernel_size=3, stride=1, padding=1).to(device)
    # Create the input directly on the GPU instead of copying it from the host.
    dummy_input = torch.randn(32, 64, 32, 32, device=device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        print("Warm-up...")
        for _ in range(warmup_iters):
            model(dummy_input)
        torch.cuda.synchronize()

        print("Profiling Start...")

        # nvtx.range pops the range even if the body raises, which a manual
        # range_push / range_pop pair does not.
        with nvtx.range("My_Model_Inference"):
            for i in range(profiled_iters):
                with nvtx.range(f"Iter_{i}"):
                    model(dummy_input)
                    # Synchronize so the range covers the GPU work, not just the
                    # launch (remove for deployment).
                    torch.cuda.synchronize()

    print("Done.")


if __name__ == "__main__":
    main()

Writing profile_run.py


In [4]:
# Profile profile_run.py with Nsight Systems, tracing CUDA activity and NVTX ranges.
# The earlier form was --trace=cuda,nvtx,osrt; osrt has been removed from the list here.
# --stats=true prints the summary reports after the run, and --force-overwrite=true
# replaces an existing nsys_result_32_im2col_fuse report.
!nsys profile \
  --trace=cuda,nvtx \
  --output=nsys_result_32_im2col_fuse \
  --force-overwrite=true \
  --stats=true \
  python profile_run.py

Warm-up...
Profiling Start...
Done.
Generating '/tmp/nsys-report-a59d.qdstrm'
[3/7] Executing 'nvtx_sum' stats report

 Time (%)  Total Time (ns)  Instances    Avg (ns)      Med (ns)     Min (ns)    Max (ns)   StdDev (ns)   Style         Range       
 --------  ---------------  ---------  ------------  ------------  ----------  ----------  -----------  -------  ------------------
     50.0       51,499,099          1  51,499,099.0  51,499,099.0  51,499,099  51,499,099          0.0  PushPop  My_Model_Inference
      5.1        5,204,104          1   5,204,104.0   5,204,104.0   5,204,104   5,204,104          0.0  PushPop  Iter_0            
      5.0        5,159,819          1   5,159,819.0   5,159,819.0   5,159,819   5,159,819          0.0  PushPop  Iter_1            
      5.0        5,146,267          1   5,146,267.0   5,146,267.0   5,146,267   5,146,267          0.0  PushPop  Iter_5            
      5.0        5,142,368          1   5,142,368.0   5,142,368.0   5,142,368   5,142,368

In [5]:
!./sgemm_run

/bin/bash: line 1: ./sgemm_run: No such file or directory


In [6]:
# Colab 셀에서 실행
!ncu \
  --set full \
  --kernel-name regex:sgemm2D \
  --launch-count 1 \
  -o ncu_result_32_im2col_fuse \
  -f \
  python profile_run.py

==PROF== Connected to process 2318 (/usr/bin/python3.12)
Warm-up...
Profiling Start...
Done.
==PROF== Disconnected from process 2318
Available Kernels:
1. sgemm2D
