## 1. How GPU Works?

In [1]:
# 1. 환경 설정 : 필요한 패키지 설치
!pip install pynvml transformers --quiet

In [2]:
# 2. 라이브러리 import 및 GPU 확인
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# GPU 연결 확인
if not torch.cuda.is_available():
	raise RuntimeError("GPU undetected.")
device = torch.device("cuda:0")
print(f'Currently using {device}')

# NVML 초기화
nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)

Currently using cuda:0


In [3]:
# 3. GPU 메모리 사용량 확인 함수 정의 및 초기 상태
def print_gpu_mem(label: str):
    info = nvmlDeviceGetMemoryInfo(handle)
    used_gb = info.used / (1024**3)
    total_gb = info.total / (1024**3)
    print(f'[{label}] GPU 메모리 사용량: {used_gb:.2f} GB / {total_gb:.2f} GB')

print_gpu_mem("초기 상태")

[초기 상태] GPU 메모리 사용량: 1.16 GB / 15.92 GB


In [4]:
# 4. CPU 텐서 생성 (GPU 사용 전)
# 512*512 크기 텐서를 CPU 상에서 생성해보자.
x_cpu = torch.randn(512, 512)
# GPU 메모리 사용량 변화 없음
print_gpu_mem("CPU 텐서 생성 후")

[CPU 텐서 생성 후] GPU 메모리 사용량: 1.16 GB / 15.92 GB


In [5]:
# 5. 작은 GPU 텐서 (1*1) 올리기
# 1x1 크기 텐서를 GPU로 옮기면 CUDA 커널이 로드되어 약 1-2GB 사용
x_small = torch.randn(1, 1).to(device)
print_gpu_mem("1x1 텐서 GPU 로드 후")

[1x1 텐서 GPU 로드 후] GPU 메모리 사용량: 1.38 GB / 15.92 GB


In [6]:
# 5.1. 실제 텐서 데이터 크기 확인
import torch

# 1x1 float32 텐서 생성 및 GPU로 이동
x = torch.randn(1, 1, device="cuda:0")

# 요소 개수 및 원소당 바이트 수 계산
numel = x.nelement()         # 요소 개수 : 1
bytes_per = x.element_size() # float32 한 요소당 4바이트

# 실제 데이터 바이트 수 계산
tensor_bytes = numel * bytes_per # 총 4byte
tensor_mb = tensor_bytes / (1024 ** 2)
tensor_gb = tensor_bytes / (1024 ** 3)

# 출력: B, MB, GB 단위 모두 표시
print(f"실제 텐서 데이터 크기: {tensor_bytes} B  ({tensor_mb:.6f} MB, {tensor_gb:.9f} GB)")

실제 텐서 데이터 크기: 4 B  (0.000004 MB, 0.000000004 GB)


In [7]:
# 6. BERT-Large 모델 로드 및 GPU로 이동
from transformers import BertForMaskedLM, BertTokenizerFast

# 모델과 토크나이저 로드
model_name = "google-bert/bert-large-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# 모델을 GPU로 이동
model.to(device)
print_gpu_mem("BERT-Large GPU 로드 후")

2025-08-01 12:59:22.214347: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-01 12:59:22.221740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754020762.230221   11067 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754020762.232925   11067 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754020762.239910   11067 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

[BERT-Large GPU 로드 후] GPU 메모리 사용량: 2.63 GB / 15.92 GB


In [8]:
# 7. 간단 학습 스텝 (batch size=4)
from torch.optim import AdamW

# 더미 입력(batch_size=4, seq_len=8) 생성
batch_size, seq_len = 4, 8
input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, seq_len), device=device)
attention_mask = torch.ones_like(input_ids, device=device)
labels = input_ids.clone() # 자기 자신을 예측하도록

optimizer = AdamW(model.parameters(), lr=1e-5)

model.train()
optimizer.zero_grad()
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
loss.backward()
optimizer.step()

print(f'Loss: {loss.item():.4f}')
print_gpu_mem("학습 스텝(batch=4) 후")

Loss: 13.2833
[학습 스텝(batch=4) 후] GPU 메모리 사용량: 7.77 GB / 15.92 GB


  return forward_call(*args, **kwargs)


In [12]:
import time
import pandas as pd

def time_operation(operation, *args, n_iters=100):
    # GPU sync 후 정확한 시간 측정 시작
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(n_iters):
        operation(*args)
    torch.cuda.synchronize()
    end = time.time()
    
    avg_time_ms = (end - start) / n_iters * 1000  # ms 단위
    return avg_time_ms

In [13]:
# 1) GEMM (2048*2048)
size = 2048
A = torch.randn(size, size, device=device)
B = torch.randn(size, size, device=device)
gemm_time = time_operation(lambda x, y: x.matmul(y), A, B)

# 2) LayerNorm (2048*2048)
seq_len, hidden = 2048, 2048
x = torch.randn(seq_len, hidden, device=device)
ln = torch.nn.LayerNorm(hidden).to(device)
ln_time = time_operation(lambda inp: ln(inp), x)

# 3) Element-wise ReLU (2048*2048)
y = torch.randn(size, size, device=device)
elt_time = time_operation(lambda inp: inp.relu(), y)

# 결과 프레임
df = pd.DataFrame({
	'Operation': ['GEMM (2048*2048)', 'LayerNorm (2048*2048)', 'Element-wise ReLU (2048*2048)'],
    'Avg Time (ms)': [gemm_time, ln_time, elt_time]
})
print(df)

                       Operation  Avg Time (ms)
0               GEMM (2048*2048)       0.590971
1          LayerNorm (2048*2048)       0.019050
2  Element-wise ReLU (2048*2048)       0.013139


## 2. Batch Size Choice

In [20]:
# 본인 컴퓨터 사양에서는 얼마의 배치가 최적인지? (사용 모델 및 연산 종류 등에 따라서 더 달라질 수 있음)

import torch
import time
from torch.cuda.amp import autocast, GradScaler

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torch.nn.Sequential(
    torch.nn.Linear(2048, 2048),
    torch.nn.ReLU(),
    torch.nn.Linear(2048, 2048)
).to(device)
scaler = GradScaler()

for B in [4, 8, 16, 32, 64, 128, 256, 512, 1024]:
    x = torch.randn(B, 2048, device=device)
    y = torch.randn(B, 2048, device=device)

    optimizer = torch.optim.Adam(model.parameters())
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    mem_start = torch.cuda.memory_allocated()

    start = time.time()
    with autocast(dtype=torch.float16):
        out = model(x)
        loss = (out - y).abs().mean()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    torch.cuda.synchronize()
    elapsed_ms = (time.time() - start) * 1000

    mem_end = torch.cuda.memory_allocated()
    delta_gb = (mem_end - mem_start) / 1024**3

    print(f"B={B:4} | Time: {elapsed_ms:6.2f} ms | ∆Mem: {delta_gb:+.2f} GB")


B=   4 | Time:   2.01 ms | ∆Mem: +0.03 GB
B=   8 | Time:   1.50 ms | ∆Mem: +0.06 GB
B=  16 | Time:   1.46 ms | ∆Mem: +0.06 GB
B=  32 | Time:   1.42 ms | ∆Mem: +0.06 GB
B=  64 | Time:   1.56 ms | ∆Mem: +0.06 GB
B= 128 | Time:   1.55 ms | ∆Mem: +0.06 GB
B= 256 | Time:   1.49 ms | ∆Mem: +0.06 GB
B= 512 | Time:   1.57 ms | ∆Mem: +0.06 GB
B=1024 | Time:   1.83 ms | ∆Mem: +0.06 GB


  scaler = GradScaler()
  with autocast(dtype=torch.float16):


In [22]:
# Tiling experiment
# 테스트할 데이터 타입들
dtypes = [torch.float32, torch.float16]

# 테스트할 행렬 크기들
shapes = [
    (511, 511),  # 타일보다 약간 작은 경우
    (512, 512),  # 이상적인 타일 크기
    (513, 513),  # 타일보다 약간 큰 경우
]

for dtype in dtypes:
    print(f"\n🔍 dtype = {dtype}")
    for H, W in shapes:
        A = torch.randn(H, W, device=device, dtype=dtype)
        B = torch.randn(W, H, device=device, dtype=dtype)

        # 워밍업 (초기 CUDA 실행 지연 제거용)
        for _ in range(5):
            _ = A @ B
        torch.cuda.synchronize()

        # 속도 측정
        start = time.time()
        for _ in range(20):
            _ = A @ B
        torch.cuda.synchronize()
        elapsed = (time.time() - start) / 20

        print(f"▶ {H}×{W} 행렬곱: {elapsed * 1000:.2f} ms")



🔍 dtype = torch.float32
▶ 511×511 행렬곱: 0.03 ms
▶ 512×512 행렬곱: 0.02 ms
▶ 513×513 행렬곱: 0.04 ms

🔍 dtype = torch.float16
▶ 511×511 행렬곱: 0.02 ms
▶ 512×512 행렬곱: 0.01 ms
▶ 513×513 행렬곱: 0.02 ms


## 3. Gradient Accumulation

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# 1. 하이퍼파라미터 설정
batch_size = 32              # 작은 배치
accumulation_steps = 4       # 누적할 step 수
lr = 0.01
num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. 데이터셋 준비
transform = transforms.ToTensor()
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 3. 모델 정의
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(28*28, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
).to(device)

# 4. 손실 함수 및 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

# 5. 학습 루프 (Gradient Accumulation)
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    optimizer.zero_grad()

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # forward + loss
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss = loss / accumulation_steps  # 누적을 위한 평균화

        # backward (누적)
        loss.backward()

        # 일정 횟수마다 optimizer step + grad 초기화
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item()

        if (i + 1) % (accumulation_steps * 10) == 0:
            print(f"[Epoch {epoch+1}], Step {i+1}, Loss: {running_loss:.4f}")
            running_loss = 0.0

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


100%|██████████████████████████████████████| 9.91M/9.91M [00:03<00:00, 2.69MB/s]
100%|███████████████████████████████████████| 28.9k/28.9k [00:00<00:00, 144kB/s]
100%|██████████████████████████████████████| 1.65M/1.65M [00:01<00:00, 1.27MB/s]
100%|██████████████████████████████████████| 4.54k/4.54k [00:00<00:00, 6.73MB/s]


[Epoch 1], Step 40, Loss: 23.0305
[Epoch 1], Step 80, Loss: 22.8590
[Epoch 1], Step 120, Loss: 22.6783
[Epoch 1], Step 160, Loss: 22.5134
[Epoch 1], Step 200, Loss: 22.3058
[Epoch 1], Step 240, Loss: 22.1221
[Epoch 1], Step 280, Loss: 21.9336
[Epoch 1], Step 320, Loss: 21.6852
[Epoch 1], Step 360, Loss: 21.5077
[Epoch 1], Step 400, Loss: 21.3264
[Epoch 1], Step 440, Loss: 21.0508
[Epoch 1], Step 480, Loss: 20.8227
[Epoch 1], Step 520, Loss: 20.6629
[Epoch 1], Step 560, Loss: 20.3314
[Epoch 1], Step 600, Loss: 20.0644
[Epoch 1], Step 640, Loss: 19.7670
[Epoch 1], Step 680, Loss: 19.6221
[Epoch 1], Step 720, Loss: 19.3917
[Epoch 1], Step 760, Loss: 19.0303
[Epoch 1], Step 800, Loss: 18.8091
[Epoch 1], Step 840, Loss: 18.3280
[Epoch 1], Step 880, Loss: 17.9145
[Epoch 1], Step 920, Loss: 17.6815
[Epoch 1], Step 960, Loss: 17.3458
[Epoch 1], Step 1000, Loss: 16.9637
[Epoch 1], Step 1040, Loss: 16.5880
[Epoch 1], Step 1080, Loss: 16.3032
[Epoch 1], Step 1120, Loss: 15.8250
[Epoch 1], Step 11

## 4. Gradient Checkpointing

In [25]:
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

class MyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

    def forward(self, x):
        return checkpoint.checkpoint(self.seq, x)

class BigModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1 = MyBlock()
        self.block2 = MyBlock()
        self.fc = nn.Linear(1024, 10)

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        return self.fc(x)

model = BigModel().cuda()
x = torch.randn(64, 1024, device='cuda')  # batch size = 64
output = model(x)
print(output)

tensor([[-3.7616e-02, -3.1428e-02,  6.8262e-03,  3.8644e-02, -6.3441e-03,
         -1.9613e-02,  4.1357e-02,  1.9341e-02,  5.2598e-03,  6.3985e-03],
        [-1.6576e-02, -5.3709e-02,  4.7012e-03,  4.2796e-02,  3.5075e-02,
         -2.6529e-02,  2.8517e-02,  1.2006e-02, -3.5837e-05, -4.5435e-03],
        [ 9.6738e-04, -1.7537e-02, -1.0928e-02,  3.5094e-02, -4.9903e-03,
         -8.8588e-03,  2.0412e-02,  9.8992e-03,  1.5639e-02, -2.4861e-03],
        [-2.0573e-02, -3.0641e-02, -1.2131e-02,  4.4564e-02,  1.8414e-02,
         -1.8255e-02,  3.4164e-02,  1.8577e-03,  3.5629e-03,  5.7991e-03],
        [-1.6200e-02, -5.1690e-02,  1.1797e-02,  5.1125e-02,  1.7877e-02,
         -1.3097e-02,  1.8381e-02,  2.4275e-02,  1.6612e-03, -4.7723e-03],
        [-5.3039e-04, -2.5847e-02,  2.3918e-02,  3.3754e-02,  5.1899e-03,
         -9.6100e-03,  3.9026e-02,  1.4256e-02,  5.0952e-04, -3.9692e-03],
        [-1.2131e-02, -1.7107e-02,  4.9355e-03,  5.8596e-02,  1.5842e-02,
         -2.2700e-02,  1.5244e-0

## 5. Mixed Precision Training

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,                # 실 배치
    gradient_accumulation_steps=8,                # 누적 배치 → 총 4x8=32
    fp16=True,                                    # Mixed Precision 활성화 (FP16)
    # bf16=True,                                  # BF16 사용 시 주석 해제
    learning_rate=5e-5,
    logging_steps=50,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    report_to="wandb",                            # WandB 로깅
)
