In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
#로컬 모듈 import
import sys
import os
# Print the working directory so the relative path built below can be checked.
print(os.getcwd())

# Register the sibling `my_transformer` directory on sys.path, once only.
_relative_module_dir = os.path.join('..', 'my_transformer')
module_path = os.path.abspath(_relative_module_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from my_transformer.my_transformer import Transformer

/root/25-1-DS-Week-1-Assignment


Wandb Sweep 사용해보기!

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import wandb

# 데이터셋 정의
class PatternDataset(Dataset):
    """Synthetic arithmetic-progression dataset for next-value prediction.

    Each sample is a sequence ``[start, start + diff, ...]`` of
    ``sequence_length`` integers; the target is the value that continues
    the progression (``sequence[-1] + diff``).

    Args:
        num_samples: Number of (sequence, target) pairs to generate.
        sequence_length: Number of terms in each input sequence (must be >= 1).
        max_num: Exclusive upper bound for the first term; it is drawn from
            ``[0, max_num)``.
        max_diff: Exclusive upper bound for the common difference; it is drawn
            from ``[1, max_diff)``. Must be greater than 1.
        generator: Optional ``torch.Generator`` used for sampling. Pass a
            seeded generator to get a reproducible dataset; ``None`` uses
            torch's global RNG.

    Note:
        With the defaults there are only 10 * 4 = 40 distinct sequences, so
        independently generated datasets overlap heavily.
    """

    def __init__(self, num_samples=1000, sequence_length=4, max_num=10,
                 max_diff=5, generator=None):
        self.data = []
        self.targets = []
        for _ in range(num_samples):
            # Draw start first, then diff, so a given RNG state always
            # produces the same samples.
            start = torch.randint(0, max_num, (1,), generator=generator).item()
            diff = torch.randint(1, max_diff, (1,), generator=generator).item()
            sequence = [start + i * diff for i in range(sequence_length)]
            next_value = sequence[-1] + diff

            self.data.append(torch.tensor(sequence))
            # Targets stay plain Python ints.
            self.targets.append(next_value)

    def __len__(self):
        """Return the number of generated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sequence_tensor, next_value)`` for sample ``idx``."""
        return self.data[idx], self.targets[idx]

def train():
    """Run a single W&B sweep trial.

    Reads ``batch_size``, ``learning_rate`` and ``d_model`` from the run's
    config, trains a fresh ``Transformer`` on a newly generated
    ``PatternDataset`` for a fixed number of epochs, and logs the mean
    training loss of every epoch under the key ``"loss"``.
    """
    # Settings shared by every trial. Keeping each value in one place stops
    # the model definition and the reshape / progress message from drifting.
    src_vocab_size = 100
    tgt_vocab_size = 200
    num_epochs = 10

    # The context manager finishes the run when the block exits, so no
    # explicit wandb.finish() call is needed.
    with wandb.init() as run:
        config = run.config

        batch_size = config.batch_size
        learning_rate = config.learning_rate
        d_model = config.d_model

        dataset = PatternDataset(num_samples=1000)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        model = Transformer(src_vocab_size=src_vocab_size, tgt_vocab_size=tgt_vocab_size,
                            d_model=d_model, n_heads=4, d_ff=64,
                            num_encoder_layers=6, num_decoder_layers=6, dropout=0.1)

        # Loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0

            for src, tgt in dataloader:
                # The decoder is fed one all-zero token per sample; the label
                # is the next value of the progression.
                tgt_input = torch.zeros_like(tgt).unsqueeze(1)
                tgt = tgt.unsqueeze(1)

                optimizer.zero_grad()
                output = model(src, tgt_input)

                # Flatten to (N, tgt_vocab_size) logits and (N,) labels.
                # NOTE(review): assumes the model's last output dimension is
                # tgt_vocab_size; Transformer is defined outside this notebook.
                output = output.view(-1, tgt_vocab_size)
                tgt = tgt.view(-1)

                loss = criterion(output, tgt)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            # Mean of per-batch losses (not weighted by batch size).
            avg_loss = total_loss / len(dataloader)
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

            run.log({"epoch": epoch+1, "loss": avg_loss})


In [8]:

# Reference values mirroring the settings hard-coded inside train().
# NOTE(review): nothing visible in this notebook reads these names, and
# num_epochs (50) differs from the 10 epochs train() actually runs.
src_vocab_size, tgt_vocab_size = 100, 200
n_heads, d_ff = 4, 64
num_encoder_layers = num_decoder_layers = 6
dropout = 0.1
num_epochs = 50

# Sweep definition. "method" may be "random", "bayes" or "grid";
# the metric name must match the key that train() logs.
sweep_config = dict(
    method="grid",
    metric=dict(name="loss", goal="minimize"),
    parameters=dict(
        learning_rate=dict(values=[0.0001, 0.0005]),
        batch_size=dict(values=[32, 64]),
        d_model=dict(values=[32, 64]),
    ),
)

# Register the sweep (the project name must match the one used with wandb.init).
sweep_id = wandb.sweep(sweep_config, project="transformer")
# Run the agent for at most `count` trials (raise count to run more experiments).
wandb.agent(sweep_id, count=10, function=train)

Create sweep with ID: kqekerl2
Sweep URL: https://wandb.ai/25th-project-BubbleFreeNewsletter/transformer/sweeps/kqekerl2


[34m[1mwandb[0m: Agent Starting Run: 00in57p0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001


[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mblooming-sweep-1[0m at: [34mhttps://wandb.ai/25th-project-BubbleFreeNewsletter/transformer/runs/6g285zat[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250317_135554-6g285zat/logs[0m


Epoch 1/10, Loss: 4.9127
Epoch 2/10, Loss: 4.3619
Epoch 3/10, Loss: 4.0051
Epoch 4/10, Loss: 3.6795
Epoch 5/10, Loss: 3.4265
Epoch 6/10, Loss: 3.2102
Epoch 7/10, Loss: 3.0003
Epoch 8/10, Loss: 2.7934
Epoch 9/10, Loss: 2.6241
Epoch 10/10, Loss: 2.4880


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▄▄▃▂▂▁▁

0,1
epoch,10.0
loss,2.488


[34m[1mwandb[0m: Agent Starting Run: 65wftmw1 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 32
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10, Loss: 4.4490
Epoch 2/10, Loss: 3.3548
Epoch 3/10, Loss: 2.5953
Epoch 4/10, Loss: 1.9655
Epoch 5/10, Loss: 1.4389
Epoch 6/10, Loss: 1.0398
Epoch 7/10, Loss: 0.7561
Epoch 8/10, Loss: 0.5447
Epoch 9/10, Loss: 0.4039
Epoch 10/10, Loss: 0.3167


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
loss,0.31669


[34m[1mwandb[0m: Agent Starting Run: vhfjpfzg with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10, Loss: 4.6040
Epoch 2/10, Loss: 3.4410
Epoch 3/10, Loss: 2.7514
Epoch 4/10, Loss: 2.3065
Epoch 5/10, Loss: 1.9568
Epoch 6/10, Loss: 1.6732
Epoch 7/10, Loss: 1.4342
Epoch 8/10, Loss: 1.2160
Epoch 9/10, Loss: 1.0636
Epoch 10/10, Loss: 0.9104


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▄▄▃▂▂▂▁▁

0,1
epoch,10.0
loss,0.91044


[34m[1mwandb[0m: Agent Starting Run: nw92i7ap with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 64
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10, Loss: 3.2936
Epoch 2/10, Loss: 1.7073
Epoch 3/10, Loss: 0.8577
Epoch 4/10, Loss: 0.4335
Epoch 5/10, Loss: 0.2514
Epoch 6/10, Loss: 0.1686
Epoch 7/10, Loss: 0.1207
Epoch 8/10, Loss: 0.0917
Epoch 9/10, Loss: 0.0734
Epoch 10/10, Loss: 0.0598


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▃▂▁▁▁▁▁▁

0,1
epoch,10.0
loss,0.05979


[34m[1mwandb[0m: Agent Starting Run: pbumrr3y with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10, Loss: 5.2730
Epoch 2/10, Loss: 4.8589
Epoch 3/10, Loss: 4.5753
Epoch 4/10, Loss: 4.3020
Epoch 5/10, Loss: 4.0671
Epoch 6/10, Loss: 3.8628
Epoch 7/10, Loss: 3.7022
Epoch 8/10, Loss: 3.5296
Epoch 9/10, Loss: 3.3940
Epoch 10/10, Loss: 3.2582


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▇▆▅▄▃▃▂▁▁

0,1
epoch,10.0
loss,3.25821


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7ipfamqh with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 32
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10, Loss: 4.7867
Epoch 2/10, Loss: 3.8897
Epoch 3/10, Loss: 3.2792
Epoch 4/10, Loss: 2.7986
Epoch 5/10, Loss: 2.3567
Epoch 6/10, Loss: 1.9790
Epoch 7/10, Loss: 1.6557
Epoch 8/10, Loss: 1.3641
Epoch 9/10, Loss: 1.1345
Epoch 10/10, Loss: 0.9371


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▄▄▃▂▂▁▁

0,1
epoch,10.0
loss,0.93714


[34m[1mwandb[0m: Agent Starting Run: cubd51zd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10, Loss: 4.6493
Epoch 2/10, Loss: 3.9540
Epoch 3/10, Loss: 3.3814
Epoch 4/10, Loss: 2.9185
Epoch 5/10, Loss: 2.5787
Epoch 6/10, Loss: 2.2971
Epoch 7/10, Loss: 2.0760
Epoch 8/10, Loss: 1.8899
Epoch 9/10, Loss: 1.7135
Epoch 10/10, Loss: 1.5649


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,10.0
loss,1.56487


[34m[1mwandb[0m: Agent Starting Run: 6hz8ogjn with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 64
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10, Loss: 4.0633
Epoch 2/10, Loss: 2.5260
Epoch 3/10, Loss: 1.6698
Epoch 4/10, Loss: 1.0789
Epoch 5/10, Loss: 0.6991
Epoch 6/10, Loss: 0.4682
Epoch 7/10, Loss: 0.3276
Epoch 8/10, Loss: 0.2462
Epoch 9/10, Loss: 0.1921
Epoch 10/10, Loss: 0.1558


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
loss,0.1558


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


선택한 파라미터로 학습


In [9]:
import wandb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Configuration with the lowest final loss in the sweep output above.
best_hyperparams = dict(learning_rate=0.0005, batch_size=32, d_model=64)

# Start a dedicated run for the final training.
wandb.init(config=best_hyperparams, name="final_train", project="transformer")

# Larger training set than the 1000 samples used per sweep trial.
dataset = PatternDataset(num_samples=5000)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=best_hyperparams["batch_size"],
)

model = Transformer(
    src_vocab_size=100,
    tgt_vocab_size=200,
    d_model=best_hyperparams["d_model"],
    n_heads=4,
    d_ff=64,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dropout=0.1,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=best_hyperparams["learning_rate"])

# Have W&B record the model's gradients and parameters every 10 steps.
wandb.watch(model, log_freq=10, log="all")

def final_train(num_epochs=40, tgt_vocab_size=200):
    """Train the notebook-level ``model`` with the selected hyperparameters.

    Uses the notebook-level ``dataloader``, ``criterion`` and ``optimizer``.
    Logs the global gradient norm after every backward pass, and the mean
    loss and learning rate after every epoch, then finishes the W&B run.

    Args:
        num_epochs: Number of passes over ``dataloader``. The progress
            message reports this same value as the total.
        tgt_vocab_size: Size of the model's output vocabulary, used to
            flatten the logits. Must match the value the model was built with.
    """
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for src, tgt in dataloader:
            # The decoder is fed one all-zero token per sample; the label is
            # the next value of the progression.
            tgt_input = torch.zeros_like(tgt).unsqueeze(1)
            tgt = tgt.unsqueeze(1)

            optimizer.zero_grad()
            output = model(src, tgt_input)

            # NOTE(review): assumes the model's last output dimension is
            # tgt_vocab_size; Transformer is defined outside this notebook.
            output = output.view(-1, tgt_vocab_size)
            tgt = tgt.view(-1)

            loss = criterion(output, tgt)
            loss.backward()

            # Global gradient norm: the norm of the per-parameter norms.
            # Logged as a plain float; skipped if no parameter has a gradient.
            grad_norms = [torch.norm(p.grad) for p in model.parameters() if p.grad is not None]
            if grad_norms:
                wandb.log({"grad_norm": torch.norm(torch.stack(grad_norms)).item()})

            optimizer.step()

            total_loss += loss.item()

        # Mean of per-batch losses (not weighted by batch size).
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

        wandb.log({
            "epoch": epoch+1,
            "loss": avg_loss,
            "learning_rate": optimizer.param_groups[0]["lr"]
        })

    wandb.finish()

final_train()


Epoch 1/50, Loss: 1.4574
Epoch 2/50, Loss: 0.1062
Epoch 3/50, Loss: 0.0388


KeyboardInterrupt: 

In [4]:
# Build the test set with the same generator as the training data.
# With PatternDataset's defaults there are at most 40 distinct sequences, so the
# test samples overlap the training samples; the accuracy below measures fit,
# not generalization to unseen sequences.
test_dataset = PatternDataset(num_samples=1000)  # number of test samples
# `batch_size` exists only as a local inside train(); use the selected
# hyperparameters so this cell also runs on a fresh kernel.
test_dataloader = DataLoader(test_dataset, batch_size=best_hyperparams["batch_size"], shuffle=False)

def test_model(model, dataloader):
    """Measure next-value prediction accuracy of ``model`` on ``dataloader``.

    Prints the accuracy and, when a W&B run is active, logs it under
    ``"test_accuracy"`` and stores it in the run summary.

    Args:
        model: Callable taking ``(src, tgt_input)`` and returning logits.
        dataloader: Iterable yielding ``(src, tgt)`` batches.

    Returns:
        Accuracy as a percentage in ``[0, 100]``.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()  # switch to evaluation mode
    total_correct = 0
    total_samples = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for src, tgt in dataloader:
            # Same decoder input as in training: one all-zero token per sample.
            tgt_input = torch.zeros_like(tgt).unsqueeze(1)
            tgt = tgt.unsqueeze(1)  # make the target 2-D

            # Forward pass; the prediction is the arg-max over the last dimension.
            output = model(src, tgt_input)
            predicted = output.argmax(dim=-1)

            # Compare predictions with the true targets.
            correct = (predicted.view(-1) == tgt.view(-1)).sum().item()
            total_correct += correct
            total_samples += tgt.size(0)

    if total_samples == 0:
        raise ValueError("dataloader yielded no samples; cannot compute accuracy")

    # Accuracy as a percentage
    accuracy = total_correct / total_samples * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

    # Log to W&B only while a run is active. final_train() finishes its run,
    # so when the notebook is executed top to bottom there is none here.
    if wandb.run is not None:
        wandb.log({"test_accuracy": accuracy})
        wandb.run.summary["test_accuracy"] = accuracy
    else:
        print("No active W&B run; test accuracy was not logged.")

    return accuracy

# Evaluate the trained model
test_accuracy = test_model(model, test_dataloader)


Test Accuracy: 100.00%
