In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torchvision import datasets, transforms

In [2]:
# device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print('Current device is', device)

# for reproducibility
torch.manual_seed(2023)
if device == 'cuda':
    torch.cuda.manual_seed_all(2023)

Current device is cuda


In [3]:
import shutil
import os

# 손상된 캐시 폴더 삭제
cache_dir = os.path.expanduser("~/.cache/kagglehub")
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
    print("KaggleHub cache deleted.")

KaggleHub cache deleted.


In [4]:
import kagglehub

path = kagglehub.dataset_download("andrewmvd/metastatic-tissue-classification-patchcamelyon")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/andrewmvd/metastatic-tissue-classification-patchcamelyon?dataset_version_number=9...


100%|█████████████████████████████████████| 7.47G/7.47G [11:35<00:00, 11.5MB/s]

Extracting files...





Path to dataset files: C:\Users\roobe\.cache\kagglehub\datasets\andrewmvd\metastatic-tissue-classification-patchcamelyon\versions\9


In [5]:
import os

# 사용자 홈 기준으로 자동 경로 설정
base_path = os.path.expanduser("~/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon")

# 모든 하위 버전 디렉토리 확인
for root, dirs, files in os.walk(base_path):
    print("📁", root)
    for f in files:
        print("   └─", f)


📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon
   └─ 9.complete
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon\versions
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon\versions\9
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon\versions\9\camelyonpatch_level_2_split_train_mask
   └─ camelyonpatch_level_2_split_train_mask.h5
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon\versions\9\Labels
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon\versions\9\Labels\Labels
   └─ camelyonpatch_level_2_split_test_y.h5
   └─ camelyonpatch_level_2_split_train_y.h5
   └─ camelyonpatch_level_2_split_valid_y.h5
📁 C:\Users\roobe/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-pa

In [7]:
import os
import h5py

# Windows 사용자 홈 디렉토리 기준 경로
data_dir = os.path.expanduser("~/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon/versions/9/pcam")
h5_file = os.path.join(data_dir, "training_split.h5")

# HDF5 파일 내부 키 확인
with h5py.File(h5_file, "r") as file:
    print("✅ HDF5 파일 로드 성공!")
    print("📂 포함된 키 목록:", list(file.keys()))

✅ HDF5 파일 로드 성공!
📂 포함된 키 목록: ['x']


In [10]:
import h5py
import torch
from torch.utils.data import Dataset,DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import argparse
import numpy as np
import time
from copy import deepcopy # Add Deepcopy for args

import matplotlib.pyplot as plt

In [48]:
# HDF5 데이터셋 클래스 정의
import os
from torch.utils.data import Dataset

import torchvision.transforms as transforms
from PIL import Image

# Define the transform
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),#pcam 1채널인데 resnet 3채널만 받음-> rgb값 복사
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])  # 3채널 이미지로 변경했기 때문에 정규화 값도 3개 필요
])

class HDF5Dataset(Dataset):
    def __init__(self, image_h5_path, label_h5_path, transform=None):
        self.image_h5_path = image_h5_path
        self.label_h5_path = label_h5_path
        self.transform = transform
        # HDF5 파일 열기
        with h5py.File(self.image_h5_path, "r") as img_file, h5py.File(self.label_h5_path, "r") as lbl_file:
             self.images = img_file["x"][:]  #이미지 데이터를 넘파이 배열로 변환해서 로드
             self.labels = lbl_file["y"][:]  #레이블 데이터를 넘파이 배열로 변환해서 로드

    def __len__(self):
        return len(self.images)

  

    def __getitem__(self, idx):
      image = self.images[idx]
      label = self.labels[idx]

    # NumPy 배열을 PIL 이미지로 변환 (PIL은 HWC 순서를 기대함)
      image = Image.fromarray(image.astype(np.uint8))  # (H, W, C)

      if self.transform:
         image = self.transform(image)  # transform이 (C, H, W) 등으로 바꿔줌
      label = torch.tensor(label).squeeze().long()
      return image, label


# 데이터셋 경로 설정
# 경로 수정
data_dir = os.path.expanduser("~/.cache/kagglehub/datasets/andrewmvd/metastatic-tissue-classification-patchcamelyon/versions/9")
image_dir = os.path.join(data_dir, "pcam")  # ✅ 여기까지만
label_dir = os.path.join(data_dir, "Labels", "Labels")  # ✅ 그대로 유지


# 학습 데이터 로드
train_dataset = HDF5Dataset(
    image_h5_path=os.path.join(image_dir, "training_split.h5"),
    label_h5_path=os.path.join(label_dir, "camelyonpatch_level_2_split_train_y.h5"),  # 수정된 레이블 경로
    transform=transform
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 검증 데이터 로드
val_dataset = HDF5Dataset(
    image_h5_path=os.path.join(image_dir, "validation_split.h5"),
    label_h5_path=os.path.join(label_dir, "camelyonpatch_level_2_split_valid_y.h5"),  # 수정된 레이블 경로
    transform=transform
)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)  #정확도가 낮게 나와서 배치 크기 키우기,학습률과 epoch도 올려서 정확도 높이려고 했어요

# 테스트 데이터 로드
test_dataset = HDF5Dataset(
    image_h5_path=os.path.join(image_dir, "test_split.h5"),
    label_h5_path=os.path.join(label_dir, "camelyonpatch_level_2_split_test_y.h5"),  # 수정된 레이블 경로
    transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 데이터 개수 확인
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 262144
Validation dataset size: 32768
Test dataset size: 32768


In [49]:
import torchvision.models as models

# 사전 학습된 ResNet-18 모델 불러오기
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.eval()  # 추론 모드로 설정
model.fc = nn.Linear(model.fc.in_features, 2) # PCam은 양성/음성 2클래스
model = model.to(device)
learning_rate = 0.01
epochs = 4
# define loss & optimizer
criterion = nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [50]:
# hyperparameters
learning_rate = 0.01
epochs = 10



# define loss & optimizer
criterion = nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [51]:
train_batch = len(train_loader)
val_batch = len(val_loader)

In [53]:
print('Learning started.')

train_loss_list = []
train_acc_list = []

val_loss_list = []
val_acc_list = []
for epoch in range(epochs): # Use 'epochs' instead of 'epoch'
    # training
    train_loss = 0
    correct = 0

    for X, Y in train_loader:
        # image is already size of (1, 28, 28), no reshape
        # label is not one-hot encoded
        X = X.to(device)
        Y = Y.to(device).squeeze()

        optimizer.zero_grad()
        Y_pred = model.forward(X)
        loss = criterion(Y_pred,Y)
        loss.backward()
        optimizer.step()

        train_loss += loss / train_batch

        correct_prediction = torch.argmax(Y_pred, 1) == Y
        correct += correct_prediction.sum()

    train_acc = (100*correct/len(train_loader.dataset))
    train_acc_list.append(train_acc.item())
    train_loss_list.append(train_loss.item())


    # validation                                                                                                                                                                                                                                    
    with torch.no_grad(): #validation 에서는 gardient 업데이트x
        val_loss = 0
        correct = 0

        for X, Y in val_loader:
            X_val = X.to(device)
            Y_val = Y.to(device).squeeze()

            Y_pred = model.forward(X_val)
            loss = criterion(Y_pred, Y_val)

            val_loss += loss / val_batch

            correct_prediction = torch.argmax(Y_pred, 1) == Y_val
            correct += correct_prediction.sum()

        val_acc = (100*correct/len(val_loader.dataset))
        val_acc_list.append(val_acc.item())
        val_loss_list.append(val_loss.item())

    print("[Epoch %2d] loss %.4f  acc %.2f, val loss %.4f  val acc %.2f"
          % (epoch+1, train_loss, train_acc, val_loss, val_acc))

Learning started.
[Epoch  1] loss 0.2204  acc 91.19, val loss 0.4012  val acc 84.44
[Epoch  2] loss 0.1697  acc 93.44, val loss 0.3879  val acc 86.11
[Epoch  3] loss 0.1375  acc 94.83, val loss 0.3926  val acc 85.40
[Epoch  4] loss 0.1146  acc 95.74, val loss 0.3244  val acc 86.79
[Epoch  5] loss 0.0955  acc 96.50, val loss 0.5036  val acc 84.03
[Epoch  6] loss 0.0781  acc 97.17, val loss 0.5284  val acc 83.44
[Epoch  7] loss 0.0655  acc 97.63, val loss 0.5204  val acc 85.36
[Epoch  8] loss 0.0547  acc 98.02, val loss 0.5985  val acc 84.37
[Epoch  9] loss 0.0468  acc 98.35, val loss 0.6998  val acc 84.18
[Epoch 10] loss 0.0401  acc 98.58, val loss 0.6593  val acc 84.37


In [None]:
plt.figure(figsize=(14, 5))
plt.subplot(1,2,1)
plt.plot(range(len(train_loss_list)), train_loss_list, range(len(val_loss_list)), val_loss_list)
plt.legend(['Training', 'Validation'])
plt.title('Loss')
plt.subplot(1,2,2)
plt.plot(range(len(train_acc_list)), train_acc_list, range(len(val_acc_list)),val_acc_list)
plt.legend(['Training', 'Validation'])
plt.title('Accuracy')
plt.show()

In [None]:
with torch.no_grad():
    correct = 0
    # Test the model using test sets
    for X, Y in test_loader:
        X_test = X.to(device)
        Y_test = Y.to(device)

        Y_pred = model.forward(X_test)
        prediction = torch.argmax(Y_pred, 1)
        correct_prediction = prediction == Y_test
        correct += correct_prediction.sum()

    accuracy = 100*correct/len(test_loader.dataset)  
    print('Test set Accuracy: %.2f' % (accuracy))

In [33]:
import numpy as np
labels = np.concatenate([train_dataset.labels, val_dataset.labels])
print(np.unique(labels, return_counts=True))


(array([0, 1], dtype=uint8), array([147471, 147441], dtype=int64))


In [34]:
for name, param in model.named_parameters():
    print(f"{name}: requires_grad = {param.requires_grad}")


conv1.weight: requires_grad = True
bn1.weight: requires_grad = True
bn1.bias: requires_grad = True
layer1.0.conv1.weight: requires_grad = True
layer1.0.bn1.weight: requires_grad = True
layer1.0.bn1.bias: requires_grad = True
layer1.0.conv2.weight: requires_grad = True
layer1.0.bn2.weight: requires_grad = True
layer1.0.bn2.bias: requires_grad = True
layer1.1.conv1.weight: requires_grad = True
layer1.1.bn1.weight: requires_grad = True
layer1.1.bn1.bias: requires_grad = True
layer1.1.conv2.weight: requires_grad = True
layer1.1.bn2.weight: requires_grad = True
layer1.1.bn2.bias: requires_grad = True
layer2.0.conv1.weight: requires_grad = True
layer2.0.bn1.weight: requires_grad = True
layer2.0.bn1.bias: requires_grad = True
layer2.0.conv2.weight: requires_grad = True
layer2.0.bn2.weight: requires_grad = True
layer2.0.bn2.bias: requires_grad = True
layer2.0.downsample.0.weight: requires_grad = True
layer2.0.downsample.1.weight: requires_grad = True
layer2.0.downsample.1.bias: requires_grad =