In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("s076923/pytorch-transformer")

print("Path to dataset files:", path)

In [1]:
import os
import torch
from PIL import Image
from pycocotools.coco import COCO
from torch.utils.data import Dataset

## 데이터 클래스 생성

In [2]:
class COCODataset(Dataset):
    def __init__(self, root, train, transform=None):
        #초기화
        super().__init__()
        directory = "train" if train else "val"
        annotations = os.path.join(root, "annotations", f"{directory}_annotations.json")
        self.coco = COCO(annotations)
        self.image_path = os.path.join(root, directory)
        self.transform = transform 
        self.categories = self.__get_categories()
        self.data = self.__load_data()
        
           
    def __get_categories(self):
        #coco 데이터셋 기반으로 라벨 지정
        categories = {0 : 'background'}
        for category in self.coco.cats.values():
            categories[category['id']] = category['name']
        return categories
    
    def __load_data(self):
        data = []
        for _id in self.coco.imgs:
            file_name = self.coco.loadImgs(_id)[0]['file_name']
            image_path = os.path.join(self.image_path, file_name)
            image = Image.open(image_path).convert("RGB")

            boxes = []
            labels = []
            anns = self.coco.loadAnns(self.coco.getAnnIds(_id))

            for ann in anns:
                x, y, w, h = ann['bbox']
                boxes.append([x, y, x+w, y+h])
                labels.append(ann['category_id'])
            target = {
                'image_id' : torch.LongTensor([_id]),
                'boxes' : torch.FloatTensor(boxes),
                'labels' : torch.LongTensor(labels)
            }
            data.append([image, target])
        return data
       
    
    def __getitem__(self, index):
        image, target = self.data[index]
        if self.transform:
            image =self.transform(image)
        return image, target
        
    def __len__(self):
        return len(self.data)


In [3]:
from torchvision import transforms
from torch.utils.data import DataLoader

def collator(batch):
    return tuple(zip(*batch))
    #사진 한 장에 여러 라벨이 들어있는 경우에 첫 번째 라벨만 들어갈 수 있도록 한다.

transform = transforms.Compose([
    transforms.PILToTensor(),
    transforms.ConvertImageDtype(dtype = torch.float)
])
root = '/kaggle/input/pytorch-transformer/datasets/coco'
train_dataset = COCODataset(root, train = True, transform = transform)
train_dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = True, drop_last = True,
          collate_fn = collator)

loading annotations into memory...
Done (t=0.09s)
creating index...
index created!


In [4]:
train_dataset = COCODataset(root, train = False, transform = transform)
train_dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = True, drop_last = True,
          collate_fn = collator)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [5]:
from torchvision import models
from torchvision import ops
from torchvision.models.detection import rpn, FasterRCNN

## 백본으로 VGG16 사용

In [6]:
backbone = models.vgg16(weights="VGG16_Weights.IMAGENET1K_V1").features
backbone.out_channels = 512

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:02<00:00, 232MB/s] 


In [7]:
anchor_generator = rpn.AnchorGenerator(
    sizes = ((32, 64, 128, 256, 512),),
    aspect_ratios = ((0.5, 1.0, 2.0))

)

In [8]:
roi_pooler = ops.MultiScaleRoIAlign(
    featmap_names = ["0"],
    output_size=(7,7),
    sampling_ratio = 2
)

In [9]:
import torch
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print('Using PyTorch version:', torch.__version__, ' Device:', DEVICE)

Using PyTorch version: 2.6.0+cu124  Device: cuda


In [10]:
model = FasterRCNN(
    backbone=backbone, 
    num_classes = 3,
    rpn_anchor_generator = anchor_generator,
    box_roi_pool  = roi_pooler
).to(DEVICE)

In [11]:
from torch import optim
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.1)

In [None]:

for epoch in range(5):
    cost = 0.0
    for idx, (images, targets)  in enumerate(train_dataloader):
        #image = image.to(device)
        images = list(image.to(DEVICE) for image in images)
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum([loss for loss in loss_dict.values()])

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        cost += losses 
    lr_scheduler.step()
    cost = cost / len(train_dataloader)
    print(f"Epoch : {epoch + 1:4d}, Cost : {cost:.3f}")


Epoch :    1, Cost : 1.164
Epoch :    2, Cost : 0.708
Epoch :    3, Cost : 0.386
Epoch :    4, Cost : 0.342


In [None]:
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
from torchvision.transforms.functional import to_pil_image


def draw_bbox(ax, box, text, color):
    ax.add_patch(
        plt.Rectangle(
            xy=(box[0], box[1]),
            width=box[2] - box[0],
            height=box[3] - box[1],
            fill=False,
            edgecolor=color,
            linewidth=2,
        )
    )
    ax.annotate(
        text=text,
        xy=(box[0] - 5, box[1] - 5),
        color=color,
        weight="bold",
        fontsize=13,
    )
