In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from pycocotools.coco import COCO
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch.nn as nn
import torch.optim as optim
from torchvision.models.segmentation import deeplabv3_resnet50

class CocoMaskedDataset(Dataset):
    """Binary-segmentation dataset backed by a COCO-format annotation file.

    Every item is an ``(image, mask)`` pair of tensors. The mask is the
    element-wise maximum of the per-annotation masks produced by
    ``COCO.annToMask`` for that image, so all annotated objects are merged
    into one foreground layer regardless of category.

    Args:
        images_path: Directory that contains the files named by the
            ``file_name`` field of the annotation file.
        annotations_path: Path to the COCO annotation JSON.
        transform: Optional callable invoked as
            ``transform(image=..., mask=...)`` and returning a mapping with
            ``"image"`` and ``"mask"`` entries (the Albumentations calling
            convention). It may return NumPy arrays or tensors.
    """

    def __init__(self, images_path, annotations_path, transform=None):
        self.images_path = images_path
        self.coco = COCO(annotations_path)
        self.img_ids = self.coco.getImgIds()
        self.transform = transform

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.img_ids)

    def _load_image(self, img_path):
        """Read ``img_path`` and return it as an ``(H, W, 3)`` RGB array."""
        # The context manager releases the file handle deterministically.
        with Image.open(img_path) as pil_img:
            return np.array(pil_img.convert("RGB"))

    def _build_mask(self, img_id, shape):
        """Merge every annotation mask of ``img_id`` into one uint8 array of ``shape``."""
        merged = np.zeros(shape, dtype=np.uint8)
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        for ann in self.coco.loadAnns(ann_ids):
            np.maximum(merged, self.coco.annToMask(ann), out=merged)
        return merged

    @staticmethod
    def _image_to_tensor(img):
        """Convert an ``(H, W, C)`` array to a ``(C, H, W)`` tensor.

        uint8 input is rescaled to float32 in ``[0, 1]``; any other dtype is
        assumed to be already normalized and is passed through unchanged.
        """
        chw = torch.from_numpy(np.ascontiguousarray(img.transpose(2, 0, 1)))
        if chw.dtype == torch.uint8:
            chw = chw.float().div(255.0)
        return chw

    def __getitem__(self, idx):
        """Return ``(image, mask)`` for position ``idx``.

        Returns:
            A tuple ``(image, mask)``. ``mask`` is always an int64 tensor.
            ``image`` is whatever tensor the transform produced; if the
            image is still a NumPy array after the optional transform it is
            converted with ``_image_to_tensor``.
        """
        img_id = self.img_ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.images_path, img_info['file_name'])

        img = self._load_image(img_path)
        mask = self._build_mask(img_id, img.shape[:2])

        # Explicit None check: an empty pipeline object can be falsy and would
        # otherwise be skipped silently.
        if self.transform is not None:
            augmented = self.transform(image=img, mask=mask)
            img = augmented["image"]
            mask = augmented["mask"]

        # Without a to-tensor step in the transform both values are still
        # NumPy arrays; ndarray has no .long(), so convert here instead of
        # failing with AttributeError.
        if isinstance(img, np.ndarray):
            img = self._image_to_tensor(img)
        return img, torch.as_tensor(mask).long()

# --- Data configuration ------------------------------------------------------
IMAGE_SIZE = 256                      # square side length fed to the network
NORM_MEAN = (0.485, 0.456, 0.406)     # per-channel mean/std commonly used with
NORM_STD = (0.229, 0.224, 0.225)      # ImageNet-pretrained torchvision backbones
BATCH_SIZE = 4
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42                       # fixes the train/val partition across kernel restarts

# Training pipeline: resize, random geometric/colour augmentation, normalize,
# then convert to tensors.
transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.2),
    A.RandomRotate90(p=0.5),
    A.ColorJitter(p=0.3),
    A.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ToTensorV2(),
])

# Validation pipeline: the same deterministic preprocessing without any random
# augmentation, so the validation loss is comparable between epochs.
eval_transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ToTensorV2(),
])

# Dataset locations can be overridden through the environment; the defaults
# are the original local paths.
images_path = os.environ.get("COCO_IMAGES_DIR", r"D:\val2017\val2017")
annotations_path = os.environ.get(
    "COCO_ANNOTATIONS_FILE",
    r"D:\annotations_trainval2017\annotations\instances_val2017.json",
)

# Two views over the same files that differ only in their transform.
dataset = CocoMaskedDataset(images_path, annotations_path, transform=transform)
eval_dataset = CocoMaskedDataset(images_path, annotations_path, transform=eval_transform)

train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size

# One seeded permutation is shared by both views, so the two subsets are
# disjoint and reproducible.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_indices = torch.randperm(len(dataset), generator=split_generator).tolist()
train_dataset = torch.utils.data.Subset(dataset, shuffled_indices[:train_size])
val_dataset = torch.utils.data.Subset(eval_dataset, shuffled_indices[train_size:])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Start from the pretrained DeepLabV3/ResNet-50 weights and swap entry 4 of
# the classifier head for a fresh 1x1 convolution with two output channels
# (one logit map per class consumed by CrossEntropyLoss below).
model = deeplabv3_resnet50(pretrained=True)
model.classifier[4] = nn.Conv2d(in_channels=256, out_channels=2, kernel_size=1)
model = model.to(device)

# Per-pixel classification loss and an Adam optimizer over every parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

def train_model(model, train_loader, val_loader, num_epochs=5,
                loss_fn=None, opt=None, target_device=None):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Args:
        model: Module whose forward pass returns a mapping with an ``"out"``
            entry holding the logits passed to the loss.
        train_loader: Iterable of ``(images, masks)`` batches used for
            optimization.
        val_loader: Iterable of ``(images, masks)`` batches evaluated under
            ``torch.no_grad()`` at the end of every epoch.
        num_epochs: Number of passes over ``train_loader``.
        loss_fn: Loss callable ``loss_fn(outputs, masks)``. Defaults to the
            notebook-level ``criterion``.
        opt: Optimizer stepping the model parameters. Defaults to the
            notebook-level ``optimizer``.
        target_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        The same ``model`` object, left in eval mode after the last epoch.
    """
    # Fall back to the notebook globals only when no explicit object is given,
    # so existing four-argument calls behave exactly as before.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    target_device = device if target_device is None else target_device

    def _mean(total, count):
        # An empty loader yields NaN instead of a ZeroDivisionError, so a
        # missing validation split cannot abort a finished training epoch.
        return total / count if count else float("nan")

    for epoch in range(num_epochs):
        # ---- optimization pass ----
        model.train()
        total_loss, train_batches = 0.0, 0
        for images, masks in train_loader:
            images, masks = images.to(target_device), masks.to(target_device)

            opt.zero_grad()
            outputs = model(images)["out"]
            loss = loss_fn(outputs, masks)
            loss.backward()
            opt.step()

            total_loss += loss.item()
            # Count batches directly so loaders without __len__ also work.
            train_batches += 1
        avg_loss = _mean(total_loss, train_batches)

        # ---- evaluation pass (no gradients, eval-mode layers) ----
        model.eval()
        val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for images, masks in val_loader:
                images, masks = images.to(target_device), masks.to(target_device)
                outputs = model(images)["out"]
                val_loss += loss_fn(outputs, masks).item()
                val_batches += 1
        avg_val_loss = _mean(val_loss, val_batches)

        print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    return model

# Run the full five-epoch training loop, then persist only the learned
# parameters (the state dict), not the whole module object.
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=5,
)
torch.save(
    trained_model.state_dict(),
    "deeplabv3_hand_segmentation.pth",
)
print("✅ Model saved as deeplabv3_hand_segmentation.pth")

loading annotations into memory...
Done (t=0.49s)
creating index...
index created!


Epoch 1/5 - Training: 100%|██████████████████████████████████████████████████████| 1000/1000 [2:36:13<00:00,  9.37s/it]
Epoch 1/5 - Validation: 100%|████████████████████████████████████████████████████████| 250/250 [12:51<00:00,  3.09s/it]


Epoch [1/5] Train Loss: 0.4150 | Val Loss: 0.3339 | Val IoU: 0.6932


Epoch 2/5 - Training: 100%|██████████████████████████████████████████████████████| 1000/1000 [2:33:32<00:00,  9.21s/it]
Epoch 2/5 - Validation: 100%|████████████████████████████████████████████████████████| 250/250 [11:17<00:00,  2.71s/it]


Epoch [2/5] Train Loss: 0.3817 | Val Loss: 0.3297 | Val IoU: 0.6960


Epoch 3/5 - Training: 100%|██████████████████████████████████████████████████████| 1000/1000 [2:29:55<00:00,  9.00s/it]
Epoch 3/5 - Validation: 100%|████████████████████████████████████████████████████████| 250/250 [09:22<00:00,  2.25s/it]


Epoch [3/5] Train Loss: 0.3652 | Val Loss: 0.3319 | Val IoU: 0.6873


Epoch 4/5 - Training: 100%|██████████████████████████████████████████████████████| 1000/1000 [2:10:20<00:00,  7.82s/it]
Epoch 4/5 - Validation: 100%|████████████████████████████████████████████████████████| 250/250 [11:13<00:00,  2.69s/it]


Epoch [4/5] Train Loss: 0.3559 | Val Loss: 0.3313 | Val IoU: 0.6897


Epoch 5/5 - Training: 100%|██████████████████████████████████████████████████████| 1000/1000 [2:12:16<00:00,  7.94s/it]
Epoch 5/5 - Validation: 100%|████████████████████████████████████████████████████████| 250/250 [09:13<00:00,  2.21s/it]


Epoch [5/5] Train Loss: 0.3508 | Val Loss: 0.3340 | Val IoU: 0.7012
✅ Model saved as deeplabv3_hand_segmentation.pth


In [7]:
# Sanity check on the saved checkpoint: confirm that it exists and report its
# size on disk in mebibytes (bytes / 1024 / 1024).
print(os.path.exists("deeplabv3_hand_segmentation.pth"))
print(os.stat("deeplabv3_hand_segmentation.pth").st_size / (1024 * 1024), "MB")

True
160.55620861053467 MB


In [8]:
# Load the checkpoint onto the CPU so it can be inspected on machines that do
# not have the device it was saved from (e.g. a CUDA checkpoint on a CPU-only
# host); load_state_dict copies tensors to the model's own device later.
# NOTE: torch.load unpickles the file - only open checkpoints you trust.
state_dict = torch.load("deeplabv3_hand_segmentation.pth", map_location="cpu")
print("Keys in checkpoint:", list(state_dict)[:5])  # print first 5 keys

Keys in checkpoint: ['backbone.conv1.weight', 'backbone.bn1.weight', 'backbone.bn1.bias', 'backbone.bn1.running_mean', 'backbone.bn1.running_var']
