In [11]:
# Colab-only helper: opens a file picker in the browser and copies the chosen
# files into the VM's working directory.
from google.colab import files
# `uploaded` holds the return value of the upload call; keep it around so later
# cells can refer to what was just uploaded.
# NOTE(review): if a file with the same name already exists, Colab saves the new
# copy under a different name (see the "Saving ... to ... (1).zip" output below).
uploaded = files.upload()

Saving puzzarm_dataset.zip to puzzarm_dataset (1).zip


In [2]:
import io
import zipfile

ZIP_NAME = "puzzarm_dataset.zip"

# Prefer the bytes returned by files.upload() over the file on disk: when a file
# called ZIP_NAME already exists, Colab stores the new upload under a different
# name ("puzzarm_dataset (1).zip"), so opening ZIP_NAME from disk would silently
# extract the previous, stale archive.
try:
    fresh_zip_bytes = uploaded.get(ZIP_NAME)
except NameError:
    # The upload cell was skipped in this session; fall back to the file on disk.
    fresh_zip_bytes = None

zip_source = io.BytesIO(fresh_zip_bytes) if fresh_zip_bytes is not None else ZIP_NAME

# Unpack into the working directory.
with zipfile.ZipFile(zip_source, "r") as z:
    z.extractall(".")


In [3]:
!pip install torch torchvision opencv-python




In [10]:
import os, glob, json, cv2, torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Folder (relative to the working directory) that holds the image_*.jpg files
# and their matching joints_*.json files.
DATASET_DIR = "puzzarm_dataset"

# Reference joint vector for the "ready_to_grab" pose; targets are expressed as
# offsets from it.
# NOTE(review): six values, presumably one per arm joint in degrees - confirm.
READY_JOINTS = torch.tensor(
    [-59.75, 0.50, -35.00, 63.50, -43.00, 12.50],
    dtype=torch.float32,
)

class GripDataset(Dataset):
    """Paired camera images and joint readings loaded from a single folder.

    The folder is expected to contain ``image_<id>.jpg`` files, each with a
    sibling ``joints_<id>.json`` whose ``"joints"`` entry is a list with the
    same length as ``READY_JOINTS``.

    Each item is a tuple ``(img, joints, deltas)``:
        img:    image resized to 224x224, converted to a float tensor and
                normalised with the mean/std given below.
        joints: float32 tensor read from the JSON file.
        deltas: ``joints - READY_JOINTS``.

    NOTE(review): ``deltas`` is ``joints`` shifted by a constant, so a model
    that receives ``joints`` as an input can reproduce ``deltas`` without
    looking at the image - confirm this is the intended learning target.
    """

    def __init__(self, dir_path):
        """Index every ``image_*.jpg`` in ``dir_path`` (sorted by path)."""
        self.img_paths = sorted(
            glob.glob(os.path.join(dir_path, "image_*.jpg"))
        )
        # Resize + normalise; the mean/std are the values commonly used with
        # ImageNet-pretrained torchvision models.
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    def __len__(self):
        """Number of indexed images."""
        return len(self.img_paths)

    @staticmethod
    def _joints_path_for(img_path):
        """Map ``<dir>/image_<id>.jpg`` to ``<dir>/joints_<id>.json``.

        Only the file name is rewritten, so directory components that happen
        to contain ``image_`` or ``.jpg`` are left untouched.
        """
        folder, file_name = os.path.split(img_path)
        stem, _ext = os.path.splitext(file_name)
        if stem.startswith("image_"):
            stem = "joints_" + stem[len("image_"):]
        return os.path.join(folder, stem + ".json")

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Raises:
            FileNotFoundError: the image could not be read by OpenCV.
            ValueError: the JSON joint vector does not match ``READY_JOINTS``.
        """
        img_path = self.img_paths[idx]

        # 1) read image; cv2.imread signals failure by returning None rather
        #    than raising, so check explicitly to get a useful error message.
        img_bgr = cv2.imread(img_path)
        if img_bgr is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        img = self.transform(img_rgb)

        # 2) read joints from the matching JSON file
        json_path = self._joints_path_for(img_path)
        with open(json_path, "r") as f:
            data = json.load(f)

        joints = torch.tensor(data["joints"], dtype=torch.float32)
        if joints.shape != READY_JOINTS.shape:
            # Without this check a wrong-length list could broadcast silently.
            raise ValueError(
                f"{json_path}: expected joints of shape "
                f"{tuple(READY_JOINTS.shape)}, got {tuple(joints.shape)}"
            )

        # 3) deltas = how far this pose is from ready_to_grab
        deltas = joints - READY_JOINTS

        return img, joints, deltas

dataset = GripDataset(DATASET_DIR)

# Fail fast with a clear message: an empty dataset would otherwise only surface
# later as an unrelated error when the DataLoader / training loop runs.
if len(dataset) == 0:
    raise FileNotFoundError(
        f"No image_*.jpg files found in {DATASET_DIR!r}; "
        "check that the dataset archive was extracted."
    )

print("Number of samples:", len(dataset))


Number of samples: 223


In [12]:
import torch.nn as nn
import torch.optim as optim
from torchvision import models

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

class GripNet(nn.Module):
    """ResNet-18 image features fused with a joint vector to regress 6 values.

    The ImageNet-pretrained ResNet-18 has its classification layer replaced by
    an identity, so it emits its pooled feature vector. That vector is
    concatenated with the 6 joint inputs and passed through a two-layer MLP
    that outputs 6 numbers (trained elsewhere against joint deltas).

    NOTE(review): the training target used with this model is
    ``joints - READY_JOINTS``; since ``joints`` is also an input here, the head
    can fit the target from the joint inputs alone - confirm this is intended.
    """

    def __init__(self):
        super().__init__()

        # Pretrained feature extractor with its final classifier stripped off.
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        feature_dim = resnet.fc.in_features
        resnet.fc = nn.Identity()
        self.backbone = resnet

        # Regression head: (image features + 6 joints) -> 128 -> 6 outputs.
        head_layers = [
            nn.Linear(feature_dim + 6, 128),
            nn.ReLU(),
            nn.Linear(128, 6),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, img, joints):
        """Return the head's output for a batch of images and joint vectors.

        ``img`` and ``joints`` must share the batch dimension; they are joined
        along dim 1 after the image has been encoded by the backbone.
        """
        image_features = self.backbone(img)
        fused = torch.cat((image_features, joints), dim=1)
        return self.fc(fused)

# Build the network and move its parameters to the selected device.
model = GripNet().to(device)

# Mean-squared-error regression loss between predicted and true deltas.
criterion = nn.MSELoss(reduction="mean")

# Adam over every parameter (backbone included, nothing is frozen).
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Mini-batches of 8 samples, reshuffled every epoch.
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)


Using device: cpu


In [6]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()  # enable training-mode behaviour (e.g. batch-norm updates)
    running_loss = 0.0

    for batch in dataloader:
        # Move the whole (imgs, joints, deltas) triple to the compute device.
        imgs, joints, deltas = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step, then forward pass.
        optimizer.zero_grad()
        preds = model(imgs, joints)
        loss = criterion(preds, deltas)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so a
        # smaller final batch contributes proportionally to the epoch average.
        batch_size = imgs.shape[0]
        running_loss += loss.item() * batch_size

    # Per-sample average loss over the full dataset for this epoch.
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")


Epoch 1/20, Loss: 250.6187
Epoch 2/20, Loss: 112.4406
Epoch 3/20, Loss: 81.7612
Epoch 4/20, Loss: 61.1693
Epoch 5/20, Loss: 44.5919
Epoch 6/20, Loss: 30.8345
Epoch 7/20, Loss: 22.6474
Epoch 8/20, Loss: 18.0549
Epoch 9/20, Loss: 15.2935
Epoch 10/20, Loss: 11.9719
Epoch 11/20, Loss: 12.2136
Epoch 12/20, Loss: 10.4057
Epoch 13/20, Loss: 7.7959
Epoch 14/20, Loss: 7.8120
Epoch 15/20, Loss: 7.6823
Epoch 16/20, Loss: 8.8682
Epoch 17/20, Loss: 6.3839
Epoch 18/20, Loss: 6.2233
Epoch 19/20, Loss: 6.8242
Epoch 20/20, Loss: 5.5168


In [7]:
# Persist only the model's state_dict (not the module object itself); loading
# it later requires constructing a GripNet first.
torch.save(model.state_dict(), "grip_model.pth")
# Colab-only: hand the saved checkpoint to the browser as a file download.
from google.colab import files
files.download("grip_model.pth")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# Switch to inference behaviour before predicting.
model.eval()

# Spot-check on the first dataset item.
# NOTE(review): this sample comes from the same dataset the model was trained
# on, so it is a sanity check, not a measure of generalisation.
img, joints_true, deltas_true = dataset[0]

# The model expects batched inputs, so add a leading batch dimension of 1.
img_batch = img.unsqueeze(0).to(device)
joints_batch = joints_true.unsqueeze(0).to(device)

with torch.no_grad():  # no gradients needed for inference
    pred_deltas = model(img_batch, joints_batch)

print("True deltas:", deltas_true)
print("Predicted deltas:", pred_deltas.cpu().squeeze())


True deltas: tensor([ 2.7500, -0.5000,  0.0000,  0.5000, -4.0000, -0.5000])
Predicted deltas: tensor([-0.1500, -1.3344, -1.3843,  0.9432, -2.6950, -0.5861])


In [9]:
# Drop the batch dimension and bring the prediction back to the CPU so it can
# be combined with the CPU-resident reference pose.
pred_deltas_cpu = torch.squeeze(pred_deltas.cpu())

# Absolute joint estimate = reference pose + predicted offset.
pred_joints = READY_JOINTS + pred_deltas_cpu

# Print reference, ground truth and prediction with aligned labels.
for label, value in (
    ("READY_JOINTS:", READY_JOINTS),
    ("True joints: ", joints_true),
    ("Pred joints: ", pred_joints),
):
    print(label, value)


READY_JOINTS: tensor([-59.7500,   0.5000, -35.0000,  63.5000, -43.0000,  12.5000])
True joints:  tensor([-57.,   0., -35.,  64., -47.,  12.])
Pred joints:  tensor([-59.9000,  -0.8344, -36.3843,  64.4432, -45.6950,  11.9139])
