In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

In [2]:
# Root folder of the LeapGestRecog dataset; LeapDataset expects one sub-folder
# per subject directly beneath it.
# Set the LEAP_DATA_DIR environment variable to point at the data on another
# machine. The original local path is kept as the fallback so existing runs
# behave exactly as before.
DATA_DIR = os.environ.get(
    "LEAP_DATA_DIR",
    r"C:\Users\user\desktop\machine-learning-prodigy\MLTask04\leapGestRecog",
)

In [4]:
class LeapDataset(Dataset):
    """Gesture images stored as ``data_dir/<subject>/<gesture>/<image file>``.

    Every sub-directory of a subject folder is treated as one gesture class.
    Gesture names are mapped to integer labels in ``label_map``; new names are
    numbered in sorted order of first appearance.

    Args:
        data_dir: Root directory containing one folder per subject.
        subjects: Iterable of subject folder names to include.
        transform: Optional callable applied to each PIL image in
            ``__getitem__``.
        label_map: Optional existing ``{gesture name: label}`` mapping to
            reuse (for example the training set's map) so that several splits
            share identical class indices. It is copied, never mutated.
            Gestures missing from it are appended with fresh indices.

    Attributes:
        samples: Image file paths, in deterministic (sorted) order.
        labels: Integer label for each entry of ``samples``.
        label_map: Gesture folder name -> integer label.
    """

    # Accepted image suffixes; compared against the lower-cased file name.
    IMAGE_EXTENSIONS = (".png", ".jpg")

    def __init__(self, data_dir, subjects, transform=None, label_map=None):
        self.transform = transform
        self.samples = []
        self.labels = []
        # Copy so a caller-supplied mapping is not modified behind its back.
        self.label_map = dict(label_map) if label_map is not None else {}
        # Continue numbering after any pre-existing labels (0 for an empty map).
        next_label = max(self.label_map.values(), default=-1) + 1

        for subject in subjects:
            subject_path = os.path.join(data_dir, subject)
            for gesture in sorted(os.listdir(subject_path)):
                gesture_path = os.path.join(subject_path, gesture)
                if not os.path.isdir(gesture_path):
                    continue
                if gesture not in self.label_map:
                    self.label_map[gesture] = next_label
                    next_label += 1
                label = self.label_map[gesture]
                # os.listdir order is arbitrary; sort so that sample indices
                # are reproducible across runs and machines.
                for img_file in sorted(os.listdir(gesture_path)):
                    if img_file.lower().endswith(self.IMAGE_EXTENSIONS):
                        self.samples.append(os.path.join(gesture_path, img_file))
                        self.labels.append(label)

    def __len__(self):
        """Return the number of image files collected."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is loaded as single-channel ("L" mode) and passed through
        ``transform`` when one was supplied.
        """
        img_path = self.samples[idx]
        label = self.labels[idx]
        # The context manager releases the underlying file deterministically.
        with Image.open(img_path) as raw:
            img = raw.convert("L")  # 1-channel
        if self.transform:
            img = self.transform(img)
        return img, label

In [5]:
# Input resolution (images are resized to IMG_SIZE x IMG_SIZE) and batch size.
IMG_SIZE = 128
BATCH_SIZE = 64

# Preprocessing applied to every PIL image returned by LeapDataset.
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

# Subjects split: 00-07 train, 08 val, 09 test.
# Splitting by subject keeps every person's images inside a single split.
train_subjects = [f"{i:02d}" for i in range(8)]
val_subjects = ["08"]
test_subjects = ["09"]

train_dataset = LeapDataset(DATA_DIR, train_subjects, transform=transform)
val_dataset = LeapDataset(DATA_DIR, val_subjects, transform=transform)
test_dataset = LeapDataset(DATA_DIR, test_subjects, transform=transform)

# Each dataset derives its own gesture -> index mapping from the folders it
# scans. If a split were missing a gesture folder, its indices would silently
# shift and validation/test labels would no longer mean the same classes as
# the training labels. Fail loudly instead.
assert val_dataset.label_map == train_dataset.label_map, (
    "Validation label map differs from the training label map"
)
assert test_dataset.label_map == train_dataset.label_map, (
    "Test label map differs from the training label map"
)

# Only the training data is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

num_classes = len(train_dataset.label_map)
print(f"Number of classes: {num_classes}")

Number of classes: 10


In [6]:
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ResNet-18 without pretrained weights (weights=None).
model = models.resnet18(weights=None)
# The dataset yields single-channel ("L" mode) images, so the first
# convolution is replaced with one that takes 1 input channel.
model.conv1 = nn.Conv2d(
    in_channels=1,
    out_channels=64,
    kernel_size=7,
    stride=2,
    padding=3,
    bias=False,
)
# Swap the classification head for one with `num_classes` outputs.
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes)
model = model.to(device)

Using device: cpu


In [7]:
# Training objective: cross-entropy over the class logits.
criterion = nn.CrossEntropyLoss()

# AdamW over every model parameter, starting at a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

# Halve the learning rate once the monitored value (the validation loss passed
# to scheduler.step) has not improved for more than `patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)

In [None]:
# Train for EPOCHS epochs, validate after each one, checkpoint the weights with
# the lowest validation loss, and finish with those best weights loaded.
EPOCHS = 10
BEST_MODEL_PATH = "best.pt"
best_val_loss = float('inf')
saved_checkpoint = False  # becomes True once this run has written BEST_MODEL_PATH

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    train_seen = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # `loss` is the mean over this batch; weight it by the batch size so
        # a short final batch does not count as much as a full one.
        batch_n = labels.size(0)
        running_loss += loss.item() * batch_n
        train_seen += batch_n
    train_loss = running_loss / train_seen

    # ---- validation pass (no gradients, eval-mode layers) ----
    val_loss = 0.0
    val_seen = 0
    model.eval()
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            batch_n = labels.size(0)
            val_loss += loss.item() * batch_n
            val_seen += batch_n

    # Per-sample mean validation loss; also drives the LR scheduler.
    val_loss /= val_seen
    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        saved_checkpoint = True
        print("Saved best model!")

# The in-memory model holds the LAST epoch's weights, which need not be the
# best ones. Reload the best checkpoint so later cells use the weights that
# were selected on the validation set.
if saved_checkpoint:
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
    print(f"Restored best model (val loss {best_val_loss:.4f})")

Epoch 1/10 | Train Loss: 0.0850 | Val Loss: 2.2977
Saved best model!
Epoch 2/10 | Train Loss: 0.0119 | Val Loss: 0.6601
Saved best model!
Epoch 3/10 | Train Loss: 0.0089 | Val Loss: 1.8875
Epoch 4/10 | Train Loss: 0.0131 | Val Loss: 1.0717
Epoch 5/10 | Train Loss: 0.0022 | Val Loss: 1.2329
Epoch 6/10 | Train Loss: 0.0001 | Val Loss: 1.4272


In [None]:
def predict_image(img_path):
    """Predict the gesture shown in the image file at ``img_path``.

    The image is loaded as single-channel, run through the notebook's
    ``transform`` and the global ``model`` (switched to eval mode), and the
    arg-max class index is translated back to its gesture folder name using
    ``train_dataset.label_map``.

    Returns:
        The gesture name whose label equals the predicted index. Falls
        through (returning ``None``) if no entry of the label map matches.
    """
    grayscale = Image.open(img_path).convert("L")
    # Add a leading batch dimension of size 1 before moving to the device.
    batch = transform(grayscale).unsqueeze(0)
    batch = batch.to(device)

    model.eval()
    with torch.no_grad():
        logits = model(batch)
        predicted_idx = logits.argmax(dim=1).item()

    # Reverse lookup: label index -> gesture folder name.
    for name, class_idx in train_dataset.label_map.items():
        if class_idx == predicted_idx:
            return name