In [1]:
import glob
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.optimize import minimize

In [3]:
# Custom Dataset
class HandGestureDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        
        # Convert to PyTorch expected format (C, H, W)
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        
        if self.transform:
            image = torch.from_numpy(image)
            image = self.transform(image)
        else:
            image = torch.from_numpy(image)
            
        return image, label

# Data augmentation
train_transform = transforms.Compose([
    transforms.RandomRotation(15),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
])

In [4]:
# Model Builder
class HandGestureModel(nn.Module):
    def __init__(self, base_model_name, num_classes=14):
        super(HandGestureModel, self).__init__()
        
        # Load pre-trained model
        if base_model_name == 'mobilenet_v2':
            self.base_model = models.mobilenet_v2(pretrained=True)
            num_ftrs = self.base_model.classifier[1].in_features
            self.base_model.classifier = nn.Identity()
        elif base_model_name == 'vgg19':
            self.base_model = models.vgg19(pretrained=True)
            num_ftrs = self.base_model.classifier[0].in_features
            self.base_model.classifier = nn.Identity()
        else:
            raise ValueError(f"Unsupported model: {base_model_name}")
        
        # Freeze base model layers
        for param in self.base_model.parameters():
            param.requires_grad = False
            
        # Custom classifier
        self.classifier = nn.Sequential(
            nn.BatchNorm1d(num_ftrs),
            nn.Dropout(0.3),
            nn.Linear(num_ftrs, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, x):
        x = self.base_model(x)
        # Apply global average pooling if needed
        if len(x.shape) > 2:
            x = torch.mean(x, dim=[2, 3])
        x = self.classifier(x)
        return x

In [5]:
# Training function
def train_model(model, dataloaders, criterion, optimizer, num_epochs=50, patience=5):
    best_model_wts = model.state_dict()
    best_acc = 0.0
    no_improve_epochs = 0
    
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)
        
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
                
            running_loss = 0.0
            running_corrects = 0
            
            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                # Zero the parameter gradients
                optimizer.zero_grad()
                
                # Forward
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    # Backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                # Statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            
            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
            
            # Deep copy the model if best validation accuracy
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict().copy()
                no_improve_epochs = 0
            elif phase == 'val':
                no_improve_epochs += 1
        
        # Early stopping
        if no_improve_epochs >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break
                
        print()
    
    print(f'Best val Acc: {best_acc:.4f}')
    
    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [6]:
# Function to get predictions
def get_predictions(model, dataloader):
    model.eval()
    all_preds = []
    
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            probs = nn.Softmax(dim=1)(outputs)
            all_preds.append(probs.cpu().numpy())
            
    return np.vstack(all_preds)

In [7]:
# Main execution
if __name__ == "__main__":
    # Load all image paths
    all_paths = glob.glob('/kaggle/input/hg14-handgesture14-dataset/HG14/HG14-Hand Gesture/*/*.jpg')
    
    # Prepare dataset
    data, labels = [], []
    for path in tqdm(all_paths):
        img = cv2.imread(path)
        img = cv2.resize(img, (128, 128))
        label = path.split('/')[-2]
        data.append(img)
        labels.append(label)
    
    data = np.array(data) / 255.0
    labels_factorized = pd.factorize(labels)[0]
    
    # Split data
    X_temp, X_test, y_temp, y_test = train_test_split(
        data, labels_factorized, test_size=0.10, stratify=labels_factorized, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.20, stratify=y_temp, random_state=42
    )
    
    # Create datasets
    train_dataset = HandGestureDataset(X_train, y_train, transform=train_transform)
    val_dataset = HandGestureDataset(X_val, y_val)
    test_dataset = HandGestureDataset(X_test, y_test)
    
    # Create dataloaders
    batch_size = 20
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    dataloaders = {
        'train': train_loader,
        'val': val_loader
    }
    
    # Settings
    models_dict = {
        'mobilenet_v2': 'mobilenet_v2',
        'vgg19': 'vgg19'
    }
    
    trained_models = {}
    val_preds = {}
    test_preds = {}
    
    # Train models
    for name, model_type in models_dict.items():
        print(f"\nTraining {name}")
        model = HandGestureModel(model_type).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())
        
        # Train and validate
        model = train_model(model, dataloaders, criterion, optimizer)

        
        # Save trained model
        trained_models[name] = model
        
        #saving
        torch.save(model.state_dict(), f"{name}_hand_gesture_model.pth")
        
        # Get predictions
        val_preds[name] = get_predictions(model, val_loader)
        test_preds[name] = get_predictions(model, test_loader)
    
    # Ensemble with Dirichlet optimization
    val_stack = np.stack([val_preds[name] for name in models_dict.keys()], axis=-1)
    
    def dirichlet_loss(weights):
        ensemble_pred = np.tensordot(val_stack, weights, axes=([3], [0]))
        y_val_onehot = np.zeros((len(y_val), 14))
        y_val_onehot[np.arange(len(y_val)), y_val] = 1
        return -np.mean(np.sum(y_val_onehot * np.log(ensemble_pred + 1e-8), axis=1))
    
    init_weights = np.ones(len(models_dict)) / len(models_dict)
    bounds = [(0, 1)] * len(models_dict)
    constraints = [{'type': 'eq', 'fun': lambda w: 1 - sum(w)}]
    
    res = minimize(dirichlet_loss, init_weights, bounds=bounds, constraints=constraints)
    final_weights = res.x
    print("Optimized Weights:", final_weights)
    np.save('models/ensemble_weights.npy', final_weights)
    
    # Final Test Prediction
    test_stack = np.stack([test_preds[name] for name in models_dict.keys()], axis=-1)
    ensemble_test_pred = np.tensordot(test_stack, final_weights, axes=([3], [0]))
    ensemble_test_labels = np.argmax(ensemble_test_pred, axis=1)
    
    # Extract ground truth labels from test dataset
    test_labels = []
    for _, label in test_loader:
        test_labels.extend(label.numpy())
    test_labels = np.array(test_labels)
    
    print("Ensemble Accuracy on Test Set:", accuracy_score(test_labels, ensemble_test_labels))

100%|██████████| 14000/14000 [01:48<00:00, 129.56it/s]
  labels_factorized = pd.factorize(labels)[0]
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth



Training mobilenet_v2


100%|██████████| 13.6M/13.6M [00:00<00:00, 113MB/s] 


Epoch 1/50
----------
train Loss: 1.4239 Acc: 0.5115
val Loss: 1.1408 Acc: 0.6087

Epoch 2/50
----------
train Loss: 1.1665 Acc: 0.5847
val Loss: 1.0882 Acc: 0.6115

Epoch 3/50
----------
train Loss: 1.1208 Acc: 0.6003
val Loss: 0.9427 Acc: 0.6690

Epoch 4/50
----------
train Loss: 1.0616 Acc: 0.6265
val Loss: 0.9882 Acc: 0.6579

Epoch 5/50
----------
train Loss: 1.0782 Acc: 0.6182
val Loss: 0.9313 Acc: 0.6778

Epoch 6/50
----------
train Loss: 1.0330 Acc: 0.6323
val Loss: 0.9532 Acc: 0.6714

Epoch 7/50
----------
train Loss: 1.0218 Acc: 0.6379
val Loss: 0.8478 Acc: 0.7040

Epoch 8/50
----------
train Loss: 1.0289 Acc: 0.6332
val Loss: 0.9056 Acc: 0.6921

Epoch 9/50
----------
train Loss: 0.9988 Acc: 0.6434
val Loss: 0.9117 Acc: 0.6762

Epoch 10/50
----------
train Loss: 0.9900 Acc: 0.6532
val Loss: 0.8963 Acc: 0.6786

Epoch 11/50
----------
train Loss: 0.9759 Acc: 0.6510
val Loss: 0.8635 Acc: 0.6988

Epoch 12/50
----------
train Loss: 0.9661 Acc: 0.6546
val Loss: 0.9184 Acc: 0.6718
Ea

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:02<00:00, 209MB/s] 


Epoch 1/50
----------
train Loss: 1.0003 Acc: 0.6591
val Loss: 0.5931 Acc: 0.8242

Epoch 2/50
----------
train Loss: 0.7070 Acc: 0.7520
val Loss: 0.5467 Acc: 0.8353

Epoch 3/50
----------
train Loss: 0.6492 Acc: 0.7673
val Loss: 0.5349 Acc: 0.8389

Epoch 4/50
----------
train Loss: 0.5988 Acc: 0.7880
val Loss: 0.5309 Acc: 0.8516

Epoch 5/50
----------
train Loss: 0.5786 Acc: 0.7936
val Loss: 0.4891 Acc: 0.8587

Epoch 6/50
----------
train Loss: 0.5697 Acc: 0.7933
val Loss: 0.4679 Acc: 0.8659

Epoch 7/50
----------
train Loss: 0.5323 Acc: 0.8084
val Loss: 0.4411 Acc: 0.8778

Epoch 8/50
----------
train Loss: 0.5241 Acc: 0.8132
val Loss: 0.5092 Acc: 0.8659

Epoch 9/50
----------
train Loss: 0.5053 Acc: 0.8183
val Loss: 0.4399 Acc: 0.8813

Epoch 10/50
----------
train Loss: 0.4924 Acc: 0.8261
val Loss: 0.5030 Acc: 0.8679

Epoch 11/50
----------
train Loss: 0.4859 Acc: 0.8289
val Loss: 0.5375 Acc: 0.8619

Epoch 12/50
----------
train Loss: 0.4547 Acc: 0.8347
val Loss: 0.4327 Acc: 0.8786

E

IndexError: tuple index out of range

In [None]:
model = HandGestureModel(model_type)
model.load_state_dict(torch.load("mobilenet_v2_hand_gesture_model.pth"))
model.to(device)
model.eval()