# SignSpeak: Sign Language Recognition Model Training

This notebook covers the training pipeline for the sign language recognition model used in the SignSpeak application.

In [None]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

# Make modules under ../src importable from this notebook
sys.path.append(os.path.abspath('../src'))

# Seed both PyTorch and NumPy so repeated runs draw the same random numbers
torch.manual_seed(42)
np.random.seed(42)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Data Preparation

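In this section, we'll load and prepare the data for training. A real run assumes the keypoint features have already been extracted with the `data_exploration.ipynb` notebook; the cell below generates random placeholder data instead, so the rest of the pipeline can be run end to end.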

In [None]:
# Define a custom PyTorch Dataset for sign language data
class SignLanguageDataset(Dataset):
    """Map-style dataset pairing keypoint feature sequences with sign labels.

    Indexing returns a ``(feature, label)`` tuple of tensors: the feature is
    converted to ``torch.float32`` and the label to ``torch.long``.
    """

    def __init__(self, features, labels):
        """
        Args:
            features: Indexable, sized collection of processed keypoint
                features, one entry per sample.
            labels: Indexable, sized collection of sign labels, aligned
                index-for-index with ``features``.

        Raises:
            ValueError: If ``features`` and ``labels`` differ in length.
        """
        # __len__ is derived from `features` alone, so a mismatch would
        # otherwise only surface as an IndexError mid-epoch (or silently
        # ignore the surplus labels). Fail fast instead.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(float32 feature, long label)`` pair."""
        # torch.tensor copies, so the returned tensors never alias the stored data
        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label

# Function to load sample data (in a real implementation, this would load from files)
def load_sample_data(num_samples=100, num_frames=30, num_features=189, num_classes=5):
    """
    Build a random placeholder dataset for demonstration purposes.

    Values are drawn from NumPy's global random state, so seeding it with
    ``np.random.seed`` beforehand makes the output reproducible.

    Args:
        num_samples: Number of sign samples
        num_frames: Number of frames per sign
        num_features: Number of features per frame
        num_classes: Number of sign classes

    Returns:
        X: float32 standard-normal features, shape (num_samples, num_frames, num_features)
        y: int64 labels drawn uniformly from [0, num_classes), shape (num_samples,)
    """
    feature_shape = (num_samples, num_frames, num_features)

    # Features are drawn before labels; keep this order so a given seed
    # always yields the same dataset
    X = np.asarray(np.random.randn(*feature_shape), dtype=np.float32)
    raw_labels = np.random.randint(low=0, high=num_classes, size=num_samples)
    y = raw_labels.astype(np.int64)

    print(f"Generated {num_samples} samples with shape {X.shape}")
    return X, y

# Load sample data
# (randomly generated placeholder features and labels from load_sample_data)
X, y = load_sample_data()

# Split data into train, validation, and test sets
# First hold out 20% of all samples for testing, then take 20% of the
# remaining 80% for validation: roughly 64% train / 16% validation / 20% test.
# The fixed random_state makes both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Report the resulting array shapes as a quick sanity check
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

## Model Architecture

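We'll define a recurrent neural network (RNN) for sign language recognition: a bidirectional LSTM whose per-frame outputs are pooled by a learned attention layer and then classified by two fully connected layers.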

In [None]:
class SignRecognitionModel(nn.Module):
    """Bidirectional LSTM classifier with attention pooling over time steps."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_rate=0.2):
        """
        Args:
            input_size: Feature size (number of keypoint features per frame)
            hidden_size: Size of each LSTM direction's hidden state
            num_layers: Number of stacked LSTM layers
            num_classes: Number of sign classes (size of the output logits)
            dropout_rate: Dropout probability, used between LSTM layers and
                before the final linear layer
        """
        super().__init__()

        # Forward and backward LSTM states are concatenated, so every layer
        # consuming the LSTM output sees twice the hidden size
        bidirectional_size = hidden_size * 2
        # Inter-layer LSTM dropout is only requested when there is more than one layer
        lstm_dropout = dropout_rate if num_layers > 1 else 0

        # Recurrent encoder
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=True,
        )

        # One scalar attention score per time step
        self.attention = nn.Linear(bidirectional_size, 1)

        # Classification head
        self.fc1 = nn.Linear(bidirectional_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """
        Args:
            x: Input of shape (batch_size, seq_length, input_size)

        Returns:
            Unnormalised class scores of shape (batch_size, num_classes)
        """
        # (batch_size, seq_length, hidden_size * 2)
        sequence_out, _ = self.lstm(x)

        # Softmax over the time axis -> weights of shape (batch_size, seq_length, 1)
        scores = self.attention(sequence_out)
        weights = scores.softmax(dim=1)

        # Weighted sum over time -> (batch_size, hidden_size * 2)
        pooled = (weights * sequence_out).sum(dim=1)

        # fc1 -> ReLU -> dropout -> fc2
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(self.dropout(hidden))

# Model hyperparameters; feature width and class count are read from the data
hidden_size, num_layers = 128, 2
input_size = X.shape[2]  # axis 2 of X: number of features per frame
# Count of distinct label values present in y.
# NOTE(review): this matches the largest label + 1 only when every class id
# from 0 upwards actually occurs in y — confirm for real datasets.
num_classes = np.unique(y).size

# Build the network and move its parameters to the selected device
model = SignRecognitionModel(input_size, hidden_size, num_layers, num_classes).to(device)

# Show the layer structure
print(model)

## Training Loop

Let's define the training process for our sign recognition model.

In [None]:
# Create data loaders
train_dataset = SignLanguageDataset(X_train, y_train)
val_dataset = SignLanguageDataset(X_val, y_val)
test_dataset = SignLanguageDataset(X_test, y_test)

batch_size = 16

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate once the monitored value (passed to scheduler.step)
# has stopped decreasing for more than `patience` epochs.
# The `verbose` argument is intentionally not passed: it is deprecated in
# recent PyTorch releases and no longer accepted by the newest ones. To log
# learning-rate changes, print scheduler.get_last_lr() after scheduler.step().
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Define training function
def train_epoch(model, dataloader, criterion, optimizer, device):
    """
    Run one optimisation pass over ``dataloader`` and report averaged metrics.

    Args:
        model: Network to train; switched to training mode
        dataloader: Iterable yielding ``(inputs, labels)`` batches
        criterion: Loss callable applied to ``(outputs, labels)``
        optimizer: Optimizer updating the model's parameters
        device: Device the batches are moved to before the forward pass

    Returns:
        (epoch_loss, epoch_acc): sample-weighted mean loss and the fraction
        of correct predictions over the whole pass

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples
    """
    model.train()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for batch_inputs, batch_labels in tqdm(dataloader, desc="Training"):
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)

        # Clear gradients left over from the previous step, then do
        # forward -> loss -> backward -> parameter update
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch figure is a
        # per-sample average (assumes the criterion returns a batch mean)
        loss_sum += batch_loss.item() * batch_inputs.size(0)
        num_seen += batch_labels.size(0)

        # Predicted class = index of the largest score along dim 1
        predicted = torch.max(logits.detach(), dim=1).indices
        num_correct += (predicted == batch_labels).sum().item()

    return loss_sum / num_seen, num_correct / num_seen

# Define validation function
def validate(model, dataloader, criterion, device):
    """
    Measure loss and accuracy over ``dataloader`` without updating the model.

    Args:
        model: Network to evaluate; switched to evaluation mode
        dataloader: Iterable yielding ``(inputs, labels)`` batches
        criterion: Loss callable applied to ``(outputs, labels)``
        device: Device the batches are moved to before the forward pass

    Returns:
        (epoch_loss, epoch_acc): sample-weighted mean loss and the fraction
        of correct predictions over the whole pass

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples
    """
    model.eval()
    loss_sum, num_correct, num_seen = 0.0, 0, 0

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader, desc="Validation"):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels).item()

            # Weight by batch size so the result is a per-sample average
            # (assumes the criterion returns a batch mean)
            loss_sum += batch_loss * batch_inputs.size(0)
            num_seen += batch_labels.size(0)

            # Predicted class = index of the largest score along dim 1
            predicted = torch.max(logits, dim=1).indices
            num_correct += (predicted == batch_labels).sum().item()

    return loss_sum / num_seen, num_correct / num_seen

# Training schedule
num_epochs = 30
early_stopping_patience = 10  # epochs without improvement tolerated before stopping

# Early-stopping state, updated by the training loop
no_improve_epochs = 0
best_val_loss = float('inf')

# Per-epoch metric histories (four independent lists)
train_losses, train_accs = [], []
val_losses, val_accs = [], []

# Uncomment to train the model (will take time)
# The loop is disabled by wrapping it in a triple-quoted string; remove the two
# ''' lines to run it. Per epoch it trains, validates, steps the LR scheduler on
# the validation loss, saves the weights when validation loss improves, and stops
# after `early_stopping_patience` epochs without improvement.
# NOTE(review): the loop saves a bare state_dict with torch.save(model.state_dict(), ...),
# while save_model() later in this file writes a dict with 'model_state_dict' and
# 'model_info' keys to the same path — the two file layouts differ.
'''
# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Train
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    
    # Validate
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    
    # Update learning rate
    scheduler.step(val_loss)
    
    # Print metrics
    print(f"Training Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), '../src/models/sign_recognition_model.pth')
        print("Model saved!")
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
    
    # Early stopping
    if no_improve_epochs >= early_stopping_patience:
        print(f"No improvement for {early_stopping_patience} epochs. Stopping training.")
        break
    
    print("---")
'''

## Model Evaluation

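Let's define helpers for evaluating the model on the test set and for visualizing the training history. Because the training loop above is disabled, the plot below uses mock metrics rather than real results.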

In [None]:
# Define evaluation function
def evaluate_model(model, dataloader, device):
    """
    Collect the model's predicted classes and the true labels over a dataloader.

    Args:
        model: Network to evaluate; switched to evaluation mode
        dataloader: Iterable yielding ``(inputs, labels)`` tensor batches
        device: Device the inputs are moved to before the forward pass

    Returns:
        (predictions, labels): NumPy arrays concatenated over all batches in
        iteration order. Both are empty int64 arrays when the dataloader
        yields no batches.
    """
    model.eval()
    prediction_batches = []
    label_batches = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs.to(device))

            # Predicted class = index of the largest score along dim 1
            prediction_batches.append(outputs.argmax(dim=1).cpu())
            # Labels are never moved to `device` here, but bring them to the
            # CPU anyway so loaders that yield device tensors also work
            label_batches.append(labels.detach().cpu())

    if not prediction_batches:
        # np.array([]) would default to float64; keep an integer dtype so
        # callers get class indices regardless of input size
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    # One concatenation per output instead of growing a Python list of scalars
    return torch.cat(prediction_batches).numpy(), torch.cat(label_batches).numpy()

# Visualize training history
def plot_training_history(train_losses, val_losses, train_accs, val_accs):
    """
    Draw loss and accuracy curves side by side and display the figure.

    Args:
        train_losses: Per-epoch training loss values
        val_losses: Per-epoch validation loss values
        train_accs: Per-epoch training accuracy values
        val_accs: Per-epoch validation accuracy values
    """
    # One entry per panel: (y-axis label, title, training curve, validation curve),
    # where each curve is a (values, legend label) pair
    panels = (
        (
            'Loss',
            'Training and Validation Loss',
            (train_losses, 'Training Loss'),
            (val_losses, 'Validation Loss'),
        ),
        (
            'Accuracy',
            'Training and Validation Accuracy',
            (train_accs, 'Training Accuracy'),
            (val_accs, 'Validation Accuracy'),
        ),
    )

    plt.figure(figsize=(12, 5))

    # Panels are laid out left to right in a 1x2 grid
    for position, (y_label, title, train_curve, val_curve) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for values, legend_label in (train_curve, val_curve):
            plt.plot(values, label=legend_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Since we haven't actually trained the model, let's create some mock data for visualization
# These are hand-written illustrative values (ten entries each, one per mock epoch),
# not measurements from a real training run.
mock_train_losses = [0.8, 0.7, 0.6, 0.5, 0.45, 0.42, 0.4, 0.38, 0.36, 0.35]
mock_val_losses = [0.85, 0.75, 0.65, 0.58, 0.54, 0.52, 0.5, 0.49, 0.48, 0.48]
mock_train_accs = [0.6, 0.7, 0.75, 0.78, 0.8, 0.82, 0.84, 0.85, 0.86, 0.87]
mock_val_accs = [0.55, 0.65, 0.7, 0.72, 0.74, 0.75, 0.76, 0.77, 0.77, 0.78]

# Plot mock training history
plot_training_history(mock_train_losses, mock_val_losses, mock_train_accs, mock_val_accs)

## Model Deployment

Finally, let's look at how to deploy the trained model in the SignSpeak application.

In [None]:
# Save model weights together with the metadata needed to rebuild the model
def save_model(model, save_path, model_info=None):
    """
    Save model and info for deployment.

    The file is a dict with two keys: 'model_state_dict' (the model's
    ``state_dict()``) and 'model_info' (the metadata passed in, or None).

    Args:
        model: Trained PyTorch model
        save_path: Path to save the model; may be a bare filename, in which
            case the file is written to the current working directory
        model_info: Dictionary with model metadata
    """
    # Create the parent directory if it doesn't exist. A bare filename has an
    # empty dirname, and os.makedirs('') raises FileNotFoundError, so skip it.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Save model state
    torch.save({
        'model_state_dict': model.state_dict(),
        'model_info': model_info
    }, save_path)

    print(f"Model saved to {save_path}")

# Example model info
# Metadata stored next to the weights; load_model() reads the four architecture
# entries (input_size, hidden_size, num_layers, num_classes) to rebuild
# SignRecognitionModel before loading the state dict.
model_info = {
    'input_size': input_size,
    'hidden_size': hidden_size,
    'num_layers': num_layers,
    'num_classes': num_classes,
    'classes': list(range(num_classes)),  # In a real scenario, these would be actual sign names
    'preprocessing': {
        # NOTE(review): no normalization step is visible in this notebook —
        # confirm it happens during feature extraction before relying on this flag.
        'normalize': True,
        # Axis 1 of X: number of frames per sample
        'frame_length': X.shape[1]
    }
}

# Save model (commented out since we haven't actually trained it)
# save_model(model, '../src/models/sign_recognition_model.pth', model_info)

# Code to load the model for deployment
def load_model(model_path, device):
    """
    Load model for deployment.

    Two file layouts are accepted:
      * a dict with 'model_state_dict' and (optionally) 'model_info' keys,
        as written by save_model();
      * a bare state_dict, as written by torch.save(model.state_dict(), path).
    Missing architecture metadata falls back to the notebook defaults
    (input_size=189, hidden_size=128, num_layers=2, num_classes=5).

    Args:
        model_path: Path to the saved model
        device: Device to load the model on (CPU or GPU)

    Returns:
        Loaded model (in evaluation mode) and model info dict, or
        (None, None) when the file does not exist

    Raises:
        RuntimeError: From load_state_dict when the stored weights do not
            match the rebuilt architecture
    """
    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        return None, None

    # Load checkpoint
    # NOTE: torch.load relies on pickle — only load files from trusted sources
    checkpoint = torch.load(model_path, map_location=device)

    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
        # save_model() stores model_info=None by default; treat that as "no metadata"
        model_info = checkpoint.get('model_info') or {}
    else:
        # Bare state_dict: the file itself is the parameter mapping
        state_dict = checkpoint
        model_info = {}

    # Create model with same architecture
    model = SignRecognitionModel(
        input_size=model_info.get('input_size', 189),
        hidden_size=model_info.get('hidden_size', 128),
        num_layers=model_info.get('num_layers', 2),
        num_classes=model_info.get('num_classes', 5)
    ).to(device)

    # Load weights
    model.load_state_dict(state_dict)
    model.eval()  # Set to evaluation mode

    return model, model_info

# Usage hint for application code.
# NOTE(review): the example path is 'src/models/...', whereas this notebook uses
# '../src/models/...' — presumably written relative to the application root; confirm.
print("To deploy the model in the SignSpeak application, use the load_model function.")
print("Example: model, model_info = load_model('src/models/sign_recognition_model.pth', device)")

## Conclusion

In this notebook, we've covered the entire pipeline for training a sign language recognition model:

1. Data preparation and loading
2. Model architecture design
3. Training process
4. Model evaluation and visualization
5. Model deployment

Next steps include gathering real sign language data, fine-tuning the model, and integrating it into the SignSpeak application.