ASL - Deep Learning

CMPE-258

Team - 16

Samrudh Sivva - 017520659
Sai Prasad Shivanatri - 017507191
Nithin Aleti - 017401930

In [7]:
import os
import shutil
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from timm import create_model
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# -----------------------------------
# 1. Dataset Splitter
# -----------------------------------
def create_train_val_split(dataset_path, train_dir="train", val_dir="val", val_ratio=0.2):
    # Locate the actual nested path with class folders
    nested_train_path = os.path.join(dataset_path, "asl_alphabet_train", "asl_alphabet_train")
    classes = sorted(os.listdir(nested_train_path))

    train_path = os.path.join(dataset_path, train_dir)
    val_path = os.path.join(dataset_path, val_dir)

    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)

    for cls in classes:
        cls_path = os.path.join(nested_train_path, cls)
        if os.path.isdir(cls_path):
            # Get all images for this class
            images = [os.path.join(cls_path, img) for img in os.listdir(cls_path) if img.endswith((".jpg", ".png", ".jpeg"))]
            if len(images) == 0:
                print(f"Skipping empty class folder: {cls}")
                continue

            # Split into train and validation
            train_images, val_images = train_test_split(images, test_size=val_ratio, random_state=42)

            # Create class directories in train and val
            os.makedirs(os.path.join(train_path, cls), exist_ok=True)
            os.makedirs(os.path.join(val_path, cls), exist_ok=True)

            # Move files
            for img in train_images:
                shutil.copy(img, os.path.join(train_path, cls))
            for img in val_images:
                shutil.copy(img, os.path.join(val_path, cls))

    print(f"Train and validation splits created at: {train_path}, {val_path}")

# Call the function to split dataset
dataset_root = "/Users/samrudhsivva/.cache/kagglehub/datasets/grassknoted/asl-alphabet/versions/1"
create_train_val_split(dataset_root)

# -----------------------------------
# 2. Dataset Class
# -----------------------------------
class ASLDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        self.labels = []
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # Traverse the dataset directory
        for cls_name in self.classes:
            cls_path = os.path.join(root_dir, cls_name)
            if os.path.isdir(cls_path):
                for img_file in os.listdir(cls_path):
                    self.data.append(os.path.join(cls_path, img_file))
                    self.labels.append(self.class_to_idx[cls_name])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = self.data[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

# -----------------------------------
# 3. Transformations and DataLoaders
# -----------------------------------
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_path = os.path.join(dataset_root, "train")
val_path = os.path.join(dataset_root, "val")

train_dataset = ASLDataset(root_dir=train_path, transform=transform)
val_dataset = ASLDataset(root_dir=val_path, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# -----------------------------------
# 4. Model Definition
# -----------------------------------
class ASLClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ASLClassifier, self).__init__()
        self.base_model = create_model("efficientnet_b0", pretrained=True)
        self.base_model.classifier = nn.Linear(self.base_model.num_features, num_classes)

    def forward(self, x):
        return self.base_model(x)

# Initialize the model
num_classes = len(train_dataset.classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ASLClassifier(num_classes=num_classes).to(device)

# -----------------------------------
# 5. Training Loop
# -----------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()
            total += labels.size(0)

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, preds = outputs.max(1)
                val_correct += preds.eq(labels).sum().item()
                val_total += labels.size(0)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}, Train Accuracy: {100.0 * correct / total:.2f}%")
        print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {100.0 * val_correct / val_total:.2f}%")

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=1)

# -----------------------------------
# 6. Save Model
# -----------------------------------
torch.save(model.state_dict(), "asl_classifier.pth")
print("Model saved to asl_classifier.pth")

Train and validation splits created at: /Users/samrudhsivva/.cache/kagglehub/datasets/grassknoted/asl-alphabet/versions/1/train, /Users/samrudhsivva/.cache/kagglehub/datasets/grassknoted/asl-alphabet/versions/1/val


  scaler = GradScaler(enabled=torch.cuda.is_available())
  with autocast():
Epoch 1/1 [Train]: 100%|██████████████████| 2175/2175 [1:30:23<00:00,  2.49s/it]

Epoch 1/1
Train Loss: 0.1240, Train Accuracy: 96.95%
Model saved to asl_classifier.pth





In [9]:
# -----------------------------------
# 7. Testing Code
# -----------------------------------
def test_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Testing"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item()
            total += labels.size(0)

    accuracy = 100.0 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

# Create a test DataLoader
test_path = os.path.join(dataset_root, "val")  # Using validation set as a test set for simplicity
test_dataset = ASLDataset(root_dir=test_path, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Run the test
test_model(model, test_loader, device)


Testing: 100%|██████████████████████████████| 1088/1088 [09:52<00:00,  1.84it/s]

Test Accuracy: 99.95%



