In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile
import os

ZIP_PATH = "/content/drive/MyDrive/archive.zip"  # <-- CHANGE if needed
EXTRACT_PATH = "/content/asl_dataset"

os.makedirs(EXTRACT_PATH, exist_ok=True)

with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_PATH)

print("Extracted to:", EXTRACT_PATH)
print("Folders:", os.listdir(EXTRACT_PATH))


Extracted to: /content/asl_dataset
Folders: ['asl_alphabet_train', 'asl_alphabet_test']


In [3]:
TRAIN_DIR = f"{EXTRACT_PATH}/asl_alphabet_train"
TEST_DIR = f"{EXTRACT_PATH}/asl_alphabet_test"


In [4]:
import os

print("Train exists?", os.path.exists(TRAIN_DIR))
print("Train folders:", os.listdir(TRAIN_DIR))
print("-" * 50)
print("Test exists?", os.path.exists(TEST_DIR))
print("Test folders:", os.listdir(TEST_DIR))


Train exists? True
Train folders: ['asl_alphabet_train']
--------------------------------------------------
Test exists? True
Test folders: ['asl_alphabet_test']


In [5]:
# Cell 1: Imports (run this first)

import os
import time
import numpy as np
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [6]:
import tensorflow as tf

print("TF version:", tf.__version__)
print("GPUs:", tf.config.list_physical_devices('GPU'))


TF version: 2.19.0
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# Cell 2: Configuration (UPDATED)

import os

TRAIN_DIR = "/content/asl_dataset/asl_alphabet_train/asl_alphabet_train"
print("TRAIN_DIR:", TRAIN_DIR)
print("Exists?", os.path.exists(TRAIN_DIR))

# Peek inside to see what’s there
print("Sample contents:", os.listdir(TRAIN_DIR)[:20])

IMG_HEIGHT = 200
IMG_WIDTH = 200
BATCH_SIZE = 32
EPOCHS = 20

# Our target classes
CLASSES = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["Space", "Del", "Nothing"]

print("Number of classes:", len(CLASSES))
print("Classes:", CLASSES)


TRAIN_DIR: /content/asl_dataset/asl_alphabet_train/asl_alphabet_train
Exists? True
Sample contents: ['J', 'V', 'G', 'H', 'M', 'B', 'L', 'X', 'space', 'U', 'E', 'A', 'S', 'P', 'D', 'O', 'Y', 'del', 'I', 'Z']
Number of classes: 29
Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Space', 'Del', 'Nothing']


In [12]:
# Cell 3: Create train & validation generators

datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    validation_split=0.2,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    shear_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest"
)

train_generator = datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    subset="training",
    shuffle=True,
    classes=CLASSES  # force class order
)

val_generator = datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    subset="validation",
    shuffle=False,
    classes=CLASSES
)


Found 62400 images belonging to 29 classes.
Found 15600 images belonging to 29 classes.


In [13]:
# Cell 4: Define an improved CNN model with BatchNorm & Dropout

num_classes = train_generator.num_classes   # should be 29
print("Number of classes in generator:", num_classes)

from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),

    # Block 1
    layers.Conv2D(32, (3, 3), padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D((2, 2)),

    # Block 2
    layers.Conv2D(64, (3, 3), padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D((2, 2)),

    # Block 3
    layers.Conv2D(128, (3, 3), padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D((2, 2)),

    # Block 4 (light)
    layers.Conv2D(256, (3, 3), padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(512),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.Dropout(0.5),

    layers.Dense(num_classes, activation="softmax")
])

# If mixed precision is enabled, optimizer will handle scaling
opt = keras.optimizers.Adam(learning_rate=1e-3)

model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


Number of classes in generator: 29


In [14]:
# Cell 5: Train the model with callbacks (best model selection)

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

BEST_MODEL_PATH = "asl_best_model.h5"

checkpoint_cb = ModelCheckpoint(
    BEST_MODEL_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1
)

earlystop_cb = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,          # stop if no improvement for 5 epochs
    restore_best_weights=True,
    verbose=1
)

reducelr_cb = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=3,          # after 3 bad epochs → reduce LR
    min_lr=1e-6,
    verbose=1
)

steps_per_epoch = train_generator.samples // BATCH_SIZE
validation_steps = val_generator.samples // BATCH_SIZE

history = model.fit(
    train_generator,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    callbacks=[checkpoint_cb, earlystop_cb, reducelr_cb]
)


Epoch 1/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 329ms/step - accuracy: 0.4294 - loss: 1.9314
Epoch 1: val_accuracy improved from -inf to 0.57713, saving model to asl_best_model.h5




[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m796s[0m 404ms/step - accuracy: 0.4295 - loss: 1.9310 - val_accuracy: 0.5771 - val_loss: 1.4167 - learning_rate: 0.0010
Epoch 2/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.8557 - loss: 0.4264
Epoch 2: val_accuracy did not improve from 0.57713
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m784s[0m 402ms/step - accuracy: 0.8557 - loss: 0.4263 - val_accuracy: 0.5061 - val_loss: 1.9561 - learning_rate: 0.0010
Epoch 3/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 329ms/step - accuracy: 0.9171 - loss: 0.2423
Epoch 3: val_accuracy did not improve from 0.57713
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m788s[0m 404ms/step - accuracy: 0.9171 - loss: 0.2423 - val_accuracy: 0.4587 - val_loss: 2.8449 - learning_rate: 0.0010
Epoch 4/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step - accuracy: 0.947



[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m793s[0m 406ms/step - accuracy: 0.9472 - loss: 0.1551 - val_accuracy: 0.7929 - val_loss: 0.6733 - learning_rate: 0.0010
Epoch 5/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - accuracy: 0.9587 - loss: 0.1194
Epoch 5: val_accuracy did not improve from 0.79286
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 399ms/step - accuracy: 0.9587 - loss: 0.1194 - val_accuracy: 0.7306 - val_loss: 0.8971 - learning_rate: 0.0010
Epoch 6/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step - accuracy: 0.9668 - loss: 0.0997
Epoch 6: val_accuracy did not improve from 0.79286
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m802s[0m 399ms/step - accuracy: 0.9668 - loss: 0.0997 - val_accuracy: 0.7153 - val_loss: 1.0619 - learning_rate: 0.0010
Epoch 7/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step - accuracy: 0.970




Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m773s[0m 396ms/step - accuracy: 0.9706 - loss: 0.0853 - val_accuracy: 0.8079 - val_loss: 0.7962 - learning_rate: 0.0010
Epoch 8/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - accuracy: 0.9860 - loss: 0.0420
Epoch 8: val_accuracy improved from 0.80794 to 0.85511, saving model to asl_best_model.h5




[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m807s[0m 399ms/step - accuracy: 0.9860 - loss: 0.0420 - val_accuracy: 0.8551 - val_loss: 0.6480 - learning_rate: 3.0000e-04
Epoch 9/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.9917 - loss: 0.0260
Epoch 9: val_accuracy did not improve from 0.85511
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m782s[0m 401ms/step - accuracy: 0.9917 - loss: 0.0260 - val_accuracy: 0.8309 - val_loss: 0.7065 - learning_rate: 3.0000e-04
Epoch 10/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - accuracy: 0.9926 - loss: 0.0230
Epoch 10: val_accuracy improved from 0.85511 to 0.86531, saving model to asl_best_model.h5




[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 400ms/step - accuracy: 0.9926 - loss: 0.0230 - val_accuracy: 0.8653 - val_loss: 0.6768 - learning_rate: 3.0000e-04
Epoch 11/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step - accuracy: 0.9942 - loss: 0.0183
Epoch 11: val_accuracy did not improve from 0.86531

Epoch 11: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m785s[0m 402ms/step - accuracy: 0.9942 - loss: 0.0183 - val_accuracy: 0.8592 - val_loss: 0.6525 - learning_rate: 3.0000e-04
Epoch 12/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - accuracy: 0.9948 - loss: 0.0157
Epoch 12: val_accuracy did not improve from 0.86531
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m791s[0m 405ms/step - accuracy: 0.9948 - loss: 0.0157 - va



[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m771s[0m 395ms/step - accuracy: 0.9961 - loss: 0.0118 - val_accuracy: 0.8742 - val_loss: 0.6478 - learning_rate: 9.0000e-05
Epoch 14/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step - accuracy: 0.9963 - loss: 0.0116
Epoch 14: val_accuracy did not improve from 0.87423
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m793s[0m 391ms/step - accuracy: 0.9963 - loss: 0.0116 - val_accuracy: 0.8699 - val_loss: 0.6658 - learning_rate: 9.0000e-05
Epoch 15/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step - accuracy: 0.9970 - loss: 0.0098
Epoch 15: val_accuracy did not improve from 0.87423
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m769s[0m 394ms/step - accuracy: 0.9970 - loss: 0.0098 - val_accuracy: 0.8727 - val_loss: 0.5963 - learning_rate: 9.0000e-05
Epoch 16/20




[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 400ms/step - accuracy: 0.9968 - loss: 0.0094 - val_accuracy: 0.8770 - val_loss: 0.6293 - learning_rate: 9.0000e-05
Epoch 17/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 321ms/step - accuracy: 0.9975 - loss: 0.0085
Epoch 17: val_accuracy did not improve from 0.87699
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m770s[0m 395ms/step - accuracy: 0.9975 - loss: 0.0085 - val_accuracy: 0.8733 - val_loss: 0.6300 - learning_rate: 9.0000e-05
Epoch 18/20
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 330ms/step - accuracy: 0.9975 - loss: 0.0081
Epoch 18: val_accuracy did not improve from 0.87699

Epoch 18: ReduceLROnPlateau reducing learning rate to 2.700000040931627e-05.
[1m1950/1950[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m788s[0m 404ms/step - accuracy: 0.9975 - loss: 0.0081 - va

In [15]:
# Cell 6: Save class indices (model already saved via checkpoint)

CLASS_INDICES_PATH = "asl_class_indices.npy"

np.save(CLASS_INDICES_PATH, train_generator.class_indices)

print("✅ Saved best model to:", BEST_MODEL_PATH)
print("✅ Saved class indices to:", CLASS_INDICES_PATH)
print("Class indices:", train_generator.class_indices)


✅ Saved best model to: asl_best_model.h5
✅ Saved class indices to: asl_class_indices.npy
Class indices: {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, 'Space': 26, 'Del': 27, 'Nothing': 28}


In [16]:
# Cell 7: Load BEST model + class mapping & evaluate

from tensorflow import keras
import numpy as np

BEST_MODEL_PATH = "asl_best_model.h5"
CLASS_INDICES_PATH = "asl_class_indices.npy"

model = keras.models.load_model(BEST_MODEL_PATH)
class_indices = np.load(CLASS_INDICES_PATH, allow_pickle=True).item()

index_to_class = {v: k for k, v in class_indices.items()}

print("✅ Loaded best model from:", BEST_MODEL_PATH)
print("Classes (index_to_class):", index_to_class)

# Quick evaluation on validation set
val_loss, val_acc = model.evaluate(val_generator, verbose=1)
print(f"Validation loss: {val_loss:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")




✅ Loaded best model from: asl_best_model.h5
Classes (index_to_class): {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'O', 15: 'P', 16: 'Q', 17: 'R', 18: 'S', 19: 'T', 20: 'U', 21: 'V', 22: 'W', 23: 'X', 24: 'Y', 25: 'Z', 26: 'Space', 27: 'Del', 28: 'Nothing'}
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 304ms/step - accuracy: 0.9427 - loss: 0.2897
Validation loss: 0.6140
Validation accuracy: 0.8783


In [17]:
# Cell 8: Prediction helper for one frame (ROI from camera)

def preprocess_frame(frame):
    # frame is BGR from OpenCV
    img = cv2.resize(frame, (IMG_WIDTH, IMG_HEIGHT))
    img = img.astype("float32") / 255.0
    img = np.expand_dims(img, axis=0)
    return img

def predict_label(frame):
    """
    frame: ROI (BGR) with hand sign
    returns: label (A-Z, Space, Del, Nothing), confidence
    """
    img = preprocess_frame(frame)
    preds = model.predict(img, verbose=0)[0]
    idx = np.argmax(preds)
    conf = float(preds[idx])
    label = index_to_class[idx]
    return label, conf


In [18]:
# Cell 9: Live ASL detection using webcam

CAPTURE_DURATION = 2.0   # seconds per capture
MIN_CONFIDENCE = 0.5     # adjust if needed

def run_asl_live():
    cap = cv2.VideoCapture(0)  # change to 1 if external cam

    if not cap.isOpened():
        print("Error: Cannot open camera")
        return

    current_word = ""
    sentence = ""
    last_sentence = ""
    capturing = False
    start_time = None
    predictions_window = []
    last_action = ""

    print("Controls:")
    print("  's'  - start capturing next sign (2 seconds)")
    print("  'c'  - clear current word & sentence")
    print("  'q'  - quit")
    print("")
    print("Sign meanings:")
    print("  A-Z     -> letters")
    print("  Space   -> space (end word / add space)")
    print("  Del     -> delete last character")
    print("  Nothing -> ENTER (finalize sentence)")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            break

        h, w, _ = frame.shape

        # Central ROI box
        box_size = int(min(h, w) * 0.5)
        x1 = w // 2 - box_size // 2
        y1 = h // 2 - box_size // 2
        x2 = x1 + box_size
        y2 = y1 + box_size

        roi = frame[y1:y2, x1:x2]

        # Capture logic
        if capturing:
            elapsed = time.time() - start_time
            if elapsed <= CAPTURE_DURATION:
                if roi.size != 0:
                    label, conf = predict_label(roi)
                    if conf >= MIN_CONFIDENCE:
                        predictions_window.append(label)

                cv2.putText(frame, "CAPTURING...", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
            else:
                capturing = False
                final_label = None
                if predictions_window:
                    final_label = max(set(predictions_window),
                                      key=predictions_window.count)
                predictions_window = []

                if final_label is not None:
                    if final_label in list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                        current_word += final_label
                        last_action = f"Letter: {final_label}"
                        print(f"Captured letter: {final_label}")

                    elif final_label == "Space":
                        if current_word:
                            sentence += current_word + " "
                            print("Word added to sentence:", current_word)
                            current_word = ""
                        else:
                            sentence += " "
                        last_action = "Space (word/space added)"

                    elif final_label == "Del":
                        if current_word:
                            current_word = current_word[:-1]
                            last_action = "Deleted last char in word"
                        else:
                            sentence = sentence[:-1]
                            last_action = "Deleted last char in sentence"
                        print("Delete action.")

                    elif final_label == "Nothing":
                        last_sentence = sentence.strip()
                        print("Sentence ENTERED:", last_sentence)
                        current_word = ""
                        sentence = ""
                        last_action = "Entered sentence (Nothing)"

        # Draw ROI box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw text overlays
        cv2.putText(frame, f"Word: {current_word}", (10, h - 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
        cv2.putText(frame, f"Sentence: {sentence}", (10, h - 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)
        cv2.putText(frame, f"Last: {last_action}", (10, h - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)

        if last_sentence:
            cv2.putText(frame, f"Entered: {last_sentence}",
                        (10, 40), cv2.FONT_HERSHEY_SIMPLEX,
                        0.7, (0, 200, 255), 2)

        cv2.imshow("ASL Live", frame)

        key = cv2.waitKey(1) & 0xFF

        if key == ord('q'):
            break

        if key == ord('s') and not capturing:
            capturing = True
            start_time = time.time()
            predictions_window = []
            last_action = "Capturing started"

        if key == ord('c'):
            current_word = ""
            sentence = ""
            last_sentence = ""
            last_action = "Cleared all"
            print("Cleared word/sentence.")

    cap.release()
    cv2.destroyAllWindows()
    print("Final sentence buffer:", sentence)
    print("Last entered sentence:", last_sentence)


In [19]:
# Cell 10: Start live ASL detection

run_asl_live()


Error: Cannot open camera
