## Train the VGG16 Model

In [None]:
import tensorflow as tf  # Core deep learning framework
from tensorflow.keras import layers, models  # For building the model
from tensorflow.keras.applications import VGG16  # Pretrained VGG16 model
import numpy as np  # For array operations

from sklearn.utils.class_weight import compute_class_weight  # "balanced" per-class weights

# Load the preprocessed arrays from the working directory.
X_train = np.load("X_train.npy")  # Training images
X_test = np.load("X_test.npy")    # Testing images
y_train = np.load("y_train.npy")  # Training labels
y_test = np.load("y_test.npy")    # Testing labels

# Fail early with a readable message instead of a shape error deep inside fit().
assert X_train.shape[1:] == (224, 224, 3), f"unexpected X_train shape {X_train.shape}"
assert X_test.shape[1:] == (224, 224, 3), f"unexpected X_test shape {X_test.shape}"
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"

# Load VGG16 pretrained on ImageNet.
# weights="imagenet": start from pretrained convolutional filters
# include_top=False: drop the 1000-class ImageNet head; a 3-class head is added below
# input_shape: matches the 224x224 RGB images asserted above
base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

# Fine-tune only the last 10 layers of the base; everything before them stays frozen.
base_model.trainable = True
for layer in base_model.layers[:-10]:
    layer.trainable = False

# Stack the custom classification head on top of the VGG16 feature extractor.
model = models.Sequential([
    base_model,                            # VGG16 base: 7x7x512 feature maps for 224x224 input
    layers.GlobalAveragePooling2D(),       # Average each feature map down to one value (512 total)
    layers.Dense(256, activation="relu"),  # Gesture-specific hidden layer
    layers.Dropout(0.5),                   # Randomly drop 50% of activations during training
    layers.Dense(3, activation="softmax")  # 3 outputs: whats_up, not_good, good
])

# Compute class weights so under-represented gestures are not ignored.
# Key the weights by the actual label values: enumerate() would pair weights with
# positions 0..n-1, which is wrong whenever a label is absent from y_train.
classes = np.unique(y_train)
class_weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

# Compile model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),  # Low rate for gentle fine-tuning
    loss="sparse_categorical_crossentropy",  # Loss for integer labels (0, 1, 2)
    metrics=["accuracy"]  # Track accuracy during training
)

# Show the layer list and the trainable / non-trainable parameter split.
model.summary()



In [None]:
from collections import Counter

# Early stopping: halt when validation loss has not improved for 5 epochs and
# roll the model back to the best epoch. Without restore_best_weights the model
# that is evaluated and saved below would carry the weights of the last
# (non-improving) epoch rather than the best one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

# Train the model.
# NOTE(review): the test set doubles as the validation set here, so early
# stopping is tuned on the same data that is later reported as "test accuracy";
# that number is therefore optimistic. Consider carving a separate validation
# split out of the training data.
history = model.fit(
    X_train, y_train,  # Training data
    epochs=30,  # Upper bound; early stopping may halt sooner
    batch_size=32,  # 32 images per batch
    validation_data=(X_test, y_test),  # Monitored by early stopping
    callbacks=[early_stopping],
    class_weight=class_weight_dict  # Re-weight the loss per class
)

# Evaluate on test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# Check prediction distribution: a model that collapsed onto a single class
# shows up here as one dominant key.
predictions = model.predict(X_test)  # Per-class probabilities, one row per image
pred_labels = np.argmax(predictions, axis=1)  # Predicted class index per image
# .tolist() turns numpy scalars into plain ints so the Counter prints cleanly.
print("Prediction distribution:", Counter(pred_labels.tolist()))

# Save the model in the native Keras (.keras) format.
model.save("gesture_model_vgg16_new.keras")

## Real-Time Prediction

In [None]:
from tensorflow import keras
import cv2  # For webcam and image processing
import mediapipe as mp  # For hand detection
import numpy as np  # For array operations
import tensorflow as tf  # For loading model and prediction

# Real-time gesture prediction: read webcam frames, locate one hand with
# MediaPipe, crop it, classify the crop with the trained model, and overlay
# the predicted label on the preview window until 'q' is pressed.

# MediaPipe helpers.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Load trained model.
model = tf.keras.models.load_model("gesture_model_vgg16_new.keras")
gestures = ["What's up", "Not good", "Good"]  # Index i is the label for model output i

# Acquire the hand detector and the webcam only after the model loaded, so a
# failed load does not leave them open.
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.7)
cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        # Raise instead of calling exit(): in a notebook exit() shuts the kernel down.
        raise RuntimeError("Error: Could not open webcam.")

    print("Webcam opened successfully! Press 'q' to quit.")

    # Loop for real-time prediction.
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        frame = cv2.flip(frame, 1)  # Mirror the feed
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb_frame)

        if results.multi_hand_landmarks:  # At least one hand detected
            h, w, _ = frame.shape
            for hand_landmarks in results.multi_hand_landmarks:
                mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Landmarks are normalized; scale to pixels for the bounding box.
                xs = [lm.x for lm in hand_landmarks.landmark]
                ys = [lm.y for lm in hand_landmarks.landmark]

                # Clamp BOTH corners into the frame. Clamping the max corner only
                # from above would let a negative value through, and a negative
                # slice bound silently selects a large wrong region.
                x_min = min(max(int(min(xs) * w), 0), w)
                y_min = min(max(int(min(ys) * h), 0), h)
                x_max = min(max(int(max(xs) * w), 0), w)
                y_max = min(max(int(max(ys) * h), 0), h)

                # Skip degenerate boxes (hand entirely outside the frame).
                if x_max <= x_min or y_max <= y_min:
                    continue

                # Crop and preprocess hand region.
                # NOTE(review): the crop is taken after the landmarks were drawn
                # on `frame`, so the overlay is part of the model input. Confirm
                # the training images were captured the same way.
                hand_img = frame[y_min:y_max, x_min:x_max]
                hand_img = cv2.resize(hand_img, (224, 224))  # Model input size
                hand_img = cv2.cvtColor(hand_img, cv2.COLOR_BGR2RGB)  # Convert to RGB
                hand_img = hand_img / 255.0  # Scale pixel values to 0-1
                hand_img = np.expand_dims(hand_img, axis=0)  # Add batch dimension

                # verbose=0 keeps Keras from printing a progress bar per frame.
                prediction = model.predict(hand_img, verbose=0)
                gesture_idx = np.argmax(prediction)
                confidence = prediction[0][gesture_idx]
                gesture = gestures[gesture_idx]

                # Display prediction with confidence.
                text = f"{gesture} ({confidence:.2f})"
                cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("Sign Language Prediction", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("closed")
            break
finally:
    # Always release the camera, the preview window and the detector, even if
    # the loop raised; otherwise the webcam stays locked until the kernel restarts.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

Webcam opened successfully! Press 'q' to quit.
