In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Define dataset path and labels
DATASET_PATH = r"C:\Users\hp\OneDrive\Desktop\FYP 2021\video_alphabets"
LABELS = ["Alifmad", "Aray", "Jeem"]
num_classes = len(LABELS)

# Video Processing Parameters
IMG_SIZE = 224  # MobileNetV2 input size
SEQUENCE_LENGTH = 30  # Number of frames per video



In [5]:
# Load Pre-trained MobileNetV2 Model (Feature Extractor)
base_model = MobileNetV2(weights="imagenet", include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
feature_extractor = Model(inputs=base_model.input, outputs=tf.keras.layers.GlobalAveragePooling2D()(base_model.output))

# Function to Extract Frames from Video
def extract_frames(video_path, max_frames=SEQUENCE_LENGTH):
    cap = cv2.VideoCapture(video_path)
    frames = []
    
    while len(frames) < max_frames:
        success, frame = cap.read()
        if not success:
            break
        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))  # Resize for MobileNetV2
        frame = frame / 255.0  # Normalize
        frames.append(frame)

    cap.release()

    # Pad with empty frames if video is too short
    while len(frames) < max_frames:
        frames.append(np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.float32))
    
    return np.array(frames)

# Function to Extract Features from Frames using MobileNetV2
def extract_features_from_frames(frames):
    features = [feature_extractor.predict(np.expand_dims(frame, axis=0), verbose=0).squeeze() for frame in frames]
    return np.array(features)

# Load Video Dataset
video_data = []
video_labels = []

for label in LABELS:
    folder_path = os.path.join(DATASET_PATH, label)
    for video in os.listdir(folder_path):
        video_path = os.path.join(folder_path, video)
        frames = extract_frames(video_path)  # Extract frames
        features = extract_features_from_frames(frames)  # Convert frames to features
        video_data.append(features)
        video_labels.append(LABELS.index(label))  # Convert label to index

video_data = np.array(video_data)  # Shape: (num_samples, SEQUENCE_LENGTH, 1280)
video_labels = to_categorical(np.array(video_labels), num_classes=num_classes)  # Shape: (num_samples, num_classes)



In [6]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(video_data, video_labels, test_size=0.2, random_state=42)

# Ensure Shapes are Correct Before Training
print("X_train shape:", X_train.shape)  # (num_samples, 30, 1280)
print("y_train shape:", y_train.shape)  # (num_samples, 3)

# Build LSTM Model for Video Classification
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(SEQUENCE_LENGTH, 1280)),
    Dropout(0.3),
    LSTM(64, return_sequences=False),
    Dense(32, activation="relu"),
    Dense(num_classes, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

# Train the Model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=16)

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


X_train shape: (244, 30, 1280)
y_train shape: (244, 3)


  super().__init__(**kwargs)


Epoch 1/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 69ms/step - accuracy: 0.3864 - loss: 1.1089 - val_accuracy: 0.5000 - val_loss: 0.9856
Epoch 2/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.5499 - loss: 0.9409 - val_accuracy: 0.6129 - val_loss: 0.8178
Epoch 3/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.6595 - loss: 0.6949 - val_accuracy: 0.6290 - val_loss: 0.8236
Epoch 4/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.7527 - loss: 0.5561 - val_accuracy: 0.6129 - val_loss: 0.7500
Epoch 5/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.7441 - loss: 0.5294 - val_accuracy: 0.6290 - val_loss: 0.8003
Epoch 6/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.8280 - loss: 0.4084 - val_accuracy: 0.6452 - val_loss: 0.8774
Epoch 7/20
[1m16/16[0m [32m━━━━

In [7]:
# Save the trained model
model.save("sign_language_cnn_lstm.h5")  # Saves as an HDF5 file
print("Model saved successfully!")




Model saved successfully!


In [17]:
# Load Trained Model
MODEL_PATH = r"C:\Users\hp\OneDrive\Desktop\VScode\Machine_Learning\sign_language_cnn_lstm.h5"  # Update with actual path
model = load_model(MODEL_PATH)



In [18]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import load_model



# Labels
LABELS = ["Alifmad", "Aray", "Jeem"]

# Video Processing Parameters
IMG_SIZE = 224
SEQUENCE_LENGTH = 30

# Load Pre-trained MobileNetV2 Feature Extractor
base_model = MobileNetV2(weights="imagenet", include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
feature_extractor = tf.keras.Model(inputs=base_model.input, outputs=tf.keras.layers.GlobalAveragePooling2D()(base_model.output))

# Function to Extract Frames from Video
def extract_frames(video_path, max_frames=SEQUENCE_LENGTH):
    cap = cv2.VideoCapture(video_path)
    frames = []

    while len(frames) < max_frames:
        success, frame = cap.read()
        if not success:
            break
        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        frame = frame / 255.0  # Normalize
        frames.append(frame)

    cap.release()

    # Pad if the video has fewer frames
    while len(frames) < max_frames:
        frames.append(np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.float32))

    return np.array(frames)

# Function to Extract Features from Frames
def extract_features_from_frames(frames):
    features = [feature_extractor.predict(np.expand_dims(frame, axis=0), verbose=0).squeeze() for frame in frames]
    return np.array(features)  # Shape: (30, 1280)

# Function to Predict the Class of a Video
def predict_video(video_path):
    frames = extract_frames(video_path)
    features = extract_features_from_frames(frames)
    features = np.expand_dims(features, axis=0)  # Shape: (1, 30, 1280)

    # Model Prediction
    prediction = model.predict(features)
    predicted_class = np.argmax(prediction)
    confidence = prediction[0][predicted_class]

    print(f"Predicted Class: {LABELS[predicted_class]} (Confidence: {confidence:.2f})")

# Test on a Video
video_path = r"C:\Users\hp\OneDrive\Desktop\FYP 2021\video_alphabets\Aray\s0315.mp4"  # Update with actual test video path
predict_video(video_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472ms/step
Predicted Class: Aray (Confidence: 1.00)
