In [2]:
import opendatasets as od
import pandas as pd
import json
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Download the processed WLASL dataset from Kaggle.
od.download("https://www.kaggle.com/datasets/risangbaskoro/wlasl-processed")

In [3]:
json_file_path = "./wlasl-processed/WLASL_v0.3.json"
videos_folder_path = "./wlasl-processed/videos"

# Load the JSON annotation index. The encoding is pinned to UTF-8 so parsing
# does not depend on the platform's default text codec.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
# Flatten the annotation index into one record per video instance, pairing each
# gloss with the path where its clip is expected on disk.
rows = [
    {
        "gloss": entry["gloss"],
        "video_id": instance["video_id"],
        "video_file_path": os.path.join(videos_folder_path, f"{instance['video_id']}.mp4"),
    }
    for entry in data
    for instance in entry["instances"]
]

In [5]:
# One DataFrame row per record collected in `rows`.
df = pd.DataFrame(rows)

In [6]:
# Preview the first rows of the annotation table.
df.head()

Unnamed: 0,gloss,video_id,video_file_path
0,book,69241,./wlasl-processed/videos\69241.mp4
1,book,65225,./wlasl-processed/videos\65225.mp4
2,book,68011,./wlasl-processed/videos\68011.mp4
3,book,68208,./wlasl-processed/videos\68208.mp4
4,book,68012,./wlasl-processed/videos\68012.mp4


In [7]:
# Flag each row by whether its clip is actually present on disk.
df["video_exists"] = df["video_file_path"].map(os.path.exists)

In [8]:
# List every annotated clip whose file was not found.
missing_videos_df = df.loc[~df["video_exists"]]
print("Missing videos:")
print(missing_videos_df.loc[:, ["gloss", "video_id", "video_file_path"]])

Missing videos:
            gloss video_id                     video_file_path
1            book    65225  ./wlasl-processed/videos\65225.mp4
2            book    68011  ./wlasl-processed/videos\68011.mp4
3            book    68208  ./wlasl-processed/videos\68208.mp4
4            book    68012  ./wlasl-processed/videos\68012.mp4
5            book    70212  ./wlasl-processed/videos\70212.mp4
...           ...      ...                                 ...
21074  wheelchair    63049  ./wlasl-processed/videos\63049.mp4
21076     whistle    63185  ./wlasl-processed/videos\63185.mp4
21077     whistle    67065  ./wlasl-processed/videos\67065.mp4
21079     whistle    63187  ./wlasl-processed/videos\63187.mp4
21081     whistle    63189  ./wlasl-processed/videos\63189.mp4

[9103 rows x 3 columns]


In [9]:
# Keep only rows backed by a file on disk; the helper flag is dropped and the
# index renumbered from zero.
df_filtered = (
    df.loc[df["video_exists"]]
    .drop(columns=["video_exists"])
    .reset_index(drop=True)
)

In [10]:
# Show the filtered table (rows whose video file exists).
print(df_filtered)

            gloss video_id                     video_file_path
0            book    69241  ./wlasl-processed/videos\69241.mp4
1            book    07069  ./wlasl-processed/videos\07069.mp4
2            book    07068  ./wlasl-processed/videos\07068.mp4
3            book    07070  ./wlasl-processed/videos\07070.mp4
4            book    07099  ./wlasl-processed/videos\07099.mp4
...           ...      ...                                 ...
11975  wheelchair    63047  ./wlasl-processed/videos\63047.mp4
11976  wheelchair    63050  ./wlasl-processed/videos\63050.mp4
11977     whistle    63186  ./wlasl-processed/videos\63186.mp4
11978     whistle    63188  ./wlasl-processed/videos\63188.mp4
11979     whistle    63190  ./wlasl-processed/videos\63190.mp4

[11980 rows x 3 columns]


In [11]:
# Column dtypes before the gloss labels are encoded.
print(df_filtered.dtypes)

gloss              object
video_id           object
video_file_path    object
dtype: object


In [12]:
# Create a StringLookup layer for 'gloss' and fit its vocabulary on the
# glosses present in df_filtered
gloss_lookup = tf.keras.layers.StringLookup(output_mode="int")
gloss_lookup.adapt(df_filtered['gloss'].values)

# Transform 'gloss' to integer labels
# NOTE(review): with the layer's default settings the ids presumably start at 1
# (0 reserved for out-of-vocabulary tokens) — confirm before sizing a
# classifier head from these labels.
df_filtered['gloss_encoded'] = gloss_lookup(df_filtered['gloss']).numpy()

In [13]:
# Column dtypes after adding gloss_encoded.
print(df_filtered.dtypes)

gloss              object
video_id           object
video_file_path    object
gloss_encoded       int64
dtype: object


In [14]:
# Show the table including the encoded gloss column.
print(df_filtered)

            gloss video_id                     video_file_path  gloss_encoded
0            book    69241  ./wlasl-processed/videos\69241.mp4           1030
1            book    07069  ./wlasl-processed/videos\07069.mp4           1030
2            book    07068  ./wlasl-processed/videos\07068.mp4           1030
3            book    07070  ./wlasl-processed/videos\07070.mp4           1030
4            book    07099  ./wlasl-processed/videos\07099.mp4           1030
...           ...      ...                                 ...            ...
11975  wheelchair    63047  ./wlasl-processed/videos\63047.mp4           1579
11976  wheelchair    63050  ./wlasl-processed/videos\63050.mp4           1579
11977     whistle    63186  ./wlasl-processed/videos\63186.mp4           1912
11978     whistle    63188  ./wlasl-processed/videos\63188.mp4           1912
11979     whistle    63190  ./wlasl-processed/videos\63190.mp4           1912

[11980 rows x 4 columns]


In [14]:
def load_video_frames(video_path, frame_count=30, frame_size=(224, 224)):
    """Read up to ``frame_count`` leading frames of a video, resized and scaled.

    Args:
        video_path: Location of the video handed to ``cv2.VideoCapture``.
        frame_count: Upper bound on the number of frames collected.
        frame_size: Target size passed straight to ``cv2.resize``.

    Returns:
        The collected frames stacked into one array and divided by 255.0. No
        padding is applied, so shorter videos give fewer frames.
    """
    capture = cv2.VideoCapture(video_path)
    collected = []
    while len(collected) < frame_count:
        ok, image = capture.read()
        if not ok:
            # End of stream (or unreadable file): keep what we have.
            break
        collected.append(cv2.resize(image, frame_size))
    capture.release()
    return np.array(collected) / 255.0  # Normalize to [0, 1]

In [None]:
def load_video_frames(video_path, frame_count=30, frame_size=(224, 224)):
    """Load the first ``frame_count`` frames of a video as a fixed-size clip.

    Args:
        video_path: Path to the video file; anything that is not an existing
            ``str`` path is reported and yields an all-zero clip.
        frame_count: Number of frames in the returned clip.
        frame_size: ``(width, height)`` pair (tuple or list) given to
            ``cv2.resize``.

    Returns:
        Array of shape ``(frame_count, height, width, 3)`` scaled to [0, 1].
        Slots beyond the frames that could be decoded stay zero (black).
    """
    # cv2.resize takes (width, height) but produces arrays laid out as
    # (height, width, 3); allocate the clip in that layout so the placeholder
    # and the padding always match real frames, also for non-square sizes.
    width, height = frame_size
    clip = np.zeros((frame_count, height, width, 3))

    # Check if the video path is valid
    if not isinstance(video_path, str) or not os.path.exists(video_path):
        print(f"Invalid video path: {video_path}")
        return clip  # Return empty frames if path is invalid

    video_capture = cv2.VideoCapture(video_path)
    try:
        filled = 0
        while filled < frame_count:
            success, frame = video_capture.read()
            if not success:
                break
            clip[filled] = cv2.resize(frame, (width, height))
            filled += 1
    finally:
        # Release the decoder even if resizing fails part-way through.
        video_capture.release()

    return clip / 255.0  # Normalize to [0, 1]

In [15]:
def load_video_frames(video_path, frame_count=30, frame_size=(224, 224)):
    """Decode the leading frames of a video whose path arrives as a tensor.

    Args:
        video_path: Object exposing ``.numpy()`` that returns the UTF-8 encoded
            path as bytes (as handed over by ``tf.py_function``).
        frame_count: Number of frames in the returned clip.
        frame_size: ``(width, height)`` pair given to ``cv2.resize``.

    Returns:
        ``float32`` array of shape ``(frame_count, height, width, 3)`` scaled
        to [0, 1]. A missing file is reported and gives an all-zero clip;
        videos shorter than ``frame_count`` are padded with black frames.
    """
    # Convert the video path from bytes to string
    video_path = video_path.numpy().decode('utf-8')

    # cv2.resize takes (width, height) but produces (height, width, 3) arrays;
    # allocate in that layout so padding matches decoded frames for any size.
    width, height = frame_size
    clip = np.zeros((frame_count, height, width, 3), dtype=np.float32)

    if not os.path.exists(video_path):
        print(f"Invalid video path: {video_path}")  # Print error if file does not exist
        return clip  # Return empty frames

    video_capture = cv2.VideoCapture(video_path)
    try:
        filled = 0
        while filled < frame_count:
            success, frame = video_capture.read()
            if not success:
                break
            clip[filled] = cv2.resize(frame, (width, height))
            filled += 1
    finally:
        # Release the decoder even if resizing fails part-way through.
        video_capture.release()

    clip /= 255.0  # Normalize to [0, 1]; untouched slots stay black
    return clip

In [16]:
def load_video_frames(video_path, frame_count=30, frame_size=(224, 224)):
    """Sample ``frame_count`` evenly spaced frames from a video.

    Args:
        video_path: Object exposing ``.numpy()`` that returns the UTF-8 encoded
            path as bytes (as handed over by ``tf.py_function``).
        frame_count: Number of frames in the returned clip.
        frame_size: ``(width, height)`` pair given to ``cv2.resize``.

    Returns:
        ``float32`` array of shape ``(frame_count, height, width, 3)`` scaled
        to [0, 1]. Frames that cannot be read stay black; a missing file is
        reported and gives an all-zero clip.
    """
    # Convert the video path from bytes to string
    video_path = video_path.numpy().decode('utf-8')

    # cv2.resize takes (width, height) but produces (height, width, 3) arrays;
    # allocate in that layout so black frames match decoded ones for any size.
    width, height = frame_size
    clip = np.zeros((frame_count, height, width, 3), dtype=np.float32)

    if not os.path.exists(video_path):
        print(f"Invalid video path: {video_path}")
        return clip

    video_capture = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # A reported frame count of 0 would make the spacing below run from 0
        # to -1, i.e. seek to negative positions; leave the clip black instead.
        if total_frames > 0:
            frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
            for slot, idx in enumerate(frame_indices):
                video_capture.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                success, frame = video_capture.read()
                if success:
                    clip[slot] = cv2.resize(frame, (width, height))
    finally:
        # Release the decoder even if seeking or resizing fails.
        video_capture.release()

    # Normalize to [0, 1]; slots that were never filled stay black
    clip /= 255.0
    return clip

In [17]:
def preprocess_video(video_path, label):
    """Map a ``(path, label)`` element to ``(frames, int32 label)``.

    The frames come from ``load_video_frames`` run through ``tf.py_function``;
    the static shape is then pinned by hand to ``[30, 224, 224, 3]``.
    """
    clip = tf.py_function(func=load_video_frames, inp=[video_path], Tout=tf.float32)
    clip.set_shape([30, 224, 224, 3])
    return clip, tf.cast(label, tf.int32)

In [None]:
# Debug aid: print one line per row saying whether its stored path is blank.
for path in df_filtered['video_file_path']:
    if not (pd.isna(path) or path == ''):
        print("Valid video path:", path)
        continue
    print("Empty video file path detected.")

In [None]:
# Tally the paths that are missing (NaN) plus those that are blank ('').
empty_video_paths_count = sum(
    mask.sum()
    for mask in (
        df_filtered['video_file_path'].isna(),
        df_filtered['video_file_path'] == '',
    )
)

# Report the tally
print(f"Count of empty video file paths: {empty_video_paths_count}")

In [18]:
from sklearn.model_selection import train_test_split

# Inputs (clip paths) and targets (encoded glosses) as plain arrays.
video_paths = df_filtered['video_file_path'].to_numpy()
labels = df_filtered['gloss_encoded'].to_numpy()

# Hold out 20% for validation, stratified on the label, with a fixed seed so
# the split is repeatable.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    video_paths,
    labels,
    stratify=labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
def create_tf_dataset(paths, labels):
    """Build a shuffled, batched dataset of ``(frames, label)`` pairs.

    Args:
        paths: Sequence of video file paths.
        labels: Sequence of integer labels aligned with ``paths``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of 32 decoded clips and labels.
    """
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    # Shuffle while elements are still (path, label) pairs, so the buffer holds
    # 100 short strings rather than 100 decoded clips.
    dataset = dataset.shuffle(buffer_size=100)
    # preprocess_video already wraps the decoder in tf.py_function and pins the
    # output shape; wrapping it in a second py_function would discard that
    # static shape, so it is mapped directly.
    dataset = dataset.map(preprocess_video, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [19]:
def create_tf_dataset(video_paths, labels, batch_size=16, shuffle=True):
    """Build a batched dataset of ``(frames, label)`` pairs.

    Args:
        video_paths: Sequence of video file paths.
        labels: Sequence of integer labels aligned with ``video_paths``.
        batch_size: Number of clips per batch.
        shuffle: Whether to shuffle the examples (over the full dataset).

    Returns:
        A ``tf.data.Dataset`` yielding ``(frames, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((video_paths, labels))

    # Shuffle BEFORE decoding. The buffer spans the whole dataset; filling it
    # with decoded clips (30 x 224 x 224 x 3 float32 each, roughly 18 MB)
    # would require decoding and holding every video before the first batch is
    # produced. Shuffling the (path, label) pairs gives the same full shuffle
    # at negligible memory cost.
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(video_paths))

    dataset = dataset.map(
        preprocess_video,
        num_parallel_calls=tf.data.AUTOTUNE
    )

    # Reduced batch size to manage memory
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return dataset

In [20]:
# Training batches are shuffled; validation batches keep their original order.
train_dataset = create_tf_dataset(train_paths, train_labels, batch_size=16, shuffle=True)
val_dataset = create_tf_dataset(val_paths, val_labels, shuffle=False, batch_size=16)

In [32]:
def create_sign_language_model(input_shape=(30, 224, 224, 3), num_classes=2000):
    """Build and compile a small 3D-CNN classifier over fixed-length clips.

    Args:
        input_shape: Shape of one clip given to the first Conv3D layer.
        num_classes: Width of the softmax output layer. Taken as a parameter
            (rather than read from a notebook global) so the function matches
            the other model builders in this notebook.

    Returns:
        The compiled ``Sequential`` model; its summary is printed as a side
        effect.
    """
    model = tf.keras.models.Sequential([
        layers.Conv3D(32, (3, 3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Conv3D(32, (3, 3, 3), activation='relu'),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.summary()
    return model

num_classes = 2000
model = create_sign_language_model(num_classes=num_classes)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_2 (Conv3D)           (None, 28, 222, 222, 32   2624      
                             )                                   
                                                                 
 max_pooling3d_2 (MaxPoolin  (None, 14, 111, 111, 32   0         
 g3D)                        )                                   
                                                                 
 conv3d_3 (Conv3D)           (None, 12, 109, 109, 32   27680     
                             )                                   
                                                                 
 max_pooling3d_3 (MaxPoolin  (None, 6, 54, 54, 32)     0         
 g3D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 559872)           

In [33]:
def create_sign_language_model(input_shape=(30, 224, 224, 3), num_classes=2000):
    """Build and compile a three-stage 3D-CNN classifier.

    Each stage is Conv3D -> BatchNormalization -> MaxPooling3D with 8, 16 and
    32 filters respectively, followed by a 64-unit dense layer with dropout
    and a softmax output of width ``num_classes``. The model summary is
    printed before the compiled model is returned.
    """
    model = models.Sequential()

    # Stage 1: 8 filters; this layer also declares the input shape
    model.add(layers.Conv3D(8, (3, 3, 3), activation='relu', input_shape=input_shape, data_format='channels_last'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling3D((2, 2, 2)))

    # Stage 2: 16 filters
    model.add(layers.Conv3D(16, (3, 3, 3), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling3D((2, 2, 2)))

    # Stage 3: 32 filters
    model.add(layers.Conv3D(32, (3, 3, 3), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPooling3D((2, 2, 2)))

    # Classifier head: flatten, small dense layer with dropout, softmax output
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))

    # Labels are integer class ids, hence the sparse cross-entropy loss
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['accuracy'],
    )

    # Print the architecture and parameter counts
    model.summary()

    return model

In [21]:
def create_sign_language_model(input_shape=(30, 224, 224, 3), num_classes=2000):
    """Build and compile a two-stage 3D-CNN classifier.

    Two Conv3D -> BatchNormalization -> MaxPooling3D stages (16 then 32
    filters) feed a 32-unit dense layer with dropout and a softmax output of
    width ``num_classes``. The model summary is printed before the compiled
    model is returned.
    """
    # Convolutional feature extractor; the first layer declares the input shape
    feature_layers = [
        layers.Conv3D(16, (3, 3, 3), activation='relu', input_shape=input_shape, data_format='channels_last'),
        layers.BatchNormalization(),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Conv3D(32, (3, 3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling3D((2, 2, 2)),
    ]
    # Dense classifier head
    head_layers = [
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = tf.keras.models.Sequential(feature_layers + head_layers)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['accuracy'],
    )

    # Print the architecture
    model.summary()

    return model

In [34]:
# Size the softmax head from the largest label id, not from the number of
# distinct labels. The labels are StringLookup ids, which do not start at 0
# (the printed table shows ids such as 1030 and 1912), so counting unique
# values would leave the largest id outside the output range expected by
# sparse_categorical_crossentropy.
num_classes = int(np.max(labels)) + 1
model = create_sign_language_model(num_classes=num_classes)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_4 (Conv3D)           (None, 28, 222, 222, 8)   656       
                                                                 
 batch_normalization_2 (Bat  (None, 28, 222, 222, 8)   32        
 chNormalization)                                                
                                                                 
 max_pooling3d_4 (MaxPoolin  (None, 14, 111, 111, 8)   0         
 g3D)                                                            
                                                                 
 conv3d_5 (Conv3D)           (None, 12, 109, 109, 16   3472      
                             )                                   
                                                                 
 batch_normalization_3 (Bat  (None, 12, 109, 109, 16   64        
 chNormalization)            )                        

In [35]:
# Turn on memory growth for every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    # With no GPU present the list is empty and the loop does nothing.
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as e:
    # Report the problem instead of aborting the notebook.
    print(e)

In [36]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Halve the learning rate after 3 epochs without val_loss improvement,
# never going below 1e-5.
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
)

# Stop after 5 epochs without val_loss improvement and roll back to the
# best weights seen.
early_stopper = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train for up to 10 epochs with val_dataset as validation data; the two
# callbacks are passed so they can act during the run.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=[lr_reducer, early_stopper]
)

Epoch 1/10


In [3]:
import platform
# Environment check: interpreter bitness and executable format.
print(platform.architecture())

('64bit', 'WindowsPE')


In [2]:
import cv2 as cv
# Environment check: installed OpenCV version.
print( cv.__version__ )

4.10.0


In [None]:
# Pull one batch from the training pipeline and report its tensor shapes.
# NOTE(review): this rebinds train_paths / train_labels (previously the split
# arrays) to the tensors of that batch; re-run the split cell before
# rebuilding the datasets.
for train_batch in train_dataset.take(1):
    train_paths, train_labels = train_batch
    print(f"Train dataset shape: {train_paths.shape}, {train_labels.shape}")

In [None]:
# Pull one batch from the validation pipeline and report its tensor shapes.
# NOTE(review): this rebinds val_paths / val_labels (previously the split
# arrays) to the tensors of that batch.
for val_batch in val_dataset.take(1):
    val_paths, val_labels = val_batch
    print(f"Val dataset shape: {val_paths.shape}, {val_labels.shape}")

In [None]:
# Inspect shape and dtype of one training batch; any error raised while
# pulling the batch is printed instead of propagated.
# NOTE(review): the loop targets rebind train_paths / train_labels to the
# batch tensors.
try:
    for train_paths, train_labels in train_dataset.take(1):
        print("Input Tensor Shape:", train_paths.shape)
        print("Label Tensor Shape:", train_labels.shape)
        print("Input Tensor Dtype:", train_paths.dtype)
        print("Label Tensor Dtype:", train_labels.dtype)
except Exception as e:
    print("Error examining dataset:", str(e))

In [30]:
def analyze_video_dataset(video_paths):
    """Print summary statistics for a collection of video files.

    Every path is opened with OpenCV. For each video that opens, its
    resolution, FPS and duration (frame count / FPS, or 0 when FPS is not
    positive) are recorded. The function then prints the valid/invalid counts,
    a tally per resolution, and mean/min/max of FPS and duration.

    Args:
        video_paths: Sized iterable of video file paths.

    Returns:
        None. The report is written to standard output. When no video could be
        opened, only the counts are printed.
    """
    from collections import Counter  # local import keeps the cell self-contained

    video_stats = {
        'total_videos': len(video_paths),
        'valid_videos': 0,
        'invalid_videos': 0,
        'resolution_stats': [],
        'fps_stats': [],
        'duration_stats': []
    }

    for video_path in video_paths:  # every video is analysed, not a sample
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)

            if not cap.isOpened():
                video_stats['invalid_videos'] += 1
                continue

            # Resolution
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # FPS
            fps = cap.get(cv2.CAP_PROP_FPS)

            # Duration
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / fps if fps > 0 else 0

            # Record the video only after every property was read, so a
            # failure part-way cannot count it as both valid and invalid.
            video_stats['valid_videos'] += 1
            video_stats['resolution_stats'].append((width, height))
            video_stats['fps_stats'].append(fps)
            video_stats['duration_stats'].append(duration)

        except Exception as e:
            print(f"Error analyzing {video_path}: {e}")
            video_stats['invalid_videos'] += 1
        finally:
            # Release the capture on every path, including the early
            # `continue` and the exception branch.
            if cap is not None:
                cap.release()

    # Summary
    print("\n--- Analisis Dataset Video ---")
    print(f"Total Video: {video_stats['total_videos']}")
    print(f"Video Valid: {video_stats['valid_videos']}")
    print(f"Video Invalid: {video_stats['invalid_videos']}")

    # Without any valid video the lists are empty and np.min / np.max would
    # raise, so there is nothing further to report.
    if not video_stats['valid_videos']:
        return

    print("\nResolusi Video:")
    # Counter tallies each resolution in a single pass.
    for res, count in Counter(video_stats['resolution_stats']).items():
        print(f"{res}: {count} video")

    print("\nStatistik FPS:")
    print(f"Rata-rata: {np.mean(video_stats['fps_stats']):.2f}")
    print(f"Min: {np.min(video_stats['fps_stats'])}")
    print(f"Max: {np.max(video_stats['fps_stats'])}")

    print("\nStatistik Durasi Video:")
    print(f"Rata-rata: {np.mean(video_stats['duration_stats']):.2f} detik")
    print(f"Min: {np.min(video_stats['duration_stats']):.2f} detik")
    print(f"Max: {np.max(video_stats['duration_stats']):.2f} detik")


In [31]:
# Run the analysis over the paths currently bound to video_paths.
analyze_video_dataset(video_paths)


--- Analisis Dataset Video ---
Total Video: 11980
Video Valid: 11980
Video Invalid: 0

Resolusi Video:
(720, 400): 1875 video
(1920, 1080): 1768 video
(656, 370): 1045 video
(654, 480): 1 video
(640, 480): 1824 video
(288, 192): 2668 video
(320, 180): 161 video
(720, 540): 21 video
(1280, 720): 721 video
(736, 414): 195 video
(640, 360): 24 video
(320, 240): 1584 video
(854, 480): 93 video

Statistik FPS:
Rata-rata: 28.53
Min: 12.0
Max: 59.94

Statistik Durasi Video:
Rata-rata: 2.43 detik
Min: 0.63 detik
Max: 8.12 detik


In [None]:
# Show the first five entries of train_labels.
# NOTE(review): train_labels is rebound by the batch-inspection cells, so what
# is printed depends on which cells ran before this one.
print("Sample labels:")
print(train_labels[:5])

In [None]:
# Dump the leading entries of train_paths.
# NOTE(review): presumably expects train_paths to hold a batch of frames (as
# rebound by the batch-inspection cells) rather than file paths — confirm.
print("Sample frames (first frame of the batch):")
for i in range(min(5, train_paths.shape[0])):  # Print up to 5 samples
    print(f"Frame {i}: {train_paths[i]}")

In [None]:
def count_video_frames(video_path):
    """Count how many frames of a video can actually be read.

    Frames are read one by one until a read fails; the number of successful
    reads is returned and the capture is released afterwards.
    """
    capture = cv2.VideoCapture(video_path)
    decoded = 0
    # read() returns (success, frame); only the success flag matters here.
    while capture.read()[0]:
        decoded += 1
    capture.release()
    return decoded

In [None]:
# Count the clips that decode to fewer than 30 frames.
count_below_30_frames = sum(
    1
    for video_path in df_filtered['video_file_path']
    if count_video_frames(video_path) < 30
)

# Show the number of videos that have fewer than 30 frames
print(f"Jumlah video dengan kurang dari 30 frame: {count_below_30_frames}")

In [None]:
def fit_vectorizer(corpus):
    """Create a TextVectorization layer and fit its vocabulary on ``corpus``.

    Args:
        corpus: Text data passed to the layer's ``adapt`` method.

    Returns:
        The adapted ``tf.keras.layers.TextVectorization`` layer, configured
        for integer output with ragged (unpadded) sequences.
    """
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=None,  # No limit on the number of tokens
        output_mode='int',  # Output mode as integers
        output_sequence_length=None,
        standardize='lower_and_strip_punctuation',
        ragged=True,
        ngrams=None
    )
    vectorizer.adapt(corpus)
    return vectorizer

In [None]:
def preprocess(video_path, label):
    """Turn an eager string tensor holding a path into extracted frames.

    NOTE(review): extract_frames is not defined in the visible part of this
    notebook — confirm it exists before relying on this function.
    """
    # Convert the EagerTensor to a str path, then read the video and extract
    # its frames
    path_text = video_path.numpy().decode("utf-8")
    return extract_frames(path_text), label

def process_data(video_path, label):
    """Normalise a dataset element's label to ``int32``.

    Args:
        video_path: Passed through unchanged.
        label: Label as a numeric string or as a number.

    Returns:
        ``(video_path, label)`` with ``label`` as an ``int32`` tensor.
    """
    label = tf.convert_to_tensor(label)
    if label.dtype == tf.string:
        # Convert the label from string to int
        label = tf.strings.to_number(label, tf.int32)
    else:
        # create_dataset already hands over integers; to_number only accepts
        # strings, so numeric labels are cast instead.
        label = tf.cast(label, tf.int32)
    return video_path, label

def create_dataset(paths, labels):
    """Build a shuffled, batched dataset of ``(path, int32 label)`` pairs.

    Args:
        paths: Sequence of video file paths.
        labels: Sequence of labels convertible with ``int()``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of 8 ``(paths, labels)``.

    Raises:
        ValueError: If a label cannot be converted with ``int()``.
    """
    # Make sure every label is an integer, then hand it over as a numeric
    # string: process_data parses labels from strings.
    labels = [str(int(label)) for label in labels]
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    dataset = dataset.map(process_data)
    # Add shuffling and batching. An early `return` used to make this step
    # unreachable; shuffling comes first so individual examples, not whole
    # batches, are mixed.
    dataset = dataset.shuffle(buffer_size=100).batch(8).prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
from tensorflow.keras import layers, models

def create_model(input_shape, num_classes):
    """Assemble (without compiling) a two-stage 3D-CNN classifier.

    Two Conv3D + MaxPooling3D stages (32 then 64 filters) are followed by a
    128-unit dense layer and a softmax output of width ``num_classes``.
    """
    return models.Sequential([
        layers.Conv3D(32, (3, 3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Conv3D(64, (3, 3, 3), activation='relu'),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ])

# Example usage
# NOTE(review): num_classes is not assigned in this cell; it must already
# exist from an earlier cell.
num_frames = 30
input_shape = (num_frames, 150, 150, 3)  # Match the output of extract_frames
model = create_model(input_shape, num_classes)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

In [None]:
from sklearn.model_selection import train_test_split
import os

# Collect the video paths and their glosses from the JSON index
video_dir = './wlasl-processed/videos'
video_paths, gloss_labels = [], []

for item in data:
    gloss = item['gloss']
    for instance in item['instances']:
        video_id = instance['video_id']
        video_path = os.path.join(video_dir, f'{video_id}.mp4')
        if not os.path.exists(video_path):
            continue  # Keep only clips whose file is actually present
        video_paths.append(video_path)
        gloss_labels.append(gloss)


In [None]:
# Split the dataset into train and validation
# NOTE(review): gloss_labels holds the gloss values taken from the JSON, while
# create_dataset applies int() to every label — looks like the labels need to
# be encoded to integers first; confirm.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    video_paths, gloss_labels, test_size=0.2, stratify=gloss_labels
)

# Build the TensorFlow datasets
train_dataset = create_dataset(train_paths, train_labels)
val_dataset = create_dataset(val_paths, val_labels)

for video, label in train_dataset.take(5):  # show the first 5 batches
    print("Video batch:", video.numpy())
    print("Label batch:", label.numpy())

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 3 epochs (restoring the best
# weights) and write best_model.h5 only when val_loss improves.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
]

In [None]:
# Train for up to 20 epochs with val_dataset as validation data and the
# callbacks defined above.
epochs = 20
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=callbacks
)

In [None]:
print(f"Total training samples: {len(train_paths)}, Total validation samples: {len(val_paths)}")
# Try printing a few examples to make sure the data is right
for video, label in create_dataset(train_paths[:2], train_labels[:2]).take(1):
    print(video.shape, label)  # Video shape must match the model input

In [None]:
# Build a two-example dataset and print the shapes of its first element.
for video, label in create_dataset(train_paths[:2], train_labels[:2]).take(1):
    print("Video shape:", video.shape)  # Should be (batch_size, num_frames, height, width, channels)
    print("Label shape:", label.shape)  # Should be (batch_size,)

In [None]:
# tf.data.Dataset has no DataFrame-style .head(); peek at the first element
# of the pipeline instead.
for sample in train_dataset.take(1):
    print(sample)