In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import mediapipe as mp 
import cv2 as cv
import os 

In [2]:
file_path_Dataset = 'Top_Classes_Landmarks/Top_Classes_Landmarks'
file_path_Preprocessed = 'Top_Classes_Landmarks_Preprocessed_No_SlidingWindow_OR_Mask/Top_Classes_Landmarks_Preprocessed_No_SlidingWindow_OR_Mask'


def _load_landmark_dir(directory, label_sep):
    """Load every ``.npy`` file in ``directory`` together with its gloss label.

    Args:
        directory: Folder containing the ``.npy`` landmark files.
        label_sep: Separator used in the file names; the label is the
            lower-cased text before its first occurrence.

    Returns:
        A ``(arrays, glosses)`` pair of equally long lists, ordered by file name.
    """
    arrays, glosses = [], []
    # os.listdir returns entries in arbitrary order. Sorting makes the sample
    # order deterministic, which the seeded train/val/test splits further down
    # need in order to be reproducible across machines and runs.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".npy"):
            continue
        arrays.append(np.load(os.path.join(directory, file_name)))
        glosses.append(file_name.split(label_sep)[0].lower())
    return arrays, glosses


# Raw landmark files: the label is the part of the file name before the first space.
Dataset, Dataset_glosses = _load_landmark_dir(file_path_Dataset, ' ')

# Preprocessed files: the label is the part of the file name before the first underscore.
Dataset_preprocessed, Dataset_preprocessed_glosses = _load_landmark_dir(file_path_Preprocessed, '_')

print(len(Dataset_preprocessed), len(Dataset_preprocessed_glosses))


5076 5076


### After preprocessing 

In [3]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of the preprocessed samples; the remaining 90% is the
# training set. No `stratify` argument is passed, so class proportions are not
# forced to match between the splits.
X_train_preprocessed, X_temp, y_train_preprocessed, y_temp = train_test_split(
    Dataset_preprocessed, Dataset_preprocessed_glosses, test_size=0.10, random_state=42
)

# Stage 2: cut the hold-out in half to obtain the validation and test sets.
X_val_preprocessed, X_test_preprocessed, y_val_preprocessed, y_test_preprocessed = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

In [4]:

# Convert the per-split Python lists into NumPy arrays so they can be fed to Keras.
X_train_preprocessed, X_val_preprocessed, X_test_preprocessed = (
    np.array(split)
    for split in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed)
)
y_train_preprocessed, y_val_preprocessed, y_test_preprocessed = (
    np.array(labels)
    for labels in (y_train_preprocessed, y_val_preprocessed, y_test_preprocessed)
)

In [5]:
# Display the sorted set of distinct gloss labels present in the training split.
np.unique(y_train_preprocessed )

array(['about', 'after', 'angry', 'apple', 'aunt', 'baby', 'bad',
       'bathroom', 'before', 'big', 'bird', 'blue', 'boy', 'brother',
       'brown', 'brush', 'bug', 'can', 'candy', 'cannot', 'car', 'cat',
       'cereal', 'cheese', 'child', 'church', 'clean', 'close', 'cold',
       'come', 'cookie', 'cost', 'cow', 'cry', 'cup', 'dark', 'day',
       'divorce', 'dog', 'down', 'drink', 'drive', 'eat', 'egg', 'excuse',
       'father', 'finish', 'fork', 'friend', 'full', 'girl', 'go', 'gold',
       'good', 'grandfather', 'grandmother', 'green', 'hamburger',
       'happy', 'hear', 'help', 'here', 'holiday', 'home', 'homework',
       'horse', 'hot', 'hotdog', 'how', 'hungry', 'hurt', 'in', 'less',
       'light', 'like', 'love', 'milk', 'month', 'more', 'mother', 'need',
       'nice', 'night', 'no', 'not', 'now', 'off', 'open', 'orange',
       'out', 'pants', 'pig', 'pizza', 'play', 'please', 'red', 'run',
       'sad', 'same', 'school', 'see', 'sheep', 'shirt', 'shoes',
       'si

In [6]:
from sklearn.preprocessing import LabelEncoder


# Map gloss strings to integer class ids. The mapping is learned from the
# training labels only and then applied unchanged to the other splits.
le = LabelEncoder()
le.fit(y_train_preprocessed)

y_train_encoded = le.transform(y_train_preprocessed)
y_val_encoded = le.transform(y_val_preprocessed)
y_test_encoded = le.transform(y_test_preprocessed)

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Layer


@tf.keras.utils.register_keras_serializable(package="signbert")
class PositionalEncoding(Layer):
    """Add fixed sinusoidal positional encodings to a sequence of embeddings.

    The encoding table is computed once in the constructor and is not trainable:
    even feature indices hold ``sin(pos / 10000**(2k / d_model))`` and odd
    indices hold the matching ``cos`` term, where ``k = index // 2``.

    Args:
        max_len: Number of positions to precompute; inputs must not have more
            time steps than this.
        d_model: Size of the last (feature) axis of the inputs.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable``, ...). They must be accepted here because Keras
            passes them back to the constructor when a saved model is reloaded.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.max_len = max_len
        self.d_model = d_model

        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        # Feature indices 2k and 2k+1 share one frequency, hence the `i // 2`.
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model)
        angle_rads = pos * angle_rates

        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch axis.
        self.pos_encoding = tf.constant(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return ``x`` plus the encodings for its first ``x.shape[1]`` positions."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config


In [8]:
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, LayerNormalization

@tf.keras.utils.register_keras_serializable(package="signbert")
class TransformerEncoderBlock(Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the inputs and outputs of the block.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward sub-layer.
        dropout: Dropout rate applied to each sub-layer's output.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable``, ...). They must be accepted here because Keras
            passes them back to the constructor when a saved model is reloaded.
    """

    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.d_model = d_model
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout

        # NOTE(review): `key_dim` is the per-head projection size, so every head
        # projects to d_model dimensions here (the common choice is
        # d_model // num_heads). Left unchanged to keep the trained
        # architecture and its weights compatible.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(d_model)
        ])
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def call(self, x, training=False):
        """Apply self-attention and the feed-forward net, each with residual + norm."""
        attn_output = self.att(x, x)
        x = self.norm1(x + self.dropout1(attn_output, training=training))
        ffn_output = self.ffn(x)
        return self.norm2(x + self.dropout2(ffn_output, training=training))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout": self.dropout_rate,
        })
        return config


In [9]:
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model

def build_signbert_encoder(T=157, D=438, d_model=256, num_heads=8, ff_dim=512, num_layers=4):
    """Build the sequence encoder as a Keras functional model.

    Args:
        T: Number of time steps per input sequence.
        D: Number of features per time step.
        d_model: Width of the embedding and of every encoder block.
        num_heads: Attention heads per encoder block.
        ff_dim: Hidden size of each block's feed-forward sub-layer.
        num_layers: Number of stacked encoder blocks.

    Returns:
        A ``Model`` named ``"SignBERT_Encoder"`` mapping ``(batch, T, D)``
        inputs to ``(batch, T, d_model)`` encodings.
    """
    pose_input = Input(shape=(T, D))

    # Linear projection of each frame's features to the model width.
    hidden = Dense(d_model)(pose_input)

    # Inject position information into the projected frames.
    hidden = PositionalEncoding(T, d_model)(hidden)

    # Stack of identical encoder blocks, applied one after another.
    for _layer_index in range(num_layers):
        encoder_block = TransformerEncoderBlock(d_model, num_heads, ff_dim)
        hidden = encoder_block(hidden)

    return Model(pose_input, hidden, name="SignBERT_Encoder")


In [10]:
def build_signbert_word_model(T=157, D=438, num_classes=132):
    """Build the word classifier: encoder, temporal pooling and a dense head.

    Args:
        T: Number of time steps per input sequence.
        D: Number of features per time step.
        num_classes: Size of the softmax output layer.

    Returns:
        An uncompiled ``Model`` mapping ``(batch, T, D)`` inputs to
        ``(batch, num_classes)`` class probabilities.
    """
    encoder = build_signbert_encoder(T, D)

    # Collapse the time axis by averaging the per-frame encodings.
    pooled = GlobalAveragePooling1D()(encoder.output)

    # Classification head: one hidden layer, dropout, then softmax.
    hidden = Dense(256, activation="relu")(pooled)
    hidden = Dropout(0.3)(hidden)
    probabilities = Dense(num_classes, activation="softmax")(hidden)

    return Model(encoder.input, probabilities)


In [11]:
# Size the network from the training data instead of hard-coding it: the input
# shape comes from the stacked training array and the softmax width from the
# number of distinct training labels (the same set the LabelEncoder is fitted on).
model = build_signbert_word_model(
    T=X_train_preprocessed.shape[1],
    D=X_train_preprocessed.shape[2],
    num_classes=len(np.unique(y_train_preprocessed)),
)

# Sparse cross-entropy because the targets are integer class ids, not one-hot vectors.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()





In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 6 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

history = model.fit(
    x=X_train_preprocessed,   # landmark sequences
    y=y_train_encoded,        # integer-encoded gloss labels
    validation_data=(X_val_preprocessed, y_val_encoded),
    batch_size=32,
    epochs=50,
    callbacks=[early_stop],
)



Epoch 1/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 2s/step - accuracy: 0.0160 - loss: 4.9079 - val_accuracy: 0.0394 - val_loss: 4.8265
Epoch 2/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 2s/step - accuracy: 0.0298 - loss: 4.7980 - val_accuracy: 0.0276 - val_loss: 4.5903
Epoch 3/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 2s/step - accuracy: 0.0650 - loss: 4.3746 - val_accuracy: 0.1142 - val_loss: 3.9498
Epoch 4/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 2s/step - accuracy: 0.1055 - loss: 3.8461 - val_accuracy: 0.1772 - val_loss: 3.3834
Epoch 5/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 2s/step - accuracy: 0.1250 - loss: 3.5399 - val_accuracy: 0.1929 - val_loss: 3.1359
Epoch 6/50
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 2s/step - accuracy: 0.1806 - loss: 3.2321 - val_accuracy: 0.2283 - val_loss: 2.8498
Epoch 7/50
[1m143/143

In [13]:
# Score the trained model on the held-out test split of the preprocessed data.
test_loss, test_acc = model.evaluate(
    x=X_test_preprocessed,
    y=y_test_encoded,
)
print("Test accuracy:", test_acc)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 455ms/step - accuracy: 0.7402 - loss: 1.1545
Test accuracy: 0.7401574850082397


In [19]:
# Write the model trained on the preprocessed data to disk.
# NOTE(review): this cell's execution count (In [19]) is out of sequence with
# its neighbours — confirm it still saves the intended state of `model` after
# a fresh Restart & Run All.
model.save("signbert_baseline.keras")



### Before preprocessing 

In [13]:
# Count how many samples per word
from collections import Counter


label_counts = Counter(Dataset_glosses)

MIN_SAMPLES = 2  # must be >=2 for stratification

# Glosses that occur often enough to be kept.
valid_labels = {
    gloss for gloss in label_counts
    if label_counts[gloss] >= MIN_SAMPLES
}

print("Original samples:", len(Dataset))
print("Original vocab size:", len(label_counts))

# Drop every sample whose gloss is too rare; both lists stay aligned index by index.
Dataset_filtered = [
    sample for sample, gloss in zip(Dataset, Dataset_glosses)
    if gloss in valid_labels
]
Glosses_filtered = [gloss for gloss in Dataset_glosses if gloss in valid_labels]

print("Filtered samples:", len(Dataset_filtered))
print("Filtered vocab size:", len(set(Glosses_filtered)))

Original samples: 5568
Original vocab size: 292
Filtered samples: 5422
Filtered vocab size: 146


In [14]:
# Stage 1: stratified 90/10 split of the filtered raw data into train and hold-out.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    Dataset_filtered, Glosses_filtered,
    test_size=0.10, random_state=42, stratify=Glosses_filtered,
)

# Stage 2: stratified 50/50 split of the hold-out into validation and test.
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(
    X_temp, y_temp,
    test_size=0.50, random_state=42, stratify=y_temp,
)


In [15]:
MAX_LEN = 64

def pad_or_truncate(sequence, max_len=MAX_LEN):
    """Force a ``(T, D)`` sequence to exactly ``max_len`` rows.

    Longer sequences keep only their first ``max_len`` rows; shorter ones are
    extended with all-zero rows at the end. The result always has the dtype of
    the input, whether or not padding was needed.

    Args:
        sequence: 2-D array-like of shape ``(T, D)``.
        max_len: Target number of rows. Defaults to ``MAX_LEN``.

    Returns:
        An array of shape ``(max_len, D)``. When no padding is needed this is
        the input array itself or a slice of it, not a copy.

    Raises:
        ValueError: If ``sequence`` is not 2-dimensional.
    """
    sequence = np.asarray(sequence)
    T, D = sequence.shape
    if T > max_len:
        return sequence[:max_len]
    if T < max_len:
        # Match the input dtype: np.zeros defaults to float64, which would
        # silently upcast padded sequences while unpadded ones keep their dtype.
        padding = np.zeros((max_len - T, D), dtype=sequence.dtype)
        return np.vstack([sequence, padding])
    return sequence

In [16]:
# Bring every sequence of every split to MAX_LEN frames and stack each split
# into a single array.
X_train_raw, X_val_raw, X_test_raw = (
    np.array([pad_or_truncate(seq, MAX_LEN) for seq in split])
    for split in (X_train_raw, X_val_raw, X_test_raw)
)

print("X_train shape:", X_train_raw.shape)  # (N, 64, D)
print("X_val shape:", X_val_raw.shape)  # (N, 64, D)
print("X_test shape:", X_test_raw.shape)  # (N, 64, D)

X_train shape: (4879, 64, 438)
X_val shape: (271, 64, 438)
X_test shape: (272, 64, 438)


In [20]:
# Size the network from the data instead of hard-coding it: the input shape
# comes from the padded training array and the softmax width from the number
# of distinct training labels.
model_raw = build_signbert_word_model(
    T=X_train_raw.shape[1],
    D=X_train_raw.shape[2],
    num_classes=len(set(y_train_raw)),
)

# Sparse cross-entropy because the targets are integer class ids, not one-hot vectors.
model_raw.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model_raw.summary()

In [21]:
# Fresh encoder for the raw-data experiment, fitted on its training labels only.
# This rebinds `le`, so the encoder fitted earlier on the preprocessed labels
# is no longer reachable under that name.
le = LabelEncoder()
le.fit(y_train_raw)

y_train = le.transform(y_train_raw)
y_val = le.transform(y_val_raw)
y_test = le.transform(y_test_raw)

In [22]:
# Stop once validation loss has not improved for 6 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

history = model_raw.fit(
    x=X_train_raw,   # padded/truncated landmark sequences
    y=y_train,       # integer-encoded gloss labels
    validation_data=(X_val_raw, y_val),
    epochs=50,
    callbacks=[early_stop],
)


Epoch 1/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 643ms/step - accuracy: 0.0119 - loss: 5.0096 - val_accuracy: 0.0221 - val_loss: 4.9465
Epoch 2/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 650ms/step - accuracy: 0.0170 - loss: 4.9473 - val_accuracy: 0.0185 - val_loss: 4.8122
Epoch 3/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 628ms/step - accuracy: 0.0238 - loss: 4.8100 - val_accuracy: 0.0443 - val_loss: 4.6376
Epoch 4/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 632ms/step - accuracy: 0.0320 - loss: 4.6638 - val_accuracy: 0.0480 - val_loss: 4.4899
Epoch 5/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 630ms/step - accuracy: 0.0451 - loss: 4.4939 - val_accuracy: 0.0627 - val_loss: 4.2474
Epoch 6/50
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 654ms/step - accuracy: 0.0627 - loss: 4.2955 - val_accuracy: 0.0664 - val_loss: 4.0769
Epoch 7/

In [23]:
# Score the raw-data model on its held-out test split.
test_loss, test_acc = model_raw.evaluate(
    x=X_test_raw,
    y=y_test,
)
print("Test accuracy:", test_acc)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 207ms/step - accuracy: 0.5000 - loss: 1.9167
Test accuracy: 0.5
