In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import mediapipe as mp 
import cv2 as cv
import os 

In [2]:
from pathlib import Path

# Build the dataset path from its components instead of a backslash-separated
# literal: backslashes are only treated as separators on Windows, so the old
# form pointed at a non-existent single-name entry on Linux/macOS.
DATA_DIR = Path("Preprocessed_No_Sliding_Window_OR_Mask") / "Preprocessed_No_Sliding_Window_OR_Mask"

# A missing folder would otherwise just show up as a silent "0" below.
if not DATA_DIR.is_dir():
    print(f"Dataset folder not found: {DATA_DIR.resolve()}")

# Number of .npy sample files found (counted lazily, no intermediate list).
print(sum(1 for _ in DATA_DIR.glob("*.npy")))

0


In [None]:

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR

# ============================================================
# CONFIG
# ============================================================

# Built from path components (not a backslash-separated literal) so the same
# notebook resolves the dataset folder on Windows, Linux and macOS.
DATA_DIR = Path("Preprocessed_No_Sliding_Window_OR_Mask") / "Preprocessed_No_Sliding_Window_OR_Mask"
DEVICE = "cpu"

# Shape every sample array must have: (frames, features per frame).
TARGET_FRAMES = 157
FEATURE_DIM = 438

# NOTE(review): the Keras cells visible in this notebook pass their own
# literals (batch_size=32, epochs=50, patience=6, Adam(1e-4)) rather than the
# training constants below - confirm whether these are still used anywhere.
BATCH_SIZE = 8
EPOCHS = 90
LR = 3e-4
WEIGHT_DECAY = 1e-4

PATIENCE = 12
GRAD_CLIP = 1.0
LABEL_SMOOTH = 0.1

# Artifacts are written next to the data files.
MODEL_SAVE_PATH = DATA_DIR / "tcn_best_cpu_3rdcode.pth"
LABEL_ENCODER_PATH = DATA_DIR / "label_encoder_3rdcode.npy"

In [4]:
# Index of usable samples: file path, "mask" path and class name per sample.
files, masks, labels = [], [], []

# Sorted so the sample order - and therefore the seeded train/val/test split
# derived from it - does not depend on the filesystem's directory ordering.
for f in sorted(DATA_DIR.glob("*.npy")):
    # NOTE(review): for a ".npy" file this expression resolves to `f` itself,
    # so the existence check below never fails and `masks` mirrors `files`.
    # Kept because later cells consume `masks`; confirm whether a separate
    # mask file name was intended.
    mask_f = f.with_name(f.stem + ".npy")
    if not mask_f.exists():
        continue

    # Only the shape is needed here, so memory-map the file instead of reading
    # the whole array; the temporary map is released as soon as .shape is read.
    if np.load(f, mmap_mode="r").shape != (TARGET_FRAMES, FEATURE_DIM):
        continue

    files.append(str(f))
    masks.append(str(mask_f))
    # The class name is the part of the file name before the first underscore.
    labels.append(f.stem.split("_")[0])

# Filter rare classes: a stratified split needs at least 2 samples per class.
cnt = Counter(labels)
keep = [i for i, label in enumerate(labels) if cnt[label] >= 2]

files = [files[i] for i in keep]
masks = [masks[i] for i in keep]
labels = [labels[i] for i in keep]

# Encode labels as integer ids and persist the id -> name mapping.
le = LabelEncoder()
y = le.fit_transform(labels)
np.save(LABEL_ENCODER_PATH, le.classes_)
num_classes = len(le.classes_)


In [6]:
# Distinct class names that remain after the rare-class filter.
set(labels)

{'about',
 'after',
 'angry',
 'apple',
 'aunt',
 'baby',
 'bad',
 'bathroom',
 'before',
 'big',
 'bird',
 'blue',
 'boy',
 'brother',
 'brown',
 'brush',
 'bug',
 'can',
 'candy',
 'cannot',
 'car',
 'cat',
 'cereal',
 'cheese',
 'child',
 'church',
 'clean',
 'close',
 'cold',
 'come',
 'cookie',
 'cost',
 'cow',
 'cry',
 'cup',
 'dark',
 'day',
 'divorce',
 'dog',
 'down',
 'drink',
 'drive',
 'eat',
 'egg',
 'excuse',
 'father',
 'finish',
 'fork',
 'friend',
 'full',
 'girl',
 'go',
 'gold',
 'good',
 'grandfather',
 'grandmother',
 'green',
 'hamburger',
 'happy',
 'hear',
 'help',
 'here',
 'holiday',
 'home',
 'homework',
 'horse',
 'hot',
 'hotdog',
 'how',
 'hungry',
 'hurt',
 'in',
 'less',
 'light',
 'like',
 'love',
 'milk',
 'month',
 'more',
 'mother',
 'need',
 'nice',
 'night',
 'no',
 'not',
 'now',
 'off',
 'open',
 'orange',
 'out',
 'pants',
 'pig',
 'pizza',
 'play',
 'please',
 'red',
 'run',
 'sad',
 'same',
 'school',
 'see',
 'sheep',
 'shirt',
 'shoes',
 'si

### After preprocessing 

In [7]:
# Two-stage stratified split with a fixed seed:
#   1) 80 % train / 20 % held out,
#   2) the held-out part is halved into validation and test (10 % each).
# `masks` is split alongside `files` so the three lists stay index-aligned.
X_tr, X_tmp, y_tr, y_tmp, m_tr, m_tmp = train_test_split(
    files, y, masks, test_size=0.2, stratify=y, random_state=42
)
X_val, X_te, y_val, y_te, m_val, m_te = train_test_split(
    X_tmp, y_tmp, m_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

In [8]:
# Read every training sample from disk into memory, one array per file path.
X_tr_data = [np.load(Path(sample_path)) for sample_path in X_tr]

# Sanity check: how many samples were loaded, then the contents of the first.
print(len(X_tr_data))
print(X_tr_data[0])




4454
[[-5.3782193e-03 -6.5089405e-01 -1.6726000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [-1.9704890e-03 -6.5519047e-01 -1.6631397e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 1.4372411e-03 -6.5948683e-01 -1.6536793e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 ...
 [ 6.9793008e-02 -6.8986171e-01 -1.1281021e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 6.9115520e-02 -6.8984360e-01 -1.1260076e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [ 6.8438031e-02 -6.8982548e-01 -1.1239130e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]]


In [11]:
# Read every test sample from disk into memory, one array per file path.
X_te_data = [np.load(Path(sample_path)) for sample_path in X_te]

# Sanity check: sample count and the per-sample (frames, features) shape.
print(len(X_te_data))
print(X_te_data[0].shape)


557
(157, 438)


In [12]:
# Read every validation sample from disk into memory, one array per file path.
X_val_data = [np.load(Path(sample_path)) for sample_path in X_val]

# Sanity check: sample count and the per-sample (frames, features) shape.
print(len(X_val_data))
print(X_val_data[0].shape)

557
(157, 438)


In [76]:
# Number of training labels; should equal the number of training samples loaded.
y_tr.shape

(4454,)

In [13]:
# Stack each list of per-sample arrays into one (samples, frames, features)
# array. np.asarray behaves like np.array for a list, but hands an existing
# ndarray back unchanged, so re-running this cell no longer copies every split.
X_val_data = np.asarray(X_val_data)
X_te_data = np.asarray(X_te_data)
X_tr_data = np.asarray(X_tr_data)

In [78]:
# (samples, frames, features) of the stacked training array.
X_tr_data.shape

(4454, 157, 438)

In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Layer


class PositionalEncoding(Layer):
    """Add a fixed sinusoidal positional encoding to a sequence of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once: even feature
    columns hold the sine and odd columns the cosine of
    ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    Parameters
    ----------
    max_len : int
        Number of positions to precompute (longest supported sequence).
    d_model : int
        Embedding width; must match the last dimension of the layer input.
    **kwargs
        Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...). They
        are required for the layer to be rebuilt from a saved config.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.max_len = max_len
        self.d_model = d_model

        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model)
        angle_rads = pos * angle_rates

        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.pos_encoding = tf.constant(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return ``x`` plus the encoding, cropped to the sequence length of ``x``."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the constructor arguments so a saved model can recreate the layer."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config


In [15]:
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, LayerNormalization

class TransformerEncoderBlock(Layer):
    """One post-norm Transformer encoder block: self-attention then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.

    Parameters
    ----------
    d_model : int
        Width of the input and output embeddings.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden width of the feed-forward sub-layer.
    dropout : float, default 0.1
        Dropout rate applied to both sub-layer outputs.
    **kwargs
        Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...). They
        are required for the layer to be rebuilt from a saved config.
    """

    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.d_model = d_model
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout

        # NOTE(review): key_dim is the size of *each* head, so every head gets
        # a d_model-wide projection here; the more common choice is
        # d_model // num_heads. Left unchanged so already-saved weights still
        # load - confirm before changing.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(d_model)
        ])
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def call(self, x, training=False):
        """Apply the block to ``x``; the output has the same shape as the input."""
        # The training flag is forwarded explicitly to every sub-layer that
        # takes one, including the attention layer.
        attn_output = self.att(x, x, training=training)
        x = self.norm1(x + self.dropout1(attn_output, training=training))
        ffn_output = self.ffn(x)
        return self.norm2(x + self.dropout2(ffn_output, training=training))

    def get_config(self):
        """Return the constructor arguments so a saved model can recreate the layer."""
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout": self.dropout_rate,
        })
        return config


In [16]:
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model

def build_signbert_encoder(T=157, D=438, d_model=256, num_heads=8, ff_dim=512, num_layers=4):
    """Build the sequence encoder as a Keras functional model.

    Input of shape ``(T, D)`` per sample is projected to ``d_model``
    features per frame, given positional information and passed through
    ``num_layers`` Transformer encoder blocks. The output keeps the time
    axis: ``(T, d_model)`` per sample.
    """
    pose_sequence = Input(shape=(T, D))

    # Per-frame linear projection of the landmark vector, then position info.
    hidden = Dense(d_model)(pose_sequence)
    hidden = PositionalEncoding(T, d_model)(hidden)

    # Stack of identical encoder blocks applied one after another.
    for _layer_index in range(num_layers):
        encoder_block = TransformerEncoderBlock(d_model, num_heads, ff_dim)
        hidden = encoder_block(hidden)

    return Model(pose_sequence, hidden, name="SignBERT_Encoder")


In [17]:
def build_signbert_word_model(T=157, D=438, num_classes=132):
    """Build the word classifier: encoder, temporal pooling, softmax head.

    Takes ``(T, D)`` landmark sequences and outputs a probability
    distribution over ``num_classes`` words.
    """
    encoder = build_signbert_encoder(T, D)

    # Average the encoder output over the time axis -> one vector per sample.
    pooled = GlobalAveragePooling1D()(encoder.output)

    # Classification head: hidden layer, dropout, then per-class probabilities.
    head = Dense(256, activation="relu")(pooled)
    head = Dropout(0.3)(head)
    class_probabilities = Dense(num_classes, activation="softmax")(head)

    return Model(encoder.input, class_probabilities)


In [18]:
# Size the network from the data pipeline instead of hard-coded literals:
# the output layer must have exactly one unit per class known to the label
# encoder, and the input must match the (frames, features) shape that the
# sample filter enforces.
model = build_signbert_word_model(
    T=TARGET_FRAMES,
    D=FEATURE_DIM,
    num_classes=num_classes,
)

# Sparse cross-entropy because the targets are integer class ids, not one-hot.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 157, 438)]        0         
                                                                 
 dense (Dense)               (None, 157, 256)          112384    
                                                                 
 positional_encoding (Posit  (None, 157, 256)          0         
 ionalEncoding)                                                  
                                                                 
 transformer_encoder_block   (None, 157, 256)          2367488   
 (TransformerEncoderBlock)                                       
                                                                 
 transformer_encoder_block_  (None, 157, 256)          2367488   
 1 (TransformerEncoderBlock                                      
 )                                                           

In [19]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the validation loss has gone 6 epochs without improving, and
# roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

# Train on the landmark sequences against the integer word labels,
# validating on the held-out validation split after every epoch.
history = model.fit(
    x=X_tr_data,
    y=y_tr,
    batch_size=32,
    epochs=50,
    validation_data=(X_val_data, y_val),
    callbacks=[early_stop],
)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50


In [20]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the metrics requested in compile() (here: accuracy).
test_loss, test_acc = model.evaluate(X_te_data, y_te)
print("Test accuracy:", test_acc)


Test accuracy: 0.7073608636856079


In [21]:
# Persist the trained model.
# NOTE(review): ".plt" is not a Keras file suffix; the recorded log below
# ("Assets written to: ...\assets") suggests a SavedModel directory was
# written under this name - confirm that this is the intended format.
model.save("sign_bert_model.plt")

INFO:tensorflow:Assets written to: sign_bert_model.plt\assets


INFO:tensorflow:Assets written to: sign_bert_model.plt\assets


In [51]:
# Persist the trained model again, this time as a single ".keras" file.
model.save("sign_bert_model.keras")

In [22]:
# Per-class probabilities for every test sample (one softmax row per sample).
yhat=model.predict(X_te_data)



In [23]:
# Collapse the per-class probabilities held in `yhat` into predicted class ids.
# The dimensionality guard makes the cell safe to re-run: once `yhat` is
# already a flat list of ids, taking argmax over axis 1 again would raise.
yhat = np.asarray(yhat)
if yhat.ndim == 2:
    yhat = np.argmax(yhat, axis=1)
yhat = yhat.tolist()

In [24]:
# Predicted class id for each test sample.
# NOTE(review): this displays the entire list; a slice such as yhat[:20]
# would keep the notebook output short.
yhat

[41,
 2,
 18,
 105,
 11,
 83,
 108,
 14,
 108,
 24,
 125,
 61,
 121,
 132,
 19,
 13,
 61,
 91,
 71,
 135,
 18,
 76,
 33,
 142,
 12,
 23,
 126,
 29,
 114,
 11,
 109,
 47,
 102,
 51,
 141,
 123,
 7,
 47,
 44,
 93,
 138,
 0,
 68,
 87,
 104,
 35,
 47,
 64,
 119,
 11,
 5,
 37,
 14,
 102,
 104,
 72,
 79,
 126,
 123,
 128,
 9,
 32,
 101,
 29,
 137,
 79,
 134,
 17,
 35,
 4,
 137,
 107,
 83,
 61,
 38,
 84,
 65,
 42,
 33,
 138,
 91,
 116,
 122,
 9,
 125,
 4,
 40,
 89,
 92,
 48,
 134,
 21,
 47,
 18,
 113,
 12,
 14,
 45,
 101,
 56,
 121,
 105,
 134,
 26,
 16,
 5,
 44,
 76,
 44,
 30,
 22,
 18,
 47,
 33,
 3,
 38,
 131,
 35,
 98,
 130,
 67,
 64,
 130,
 96,
 33,
 34,
 144,
 112,
 33,
 30,
 119,
 95,
 52,
 65,
 139,
 82,
 95,
 126,
 56,
 121,
 132,
 48,
 96,
 100,
 106,
 28,
 14,
 33,
 28,
 21,
 104,
 96,
 38,
 27,
 38,
 38,
 143,
 82,
 133,
 127,
 0,
 25,
 76,
 141,
 51,
 144,
 142,
 130,
 9,
 4,
 51,
 33,
 20,
 57,
 11,
 76,
 138,
 35,
 89,
 27,
 77,
 144,
 116,
 8,
 48,
 25,
 26,
 72,
 75,
 131,
 11

In [26]:
import numpy as np

# Run the model on a single test sequence; expand_dims adds the batch axis
# that model.predict expects.
# NOTE(review): the variable is called "first" but test sample index 4 is used.
sample = np.expand_dims(X_te_data[4], axis=0)
y_predicted_first = model.predict(sample)



In [28]:
# One row (the single sample) with one probability per class.
y_predicted_first.shape

(1, 146)

In [41]:
# Index of the most probable class for that sample.
np.argmax(y_predicted_first[0])


47

In [43]:
# Map the winning class index back to its word using the fitted label encoder.
predicted_label = le.inverse_transform([np.argmax(y_predicted_first[0])])[0]
predicted_label


'fork'

In [29]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
# One 2x2 confusion matrix per class (one-vs-rest), laid out by scikit-learn
# as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(y_te, yhat)

array([[[550,   1],
        [  1,   5]],

       [[554,   0],
        [  2,   1]],

       [[554,   0],
        [  0,   3]],

       [[554,   0],
        [  1,   2]],

       [[548,   6],
        [  1,   2]],

       [[550,   1],
        [  2,   4]],

       [[551,   3],
        [  1,   2]],

       [[552,   2],
        [  2,   1]],

       [[554,   0],
        [  0,   3]],

       [[552,   2],
        [  0,   3]],

       [[553,   1],
        [  0,   3]],

       [[552,   2],
        [  1,   2]],

       [[554,   0],
        [  0,   3]],

       [[554,   0],
        [  0,   3]],

       [[551,   3],
        [  0,   3]],

       [[554,   0],
        [  1,   2]],

       [[554,   0],
        [  2,   1]],

       [[550,   1],
        [  2,   4]],

       [[548,   2],
        [  2,   5]],

       [[554,   0],
        [  1,   2]],

       [[554,   0],
        [  1,   2]],

       [[546,   2],
        [  5,   4]],

       [[554,   0],
        [  1,   2]],

       [[554,   0],
        [  1, 

In [30]:
# Fraction of test samples whose predicted class id equals the true one.
accuracy_score(y_te, yhat)

0.7073608617594255

In [31]:


def predict(land_marks):
    """Classify a single landmark sequence with the trained model.

    Parameters
    ----------
    land_marks : array-like of shape (frames, features)
        One sequence of per-frame landmark vectors: a 2-D array or a list of
        equally long 1-D frame vectors.

    Returns
    -------
    dict
        ``{"word": <predicted label>, "confidence": <probability of that label>}``.

    Raises
    ------
    ValueError
        If ``land_marks`` is not two-dimensional, i.e. not exactly one sequence.
    """
    sample = np.asarray(land_marks, dtype=np.float32)
    if sample.ndim != 2:
        raise ValueError(
            f"land_marks must have shape (frames, features), got {sample.shape}"
        )

    # Add the batch axis expected by the model. verbose=0 because this is
    # called once per webcam frame and the default progress bar would flood
    # the notebook output.
    y_predicted = model.predict(sample[np.newaxis, ...], verbose=0)[0]

    predicted_index = int(np.argmax(y_predicted))
    predicted_label = le.inverse_transform([predicted_index])[0]
    confidence = float(y_predicted[predicted_index])
    out_put = {"word": predicted_label, "confidence": confidence}
    return out_put

In [33]:
# Spot check: predicted word and confidence for test sample 100.
predict(X_te_data[100])



{'word': 'understand', 'confidence': 0.7880738377571106}

In [35]:
# Ground-truth word for the same test sample, for comparison with the prediction.
le.inverse_transform([y_te[100]])[0]

'grandfather'

### Testing in real time 

In [36]:
import cv2
import mediapipe as mp
import numpy as np
import itertools  # <-- missing import

# =========================
# CONSTANTS
# =========================
# Number of landmarks per body part that make up one frame's feature vector.
POSE_LANDMARKS = 33
HAND_LANDMARKS = 21
FACE_LANDMARKS = 60

# =========================
# INIT MEDIAPIPE
# =========================
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh

# One tracker instance reused for every frame (video mode, not static images).
holistic = mp_holistic.Holistic(
    static_image_mode=False,
    model_complexity=1,
    smooth_landmarks=True,
    enable_segmentation=False,
    refine_face_landmarks=True
)

# =========================
# FACE MESH INDICES
# =========================
# MediaPipe stores each face region as (start, end) index pairs; flatten the
# pairs into the set of landmark indices they touch.
FACEMESH_LIPS = set(itertools.chain.from_iterable(mp_face_mesh.FACEMESH_LIPS))
FACEMESH_LEFT_EYEBROW = set(itertools.chain.from_iterable(mp_face_mesh.FACEMESH_LEFT_EYEBROW))
FACEMESH_RIGHT_EYEBROW = set(itertools.chain.from_iterable(mp_face_mesh.FACEMESH_RIGHT_EYEBROW))

# Sorted so the face features always appear in the same order.
RELEVANT_FACE_INDICES = sorted(FACEMESH_LIPS | FACEMESH_LEFT_EYEBROW | FACEMESH_RIGHT_EYEBROW)

# Fail here, with a clear message, rather than later as a shape error inside
# the model if the installed MediaPipe exposes a different number of points.
if len(RELEVANT_FACE_INDICES) != FACE_LANDMARKS:
    raise ValueError(
        f"Expected {FACE_LANDMARKS} face landmarks (lips + eyebrows), "
        f"got {len(RELEVANT_FACE_INDICES)}"
    )

# Replace with your 60 indices if needed
# NOTE(review): the extraction code visible in this notebook selects face
# points via RELEVANT_FACE_INDICES, not FACE_INDICES - confirm it is still needed.
FACE_INDICES = list(range(60))

# =========================
# LANDMARK EXTRACTION
# =========================
def extract_landmarks(image):
    """Run MediaPipe Holistic on one BGR frame and flatten the landmarks.

    Returns a tuple ``(landmarks, results)`` where ``landmarks`` is the 1-D
    concatenation ``[pose, face, left_hand, right_hand]`` and ``results`` is
    the raw Holistic output. Pose contributes x, y, z and visibility per
    landmark; face and hands contribute x, y, z. Any part that was not
    detected is replaced by zeros of the corresponding length.
    """
    results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    def _flatten(points, fields):
        # One row per landmark holding the requested attributes, flattened to 1-D.
        return np.array(
            [[getattr(point, field) for field in fields] for point in points]
        ).flatten()

    xyz = ("x", "y", "z")

    # ---- POSE ----
    pose_result = results.pose_landmarks
    if not pose_result:
        pose = np.zeros(POSE_LANDMARKS * 4)
    else:
        pose = _flatten(pose_result.landmark, xyz + ("visibility",))

    # ---- LEFT HAND ----
    left_result = results.left_hand_landmarks
    if not left_result:
        left_hand = np.zeros(HAND_LANDMARKS * 3)
    else:
        left_hand = _flatten(left_result.landmark, xyz)

    # ---- RIGHT HAND ----
    right_result = results.right_hand_landmarks
    if not right_result:
        right_hand = np.zeros(HAND_LANDMARKS * 3)
    else:
        right_hand = _flatten(right_result.landmark, xyz)

    # ---- FACE (LIPS + EYEBROWS ONLY) ----
    face_result = results.face_landmarks
    if not face_result:
        face = np.zeros(len(RELEVANT_FACE_INDICES) * 3)
    else:
        selected = [face_result.landmark[index] for index in RELEVANT_FACE_INDICES]
        face = _flatten(selected, xyz)

    # IMPORTANT: order must match training
    return np.concatenate([pose, face, left_hand, right_hand]), results


In [50]:
import cv2
import mediapipe as mp
import numpy as np
import itertools


# =========================
# REAL-TIME WEBCAM LOOP
# =========================
SEQUENCE_LENGTH = 157  # same as training
sequence = []  # sliding window holding the most recent frames' landmark vectors

# The lips + eyebrows connection list and its drawing style never change,
# so build them once instead of on every frame.
face_connections = list(mp_face_mesh.FACEMESH_LIPS) + \
                   list(mp_face_mesh.FACEMESH_LEFT_EYEBROW) + \
                   list(mp_face_mesh.FACEMESH_RIGHT_EYEBROW)
face_connection_spec = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=1)

cap = cv2.VideoCapture(0)

# Report an unavailable camera instead of exiting the loop silently.
camera_ready = cap.isOpened()
if not camera_ready:
    print("Could not open webcam (device 0)")

try:
    while camera_ready:
        ret, frame = cap.read()
        if not ret:
            break

        landmarks, results = extract_landmarks(frame)
        sequence.append(landmarks)

        # Keep only last SEQUENCE_LENGTH frames
        if len(sequence) > SEQUENCE_LENGTH:
            sequence = sequence[-SEQUENCE_LENGTH:]

        # ================= DRAW LANDMARKS =================
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        if results.left_hand_landmarks:
            mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        if results.right_hand_landmarks:
            mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        # Draw lips + eyebrows only
        if results.face_landmarks:
            mp_drawing.draw_landmarks(
                frame,
                results.face_landmarks,
                connections=face_connections,
                landmark_drawing_spec=None,
                connection_drawing_spec=face_connection_spec
            )

        # ================= PREDICT =================
        # Predictions start once a full window of frames has been collected.
        if len(sequence) == SEQUENCE_LENGTH:
            predicted = predict(sequence)

            cv2.putText(
                frame,
                f"Pred: {predicted['word']}  Conf: {predicted['confidence']}",
                (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2
            )

        cv2.imshow("SignBERT Real-Time", frame)

        # Press "q" in the preview window to stop.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always free the camera and close the window, even if a frame raised
    # (e.g. a shape error from predict), so the device is not left locked.
    cap.release()
    cv2.destroyAllWindows()

    

