<a href="https://colab.research.google.com/github/rcharan05/UGP/blob/main/One_Shot_Sign_Language_Recognition_Improved_I3D_Features_and_Pose_Velocity_Fusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting the drive, importing all necessary libraries, and loading the data in the required format

In [None]:
# 0) Mount Drive & Install
from google.colab import drive
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
!pip install pose_format
from pose_format import Pose  # pip install pose-format
import pickle

# Mount Google Drive at /content/drive so the dataset paths configured below
# (all rooted under /content/drive/MyDrive) resolve inside the Colab runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Root folder of the project data on the mounted Drive; every artefact path
# used by the notebook is derived from it.
DATA_DIR = "/content/drive/MyDrive/UGP"
VIDEO_POSE_DIR, I3D_PKL, PROTO_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, leaf)
    for leaf in (
        "CISLR_v1.5-a_videos_poses",  # folder holding one <uid>.pose file per clip
        "I3D_features.pkl",           # pickled table of I3D features
        "prototype.csv",              # prototype split
        "test.csv",                   # test split
    )
)

In [None]:
# Load the prototype and test splits from their CSV files.
df_proto = pd.read_csv(PROTO_CSV)
df_test = pd.read_csv(TEST_CSV)

# Cast the gloss column to str in both frames so that label comparisons
# are always string-to-string.
for df in (df_proto, df_test):
    df["gloss"] = df["gloss"].astype(str)

# Plain Python label lists, in the same row order as each frame.
y_train = df_proto["gloss"].tolist()
y_test = df_test["gloss"].tolist()

In [None]:
# NOTE: read_pickle unpickles the file, which can execute arbitrary code;
# only load feature files from a trusted source.
i3d_df = pd.read_pickle(I3D_PKL)

# Lookup table: clip id -> raw I3D feature as a float32 array.
i3d_dict = {
    clip_id: np.array(raw_feature, dtype=np.float32)
    for clip_id, raw_feature in zip(i3d_df["id"], i3d_df["I3D_features"])
}

Improved I3D Feature Extraction (Mean, max and std dev pooled together)

In [None]:
def improved_i3d_feat(uid):
    """
    Build a pooled I3D descriptor for one clip.

    The raw feature stored in ``i3d_dict[uid]`` has its singleton axes 0, 3
    and 4 squeezed away, leaving a 2-D array (the original notes give its
    shape as (1024, 11) -- TODO confirm). Mean, max and standard deviation
    are taken along axis 1 and concatenated (3 values per row of the 2-D
    array), then signed-square-root scaled and L2-normalised.

    Args:
        uid: key into the module-level ``i3d_dict``.

    Returns:
        1-D feature vector. It has unit L2 norm, except when the scaled vector
        is all zeros: then it is returned as-is instead of being divided by a
        zero norm (which would produce NaNs). This mirrors the ``norm > 0``
        guard used by the pose-velocity extractors in this notebook.

    Raises:
        KeyError: if ``uid`` is not present in ``i3d_dict``.
    """
    arr = i3d_dict[uid].squeeze((0, 3, 4))
    pooled = np.concatenate([arr.mean(axis=1), arr.max(axis=1), arr.std(axis=1)])
    # Signed square root; np.sign keeps exact zeros at zero despite the epsilon.
    feat = np.sign(pooled) * np.sqrt(np.abs(pooled) + 1e-8)
    norm = np.linalg.norm(feat)
    return feat / norm if norm > 0 else feat

Build I3D Features and Evaluate One-Shot Matching

In [None]:
# 3) Pooled I3D descriptors, one row per clip, for the prototype and test splits.
X_tr = np.stack([improved_i3d_feat(clip_uid) for clip_uid in df_proto["uid"]])
X_te = np.stack([improved_i3d_feat(clip_uid) for clip_uid in df_test["uid"]])

# Row-wise normalisation, so the matrix product below is a cosine similarity.
X_trn = normalize(X_tr, axis=1)
X_ten = normalize(X_te, axis=1)

# S[i, j] = similarity of test clip i to prototype j;
# each row of ranks lists prototype indices from most to least similar.
S = X_ten @ X_trn.T
ranks = np.argsort(-S, axis=1)

In [None]:
def topk_acc(ranks, y_tr, y_te, k):
    """
    Top-k retrieval accuracy, as a percentage.

    Args:
        ranks: 2-D index array; row i holds indices into ``y_tr`` ordered from
            best to worst match for test item i.
        y_tr: labels of the gallery (prototype) items.
        y_te: labels of the test items, one per row of ``ranks``.
        k: number of leading candidates inspected per test item.

    Returns:
        Percentage of test items whose label occurs among the labels of their
        first ``k`` ranked gallery items.
    """
    hits = []
    for row, target in enumerate(y_te):
        candidate_labels = [y_tr[idx] for idx in ranks[row, :k]]
        hits.append(target in candidate_labels)
    return np.mean(hits) * 100

Concatenating mean, max and std-dev pooling improved accuracy very slightly over the mean-only pooling used in CISLR.

In [None]:
# Report top-1/5/10 accuracy of the I3D-only nearest-prototype matcher.
print("=== Improved I3D‑Only One‑Shot ===")
for k in (1, 5, 10):
    acc = topk_acc(ranks, y_train, y_test, k)
    print(f"Top-{k} Accuracy: {acc:.2f}%")

=== Improved I3D‑Only One‑Shot ===
Top-1 Accuracy: 17.20%
Top-5 Accuracy: 21.05%
Top-10 Accuracy: 22.84%


Pose Velocity Feature Extraction

In [None]:
def pose_velocity_feat(uid):
    """
    Per-landmark speed statistics for one clip's pose file (all landmarks).

    Reads ``{VIDEO_POSE_DIR}/{uid}.pose``, drops axis 1 of the pose data (the
    original notes give the raw shape as (T, 1, 576, 3) -- TODO confirm), and
    measures each landmark's frame-to-frame displacement as an L2 norm over
    the last axis. The mean and the max displacement over time are
    concatenated (2 values per landmark; 1152 for 576 landmarks),
    signed-square-root scaled and L2-normalised.

    Args:
        uid: clip identifier; also the stem of the ``.pose`` file name.

    Returns:
        1-D feature vector, left un-normalised when its norm is 0.
    """
    # Context manager so the file handle is closed as soon as the bytes are read.
    with open(f"{VIDEO_POSE_DIR}/{uid}.pose", "rb") as pose_file:
        buf = pose_file.read()
    coords = Pose.read(buf).body.data.squeeze(1)
    # NOTE(review): a clip with fewer than two frames gives an empty `diffs`,
    # for which the reductions below are not meaningful -- confirm T >= 2 holds.
    diffs = np.linalg.norm(coords[1:] - coords[:-1], axis=2)
    mean_sp = diffs.mean(axis=0)
    max_sp = diffs.max(axis=0)
    feat = np.concatenate([mean_sp, max_sp])
    # Signed square root, then L2 normalisation (skipped for a zero vector).
    feat = np.sign(feat) * np.sqrt(np.abs(feat) + 1e-8)
    norm = np.linalg.norm(feat)
    return feat / norm if norm > 0 else feat


Both feature types (improved I3D and pose velocity) are extracted for the prototype and test sets, ready to be fused.

In [None]:
# I3D (3072-d) and velocity (1152-d) descriptors, stacked one row per clip,
# first for the prototype split and then for the test split.
X_i3d_imp, X_i3d_imp_te = (
    np.stack([improved_i3d_feat(uid) for uid in split.uid])
    for split in (df_proto, df_test)
)

X_vel_tr, X_vel_te = (
    np.stack([pose_velocity_feat(uid) for uid in split.uid])
    for split in (df_proto, df_test)
)

Normalize the new feature set

In [None]:
from sklearn.preprocessing import normalize

# Row-wise L2 normalisation of every feature matrix. The extractors already
# normalise their output, so this is only a safeguard.
X_i3d_tr_n, X_i3d_te_n, X_vel_tr_n, X_vel_te_n = (
    normalize(matrix, axis=1)
    for matrix in (X_i3d_imp, X_i3d_imp_te, X_vel_tr, X_vel_te)
)

Build and evaluate the fusion of the two feature sets as a weighted sum of their cosine similarities (including velocity only brings the accuracy down compared with pure I3D alone)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def eval_fusion(beta):
    """
    Score-level fusion of I3D and full-body velocity similarities.

    Reads the module-level matrices ``X_i3d_te_n`` / ``X_i3d_tr_n`` and
    ``X_vel_te_n`` / ``X_vel_tr_n`` together with the ``gloss`` columns of
    ``df_test`` / ``df_proto``, and prints top-1/5/10 accuracy on one line.

    Args:
        beta: weight on the I3D similarity; the velocity similarity is
            weighted by ``1 - beta``.

    Returns:
        None.
    """
    # Cosine similarities per modality, each of shape (N_test, N_proto).
    S_i3d = cosine_similarity(X_i3d_te_n, X_i3d_tr_n)
    S_vel = cosine_similarity(X_vel_te_n, X_vel_tr_n)

    # Weighted sum of the two score matrices, then rank prototypes best-first.
    S_fused = beta * S_i3d + (1 - beta) * S_vel
    ranks = np.argsort(-S_fused, axis=1)

    # Positional label lists. Indexing the Series as `df_test.gloss[i]` is a
    # label-based lookup and only matches row i when the index is 0..N-1.
    proto_labels = df_proto.gloss.tolist()
    test_labels = df_test.gloss.tolist()

    def topk(k):
        # Delegate to the notebook's shared metric instead of re-implementing it.
        return topk_acc(ranks, proto_labels, test_labels, k)

    print(f"β={beta:.2f}  Top‑1: {topk(1):.2f}%  Top‑5: {topk(5):.2f}%  Top‑10: {topk(10):.2f}%")

for beta in [0.0, 0.25, 0.5, 0.75, 1.0]:
    eval_fusion(beta)


β=0.00  Top‑1: 7.05%  Top‑5: 9.32%  Top‑10: 10.50%
β=0.25  Top‑1: 13.44%  Top‑5: 16.46%  Top‑10: 18.73%
β=0.50  Top‑1: 14.35%  Top‑5: 17.77%  Top‑10: 19.82%
β=0.75  Top‑1: 15.14%  Top‑5: 18.38%  Top‑10: 20.26%
β=1.00  Top‑1: 17.20%  Top‑5: 21.05%  Top‑10: 22.84%


Extract Full-Body, Hand, Face, and Face+Hand Velocity Features

In [None]:
# Landmark index ranges (start inclusive, stop exclusive) along the landmark axis.
_FACE_SPAN = (33, 501)    # 468 face landmarks
_HANDS_SPAN = (501, 543)  # 42 hand landmarks (21 left + 21 right per the original notes)


def _region_velocity_feat(uid, spans):
    """
    Shared implementation of the region-specific velocity features.

    Reads ``<VIDEO_POSE_DIR>/<uid>.pose``, drops axis 1 of the pose data,
    keeps the landmark ranges listed in ``spans`` (in the given order),
    computes each kept landmark's frame-to-frame displacement as an L2 norm,
    and aggregates it over time by mean and max. The concatenated statistics
    are signed-square-root scaled and L2-normalised.

    Args:
        uid: clip identifier; also the stem of the ``.pose`` file name.
        spans: sequence of ``(start, stop)`` landmark index ranges.

    Returns:
        1-D vector with 2 values per selected landmark, left un-normalised
        when its norm is 0.
    """
    # Context manager so the file handle is closed as soon as the bytes are read.
    with open(os.path.join(VIDEO_POSE_DIR, f"{uid}.pose"), "rb") as pose_file:
        buf = pose_file.read()
    coords = Pose.read(buf).body.data.squeeze(1)

    regions = [coords[:, start:stop] for start, stop in spans]
    # A single region is used as sliced; several regions are joined along the landmark axis.
    pts = regions[0] if len(regions) == 1 else np.concatenate(regions, axis=1)

    diffs = np.linalg.norm(pts[1:] - pts[:-1], axis=2)
    feat = np.concatenate([diffs.mean(axis=0), diffs.max(axis=0)])
    feat = np.sign(feat) * np.sqrt(np.abs(feat) + 1e-8)
    norm = np.linalg.norm(feat)
    return feat / norm if norm > 0 else feat


def pose_velocity_feat_hands(uid):
    """
    Compute velocity features for just the hands.
    Uses landmark indices 501:543 (42 landmarks).
    Final feature dimension: 2 * 42 = 84.
    """
    return _region_velocity_feat(uid, [_HANDS_SPAN])


def pose_velocity_feat_face(uid):
    """
    Compute velocity features for just the face.
    Uses landmark indices 33:501 (468 landmarks).
    Final feature dimension: 2 * 468 = 936.
    """
    return _region_velocity_feat(uid, [_FACE_SPAN])


def pose_velocity_feat_face_hands(uid):
    """
    Compute velocity features for both face and hands.
    Face landmarks (468) come first, followed by hand landmarks (42): 510 in total.
    Final feature dimension: 2 * 510 = 1020.
    """
    return _region_velocity_feat(uid, [_FACE_SPAN, _HANDS_SPAN])


Prepare Features for All Modalities

In [None]:
# Improved I3D descriptors (3072-d), one row per clip, for both splits.
X_i3d_tr, X_i3d_te = (
    np.stack([improved_i3d_feat(clip_uid) for clip_uid in split.uid])
    for split in (df_proto, df_test)
)
# Row-wise L2 normalisation of both matrices.
X_i3d_tr_n, X_i3d_te_n = (
    normalize(matrix, axis=1) for matrix in (X_i3d_tr, X_i3d_te)
)

In [None]:
# Hand-velocity descriptors (84-d), one row per clip, for both splits.
X_vel_hands_tr, X_vel_hands_te = (
    np.stack([pose_velocity_feat_hands(clip_uid) for clip_uid in split.uid])
    for split in (df_proto, df_test)
)
# Row-wise L2 normalisation of both matrices.
X_vel_hands_tr_n, X_vel_hands_te_n = (
    normalize(matrix, axis=1) for matrix in (X_vel_hands_tr, X_vel_hands_te)
)

In [None]:
# Face-velocity descriptors (936-d), one row per clip, for both splits.
X_vel_face_tr, X_vel_face_te = (
    np.stack([pose_velocity_feat_face(clip_uid) for clip_uid in split.uid])
    for split in (df_proto, df_test)
)
# Row-wise L2 normalisation of both matrices.
X_vel_face_tr_n, X_vel_face_te_n = (
    normalize(matrix, axis=1) for matrix in (X_vel_face_tr, X_vel_face_te)
)

In [None]:
# Face+hands velocity descriptors (1020-d), one row per clip, for both splits.
X_vel_face_hands_tr, X_vel_face_hands_te = (
    np.stack([pose_velocity_feat_face_hands(clip_uid) for clip_uid in split.uid])
    for split in (df_proto, df_test)
)
# Row-wise L2 normalisation of both matrices.
X_vel_face_hands_tr_n, X_vel_face_hands_te_n = (
    normalize(matrix, axis=1)
    for matrix in (X_vel_face_hands_tr, X_vel_face_hands_te)
)

Fusion Evaluation Functions

In [None]:
def eval_score_level_fusion(beta, X_vel_tr_n, X_vel_te_n, y_tr, y_te):
    """
    Score-level fusion of I3D and velocity similarities.

    The I3D similarity is computed from the module-level ``X_i3d_te_n`` /
    ``X_i3d_tr_n`` matrices and the velocity similarity from the matrices
    passed in. The two score matrices are blended as
    ``beta * I3D + (1 - beta) * velocity``, prototypes are ranked by the
    blended score, and top-1/5/10 accuracy is printed (one line per k).

    Args:
        beta: weight given to the I3D similarity.
        X_vel_tr_n: velocity features of the prototype clips, one row per clip.
        X_vel_te_n: velocity features of the test clips, one row per clip.
        y_tr: prototype labels.
        y_te: test labels.

    Returns:
        None.
    """
    i3d_scores = cosine_similarity(X_i3d_te_n, X_i3d_tr_n)
    vel_scores = cosine_similarity(X_vel_te_n, X_vel_tr_n)
    # Negated so that argsort orders prototypes from highest to lowest score.
    order = np.argsort(-(beta * i3d_scores + (1 - beta) * vel_scores), axis=1)
    for k in (1, 5, 10):
        acc = topk_acc(order, y_tr, y_te, k)
        print(f"Score-level: β={beta:.2f}  Top-{k} Accuracy: {acc:.2f}%")


In [None]:
def eval_feature_level_fusion(beta, X_vel_tr_n, X_vel_te_n, y_tr, y_te):
    """
    Feature-level fusion of I3D and velocity descriptors.

    For each split, the module-level I3D matrix is scaled by ``beta`` and the
    given velocity matrix by ``1 - beta``; the two are concatenated column-wise
    and the result is normalised row-wise. Test rows are then scored against
    prototype rows by dot product, prototypes are ranked by that score, and
    top-1/5/10 accuracy is printed (one line per k).

    Args:
        beta: weight applied to the I3D features.
        X_vel_tr_n: velocity features of the prototype clips, one row per clip.
        X_vel_te_n: velocity features of the test clips, one row per clip.
        y_tr: prototype labels.
        y_te: test labels.

    Returns:
        None.
    """
    vel_weight = 1 - beta

    def fuse(i3d_part, vel_part):
        # Weighted column-wise concatenation followed by row normalisation.
        joined = np.concatenate((beta * i3d_part, vel_weight * vel_part), axis=1)
        return normalize(joined, axis=1)

    proto_feats = fuse(X_i3d_tr_n, X_vel_tr_n)
    query_feats = fuse(X_i3d_te_n, X_vel_te_n)

    scores = query_feats.dot(proto_feats.T)
    order = np.argsort(-scores, axis=1)
    for k in (1, 5, 10):
        acc = topk_acc(order, y_tr, y_te, k)
        print(f"Feature-level: β={beta:.2f}  Top-{k} Accuracy: {acc:.2f}%")


Evaluation: Different Fusion Strategies.

In [None]:
# Ground truth gloss lists
y_train = df_proto.gloss.tolist()
y_test = df_test.gloss.tolist()

# Fusion weights to try (weight on I3D; the velocity weight is 1 - beta).
coarse_betas = [0.0, 0.25, 0.5, 0.75, 1.0]

# (display name, prototype velocity features, test velocity features)
coarse_modalities = [
    ("Hands", X_vel_hands_tr_n, X_vel_hands_te_n),
    ("Face", X_vel_face_tr_n, X_vel_face_te_n),
    ("Face+Hands", X_vel_face_hands_tr_n, X_vel_face_hands_te_n),
]

# (display name, evaluation function)
coarse_evaluators = [
    ("Score-level", eval_score_level_fusion),
    ("Feature-level", eval_feature_level_fusion),
]

# One data-driven loop replaces six copy-pasted loops. The printed text is
# unchanged: every section header except the very first is preceded by a blank line.
header_prefix = ""
for modality_name, vel_tr_n, vel_te_n in coarse_modalities:
    for fusion_name, evaluate in coarse_evaluators:
        print(f"{header_prefix}=== FUSION EVALUATION ({fusion_name}) with Velocity from {modality_name} ===")
        header_prefix = "\n"
        for beta in coarse_betas:
            evaluate(beta, vel_tr_n, vel_te_n, y_train, y_test)


=== FUSION EVALUATION (Score-level) with Velocity from Hands ===
Score-level: β=0.00  Top-1 Accuracy: 2.63%
Score-level: β=0.00  Top-5 Accuracy: 3.98%
Score-level: β=0.00  Top-10 Accuracy: 5.03%
Score-level: β=0.25  Top-1 Accuracy: 16.76%
Score-level: β=0.25  Top-5 Accuracy: 20.13%
Score-level: β=0.25  Top-10 Accuracy: 21.58%
Score-level: β=0.50  Top-1 Accuracy: 16.81%
Score-level: β=0.50  Top-5 Accuracy: 20.00%
Score-level: β=0.50  Top-10 Accuracy: 21.88%
Score-level: β=0.75  Top-1 Accuracy: 16.81%
Score-level: β=0.75  Top-5 Accuracy: 20.04%
Score-level: β=0.75  Top-10 Accuracy: 22.06%
Score-level: β=1.00  Top-1 Accuracy: 17.20%
Score-level: β=1.00  Top-5 Accuracy: 21.05%
Score-level: β=1.00  Top-10 Accuracy: 22.84%

=== FUSION EVALUATION (Feature-level) with Velocity from Hands ===
Feature-level: β=0.00  Top-1 Accuracy: 2.63%
Feature-level: β=0.00  Top-5 Accuracy: 3.98%
Feature-level: β=0.00  Top-10 Accuracy: 5.03%
Feature-level: β=0.25  Top-1 Accuracy: 16.02%
Feature-level: β=0.25  

A finer sweep of the fusion weight β between 0.75 and 1.0 shows that velocity features can push accuracy up very slightly.

In [None]:
# Fine-grained sweep of the fusion weight close to the I3D-only end (beta = 1).
beta_values = np.linspace(0.75, 1.0, num=6)

# (display name, prototype velocity features, test velocity features)
fine_modalities = [
    ("Hands Only", X_vel_hands_tr_n, X_vel_hands_te_n),
    ("Face Only", X_vel_face_tr_n, X_vel_face_te_n),
    ("Face+Hands", X_vel_face_hands_tr_n, X_vel_face_hands_te_n),
]

# (display name, evaluation function)
fine_evaluators = [
    ("Score-level", eval_score_level_fusion),
    ("Feature-level", eval_feature_level_fusion),
]

# One data-driven loop replaces six copy-pasted loops; the printed headers
# and result lines are unchanged and appear in the same order.
for modality_label, vel_tr_n, vel_te_n in fine_modalities:
    for fusion_label, evaluate in fine_evaluators:
        print(f"=== Evaluation: {fusion_label} Fusion with Velocity ({modality_label}) ===")
        for beta in beta_values:
            evaluate(beta, vel_tr_n, vel_te_n, y_train, y_test)


=== Evaluation: Score-level Fusion with Velocity (Hands Only) ===
Score-level: β=0.75  Top-1 Accuracy: 16.81%
Score-level: β=0.75  Top-5 Accuracy: 20.04%
Score-level: β=0.75  Top-10 Accuracy: 22.06%
Score-level: β=0.80  Top-1 Accuracy: 16.81%
Score-level: β=0.80  Top-5 Accuracy: 20.26%
Score-level: β=0.80  Top-10 Accuracy: 22.28%
Score-level: β=0.85  Top-1 Accuracy: 17.02%
Score-level: β=0.85  Top-5 Accuracy: 20.48%
Score-level: β=0.85  Top-10 Accuracy: 22.45%
Score-level: β=0.90  Top-1 Accuracy: 17.24%
Score-level: β=0.90  Top-5 Accuracy: 20.61%
Score-level: β=0.90  Top-10 Accuracy: 22.63%
Score-level: β=0.95  Top-1 Accuracy: 17.20%
Score-level: β=0.95  Top-5 Accuracy: 20.79%
Score-level: β=0.95  Top-10 Accuracy: 22.93%
Score-level: β=1.00  Top-1 Accuracy: 17.20%
Score-level: β=1.00  Top-5 Accuracy: 21.05%
Score-level: β=1.00  Top-10 Accuracy: 22.84%
=== Evaluation: Feature-level Fusion with Velocity (Hands Only) ===
Feature-level: β=0.75  Top-1 Accuracy: 17.24%
Feature-level: β=0.75 