In [1]:
# !pip install tensorflow==2.16.1 \
#                numpy==1.26.4 \
#                pandas \
#                matplotlib \
#                scikit-learn \
#                opencv-python \
#                mediapipe \
#                tqdm \
#                h5py \
#                GPUtil \
#                psutil


In [2]:
#%pip install --upgrade --no-deps keras==3.1.1

SECTION 1: IMPORTS AND SETUP

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import cv2
import mediapipe as mp
import os
import json
import pickle
from pathlib import Path
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Enable GPU memory growth and mixed precision for better performance
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")
print("Setup completed successfully!")

TensorFlow version: 2.16.2
GPU Available: False
Setup completed successfully!


# SECTION 2: DATASET CONFIGURATION
# UPDATE YOUR DATASET PATH HERE!

In [4]:
DATASET_PATH = r"C:\Users\Shravan\Downloads\ISL_CSLRT_Corpus\ISL_CSLRT_Corpus" # UPDATE THIS PATH!

# Configuration parameters
MAX_SEQUENCE_LENGTH = 150  # Maximum frames to process per sequence
TARGET_SIZE = (224, 224)   # Image size for CNN processing
BATCH_SIZE = 8             # Batch size for training
EPOCHS = 200               # Number of training epochs
VOCAB_SIZE = 5000          # Vocabulary size for text tokenizer
MAX_TEXT_LENGTH = 50       # Maximum sentence length

print(f"Dataset path: {DATASET_PATH}")
print(f"Configuration loaded successfully!")

Dataset path: C:\Users\Shravan\Downloads\ISL_CSLRT_Corpus\ISL_CSLRT_Corpus
Configuration loaded successfully!


 SECTION 3: DATASET EXPLORER CLASS
# This helps you understand your dataset structure before processing

In [5]:
class ISLDatasetExplorer:
    """Explore and understand ISL-CSLTR dataset structure"""
    
    def __init__(self, dataset_path):
        self.dataset_path = Path(dataset_path)
        self.structure = {}
        
    def explore_dataset(self):
        """Explore the dataset structure - IMAGE FILES ONLY"""
        print("=== ISL-CSLTR Dataset Structure Analysis ===")
        print(f"Dataset path: {self.dataset_path}")
        
        if not self.dataset_path.exists():
            print(f"‚ùå Error: Dataset path {self.dataset_path} does not exist!")
            return {}
        
        # Find all files and directories
        all_items = list(self.dataset_path.rglob('*'))
        
        # Categorize files - REMOVED video file extensions
        images = [f for f in all_items if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp']]
        annotations = [f for f in all_items if f.suffix.lower() in ['.json', '.csv', '.txt', '.xml']]
        directories = [f for f in all_items if f.is_dir()]
        
        print(f"\nüìä Found:")
        print(f"   Images: {len(images)}")
        print(f"   Annotation files: {len(annotations)}")
        print(f"   Directories: {len(directories)}")
        
        # Show sample images
        if images:
            print(f"\nüñºÔ∏è Sample images:")
            for img in images[:5]:
                rel_path = img.relative_to(self.dataset_path)
                size_mb = img.stat().st_size / (1024 * 1024)
                print(f"   - {rel_path} ({size_mb:.1f} MB)")
        
        if annotations:
            print(f"\nüìÑ Annotation files:")
            for ann in annotations:
                rel_path = ann.relative_to(self.dataset_path)
                print(f"   - {rel_path}")
        
        self.structure = {
            'images': images,
            'annotations': annotations,
            'directories': directories
        }
        
        return self.structure
    
    def analyze_annotations(self):
        """Analyze annotation files to understand format"""
        print("\n=== Annotation Analysis ===")
        
        for ann_file in self.structure.get('annotations', [])[:3]:  # Analyze first 3 files
            print(f"\nüìã Analyzing: {ann_file.name}")
            try:
                if ann_file.suffix.lower() == '.json':
                    with open(ann_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    print(f"   JSON structure: {type(data)}")
                    if isinstance(data, dict):
                        print(f"   Keys: {list(data.keys())[:5]}")
                    elif isinstance(data, list):
                        print(f"   List length: {len(data)}")
                        if data:
                            print(f"   Sample: {str(data[0])[:100]}...")
                
                elif ann_file.suffix.lower() == '.csv':
                    df = pd.read_csv(ann_file)
                    print(f"   CSV shape: {df.shape}")
                    print(f"   Columns: {list(df.columns)}")
                    if not df.empty:
                        print(f"   Sample row: {df.iloc[0].to_dict()}")
                
            except Exception as e:
                print(f"   ‚ùå Error: {e}")

# ============================================================================
# SECTION 4: RUN DATASET EXPLORATION
# Execute this to understand your dataset structure
# ============================================================================

In [6]:
explorer = ISLDatasetExplorer(DATASET_PATH)
dataset_structure = explorer.explore_dataset()

# Run detailed analysis
if dataset_structure.get('images'):
    print("\nüîç Running detailed analysis...")
    explorer.analyze_annotations()
else:
    print("‚ùå No images found! Please check your dataset path.")

=== ISL-CSLTR Dataset Structure Analysis ===
Dataset path: C:\Users\Shravan\Downloads\ISL_CSLRT_Corpus\ISL_CSLRT_Corpus

üìä Found:
   Images: 19899
   Annotation files: 2
   Directories: 897

üñºÔ∏è Sample images:
   - Frames_Sentence_Level\are you free today\1\are you free today 01.jpg (0.0 MB)
   - Frames_Sentence_Level\are you free today\1\are you free today 02.jpg (0.0 MB)
   - Frames_Sentence_Level\are you free today\1\are you free today 03.jpg (0.0 MB)
   - Frames_Sentence_Level\are you free today\1\are you free today 04.jpg (0.0 MB)
   - Frames_Sentence_Level\are you free today\1\are you free today 05.jpg (0.0 MB)

üìÑ Annotation files:
   - ISL_CSLRT.txt
   - corpus_csv_files\ISL Corpus sign glosses.csv

üîç Running detailed analysis...

=== Annotation Analysis ===

üìã Analyzing: ISL_CSLRT.txt

üìã Analyzing: ISL Corpus sign glosses.csv
   CSV shape: (101, 2)
   Columns: ['Sentence', 'SIGN GLOSSES']
   Sample row: {'Sentence': 'are you free today', 'SIGN GLOSSES': 'YOU 

# SECTION 5: DATA MANAGER CLASS
# Handles video processing and feature extraction for folder-structured dataset

In [7]:
class ISLContinuousDataManager:
    """Data manager for ISL-CSLTR continuous sign language dataset - IMAGE FRAMES ONLY"""
    
    def __init__(self, base_path, max_sequence_length=150, target_size=(224, 224)):
        self.base_path = Path(base_path)
        self.max_sequence_length = max_sequence_length
        self.target_size = target_size
        
        # Feature dimensions
        self.cnn_feature_dim = 1280  # MobileNetV2 features
        self.mp_feature_dim = 1662   # MediaPipe features
        self.total_feature_dim = self.cnn_feature_dim + self.mp_feature_dim
        
        # Text processing
        self.text_tokenizer = None
        self.vocab_size = VOCAB_SIZE
        self.max_text_length = MAX_TEXT_LENGTH
        
        # Setup MediaPipe
        self.mp_holistic = mp.solutions.holistic
        self.mp_drawing = mp.solutions.drawing_utils
        
        # Setup CNN model
        self.setup_cnn_extractor()
        
        # Dataset metadata
        self.sequence_annotations = {}
        self.signer_info = {}
        
        print(f"‚úÖ ISL Data Manager initialized")
        print(f"   Max sequence length: {max_sequence_length}")
        print(f"   Total feature dimension: {self.total_feature_dim}")
    
    def setup_cnn_extractor(self):
        """Setup pre-trained CNN for feature extraction"""
        from tensorflow.keras.applications import MobileNetV2
        
        self.cnn_model = MobileNetV2(
            weights='imagenet',
            include_top=False,
            input_shape=(*self.target_size, 3),
            pooling='avg'
        )
        
        # Freeze CNN layers for feature extraction
        for layer in self.cnn_model.layers:
            layer.trainable = False
            
        print(f"   CNN feature extractor ready: {self.cnn_model.output_shape}")
    
    def extract_mediapipe_features(self, results):
        """Extract MediaPipe keypoints from holistic results"""
        pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
        face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
        lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
        rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
        return np.concatenate([pose, face, lh, rh])
    
    def extract_cnn_features(self, image_batch):
        """Extract CNN features from batch of images"""
        processed_batch = tf.keras.applications.mobilenet_v2.preprocess_input(image_batch)
        features = self.cnn_model(processed_batch, training=False)
        return features.numpy()
    
    def load_isl_annotations(self):
        """Load ISL-CSLTR dataset annotations and metadata"""
        print("üìñ Loading ISL-CSLTR annotations...")
        
        # Look for annotation files
        annotation_patterns = [
            '*.json', '*.csv', '*annotations*', '*labels*', '*metadata*', 
            '*sentences*', '*glosses*', '*translations*'
        ]
        
        annotation_files = []
        for pattern in annotation_patterns:
            annotation_files.extend(self.base_path.rglob(pattern))
        
        # Remove duplicates
        annotation_files = list(set(annotation_files))
        print(f"   Found {len(annotation_files)} potential annotation files")
        
        annotations = {}
        
        for ann_file in annotation_files:
            try:
                print(f"   Processing {ann_file.name}...")
                
                if ann_file.suffix.lower() == '.json':
                    with open(ann_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    
                    # Handle different JSON structures
                    if isinstance(data, dict):
                        for key, value in data.items():
                            if isinstance(value, dict) and any(field in value for field in ['sentence', 'translation', 'gloss', 'text']):
                                annotations[key] = value
                            elif isinstance(value, str) and len(value.split()) > 1:
                                annotations[key] = {'sentence': value, 'text': value}
                    
                    elif isinstance(data, list):
                        for item in data:
                            if isinstance(item, dict):
                                sequence_id = item.get('sequence_id') or item.get('folder') or item.get('id')
                                if sequence_id:
                                    annotations[sequence_id] = item
                
                elif ann_file.suffix.lower() == '.csv':
                    df = pd.read_csv(ann_file)
                    
                    # Find relevant columns
                    sequence_cols = [col for col in df.columns if any(term in col.lower() for term in ['sequence', 'folder', 'name', 'id'])]
                    text_cols = [col for col in df.columns if any(term in col.lower() for term in ['sentence', 'translation', 'gloss', 'text'])]
                    signer_cols = [col for col in df.columns if 'signer' in col.lower()]
                    
                    if sequence_cols and text_cols:
                        sequence_col = sequence_cols[0]
                        text_col = text_cols[0]
                        signer_col = signer_cols[0] if signer_cols else None
                        
                        for _, row in df.iterrows():
                            sequence_id = str(row[sequence_col])
                            text = str(row[text_col])
                            
                            annotation = {
                                'sentence': text,
                                'text': text,
                                'translation': text
                            }
                            
                            if signer_col:
                                annotation['signer'] = str(row[signer_col])
                            
                            annotations[sequence_id] = annotation
                
                elif ann_file.suffix.lower() == '.txt':
                    with open(ann_file, 'r', encoding='utf-8') as f:
                        lines = f.readlines()
                    
                    for line in lines:
                        line = line.strip()
                        if ':' in line and len(line.split(':')) == 2:
                            sequence_id, text = line.split(':', 1)
                            annotations[sequence_id.strip()] = {
                                'sentence': text.strip(),
                                'text': text.strip()
                            }
                        elif '\t' in line:
                            parts = line.split('\t')
                            if len(parts) >= 2:
                                sequence_id, text = parts[0], parts[1]
                                annotations[sequence_id.strip()] = {
                                    'sentence': text.strip(),
                                    'text': text.strip()
                                }
            
            except Exception as e:
                print(f"   ‚ùå Error processing {ann_file}: {e}")
        
        self.sequence_annotations = annotations
        print(f"   ‚úÖ Loaded annotations for {len(annotations)} sequences")
        
        # Extract signer information
        signers = set()
        for ann in annotations.values():
            if isinstance(ann, dict) and 'signer' in ann:
                signers.add(ann['signer'])
        
        if signers:
            self.signer_info = {signer: idx for idx, signer in enumerate(sorted(signers))}
            print(f"   Found {len(signers)} signers: {list(signers)}")
        
        return annotations


# SECTION 6: INITIALIZE DATA MANAGER
# Create the data manager instance for folder-structured dataset

In [14]:
# Initialize the data manager
print("üöÄ Initializing ISL Data Manager...")
data_manager = ISLContinuousDataManager(
    base_path=DATASET_PATH,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    target_size=TARGET_SIZE
)

# Load annotations
annotations = data_manager.load_isl_annotations()
print(f"‚úÖ Data manager initialized with {len(annotations)} annotations")

üöÄ Initializing ISL Data Manager...
   CNN feature extractor ready: (None, 1280)
‚úÖ ISL Data Manager initialized
   Max sequence length: 150
   Total feature dimension: 2942
üìñ Loading ISL-CSLTR annotations...
   Found 4 potential annotation files
   Processing sentences.pickle...
   Processing metadata.pickle...
   Processing ISL Corpus sign glosses.csv...
   Processing processing_params.json...
   ‚úÖ Loaded annotations for 0 sequences
‚úÖ Data manager initialized with 0 annotations


# SECTION 7: IMAGE SEQUENCE PROCESSING FUNCTIONS
# Functions to process image frame sequences and extract features

In [15]:
def find_image_sequences(base_path):
    """
    Find all image sequences organized in folders.
    Assumes structure like: base_path/sequence_name/frame_001.jpg, frame_002.jpg, etc.
    """
    base_path = Path(base_path)
    sequences = {}
    
    print(f"   Searching for image sequences in: {base_path}")
    
    # Find all directories that contain images
    all_dirs = [d for d in base_path.rglob('*') if d.is_dir()]
    print(f"   Found {len(all_dirs)} directories to check")
    
    for folder in all_dirs:
        # Get all images in this folder
        images = sorted([
            f for f in folder.iterdir() 
            if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp']
        ])
        
        if images:
            sequence_name = folder.name
            sequences[sequence_name] = images
            print(f"   ‚úì Sequence '{sequence_name}': {len(images)} frames")
    
    print(f"   Total sequences found: {len(sequences)}")
    return sequences

def process_image_sequence(data_manager, image_paths, annotation=None, signer_id=None):
    """
    Process a sequence of image frames for sign language recognition.
    REPLACES: process_continuous_video() - now works with image lists instead of video files
    """
    total_frames = len(image_paths)
    
    if total_frames == 0:
        print(f"‚ö†Ô∏è Warning: No frames in sequence")
        return None
    
    # Handle variable length sequences
    if total_frames > data_manager.max_sequence_length:
        # Sample frames uniformly
        frame_indices = np.linspace(0, total_frames-1, data_manager.max_sequence_length, dtype=int)
        selected_images = [image_paths[i] for i in frame_indices]
    else:
        selected_images = image_paths
    
    # Read selected frames
    frames = []
    for img_path in selected_images:
        try:
            frame = cv2.imread(str(img_path))
            if frame is not None:
                frames.append(frame)
            else:
                print(f"‚ö†Ô∏è Warning: Could not read {img_path}")
        except Exception as e:
            print(f"‚ö†Ô∏è Error reading {img_path}: {e}")
        
        if len(frames) >= data_manager.max_sequence_length:
            break
    
    if len(frames) == 0:
        print(f"‚ö†Ô∏è Warning: No frames successfully loaded")
        return None
    
    # Process frames with MediaPipe and CNN
    mp_features = []
    cnn_images = []
    
    with data_manager.mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for frame in frames:
            # MediaPipe processing
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(frame_rgb)
            mp_feat = data_manager.extract_mediapipe_features(results)
            mp_features.append(mp_feat)
            
            # Prepare image for CNN
            img_resized = cv2.resize(frame, data_manager.target_size)
            cnn_images.append(img_resized)
    
    # Extract CNN features in batch
    cnn_batch = np.array(cnn_images, dtype=np.float32)
    cnn_features = data_manager.extract_cnn_features(cnn_batch)
    
    # Combine features
    hybrid_sequence = []
    actual_length = len(frames)
    
    for i in range(len(frames)):
        combined = np.concatenate([cnn_features[i], mp_features[i]])
        hybrid_sequence.append(combined)
    
    # Pad sequence to max_sequence_length if necessary
    while len(hybrid_sequence) < data_manager.max_sequence_length:
        hybrid_sequence.append(np.zeros(data_manager.total_feature_dim))
    
    return {
        'features': np.array(hybrid_sequence[:data_manager.max_sequence_length]),
        'sentence': annotation.get('sentence', '') if annotation else '',
        'text': annotation.get('text', '') if annotation else '',
        'signer': signer_id,
        'actual_length': actual_length,
        'sequence_name': image_paths[0].parent.name if image_paths else 'unknown'
    }

def process_isl_dataset(data_manager, save_processed=True):
    """
    Process the entire ISL-CSLTR dataset from image sequences.
    UPDATED: Now processes image folders instead of video files
    """
    print("üé¨ Processing ISL-CSLTR Dataset (Image Sequences)...")
    
    # First, let's check what's in the dataset path
    print(f"   Scanning dataset path: {data_manager.base_path}")
    
    # Find all image sequences
    sequences = find_image_sequences(data_manager.base_path)
    
    if not sequences:
        print("‚ùå No image sequences found! Please check your dataset structure.")
        print("Expected structure: base_path/sequence_name/frame_001.jpg, frame_002.jpg, etc.")
        print("\nTrying alternative: Looking for loose images...")
        
        # Alternative: check if images are directly in base_path without subdirectories
        all_images = sorted([
            f for f in data_manager.base_path.rglob('*')
            if f.is_file() and f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp']
        ])
        
        if all_images:
            print(f"   Found {len(all_images)} loose image files")
            print(f"   Sample paths: {[str(img.relative_to(data_manager.base_path)) for img in all_images[:3]]}")
            
            # Group by parent directory
            from collections import defaultdict
            sequences_dict = defaultdict(list)
            for img in all_images:
                parent_name = img.parent.name if img.parent != data_manager.base_path else 'root'
                sequences_dict[parent_name].append(img)
            
            sequences = dict(sequences_dict)
            print(f"   Organized into {len(sequences)} sequences by parent folder")
        else:
            print("   No images found at all in the dataset path!")
            return [], None
    
    print(f"   Found {len(sequences)} image sequences")
    
    # ADDED: Show annotation matching info
    print(f"\nüìã Annotation Matching:")
    print(f"   Available annotations: {len(data_manager.sequence_annotations)}")
    if data_manager.sequence_annotations:
        print(f"   Sample annotation keys: {list(data_manager.sequence_annotations.keys())[:5]}")
    print(f"   Sample sequence names: {list(sequences.keys())[:5]}")
    
    processed_data = []
    all_texts = []
    
    # ADDED: Track annotation matching
    matched_count = 0
    unmatched_sequences = []
    
    # Process each sequence with progress bar
    for sequence_name, image_paths in tqdm(sequences.items(), desc="Processing sequences"):
        try:
            # Find annotation for this sequence
            annotation = None
            signer_id = None
            
            # IMPROVED: Try multiple matching strategies
            # Strategy 1: Direct match
            if sequence_name in data_manager.sequence_annotations:
                annotation = data_manager.sequence_annotations[sequence_name]
                matched_count += 1
            
            # Strategy 2: Try with parent folder name
            if not annotation and image_paths:
                parent_folder = image_paths[0].parent.parent.name
                if parent_folder in data_manager.sequence_annotations:
                    annotation = data_manager.sequence_annotations[parent_folder]
                    matched_count += 1
            
            # Strategy 3: Try case-insensitive match
            if not annotation:
                for key in data_manager.sequence_annotations.keys():
                    if key.lower() == sequence_name.lower():
                        annotation = data_manager.sequence_annotations[key]
                        matched_count += 1
                        break
            
            # Strategy 4: Try numeric match (e.g., '001' matches '1')
            if not annotation and sequence_name.isdigit():
                for key in data_manager.sequence_annotations.keys():
                    if key.lstrip('0') == sequence_name.lstrip('0'):
                        annotation = data_manager.sequence_annotations[key]
                        matched_count += 1
                        break
            
            if not annotation:
                unmatched_sequences.append(sequence_name)
            
            if isinstance(annotation, dict):
                signer_id = annotation.get('signer')
            
            # Process image sequence
            result = process_image_sequence(data_manager, image_paths, annotation, signer_id)
            
            if result is not None:
                processed_data.append(result)
                if result['text']:
                    all_texts.append(result['text'])
        
        except Exception as e:
            print(f"‚ùå Error processing {sequence_name}: {e}")
            continue
    
    # ADDED: Show matching statistics
    print(f"\nüìä Annotation Matching Results:")
    print(f"   Sequences with annotations: {matched_count}/{len(sequences)}")
    print(f"   Sequences without annotations: {len(unmatched_sequences)}")
    if unmatched_sequences[:5]:
        print(f"   Sample unmatched: {unmatched_sequences[:5]}")
    
    print(f"\n‚úÖ Successfully processed {len(processed_data)} sequences")
    print(f"   Sequences with text: {len(all_texts)}")
    
    # Setup text tokenizer
    if all_texts:
        setup_text_tokenizer(data_manager, all_texts)
    else:
        print("‚ö†Ô∏è WARNING: No text annotations found! Model will train without text labels.")
        print("   Check that your annotation file format matches the sequence names.")
    
    # Save processed data
    if save_processed:
        save_path = save_processed_data(data_manager, processed_data)
        return processed_data, save_path
    
    return processed_data, None

def setup_text_tokenizer(data_manager, text_data):
    """Setup tokenizer for sentence annotations"""
    # FIXED: Updated import for TensorFlow 2.x compatibility
    try:
        from tensorflow.keras.preprocessing.text import Tokenizer
    except ImportError:
        try:
            from keras.preprocessing.text import Tokenizer
        except ImportError:
            # For TensorFlow 2.16+, use keras_preprocessing
            from keras_preprocessing.text import Tokenizer
    
    print("üìù Setting up text tokenizer...")
    
    # Clean text data
    cleaned_texts = []
    for text in text_data:
        if isinstance(text, str) and text.strip():
            cleaned = text.lower().strip()
            cleaned = ''.join(c if c.isalnum() or c.isspace() or c in '.,!?' else ' ' for c in cleaned)
            cleaned_texts.append(cleaned)
    
    if not cleaned_texts:
        print("‚ö†Ô∏è No valid text data found for tokenizer")
        return
    
    # Create tokenizer
    data_manager.text_tokenizer = Tokenizer(
        num_words=data_manager.vocab_size,
        oov_token='<OOV>',
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    )
    
    data_manager.text_tokenizer.fit_on_texts(cleaned_texts)
    print(f"   ‚úÖ Tokenizer created with {len(data_manager.text_tokenizer.word_index)} unique words")
    
    # Save tokenizer
    with open(data_manager.base_path / 'isl_text_tokenizer.pickle', 'wb') as f:
        pickle.dump(data_manager.text_tokenizer, f)

def save_processed_data(data_manager, processed_data):
    """
    Save processed ISL-CSLTR features and metadata.
    UPDATED: Changed 'video_paths' to 'sequence_names' to reflect image-based processing
    """
    save_path = data_manager.base_path / 'processed_isl_features'
    save_path.mkdir(exist_ok=True)
    
    print(f"üíæ Saving processed data to {save_path}")
    
    # Separate data components
    features = []
    texts = []
    sentences = []
    signers = []
    lengths = []
    sequence_names = []
    
    for item in processed_data:
        features.append(item['features'])
        texts.append(item['text'] if item['text'] else '')
        sentences.append(item['sentence'] if item['sentence'] else '')
        signers.append(item['signer'] if item['signer'] else 'unknown')
        lengths.append(item['actual_length'])
        sequence_names.append(item['sequence_name'])
    
    # Save arrays and metadata
    np.save(save_path / 'sequence_features.npy', np.array(features))
    
    with open(save_path / 'texts.pickle', 'wb') as f:
        pickle.dump(texts, f)
    
    with open(save_path / 'sentences.pickle', 'wb') as f:
        pickle.dump(sentences, f)
    
    # Save metadata
    metadata = {
        'signers': signers,
        'sequence_lengths': lengths,
        'sequence_names': sequence_names,
        'signer_info': data_manager.signer_info
    }
    
    with open(save_path / 'metadata.pickle', 'wb') as f:
        pickle.dump(metadata, f)
    
    # Save parameters
    params = {
        'max_sequence_length': data_manager.max_sequence_length,
        'target_size': data_manager.target_size,
        'total_feature_dim': data_manager.total_feature_dim,
        'num_samples': len(processed_data),
        'num_signers': len(data_manager.signer_info),
        'vocab_size': data_manager.vocab_size
    }
    
    with open(save_path / 'processing_params.json', 'w') as f:
        json.dump(params, f, indent=2)
    
    print(f"   ‚úÖ Saved {len(processed_data)} samples")
    print(f"   Features shape: {np.array(features).shape}")
    print(f"   Unique signers: {len(set(signers))}")
    print(f"   Text samples: {len([t for t in texts if t])}")
    
    return save_path


# SECTION 8: PROCESS DATASET

In [16]:
# Check if processed data already exists
processed_path = Path(DATASET_PATH) / 'processed_isl_features'

if processed_path.exists():
    print("üìÇ Found existing processed data. Loading...")
    
    # Load existing processed data
    features = np.load(processed_path / 'sequence_features.npy')
    
    with open(processed_path / 'texts.pickle', 'rb') as f:
        texts = pickle.load(f)
    
    with open(processed_path / 'metadata.pickle', 'rb') as f:
        metadata = pickle.load(f)
    
    # Load tokenizer
    tokenizer_path = Path(DATASET_PATH) / 'isl_text_tokenizer.pickle'
    if tokenizer_path.exists():
        with open(tokenizer_path, 'rb') as f:
            tokenizer = pickle.load(f)
    else:
        tokenizer = None
    
    print(f"‚úÖ Loaded {len(features)} processed samples")
    print(f"   Feature shape: {features.shape}")
    
else:
    print("üé¨ Processing dataset for the first time...")
    print("‚ö†Ô∏è This may take several hours depending on dataset size!")
    
    # Process the dataset
    processed_data, save_path = process_isl_dataset(data_manager, save_processed=True)
    
    # Extract processed components
    features = np.array([item['features'] for item in processed_data])
    texts = [item['text'] for item in processed_data]
    metadata = {
        'signers': [item['signer'] for item in processed_data],
        'sequence_lengths': [item['actual_length'] for item in processed_data],
        'sequence_names': [item['sequence_name'] for item in processed_data]
    }
    
    # Load tokenizer
    tokenizer_path = Path(DATASET_PATH) / 'isl_text_tokenizer.pickle'
    if tokenizer_path.exists():
        with open(tokenizer_path, 'rb') as f:
            tokenizer = pickle.load(f)
    else:
        tokenizer = None
    
    print("‚úÖ Dataset processing completed!")

üìÇ Found existing processed data. Loading...
‚úÖ Loaded 121 processed samples
   Feature shape: (121, 150, 2942)
