In [53]:
# Hugging Face Video Processing Pipeline for Neural Network Training Dataset Creation
# This notebook downloads videos from hugging face, processes them through the complete pipeline, and creates training datasets

print("=== Video Processing Pipeline ===")
print("This code downloads videos from hugging face, processes them, and creates training datasets")
print("Step 1: Installing required dependencies...")

# Install yt-dlp for YouTube downloading
%pip install yt-dlp --quiet
%pip install python-dotenv --quiet

# Check if all required libraries are available
import subprocess
import sys

required_packages = [
    #  we dont need yt-dlp for hugging face
    'cv2', 'mediapipe', 'librosa', 'moviepy', 
    'numpy', 'json', 'tqdm', 'pathlib', 'matplotlib',
    'huggingface_hub' , 'pandas' , 'python-dotenv'
]

print("\nChecking required packages:")
for package in required_packages:
    try:
        if package == 'python-dotenv':
            __import__("dotenv")
        else:
            __import__(package)
        print(f"✅ {package}")
    except ImportError:
        print(f"❌ {package} - has  not been installed please install it first")

print("\n🎉 All dependencies are ready!")

=== Video Processing Pipeline ===
This code downloads videos from hugging face, processes them, and creates training datasets
Step 1: Installing required dependencies...

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

Checking required packages:
✅ cv2
✅ mediapipe
✅ librosa
✅ moviepy
✅ numpy
✅ json
✅ tqdm
✅ pathlib
✅ matplotlib
✅ huggingface_hub
✅ pandas
✅ python-dotenv

🎉 All dependencies are ready!


In [54]:
# HUgging face Dataset URLs

huggingface_test_dataset_urls = [
    {"name":"test_video" , "url": "https://huggingface.co/datasets/sanjuhs/test-video-dataset"},
]

huggingface_actual_dataset_urls = [
    {"name":"ml_sanjay_5_15min_datasets" , "url": "https://huggingface.co/datasets/sanjuhs/ml_video_dataset"},

]

# these are the  corresponding youtube URLs, the same has been uploaded to hugging face
test_urls = [ {"name":"test_video" , "url": "https://youtu.be/1wO0Rx9REAA"}]

actual_urls = [
    {"name":"test_video" , "url": "https://youtu.be/1wO0Rx9REAA"},
    {"name":"ml_sanjay_assortmentSounds55_15min_dataset" , "url": "https://youtu.be/W_-ZsKm_MZc"},
    {"name":"ml_sanjay_frutratedAngry_15min_dataset" , "url": "https://youtu.be/OnrWa45WCdI"},
    {"name":"ml_sanjay_explanationRnn_15min_dataset" , "url": "https://youtu.be/iqrOrChYwlw"},
    {"name":"ml_sanjay_commentary_15min_dataset" , "url": "https://youtu.be/mu3StD1YpR4"},
    {"name":"ml_sanjay_calmSinging_15min_dataset" , "url": "https://youtu.be/V1xG5qYp98U"},
    ]

print("loaded all the URLs")


loaded all the URLs


In [55]:
# Hugging Face Dataset Downloader
# Install required dependencies for Hugging Face datasets

print("Installing Hugging Face Hub library...")
%pip install huggingface_hub --quiet

# Import required libraries
import os
import json
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
import requests
from tqdm import tqdm

class HuggingFaceDatasetDownloader:
    def __init__(self, work_dir="hf_downloads"):
        """
        Initialize the Hugging Face dataset downloader
        
        Args:
            work_dir: Directory to download datasets to
        """
        self.work_dir = Path(work_dir)
        self.work_dir.mkdir(exist_ok=True)
        
        print(f"🤗 Hugging Face Dataset Downloader initialized")
        print(f"Download directory: {self.work_dir.absolute()}")
    
    def get_dataset_info(self, repo_id):
        """
        Get information about a Hugging Face dataset
        
        Args:
            repo_id: Repository ID (e.g., "sanjuhs/test-video-dataset")
        
        Returns:
            dict: Dataset information
        """
        try:
            # Get list of files in the repository
            files = list_repo_files(repo_id, repo_type="dataset")
            
            print(f"📋 Dataset: {repo_id}")
            print(f"📁 Files found: {len(files)}")
            
            # Categorize files by type
            video_files = [f for f in files if f.endswith(('.mp4', '.webm', '.mkv', '.avi'))]
            json_files = [f for f in files if f.endswith('.json')]
            other_files = [f for f in files if not f.endswith(('.mp4', '.webm', '.mkv', '.avi', '.json'))]
            
            info = {
                'repo_id': repo_id,
                'total_files': len(files),
                'video_files': video_files,
                'json_files': json_files,
                'other_files': other_files,
                'all_files': files
            }
            
            print(f"🎥 Video files: {len(video_files)}")
            print(f"📄 JSON files: {len(json_files)}")
            print(f"📎 Other files: {len(other_files)}")
            
            return info
            
        except Exception as e:
            print(f"❌ Error getting dataset info: {e}")
            return None
    
    def download_dataset_files(self, repo_id, dataset_name, file_types=['video', 'json']):
        """
        Download specific file types from a Hugging Face dataset
        
        Args:
            repo_id: Repository ID (e.g., "sanjuhs/test-video-dataset")
            dataset_name: Local name for the dataset
            file_types: List of file types to download ('video', 'json', 'all')
        
        Returns:
            dict: Download results
        """
        print(f"\n📥 Downloading dataset: {repo_id}")
        
        # Create dataset directory
        dataset_dir = self.work_dir / dataset_name
        dataset_dir.mkdir(exist_ok=True)
        
        # Get dataset info
        dataset_info = self.get_dataset_info(repo_id)
        if not dataset_info:
            return None
        
        downloaded_files = []
        failed_files = []
        
        # Determine which files to download
        files_to_download = []
        
        if 'all' in file_types:
            files_to_download = dataset_info['all_files']
        else:
            if 'video' in file_types:
                files_to_download.extend(dataset_info['video_files'])
            if 'json' in file_types:
                files_to_download.extend(dataset_info['json_files'])
            if 'other' in file_types:
                files_to_download.extend(dataset_info['other_files'])
        
        print(f"📦 Downloading {len(files_to_download)} files...")
        
        # Download each file
        for filename in tqdm(files_to_download, desc="Downloading files"):
            try:
                print(f"  📥 Downloading: {filename}")
                
                # Download file to dataset directory
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    repo_type="dataset",
                    local_dir=dataset_dir,
                    local_dir_use_symlinks=False
                )
                
                downloaded_files.append({
                    'filename': filename,
                    'local_path': local_path,
                    'size_mb': os.path.getsize(local_path) / (1024 * 1024)
                })
                
                print(f"    ✅ Downloaded to: {local_path}")
                
            except Exception as e:
                print(f"    ❌ Failed to download {filename}: {e}")
                failed_files.append({'filename': filename, 'error': str(e)})
        
        # Create download summary
        results = {
            'repo_id': repo_id,
            'dataset_name': dataset_name,
            'dataset_dir': str(dataset_dir),
            'total_files_requested': len(files_to_download),
            'downloaded_files': downloaded_files,
            'failed_files': failed_files,
            'success_count': len(downloaded_files),
            'failure_count': len(failed_files),
            'total_size_mb': sum([f['size_mb'] for f in downloaded_files])
        }
        
        # Save download summary
        summary_file = dataset_dir / "download_summary.json"
        with open(summary_file, 'w') as f:
            json.dump(results, f, indent=2)
        
        print(f"\n✅ Download complete!")
        print(f"📁 Dataset saved to: {dataset_dir}")
        print(f"📊 Downloaded: {results['success_count']}/{results['total_files_requested']} files")
        print(f"💾 Total size: {results['total_size_mb']:.2f} MB")
        
        if failed_files:
            print(f"⚠️  Failed downloads: {len(failed_files)}")
        
        return results
    
    def list_downloaded_datasets(self):
        """List all downloaded datasets"""
        if not self.work_dir.exists():
            print("No downloads directory found")
            return []
        
        datasets = []
        for item in self.work_dir.iterdir():
            if item.is_dir():
                summary_file = item / "download_summary.json"
                if summary_file.exists():
                    with open(summary_file, 'r') as f:
                        summary = json.load(f)
                    datasets.append(summary)
                else:
                    # Directory exists but no summary - manual inspection
                    files = list(item.glob("*"))
                    datasets.append({
                        'dataset_name': item.name,
                        'dataset_dir': str(item),
                        'file_count': len(files),
                        'files': [f.name for f in files]
                    })
        
        return datasets

# Initialize the downloader
print("\n🚀 Initializing Hugging Face Dataset Downloader...")
hf_downloader = HuggingFaceDatasetDownloader(work_dir="hf_datasets")

print("\n📋 Available datasets:")
print("Test datasets:")
for dataset in huggingface_test_dataset_urls:
    print(f"  - {dataset['name']}: {dataset['url']}")

print("\nActual datasets:")  
for dataset in huggingface_actual_dataset_urls:
    print(f"  - {dataset['name']}: {dataset['url']}")

print("\n✅ Hugging Face downloader ready!")


Installing Hugging Face Hub library...

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

🚀 Initializing Hugging Face Dataset Downloader...
🤗 Hugging Face Dataset Downloader initialized
Download directory: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/hf_datasets

📋 Available datasets:
Test datasets:
  - test_video: https://huggingface.co/datasets/sanjuhs/test-video-dataset

Actual datasets:
  - ml_sanjay_5_15min_datasets: https://huggingface.co/datasets/sanjuhs/ml_video_dataset

✅ Hugging Face downloader ready!


In [57]:
# Download and Use Hugging Face Datasets

def download_dataset_hf( dataset_url ):
    """Download the test dataset from Hugging Face"""

    for ds in dataset_url:
        print("dataset object is" , ds)
        print("dataset name is" , ds["name"] , "data set url is" , ds["url"])

    for ds in dataset_url:
        print(" Dl data sets one by one")
        print(" Now downloadingd ataset name is" , ds["name"] , "data set url is" , ds["url"])
    
        repo_id = ds["url"].split("/")[-1]  # Extract repo name from URL
        full_repo_id = f"sanjuhs/{repo_id}"  # Full repository ID
        
        print(f"🔍 Getting info for test dataset: {full_repo_id}")
        
        # Get dataset information first
        dataset_info = hf_downloader.get_dataset_info(full_repo_id)
        print("dataset_info" , dataset_info)
        
        if dataset_info:
            print(f"\n📥 Downloading test dataset...")
            # Download all files (video + json)
            results = hf_downloader.download_dataset_files(
                repo_id=full_repo_id,
                dataset_name=ds["name"],
                file_types=['all']  # Download everything
            )
            return results
        else:
            print("❌ Could not get dataset information")
            return None


# Example usage - Download and process test dataset
print("="*80)
print("🚀 DOWNLOADING HUGGING FACE DATASETS")
print("="*80)

# Option 1: Download and process test dataset
print("\n1️⃣ TEST DATASET:")
# dataset_results = download_dataset_hf(huggingface_test_dataset_urls) # change test to actual to download actual dataset
dataset_results = download_dataset_hf(huggingface_actual_dataset_urls) # change test to actual to download actual dataset


if dataset_results:
    print(f"\n✅ Dataset downloaded successfully!")
    print(f"📁 Location: {dataset_results['dataset_dir']}")
    print(f"📊 Files: {dataset_results['success_count']} downloaded")
    
print("\n" + "="*80)


🚀 DOWNLOADING HUGGING FACE DATASETS

1️⃣ TEST DATASET:
dataset object is {'name': 'ml_sanjay_5_15min_datasets', 'url': 'https://huggingface.co/datasets/sanjuhs/ml_video_dataset'}
dataset name is ml_sanjay_5_15min_datasets data set url is https://huggingface.co/datasets/sanjuhs/ml_video_dataset
 Dl data sets one by one
 Now downloadingd ataset name is ml_sanjay_5_15min_datasets data set url is https://huggingface.co/datasets/sanjuhs/ml_video_dataset
🔍 Getting info for test dataset: sanjuhs/ml_video_dataset
📋 Dataset: sanjuhs/ml_video_dataset
📁 Files found: 6
🎥 Video files: 5
📄 JSON files: 0
📎 Other files: 1
dataset_info {'repo_id': 'sanjuhs/ml_video_dataset', 'total_files': 6, 'video_files': ['ml_sanjay_assortmentSounds55_15min_dataset.mp4', 'ml_sanjay_calmSinging_15min_dataset.mp4', 'ml_sanjay_commentary_15min_dataset.mp4', 'ml_sanjay_explanationRnn_15min_dataset.mp4', 'ml_sanjay_frutratedAngry_15min_dataset.mp4'], 'json_files': [], 'other_files': ['.gitattributes'], 'all_files': ['.gi

Downloading files:   0%|          | 0/6 [00:00<?, ?it/s]

  📥 Downloading: .gitattributes


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
Downloading files:  17%|█▋        | 1/6 [00:00<00:04,  1.02it/s]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/.gitattributes
  📥 Downloading: ml_sanjay_assortmentSounds55_15min_dataset.mp4


Downloading files:  33%|███▎      | 2/6 [04:17<10:04, 151.15s/it]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_assortmentSounds55_15min_dataset.mp4
  📥 Downloading: ml_sanjay_calmSinging_15min_dataset.mp4


Downloading files:  50%|█████     | 3/6 [09:00<10:35, 211.67s/it]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_calmSinging_15min_dataset.mp4
  📥 Downloading: ml_sanjay_commentary_15min_dataset.mp4


Downloading files:  67%|██████▋   | 4/6 [12:36<07:06, 213.21s/it]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_commentary_15min_dataset.mp4
  📥 Downloading: ml_sanjay_explanationRnn_15min_dataset.mp4


Downloading files:  83%|████████▎ | 5/6 [17:00<03:51, 231.47s/it]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_explanationRnn_15min_dataset.mp4
  📥 Downloading: ml_sanjay_frutratedAngry_15min_dataset.mp4


Downloading files: 100%|██████████| 6/6 [21:49<00:00, 218.28s/it]

    ✅ Downloaded to: hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_frutratedAngry_15min_dataset.mp4

✅ Download complete!
📁 Dataset saved to: hf_datasets/ml_sanjay_5_15min_datasets
📊 Downloaded: 6/6 files
💾 Total size: 7677.63 MB

✅ Dataset downloaded successfully!
📁 Location: hf_datasets/ml_sanjay_5_15min_datasets
📊 Files: 6 downloaded






In [58]:
# step 0 video analysis functions

#!/usr/bin/env python3
"""
Video Analysis Script - Analyze video properties and quality
"""

import cv2
import numpy as np
import os
import sys
from pathlib import Path

def analyze_video(video_path):
    """
    Analyze video file properties and quality
    """
    if not os.path.exists(video_path):
        print(f"Error: Video file not found at {video_path}")
        return None

    # Open video file
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return None

    # Get video properties
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    duration = frame_count / fps if fps > 0 else 0

    # Get file size
    file_size = os.path.getsize(video_path) / (1024 * 1024)  # MB

    # Sample frames to check quality
    sample_frames = []
    frame_indices = np.linspace(0, frame_count - 1, min(10, frame_count), dtype=int)

    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            sample_frames.append(frame)

    cap.release()

    # Analyze frame quality
    avg_brightness = 0
    avg_contrast = 0
    if sample_frames:
        brightnesses = []
        contrasts = []
        for frame in sample_frames:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            brightnesses.append(np.mean(gray))
            contrasts.append(np.std(gray))

        avg_brightness = np.mean(brightnesses)
        avg_contrast = np.mean(contrasts)

    # Compile results
    analysis = {
        'file_path': video_path,
        'file_size_mb': file_size,
        'frame_count': frame_count,
        'fps': fps,
        'width': width,
        'height': height,
        'duration_seconds': duration,
        'duration_minutes': duration / 60,
        'resolution': f"{width}x{height}",
        'avg_brightness': avg_brightness,
        'avg_contrast': avg_contrast,
        'bitrate_mbps': (file_size * 8) / duration if duration > 0 else 0,
    }

    return analysis

def print_analysis_report(analysis):
    """
    Print a formatted analysis report
    """
    if not analysis:
        return

    print("="*60)
    print("VIDEO ANALYSIS REPORT")
    print("="*60)
    print(f"File: {analysis['file_path']}")
    print(f"File Size: {analysis['file_size_mb']:.2f} MB")
    print(f"Duration: {analysis['duration_minutes']:.2f} minutes ({analysis['duration_seconds']:.1f} seconds)")
    print(f"Resolution: {analysis['resolution']}")
    print(f"Frame Rate: {analysis['fps']:.2f} FPS")
    print(f"Total Frames: {analysis['frame_count']}")
    print(f"Estimated Bitrate: {analysis['bitrate_mbps']:.2f} Mbps")
    print()
    print("QUALITY METRICS:")
    print(f"Average Brightness: {analysis['avg_brightness']:.1f} (0-255)")
    print(f"Average Contrast: {analysis['avg_contrast']:.1f}")
    print()

    # Quality recommendations
    print("RECOMMENDATIONS:")

    # Check FPS consistency
    if analysis['fps'] < 25:
        print("⚠️  Low frame rate detected. Consider using 25-30 FPS for better model training.")
    elif analysis['fps'] > 35:
        print("ℹ️  High frame rate detected. You may downsample to 25-30 FPS to reduce computational load.")
    else:
        print("✅ Frame rate is suitable for training (25-30 FPS range).")

    # Check resolution
    if analysis['width'] < 640 or analysis['height'] < 480:
        print("⚠️  Low resolution detected. Higher resolution may improve face tracking accuracy.")
    else:
        print("✅ Resolution is adequate for face tracking.")

    # Check brightness
    if analysis['avg_brightness'] < 80:
        print("⚠️  Video appears dark. Consider brightness adjustment for better face detection.")
    elif analysis['avg_brightness'] > 200:
        print("⚠️  Video appears overexposed. Consider brightness adjustment.")
    else:
        print("✅ Brightness levels appear good.")

    # Check contrast
    if analysis['avg_contrast'] < 20:
        print("⚠️  Low contrast detected. May affect feature extraction quality.")
    else:
        print("✅ Contrast levels appear adequate.")

    print("="*60)

def step0_main( video_path):
    """
    Main function to analyze the video
    """

    # root path
    root_path = Path.cwd()  # use current working directory in Colab
    print(f"Root path: {root_path}")

    print("Starting video analysis...")
    analysis = analyze_video(video_path)

    if analysis:
        print_analysis_report(analysis)

        # Save analysis to file
        # lets also make the name unique by adding the video name to the file name
        import json
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        output_file = f"{root_path}/data/analysis/0_video_analysis_{video_name}.json"

        if not os.path.exists(output_file):
            os.makedirs(os.path.dirname(output_file), exist_ok=True)

        with open(output_file, 'w') as f:
            json.dump(analysis, f, indent=2)
        print(f"Analysis saved to: {output_file}")
    else:
        print("Failed to analyze video.")
        sys.exit(1)

# for all videos in the dataset you need to analyze and do step 0 and store in 
# so we need to first eget the path of the video/s from the dataset path

print(dataset_results)

# so from teh below we have to get paths of teh Vidoes or MP4 files etc
# {'repo_id': 'sanjuhs/test-video-dataset', 'dataset_name': 'test_video', 'dataset_dir': 'hf_datasets/test_video', 'total_files_requested': 3, 'downloaded_files': [{'filename': '.gitattributes', 'local_path': 'hf_datasets/test_video/.gitattributes', 'size_mb': 0.0023469924926757812}, {'filename': 'README.md', 'local_path': 'hf_datasets/test_video/README.md', 'size_mb': 2.288818359375e-05}, {'filename': 'test.mp4', 'local_path': 'hf_datasets/test_video/test.mp4', 'size_mb': 10.61563491821289}], 'failed_files': [], 'success_count': 3, 'failure_count': 0, 'total_size_mb': 10.61800479888916}

for f in dataset_results["downloaded_files"]:
    if f["filename"].endswith(".mp4"):
        video_path = f["local_path"]
        print("will now process step 0 for video" , video_path)
        step0_main(video_path)
        print("finsihed step0  for video" , f["filename"])


{'repo_id': 'sanjuhs/ml_video_dataset', 'dataset_name': 'ml_sanjay_5_15min_datasets', 'dataset_dir': 'hf_datasets/ml_sanjay_5_15min_datasets', 'total_files_requested': 6, 'downloaded_files': [{'filename': '.gitattributes', 'local_path': 'hf_datasets/ml_sanjay_5_15min_datasets/.gitattributes', 'size_mb': 0.0023469924926757812}, {'filename': 'ml_sanjay_assortmentSounds55_15min_dataset.mp4', 'local_path': 'hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_assortmentSounds55_15min_dataset.mp4', 'size_mb': 1513.3162021636963}, {'filename': 'ml_sanjay_calmSinging_15min_dataset.mp4', 'local_path': 'hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_calmSinging_15min_dataset.mp4', 'size_mb': 1506.6638174057007}, {'filename': 'ml_sanjay_commentary_15min_dataset.mp4', 'local_path': 'hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_commentary_15min_dataset.mp4', 'size_mb': 1493.8126392364502}, {'filename': 'ml_sanjay_explanationRnn_15min_dataset.mp4', 'local_path': 'hf_datasets/ml_sanjay_5_15min_dat

In [None]:
# step 1 extract blendshapes

#!/usr/bin/env python3
"""
MediaPipe Blendshapes and Head Pose Extraction Script
Extracts 52 blendshapes + 7 head pose values (x,y,z,qw,qx,qy,qz) = 59 values per frame
"""

import cv2
import mediapipe as mp
import numpy as np
import json
import os
from pathlib import Path
from tqdm import tqdm
import sys
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

class FaceBlendshapeExtractor:
    def __init__(self):
        """Initialize MediaPipe Face Landmarker with blendshapes"""
        self.mp_face_mesh = mp.solutions.face_mesh
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        
        # MediaPipe blendshape names (52 categories)
        self.blendshape_names = [
            '_neutral', 'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft', 
            'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight', 'eyeBlinkLeft', 
            'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight', 'eyeLookInLeft', 'eyeLookInRight', 
            'eyeLookOutLeft', 'eyeLookOutRight', 'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 
            'eyeSquintRight', 'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen', 
            'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight', 'mouthFrownLeft', 
            'mouthFrownRight', 'mouthFunnel', 'mouthLeft', 'mouthLowerDownLeft', 'mouthLowerDownRight', 
            'mouthPressLeft', 'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower', 
            'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper', 'mouthSmileLeft', 'mouthSmileRight', 
            'mouthStretchLeft', 'mouthStretchRight', 'mouthUpperUpLeft', 'mouthUpperUpRight', 
            'noseSneerLeft', 'noseSneerRight'
        ]
        
        # Download the face landmarker model if it doesn't exist
        model_path = self._download_face_landmarker_model()
        
        # Create Face Landmarker
        base_options = python.BaseOptions(model_asset_path=model_path)
        options = vision.FaceLandmarkerOptions(
            base_options=base_options,
            output_face_blendshapes=True,
            output_facial_transformation_matrixes=True,
            num_faces=1
        )
        self.detector = vision.FaceLandmarker.create_from_options(options)
        
    def _download_face_landmarker_model(self):
        """Download the face landmarker model if it doesn't exist"""
        model_dir = Path("models")
        model_dir.mkdir(exist_ok=True)
        model_path = model_dir / "face_landmarker_v2_with_blendshapes.task"
        
        if not model_path.exists():
            print("Downloading Face Landmarker model...")
            import urllib.request
            url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
            try:
                urllib.request.urlretrieve(url, model_path)
                print(f"Model downloaded to: {model_path}")
            except Exception as e:
                print(f"Error downloading model: {e}")
                print("Please download the model manually from:")
                print(url)
                sys.exit(1)
        
        return str(model_path)
    
    def extract_from_video(self, video_path, output_dir="extracted_features", max_frames=None, fps_limit=30):
        """
        Extract blendshapes and head pose from video. Default fps limit is 30. 
        If this function is called with no mention of fps_limit, then it will consider the fps_limit as 30.
        If fps_limit is explicitly mentioned as None, then it will process the entire video at the fps of the video.
        
        Args:
            video_path: Path to input video
            output_dir: Directory to save extracted features
            max_frames: Maximum number of frames to process (for testing)
            fps_limit: Target FPS for extraction (default 30)
        
        Returns:
            dict: Extracted features data
        """
        
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")
        
        # Create output directory
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        # Open video
        cap = cv2.VideoCapture(video_path)
        
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")
        
        # Get video properties
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        
        # Determine effective FPS limit
        if fps_limit is None:
            fps_limit = fps
            print(f"FPS limit not provided, will process at original {fps} FPS")
        else:
            print(f"FPS limit: {fps_limit}, original video FPS: {fps}")
        
        # Calculate frame interval based on FPS limit
        if fps_limit >= fps:
            # Process every frame if fps_limit is higher than or equal to video fps
            frame_interval = 1
            effective_fps = fps
        else:
            # Skip frames to achieve target fps_limit
            frame_interval = int(fps / fps_limit)
            effective_fps = fps_limit
        
        print(f"Frame interval: {frame_interval} (processing every {frame_interval} frame(s))")
        print(f"Effective extraction FPS: {effective_fps}")
        
        # Calculate frames to process
        frames_to_process = list(range(0, total_frames, frame_interval))
        if max_frames:
            frames_to_process = frames_to_process[:max_frames]
        
        print(f"Processing {len(frames_to_process)} frames out of {total_frames} total frames...")
        
        # Storage for extracted features
        frame_data = []
        failed_frames = []
        
        # Process frames
        pbar = tqdm(total=len(frames_to_process), desc="Extracting features")
        
        for i, frame_idx in enumerate(frames_to_process):
            # Seek to specific frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            
            if not ret:
                print(f"Failed to read frame {frame_idx}")
                failed_frames.append(frame_idx)
                # Add placeholder for failed frame
                blendshapes = {name: 0.0 for name in self.blendshape_names}
                placeholder = {
                    'frame_index': frame_idx,
                    'timestamp': frame_idx / fps,
                    'blendshapes': blendshapes,
                    'headPosition': {'x': 0.0, 'y': 0.0, 'z': 0.0},
                    'headRotation': {'w': 1.0, 'x': 0.0, 'y': 0.0, 'z': 0.0},
                    'has_face': False
                }
                frame_data.append(placeholder)
                pbar.update(1)
                continue
            
            # Convert BGR to RGB
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Create MediaPipe Image
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
            
            # Extract features
            features = self._extract_frame_features(mp_image, frame_idx)
            
            if features:
                # Update timestamp to use original frame index for accurate timing
                features['timestamp'] = frame_idx / fps
                frame_data.append(features)
            else:
                failed_frames.append(frame_idx)
                # Add placeholder data for failed frames
                blendshapes = {name: 0.0 for name in self.blendshape_names}
                placeholder = {
                    'frame_index': frame_idx,
                    'timestamp': frame_idx / fps,
                    'blendshapes': blendshapes,
                    'headPosition': {'x': 0.0, 'y': 0.0, 'z': 0.0},
                    'headRotation': {'w': 1.0, 'x': 0.0, 'y': 0.0, 'z': 0.0},
                    'has_face': False
                }
                frame_data.append(placeholder)
            
            pbar.update(1)
        
        pbar.close()
        cap.release()
        
        # Generate session ID based on timestamp
        import time
        session_start_time = int(time.time() * 1000)  # Current time in milliseconds
        session_id = f"session_{session_start_time}_extract"
        
        # Convert timestamps to milliseconds and add session ID
        for frame in frame_data:
            frame['timestamp'] = int(frame['timestamp'] * 1000)  # Convert to milliseconds
            frame['sessionId'] = session_id
        
        # Create final output structure similar to human head data
        output_data = {
            'sessionInfo': {
                'sessionId': session_id,
                'startTime': session_start_time,
                'targetFPS': fps_limit,
                'originalFPS': fps,
                'frameInterval': frame_interval,
                'videoPath': video_path
            },
            'frameCount': len(frame_data),
            'failedFrames': len(failed_frames),
            'failureRate': len(failed_frames) / len(frame_data) if frame_data else 1.0,
            'frames': frame_data
        }
        
        # Save single clean JSON file
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        output_file = Path(output_dir) / f"blendshapes_and_pose_{video_name}.json"
        with open(output_file, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        print(f"\\nExtraction complete!")
        print(f"Original video: {total_frames} frames at {fps} FPS")
        print(f"Processed frames: {len(frame_data)} at {effective_fps} FPS (every {frame_interval} frame(s))")
        print(f"Failed frames: {len(failed_frames)} ({output_data['failureRate']:.2%})")
        print(f"Features saved to: {output_file}")
        print(f"Session ID: {session_id}")
        
        return output_data
    
    def _extract_frame_features(self, mp_image, frame_idx):
        """
        Extract features from a single frame
        
        Args:
            mp_image: MediaPipe Image object
            frame_idx: Frame index
        
        Returns:
            dict: Frame features or None if extraction failed
        """
        try:
            # Detect face landmarks and blendshapes
            detection_result = self.detector.detect(mp_image)
            
            if not detection_result.face_landmarks:
                return None  # No face detected
            
            # Get first face (we only process one face)
            face_landmarks = detection_result.face_landmarks[0]
            
            # Extract blendshapes as named dictionary
            blendshapes = {}
            
            if detection_result.face_blendshapes:
                face_blendshapes = detection_result.face_blendshapes[0]
                for i, bs in enumerate(face_blendshapes):
                    if i < len(self.blendshape_names):
                        blendshapes[self.blendshape_names[i]] = bs.score
            else:
                # Fallback if no blendshapes detected
                for name in self.blendshape_names:
                    blendshapes[name] = 0.0
            
            # Extract head pose from transformation matrix
            head_position = {'x': 0.0, 'y': 0.0, 'z': 0.0}
            head_rotation = {'w': 1.0, 'x': 0.0, 'y': 0.0, 'z': 0.0}
            
            if detection_result.facial_transformation_matrixes:
                transform_matrix = detection_result.facial_transformation_matrixes[0]
                pose_array = self._matrix_to_pose(transform_matrix)
                head_position = {'x': pose_array[0], 'y': pose_array[1], 'z': pose_array[2]}
                head_rotation = {'w': pose_array[3], 'x': pose_array[4], 'y': pose_array[5], 'z': pose_array[6]}
            
            # Calculate timestamp
            timestamp = frame_idx / 30.0  # Will be updated with actual FPS later
            
            frame_features = {
                'frame_index': frame_idx,
                'timestamp': timestamp,
                'blendshapes': blendshapes,
                'headPosition': head_position,
                'headRotation': head_rotation,
                'has_face': True
            }
            
            return frame_features
            
        except Exception as e:
            print(f"Error processing frame {frame_idx}: {e}")
            return None
    
    def _matrix_to_pose(self, transform_matrix):
        """
        Convert 4x4 transformation matrix to translation + quaternion
        
        Args:
            transform_matrix: 4x4 transformation matrix
        
        Returns:
            list: [x, y, z, qw, qx, qy, qz]
        """
        try:
            # Convert to numpy array
            matrix = np.array(transform_matrix.data).reshape(4, 4)
            
            # Extract translation (x, y, z)
            translation = matrix[:3, 3]
            
            # Extract rotation matrix
            rotation_matrix = matrix[:3, :3]
            
            # Convert rotation matrix to quaternion
            quaternion = self._rotation_matrix_to_quaternion(rotation_matrix)
            
            # Normalize quaternion
            quaternion = quaternion / np.linalg.norm(quaternion)
            
            # Clamp translation to reasonable range (±0.2 m as suggested)
            translation = np.clip(translation, -0.2, 0.2)
            
            # Return as [x, y, z, qw, qx, qy, qz]
            pose = [translation[0], translation[1], translation[2], 
                   quaternion[0], quaternion[1], quaternion[2], quaternion[3]]
            
            return pose
            
        except Exception as e:
            print(f"Error converting matrix to pose: {e}")
            return [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]  # Default identity pose
    
    def _rotation_matrix_to_quaternion(self, R):
        """
        Convert 3x3 rotation matrix to quaternion [w, x, y, z]
        """
        trace = np.trace(R)
        
        if trace > 0:
            S = np.sqrt(trace + 1.0) * 2  # S = 4 * qw
            qw = 0.25 * S
            qx = (R[2, 1] - R[1, 2]) / S
            qy = (R[0, 2] - R[2, 0]) / S
            qz = (R[1, 0] - R[0, 1]) / S
        elif R[0, 0] > R[1, 1] and R[0, 0] > R[2, 2]:
            S = np.sqrt(1.0 + R[0, 0] - R[1, 1] - R[2, 2]) * 2  # S = 4 * qx
            qw = (R[2, 1] - R[1, 2]) / S
            qx = 0.25 * S
            qy = (R[0, 1] + R[1, 0]) / S
            qz = (R[0, 2] + R[2, 0]) / S
        elif R[1, 1] > R[2, 2]:
            S = np.sqrt(1.0 + R[1, 1] - R[0, 0] - R[2, 2]) * 2  # S = 4 * qy
            qw = (R[0, 2] - R[2, 0]) / S
            qx = (R[0, 1] + R[1, 0]) / S
            qy = 0.25 * S
            qz = (R[1, 2] + R[2, 1]) / S
        else:
            S = np.sqrt(1.0 + R[2, 2] - R[0, 0] - R[1, 1]) * 2  # S = 4 * qz
            qw = (R[1, 0] - R[0, 1]) / S
            qx = (R[0, 2] + R[2, 0]) / S
            qy = (R[1, 2] + R[2, 1]) / S
            qz = 0.25 * S
        
        return np.array([qw, qx, qy, qz])

def step1_main( video_path):
    """
    Main function to extract features from video
    """

    # root path
    root_path = Path.cwd()  # use current working directory in Colab
    print(f"Root path: {root_path}")

    # video_path = f"{root_path}/data/test.mp4"
    
    print("Initializing Face Blendshape Extractor...")
    extractor = FaceBlendshapeExtractor()
    
    print("Starting feature extraction...")
    # For initial testing, limit to first 1000 frames (~33 seconds at 30fps)
    # Remove max_frames=1000 to process the entire video
    extraction_data = extractor.extract_from_video(
        video_path, 
        output_dir=f"{root_path}/data/extracted_features",
        # max_frames=1000  # Remove this line to process full video
    )
    
    # Print summary
    print("\\n" + "="*60)
    print("FEATURE EXTRACTION SUMMARY")
    print("="*60)
    print(f"Session ID: {extraction_data['sessionInfo']['sessionId']}")
    print(f"Video: {extraction_data['sessionInfo']['videoPath']}")
    print(f"Total frames processed: {extraction_data['frameCount']}")
    print(f"Target FPS: {extraction_data['sessionInfo']['targetFPS']}")
    print(f"Original FPS: {extraction_data['sessionInfo']['originalFPS']:.2f}")
    print(f"Duration: {extraction_data['frameCount']/extraction_data['sessionInfo']['targetFPS']:.2f} seconds")
    print(f"Failed frames: {extraction_data['failedFrames']}")
    print(f"Success rate: {(1-extraction_data['failureRate'])*100:.1f}%")
    
    # Sample feature verification
    if extraction_data['frames']:
        sample_frame = extraction_data['frames'][0]
        print(f"\\nSample frame features:")
        print(f"  Blendshapes count: {len(sample_frame['blendshapes'])}")
        print(f"  Head position: {sample_frame['headPosition']}")
        print(f"  Head rotation: {sample_frame['headRotation']}")
        print(f"  Has face: {sample_frame['has_face']}")
        print(f"  Session ID: {sample_frame['sessionId']}")
    
    print("="*60)


print("Now doing step 1 extract blendshapes and pose for all videos in the dataset")
for f in dataset_results["downloaded_files"]:
    if f["filename"].endswith(".mp4"):
        video_path = f["local_path"]
        print("will now process step 1 for video" , video_path)
        step1_main(video_path)
        print("finsihed step1  for video" , f["filename"])
        


I0000 00:00:1757394356.553881 51401898 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1
W0000 00:00:1757394356.571178 51401898 face_landmarker_graph.cc:174] Sets FaceBlendshapesGraph acceleration to xnnpack by default.
W0000 00:00:1757394356.592810 52473064 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757394356.606550 52473064 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Now doing step 1 extract blendshapes and pose for all videos in the dataset
will now process step 1 for video hf_datasets/ml_sanjay_5_15min_datasets/ml_sanjay_assortmentSounds55_15min_dataset.mp4
Root path: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning
Initializing Face Blendshape Extractor...
Starting feature extraction...
FPS limit: 30, original video FPS: 30.0
Frame interval: 1 (processing every 1 frame(s))
Effective extraction FPS: 30.0
Processing 27385 frames out of 27385 total frames...


Extracting features:   7%|▋         | 2034/27385 [11:00<46:22,  9.11it/s]  

KeyboardInterrupt: 

Extracting features:   7%|▋         | 2034/27385 [11:16<46:22,  9.11it/s]

In [None]:
# step 2 extract audio features

#!/usr/bin/env python3
"""
Audio Feature Extraction Script
Extracts audio from video and computes mel spectrograms for the TCN model
"""

import librosa
import numpy as np
import soundfile as sf
import json
import os
from pathlib import Path
from tqdm import tqdm
from moviepy import VideoFileClip

class AudioFeatureExtractor:
    def __init__(self, sample_rate=16000, n_mels=80, hop_length=160, win_length=400, n_fft=512):
        """
        Initialize audio feature extractor
        
        Args:
            sample_rate: Target sample rate (16kHz for efficiency)
            n_mels: Number of mel filter banks (80 is standard)
            hop_length: Hop length in samples (10ms at 16kHz)
            win_length: Window length in samples (25ms at 16kHz)
            n_fft: FFT size
        """
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        
        # Calculate frame rate for mel spectrograms
        # At 16kHz with hop_length=160, we get 100 mel frames per second
        self.mel_frame_rate = sample_rate / hop_length
        
        print(f"Audio Feature Extractor initialized:")
        print(f"  Sample rate: {sample_rate} Hz")
        print(f"  Mel features: {n_mels}")
        print(f"  Mel frame rate: {self.mel_frame_rate} Hz")
        print(f"  Hop length: {hop_length} samples ({hop_length/sample_rate*1000:.1f}ms)")
        print(f"  Window length: {win_length} samples ({win_length/sample_rate*1000:.1f}ms)")
    
    def extract_audio_from_video(self, video_path, output_dir="extracted_features"):
        """
        Extract audio from video file
        
        Args:
            video_path: Path to input video
            output_dir: Directory to save audio
        
        Returns:
            str: Path to extracted audio file
        """
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")
        
        # Create output directory
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        print(f"Extracting audio from: {video_path}")
        
        # Load video with moviepy
        video = VideoFileClip(video_path)
        audio = video.audio
        
        # Save audio as WAV file
        video_name = os.path.splitext(os.path.basename(video_path))[0]

        audio_path = Path(output_dir) / f"extracted_audio_{video_name}.wav"
        audio.write_audiofile(str(audio_path), 
                             fps=self.sample_rate)
        
        # Clean up
        audio.close()
        video.close()
        
        print(f"Audio extracted to: {audio_path}")
        return str(audio_path)
    
    def extract_mel_features(self, audio_path, output_dir="extracted_features", max_duration=None):
        """
        Extract mel spectrogram features from audio
        
        Args:
            audio_path: Path to audio file
            output_dir: Directory to save features
            max_duration: Maximum duration to process (seconds, for testing)
        
        Returns:
            dict: Extracted audio features
        """
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        
        print(f"Loading audio from: {audio_path}")
        
        # Load audio
        audio, sr = librosa.load(audio_path, sr=self.sample_rate)
        
        # Limit duration for testing
        if max_duration:
            max_samples = int(max_duration * self.sample_rate)
            audio = audio[:max_samples]
            print(f"Limited audio to {max_duration} seconds ({len(audio)} samples)")
        
        print(f"Audio loaded: {len(audio)} samples, {len(audio)/sr:.2f} seconds")
        
        # Extract mel spectrogram
        print("Computing mel spectrogram...")
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sample_rate,
            n_mels=self.n_mels,
            hop_length=self.hop_length,
            win_length=self.win_length,
            n_fft=self.n_fft,
            fmin=0,
            fmax=self.sample_rate // 2
        )
        
        # Convert to log mel spectrogram (dB)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        
        # Transpose to (time, features) format
        mel_features = log_mel_spec.T  # Shape: (time_frames, n_mels)
        
        # Compute additional features
        print("Computing additional audio features...")
        
        # Voice Activity Detection (VAD) using RMS energy
        rms_energy = librosa.feature.rms(
            y=audio,
            hop_length=self.hop_length,
            frame_length=self.win_length
        )[0]
        
        # Simple VAD threshold (adjust based on your data)
        vad_threshold = np.percentile(rms_energy, 30)  # Bottom 30% is likely silence
        voice_activity = (rms_energy > vad_threshold).astype(float)
        
        # Zero Crossing Rate (useful for voiced/unvoiced detection)
        zcr = librosa.feature.zero_crossing_rate(
            audio,
            hop_length=self.hop_length,
            frame_length=self.win_length
        )[0]
        
        # Ensure all features have the same length
        min_length = min(len(mel_features), len(voice_activity), len(zcr))
        mel_features = mel_features[:min_length]
        voice_activity = voice_activity[:min_length]
        zcr = zcr[:min_length]
        
        # Create time stamps for each frame
        timestamps = librosa.frames_to_time(
            range(min_length),
            sr=self.sample_rate,
            hop_length=self.hop_length
        )
        
        # Compile features
        audio_features = {
            'audio_path': audio_path,
            'sample_rate': self.sample_rate,
            'duration_seconds': len(audio) / self.sample_rate,
            'n_mels': self.n_mels,
            'hop_length': self.hop_length,
            'mel_frame_rate': self.mel_frame_rate,
            'n_frames': min_length,
            'timestamps': timestamps.tolist(),
            'mel_features': mel_features.tolist(),  # Shape: (time, n_mels)
            'voice_activity': voice_activity.tolist(),
            'zero_crossing_rate': zcr.tolist(),
            'rms_energy': rms_energy[:min_length].tolist()
        }
        
        # Save features
        # lets also make the name unique by adding the video name to the file name
        video_name = os.path.splitext(os.path.basename(audio_path))[0].replace("extracted_audio_", "")

        output_file = Path(output_dir) / f"audio_features_{video_name}.json"
        with open(output_file, 'w') as f:
            json.dump(audio_features, f, indent=2)
        
        print(f"\\nAudio feature extraction complete!")
        print(f"Mel features shape: {mel_features.shape}")
        print(f"Features saved to: {output_file}")
        
        return audio_features
    
    def extract_from_video(self, video_path, output_dir="extracted_features", max_duration=None):
        """
        Complete pipeline: extract audio from video and compute features
        
        Args:
            video_path: Path to input video
            output_dir: Directory to save features
            max_duration: Maximum duration to process (seconds, for testing)
        
        Returns:
            dict: Extracted audio features
        """
        # Step 1: Extract audio from video
        audio_path = self.extract_audio_from_video(video_path, output_dir)
        
        # Step 2: Extract mel features from audio
        features = self.extract_mel_features(audio_path, output_dir, max_duration)
        
        return features

def step2_main(video_path):
    """
    Main function to extract audio features from video
    """

    # root path
    root_path = Path.cwd()  # use current working directory in Colab
    print(f"Root path: {root_path}")

    # video_path = f"{root_path}/data/test.mp4"
    
    print("Initializing Audio Feature Extractor...")
    extractor = AudioFeatureExtractor(
        sample_rate=16000,  # 16kHz for efficiency
        n_mels=80,          # 80 mel features
        hop_length=160,     # 10ms hop (160 samples at 16kHz)
        win_length=400,     # 25ms window (400 samples at 16kHz)
        n_fft=512          # 512-point FFT
    )
    
    print("\\nStarting audio feature extraction...")
    
    # For testing, limit to same duration as blendshapes (~33 seconds)
    # Remove max_duration=33.33 to process the entire video
    features = extractor.extract_from_video(
        video_path,
        output_dir=f"{root_path}/data/extracted_features",
        # max_duration=33.33  # Match the 1000 frames we extracted earlier
    )
    
    # Print summary
    print("\\n" + "="*60)
    print("AUDIO FEATURE EXTRACTION SUMMARY")
    print("="*60)
    print(f"Audio file: {features['audio_path']}")
    print(f"Duration: {features['duration_seconds']:.2f} seconds")
    print(f"Sample rate: {features['sample_rate']} Hz")
    print(f"Mel features: {features['n_mels']}")
    print(f"Number of frames: {features['n_frames']}")
    print(f"Mel frame rate: {features['mel_frame_rate']:.1f} Hz")
    print(f"Feature dimensions: {len(features['mel_features'])} x {len(features['mel_features'][0])}")
    
    # Voice activity statistics
    vad_ratio = np.mean(features['voice_activity'])
    print(f"Voice activity: {vad_ratio:.1%} of frames")
    
    print("="*60)

print("Now doing step 2 extract audio features for all videos in the dataset")
for f in dataset_results["downloaded_files"]:
    if f["filename"].endswith(".mp4"):
        video_path = f["local_path"]
        print("will now process step 2 for video" , video_path)
        step2_main(video_path)
        print("finsihed step2  for video" , f["filename"])


will now process step 0 for video hf_datasets/test_video/test.mp4
Root path: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning
Initializing Audio Feature Extractor...
Audio Feature Extractor initialized:
  Sample rate: 16000 Hz
  Mel features: 80
  Mel frame rate: 100.0 Hz
  Hop length: 160 samples (10.0ms)
  Window length: 400 samples (25.0ms)
\nStarting audio feature extraction...
Extracting audio from: hf_datasets/test_video/test.mp4
MoviePy - Writing audio in /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/extracted_features/extracted_audio_test.wav


                                                        

MoviePy - Done.
Audio extracted to: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/extracted_features/extracted_audio_test.wav
Loading audio from: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/extracted_features/extracted_audio_test.wav
Audio loaded: 534080 samples, 33.38 seconds
Computing mel spectrogram...
Computing additional audio features...
\nAudio feature extraction complete!
Mel features shape: (3339, 80)
Features saved to: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/extracted_features/audio_features_test.json
AUDIO FEATURE EXTRACTION SUMMARY
Audio file: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/extracted_features/extracted_audio_test.wav
Duration: 33.38 seconds
Sample rate: 16000 Hz
Mel features: 80
Number of frames: 3339
Mel frame rate: 100.0 Hz
Feature d

In [28]:
# Step 3 to create the dataset with all the extracted features

#!/usr/bin/env python3
"""
FIXED Dataset Creation Script
Synchronizes audio features with blendshape/pose targets for training
WITH PROPER NORMALIZATION (no more z-score destruction)
"""

import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import joblib

class DatasetCreator:
    def __init__(self, sequence_length_ms=240, overlap_ms=120):
        """
        Initialize dataset creator
        
        Args:
            sequence_length_ms: Length of input sequences in milliseconds (240-320ms recommended)
            overlap_ms: Overlap between sequences in milliseconds
        """
        self.sequence_length_ms = sequence_length_ms
        self.overlap_ms = overlap_ms
        
        print(f"Dataset Creator initialized:")
        print(f"  Sequence length: {sequence_length_ms}ms")
        print(f"  Overlap: {overlap_ms}ms")
        print(f"  Step size: {sequence_length_ms - overlap_ms}ms")
    
    def load_features(self, features_dir="extracted_features"):
        """
        Load both audio and visual features
        
        Args:
            features_dir: Directory containing extracted features
        
        Returns:
            tuple: (audio_data, visual_data)
        """



        features_path = Path(features_dir)
        
        # Try both .json and .npy formats for compatibility
        audio_file_json = features_path / "audio_features.json"
        audio_file_npy = features_path / "audio_features.npy"
        
        if audio_file_npy.exists():
            # Load numpy format (more common now)
            audio_features = np.load(audio_file_npy)
            print(f"Loaded audio features from .npy: {audio_features.shape}")
            
            # Create compatible format
            audio_data = {
                'mel_features': audio_features.tolist(),
                'timestamps': (np.arange(len(audio_features)) / 100.0).tolist(),  # 100 FPS assumption
                'voice_activity': np.ones(len(audio_features)).tolist()  # Placeholder VAD
            }
        elif audio_file_json.exists():
            # Load JSON format
            with open(audio_file_json, 'r') as f:
                audio_data = json.load(f)
            print(f"Loaded audio features from .json: {len(audio_data['mel_features'])} frames")
        else:
            raise FileNotFoundError(f"Audio features not found: {audio_file_npy} or {audio_file_json}")
        
        # Load visual features
        visual_file_json = features_path / "blendshapes_and_pose.json"
        visual_file_npy = features_path / "blendshape_features.npy"
        
        if visual_file_npy.exists():
            # Load numpy format
            visual_features = np.load(visual_file_npy)
            print(f"Loaded visual features from .npy: {visual_features.shape}")
            
            # Create compatible format
            visual_data = {
                'frames': []
            }
            for i, frame_features in enumerate(visual_features):
                visual_data['frames'].append({
                    'timestamp': i / 30.0,  # 30 FPS assumption
                    'blendshapes': frame_features[:52].tolist(),  # First 52 are blendshapes
                    'head_pose': frame_features[52:59].tolist() if len(frame_features) >= 59 else [0]*7,
                    'has_face': True
                })
        elif visual_file_json.exists():
            # Load JSON format
            with open(visual_file_json, 'r') as f:
                visual_data = json.load(f)
            print(f"Loaded visual features from .json: {len(visual_data['frames'])} frames")
        else:
            raise FileNotFoundError(f"Visual features not found: {visual_file_npy} or {visual_file_json}")
        
        return audio_data, visual_data


    def load_features_multi_video(self, features_dir="extracted_features", dataset_results=None):
        """
        Load audio and visual features from multiple videos
        
        Args:
            features_dir: Directory containing extracted features
            dataset_results: Dataset results containing video file info
        
        Returns:
            tuple: (combined_audio_data, combined_visual_data)
        """
        all_audio_data = []
        all_visual_data = []
        
        # Get list of video names from dataset_results
        video_names = []
        for f in dataset_results["downloaded_files"]:
            if f["filename"].endswith(".mp4"):
                video_name = os.path.splitext(os.path.basename(f["local_path"]))[0]
                video_names.append(video_name)
        
        print(f"Loading features from {len(video_names)} videos: {video_names}")
        
        for video_name in video_names:
            print(f"\nLoading features for video: {video_name}")
            
            features_path = Path(features_dir)
            
            # Load audio features for this video
            audio_file = features_path / f"audio_features_{video_name}.json"
            if audio_file.exists():
                with open(audio_file, 'r') as f:
                    audio_data = json.load(f)
                print(f"  Audio: {len(audio_data['mel_features'])} frames")
                all_audio_data.append(audio_data)
            else:
                print(f"  WARNING: Audio features not found for {video_name}")
            
            # Load visual features for this video  
            visual_file = features_path / f"blendshapes_and_pose_{video_name}.json"
            if visual_file.exists():
                with open(visual_file, 'r') as f:
                    visual_data = json.load(f)
                print(f"  Visual: {len(visual_data['frames'])} frames")
                all_visual_data.append(visual_data)
            else:
                print(f"  WARNING: Visual features not found for {video_name}")
        
        # Combine all audio data
        combined_audio = {
            'mel_features': [],
            'timestamps': [],
            'voice_activity': []
        }
        
        current_time_offset = 0.0
        for audio_data in all_audio_data:
            # Adjust timestamps to be continuous across videos
            adjusted_timestamps = [t + current_time_offset for t in audio_data['timestamps']]
            
            combined_audio['mel_features'].extend(audio_data['mel_features'])
            combined_audio['timestamps'].extend(adjusted_timestamps)
            combined_audio['voice_activity'].extend(audio_data['voice_activity'])
            
            # Update offset for next video
            current_time_offset = adjusted_timestamps[-1] + 0.1  # Small gap between videos
        
        # Combine all visual data
        combined_visual = {'frames': []}
        
        current_time_offset = 0.0
        for visual_data in all_visual_data:
            for frame in visual_data['frames']:
                # Adjust timestamp
                adjusted_frame = frame.copy()
                adjusted_frame['timestamp'] = frame['timestamp'] + current_time_offset
                combined_visual['frames'].append(adjusted_frame)
            
            # Update offset for next video
            if visual_data['frames']:
                current_time_offset = visual_data['frames'][-1]['timestamp'] + current_time_offset + 0.1
        
        print(f"\nCombined dataset:")
        print(f"  Total audio frames: {len(combined_audio['mel_features'])}")
        print(f"  Total visual frames: {len(combined_visual['frames'])}")
        
        return combined_audio, combined_visual
    
    def synchronize_features(self, audio_data, visual_data):
        """
        Synchronize audio and visual features using timestamps
        
        Args:
            audio_data: Audio features dict
            visual_data: Visual features dict
        
        Returns:
            tuple: (synchronized_audio, synchronized_targets, metadata)
        """
        
        # Extract data arrays
        audio_features = np.array(audio_data['mel_features'])  # Shape: (time, 80)
        audio_timestamps = np.array(audio_data['timestamps'])
        audio_vad = np.array(audio_data['voice_activity'])
        
        # Extract visual features and timestamps
        visual_frames = visual_data['frames']
        # Convert timestamps from milliseconds to seconds if needed
        visual_timestamps = np.array([frame['timestamp'] for frame in visual_frames])
        if visual_timestamps.max() > 1000:  # Likely in milliseconds
            visual_timestamps = visual_timestamps / 1000.0
        
        # Combine blendshapes and pose into targets (59 values per frame)
        targets = []
        has_face_flags = []
        
        # Define the expected blendshape order (52 categories)
        blendshape_names = [
            '_neutral', 'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft', 
            'browOuterUpRight', 'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight', 'eyeBlinkLeft', 
            'eyeBlinkRight', 'eyeLookDownLeft', 'eyeLookDownRight', 'eyeLookInLeft', 'eyeLookInRight', 
            'eyeLookOutLeft', 'eyeLookOutRight', 'eyeLookUpLeft', 'eyeLookUpRight', 'eyeSquintLeft', 
            'eyeSquintRight', 'eyeWideLeft', 'eyeWideRight', 'jawForward', 'jawLeft', 'jawOpen', 
            'jawRight', 'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight', 'mouthFrownLeft', 
            'mouthFrownRight', 'mouthFunnel', 'mouthLeft', 'mouthLowerDownLeft', 'mouthLowerDownRight', 
            'mouthPressLeft', 'mouthPressRight', 'mouthPucker', 'mouthRight', 'mouthRollLower', 
            'mouthRollUpper', 'mouthShrugLower', 'mouthShrugUpper', 'mouthSmileLeft', 'mouthSmileRight', 
            'mouthStretchLeft', 'mouthStretchRight', 'mouthUpperUpLeft', 'mouthUpperUpRight', 
            'noseSneerLeft', 'noseSneerRight'
        ]
        
        for frame in visual_frames:
            # Extract blendshapes in the correct order
            if isinstance(frame['blendshapes'], dict):
                # New format: dictionary with named blendshapes
                blendshapes_list = []
                for name in blendshape_names:
                    blendshapes_list.append(frame['blendshapes'].get(name, 0.0))
            else:
                # Old format: already a list
                blendshapes_list = frame['blendshapes']
            
            # Extract head pose (position + rotation = 7 values: x,y,z,qw,qx,qy,qz)
            if 'headPosition' in frame and 'headRotation' in frame:
                # New format: separate position and rotation objects
                head_pose = [
                    frame['headPosition']['x'],
                    frame['headPosition']['y'], 
                    frame['headPosition']['z'],
                    frame['headRotation']['w'],
                    frame['headRotation']['x'],
                    frame['headRotation']['y'],
                    frame['headRotation']['z']
                ]
            elif 'head_pose' in frame:
                # Old format: already a list
                head_pose = frame['head_pose']
            else:
                # Fallback: default values
                head_pose = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
            
            # Combine 52 blendshapes + 7 head pose values = 59 total
            target = blendshapes_list + head_pose
            targets.append(target)
            has_face_flags.append(frame['has_face'])
        
        targets = np.array(targets)  # Shape: (time, 59)
        has_face_flags = np.array(has_face_flags)
        
        print(f"\\nSynchronization details:")
        print(f"  Audio: {len(audio_features)} frames, {audio_timestamps[0]:.3f}s to {audio_timestamps[-1]:.3f}s")
        print(f"  Visual: {len(targets)} frames, {visual_timestamps[0]:.3f}s to {visual_timestamps[-1]:.3f}s")
        
        # Find common time range
        start_time = max(audio_timestamps[0], visual_timestamps[0])
        end_time = min(audio_timestamps[-1], visual_timestamps[-1])
        
        print(f"  Common range: {start_time:.3f}s to {end_time:.3f}s ({end_time-start_time:.3f}s)")
        
        # Interpolate visual features to audio timestamps
        # This upsamples visual features from 30fps to 100fps (audio frame rate)
        
        # Filter audio to common time range
        audio_mask = (audio_timestamps >= start_time) & (audio_timestamps <= end_time)
        sync_audio_timestamps = audio_timestamps[audio_mask]
        sync_audio_features = audio_features[audio_mask]
        sync_audio_vad = audio_vad[audio_mask]
        
        # Interpolate visual targets to audio timestamps
        sync_targets = np.zeros((len(sync_audio_timestamps), 59))
        sync_has_face = np.zeros(len(sync_audio_timestamps), dtype=bool)
        
        for i, target_dim in enumerate(range(59)):
            target_values = targets[:, target_dim]
            sync_targets[:, i] = np.interp(sync_audio_timestamps, visual_timestamps, target_values)
        
        # Interpolate face detection flags
        face_values = has_face_flags.astype(float)
        interpolated_face = np.interp(sync_audio_timestamps, visual_timestamps, face_values)
        sync_has_face = interpolated_face > 0.5  # Threshold for face presence
        
        print(f"\\nSynchronized dataset:")
        print(f"  Duration: {sync_audio_timestamps[-1] - sync_audio_timestamps[0]:.2f} seconds")
        print(f"  Audio features: {sync_audio_features.shape}")
        print(f"  Target features: {sync_targets.shape}")
        print(f"  Face detection rate: {np.mean(sync_has_face):.1%}")
        
        # Create metadata
        metadata = {
            'duration_seconds': float(sync_audio_timestamps[-1] - sync_audio_timestamps[0]),
            'sample_rate_hz': float(len(sync_audio_timestamps) / (sync_audio_timestamps[-1] - sync_audio_timestamps[0])),
            'num_frames': len(sync_audio_timestamps),
            'audio_features_dim': sync_audio_features.shape[1],
            'target_features_dim': sync_targets.shape[1],
            'face_detection_rate': float(np.mean(sync_has_face)),
            'voice_activity_rate': float(np.mean(sync_audio_vad))
        }
        
        return {
            'audio_features': sync_audio_features,
            'targets': sync_targets,
            'timestamps': sync_audio_timestamps,
            'voice_activity': sync_audio_vad,
            'has_face': sync_has_face,
            'metadata': metadata
        }
    
    def create_sequences(self, synchronized_data):
        """
        Create training sequences from synchronized data
        
        Args:
            synchronized_data: Output from synchronize_features()
        
        Returns:
            dict: Training sequences
        """
        audio_features = synchronized_data['audio_features']
        targets = synchronized_data['targets']
        voice_activity = synchronized_data['voice_activity']
        has_face = synchronized_data['has_face']
        timestamps = synchronized_data['timestamps']
        
        # Calculate sequence parameters
        sample_rate = synchronized_data['metadata']['sample_rate_hz']
        seq_length_frames = int(self.sequence_length_ms * sample_rate / 1000)
        step_size_frames = int((self.sequence_length_ms - self.overlap_ms) * sample_rate / 1000)
        
        print(f"\\nCreating sequences:")
        print(f"  Sequence length: {seq_length_frames} frames ({self.sequence_length_ms}ms)")
        print(f"  Step size: {step_size_frames} frames ({self.sequence_length_ms - self.overlap_ms}ms)")
        
        # Generate sequences
        sequences_audio = []
        sequences_targets = []
        sequences_vad = []
        sequences_face = []
        sequences_timestamps = []
        
        for start_idx in range(0, len(audio_features) - seq_length_frames + 1, step_size_frames):
            end_idx = start_idx + seq_length_frames
            
            # Extract sequence
            seq_audio = audio_features[start_idx:end_idx]  # Shape: (seq_len, 80)
            seq_targets = targets[start_idx:end_idx]       # Shape: (seq_len, 59)
            seq_vad = voice_activity[start_idx:end_idx]
            seq_face = has_face[start_idx:end_idx]
            seq_time = timestamps[start_idx:end_idx]
            
            # Quality checks
            face_ratio = np.mean(seq_face)
            vad_ratio = np.mean(seq_vad)
            
            # Only include sequences with reasonable face detection
            if face_ratio >= 0.5:  # At least 50% of frames have face detected
                sequences_audio.append(seq_audio)
                sequences_targets.append(seq_targets)
                sequences_vad.append(seq_vad)
                sequences_face.append(seq_face)
                sequences_timestamps.append(seq_time)
        
        sequences_audio = np.array(sequences_audio)      # Shape: (num_seq, seq_len, 80)
        sequences_targets = np.array(sequences_targets)  # Shape: (num_seq, seq_len, 59)
        sequences_vad = np.array(sequences_vad)          # Shape: (num_seq, seq_len)
        sequences_face = np.array(sequences_face)        # Shape: (num_seq, seq_len)
        
        print(f"  Generated {len(sequences_audio)} sequences")
        print(f"  Audio sequences shape: {sequences_audio.shape}")
        print(f"  Target sequences shape: {sequences_targets.shape}")
        
        return {
            'audio_sequences': sequences_audio,
            'target_sequences': sequences_targets,
            'vad_sequences': sequences_vad,
            'face_sequences': sequences_face,
            'sequence_timestamps': sequences_timestamps,
            'metadata': {
                'num_sequences': len(sequences_audio),
                'sequence_length_frames': seq_length_frames,
                'sequence_length_ms': self.sequence_length_ms,
                'step_size_frames': step_size_frames,
                'overlap_ms': self.overlap_ms,
                'audio_feature_dim': sequences_audio.shape[2],
                'target_feature_dim': sequences_targets.shape[2]
            }
        }
    
    def normalize_features(self, sequences_data, output_dir="extracted_features"):
        """
        FIXED NORMALIZATION - Preserves natural scales and relationships
        NO MORE Z-SCORE DESTRUCTION!
        
        Args:
            sequences_data: Output from create_sequences()
            output_dir: Directory to save normalization parameters
        
        Returns:
            dict: Normalized sequences with scalers
        """
        audio_sequences = sequences_data['audio_sequences']
        target_sequences = sequences_data['target_sequences']
        
        # Reshape for normalization (flatten time dimension)
        audio_flat = audio_sequences.reshape(-1, audio_sequences.shape[-1])
        targets_flat = target_sequences.reshape(-1, target_sequences.shape[-1])
        
        print(f"\\n=== APPLYING PROPER NORMALIZATION (NO Z-SCORE!) ===")
        print(f"  Audio features: {audio_flat.shape}")
        print(f"  Target features: {targets_flat.shape}")
        
        # Check original ranges
        print(f"\\nORIGINAL RANGES:")
        print(f"  Audio: [{audio_flat.min():.3f}, {audio_flat.max():.3f}]")
        print(f"  Targets: [{targets_flat.min():.3f}, {targets_flat.max():.3f}]")
        
        # ============ AUDIO NORMALIZATION ============
        print(f"\\n--- AUDIO NORMALIZATION ---")
        print(f"Method: Clipping to natural mel spectrogram dB range")
        
        # Keep audio in natural mel spectrogram range (-80 to 10 dB)
        audio_normalized_flat = np.clip(audio_flat, -80.0, 10.0)
        
        # Optional: Light scaling to improve training stability
        # Uncomment if you want to scale to [-1, 1] while preserving relationships:
        # audio_normalized_flat = (audio_normalized_flat + 80.0) / 45.0 - 1.0
        
        print(f"Audio after normalization: [{audio_normalized_flat.min():.3f}, {audio_normalized_flat.max():.3f}]")
        print(f"Audio mean: {audio_normalized_flat.mean():.3f}, std: {audio_normalized_flat.std():.3f}")
        
        # ============ TARGET NORMALIZATION ============
        print(f"\\n--- TARGET NORMALIZATION ---")
        
        # Split blendshapes (0-52) and pose (52-59) for different treatment
        if targets_flat.shape[1] >= 52:
            blendshapes_flat = targets_flat[:, :52]  # First 52 are blendshapes
            pose_flat = targets_flat[:, 52:] if targets_flat.shape[1] > 52 else None
            
            print(f"Blendshapes: {blendshapes_flat.shape}")
            if pose_flat is not None:
                print(f"Pose: {pose_flat.shape}")
            
            # Method 1: Clip blendshapes to natural [0, 1] range
            print(f"Method: Clipping blendshapes to natural [0, 1] range")
            blendshapes_normalized = np.clip(blendshapes_flat, 0.0, 1.0)
            
            print(f"Blendshapes after normalization: [{blendshapes_normalized.min():.3f}, {blendshapes_normalized.max():.3f}]")
            print(f"Blendshapes mean: {blendshapes_normalized.mean():.3f}, std: {blendshapes_normalized.std():.3f}")
            
            # Normalize pose if present
            if pose_flat is not None:
                # For pose, use light clipping since range can vary
                pose_normalized = np.clip(pose_flat, -1.0, 1.0)
                print(f"Pose after normalization: [{pose_normalized.min():.3f}, {pose_normalized.max():.3f}]")
                
                # Combine back
                targets_normalized_flat = np.concatenate([blendshapes_normalized, pose_normalized], axis=1)
            else:
                targets_normalized_flat = blendshapes_normalized
                
        else:
            # Fallback for unexpected feature count
            print(f"Unexpected target feature count ({targets_flat.shape[1]}), using clipping to [0, 1]")
            targets_normalized_flat = np.clip(targets_flat, 0.0, 1.0)
        
        print(f"\\nFinal target range: [{targets_normalized_flat.min():.3f}, {targets_normalized_flat.max():.3f}]")
        print(f"Final target mean: {targets_normalized_flat.mean():.3f}, std: {targets_normalized_flat.std():.3f}")
        
        # Reshape back to sequences
        audio_normalized = audio_normalized_flat.reshape(audio_sequences.shape)
        targets_normalized = targets_normalized_flat.reshape(target_sequences.shape)
        
        # ============ VALIDATION ============
        print(f"\\n=== NORMALIZATION VALIDATION ===")
        audio_std = audio_normalized.std()
        target_std = targets_normalized.std()
        
        # Check that we didn't destroy variation
        if audio_std < 0.1:
            print(f"WARNING: Audio std ({audio_std:.3f}) is very low - may indicate over-normalization")
        else:
            print(f"OK: Audio variation preserved (std: {audio_std:.3f})")
            
        if target_std < 0.05:
            print(f"WARNING: Target std ({target_std:.3f}) is very low - may indicate over-normalization")
        else:
            print(f"OK: Target variation preserved (std: {target_std:.3f})")
        
        # Check that we're NOT in z-score territory
        if abs(audio_normalized.mean()) < 0.01 and abs(audio_normalized.std() - 1.0) < 0.01:
            print(f"ERROR: Audio shows z-score pattern! This should not happen.")
        else:
            print(f"OK: Audio normalization looks good (not z-score)")
            
        if abs(targets_normalized.mean()) < 0.01 and abs(targets_normalized.std() - 1.0) < 0.01:
            print(f"ERROR: Targets show z-score pattern! This should not happen.")
        else:
            print(f"OK: Target normalization looks good (not z-score)")
        
        # Create dummy scalers for compatibility (though we're not using StandardScaler anymore)
        output_path = Path(output_dir)
        
        # Save the normalization parameters for inference
        normalization_params = {
            'audio_method': 'clipping',
            'audio_min': -80.0,
            'audio_max': 10.0,
            'target_method': 'clipping',
            'blendshape_min': 0.0,
            'blendshape_max': 1.0,
            'pose_min': -1.0,
            'pose_max': 1.0
        }
        
        with open(output_path / "normalization_params.json", 'w') as f:
            json.dump(normalization_params, f, indent=2)
        
        print(f"\\nOK: Normalization parameters saved to {output_path / 'normalization_params.json'}")
        
        # Update sequences data
        normalized_data = sequences_data.copy()
        normalized_data['audio_sequences'] = audio_normalized
        normalized_data['target_sequences'] = targets_normalized
        normalized_data['normalization_method'] = 'proper_scaling_preserves_natural_ranges'
        
        # Add normalization stats (but these are NOT z-score stats!)
        normalized_data['normalization_stats'] = {
            'method': 'clipping_to_natural_ranges',
            'audio_range': [float(audio_normalized.min()), float(audio_normalized.max())],
            'audio_mean': float(audio_normalized.mean()),
            'audio_std': float(audio_normalized.std()),
            'target_range': [float(targets_normalized.min()), float(targets_normalized.max())],
            'target_mean': float(targets_normalized.mean()),
            'target_std': float(targets_normalized.std())
        }
        
        return normalized_data
    
    def save_dataset(self, dataset, output_dir="extracted_features"):
        """
        Save the final dataset
        
        Args:
            dataset: Final dataset from normalize_features()
            output_dir: Directory to save dataset
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        
        # Save sequences as numpy arrays (more efficient for training)
        np.save(output_path / "audio_sequences.npy", dataset['audio_sequences'])
        np.save(output_path / "target_sequences.npy", dataset['target_sequences'])
        np.save(output_path / "vad_sequences.npy", dataset['vad_sequences'])
        # Note: Saving as 'vad_sequences.npy' for compatibility with diagnostic script
        
        # Save metadata as JSON
        metadata = {
            'dataset_info': dataset['metadata'],
            'normalization_stats': dataset['normalization_stats'],
            'normalization_method': dataset.get('normalization_method', 'proper_scaling')
        }
        
        with open(output_path / "dataset_metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)
        
        print(f"\\nDataset saved to {output_path}")
        print(f"  Files: audio_sequences.npy, target_sequences.npy, vad_sequences.npy, dataset_metadata.json")
        print(f"  Normalization method: {dataset.get('normalization_method', 'proper_scaling')}")
        
        return output_path

def step3_main( dataset_results ):
    """
    Main function to create the training dataset with PROPER normalization
    """
    print("Creating synchronized training dataset with PROPER normalization...")
    print("FIXED: This version FIXES the z-score over-normalization problem!")

    # root path
    root_path = Path.cwd()  # use current working directory in Colab
    print(f"Root path: {root_path}")
    
    # Initialize dataset creator
    # 240ms sequences with 120ms overlap = 120ms step size
    creator = DatasetCreator(sequence_length_ms=240, overlap_ms=120)
    
    # Load features
    print("\\nLoading extracted features...")
    audio_data, visual_data = creator.load_features_multi_video(f"{root_path}/data/extracted_features", dataset_results)
    # audio_data, visual_data = creator.load_features(f"{root_path}/data/extracted_features",)
    
    # Synchronize features
    print("\\nSynchronizing audio and visual features...")
    synchronized_data = creator.synchronize_features(audio_data, visual_data)
    
    # Create training sequences
    print("\\nCreating training sequences...")
    sequences_data = creator.create_sequences(synchronized_data)
    
    # Apply PROPER normalization (no more z-score!)
    print("\\nApplying PROPER normalization...")
    final_dataset = creator.normalize_features(sequences_data, f"{root_path}/data/extracted_features")
    
    # Save dataset
    print("\\nSaving dataset...")
    output_path = creator.save_dataset(final_dataset, f"{root_path}/data/training_dataset")
    
    # Print final summary
    print("\\n" + "="*60)
    print("DATASET CREATION SUMMARY - FIXED NORMALIZATION")
    print("="*60)
    print(f"Output directory: {output_path}")
    print(f"Number of sequences: {final_dataset['metadata']['num_sequences']}")
    print(f"Sequence length: {final_dataset['metadata']['sequence_length_ms']}ms")
    print(f"Audio features per sequence: {final_dataset['metadata']['sequence_length_frames']} x {final_dataset['metadata']['audio_feature_dim']}")
    print(f"Target features per sequence: {final_dataset['metadata']['sequence_length_frames']} x {final_dataset['metadata']['target_feature_dim']}")
    print(f"Normalization method: {final_dataset.get('normalization_method', 'proper_scaling')}")
    print(f"\\nSUCCESS: Ready for TCN training with PROPER data!")
    print("="*60)


step3_main( dataset_results)



Creating synchronized training dataset with PROPER normalization...
FIXED: This version FIXES the z-score over-normalization problem!
Root path: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning
Dataset Creator initialized:
  Sequence length: 240ms
  Overlap: 120ms
  Step size: 120ms
\nLoading extracted features...
Loading features from 1 videos: ['test']

Loading features for video: test
  Audio: 3339 frames
  Visual: 1002 frames

Combined dataset:
  Total audio frames: 3339
  Total visual frames: 1002
\nSynchronizing audio and visual features...
\nSynchronization details:
  Audio: 3339 frames, 0.000s to 33.380s
  Visual: 1002 frames, 0.000s to 33.366s
  Common range: 0.000s to 33.366s (33.366s)
\nSynchronized dataset:
  Duration: 33.36 seconds
  Audio features: (3337, 80)
  Target features: (3337, 59)
  Face detection rate: 99.9%
\nCreating training sequences...
\nCreating sequences:
  Sequence length: 24 frames (240ms)
  Step size: 12 fr

In [51]:
# now i need to use hugging face token to upload the training dataset to the hub

root_path = Path.cwd()  # use current working directory in Colab
print(f"Root path: {root_path}")

print("Now uploading the training dataset to the hub")
print("The training Dataset is" , f"{root_path}/data/training_dataset")

from huggingface_hub import HfApi
import os
from dotenv import load_dotenv
load_dotenv()


# from dotenv import load_dotenv
print("imported huggingface_hub")
# load the dotenv file

HF_TOKEN = os.getenv("HF_TOKEN")
# print("HF_TOKEN" , HF_TOKEN)

api = HfApi(token=HF_TOKEN)
print("made the api object" , api)

api.upload_folder(
    folder_path=f"{root_path}/data/training_dataset",
    repo_id="sanjuhs/audio_to_blendshapes_test",
    repo_type="dataset",
)

print("uploaded the dataset to the hub!")

Root path: /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning
Now uploading the training dataset to the hub
The training Dataset is /Users/sanjayprasads/Desktop/Coding/Python/NN_training/V2A-over-training-old-nn/1_data_cleaning/data/training_dataset
imported huggingface_hub
made the api object <huggingface_hub.hf_api.HfApi object at 0x30997c410>


KeyboardInterrupt: 

In [44]:
from huggingface_hub import HfApi

# Test token validity
api = HfApi(token=HF_TOKEN)

try:
    # Test connection by getting user info
    user_info = api.whoami()
    print(f"✅ Token valid for user: {user_info['name']}")
    
    # Check if repo exists
    try:
        repo_info = api.repo_info("sanjuhs/audio_to_blendshapes_test", repo_type="dataset")
        print(f"✅ Repository exists and accessible")
    except:
        print("ℹ️  Repository doesn't exist yet, will be created during upload")
        
except Exception as e:
    print(f"❌ Token/connection issue: {e}")

KeyboardInterrupt: 

In [50]:
import requests
import time

print("Testing basic connectivity...")

try:
    start = time.time()
    response = requests.get("https://huggingface.co", timeout=10)
    end = time.time()
    print(f"✅ huggingface.co reachable in {end-start:.2f}s - Status: {response.status_code}")
except Exception as e:
    print(f"❌ Cannot reach huggingface.co: {e}")

try:
    start = time.time()
    response = requests.get("https://huggingface.co/api/whoami", 
                          headers={"Authorization": f"Bearer {HF_TOKEN}"}, 
                          timeout=10)
    end = time.time()
    print(f"✅ HF API reachable in {end-start:.2f}s - Status: {response.status_code}")
    if response.status_code == 200:
        print(f"User: {response.json()}")
except Exception as e:
    print(f"❌ Cannot reach HF API: {e}")

Testing basic connectivity...
✅ huggingface.co reachable in 80.12s - Status: 200
✅ HF API reachable in 80.41s - Status: 401


In [49]:
import requests
import time

# Test a simple, fast site first
try:
    start = time.time()
    url = "https://google.com"
    url2="https://huggingface.co"
    # response = requests.get(url, timeout=5)
    response2 = requests.get(url2, timeout=5)
    end = time.time()
    # print(f"✅ Google reachable in {end-start:.2f}s - Status: {response.status_code}")
    print(f"✅ Huggingface reachable in {end-start:.2f}s - Status: {response2.status_code}")
except Exception as e:
    print(f"❌ No internet connection: {e}")

✅ Huggingface reachable in 40.17s - Status: 200
