In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under
# /content/drive/MyDrive. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

def print_subfolders(directory):
    """Print, for each folder directly inside ``directory``, the folders it contains.

    Plain files are ignored at both levels. Entries are printed in the order
    ``os.listdir`` returns them.

    Args:
        directory: Path of the directory to inspect.
    """
    for name in os.listdir(directory):
        first_level = os.path.join(directory, name)
        if not os.path.isdir(first_level):
            continue  # only folders get a header

        print(f"Folders in {first_level}:")
        nested_folders = [
            child for child in os.listdir(first_level)
            if os.path.isdir(os.path.join(first_level, child))
        ]
        for child in nested_folders:
            print(f"  - {child}")

# List the folders found two levels below the dataset root.
print_subfolders('/content/drive/MyDrive/PolyGlotFake3')

Folders in /content/drive/MyDrive/PolyGlotFake3/real3:
  - ar
  - en
  - es
  - fr
  - ja
  - ru
  - zh
Folders in /content/drive/MyDrive/PolyGlotFake3/fake3:
  - to_ar
  - to_en
  - to_es
  - to_fr
  - to_ja
  - to_ru
  - to_zh


In [None]:
import os

def count_videos(directory):
    """Count the .mp4 video files in a directory and all of its subfolders.

    The extension check is case-insensitive (``clip.MP4`` counts), which matches
    the ``.lower().endswith('.mp4')`` filter used by the processing cells so the
    total reported here agrees with what those cells actually pick up.

    Args:
        directory: Root directory to walk.

    Returns:
        int: Number of files whose name ends in ``.mp4`` (any case). A missing
        directory yields 0 because ``os.walk`` produces nothing for it.
    """
    return sum(
        1
        for _, _, files in os.walk(directory)  # Traverse all subdirectories
        for file in files
        if file.lower().endswith('.mp4')
    )

# Dataset root on the mounted Drive whose video files are counted
directory_to_check = '/content/drive/MyDrive/PolyGlotFake3'

# Walk the whole tree below the root and count the videos
total_videos = count_videos(directory_to_check)

# Report the total for the whole tree
print(f"Total number of videos in '{directory_to_check}': {total_videos}")

Total number of videos in '/content/drive/MyDrive/PolyGlotFake3': 420


In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os
import librosa
import torch
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from transformers import Wav2Vec2Processor, Wav2Vec2Model, WhisperFeatureExtractor, WhisperModel

# Configuration
SAMPLE_RATE = 16000  # Hz; audio is resampled to this rate before feature extraction
MAX_AUDIO_LENGTH = 30  # seconds; every clip is padded or truncated to this duration
MAX_TIME_STEPS = 1500  # fixed number of time steps in the saved feature matrix

# Language code -> Hugging Face model id passed to Wav2Vec2Processor/Wav2Vec2Model.from_pretrained
LANG_TO_WAV2VEC2 = {
    'en': 'facebook/wav2vec2-large-robust-ft-swbd-300h',
    'fr': 'facebook/wav2vec2-large-xlsr-53-french',
    'ru': 'anton-l/wav2vec2-large-xlsr-53-russian',
    'es': 'facebook/wav2vec2-large-xlsr-53-spanish',
    'zh': 'jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn',
    'ja': 'jonatasgrosman/wav2vec2-large-xlsr-53-japanese',
    'ar': 'jonatasgrosman/wav2vec2-large-xlsr-53-arabic'
}
# Model id used for both the Whisper feature extractor and the Whisper encoder
WHISPER_MODEL_ID = "openai/whisper-small"

def init_processors_and_models(languages):
    """Load one Wav2Vec2 processor/model pair per language plus the shared Whisper encoder.

    Args:
        languages: Iterable of language codes; each must be a key of
            ``LANG_TO_WAV2VEC2`` (an unknown code raises ``KeyError``).

    Returns:
        tuple: ``(wav2vec2_processors, wav2vec2_models, whisper_feature_extractor,
        whisper_model)``. The first two are dicts keyed by language code;
        ``whisper_model`` is the ``.encoder`` of the loaded Whisper model.
    """
    processors_by_lang, models_by_lang = {}, {}

    for code in languages:
        checkpoint = LANG_TO_WAV2VEC2[code]
        processors_by_lang[code] = Wav2Vec2Processor.from_pretrained(checkpoint)
        models_by_lang[code] = Wav2Vec2Model.from_pretrained(checkpoint)

    # Only the encoder half of Whisper is kept; the decoder is not used for features.
    feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID)
    encoder = WhisperModel.from_pretrained(WHISPER_MODEL_ID).encoder

    return processors_by_lang, models_by_lang, feature_extractor, encoder

def extract_audio_from_video(video_path):
    """Extract the audio track of a video into a temporary mono WAV file.

    The audio is resampled to ``SAMPLE_RATE``, mixed down to one channel and
    padded with silence or truncated to exactly ``MAX_AUDIO_LENGTH`` seconds.
    The file is written to ``<cwd>/temp/temp_<video basename>.wav``; the caller
    is responsible for deleting it.

    Args:
        video_path: Path of the video (or any file pydub can decode).

    Returns:
        str | None: Path of the WAV file, or ``None`` if anything failed (the
        error is printed and any partially written WAV is removed).
    """
    temp_path = None
    try:
        temp_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(temp_dir, exist_ok=True)
        temp_path = os.path.join(temp_dir, f"temp_{os.path.basename(video_path)}.wav")

        audio = AudioSegment.from_file(video_path)
        audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)

        target_length = MAX_AUDIO_LENGTH * 1000  # milliseconds (len() of a segment is in ms)
        if len(audio) < target_length:
            audio += AudioSegment.silent(duration=target_length - len(audio))
        else:
            audio = audio[:target_length]

        audio.export(temp_path, format="wav")
        return temp_path

    except Exception as e:
        print(f"Video processing error: {str(e)}")
        # Don't leave a half-written WAV behind: the caller only cleans up paths
        # that were actually returned to it.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; the original failure was already reported
        return None

def extract_audio_features(path, lang, wav2vec2_processors, wav2vec2_models,
                          whisper_feature_extractor, whisper_model):
    """Extract combined Wav2Vec2 + Whisper features with a fixed temporal dimension.

    The audio (taken from the video when ``path`` ends in ``.mp4``) is loaded at
    ``SAMPLE_RATE``, padded/truncated to ``MAX_AUDIO_LENGTH`` seconds, run
    through the language-specific Wav2Vec2 model and the Whisper encoder, and
    the two hidden-state sequences are truncated to their common length and
    concatenated along the feature axis. The result is then zero-padded or
    truncated to ``MAX_TIME_STEPS`` rows.

    Args:
        path: Audio or ``.mp4`` video file.
        lang: Language code used to pick the Wav2Vec2 processor/model.
        wav2vec2_processors: Dict of language code -> Wav2Vec2 processor.
        wav2vec2_models: Dict of language code -> Wav2Vec2 model.
        whisper_feature_extractor: Whisper feature extractor.
        whisper_model: Whisper encoder.

    Returns:
        np.ndarray | None: Array of shape ``(MAX_TIME_STEPS, feature_dim)`` in
        the dtype produced by the models, or ``None`` if any step failed (the
        error is printed). Padding keeps that dtype instead of upcasting the
        whole matrix to float64.
    """
    temp_file = None
    try:
        if path.lower().endswith('.mp4'):
            temp_file = extract_audio_from_video(path)
            if not temp_file or not os.path.exists(temp_file):
                return None
            audio_path = temp_file
        else:
            audio_path = path

        audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        target_samples = SAMPLE_RATE * MAX_AUDIO_LENGTH
        if len(audio) < target_samples:
            audio = np.pad(audio, (0, target_samples - len(audio)))
        else:
            audio = audio[:target_samples]

        # Wav2Vec2 features (keep temporal dimension)
        print(f"Extracting features using Wav2Vec2 for language: {lang}")
        wav_input = wav2vec2_processors[lang](audio, return_tensors="pt", sampling_rate=SAMPLE_RATE)

        with torch.no_grad():
            wav_features = wav2vec2_models[lang](**wav_input).last_hidden_state
            wav_features = wav_features.squeeze(0).numpy()  # Shape: (time_steps, 1024)

        print(f"Wav2Vec2 features shape: {wav_features.shape}")

        # Whisper features (keep temporal dimension)
        print("Extracting features using Whisper...")
        whisper_input = whisper_feature_extractor(
            audio, sampling_rate=SAMPLE_RATE, return_tensors="pt"
        ).input_features

        with torch.no_grad():
            whisper_features = whisper_model(whisper_input).last_hidden_state
            whisper_features = whisper_features.squeeze(0).numpy()  # Shape: (time_steps, 768)

        print(f"Whisper features shape: {whisper_features.shape}")

        # Align time steps: both sequences are cut to the shorter of the two
        min_time_steps = min(wav_features.shape[0], whisper_features.shape[0])
        wav_features = wav_features[:min_time_steps, :]
        whisper_features = whisper_features[:min_time_steps, :]

        # Combine features along the feature axis
        combined_features = np.concatenate([wav_features, whisper_features], axis=1)
        print(f"Combined features shape: {combined_features.shape}")

        # Pad/truncate to fixed length. The pad block takes its width and dtype from
        # the features themselves: a default (float64) block would make vstack
        # upcast the whole matrix and double its memory footprint.
        missing_steps = MAX_TIME_STEPS - combined_features.shape[0]
        if missing_steps > 0:
            pad = np.zeros(
                (missing_steps, combined_features.shape[1]),
                dtype=combined_features.dtype,
            )
            combined_features = np.vstack([combined_features, pad])
        else:
            combined_features = combined_features[:MAX_TIME_STEPS]

        return combined_features

    except Exception as e:
        print(f"Feature extraction failed: {str(e)}")
        return None
    finally:
        # Remove the temporary WAV created for video inputs, whatever happened above
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)


def process_directory(base_dir, wav2vec2_processors, wav2vec2_models,
                     whisper_feature_extractor, whisper_model, is_fake=False):
    """Extract audio features for every ``.mp4`` in the language folders of ``base_dir``.

    Folders and files are visited in sorted order so the row order of the
    returned arrays is reproducible across runs and machines; ``os.listdir``
    alone returns entries in arbitrary order, which matters when these rows are
    later matched by index against features extracted elsewhere.

    Args:
        base_dir: Directory containing one sub-folder per language.
        wav2vec2_processors: Dict of language code -> Wav2Vec2 processor.
        wav2vec2_models: Dict of language code -> Wav2Vec2 model.
        whisper_feature_extractor: Whisper feature extractor.
        whisper_model: Whisper encoder.
        is_fake: When True, folders are expected to be named ``to_<lang>`` and
            every sample is labelled 1; otherwise the folder name is the
            language code and samples are labelled 0.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(features, labels)``. Files whose
        extraction failed or produced an unexpected shape are reported and
        left out.
    """
    features_list = []
    labels = []
    label = 1 if is_fake else 0
    fake_prefix = 'to_'

    for folder in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        # Strip only a leading 'to_' marker; str.replace would also remove the
        # substring from the middle of a folder name.
        if is_fake and folder.startswith(fake_prefix):
            lang = folder[len(fake_prefix):]
        else:
            lang = folder
        print(f"Processing: {folder_path} ({lang})")

        for file in sorted(os.listdir(folder_path)):
            if not file.lower().endswith('.mp4'):
                continue

            file_path = os.path.join(folder_path, file)
            try:
                feature = extract_audio_features(
                    file_path, lang,
                    wav2vec2_processors, wav2vec2_models,
                    whisper_feature_extractor, whisper_model
                )

                if feature is not None:
                    # Validate shape so np.array() below yields one dense array
                    if feature.shape != (MAX_TIME_STEPS, 1792):
                        raise ValueError(f"Invalid feature shape: {feature.shape}")
                    features_list.append(feature)
                    labels.append(label)

            except Exception as e:
                print(f"Failed {file_path}: {str(e)}")

    return np.array(features_list), np.array(labels)


def save_features_and_labels(features, labels, prefix, save_dir="/content/drive/MyDrive/PolyGlotFake3/processed_audio_v1"):
    """Write ``features`` and ``labels`` as two .npy files inside ``save_dir``.

    The directory is created when missing. Files are named
    ``<prefix>_features1.npy`` and ``<prefix>_labels1.npy`` and overwrite any
    existing files of the same name.

    Args:
        features: Array of features to store.
        labels: Array of labels to store.
        prefix: Filename prefix shared by both files.
        save_dir: Target directory.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Map each array to its destination, then write them in order
    targets = [
        (os.path.join(save_dir, f'{prefix}_features1.npy'), features),
        (os.path.join(save_dir, f'{prefix}_labels1.npy'), labels),
    ]
    for destination, array in targets:
        np.save(destination, array)

    (features_path, _), (labels_path, _) = targets
    print("Saved to Google Drive:")
    print(f"- Features: {features_path}")
    print(f"- Labels: {labels_path}")

# Entry point
if __name__ == "__main__":
    languages = ['en', 'fr', 'ru', 'es', 'zh', 'ja', 'ar']

    # Initialize models
    wav2vec2_processors, wav2vec2_models, whisper_fe, whisper_model = init_processors_and_models(languages)

    # Process the real dataset and persist it immediately: extraction is slow, and
    # an interruption while the fake dataset is running (this cell's recorded
    # output ends in KeyboardInterrupt) would otherwise discard the finished half.
    real_features, real_labels = process_directory(
        '/content/drive/MyDrive/PolyGlotFake3/real3',
        wav2vec2_processors, wav2vec2_models,
        whisper_fe, whisper_model, False
    )
    save_features_and_labels(real_features, real_labels, 'real_audio1')

    # Process and persist the fake dataset
    fake_features, fake_labels = process_directory(
        '/content/drive/MyDrive/PolyGlotFake3/fake3',
        wav2vec2_processors, wav2vec2_models,
        whisper_fe, whisper_model, True
    )
    save_features_and_labels(fake_features, fake_labels, 'fake_audio1')

    # NOTE(review): the training cell further down loads audio features from
    # .../PolyGlotFake2/processed_audio_v1, while these are written under
    # PolyGlotFake3 — confirm which dataset the model is meant to train on.
    print(f"Real dataset shape: {real_features.shape}")
    print(f"Fake dataset shape: {fake_features.shape}")


KeyboardInterrupt: 

In [None]:
# Pinned MediaPipe build. The install log for this cell shows pip collecting
# protobuf<4 for it and starting to uninstall the runtime's protobuf 5.29.4.
# NOTE(review): a runtime restart may be needed after that downgrade — confirm.
!pip install mediapipe==0.10.10

Collecting mediapipe==0.10.10
  Downloading mediapipe-0.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<4,>=3.11 (from mediapipe==0.10.10)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.10)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.8/34.8 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.4
 

In [None]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm

# MediaPipe solution modules used below: face_mesh provides the FaceMesh detector.
mp_face_mesh = mp.solutions.face_mesh
# NOTE(review): mp_drawing is not referenced in the visible part of this notebook.
mp_drawing = mp.solutions.drawing_utils

# Face Mesh landmark indices that outline the lips (outer and inner contours).
# extract_lip_region uses them to compute the bounding box of the mouth.
LIPS_INDICES = [
    # Upper outer lip
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291,
    # Lower outer lip
    375, 321, 405, 314, 17, 84, 181, 91, 146,
    # Upper inner lip
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308,
    # Lower inner lip
    324, 318, 402, 317, 14, 87, 178, 88, 95
]

def initialize_face_mesh():
    """Build the MediaPipe FaceMesh detector used for lip extraction.

    Returns:
        A ``FaceMesh`` instance configured for video input (non-static mode),
        a single face, refined landmarks and 0.3 detection/tracking confidence
        thresholds. The caller must ``close()`` it when done.
    """
    mesh_options = {
        'static_image_mode': False,
        'max_num_faces': 1,
        'refine_landmarks': True,
        'min_detection_confidence': 0.3,
        'min_tracking_confidence': 0.3,
    }
    return mp_face_mesh.FaceMesh(**mesh_options)

def detect_and_crop_face(image, face_mesh, debug=False):
    """Locate the first face in ``image`` and return its padded bounding box.

    Args:
        image: Frame as an array; 2-D or single-channel input is converted to
            3 channels before detection, otherwise BGR channel order is assumed
            (it is converted BGR -> RGB for MediaPipe).
        face_mesh: FaceMesh detector whose ``process`` method is called.
        debug: Accepted for signature compatibility; not used here.

    Returns:
        ``((x, y, width, height), face_landmarks)`` for the first detected face,
        with the box enlarged by a 10% margin and clipped to the image, or
        ``None`` when no face is found.
    """
    is_grayscale = len(image.shape) == 2 or image.shape[2] == 1
    if is_grayscale:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(rgb_image)
    if not results.multi_face_landmarks:
        return None

    face_landmarks = results.multi_face_landmarks[0]
    img_h, img_w = image.shape[:2]

    # Landmarks are normalized; scale them to pixel coordinates
    xs = [int(point.x * img_w) for point in face_landmarks.landmark]
    ys = [int(point.y * img_h) for point in face_landmarks.landmark]

    left, right = max(0, min(xs)), min(img_w, max(xs))
    top, bottom = max(0, min(ys)), min(img_h, max(ys))

    # Margin is 10% of the longer box side, applied on every side and clipped
    margin = int(0.1 * max(right - left, bottom - top))
    box_x = max(left - margin, 0)
    box_y = max(top - margin, 0)
    box_w = min(right + margin, img_w) - box_x
    box_h = min(bottom + margin, img_h) - box_y

    return (box_x, box_y, box_w, box_h), face_landmarks

def extract_lip_region(image, face_mesh, debug=False):
    """Crop the mouth area out of ``image`` using the Face Mesh lip landmarks.

    Args:
        image: Frame to crop from.
        face_mesh: FaceMesh detector forwarded to ``detect_and_crop_face``.
        debug: Forwarded to ``detect_and_crop_face``.

    Returns:
        The image slice covering the bounding box of the ``LIPS_INDICES``
        landmarks plus a 20% margin (clipped to the image), or ``None`` when no
        face was detected.
    """
    detection = detect_and_crop_face(image, face_mesh, debug)
    if not detection:
        return None

    # Only the landmarks are needed; the face box itself is not used for the crop
    _, face_landmarks = detection
    img_h, img_w = image.shape[:2]

    # Pixel coordinates of every lip landmark
    xs, ys = [], []
    for landmark_idx in LIPS_INDICES:
        point = face_landmarks.landmark[landmark_idx]
        xs.append(int(point.x * img_w))
        ys.append(int(point.y * img_h))

    left, right = max(0, min(xs)), min(img_w, max(xs))
    top, bottom = max(0, min(ys)), min(img_h, max(ys))

    # Margin is 20% of the longer side of the lip box
    margin = int(0.2 * max(right - left, bottom - top))
    row_start, row_stop = max(top - margin, 0), min(bottom + margin, img_h)
    col_start, col_stop = max(left - margin, 0), min(right + margin, img_w)
    return image[row_start:row_stop, col_start:col_stop]

def process_video(video_path, face_mesh, target_frames=40, debug=False):
    """Sample frames from a video and return a fixed-length stack of 64x64 lip crops.

    Roughly five frames per second are sampled (every ``fps // 5``-th frame,
    at least every frame). Sampling stops once ``target_frames`` crops were
    collected; if fewer are found the stack is padded with all-zero frames.

    Args:
        video_path: Path of the video file.
        face_mesh: FaceMesh detector used to find the lips.
        target_frames: Number of frames in the returned stack.
        debug: Forwarded to ``extract_lip_region``.

    Returns:
        np.ndarray | None: float32 array of shape ``(target_frames, 64, 64, 3)``
        scaled to [0, 1], or ``None`` if the video could not be opened.

    NOTE(review): a video in which no lips are ever detected still returns an
    all-zero stack rather than ``None``, so callers cannot tell it apart from a
    successfully processed video.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None

    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS) // 5))

        for frame_idx in range(0, total_frames, frame_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break

            lip_region = extract_lip_region(frame, face_mesh, debug)
            if lip_region is not None and lip_region.size > 0:
                try:
                    resized = cv2.resize(lip_region, (64, 64))
                    frames.append(resized)
                except Exception:
                    # Skip crops that cannot be resized. Catching Exception (not a
                    # bare except) keeps Ctrl-C / KeyboardInterrupt working.
                    continue

            if len(frames) >= target_frames:
                break
    finally:
        # Always free the capture handle, even if lip extraction raised
        cap.release()

    # Pad if insufficient frames
    if len(frames) < target_frames:
        padding = [np.zeros((64, 64, 3), dtype=np.uint8)] * (target_frames - len(frames))
        frames.extend(padding)

    return np.array(frames[:target_frames], dtype=np.float32) / 255.0

def load_dataset(root_dir, debug=True):
    """Extract lip-frame stacks for every video under ``root_dir`` and report the outcome.

    Videos are read from ``<root_dir>/real2/<lang>/*.mp4`` (label 0) and
    ``<root_dir>/fake2/<lang>/*.mp4`` (label 1). Language folders and files are
    visited in sorted order so the sample order is reproducible; ``os.listdir``
    alone returns entries in arbitrary order.

    Args:
        root_dir: Dataset root containing the ``real2`` and ``fake2`` folders.
        debug: Forwarded to ``process_video``.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X, y)``. ``X`` is a dense float32
        array with one frame stack per processed video (previously an
        ``object`` array, which stores one boxed Python float per pixel and
        needs pickling to be saved); ``y`` holds the integer labels. Both are
        empty when nothing could be processed.
    """
    label_dirs = [('real', 'real2'), ('fake', 'fake2')]
    X, y = [], []
    stats = {
        'total': 0,
        'processed': 0,
        'skipped': 0,
        'reasons': {},
        'skipped_files': []
    }

    def _videos_by_language(label_path):
        """Return sorted (language folder, sorted .mp4 file names) pairs for one label folder."""
        pairs = []
        for lang in sorted(os.listdir(label_path)):
            lang_path = os.path.join(label_path, lang)
            if os.path.isdir(lang_path):
                videos = sorted(
                    f for f in os.listdir(lang_path)
                    if f.lower().endswith('.mp4')
                )
                pairs.append((lang, videos))
        return pairs

    # Scan the tree once; the same listing drives both the total and the processing
    listings = {}
    for _, label_dir in label_dirs:
        label_path = os.path.join(root_dir, label_dir)
        if os.path.exists(label_path):
            listings[label_dir] = _videos_by_language(label_path)
            stats['total'] += sum(len(videos) for _, videos in listings[label_dir])

    print(f"🔍 Found {stats['total']} total videos")

    # Process videos; the enumerate index doubles as the class label (0 real, 1 fake)
    face_mesh = initialize_face_mesh()
    try:
        for label, (label_name, label_dir) in enumerate(label_dirs):
            label_path = os.path.join(root_dir, label_dir)
            if label_dir not in listings:
                print(f"⚠️ Missing directory: {label_path}")
                stats['reasons']['missing_directory'] = stats['reasons'].get('missing_directory', 0) + 1
                continue

            for lang, video_files in listings[label_dir]:
                lang_path = os.path.join(label_path, lang)

                for video_file in tqdm(video_files, desc=f"Processing {label_name} ({lang})"):
                    video_path = os.path.join(lang_path, video_file)
                    try:
                        frames = process_video(video_path, face_mesh, debug=debug)
                        if frames is not None and len(frames) > 0:
                            X.append(frames)
                            y.append(label)
                            stats['processed'] += 1
                        else:
                            stats['skipped'] += 1
                            stats['reasons']['no_lip_frames'] = stats['reasons'].get('no_lip_frames', 0) + 1
                            stats['skipped_files'].append(video_path)
                    except Exception as e:
                        # Record the failure by exception type and keep going
                        reason = str(type(e).__name__)
                        stats['skipped'] += 1
                        stats['reasons'][reason] = stats['reasons'].get(reason, 0) + 1
                        stats['skipped_files'].append(video_path)
    finally:
        # Release the detector even if the loop is interrupted
        face_mesh.close()

    # Print final report
    print("\n📊 Processing Report:")
    print(f"✅ Successfully processed: {stats['processed']}/{stats['total']}")
    print(f"⚠️ Skipped: {stats['skipped']}/{stats['total']}")

    if stats['skipped'] > 0:
        print("\n📝 Skip Reasons:")
        for reason, count in stats['reasons'].items():
            print(f"- {reason}: {count}")

        print("\n🗑️ Top 5 Skipped Files:")
        for idx, f in enumerate(stats['skipped_files'][:5], 1):
            print(f"{idx}. {f}")
        if len(stats['skipped_files']) > 5:
            print(f"... and {len(stats['skipped_files'])-5} more")

    if stats['processed'] + stats['skipped'] == stats['total']:
        print("\n🎉 Successfully processed all videos!")
    else:
        missing = stats['total'] - (stats['processed'] + stats['skipped'])
        print(f"\n⚠️ Warning: {missing} videos unaccounted for - check file formats")

    # process_video returns equally shaped float32 stacks, so they form one dense array
    return np.asarray(X, dtype=np.float32), np.array(y)

def save_features_and_labels(features, labels, prefix, save_dir="/content/drive/MyDrive/PolyGlotFake2/processed_video"):
    """Store features and labels as a new, never-overwriting version in ``save_dir``.

    Files are named ``<prefix>_features_v<N>.npy`` and ``<prefix>_labels_v<N>.npy``
    where ``N`` is the smallest non-negative integer for which neither file
    exists yet. A short summary is printed after saving.

    Args:
        features (np.array): Processed video frames or features
        labels (np.array): Corresponding labels (1 is counted as fake, 0 as real)
        prefix (str): Prefix for filenames (e.g., 'train', 'val', 'test')
        save_dir (str): Directory to save files (created if missing)
    """
    os.makedirs(save_dir, exist_ok=True)

    def _paths_for(candidate):
        """Return the (features, labels) file paths for one version number."""
        return (
            os.path.join(save_dir, f'{prefix}_features_v{candidate}.npy'),
            os.path.join(save_dir, f'{prefix}_labels_v{candidate}.npy'),
        )

    # Advance until a version number is free for both files
    version = 0
    features_path, labels_path = _paths_for(version)
    while os.path.exists(features_path) or os.path.exists(labels_path):
        version += 1
        features_path, labels_path = _paths_for(version)

    np.save(features_path, features)
    np.save(labels_path, labels)

    fake_count = np.sum(labels==1)
    real_count = np.sum(labels==0)
    print(f"✅ Saved video processing results (version {version}):")
    print(f"   - Features shape: {features.shape} -> {features_path}")
    print(f"   - Labels shape: {labels.shape} -> {labels_path}")
    print(f"   - Total samples: {len(labels)} ({fake_count} fake, {real_count} real)")

def load_features_and_labels(prefix, version=None, save_dir="/content/drive/MyDrive/PolyGlotFake2/processed_video"):
    """Load a ``<prefix>_features_v<N>.npy`` / ``<prefix>_labels_v<N>.npy`` pair.

    Args:
        prefix (str): Prefix used when saving (e.g., 'train', 'val', 'test')
        version (int/str/None): Version number to load. ``'latest'`` or ``None``
            (the default) selects the highest version found in ``save_dir``.
        save_dir (str): Directory where files are saved

    Returns:
        tuple: (features, labels) as numpy arrays

    Raises:
        FileNotFoundError: If no version exists for ``prefix`` when the latest
            one is requested, or if either file of the requested version is
            missing.
    """
    if version is None or version == 'latest':
        # Parse the number between the fixed stem and the extension. Splitting on
        # '_v' breaks for prefixes that themselves contain '_v' (e.g. 'lip_video').
        stem = f'{prefix}_features_v'
        extension = '.npy'
        versions = []
        for name in os.listdir(save_dir):
            if name.startswith(stem) and name.endswith(extension):
                token = name[len(stem):-len(extension)]
                if token.isdecimal():
                    versions.append(int(token))
        if not versions:
            raise FileNotFoundError(f"No files found with prefix '{prefix}'")
        version = max(versions)

    features_path = os.path.join(save_dir, f'{prefix}_features_v{version}.npy')
    labels_path = os.path.join(save_dir, f'{prefix}_labels_v{version}.npy')

    if not (os.path.exists(features_path) and os.path.exists(labels_path)):
        raise FileNotFoundError(f"Couldn't find both feature and label files for version {version}")

    # SECURITY: allow_pickle=True executes arbitrary pickled code on load. It is
    # kept so object-dtype feature files can still be read; only load files you
    # created yourself.
    features = np.load(features_path, allow_pickle=True)
    labels = np.load(labels_path)

    print(f"📁 Loaded version {version}:")
    print(f"   - Features shape: {features.shape}")
    print(f"   - Labels shape: {labels.shape}")

    return features, labels

def save_dataset(X, y, output_dir="processed_data"):
    """Deprecated wrapper: prints a notice, then saves via ``save_features_and_labels``.

    Args:
        X: Features to save.
        y: Labels to save.
        output_dir: Directory passed on as ``save_dir``; files use the
            ``full_dataset`` prefix.
    """
    deprecation_notice = "⚠️ Note: save_dataset() is deprecated. Use save_features_and_labels() instead."
    print(deprecation_notice)
    save_features_and_labels(X, y, prefix='full_dataset', save_dir=output_dir)

if __name__ == "__main__":
    # Configuration
    DATASET_PATH = "/content/drive/MyDrive/PolyGlotFake2"
    OUTPUT_DIR = "/content/drive/MyDrive/PolyGlotFake2/processed_data"

    # Process dataset
    X, y = load_dataset(DATASET_PATH)

    # Save processed data
    if len(X) > 0:
        # Full dataset, in processing order
        save_features_and_labels(X, y, 'lip_features', OUTPUT_DIR)

        # Train/test split. Stratified and seeded like the split in the training
        # cell, so re-running this cell reproduces the same class-balanced files
        # instead of a different random partition each time.
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )
        save_features_and_labels(X_train, y_train, 'train', OUTPUT_DIR)
        save_features_and_labels(X_test, y_test, 'test', OUTPUT_DIR)
    else:
        print("❌ No data to save - processing failed")

🔍 Found 140 total videos


Processing real (ru): 100%|██████████| 10/10 [01:43<00:00, 10.32s/it]
Processing real (en): 100%|██████████| 10/10 [01:24<00:00,  8.46s/it]
Processing real (ja): 100%|██████████| 10/10 [01:32<00:00,  9.23s/it]
Processing real (fr): 100%|██████████| 10/10 [01:19<00:00,  7.99s/it]
Processing real (es): 100%|██████████| 10/10 [01:42<00:00, 10.23s/it]
Processing real (zh): 100%|██████████| 10/10 [01:08<00:00,  6.81s/it]
Processing real (ar): 100%|██████████| 10/10 [01:22<00:00,  8.28s/it]
Processing fake (to_zh): 100%|██████████| 10/10 [02:51<00:00, 17.19s/it]
Processing fake (to_ru): 100%|██████████| 10/10 [02:56<00:00, 17.62s/it]
Processing fake (to_ja): 100%|██████████| 10/10 [02:34<00:00, 15.40s/it]
Processing fake (to_fr): 100%|██████████| 10/10 [02:45<00:00, 16.54s/it]
Processing fake (to_es): 100%|██████████| 10/10 [02:47<00:00, 16.73s/it]
Processing fake (to_ar): 100%|██████████| 10/10 [01:56<00:00, 11.60s/it]
Processing fake (to_en): 100%|██████████| 10/10 [02:45<00:00, 16.54s/it]


📊 Processing Report:
✅ Successfully processed: 140/140
⚠️ Skipped: 0/140

🎉 Successfully processed all videos!
✅ Saved video processing results (version 1):
   - Features shape: (140, 40, 64, 64, 3) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/lip_features_features_v1.npy
   - Labels shape: (140,) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/lip_features_labels_v1.npy
   - Total samples: 140 (70 fake, 70 real)
✅ Saved video processing results (version 1):
   - Features shape: (112, 40, 64, 64, 3) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/train_features_v1.npy
   - Labels shape: (112,) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/train_labels_v1.npy
   - Total samples: 112 (56 fake, 56 real)
✅ Saved video processing results (version 1):
   - Features shape: (28, 40, 64, 64, 3) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/test_features_v1.npy
   - Labels shape: (28,) -> /content/drive/MyDrive/PolyGlotFake2/processed_data/test_labels_v

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, LSTM, Dense, Dropout, TimeDistributed, Flatten, concatenate
from tensorflow.keras.utils import Sequence
from google.colab import drive
import gc

# Mount Google Drive (the feature files loaded below live under /content/drive/MyDrive)
drive.mount('/content/drive')

# Define constants
MAX_TIME_STEPS = 1500  # time steps per audio sample; used for the audio Input shape
BATCH_SIZE = 4  # Adjust based on your RAM
NUM_EPOCHS = 30  # upper bound passed to model.fit; early stopping may end training sooner

# Updated Generator Class with proper output specification
class MultimodalDataGenerator(Sequence):
    """Keras ``Sequence`` yielding ``({"input_1": video, "input_2": audio}, labels)`` batches.

    Args:
        video_data: Array of video samples, indexable by an index array.
        audio_data: Array of audio samples aligned row-for-row with ``video_data``.
        labels: Array of labels aligned with the data arrays.
        batch_size: Number of samples per batch (the last batch may be smaller).
        shuffle: Reshuffle the sample order at the end of every epoch. Defaults
            to True (the previous behaviour). Pass False for generators used
            with ``model.predict``/evaluation so outputs stay in the order of
            ``labels``.
        **kwargs: Forwarded to the Keras base class.
    """

    def __init__(self, video_data, audio_data, labels, batch_size, shuffle=True, **kwargs):
        # Keras warns (``_warn_if_super_not_called``) when the base initializer is skipped
        super().__init__(**kwargs)
        self.video_data = video_data
        self.audio_data = audio_data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.labels))

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(inputs dict, labels)``, all cast to float32."""
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        batch_video = self.video_data[batch_indices].astype('float32')
        batch_audio = self.audio_data[batch_indices].astype('float32')
        batch_labels = self.labels[batch_indices].astype('float32')

        # Inputs are keyed by name so they are matched to the model's Input layers
        # by name rather than by position
        return {"input_1": batch_video, "input_2": batch_audio}, batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

# Load and process data
def load_and_process_data():
    """Load the saved audio and lip features from Drive and split them into train/test.

    Audio rows are the real features followed by the fake features, labelled 0
    and 1 respectively. The function asserts that the audio and visual sets
    have the same length and identical label sequences, then performs a
    stratified 80/20 split with a fixed seed.

    Returns:
        tuple: ``(video_train, audio_train, y_train, video_test, audio_test, y_test)``.

    NOTE(review): the assertions compare counts and labels only; they do not
    prove that row i of the audio set and row i of the visual set come from the
    same video — verify the ordering of both preprocessing runs.
    """
    print("Loading audio features...")
    real_audio = np.load('/content/drive/MyDrive/PolyGlotFake2/processed_audio_v1/real_audio1_features1.npy')
    fake_audio = np.load('/content/drive/MyDrive/PolyGlotFake2/processed_audio_v1/fake_audio1_features1.npy')

    print("Loading visual features...")
    visual_data = np.load('/content/drive/MyDrive/PolyGlotFake2/processed_data/lip_features_features_v0.npy', allow_pickle=True)
    visual_labels = np.load('/content/drive/MyDrive/PolyGlotFake2/processed_data/lip_features_labels_v0.npy', allow_pickle=True)

    # Stack real then fake, with matching 0/1 labels
    print("Combining features...")
    n_real, n_fake = len(real_audio), len(fake_audio)
    all_audio = np.concatenate([real_audio, fake_audio])
    all_labels = np.concatenate([np.zeros(n_real), np.ones(n_fake)])

    # The per-class arrays are no longer needed once concatenated
    del real_audio, fake_audio
    gc.collect()

    assert len(all_audio) == len(visual_data), "Data length mismatch"
    assert np.array_equal(all_labels, visual_labels), "Label mismatch"

    # Split sample indices once and apply them to every array
    print("Splitting data...")
    sample_ids = np.arange(len(all_labels))
    train_idx, test_idx = train_test_split(
        sample_ids,
        test_size=0.2,
        stratify=all_labels,
        random_state=42
    )

    splits = []
    for subset_idx in (train_idx, test_idx):
        splits.extend((visual_data[subset_idx], all_audio[subset_idx], all_labels[subset_idx]))
    return tuple(splits)

# Load the train/test arrays for both modalities
(video_train, audio_train, y_train,
 video_test, audio_test, y_test) = load_and_process_data()

# Wrap each split in a batch generator
train_generator = MultimodalDataGenerator(
    video_data=video_train, audio_data=audio_train, labels=y_train, batch_size=BATCH_SIZE
)
test_generator = MultimodalDataGenerator(
    video_data=video_test, audio_data=audio_test, labels=y_test, batch_size=BATCH_SIZE
)

# Model definition
def create_multimodal_model():
    """Build and compile the two-branch (video + audio) binary classifier.

    The video branch applies two TimeDistributed Conv2D/MaxPooling2D stages to
    each frame, flattens them and summarizes the sequence with an LSTM. The
    audio branch runs an LSTM followed by a Dense layer over the audio feature
    sequence. Both outputs are concatenated and fed to a small dense head with
    a sigmoid output.

    The video input shape is read from the global ``video_train``; inputs are
    named ``input_1`` (video) and ``input_2`` (audio).

    Returns:
        The compiled Keras ``Model`` (Adam, binary cross-entropy; accuracy,
        precision, recall and AUC metrics).
    """
    # Video branch
    frames_in = Input(shape=video_train.shape[1:], name='input_1')
    video_feat = frames_in
    for n_filters in (16, 32):
        video_feat = TimeDistributed(Conv2D(n_filters, (3,3), activation='relu'))(video_feat)
        video_feat = TimeDistributed(MaxPooling2D(2,2))(video_feat)
    video_feat = TimeDistributed(Flatten())(video_feat)
    video_feat = LSTM(64)(video_feat)
    video_branch = Model(frames_in, video_feat)

    # Audio branch
    audio_in = Input(shape=(MAX_TIME_STEPS, 1792), name='input_2')
    audio_feat = LSTM(128)(audio_in)
    audio_feat = Dense(64, activation='relu')(audio_feat)
    audio_branch = Model(audio_in, audio_feat)

    # Fusion head
    fused = concatenate([video_branch.output, audio_branch.output])
    head = Dense(32, activation='relu')(fused)
    head = Dropout(0.3)(head)
    prediction = Dense(1, activation='sigmoid')(head)

    model = Model(inputs=[video_branch.input, audio_branch.input], outputs=prediction)

    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=tracked_metrics
    )
    return model

# Create model
model = create_multimodal_model()
model.summary()

# Callbacks
# Stop when val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Keep on disk only the weights with the best val_accuracy seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5',
    save_best_only=True,
    monitor='val_accuracy'
)

# Train model with modified approach
print("Training model...")

# Option 1: Use the fit method with custom generators
# NOTE(review): the test split doubles as validation data, so early stopping and
# checkpoint selection are tuned on the same samples the final report is computed
# on — that report is therefore not an unbiased estimate.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=NUM_EPOCHS,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)


# Evaluation
print("\nEvaluating model...")
# test_generator serves samples in the order of its `indices`, which were
# reshuffled after every validation pass during training. Predictions therefore
# do not line up with y_test as stored; snapshot the serving order before
# predicting and reorder the ground truth to match.
eval_order = test_generator.indices.copy()
y_pred = model.predict(test_generator)
y_pred_classes = (y_pred > 0.5).astype(int).ravel()  # (n, 1) probabilities -> flat 0/1 labels
y_true = y_test[eval_order]

print("\nClassification Report:")
print(classification_report(y_true, y_pred_classes, target_names=['Real', 'Fake']))

# Save final model
model.save('polyglot_fake_detector.h5')
print("Model saved successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading audio features...
Loading visual features...
Combining features...
Splitting data...


Training model...


  self._warn_if_super_not_called()


Epoch 1/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.6631 - auc: 0.6820 - loss: 0.6625 - precision: 0.6949 - recall: 0.7054



[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 6s/step - accuracy: 0.6633 - auc: 0.6825 - loss: 0.6616 - precision: 0.6944 - recall: 0.7033 - val_accuracy: 0.7143 - val_auc: 0.9031 - val_loss: 0.5434 - val_precision: 0.8000 - val_recall: 0.5714
Epoch 2/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 6s/step - accuracy: 0.7037 - auc: 0.7348 - loss: 0.5809 - precision: 0.7123 - recall: 0.6973 - val_accuracy: 0.6071 - val_auc: 0.9107 - val_loss: 0.6045 - val_precision: 0.8000 - val_recall: 0.2857
Epoch 3/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.8374 - auc: 0.9340 - loss: 0.3912 - precision: 0.8796 - recall: 0.8390



[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 6s/step - accuracy: 0.8380 - auc: 0.9336 - loss: 0.3911 - precision: 0.8793 - recall: 0.8390 - val_accuracy: 0.8214 - val_auc: 0.9464 - val_loss: 0.3772 - val_precision: 0.9091 - val_recall: 0.7143
Epoch 4/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.8357 - auc: 0.9389 - loss: 0.3593 - precision: 0.7956 - recall: 0.9023



[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 6s/step - accuracy: 0.8355 - auc: 0.9383 - loss: 0.3595 - precision: 0.7962 - recall: 0.9007 - val_accuracy: 0.8571 - val_auc: 0.9694 - val_loss: 0.2734 - val_precision: 0.9167 - val_recall: 0.7857
Epoch 5/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 6s/step - accuracy: 0.9219 - auc: 0.9428 - loss: 0.2189 - precision: 0.9470 - recall: 0.9027 - val_accuracy: 0.8571 - val_auc: 0.9668 - val_loss: 0.2796 - val_precision: 1.0000 - val_recall: 0.7143
Epoch 6/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.9633 - auc: 0.9779 - loss: 0.1957 - precision: 0.9631 - recall: 0.9578



[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 6s/step - accuracy: 0.9624 - auc: 0.9781 - loss: 0.1955 - precision: 0.9619 - recall: 0.9574 - val_accuracy: 1.0000 - val_auc: 1.0000 - val_loss: 0.1354 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 7/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 6s/step - accuracy: 0.8857 - auc: 0.9594 - loss: 0.2565 - precision: 0.8607 - recall: 0.8940 - val_accuracy: 0.8929 - val_auc: 0.9847 - val_loss: 0.2151 - val_precision: 1.0000 - val_recall: 0.7857
Epoch 8/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 6s/step - accuracy: 0.9056 - auc: 0.9801 - loss: 0.2183 - precision: 0.9282 - recall: 0.8136 - val_accuracy: 0.8571 - val_auc: 0.9643 - val_loss: 0.3229 - val_precision: 1.0000 - val_recall: 0.7143
Epoch 9/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m




Classification Report:
              precision    recall  f1-score   support

        Real       0.43      0.43      0.43        14
        Fake       0.43      0.43      0.43        14

    accuracy                           0.43        28
   macro avg       0.43      0.43      0.43        28
weighted avg       0.43      0.43      0.43        28

Model saved successfully.


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report
import gc
import os

# Configuration
# Number of samples per batch handed to SafeDataGenerator.
BATCH_SIZE = 2
# Number of frames per clip (time axis of the model's video input).
SEQ_LENGTH = 40
# Height and width, in pixels, of each square video frame.
FRAME_SIZE = 64
# Number of time steps per audio feature sequence (time axis of the audio input).
MAX_TIME_STEPS = 1500
# Number of feature channels per audio time step.
AUDIO_FEAT_DIM = 1792

# Path configuration
# Locations of the pre-computed .npy arrays on the mounted Google Drive.
# Video features and labels are stored already split into train/test files;
# audio features are stored per class (real / fake) and are split when loaded.
PATHS = {
    'train_video': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_features_v1.npy',
    'train_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_labels_v1.npy',
    'test_video': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_features_v1.npy',
    'test_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_labels_v1.npy',
    'real_audio': '/content/drive/MyDrive/PolyGlotFake2/processed_audio_v1/real_audio1_features1.npy',
    'fake_audio': '/content/drive/MyDrive/PolyGlotFake2/processed_audio_v1/fake_audio1_features1.npy'
}

class SafeDataGenerator(tf.keras.utils.Sequence):
    """Keras batch generator yielding paired video/audio inputs and labels.

    Rows are fetched by index on demand, so the underlying arrays may be
    memory-mapped; only one batch is converted to float32 at a time.

    Args:
        video_data: Array-like indexed by sample on axis 0. Values are divided
            by 255 when a batch is built.
        audio_data: Array-like indexed by sample on axis 0.
        labels: 1-D array of per-sample labels.
        batch_size: Maximum number of samples per batch.
        shuffle: When True (default) the sample order is re-permuted at the
            end of every epoch. Pass False for evaluation data whose
            predictions will be compared against `labels` in their original
            order.
        **kwargs: Forwarded to the Keras base-class initializer.

    Raises:
        ValueError: If the three inputs do not contain the same number of
            samples.
    """

    def __init__(self, video_data, audio_data, labels, batch_size, shuffle=True, **kwargs):
        # The Keras base class expects subclasses to run its initializer.
        super().__init__(**kwargs)

        if not (len(video_data) == len(audio_data) == len(labels)):
            raise ValueError(
                "video_data, audio_data and labels must have the same number of samples; "
                f"got {len(video_data)}, {len(audio_data)} and {len(labels)}"
            )

        self.video_data = video_data
        self.audio_data = audio_data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(labels))

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch `idx` as ({'video_input', 'audio_input'}, labels)."""
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        # The same index list selects all three arrays, so inputs and labels
        # inside one batch always refer to the same samples.
        video_batch = self.video_data[batch_indices].astype('float32') / 255.0
        audio_batch = self.audio_data[batch_indices].astype('float32')
        labels_batch = self.labels[batch_indices].astype('float32')

        return {'video_input': video_batch, 'audio_input': audio_batch}, labels_batch

    def on_epoch_end(self):
        """Re-permute the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            self.indices = np.random.permutation(self.indices)

def create_optimized_model():
    """Build and compile the small two-branch (video + audio) classifier.

    The video branch applies a per-frame convolution, pooling and global
    averaging, then summarises the frame sequence with a GRU. The audio branch
    applies a 1-D convolution followed by global averaging. Both summaries are
    concatenated and passed through a dense layer to a single sigmoid output.

    Returns:
        A compiled Keras model named 'multimodal_model' whose inputs are named
        'video_input' and 'audio_input'.
    """
    # --- Video branch: per-frame CNN features, then a recurrent summary ---
    frames_in = layers.Input(
        name='video_input',
        shape=(SEQ_LENGTH, FRAME_SIZE, FRAME_SIZE, 3),
    )
    video_feat = layers.TimeDistributed(layers.Conv2D(filters=4, kernel_size=(3, 3)))(frames_in)
    video_feat = layers.TimeDistributed(layers.MaxPooling2D(pool_size=2))(video_feat)
    video_feat = layers.TimeDistributed(layers.GlobalAveragePooling2D())(video_feat)
    video_feat = layers.GRU(units=8)(video_feat)

    # --- Audio branch: temporal convolution averaged over time ---
    audio_in = layers.Input(
        name='audio_input',
        shape=(MAX_TIME_STEPS, AUDIO_FEAT_DIM),
    )
    audio_feat = layers.Conv1D(filters=8, kernel_size=3, activation='relu')(audio_in)
    audio_feat = layers.GlobalAveragePooling1D()(audio_feat)

    # --- Fusion head ---
    merged = layers.concatenate([video_feat, audio_feat])
    hidden = layers.Dense(units=8, activation='relu')(merged)
    prediction = layers.Dense(units=1, activation='sigmoid')(hidden)

    classifier = models.Model(
        name='multimodal_model',
        inputs=[frames_in, audio_in],
        outputs=prediction,
    )

    adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
    tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
    classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)
    return classifier

def load_and_prepare_data():
    """Load the video features, labels and audio features listed in PATHS.

    Video and audio feature files are opened memory-mapped; the label files
    are loaded fully. The real and fake audio arrays are stacked (real first)
    and then cut into a train part and a test part whose sizes are taken from
    the label arrays.

    Returns:
        ((train_video, train_audio, train_labels),
         (test_video, test_audio, test_labels))

    Raises:
        ValueError: If the video, label and audio row counts are inconsistent.
    """
    # Load all data with memory mapping
    train_video = np.load(PATHS['train_video'], mmap_mode='r')
    test_video = np.load(PATHS['test_video'], mmap_mode='r')
    train_labels = np.load(PATHS['train_labels'])
    test_labels = np.load(PATHS['test_labels'])

    real_audio = np.load(PATHS['real_audio'], mmap_mode='r')
    fake_audio = np.load(PATHS['fake_audio'], mmap_mode='r')

    # Split sizes follow the label files instead of a hard-coded sample count.
    n_train, n_test = len(train_labels), len(test_labels)
    if len(train_video) != n_train or len(test_video) != n_test:
        raise ValueError(
            f"Video/label row mismatch: train {len(train_video)} vs {n_train}, "
            f"test {len(test_video)} vs {n_test}"
        )

    # np.concatenate materialises both memory-mapped audio arrays in RAM.
    all_audio = np.concatenate([real_audio, fake_audio])
    if len(all_audio) != n_train + n_test:
        raise ValueError(
            f"Audio row count {len(all_audio)} does not match "
            f"{n_train} train + {n_test} test samples"
        )

    # NOTE(review): the audio rows are ordered "all real, then all fake" in
    # extraction order. Slicing by position only pairs them with the right
    # video/label rows if the video files were written in that same order;
    # how the v1 video split was produced is not visible here - verify.
    train_audio = all_audio[:n_train]
    test_audio = all_audio[n_train:]

    return (train_video, train_audio, train_labels), (test_video, test_audio, test_labels)

def train_model():
    """Train the multimodal model and print a classification report.

    Steps: reset Keras state, load the data, wrap it in generators, fit with
    early stopping and best-model checkpointing, reload the best weights and
    evaluate on the test generator.

    A KeyboardInterrupt during fitting saves the current model to
    'interrupted_model.keras' and returns without evaluating.
    """
    tf.keras.backend.clear_session()
    gc.collect()

    # Load data with proper alignment
    (train_vid, train_aud, train_lbl), (test_vid, test_aud, test_lbl) = load_and_prepare_data()

    # Verify shapes
    print("Data Shapes:")
    print(f"Train Video: {train_vid.shape} | Audio: {train_aud.shape} | Labels: {train_lbl.shape}")
    print(f"Test Video: {test_vid.shape} | Audio: {test_aud.shape} | Labels: {test_lbl.shape}")

    # Create generators
    train_gen = SafeDataGenerator(train_vid, train_aud, train_lbl, BATCH_SIZE)
    test_gen = SafeDataGenerator(test_vid, test_aud, test_lbl, BATCH_SIZE)

    # Create model
    model = create_optimized_model()
    model.summary()

    try:
        model.fit(
            train_gen,
            validation_data=test_gen,
            epochs=50,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(patience=3),
                tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
            ],
            verbose=1
        )
    except KeyboardInterrupt:
        print("\nTraining interrupted. Saving partial model...")
        model.save('interrupted_model.keras')
        return

    # Final evaluation
    model.load_weights('best_model.keras')

    # SafeDataGenerator re-permutes its index order in on_epoch_end, so the
    # order in which test_gen yields samples is not guaranteed to match
    # test_lbl. Take the labels from the very batches that are scored so each
    # prediction is compared with its own label.
    true_chunks, prob_chunks = [], []
    for batch_idx in range(len(test_gen)):
        batch_inputs, batch_labels = test_gen[batch_idx]
        prob_chunks.append(np.ravel(model.predict_on_batch(batch_inputs)))
        true_chunks.append(np.ravel(batch_labels))

    y_true = np.concatenate(true_chunks).astype(int)
    y_pred = (np.concatenate(prob_chunks) > 0.5).astype(int)
    print(classification_report(y_true, y_pred))

if __name__ == "__main__":
    train_model()

Data Shapes:
Train Video: (112, 40, 64, 64, 3) | Audio: (112, 1500, 1792) | Labels: (112,)
Test Video: (28, 40, 64, 64, 3) | Audio: (28, 1500, 1792) | Labels: (28,)


Epoch 1/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 398ms/step - accuracy: 0.5216 - auc: 0.4599 - loss: 0.8490 - val_accuracy: 0.4286 - val_auc: 0.3980 - val_loss: 0.7189
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 323ms/step - accuracy: 0.4861 - auc: 0.5464 - loss: 0.6926 - val_accuracy: 0.5000 - val_auc: 0.4209 - val_loss: 0.7160
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 309ms/step - accuracy: 0.4871 - auc: 0.5432 - loss: 0.6843 - val_accuracy: 0.4286 - val_auc: 0.3929 - val_loss: 0.7330
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 346ms/step - accuracy: 0.5999 - auc: 0.6890 - loss: 0.6515 - val_accuracy: 0.3929 - val_auc: 0.3929 - val_loss: 0.7349
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 331ms/step - accuracy: 0.5820 - auc: 0.6848 - loss: 0.6706 - val_accuracy: 0.4286 - val_auc: 0.4082 - val_loss: 0.7471
[1m14/14[0m [32m━━━━━━

OPTION 2

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os
import librosa
import torch
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from transformers import (
    Wav2Vec2Processor, Wav2Vec2Model,
    WhisperFeatureExtractor, WhisperModel
)

# --- Configuration ---
# Sampling rate (Hz) every clip is resampled to before feature extraction.
SAMPLE_RATE = 16000
# Every clip is zero-padded or truncated to exactly this duration.
MAX_AUDIO_LENGTH = 30  # seconds
# Number of time steps in the final per-clip feature matrix.
MAX_TIME_STEPS = 1500
# Hugging Face checkpoint whose encoder supplies the Whisper features.
WHISPER_MODEL_ID = "openai/whisper-small"

# Language code -> Hugging Face Wav2Vec2 checkpoint used for clips in that
# language. The keys must match the language folder names under the real/fake
# directories (after the 'to_' marker is removed for fake folders).
LANG_TO_WAV2VEC2 = {
    'en': 'facebook/wav2vec2-large-robust-ft-swbd-300h',
    'fr': 'facebook/wav2vec2-large-xlsr-53-french',
    'ru': 'anton-l/wav2vec2-large-xlsr-53-russian',
    'es': 'facebook/wav2vec2-large-xlsr-53-spanish',
    'zh': 'jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn',
    'ja': 'jonatasgrosman/wav2vec2-large-xlsr-53-japanese',
    'ar': 'jonatasgrosman/wav2vec2-large-xlsr-53-arabic'
}

# --- Initialization ---
def init_processors_and_models(languages):
    """Load every pretrained component needed for audio feature extraction.

    Args:
        languages: Iterable of language codes; each must be a key of
            LANG_TO_WAV2VEC2.

    Returns:
        A 4-tuple: (dict of Wav2Vec2 processors keyed by language, dict of
        Wav2Vec2 models keyed by language, the Whisper feature extractor, the
        encoder module of the Whisper model).
    """
    processors_by_lang, encoders_by_lang = {}, {}

    # Processor and model are loaded back-to-back from one checkpoint per language.
    for code in languages:
        checkpoint = LANG_TO_WAV2VEC2[code]
        processors_by_lang[code] = Wav2Vec2Processor.from_pretrained(checkpoint)
        encoders_by_lang[code] = Wav2Vec2Model.from_pretrained(checkpoint)

    # Only the encoder half of Whisper is kept.
    feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_MODEL_ID)
    whisper_encoder = WhisperModel.from_pretrained(WHISPER_MODEL_ID).encoder

    return processors_by_lang, encoders_by_lang, feature_extractor, whisper_encoder

# --- Audio Extraction ---
def extract_audio_from_video(video_path):
    """Write the audio track of a video to a temporary mono WAV file.

    The audio is resampled to SAMPLE_RATE, reduced to one channel, and then
    padded with silence or truncated so that it lasts MAX_AUDIO_LENGTH * 1000
    milliseconds. The file is placed in a 'temp' folder under the current
    working directory.

    Args:
        video_path: Path of the media file to read.

    Returns:
        Path of the exported WAV file, or None if anything went wrong (the
        error is printed).
    """
    try:
        scratch_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(scratch_dir, exist_ok=True)
        wav_path = os.path.join(scratch_dir, f"temp_{os.path.basename(video_path)}.wav")

        track = AudioSegment.from_file(video_path)
        track = track.set_frame_rate(SAMPLE_RATE).set_channels(1)

        # Force a fixed duration: pad short clips, cut long ones.
        wanted_ms = MAX_AUDIO_LENGTH * 1000
        shortfall_ms = wanted_ms - len(track)
        if shortfall_ms > 0:
            track = track + AudioSegment.silent(duration=shortfall_ms)
        else:
            track = track[:wanted_ms]

        track.export(wav_path, format="wav")
    except Exception as e:
        print(f"Error in audio extraction: {str(e)}")
        return None
    return wav_path

# --- Feature Extraction ---
def extract_audio_features(path, lang, wav2vec2_processors, wav2vec2_models,
                           whisper_fe, whisper_model):
    """Compute the combined per-time-step audio feature matrix for one clip.

    Three feature families are computed from the same fixed-length waveform
    (Wav2Vec2 hidden states, Whisper encoder hidden states, and MFCC + chroma
    + spectral-contrast), cut to their common number of time steps, joined
    along the feature axis, and finally padded with zeros or truncated to
    MAX_TIME_STEPS rows.

    Args:
        path: Audio file, or an '.mp4' video whose audio is first extracted
            to a temporary WAV file.
        lang: Language code selecting the Wav2Vec2 processor/model.
        wav2vec2_processors: Mapping language code -> Wav2Vec2 processor.
        wav2vec2_models: Mapping language code -> Wav2Vec2 model.
        whisper_fe: Whisper feature extractor.
        whisper_model: Whisper encoder.

    Returns:
        Array with MAX_TIME_STEPS rows, or None if extraction failed (the
        error is printed). A temporary WAV file, if one was created, is
        always deleted.
    """
    temp_file = None
    try:
        # Videos go through a temporary WAV; anything else is read directly.
        audio_path = path
        if path.lower().endswith('.mp4'):
            temp_file = extract_audio_from_video(path)
            if not (temp_file and os.path.exists(temp_file)):
                return None
            audio_path = temp_file

        waveform, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

        # Fix the waveform length to exactly SAMPLE_RATE * MAX_AUDIO_LENGTH samples.
        n_target = SAMPLE_RATE * MAX_AUDIO_LENGTH
        n_missing = n_target - len(waveform)
        if n_missing > 0:
            waveform = np.pad(waveform, (0, n_missing))
        else:
            waveform = waveform[:n_target]

        # Wav2Vec2
        print(f"[{lang}] Wav2Vec2 extracting...")
        w2v_inputs = wav2vec2_processors[lang](waveform, return_tensors="pt", sampling_rate=SAMPLE_RATE)
        with torch.no_grad():
            w2v_hidden = wav2vec2_models[lang](**w2v_inputs).last_hidden_state
            wav_features = w2v_hidden.squeeze(0).numpy()

        # Whisper
        print(f"[{lang}] Whisper extracting...")
        whisper_inputs = whisper_fe(waveform, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features
        with torch.no_grad():
            whisper_hidden = whisper_model(whisper_inputs).last_hidden_state
            whisper_features = whisper_hidden.squeeze(0).numpy()

        # Spectral: hop chosen so the frame count is close to MAX_TIME_STEPS
        print(f"[{lang}] Spectral features extracting...")
        hop = int(len(waveform) / MAX_TIME_STEPS)
        spectral_parts = [
            librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13, hop_length=hop),
            librosa.feature.chroma_stft(y=waveform, sr=sr, hop_length=hop),
            librosa.feature.spectral_contrast(y=waveform, sr=sr, hop_length=hop),
        ]
        spectral = np.vstack(spectral_parts).T

        # Cut all three families to the shortest one, then join feature-wise.
        n_common = min(wav_features.shape[0], whisper_features.shape[0], spectral.shape[0])
        fused = np.hstack([
            wav_features[:n_common],
            whisper_features[:n_common],
            spectral[:n_common],
        ])

        # Pad with zero rows or truncate to the fixed number of time steps.
        n_short = MAX_TIME_STEPS - fused.shape[0]
        if n_short > 0:
            zero_rows = np.zeros((n_short, fused.shape[1]))
            fused = np.concatenate([fused, zero_rows], axis=0)
        else:
            fused = fused[:MAX_TIME_STEPS]

        return fused

    except Exception as e:
        print(f"Feature extraction error: {str(e)}")
        return None
    finally:
        # Clean up the temporary WAV regardless of success or failure.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)

# --- Dataset Processor ---
def process_directory(base_dir, wav2vec2_processors, wav2vec2_models, whisper_fe, whisper_model, is_fake=False):
    """Extract audio features for every '.mp4' in each language sub-folder.

    Folders and files are visited in sorted name order so that the row order
    of the returned arrays is the same on every run (os.listdir itself makes
    no ordering guarantee).

    Args:
        base_dir: Directory containing one sub-folder per language. For fake
            data the sub-folders are named 'to_<lang>'.
        wav2vec2_processors: Mapping language code -> Wav2Vec2 processor.
        wav2vec2_models: Mapping language code -> Wav2Vec2 model.
        whisper_fe: Whisper feature extractor.
        whisper_model: Whisper encoder.
        is_fake: True labels every clip 1 (fake); False labels it 0 (real).

    Returns:
        (features, labels) as NumPy arrays with one row per successfully
        processed clip. Clips whose extraction failed are reported and
        omitted.
    """
    features = []
    labels = []
    label = 1 if is_fake else 0
    fake_marker = 'to_'

    for folder in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        # Remove only a leading 'to_' marker; leave the rest of the name intact.
        if is_fake and folder.startswith(fake_marker):
            lang = folder[len(fake_marker):]
        else:
            lang = folder
        print(f"\nProcessing folder: {folder_path} | Lang: {lang}")

        for file in sorted(os.listdir(folder_path)):
            if not file.lower().endswith('.mp4'):
                continue
            file_path = os.path.join(folder_path, file)
            feat = extract_audio_features(file_path, lang, wav2vec2_processors, wav2vec2_models, whisper_fe, whisper_model)
            if feat is not None and feat.shape[0] == MAX_TIME_STEPS:
                features.append(feat)
                labels.append(label)
            else:
                # A dropped clip shifts every later row, so make it visible.
                print(f"Skipped (no usable features): {file_path}")

    return np.array(features), np.array(labels)

# --- Save to Drive ---
def save_features_and_labels(features, labels, prefix, save_dir):
    """Write a feature array and its label array as two .npy files.

    Creates `save_dir` if needed, then writes '<prefix>_features.npy' and
    '<prefix>_labels.npy' inside it (overwriting existing files) and prints
    both paths.

    Args:
        features: Array of features to store.
        labels: Array of labels to store.
        prefix: File-name prefix shared by both files.
        save_dir: Destination directory.
    """
    os.makedirs(save_dir, exist_ok=True)

    feature_file, label_file = (
        os.path.join(save_dir, file_name)
        for file_name in (f"{prefix}_features.npy", f"{prefix}_labels.npy")
    )
    for destination, payload in ((feature_file, features), (label_file, labels)):
        np.save(destination, payload)

    print(f"\nSaved: {feature_file} | {label_file}")

# --- Main Pipeline ---
# Entry point of the audio cell: check the dataset folders, load all models,
# extract features for the real and the fake clips, and save both result sets.
if __name__ == "__main__":
    # Language codes to load a Wav2Vec2 processor/model for.
    languages = ['en', 'fr', 'ru', 'es', 'zh', 'ja', 'ar']

    # Dataset locations on the mounted Google Drive
    base_path = '/content/drive/MyDrive/PolyGlotFake2'
    real_dir = os.path.join(base_path, 'real2')
    fake_dir = os.path.join(base_path, 'fake2')
    save_dir = os.path.join(base_path, 'processed_audio_v2')

    # 1. Fail fast, before the slow model loading, if an input folder is missing
    for path in [real_dir, fake_dir]:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Directory not found: {path}\n"
                                  "Please verify:\n"
                                  "1. Google Drive is properly mounted\n"
                                  "2. Folder names match exactly\n"
                                  "3. Data is organized correctly")

    # 2. Initialize models after verifying paths
    print("Loading models...")
    wav2vec2_processors, wav2vec2_models, whisper_fe, whisper_model = init_processors_and_models(languages)

    # 3. Extract features; is_fake selects both the label and the folder naming
    print("\n--- Processing Real Data ---")
    real_features, real_labels = process_directory(
        real_dir,
        wav2vec2_processors,
        wav2vec2_models,
        whisper_fe,
        whisper_model,
        is_fake=False
    )

    print("\n--- Processing Fake Data ---")
    fake_features, fake_labels = process_directory(
        fake_dir,
        wav2vec2_processors,
        wav2vec2_models,
        whisper_fe,
        whisper_model,
        is_fake=True
    )

    # 4. Save each class to its own pair of .npy files
    save_features_and_labels(real_features, real_labels, 'real_audio2', save_dir)
    save_features_and_labels(fake_features, fake_labels, 'fake_audio2', save_dir)

    print(f"\n[Done] Real: {real_features.shape}, Fake: {fake_features.shape}")


Loading models...

--- Processing Real Data ---

Processing folder: /content/drive/MyDrive/PolyGlotFake2/real2/ru | Lang: ru
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extracting...
[ru] Spectral features extracting...
[ru] Wav2Vec2 extracting...
[ru] Whisper extrac

In [None]:
import os
import cv2
import numpy as np
import mediapipe as mp
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Configuration paths
# Locations of processed video features/labels on the mounted Google Drive.
# Within the code visible in this cell, PATHS is only read to derive the
# default `save_dir` of load_features_and_labels() (the parent folder of
# 'train_video').
PATHS = {
    'train_video': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_features_v2.npy',
    'train_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_labels_v2.npy',
    'test_video': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_features_v2.npy',
    'test_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_labels_v2.npy',
}

# MediaPipe Face Mesh initialization
mp_face_mesh = mp.solutions.face_mesh
# Positions in the face-mesh landmark list that VideoProcessor uses as the lips.
# Presumably the outer lip contour followed by the inner contour - TODO confirm
# against the MediaPipe landmark map.
LIPS_INDICES = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291,
    375, 321, 405, 314, 17, 84, 181, 91, 146,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308,
    324, 318, 402, 317, 14, 87, 178, 88, 95
]

class VideoProcessor:
    """Turns a video into a fixed-length stack of lip crops plus optical flow.

    For each sampled frame the face mesh is located, the mouth area is cropped
    and rescaled to 64x64, and dense optical flow is computed against the
    previous crop. The result per video is an array of shape
    (target_frames, 64, 64, 5): three colour channels scaled to [0, 1]
    followed by two flow channels divided by 64.
    """

    def __init__(self):
        # Video (tracking) mode, at most one face per frame.
        self.face_mesh = mp_face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        # Keyword arguments forwarded to cv2.calcOpticalFlowFarneback.
        self.flow_params = {
            'pyr_scale': 0.5,
            'levels': 3,
            'winsize': 15,
            'iterations': 3,
            'poly_n': 5,
            'poly_sigma': 1.2,
            'flags': 0
        }

    def extract_lip_region(self, frame, face_landmarks):
        """Crop the mouth area (with a 20% margin) and rescale it to 64x64.

        Args:
            frame: Image array whose first two axes are height and width.
            face_landmarks: Face-mesh result exposing `.landmark`, a sequence
                of points with normalised `.x` / `.y` coordinates.

        Returns:
            A 64x64 image of the mouth area, or a 64x64x3 uint8 array of zeros
            if the region is empty or any step fails.
        """
        try:
            frame_h, frame_w = frame.shape[:2]
            all_points = np.array([(lm.x * frame_w, lm.y * frame_h) for lm in face_landmarks.landmark])
            lip_points = all_points[LIPS_INDICES].astype(np.int32)

            # Bounding box of the lip landmarks, enlarged by a margin on every
            # side and clipped to the frame.
            box_x, box_y, box_w, box_h = cv2.boundingRect(lip_points)
            margin = int(0.2 * max(box_w, box_h))
            left = max(0, box_x - margin)
            top = max(0, box_y - margin)
            right = min(frame_w, box_x + box_w + margin)
            bottom = min(frame_h, box_y + box_h + margin)
            if right <= left or bottom <= top:
                return np.zeros((64, 64, 3), dtype=np.uint8)

            # Map the four box corners onto the output square. Warping from the
            # box corners (rather than from four individual lip landmarks that
            # lie almost on one line) keeps the transform well-conditioned.
            src_points = np.array(
                [[left, top], [right, top], [right, bottom], [left, bottom]],
                dtype=np.float32
            )
            dst_points = np.array([[0, 0], [64, 0], [64, 64], [0, 64]], dtype=np.float32)
            M = cv2.getPerspectiveTransform(src_points, dst_points)
            return cv2.warpPerspective(frame, M, (64, 64))
        except Exception:
            # Best effort: a failed crop becomes a blank frame instead of aborting the video.
            return np.zeros((64, 64, 3), dtype=np.uint8)

    def calculate_optical_flow(self, prev_frame, curr_frame):
        """Return Farneback optical flow between two BGR frames.

        Falls back to a 64x64x2 float32 array of zeros if the computation fails.
        """
        try:
            prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
            curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
            return cv2.calcOpticalFlowFarneback(prev_gray, curr_gray, None, **self.flow_params)
        except Exception:
            return np.zeros((64, 64, 2), dtype=np.float32)

    def process_video(self, video_path, target_frames=40):
        """Extract `target_frames` lip crops with optical flow from one video.

        Frames are sampled every `fps // 5` frames (at least every frame).
        Frames without a detected face are skipped. If fewer than
        `target_frames` usable frames are found, the last usable one is
        repeated; if none are found the result is all zeros.

        Args:
            video_path: Path of the video file.
            target_frames: Number of frames in the returned array.

        Returns:
            float32 array of shape (target_frames, 64, 64, 5). An all-zero
            array is returned when the video cannot be opened or processing
            raises (the error is printed).
        """
        # Bound before the try block so `finally` never sees an undefined name
        # when creating the capture itself fails.
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                return np.zeros((target_frames, 64, 64, 5), dtype=np.float32)

            frames = []
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS) // 5))
            prev_lip = None  # No previous crop until the first face is found

            for frame_idx in range(0, total_frames, frame_interval):
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if not ret:
                    continue

                # Up to three detection passes, turning the frame by 180 degrees
                # after each miss: original, upside-down, original again.
                results = None
                for _ in range(3):
                    results = self.face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    if results.multi_face_landmarks:
                        break
                    frame = cv2.rotate(frame, cv2.ROTATE_180)

                if not results or not results.multi_face_landmarks:
                    continue

                try:
                    face_landmarks = results.multi_face_landmarks[0]
                except (IndexError, TypeError):
                    continue

                # `frame` is in the orientation in which the face was found,
                # so the landmarks and the crop refer to the same image.
                lip_region = self.extract_lip_region(frame, face_landmarks)

                if prev_lip is not None:
                    flow = self.calculate_optical_flow(prev_lip, lip_region)
                else:
                    flow = np.zeros((64, 64, 2), dtype=np.float32)

                frames.append((lip_region.astype(np.float32)/255.0, flow/64.0))
                prev_lip = lip_region.copy()  # Copy so later edits cannot alias the stored crop

                if len(frames) >= target_frames:
                    break

            if not frames:
                print(f"Warning: no usable face frames in {video_path}; returning zeros")

            # Pad by repeating the last usable frame (zeros if there is none).
            if len(frames) < target_frames:
                blank = (
                    np.zeros((64, 64, 3), dtype=np.float32),
                    np.zeros((64, 64, 2), dtype=np.float32)
                )
                last_valid = frames[-1] if frames else blank
                frames.extend([last_valid] * (target_frames - len(frames)))

            # Stack colour and flow channels: (target_frames, 64, 64, 3 + 2)
            rgb = np.array([f[0] for f in frames[:target_frames]])
            flow = np.array([f[1] for f in frames[:target_frames]])
            return np.concatenate([rgb, flow], axis=-1)

        except Exception as e:
            print(f"Error processing {video_path}: {str(e)}")
            return np.zeros((target_frames, 64, 64, 5), dtype=np.float32)
        finally:
            if cap is not None:
                cap.release()

def load_dataset(root_dir, debug=True):
    """Run VideoProcessor over every video under root_dir/real2 and root_dir/fake2.

    Language folders and video files are visited in sorted name order so the
    row order of the returned arrays is the same on every run (os.listdir
    itself makes no ordering guarantee).

    Args:
        root_dir: Dataset root containing the 'real2' and 'fake2' folders,
            each holding one sub-folder per language.
        debug: Currently unused; kept so existing calls keep working.

    Returns:
        (X, y): X is a float32 array with one entry per processed video and
        y holds the labels (0 = real, 1 = fake).
    """
    processor = VideoProcessor()
    X, y = [], []
    stats = {
        'total': 0, 'processed': 0, 'skipped': 0,
        'reasons': {}, 'skipped_files': []
    }
    class_folders = [('real', 'real2'), ('fake', 'fake2')]

    def _bump(reason):
        # Count one more occurrence of a skip reason.
        stats['reasons'][reason] = stats['reasons'].get(reason, 0) + 1

    # Count total videos
    for _, folder in class_folders:
        label_path = os.path.join(root_dir, folder)
        if os.path.exists(label_path):
            stats['total'] += sum(
                len(video_files) for _, _, video_files in _list_language_videos(label_path)
            )

    print(f"🔍 Found {stats['total']} total videos")

    # Process videos; the position in class_folders is the label (0 real, 1 fake)
    for label_idx, (label_name, folder) in enumerate(class_folders):
        label_path = os.path.join(root_dir, folder)
        if not os.path.exists(label_path):
            print(f"⚠️ Missing directory: {label_path}")
            _bump('missing_directory')
            continue

        for lang, lang_path, video_files in _list_language_videos(label_path):
            for video_file in tqdm(video_files, desc=f"Processing {label_name} ({lang})"):
                video_path = os.path.join(lang_path, video_file)
                try:
                    features = processor.process_video(video_path)
                    if features is not None:
                        X.append(features)
                        y.append(label_idx)
                        stats['processed'] += 1
                    else:
                        stats['skipped'] += 1
                        _bump('processing_failed')
                        stats['skipped_files'].append(video_path)
                except Exception as e:
                    stats['skipped'] += 1
                    _bump(str(type(e).__name__))
                    stats['skipped_files'].append(video_path)

    # Print final report
    print("\n📊 Processing Report:")
    print(f"✅ Successfully processed: {stats['processed']}/{stats['total']}")
    print(f"⚠️ Skipped: {stats['skipped']}/{stats['total']}")

    if stats['skipped'] > 0:
        print("\n📝 Skip Reasons:")
        for reason, count in stats['reasons'].items():
            print(f"- {reason}: {count}")

    return np.array(X, dtype=np.float32), np.array(y)


def _list_language_videos(label_path):
    """Return [(lang, lang_path, sorted .mp4 names)] for each language folder, sorted by name."""
    listing = []
    for lang in sorted(os.listdir(label_path)):
        lang_path = os.path.join(label_path, lang)
        if not os.path.isdir(lang_path):
            continue
        video_files = sorted(f for f in os.listdir(lang_path) if f.lower().endswith('.mp4'))
        listing.append((lang, lang_path, video_files))
    return listing

def save_features_and_labels(features, labels, prefix, save_dir="/content/drive/MyDrive/PolyGlotFake2/processed_data"):
    """Save features and labels under the first unused version number.

    Files are named '<prefix>_features_v<N>.npy' and '<prefix>_labels_v<N>.npy'.
    N starts at 0 and increases until neither file exists for that N, so
    existing files are never overwritten. A short summary is printed.

    Args:
        features: Array of features to store.
        labels: Array of labels to store (0 = real, 1 = fake in the summary).
        prefix: File-name prefix shared by both files.
        save_dir: Destination directory; created if missing.
    """
    os.makedirs(save_dir, exist_ok=True)

    def _paths_for(version):
        # Candidate feature/label file names for one version number.
        return (
            os.path.join(save_dir, f'{prefix}_features_v{version}.npy'),
            os.path.join(save_dir, f'{prefix}_labels_v{version}.npy'),
        )

    # Advance until both candidate names are free.
    version = 0
    features_path, labels_path = _paths_for(version)
    while os.path.exists(features_path) or os.path.exists(labels_path):
        version += 1
        features_path, labels_path = _paths_for(version)

    for destination, payload in ((features_path, features), (labels_path, labels)):
        np.save(destination, payload)

    print(f"✅ Saved version {version}:")
    print(f"Features: {features_path} ({features.shape})")
    print(f"Labels: {labels_path} ({labels.shape})")
    print(f"Class balance: {np.sum(labels==0)} real, {np.sum(labels==1)} fake")

def load_features_and_labels(prefix, version=None, save_dir=PATHS['train_video'].rsplit('/', 1)[0]):
    """Load a saved '<prefix>_features_v<N>.npy' / '<prefix>_labels_v<N>.npy' pair.

    Args:
        prefix: File-name prefix used when the arrays were saved.
        version: Integer version to load. None (the default) or 'latest'
            selects the highest version found in `save_dir`, falling back to 0
            when no matching feature file exists.
        save_dir: Directory to read from. The default is fixed when the
            function is defined, from the folder of PATHS['train_video'].

    Returns:
        (features, labels) as NumPy arrays.

    Raises:
        FileNotFoundError: If the selected files (or `save_dir`) do not exist.
    """
    if version is None or version == 'latest':
        stem, extension = f'{prefix}_features_v', '.npy'
        versions = []
        for file_name in os.listdir(save_dir):
            if not (file_name.startswith(stem) and file_name.endswith(extension)):
                continue
            # Take the text between the fixed stem and '.npy', so a prefix
            # that itself contains '_v' cannot confuse the parse.
            tag = file_name[len(stem):-len(extension)]
            if tag.isdigit():
                versions.append(int(tag))
        version = max(versions) if versions else 0

    features_path = os.path.join(save_dir, f'{prefix}_features_v{version}.npy')
    labels_path = os.path.join(save_dir, f'{prefix}_labels_v{version}.npy')

    # NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only
    # load files from a trusted source.
    features = np.load(features_path, allow_pickle=True)
    labels = np.load(labels_path)

    print(f"📁 Loaded version {version}:")
    print(f"Features shape: {features.shape}")
    print(f"Labels shape: {labels.shape}")

    return features, labels

# Entry point of the video cell: build the full video dataset, save it, then
# save a stratified 80/20 train/test split.
if __name__ == "__main__":
    # Configuration
    DATASET_PATH = "/content/drive/MyDrive/PolyGlotFake2"
    OUTPUT_DIR = "/content/drive/MyDrive/PolyGlotFake2/processed_data"

    # Process dataset
    try:
        X, y = load_dataset(DATASET_PATH)

        # Save processed data
        if len(X) > 0:
            # Save full dataset
            save_features_and_labels(X, y, 'full', OUTPUT_DIR)

            # Save train/test splits. The fixed seed makes the same clips land
            # in train and test on every run, so results are reproducible.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )
            save_features_and_labels(X_train, y_train, 'train', OUTPUT_DIR)
            save_features_and_labels(X_test, y_test, 'test', OUTPUT_DIR)
        else:
            print("❌ No data to save - processing failed")

    except Exception as e:
        # Best effort: report the failure instead of leaving a raw traceback.
        print(f"❌ Critical error: {str(e)}")
        print("🔍 Verify dataset structure and file permissions")

🔍 Found 140 total videos


Processing real (ru): 100%|██████████| 10/10 [01:20<00:00,  8.01s/it]
Processing real (en): 100%|██████████| 10/10 [01:08<00:00,  6.82s/it]
Processing real (ja): 100%|██████████| 10/10 [01:12<00:00,  7.27s/it]
Processing real (fr): 100%|██████████| 10/10 [01:17<00:00,  7.77s/it]
Processing real (es): 100%|██████████| 10/10 [01:38<00:00,  9.82s/it]
Processing real (zh): 100%|██████████| 10/10 [01:07<00:00,  6.76s/it]
Processing real (ar): 100%|██████████| 10/10 [01:24<00:00,  8.47s/it]
Processing fake (to_zh): 100%|██████████| 10/10 [02:47<00:00, 16.74s/it]
Processing fake (to_ru): 100%|██████████| 10/10 [02:51<00:00, 17.12s/it]
Processing fake (to_ja): 100%|██████████| 10/10 [02:32<00:00, 15.28s/it]
Processing fake (to_fr): 100%|██████████| 10/10 [02:44<00:00, 16.49s/it]
Processing fake (to_es): 100%|██████████| 10/10 [02:43<00:00, 16.34s/it]
Processing fake (to_ar): 100%|██████████| 10/10 [01:49<00:00, 10.92s/it]
Processing fake (to_en): 100%|██████████| 10/10 [02:43<00:00, 16.34s/it]


📊 Processing Report:
✅ Successfully processed: 140/140
⚠️ Skipped: 0/140
✅ Saved version 0:
Features: /content/drive/MyDrive/PolyGlotFake2/processed_data/full_features_v0.npy ((140, 40, 64, 64, 5))
Labels: /content/drive/MyDrive/PolyGlotFake2/processed_data/full_labels_v0.npy ((140,))
Class balance: 70 real, 70 fake
✅ Saved version 3:
Features: /content/drive/MyDrive/PolyGlotFake2/processed_data/train_features_v3.npy ((112, 40, 64, 64, 5))
Labels: /content/drive/MyDrive/PolyGlotFake2/processed_data/train_labels_v3.npy ((112,))
Class balance: 56 real, 56 fake
✅ Saved version 3:
Features: /content/drive/MyDrive/PolyGlotFake2/processed_data/test_features_v3.npy ((28, 40, 64, 64, 5))
Labels: /content/drive/MyDrive/PolyGlotFake2/processed_data/test_labels_v3.npy ((28,))
Class balance: 14 real, 14 fake


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import gc
import os

# Configuration - UPDATED AUDIO DIMENSION
# Number of samples per batch.
BATCH_SIZE = 8
# Number of frames per clip (time axis of the video input).
SEQ_LENGTH = 40
# Height and width, in pixels, of each square video frame.
FRAME_SIZE = 64
# Number of time steps per audio feature sequence.
MAX_TIME_STEPS = 1500
# Feature channels per audio time step. Presumably the sum of the Wav2Vec2,
# Whisper-encoder and spectral feature widths - TODO confirm against the
# shapes of the saved audio arrays.
AUDIO_FEAT_DIM = 1824  # Corrected based on error message

# Path configuration
# 'video': pre-split video features and their labels.
# 'audio': per-class audio features (real / fake), split at load time.
# 'output': folder intended for trained multimodal models.
PATHS = {
    'video': {
        'train': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_features_v3.npy',
        'test': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_features_v3.npy',
        'train_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/train_labels_v3.npy',
        'test_labels': '/content/drive/MyDrive/PolyGlotFake2/processed_data/test_labels_v3.npy'
    },
    'audio': {
        'real': '/content/drive/MyDrive/PolyGlotFake2/processed_audio_v2/real_audio2_features.npy',
        'fake': '/content/drive/MyDrive/PolyGlotFake2/processed_audio_v2/fake_audio2_features.npy'
    },
    'output': '/content/drive/MyDrive/PolyGlotFake2/multimodal_models'
}

class MultimodalDataGenerator(tf.keras.utils.Sequence):
    """Keras batch generator yielding paired video/audio inputs and labels.

    Rows are fetched by index on demand, so the underlying arrays may be
    memory-mapped; only one batch is converted to float32 at a time.

    Args:
        video_data: Array-like indexed by sample on axis 0.
        audio_data: Array-like indexed by sample on axis 0.
        labels: 1-D array of per-sample labels.
        batch_size: Maximum number of samples per batch.
        shuffle: Shuffle the sample order at construction and after every
            epoch. Pass False when predictions must follow the order of
            `labels`.
        video_scale: Divisor applied to the video batch. The default of 1.0
            leaves values untouched, which is correct for features written by
            VideoProcessor.process_video (colour already divided by 255, flow
            by 64). Pass 255.0 only for raw 0-255 pixel data.
        **kwargs: Forwarded to the Keras base-class initializer.

    Raises:
        ValueError: If the three inputs do not contain the same number of
            samples.
    """

    def __init__(self, video_data, audio_data, labels, batch_size, shuffle=True,
                 video_scale=1.0, **kwargs):
        # The Keras base class expects subclasses to run its initializer.
        super().__init__(**kwargs)

        if not (len(video_data) == len(audio_data) == len(labels)):
            raise ValueError(
                "video_data, audio_data and labels must have the same number of samples; "
                f"got {len(video_data)}, {len(audio_data)} and {len(labels)}"
            )

        self.video_data = video_data
        self.audio_data = audio_data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.video_scale = float(video_scale)
        self.indices = np.arange(len(labels))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch `idx` as ({'video_input', 'audio_input'}, labels)."""
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        video_batch = self.video_data[batch_indices].astype('float32')
        if self.video_scale != 1.0:
            video_batch = video_batch / self.video_scale
        audio_batch = self.audio_data[batch_indices].astype('float32')
        labels_batch = self.labels[batch_indices].astype('float32')

        return {'video_input': video_batch, 'audio_input': audio_batch}, labels_batch

    def on_epoch_end(self):
        """Shuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

def create_enhanced_model():
    """Build and compile the larger two-branch (video + audio) classifier.

    Video branch: per-frame convolution, pooling and global averaging over
    5-channel frames, followed by a bidirectional GRU averaged over time.
    Audio branch: 1-D convolution and pooling, followed by a bidirectional GRU
    averaged over time. The two summaries are concatenated and passed through
    a dense layer with dropout to a single sigmoid output.

    Returns:
        A compiled Keras model named 'enhanced_multimodal_model' with inputs
        'video_input' and 'audio_input', tracking accuracy, AUC, precision
        and recall.
    """
    # --- Video branch (5 channels per frame) ---
    frames_in = layers.Input(
        name='video_input',
        shape=(SEQ_LENGTH, FRAME_SIZE, FRAME_SIZE, 5),
    )
    video_feat = layers.TimeDistributed(
        layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu')
    )(frames_in)
    video_feat = layers.TimeDistributed(layers.MaxPooling2D(pool_size=2))(video_feat)
    video_feat = layers.TimeDistributed(layers.GlobalAveragePooling2D())(video_feat)
    video_feat = layers.Bidirectional(layers.GRU(units=64, return_sequences=True))(video_feat)
    video_feat = layers.GlobalAveragePooling1D()(video_feat)

    # --- Audio branch ---
    audio_in = layers.Input(
        name='audio_input',
        shape=(MAX_TIME_STEPS, AUDIO_FEAT_DIM),
    )
    audio_feat = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(audio_in)
    audio_feat = layers.MaxPooling1D(pool_size=2)(audio_feat)
    audio_feat = layers.Bidirectional(layers.GRU(units=64, return_sequences=True))(audio_feat)
    audio_feat = layers.GlobalAveragePooling1D()(audio_feat)

    # --- Fusion head ---
    merged = layers.concatenate([video_feat, audio_feat])
    hidden = layers.Dense(units=128, activation='relu')(merged)
    hidden = layers.Dropout(rate=0.5)(hidden)
    prediction = layers.Dense(units=1, activation='sigmoid')(hidden)

    classifier = models.Model(
        name='enhanced_multimodal_model',
        inputs=[frames_in, audio_in],
        outputs=prediction,
    )

    adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
    classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)
    return classifier

def load_and_align_data(n_train_per_class=56, n_test_per_class=14):
    """Load video/audio features and pair them into train/val/test splits.

    Video features and labels are read from ``PATHS['video']``. Audio features
    are stored one file per class (``PATHS['audio']['real']`` and
    ``PATHS['audio']['fake']``). The first ``n_train_per_class`` rows of each
    audio file form the training audio and the following ``n_test_per_class``
    rows the test audio, stacked real-first then fake. Audio is paired with
    the video arrays purely by position.

    Args:
        n_train_per_class: Rows taken from the start of each audio file for
            the training split. Defaults to 56.
        n_test_per_class: Rows taken after the training rows of each audio
            file for the test split. Defaults to 14.

    Returns:
        ``((X_train_vid, X_train_aud, y_train),
        (X_val_vid, X_val_aud, y_val),
        (test_video, audio_test, test_labels))`` where the validation split
        is a stratified 20% of the training data (``random_state=42``).

    Raises:
        AssertionError: If the stacked audio and the video arrays differ in
            sample count for the train or test split.
    """
    import warnings

    # Load video data (memory-mapped; rows are only read when indexed).
    train_video = np.load(PATHS['video']['train'], mmap_mode='r')
    test_video = np.load(PATHS['video']['test'], mmap_mode='r')
    train_labels = np.load(PATHS['video']['train_labels'])
    test_labels = np.load(PATHS['video']['test_labels'])

    # Audio rows are stacked "all real, then all fake" below, so the labels
    # must follow the same layout or audio ends up paired with a video of the
    # other class. This assumes 0 = real and 1 = fake, the order in which the
    # evaluation report names the classes. Only a warning is issued so that
    # existing runs keep working.
    for split_name, split_labels, per_class in (
        ('train', train_labels, n_train_per_class),
        ('test', test_labels, n_test_per_class),
    ):
        expected = np.concatenate([np.zeros(per_class), np.ones(per_class)])
        if not np.array_equal(np.asarray(split_labels).ravel(), expected):
            warnings.warn(
                f"{split_name} labels are not ordered as {per_class} real (0) "
                f"followed by {per_class} fake (1); audio is stacked real-first, "
                "so audio rows may be paired with the wrong video/label.",
                stacklevel=2,
            )

    # Load per-class audio data
    real_audio = np.load(PATHS['audio']['real'], mmap_mode='r')
    fake_audio = np.load(PATHS['audio']['fake'], mmap_mode='r')

    # Verify audio dimensions
    print(f"Real audio shape: {real_audio.shape} (should be (N, {MAX_TIME_STEPS}, {AUDIO_FEAT_DIM}))")
    print(f"Fake audio shape: {fake_audio.shape} (should be (N, {MAX_TIME_STEPS}, {AUDIO_FEAT_DIM}))")

    # Split audio to match the video distribution
    train_stop = n_train_per_class
    test_stop = n_train_per_class + n_test_per_class
    real_audio_train = real_audio[:train_stop]
    real_audio_test = real_audio[train_stop:test_stop]
    fake_audio_train = fake_audio[:train_stop]
    fake_audio_test = fake_audio[train_stop:test_stop]

    # Combine audio splits (real first, then fake)
    audio_train = np.concatenate([real_audio_train, fake_audio_train])
    audio_test = np.concatenate([real_audio_test, fake_audio_test])

    # Verify final alignment
    assert len(audio_train) == len(train_video), f"Train mismatch: {len(audio_train)} vs {len(train_video)}"
    assert len(audio_test) == len(test_video), f"Test mismatch: {len(audio_test)} vs {len(test_video)}"

    # Create validation split; video, audio and labels are split with the
    # same indices so the pairing is preserved.
    X_train_vid, X_val_vid, X_train_aud, X_val_aud, y_train, y_val = train_test_split(
        train_video, audio_train, train_labels,
        test_size=0.2,
        stratify=train_labels,
        random_state=42
    )

    return (X_train_vid, X_train_aud, y_train), (X_val_vid, X_val_aud, y_val), (test_video, audio_test, test_labels)

def train_enhanced_model():
    """Train the enhanced multimodal model and print a test-set report.

    Resets Keras state, loads the aligned splits, wraps them in
    ``MultimodalDataGenerator`` instances and fits the model for up to 50
    epochs. The weights with the highest ``val_auc`` are checkpointed to
    ``best_model.keras`` under ``PATHS['output']``, reloaded after training,
    and evaluated on the test split.

    Returns:
        The Keras ``History`` object returned by ``model.fit``, or ``None``
        when training was interrupted with ``KeyboardInterrupt`` (the current
        model is then saved as ``interrupted_model.keras``).
    """
    tf.keras.backend.clear_session()
    gc.collect()

    # Load data with shape verification
    (train_vid, train_aud, train_lbl), (val_vid, val_aud, val_lbl), (test_vid, test_aud, test_lbl) = load_and_align_data()

    # Print final data shapes
    print("\nFinal Data Shapes:")
    print(f"Train Video: {train_vid.shape}, Audio: {train_aud.shape}, Labels: {train_lbl.shape}")
    print(f"Val Video: {val_vid.shape}, Audio: {val_aud.shape}, Labels: {val_lbl.shape}")
    print(f"Test Video: {test_vid.shape}, Audio: {test_aud.shape}, Labels: {test_lbl.shape}")

    # Create generators; validation and test keep their order so predictions
    # line up with the label arrays.
    train_gen = MultimodalDataGenerator(train_vid, train_aud, train_lbl, BATCH_SIZE)
    val_gen = MultimodalDataGenerator(val_vid, val_aud, val_lbl, BATCH_SIZE, shuffle=False)
    test_gen = MultimodalDataGenerator(test_vid, test_aud, test_lbl, BATCH_SIZE, shuffle=False)

    # Create model
    model = create_enhanced_model()
    model.summary()

    best_model_path = os.path.join(PATHS['output'], 'best_model.keras')

    # Callbacks
    callbacks = [
        # Stop on the same metric the checkpoint selects on. The Keras default
        # (val_loss) would apply a different criterion from the one that
        # picks the weights evaluated below.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            mode='max',
            patience=5,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ModelCheckpoint(
            best_model_path,
            save_best_only=True,
            monitor='val_auc',
            mode='max'
        ),
        tf.keras.callbacks.TensorBoard(
            log_dir=os.path.join(PATHS['output'], 'logs'),
            histogram_freq=1
        )
    ]

    # Training
    try:
        history = model.fit(
            train_gen,
            validation_data=val_gen,
            epochs=50,
            callbacks=callbacks,
            verbose=1
        )
    except KeyboardInterrupt:
        print("\nTraining interrupted. Saving current weights...")
        model.save(os.path.join(PATHS['output'], 'interrupted_model.keras'))
        return None

    # Final evaluation uses the best checkpoint written during training.
    model.load_weights(best_model_path)

    # Test evaluation
    print("\nFinal Test Evaluation:")
    # predict() returns one column per sample; flatten it to match the 1-D labels.
    y_pred = (model.predict(test_gen) > 0.5).astype(int).ravel()
    print(classification_report(test_lbl, y_pred, target_names=['Real', 'Fake']))

    return history

if __name__ == "__main__":
    # The checkpoints and TensorBoard logs are written under PATHS['output'],
    # so make sure the directory exists before training starts.
    os.makedirs(PATHS['output'], exist_ok=True)
    train_enhanced_model()

Real audio shape: (70, 1500, 1824) (should be (N, 1500, 1824))
Fake audio shape: (70, 1500, 1824) (should be (N, 1500, 1824))

Final Data Shapes:
Train Video: (89, 40, 64, 64, 5), Audio: (89, 1500, 1824), Labels: (89,)
Val Video: (23, 40, 64, 64, 5), Audio: (23, 1500, 1824), Labels: (23,)
Test Video: (28, 40, 64, 64, 5), Audio: (28, 1500, 1824), Labels: (28,)


  self._warn_if_super_not_called()


Epoch 1/50
[1m 1/12[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m5:27[0m 30s/step - accuracy: 0.0000e+00 - auc: 0.0000e+00 - loss: 0.7427 - precision: 0.0000e+00 - recall: 0.0000e+00

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report
import os
import gc

# Configuration
BATCH_SIZE = 4          # samples per generator batch
SEQ_LENGTH = 40         # frames per video sample
FRAME_SIZE = 64         # frame height and width
MAX_TIME_STEPS = 1500   # audio time steps per sample
AUDIO_FEAT_DIM = 1824   # audio features per time step

# All inputs and outputs live under one Drive folder; the paths are derived
# from it so the folder only has to be changed in one place.
_DATA_ROOT = '/content/drive/MyDrive/PolyGlotFake2'
_VIDEO_DIR = f'{_DATA_ROOT}/processed_data'
_AUDIO_DIR = f'{_DATA_ROOT}/processed_audio_v2'

PATHS = {
    'video': {
        'train': f'{_VIDEO_DIR}/train_features_v3.npy',
        'test': f'{_VIDEO_DIR}/test_features_v3.npy',
        'train_labels': f'{_VIDEO_DIR}/train_labels_v3.npy',
        'test_labels': f'{_VIDEO_DIR}/test_labels_v3.npy',
    },
    'audio': {
        'real': f'{_AUDIO_DIR}/real_audio2_features.npy',
        'fake': f'{_AUDIO_DIR}/fake_audio2_features.npy',
    },
    'output': f'{_DATA_ROOT}/multimodal_models',
}

class SafeDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding paired (video, audio) batches from ``.npy`` files.

    The video and audio arrays are opened with ``mmap_mode='r'`` and indexed
    per batch; the labels are loaded fully. Row ``i`` of the video, audio and
    label arrays is treated as one sample, so the three arrays must have the
    same length.

    Each batch is ``({'video_input': video, 'audio_input': audio}, labels)``
    with every array cast to float32; video values are additionally divided
    by 255.

    Args:
        video_path: Path of the ``.npy`` file with the video features.
        audio_path: Path of the ``.npy`` file with the audio features.
        label_path: Path of the ``.npy`` file with the labels.
        batch_size: Number of samples per batch (the last batch may be smaller).
        shuffle: Whether to reshuffle the sample order at construction and
            after every epoch.

    Raises:
        ValueError: If the video, audio and label arrays do not contain the
            same number of samples.
    """

    def __init__(self, video_path, audio_path, label_path, batch_size, shuffle=True):
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, which can execute code. Only point this class at trusted
        # files, or drop the flag if the inputs are plain numeric arrays.
        self.video_data = np.load(video_path, mmap_mode='r', allow_pickle=True)
        self.audio_data = np.load(audio_path, mmap_mode='r', allow_pickle=True)
        self.labels = np.load(label_path, allow_pickle=True)

        # Samples are paired by position, so a length mismatch means
        # misaligned data. Fail here rather than mid-epoch, or silently when
        # the feature arrays are longer than the labels.
        n_samples = len(self.labels)
        if len(self.video_data) != n_samples or len(self.audio_data) != n_samples:
            raise ValueError(
                "Sample count mismatch: "
                f"video={len(self.video_data)}, audio={len(self.audio_data)}, "
                f"labels={n_samples}"
            )

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(n_samples)
        super().__init__()
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(inputs_dict, labels)`` in the current sample order."""
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]

        # np.array(...) copies the selected rows out of the memory-mapped files.
        video_batch = np.array(self.video_data[batch_indices], dtype=np.float32) / 255.0
        audio_batch = np.array(self.audio_data[batch_indices], dtype=np.float32)
        labels_batch = self.labels[batch_indices].astype(np.float32)

        return {'video_input': video_batch, 'audio_input': audio_batch}, labels_batch

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

def create_streamlined_model():
    """Build and compile the small two-branch (video + audio) binary classifier.

    Video branch: per-frame Conv2D(16) and global average pooling applied
    through ``TimeDistributed``, summarised by a GRU(32).
    Audio branch: a single GRU(32) over the audio feature sequence.
    The two vectors are concatenated and mapped to one sigmoid output.

    Returns:
        The compiled Keras model ``'streamlined_multimodal_model'`` taking
        the inputs ``'video_input'`` and ``'audio_input'``. It is compiled
        with Adam (learning rate 1e-4), binary cross-entropy and the metrics
        accuracy and ``auc``.
    """
    # Video branch; the input name matches the key produced by the generator.
    video_input = layers.Input(
        name='video_input',
        shape=(SEQ_LENGTH, FRAME_SIZE, FRAME_SIZE, 5),
    )
    frame_maps = layers.TimeDistributed(layers.Conv2D(16, (3, 3), activation='relu'))(video_input)
    frame_vectors = layers.TimeDistributed(layers.GlobalAvgPool2D())(frame_maps)
    video_summary = layers.GRU(32)(frame_vectors)

    # Audio branch; the input name matches the key produced by the generator.
    audio_input = layers.Input(
        name='audio_input',
        shape=(MAX_TIME_STEPS, AUDIO_FEAT_DIM),
    )
    audio_summary = layers.GRU(32)(audio_input)

    # Feature fusion and classification head.
    merged = layers.concatenate([video_summary, audio_summary])
    prediction = layers.Dense(1, activation='sigmoid')(merged)

    streamlined = models.Model(
        inputs=[video_input, audio_input],
        outputs=prediction,
        name='streamlined_multimodal_model',
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
    streamlined.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    return streamlined

def align_audio_data(n_train_per_class=56, n_test_per_class=14):
    """Write class-stacked audio feature files for the train and test splits.

    Reads the per-class audio arrays from ``PATHS['audio']``. The first
    ``n_train_per_class`` rows of each class form the training audio and the
    following ``n_test_per_class`` rows the test audio, stacked real-first
    then fake. The results are saved as ``'train_audio_aligned.npy'`` and
    ``'test_audio_aligned.npy'`` in the current working directory,
    overwriting any existing files.

    Args:
        n_train_per_class: Rows taken from the start of each class file for
            the training split. Defaults to 56.
        n_test_per_class: Rows taken after the training rows of each class
            file for the test split. Defaults to 14.
    """
    # Memory-map the sources so only the sliced rows are copied during the
    # concatenation, instead of holding both full arrays in memory as well.
    # Pickle loading is not enabled: np.save(..., allow_pickle=False) below
    # rejects object arrays, so only plain arrays can pass through here.
    real_audio = np.load(PATHS['audio']['real'], mmap_mode='r')
    fake_audio = np.load(PATHS['audio']['fake'], mmap_mode='r')

    train_stop = n_train_per_class
    test_stop = n_train_per_class + n_test_per_class

    # Train split: first rows of real + first rows of fake
    train_audio = np.concatenate([real_audio[:train_stop], fake_audio[:train_stop]])
    # Test split: next rows of real + next rows of fake
    test_audio = np.concatenate([real_audio[train_stop:test_stop], fake_audio[train_stop:test_stop]])

    # Save as new numpy files without pickling
    np.save('train_audio_aligned.npy', train_audio, allow_pickle=False)
    np.save('test_audio_aligned.npy', test_audio, allow_pickle=False)

def train_model():
    """Train the streamlined multimodal model and print a test-set report.

    Resets Keras state, regenerates the aligned audio files, builds the train
    and test generators and fits for 30 epochs. The weights with the highest
    ``val_auc`` are checkpointed to ``final_model.keras`` under
    ``PATHS['output']``, reloaded after training, and used to print a
    classification report for the test generator.

    NOTE(review): the test generator is also passed as ``validation_data``,
    so the checkpoint is selected on the same samples the final report is
    computed on; the printed numbers are not a held-out estimate.
    """
    tf.keras.backend.clear_session()
    gc.collect()

    # The aligned audio files must be written before the generators open them.
    align_audio_data()

    train_gen = SafeDataGenerator(
        video_path=PATHS['video']['train'],
        audio_path='train_audio_aligned.npy',
        label_path=PATHS['video']['train_labels'],
        batch_size=BATCH_SIZE,
    )
    # No shuffling, so predictions stay aligned with test_gen.labels.
    test_gen = SafeDataGenerator(
        video_path=PATHS['video']['test'],
        audio_path='test_audio_aligned.npy',
        label_path=PATHS['video']['test_labels'],
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    model = create_streamlined_model()
    model.summary()

    checkpoint_path = os.path.join(PATHS['output'], 'final_model.keras')
    keep_best = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        save_best_only=True,
        monitor='val_auc',
        mode='max',
    )
    model.fit(
        train_gen,
        validation_data=test_gen,
        epochs=30,
        callbacks=[keep_best],
    )

    # Evaluate the checkpointed weights rather than the last-epoch weights.
    model.load_weights(checkpoint_path)
    probabilities = model.predict(test_gen)
    y_pred = (probabilities > 0.5).astype(int)
    print(classification_report(test_gen.labels, y_pred, target_names=['Real', 'Fake']))

if __name__ == "__main__":
    # The model checkpoint is written under PATHS['output'], so make sure the
    # directory exists before training starts.
    os.makedirs(PATHS['output'], exist_ok=True)
    train_model()

Epoch 1/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 3s/step - accuracy: 0.4339 - auc: 0.4383 - loss: 0.7048 - val_accuracy: 0.4286 - val_auc: 0.4796 - val_loss: 0.6974
Epoch 2/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3s/step - accuracy: 0.5665 - auc: 0.5374 - loss: 0.6858 - val_accuracy: 0.4643 - val_auc: 0.4923 - val_loss: 0.6963
Epoch 3/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3s/step - accuracy: 0.4879 - auc: 0.5848 - loss: 0.6822 - val_accuracy: 0.4286 - val_auc: 0.5026 - val_loss: 0.6964
Epoch 4/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - accuracy: 0.5904 - auc: 0.7106 - loss: 0.6643 - val_accuracy: 0.4643 - val_auc: 0.4872 - val_loss: 0.6955
Epoch 5/30
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3s/step - accuracy: 0.5964 - auc: 0.6633 - loss: 0.6662 - val_accuracy: 0.4643 - val_auc: 0.4847 - val_loss: 0.6958
Epoch 6/30
[1m28/28[0m [32m━━━━━━━━━━