In [1]:
# Mount Google Drive so files under /content/drive are reachable from this runtime.
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import torch
import numpy as np
import soundfile as sf

from moviepy.editor import VideoFileClip
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [3]:
# -------------------------------------------------
# Device configuration: use the GPU when CUDA is available, otherwise the CPU
# -------------------------------------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
# -------------------------------------------------
# Load the pretrained Wav2Vec2 processor and encoder
# -------------------------------------------------
AUDIO_MODEL_NAME = "facebook/wav2vec2-base"

# The processor prepares raw waveforms for the model.
processor = Wav2Vec2Processor.from_pretrained(AUDIO_MODEL_NAME)

# Load the encoder weights, then move the module to the selected device.
model = Wav2Vec2Model.from_pretrained(AUDIO_MODEL_NAME)
model = model.to(DEVICE)

# Switch to inference mode (as the last expression, this also displays the module).
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.




Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [5]:
# -------------------------------------------------
# Extract audio from MP4 using MoviePy
# -------------------------------------------------
def extract_audio_mp4_to_wav(mp4_path, wav_path):
    """
    Extract the audio track of a video file and write it as a 16 kHz,
    16-bit PCM WAV file.

    The channel layout is not changed here; callers that need mono audio
    must down-mix the result themselves.

    Args:
        mp4_path: Path of the source video file.
        wav_path: Destination path of the WAV file to write.

    Raises:
        ValueError: If the video contains no audio stream.
    """
    video = VideoFileClip(mp4_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio stream found in {mp4_path}")

        video.audio.write_audiofile(
            wav_path,
            fps=16000,          # resample to 16 kHz
            nbytes=2,           # 2 bytes per sample, matching pcm_s16le
            codec="pcm_s16le",
            logger=None         # no progress output
        )
    finally:
        # Always close the clip, including when the check or the export raises.
        video.close()


In [6]:
# Folder where the per-video audio embeddings are written.
AUDIO_FEATURE_PATH = "/content/drive/MyDrive/Dissertion/Data/Features/audio"
# exist_ok=True creates the folder only when it is missing, without a separate
# check-then-create step.
os.makedirs(AUDIO_FEATURE_PATH, exist_ok=True)
# -------------------------------------------------
# Audio embedding extraction + saving
# -------------------------------------------------
@torch.no_grad()
def extract_and_save_audio_embedding_from_mp4(
    video_path,
    save_dir=AUDIO_FEATURE_PATH,
    temp_wav_dir="_temp_wav"
):
    """
    Compute one L2-normalised Wav2Vec2 embedding for a video and save it.

    The audio track is exported to a temporary WAV file, read back, down-mixed
    to mono, passed through the notebook-level `processor` and `model` on
    `DEVICE`, mean-pooled over time and written to `<save_dir>/<video name>.pt`.

    Args:
        video_path: Path of the input video file.
        save_dir: Directory that receives the `.pt` embedding file.
        temp_wav_dir: Directory used for the temporary WAV file.

    Returns:
        None. The embedding is written to disk as a CPU tensor.

    Raises:
        ValueError: If the video has no audio stream or the audio track
            contains no samples.

    NOTE(review): the output file name is derived from the video's base name
    only, so two videos with the same base name in different folders write to
    the same `.pt` file.
    """
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(temp_wav_dir, exist_ok=True)

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    temp_wav_path = os.path.join(temp_wav_dir, f"{video_name}.wav")

    try:
        # 1. Extract WAV from the video
        extract_audio_mp4_to_wav(video_path, temp_wav_path)

        # 2. Load the samples with soundfile
        waveform, sr = sf.read(temp_wav_path, dtype="float32")
    finally:
        # The WAV is only needed until it has been read; remove it even when
        # extraction or reading fails so no temp files are left behind.
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)

    # Convert to mono if the file has several channels: [samples, channels] -> [samples]
    if waveform.ndim == 2:
        waveform = waveform.mean(axis=1)

    # Fail with the same exception type as a missing audio stream, so callers
    # that skip on ValueError also skip empty tracks.
    if waveform.size == 0:
        raise ValueError(f"Audio track of {video_path} contains no samples")

    # 3. Prepare input for Wav2Vec2 (the processor accepts the numpy array directly)
    inputs = processor(
        waveform,
        sampling_rate=sr,
        return_tensors="pt"
    ).to(DEVICE)

    # 4. Forward pass through Wav2Vec2
    outputs = model(**inputs)

    # Frame-level embeddings: [time_steps, hidden_dim] after dropping the batch axis
    hidden_states = outputs.last_hidden_state.squeeze(0)

    # ONE audio embedding per video (temporal mean)
    audio_embedding = hidden_states.mean(dim=0)  # [768]

    # L2 normalization
    audio_embedding = audio_embedding / audio_embedding.norm()

    # 5. Save embedding on the CPU so it can be loaded without a GPU
    save_path = os.path.join(save_dir, f"{video_name}.pt")
    torch.save(audio_embedding.cpu(), save_path)

In [7]:
# Dataset root to scan. The variable keeps its original name although it
# currently points at the MELD folder; the RAVDESS alternative is:
#RAVDESS_DATA_PATH = "/content/drive/MyDrive/Dissertion/Data/RAVDESS"
RAVDESS_DATA_PATH = "/content/drive/MyDrive/Dissertion/Data/MELD"

# Collect the full path of every file below the dataset root, folder by folder.
file_list = []
for dirpath, _subdirs, filenames in os.walk(RAVDESS_DATA_PATH):
    file_list.extend(os.path.join(dirpath, fname) for fname in filenames)

print(f"total files found {len(file_list)}")
print(file_list[:5])

total files found 7754
['/content/drive/MyDrive/Dissertion/Data/MELD/dia616_utt3.mp4', '/content/drive/MyDrive/Dissertion/Data/MELD/dia615_utt6.mp4', '/content/drive/MyDrive/Dissertion/Data/MELD/dia608_utt3.mp4', '/content/drive/MyDrive/Dissertion/Data/MELD/dia629_utt15.mp4', '/content/drive/MyDrive/Dissertion/Data/MELD/dia612_utt0.mp4']


In [8]:
# Generate one embedding per discovered file. A file that cannot be processed
# is reported and recorded, and the run continues with the next one.
total_files = len(file_list)
cnt = 0  # number of embeddings generated successfully
failed_files = []  # (path, reason) for every skipped file

for video_file in file_list:
    if not os.path.exists(video_file):
        print(f"Warning: Video file not found: {video_file}. Skipping.")
        failed_files.append((video_file, "file not found"))
        continue
    try:
        extract_and_save_audio_embedding_from_mp4(video_file)
    except (ValueError, OSError) as e:
        # ValueError: no usable audio in the file.
        # OSError: the file could not be read or written by the I/O layers;
        # without this, one unreadable file would abort the whole run.
        print(f"Error processing {video_file}: {e}. Skipping.")
        failed_files.append((video_file, str(e)))
        continue
    cnt += 1
    print(f"{cnt}/{total_files}  Embeddings generated for file:{video_file}")

print(f"Done: {cnt} embeddings generated, {len(failed_files)} files skipped.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2755/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia109_utt14.mp4
2756/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia118_utt1.mp4
2757/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia122_utt12.mp4
2758/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia11_utt4.mp4
2759/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia123_utt1.mp4
2760/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia110_utt1.mp4
2761/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia130_utt15.mp4
2762/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia144_utt4.mp4
2763/7754  Embeddings generated for file:/content/drive/MyDrive/Dissertion/Data/MELD/dia135_utt1.mp4
2764/7754  Embeddings ge

In [11]:
# Load saved embeddings from the feature directory and print their properties
import torch
from pathlib import Path

def inspect_embeddings(feature_dir, max_files=10):
    """
    Print a short report about the `.pt` embedding files under a directory.

    The total number of `.pt` files found recursively is printed first,
    followed by name, path, shape, dtype and L2 norm for the first
    `max_files` files in sorted path order.

    Args:
        feature_dir: Directory that is searched recursively for `*.pt` files.
        max_files: Maximum number of files to describe; `None` describes all.

    Each file may hold either a tensor or a dict whose "embedding" entry is
    the tensor; a dict without that entry is reported as EMPTY.
    """
    # Scan the directory tree once and reuse the listing for the count and the sample.
    all_files = sorted(Path(feature_dir).rglob("*.pt"))
    print(f"Total embeddings: {len(all_files)}")
    files = all_files[:max_files]
    print(f"Showing {len(files)} files from: {feature_dir}\n")

    for i, f in enumerate(files, 1):
        # NOTE: torch.load unpickles the file, so only inspect trusted folders.
        # map_location="cpu" lets tensors saved from a GPU load on a CPU-only runtime.
        obj = torch.load(f, map_location="cpu")

        if isinstance(obj, dict):
            emb = obj.get("embedding")
            extra = f", keys={list(obj.keys())}"
        else:
            emb = obj
            extra = ""

        if emb is None:
            print(f"{i}. {f.name} -> EMPTY{extra}")
            continue

        shape = tuple(emb.shape)
        dtype = emb.dtype
        # Report 0.0 for a tensor without elements instead of computing a norm.
        norm = float(emb.norm(p=2)) if emb.numel() > 0 else 0.0

        print(f"{i}. {f.name}")
        print(f"   path : {f}")
        print(f"   shape: {shape}")
        print(f"   dtype: {dtype}")
        print(f"   L2   : {norm:.4f}{extra}\n")

In [12]:
# Summarise the embedding files stored under the audio feature folder.
inspect_embeddings(feature_dir=AUDIO_FEATURE_PATH)

Total embeddings: 10778
Showing 10 files from: /content/drive/MyDrive/Dissertion/Data/Features/audio

1. 01-02-01-01-01-01-01.pt
   path : /content/drive/MyDrive/Dissertion/Data/Features/audio/01-02-01-01-01-01-01.pt
   shape: (768,)
   dtype: torch.float32
   L2   : 1.0000

2. 01-02-01-01-01-01-02.pt
   path : /content/drive/MyDrive/Dissertion/Data/Features/audio/01-02-01-01-01-01-02.pt
   shape: (768,)
   dtype: torch.float32
   L2   : 1.0000

3. 01-02-01-01-01-01-03.pt
   path : /content/drive/MyDrive/Dissertion/Data/Features/audio/01-02-01-01-01-01-03.pt
   shape: (768,)
   dtype: torch.float32
   L2   : 1.0000

4. 01-02-01-01-01-01-04.pt
   path : /content/drive/MyDrive/Dissertion/Data/Features/audio/01-02-01-01-01-01-04.pt
   shape: (768,)
   dtype: torch.float32
   L2   : 1.0000

5. 01-02-01-01-01-01-05.pt
   path : /content/drive/MyDrive/Dissertion/Data/Features/audio/01-02-01-01-01-01-05.pt
   shape: (768,)
   dtype: torch.float32
   L2   : 1.0000

6. 01-02-01-01-01-01-06.pt
 