In [1]:
import torch
import torchaudio
from transformers import HubertModel
import numpy as np
import os
import pandas as pd

df_user =  pd.read_excel("user-self-reports/target-enjoyment.xlsx", sheet_name='target-enjoyment', header=1)

def extract_hubert_embeddings(audio_file_path, model_name="facebook/hubert-large-ls960-ft", output_dir="audio-embeddings"):
    """
    Extracts Hubert embeddings from a .wav audio file.

    Args:
        audio_file_path (str): Path to the .wav audio file.
        model_name (str, optional): Name of the pretrained Hubert model.
            Defaults to "facebook/hubert-base-ls960". You can also use
            "hubert-large-ls960-ft".
        output_dir (str, optional): Directory to save the embeddings.
            Defaults to "embeddings".
    Returns:
        numpy.ndarray: The Hubert embeddings, or None if an error occurs.
    """
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Load the Hubert model
    try:
        print(f"Loading Hubert model: {model_name}...")
        model = HubertModel.from_pretrained(model_name)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error: Failed to load Hubert model {model_name}. Check the model name and your network connection.")
        print(f"Exception: {e}")
        return None

    """    # Move the model to GPU if available
    if torch.backends.mps.is_available():
        print("Using Apple Silicon GPU (MPS) for inference.")
    else:
        print("Using CPU for inference.")
    model.to(torch.device("cpu"))  # Move model to GPU if available"""
    # Set the model to evaluation mode
    model.eval()

    # Load the audio file using torchaudio
    try:
        print(f"Loading audio file: {audio_file_path}...")
        waveform, sample_rate = torchaudio.load(audio_file_path)
        print(f"Audio loaded. Initial shape: {waveform.shape}, Sample rate: {sample_rate}")
    except Exception as e:
        print(f"Error: Failed to load audio file {audio_file_path}. Ensure it is a valid .wav file.")
        print(f"Exception: {e}")
        return None

    # Resample the audio if necessary. Hubert models are often trained on 16000 Hz audio.
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        print(f"Resampling audio from {sample_rate} Hz to {target_sample_rate} Hz...")
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)
        print(f"Resampled audio shape: {waveform.shape}")

    # Ensure the waveform is mono (Hubert expects mono audio)
    if waveform.shape[0] > 1:
        print("Converting audio to mono...")
        waveform = torch.mean(waveform, dim=0, keepdim=True)
        print(f"Mono audio shape: {waveform.shape}")

    # --- FIX: Prepare waveform for the model ---
    # The model expects input shape (batch_size, sequence_length).
    # Current shape is likely [1, sequence_length] (mono audio).
    # We need to remove the channel dimension first, then add the batch dimension.

    # Remove the channel dimension (dim=0) if it exists
    if waveform.dim() > 1 and waveform.shape[0] == 1:
         waveform = waveform.squeeze(0) # Shape becomes [sequence_length]
         print(f"Squeezed waveform shape: {waveform.shape}")


    # Add a batch dimension (batch size of 1) at the beginning
    waveform = waveform.unsqueeze(0) # Shape becomes [1, sequence_length]
    print(f"Final waveform shape for model: {waveform.shape}")
    # --- End of FIX ---


    # Disable gradient calculations to save memory and increase speed
    with torch.no_grad():
        try:
            print("Extracting embeddings...")
            # Get the Hubert embeddings. The output is a dictionary.
            # The 'last_hidden_state' key contains the embeddings.
            output = model(waveform)
            embeddings = output.last_hidden_state
            print("Embeddings extracted successfully.")
        except Exception as e:
            print(f"Error: Failed to process audio with Hubert model.")
            print(f"Input tensor shape at error: {waveform.shape}") # Print shape right before error
            print(f"Exception: {e}")
            return None

    # Convert the embeddings to a numpy array
    # Output shape is (batch_size, sequence_length_out, hidden_size)
    embeddings_numpy = embeddings.squeeze(0).cpu().numpy()  # Remove batch dimension
    print(f"Embeddings numpy shape: {embeddings_numpy.shape}")

    # Save the embeddings to a .npy file
    audio_file_name = os.path.splitext(os.path.basename(audio_file_path))[0]
    output_file_path = os.path.join(output_dir, f"{audio_file_name}_hubert_embeddings.npy")
    try:
        np.save(output_file_path, embeddings_numpy)
        print(f"Saved embeddings to {output_file_path}")
    except Exception as e:
        print(f"Error: Failed to save embeddings to {output_file_path}")
        print(f"Exception: {e}")
        return None # Return None if saving fails

    return embeddings_numpy

if __name__ == "__main__":

    for person in range(4, 43):
        if person == 26:
            continue
        robot_first_interaction = df_user.loc[df_user["PID"] == person, "Q1-Robot"].values[0]
        embeddings = []    
        turn = []
        df = pd.read_excel(f"data/text/P{person}/text-aligned-P{person}.xlsx", header=0)
        for i, row in df.iterrows():
            audio = f"data/audio_vad_300/P{person}/trimmed_audio-P{person}-{robot_first_interaction}-{i+1}.wav"
            # Check if the audio file exists
            if not os.path.exists(audio):
                print(f"Audio file {audio} does not exist. Skipping.")
                continue
            # Check audio duration in seconds from .wav file
            audio_info = torchaudio.info(audio)
            duration = audio_info.num_frames / audio_info.sample_rate
            if duration < 1:  # Check if the audio is less than 2 second
                print(f"Audio {audio} is too short ({duration:.2f} seconds). Skipping.")
                continue

            turn.append(i+1)

            embedding = extract_hubert_embeddings(audio)
            if embedding is not None:
                embeddings.append(np.mean(embedding, axis=0))  # Average the embeddings across time
            else:
                print(f"Failed to extract embedding for {audio}")
        # Save all embeddings for the personxrobot
        os.makedirs(f"data/audio-embeddings-vad-300-1s-lg", exist_ok=True)
        df_out = pd.DataFrame()
        df_out["Utterance ID"] = turn
        df_out = pd.concat([df_out, pd.DataFrame(embeddings)], axis=1)
        # Save the DataFrame to a CSV file
        df_out.to_csv(f"data/audio-embeddings-vad-300-1s-lg/audio-dataset{robot_first_interaction}-P{person}.csv", index=False)
        print(f"Saved embeddings for {robot_first_interaction} P{person} to CSV.")

  from .autonotebook import tqdm as notebook_tqdm
objc[1954]: Class AVFFrameReceiver is implemented in both /Users/ricardosantana/Development/miniconda3/envs/phd/lib/libavdevice.59.7.100.dylib (0x10e4c4778) and /Users/ricardosantana/Development/miniconda3/envs/phd/lib/python3.10/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x30ea103a8). One of the two will be used. Which one is undefined.
objc[1954]: Class AVFAudioReceiver is implemented in both /Users/ricardosantana/Development/miniconda3/envs/phd/lib/libavdevice.59.7.100.dylib (0x10e4c47c8) and /Users/ricardosantana/Development/miniconda3/envs/phd/lib/python3.10/site-packages/av/.dylibs/libavdevice.61.3.100.dylib (0x30ea103f8). One of the two will be used. Which one is undefined.


Created output directory: audio-embeddings
Loading Hubert model: facebook/hubert-large-ls960-ft...
Model loaded successfully.
Loading audio file: data/audio_vad_300/P4/trimmed_audio-P4-Alice-1.wav...
Audio loaded. Initial shape: torch.Size([1, 53760]), Sample rate: 32000
Resampling audio from 32000 Hz to 16000 Hz...
Resampled audio shape: torch.Size([1, 26880])
Squeezed waveform shape: torch.Size([26880])
Final waveform shape for model: torch.Size([1, 26880])
Extracting embeddings...
Embeddings extracted successfully.
Embeddings numpy shape: (83, 1024)
Saved embeddings to audio-embeddings/trimmed_audio-P4-Alice-1_hubert_embeddings.npy
Loading Hubert model: facebook/hubert-large-ls960-ft...
Model loaded successfully.
Loading audio file: data/audio_vad_300/P4/trimmed_audio-P4-Alice-2.wav...
Audio loaded. Initial shape: torch.Size([1, 51840]), Sample rate: 32000
Resampling audio from 32000 Hz to 16000 Hz...
Resampled audio shape: torch.Size([1, 25920])
Squeezed waveform shape: torch.Size(

In [58]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

for person_id in range(4, 43):
    if person_id == 26:
        continue
    robot_first_interaction = df_user.loc[df_user["PID"] == person_id, "Q1-Robot"].values[0]
    embeddings_1 = pd.read_csv(f"data/audio-embeddings/audio-dataset{robot_first_interaction}-P{person_id}.csv", header=0)
    embeddings_2 = pd.read_csv(f"data/audio-embeddings2/audio-dataset{robot_first_interaction}-P{person_id}.csv", header=0)

    # Calculate cosine similarity between the two sets of embeddings only when they have the same number of rows
    if embeddings_1.shape[0] != embeddings_2.shape[0]:
        print(f"Warning: Different number of rows for {robot_first_interaction} P{person_id}. Skipping similarity calculation.")
        continue
    # Calculate cosine similarity (1 value per row)
    # Ensure the embeddings are in the same order
    embeddings_1 = embeddings_1.sort_values(by="Utterance ID").reset_index(drop=True)
    embeddings_2 = embeddings_2.sort_values(by="Utterance ID").reset_index(drop=True)
    similarity_matrix = cosine_similarity(embeddings_1.iloc[:, 1:].values, embeddings_2.iloc[:, 1:].values)
    per_row_sim = np.diagonal(similarity_matrix)
    # print(f"Cosine similarity for {robot_first_interaction} P{person_id}: {per_row_sim}")

    # check for values lower than 0.95 and print the corresponding row numbers
    low_similarity_indices = np.where(per_row_sim < 0.95)[0]
    if len(low_similarity_indices) > 0:
        print(f"Low similarity values for {robot_first_interaction} P{person_id}:")
        for index in low_similarity_indices:
            print(f"Row {index + 1}: {per_row_sim[index]:.4f}")

Low similarity values for Alice P13:
Row 11: 0.7840
Low similarity values for Alice P24:
Row 2: 0.7350
Low similarity values for Clara P40:
Row 3: 0.9496
