In [3]:
import torch, torchaudio
import warnings
import numpy as np
import os
warnings.filterwarnings("ignore")

In [2]:
import torch
import torchaudio
from torch.nn.functional import pad
from tqdm import tqdm
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained tokenizer and model
# tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base-960h')
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h').to(device)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [160]:

def audio_perplexity(audio_file):
    max_length = 4000  # at 16kHz
    stride = 400  # overlap

    # Load and preprocess the audio file (Convert to mono)
    waveform, sample_rate = torchaudio.load(audio_file)
    waveform = waveform.mean(dim=0)  # Ensure mono-channel

    # Resample to 16KHz if needed
    if sample_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = transform(waveform)
    
    # Tokenization
    input_values = processor(waveform.numpy(), return_tensors="pt", sampling_rate=16000).input_values.to(device)
    seq_len = input_values.shape[1]
    
    nlls = []

    # tqdm bar
    inf_count = 0
    progress_bar = tqdm(range(0, seq_len, stride), desc="Processing", dynamic_ncols=True)
    for begin_loc in progress_bar:
        end_loc = min(begin_loc + max_length, seq_len)
        if end_loc - begin_loc < 1000:  # Avoid very short chunks
            print("⚠️ Warning: Skipping too short sequence")
            continue

        input_chunk = input_values[:, begin_loc:end_loc]

        # Create valid target labels 
        target_ids = input_chunk.clone()
        target_ids[:, :-1] = -100  # Mask only some tokens

        with torch.no_grad():
            outputs = model(input_chunk, labels=target_ids)
            neg_log_likelihood = outputs.loss

        if torch.isinf(neg_log_likelihood):
            inf_count += 1
            continue

        nlls.append(neg_log_likelihood.item())
        progress_bar.set_description(f"Chunk {begin_loc}-{end_loc} | inf: {inf_count} | NLL: {neg_log_likelihood.item():.4f}")

        if end_loc == seq_len:
            break

    return nlls

In [161]:
# Ross example
audio_file = r"C:\Users\a_has\Desktop\Friends\friends_mmc\face_track_videos\face_track_videos\s03e16\012181-012265\1.wav"
perplexity_values = audio_perplexity(audio_file)
# print("Negative Log-Likelihoods:", perplexity_values)
print("Mean Perplexity:", np.nanmean(np.asarray(perplexity_values)))

Chunk 52800-56720 | inf: 0 | NLL: -4.3193:  93%|█████████▎| 132/142 [00:08<00:00, 15.82it/s]

Mean Perplexity: 3.4509833028077854





In [162]:
import pandas as pd
import itertools
import json

annotations_file = r"C:\Users\a_has\Desktop\Friends\friends_mmc\5_turns\test-metadata.json"
with open(annotations_file, "r") as f:
    annotations = json.load(f)
    annotations = list(itertools.chain.from_iterable(annotations))
    
annotations = pd.DataFrame(annotations)
annotations.head()

Unnamed: 0,frame,speaker,content,faces,video
0,s03e20-000128,chandler,"Wait a minute, wait. You’re telling me this ac...","[[[800, 257, 870, 342], joey], [[543, 259, 597...",s03e20-000063-000194
1,s03e20-000218,joey,Yeah! Oh my God! (to Chandler) Is this what it...,"[[[797, 242, 863, 326], joey], [[532, 250, 593...",s03e20-000194-000320
2,s03e20-000538,joey,"Oh, you have no idea. And-and when we’re on st...","[[[567, 79, 787, 358], joey]]",s03e20-000403-000639
3,s03e20-000680,phoebe,to see you feeling like this!,"[[[428, 74, 657, 353], phoebe]]",s03e20-000646-000715
4,s03e20-000832,ross,"Monica, uh Dad called this morning and ah, Aun...","[[[472, 396, 545, 491], phoebe], [[133, 134, 2...",s03e20-000820-000957


In [171]:
s03e16_frames = annotations[annotations["frame"].str.startswith("s03e16")]
ross_frames = s03e16_frames[s03e16_frames["speaker"] == "ross"].drop_duplicates(subset=["video"], keep="first")
rachel_frames = s03e16_frames[s03e16_frames["speaker"] == "rachel"].drop_duplicates(subset=["video"], keep="first")

print(f"Ross count: {len(ross_frames)}")
print(f"Rachel count: {len(rachel_frames)}")

Ross count: 37
Rachel count: 42


In [172]:
import os

ross_nlls = []
rachel_nlls = []

print("# Calculate Ross")
# Iterate over some sections for ross
for index, row in ross_frames.iterrows():
    audio_file = os.path.join(
        r"C:\Users\a_has\Desktop\Friends\friends_mmc\face_track_videos\face_track_videos\s03e16",
        row["video"].replace("s03e16-", ""),
        "0.wav",
    )
    perplexity_values = audio_perplexity(audio_file)
    ross_nlls += perplexity_values


print("# Calculate Rachel")
# Iterate over some sections for rachel
for index, row in rachel_frames.iterrows():
    audio_file = os.path.join(
        r"C:\Users\a_has\Desktop\Friends\friends_mmc\face_track_videos\face_track_videos\s03e16",
        row["video"].replace("s03e16-", ""),
        "0.wav",
    )
    perplexity_values = audio_perplexity(audio_file)
    rachel_nlls += perplexity_values


print("Ross Mean Perplexity:", np.nanmean(np.asarray(ross_nlls)))
print("Rachel Mean Perplexity:", np.nanmean(np.asarray(rachel_nlls)))

# Calculate Ross


Chunk 8400-12016 | inf: 0 | NLL: 2.3938:  68%|██████▊   | 21/31 [00:01<00:00, 15.83it/s] 
Chunk 70400-74048 | inf: 0 | NLL: -2.3416:  95%|█████████▍| 176/186 [00:11<00:00, 14.83it/s]
Chunk 17600-21360 | inf: 0 | NLL: -0.2742:  81%|████████▏ | 44/54 [00:02<00:00, 15.19it/s]
Chunk 14800-18688 | inf: 0 | NLL: -0.3050:  79%|███████▊  | 37/47 [00:02<00:00, 14.45it/s]
Chunk 13600-17344 | inf: 0 | NLL: -0.7099:  77%|███████▋  | 34/44 [00:02<00:00, 14.10it/s]
Chunk 18400-22016 | inf: 0 | NLL: 0.1592:  82%|████████▏ | 46/56 [00:03<00:00, 14.13it/s] 
Chunk 27600-31360 | inf: 0 | NLL: -3.3863:  87%|████████▋ | 69/79 [00:04<00:00, 14.70it/s]
Chunk 9600-13344 | inf: 0 | NLL: 1.1551:  71%|███████   | 24/34 [00:01<00:00, 13.94it/s] 
Chunk 20800-24688 | inf: 0 | NLL: -2.3771:  84%|████████▍ | 52/62 [00:03<00:00, 16.76it/s]
Chunk 29600-33360 | inf: 0 | NLL: 1.1666:  88%|████████▊ | 74/84 [00:05<00:00, 13.42it/s] 
Chunk 22800-26688 | inf: 0 | NLL: 4.5614:  85%|████████▌ | 57/67 [00:04<00:00, 12.77it/s] 

# Calculate Rachel


Chunk 0-1328 | inf: 0 | NLL: 1.6727:   0%|          | 0/4 [00:00<?, ?it/s]
Chunk 4800-8672 | inf: 0 | NLL: 0.5886:  55%|█████▍    | 12/22 [00:00<00:00, 12.18it/s] 
Chunk 0-3328 | inf: 0 | NLL: 3.1035:   0%|          | 0/9 [00:00<?, ?it/s]
Chunk 8000-12000 | inf: 0 | NLL: -0.1674:  67%|██████▋   | 20/30 [00:01<00:00, 11.65it/s]
Chunk 47600-51376 | inf: 0 | NLL: 0.7100:  92%|█████████▏| 119/129 [00:08<00:00, 13.38it/s] 
Chunk 16400-20016 | inf: 0 | NLL: 0.1684:  80%|████████  | 41/51 [00:03<00:00, 12.31it/s] 
Chunk 51600-55376 | inf: 0 | NLL: 4.5690:  93%|█████████▎| 129/139 [00:09<00:00, 12.99it/s] 
Chunk 26400-30016 | inf: 0 | NLL: 3.8791:  87%|████████▋ | 66/76 [00:05<00:00, 12.89it/s] 
Chunk 4000-8000 | inf: 0 | NLL: 23.1697:  50%|█████     | 10/20 [00:00<00:00, 12.89it/s]
Chunk 36400-40048 | inf: 0 | NLL: 25.8800:  90%|█████████ | 91/101 [00:06<00:00, 13.54it/s]
Chunk 180400-184160 | inf: 0 | NLL: -0.3076:  98%|█████████▊| 451/461 [08:02<00:10,  1.07s/it]   
Chunk 142400-146128 | in

Ross Mean Perplexity: 4.227213766826052
Rachel Mean Perplexity: 4.508526948306991



