In [1]:
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error
from openai import OpenAI
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import os
import re
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from torch.nn.functional import pad
from audiotoken import AudioToken, Tokenizers
import sys

sys.path.append("..")
from src.non_verbal.get_audio import get_audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("../data/results/non_verbal_multi/pitch_results.pkl", "rb") as f:
    df = pickle.load(f)
df.head()

Unnamed: 0,session,f0_P1,f0_P2,dominance_P1,dominance_P2,hypo,pitch_rmse_P1,pitch_rmse_P2
0,2,95.991426,95.111992,2.0,3.4,true,13.364473,8.962308
1,4,96.021414,97.92961,2.0,1.6,true,13.16105,11.204088
2,5,91.887388,92.526656,2.0,3.8,false,11.084141,8.84393
3,7,74.769208,84.26931,3.4,3.4,maybe,16.713351,16.85975
4,8,94.41349,107.539095,2.8,2.4,true,23.180087,22.803583


In [3]:
model_id = "gpt2-large"
device = "cpu"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)

tokenizer = AudioToken(tokenizer=Tokenizers.acoustic, device=device)

In [13]:
def p1(audio, start_of_sentence=" "):
    max_length = 500
    stride = 1
    
    pad_token_id = 0

    encodings = tokenizer.encode(audio)
    encodings = encodings[:,-1,:]
    seq_len = encodings.size(1)
    padding_len = max_length -1 
    padded_input_ids = pad(torch.tensor([], dtype=torch.long), (0, padding_len), value=pad_token_id).unsqueeze(dim=0)
    encodings = torch.cat([padded_input_ids, encodings], dim=1)
    seq_len = encodings.size(1)
    
    
    nlls = []
    prev_end_loc = padding_len
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from max_length on the last loop 
        begin_loc = max(padding_len, begin_loc)
        input_ids = encodings[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood.item())

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
        
    return nlls

In [None]:
for session_number in df["session"].to_list():
    print("Session: ", session_number)

    speakers_audio = get_audio(session_number, r"C:\Users\a_has\Desktop\Multisimo")
    speakers = sorted([s for s in speakers_audio.keys() if s != "sr"])
    
    tokenizer.model_sample_rate = speakers_audio["sr"]
    # l = speakers_audio[speakers[0]].shape[1]
    # perpl_1 = p1(speakers_audio[speakers[0]][:, :int(l/10)])
    perpl_1 = p1(speakers_audio[speakers[0]])
    df.loc[df["session"] == session_number, "audio_perpl_P1"] = np.nanmean(np.asarray(perpl_1))
    perpl_2 = p1(speakers_audio[speakers[1]])
    df.loc[df["session"] == session_number, "audio_perpl_P2"] = np.nanmean(np.asarray(perpl_2))


Session:  02
4990897
torch.Size([1, 1560])


  0%|          | 9/2059 [00:07<26:45,  1.28it/s]


KeyboardInterrupt: 

In [None]:
with open("./pitch_results_audio_token.pkl", "wb") as f:
    pickle.dump(df, f)