In [11]:
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error
from openai import OpenAI
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import os
import re
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from torch.nn.functional import pad

# Load the previously pickled results table; later cells select rows of it by
# its "session" column and add per-speaker score columns.
with open("../data/results/non_verbal_multi/pitch_results.pkl", "rb") as results_file:
    df = pickle.load(results_file)

In [12]:
# Load the pickled pitch series: a list with one dict per session, each holding
# a "session" id and a "pitch_series" list of {"speaker", "section", "f0"}
# entries ("f0" may be None), as shown in the cell output.
with open("../data/results/non_verbal_multi/pitch_series.pkl", "rb") as series_file:
    all_pitch_series = pickle.load(series_file)

# Bare last expression so the notebook displays the loaded structure.
all_pitch_series

[{'session': '02',
  'pitch_series': [{'speaker': 'P007',
    'section': (44.627, 45.255),
    'f0': np.float64(129.36047426137077)},
   {'speaker': 'P006', 'section': (53.186, 53.716), 'f0': None},
   {'speaker': 'P007',
    'section': (65.778, 66.772),
    'f0': np.float64(104.2716078664294)},
   {'speaker': 'P007',
    'section': (72.859, 74.142),
    'f0': np.float64(96.67964454749135)},
   {'speaker': 'P006',
    'section': (74.814, 76.272),
    'f0': np.float64(93.21548537833664)},
   {'speaker': 'P007',
    'section': (77.794, 84.953),
    'f0': np.float64(91.83114509311334)},
   {'speaker': 'P006',
    'section': (84.953, 85.767),
    'f0': np.float64(96.24202992718331)},
   {'speaker': 'P006',
    'section': (86.549, 88.503),
    'f0': np.float64(103.2523573737131)},
   {'speaker': 'P007',
    'section': (89.822, 90.935),
    'f0': np.float64(93.16820507804633)},
   {'speaker': 'P007',
    'section': (91.432, 92.761),
    'f0': np.float64(89.94756294769383)},
   {'speaker': 'P

In [13]:
# Configuration shared by the scoring cells below.
model_id = "gpt2-large"
device = "cpu"
# Separator placed between the strings that p1() joins before tokenizing.
start_of_sentence = " "

# Tokenizer and language model are loaded from the same checkpoint id.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
model = GPT2LMHeadModel.from_pretrained(model_id)
model = model.to(device)

def p1(dialog, start_of_sentence=" ", max_length=500):
    """Score a text token by token with the module-level language model.

    The strings in ``dialog`` are joined with ``start_of_sentence``, tokenized
    with the module-level ``tokenizer`` and scored with a stride of one token:
    for every token position the module-level ``model`` is run on a window made
    of that token plus up to ``max_length - 1`` preceding tokens, with all
    labels except the last one set to -100 so that only the last token of the
    window contributes to the returned loss.

    Args:
        dialog: Iterable of strings that are scored as one joined sequence.
        start_of_sentence: Separator inserted between the strings of ``dialog``.
        max_length: Maximum window size in tokens, target token included.

    Returns:
        list[float]: one ``outputs.loss`` value per token, in token order.
        The first entry is NaN: its window holds a single token, so there is
        no prediction to score (callers should aggregate with a NaN-aware
        function such as ``np.nanmean``). An empty list is returned when the
        joined text produces no tokens.

    Raises:
        ValueError: if ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    text = f"{start_of_sentence}".join(dialog)
    token_ids = tokenizer(text, return_tensors="pt").input_ids
    # Nothing to score; avoids calling the model on a zero-length window.
    if token_ids.numel() == 0:
        return []
    token_ids = token_ids.to(device)
    n_tokens = token_ids.size(1)

    nlls = []
    # One step per real token, so the progress bar total matches the work done.
    for last in tqdm(range(n_tokens)):
        # Context grows until it reaches max_length tokens, then slides.
        window_start = max(0, last + 1 - max_length)
        input_ids = token_ids[:, window_start:last + 1]

        # -100 labels are ignored by the loss; keep only the final token.
        target_ids = input_ids.clone()
        target_ids[:, :-1] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        nlls.append(outputs.loss.item())

    return nlls

In [None]:
# Compute, for every session, the mean per-token loss of each speaker's f0
# sequence and store it in df as "pitch_perpl_P1" / "pitch_perpl_P2".
# NOTE(review): the stored value is the mean of the losses returned by p1, not
# exp(mean); confirm that this is what the "perpl" columns are meant to hold.

# f0 values at or above this bound are dropped before scoring.
# NOTE(review): presumably Hz and meant to discard pitch-tracking outliers — confirm.
F0_UPPER_BOUND = 200

for session_data in all_pitch_series:
    session_number = session_data["session"]
    print("Session: ", session_number)
    pitch_info_series = session_data["pitch_series"]

    # Sorted speaker ids give a stable P1/P2 assignment within a session.
    speakers = sorted({pitch_info["speaker"] for pitch_info in pitch_info_series})
    if len(speakers) < 2:
        print("  fewer than two speakers found; missing speaker columns are left unset")

    session_rows = df["session"] == session_number

    # zip() stops at the shorter sequence: a session with a single speaker only
    # fills P1 (instead of raising IndexError), and speakers beyond the second
    # are ignored.
    for column, speaker in zip(("pitch_perpl_P1", "pitch_perpl_P2"), speakers):
        pitch_values = [
            pitch_info["f0"]
            for pitch_info in pitch_info_series
            if pitch_info["speaker"] == speaker
            and pitch_info["f0"] is not None
            and pitch_info["f0"] < F0_UPPER_BOUND
        ]

        # No usable f0 value for this speaker: there is no text to score.
        if not pitch_values:
            df.loc[session_rows, column] = np.nan
            continue

        # The f0 values are rendered as one comma-separated string and scored
        # token by token; nanmean skips the NaN p1 yields for the first token.
        token_losses = p1(
            [", ".join(map(str, pitch_values))],
            start_of_sentence=start_of_sentence,
        )
        df.loc[session_rows, column] = np.nanmean(np.asarray(token_losses))


Session:  02


  5%|▍         | 24/524 [00:11<03:50,  2.16it/s]

[nan, 2.4845290184020996, 8.519386291503906, 11.478304862976074, 9.748420715332031, 8.352367401123047, 4.442245006561279, 2.7687511444091797, 7.934508323669434, 0.14185687899589539, 5.192274570465088, 4.944060802459717, 6.343224048614502, 4.208057880401611, 4.794170379638672, 7.168205738067627, 0.15340711176395416, 5.308382987976074, 0.020211460068821907, 5.159751892089844, 4.855391502380371, 4.765774250030518, 4.813440322875977, 4.645648956298828, 7.795556545257568]





In [None]:
# Persist the results table, including the score columns added above.
with open("./pitch_results_3.pkl", "wb") as output_file:
    pickle.dump(df, output_file)