In [1]:
import os
import numpy as np
import json
import argparse

import config
from utils_eval import generate_null, load_transcript, windows, segment_data, WER, BLEU, METEOR, BERTSCORE
from utils_ridge.textgrid import TextGrid

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the evaluation segment definitions from the test-data directory.
# The loaded object is later indexed by story name (see `eval_segments[reference]`).
eval_segments_path = os.path.join(config.DATA_TEST_DIR, "eval_segments.json")
with open(eval_segments_path, "r") as f:
    eval_segments = json.load(f)

In [3]:
# Load the predicted word sequence and its per-word times for this story/run.
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open until it is closed, so both arrays are read inside a context manager
# to release the handle as soon as they are materialised.
# NOTE: `pred_data` is closed after this cell; use `pred_words` / `pred_times`.
with np.load('../results/S1/perceived_speech/wheretheressmoke_2.npz') as pred_data:
    pred_words, pred_times = pred_data["words"], pred_data["times"]

In [4]:
# Generate null word sequences for the prediction times using the project helper.
# NOTE(review): the recorded output shape (10, 1589) suggests the last argument
# to generate_null is the number of null sequences to draw — confirm in utils_eval.
gpt_checkpoint = "perceived"
NUM_NULL_SEQUENCES = 10
null_word_list = generate_null(pred_times, gpt_checkpoint, NUM_NULL_SEQUENCES)

# Display the dimensions of the generated null sequences.
np.shape(null_word_list)

(10, 1589)

In [5]:
# Empty result containers (by name: per-window and per-story scores / z-scores).
window_scores = {}
window_zscores = {}
story_scores = {}
story_zscores = {}

# Story under evaluation, the experiment it belongs to, and its TextGrid annotation.
reference = 'wheretheressmoke'
experiment = "perceived_speech"
grid_path = '../data_test/test_stimulus/perceived_speech/wheretheressmoke.TextGrid'

# Transcript entries are dropped when their word, lowercased and stripped of
# surrounding braces/whitespace, is one of these tokens.
skip_words = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])

transcript_data = {}
with open(grid_path) as f:
    grid = TextGrid(f.read())
    # "perceived_speech" reads tier 1; any other experiment reads tier 0.
    tier_index = 1 if experiment == "perceived_speech" else 0
    transcript = grid.tiers[tier_index].make_simple_transcript()
    # Keep (start, end, lowercased word) for every entry that is not skipped.
    # The stored word is only lowercased; brace stripping applies to the filter alone.
    transcript = [
        (float(start), float(end), word.lower())
        for start, end, word in transcript
        if word.lower().strip("{}").strip() not in skip_words
    ]

# Words plus one time per word, taken as the midpoint of its start and end.
transcript_data["words"] = np.array([word for _, _, word in transcript])
transcript_data["times"] = np.array([(start + end) / 2 for start, end, _ in transcript])

ref_data = transcript_data
ref_words = ref_data["words"]
ref_times = ref_data["times"]

In [29]:
# Cut the reference, predicted, and null word sequences into evaluation windows.
# Window cutoffs come from this story's evaluation segment and config.WINDOW.
segment_bounds = eval_segments[reference]
window_cutoffs = windows(*segment_bounds, config.WINDOW)

# Reference words are binned by their own times; predictions by the prediction times.
ref_windows = segment_data(ref_words, ref_times, window_cutoffs)
pred_windows = segment_data(pred_words, pred_times, window_cutoffs)

# Each null sequence is binned with pred_times as well.
# Disabled alternative: bin predictions and nulls by ref_times instead of pred_times.
null_window_list = [
    segment_data(null_sequence, pred_times, window_cutoffs)
    for null_sequence in null_word_list
]

In [32]:
# Score every null sequence against the reference for each metric.
# All metrics are constructed up front, before any scoring starts.
idf_path = os.path.join(config.DATA_TEST_DIR, "idf_segments.npy")
metrics = [
    BLEU(n = 1),
    METEOR(),
    BERTSCORE(idf_sents = np.load(idf_path), rescale = False, score = "recall"),
]

for metric in metrics:
    # One row of scores per null sequence.
    window_null_scores = np.array([
        metric.score(ref = ref_windows, pred = null_sequence_windows)
        for null_sequence_windows in null_window_list
    ])
    # Average along axis 1 to get a single value per null sequence.
    story_null_scores = window_null_scores.mean(axis = 1)
    print(type(metric), story_null_scores)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /home/AD/tfei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/AD/tfei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/AD/tfei/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-strea

<class 'utils_eval.BLEU'> [0.18409899 0.19254194 0.19817932 0.18524355 0.19031435 0.18813607
 0.17506497 0.18536413 0.19678263 0.19183578]
<class 'utils_eval.METEOR'> [0.13769624 0.14041239 0.14710585 0.13563184 0.1393142  0.13539227
 0.13102061 0.13276535 0.13968289 0.1417376 ]
<class 'utils_eval.BERTSCORE'> [0.78914654 0.7897891  0.7902786  0.78892934 0.7891053  0.7910224
 0.78970736 0.7902623  0.78855485 0.7883179 ]


In [36]:
# Score the actual predictions against the reference for each metric and print
# the mean of the returned scores. Only raw scores are computed in this cell (no
# normalization against the null scores), and `window_scores` is overwritten on
# every iteration, so after the loop it holds the last metric's scores only.
idf_path = os.path.join(config.DATA_TEST_DIR, "idf_segments.npy")
metrics = [
    BLEU(n = 1),
    METEOR(),
    BERTSCORE(idf_sents = np.load(idf_path), rescale = False, score = "recall"),
]

for metric in metrics:
    window_scores = metric.score(ref = ref_windows, pred = pred_windows)
    print(type(metric), window_scores.mean())

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /home/AD/tfei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/AD/tfei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/AD/tfei/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-strea

<class 'utils_eval.BLEU'> 0.23056336621182172
<class 'utils_eval.METEOR'> 0.17134205187930937
<class 'utils_eval.BERTSCORE'> 0.8056429


In [33]:
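# Disabled debugging check: print the length of each of the first 480 windows for
# the reference, the predictions, and the first null sequence.
# for i in range(480):
#     print(len(ref_windows[i]), len(pred_windows[i]), len(null_window_list[0][i]))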