# Analysis of model predictions

In [None]:
!pip install datasets
!pip install evaluate

In [None]:
!pip install rouge_score

In [None]:
import json
import evaluate
from collections import defaultdict, Counter
from tqdm import tqdm as tqdm
from musiccaps import load_musiccaps
import string
import numpy as np
import random
import pandas as pd
import re
import string
import itertools

## Helper functions

In [None]:
# Load the three Hugging Face `evaluate` metrics once, in the same order as
# before; the compute_* functions in this notebook read them as globals.
meteor, google_bleu, rouge = (
    evaluate.load(metric_id) for metric_id in ('meteor', 'google_bleu', 'rouge')
)

In [None]:


# Load the MusicCaps rows through the project-local `load_musiccaps` helper.
# This notebook only reads each row's "aspect_list" and "caption" fields.
# NOTE(review): `return_without_audio=True` presumably skips attaching audio,
# which would make `sampling_rate` irrelevant here — confirm in musiccaps.py.
ds = load_musiccaps(
    "./music_data",
    sampling_rate=16000,
    limit=None,
    num_proc=8,
    writer_batch_size=1000,
    return_without_audio=True,
)

def clean_text_for_aspect_metrics(caption):
    """Normalise a caption before searching it for aspect words.

    The caption is split on whitespace, every token is lower-cased and has
    all ``string.punctuation`` characters removed, and the tokens are joined
    back with single spaces. A token that consists only of punctuation
    becomes an empty string, so it leaves a doubled space in the result.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return ' '.join(
        token.lower().translate(strip_punctuation) for token in caption.split()
    )

def preprocessing_remove_unk(text_input):
    """Lower-case a caption, strip punctuation and flag unknown-word tokens.

    Args:
        text_input: Caption string.

    Returns:
        A ``(caption, unk_flag)`` tuple. ``caption`` is the lower-cased text
        with every ``string.punctuation`` character removed from each
        space-separated token. ``unk_flag`` is True when one of those tokens
        is exactly ``unk`` after stripping (e.g. ``<unk>``, but also a
        literal word ``unk``).
    """
    table = str.maketrans('', '', string.punctuation)

    # Split on single spaces (not arbitrary whitespace) so that the original
    # spacing survives the round trip through ' '.join below.
    tokens = [word.translate(table) for word in text_input.lower().split(' ')]

    unk_flag = 'unk' in tokens
    return ' '.join(tokens), unk_flag

# Collect the candidate music-related words used for evaluation: every entry
# of every row's "aspect_list" string, with brackets and quotes removed and
# the remainder split on ", ".
aspects = {
    raw_aspect
    for row in ds
    for raw_aspect in row["aspect_list"].translate(str.maketrans('', '', "[]\"'")).split(", ")
}
# Drop entries of two characters or fewer, then normalise the rest the same
# way captions are normalised.
aspects = {clean_text_for_aspect_metrics(a) for a in aspects if len(a) > 2}
    
def wrap_in_space(s):
    """Return ``s`` with one space added before and after it.

    Padding both the aspect and the text lets a plain substring test match
    whole words only.
    """
    return ''.join((' ', s, ' '))
    
# filter: keep only aspects that occur more than 10 times as whole words in
# the cleaned concatenation of all captions
all_captions = clean_text_for_aspect_metrics(' '.join(row['caption'] for row in ds))
aspect_counts = {a: all_captions.count(wrap_in_space(a)) for a in aspects}
aspects = {a for a in aspects if aspect_counts[a] > 10}
# 'the' is excluded explicitly. '' is what an all-punctuation aspect cleans
# to; wrapped in spaces it becomes a double space, which the cleaner itself
# produces for punctuation-only tokens, so it would match spuriously.
aspects -= {'the', ''}

def compute_aspects_metric(true, pred):
    """Score how well the aspects found in ``pred`` match those in ``true``.

    Both texts are normalised with ``clean_text_for_aspect_metrics`` and
    searched for every entry of the global ``aspects`` set as a
    space-delimited substring.

    Returns:
        ``(precision, recall)``: the number of shared aspects divided by the
        number of aspects in the prediction and in the reference
        respectively. Each denominator is clamped to at least 1, so a text
        without any aspect yields 0 rather than a division error.
    """
    def found_aspects(text):
        padded = wrap_in_space(clean_text_for_aspect_metrics(text))
        return {aspect for aspect in aspects if wrap_in_space(aspect) in padded}

    reference_aspects = found_aspects(true)
    predicted_aspects = found_aspects(pred)
    n_shared = len(reference_aspects & predicted_aspects)

    precision = n_shared / np.maximum(len(predicted_aspects), 1)
    recall = n_shared / np.maximum(len(reference_aspects), 1)

    return precision, recall

In [None]:
def import_outputs(data_path):
    """Read reference and predicted captions from a JSON predictions file.

    Args:
        data_path: Path to a JSON file containing the keys
            ``eval_true_captions`` and ``eval_pred_captions``.

    Returns:
        ``(true_captions, predicted_captions)`` exactly as stored in the file.
        NOTE(review): an entry of ``true_captions`` appears to be either a
        single string or a list of reference strings depending on the run —
        ``compute_metrics_summaries_gpt2`` handles both; confirm per file.

    Raises:
        KeyError: If either expected key is missing from the file.
    """
    # Decode explicitly as UTF-8 so reading does not depend on the platform's
    # default locale encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    return data['eval_true_captions'], data['eval_pred_captions']

# Compute metrics for the non-summarized dataset (noaug + chataug)

In [None]:
def compute_metrics(true_captions,predicted_captions):
    """Score predictions against single reference captions.

    Every (reference, prediction) pair is cleaned with
    ``preprocessing_remove_unk``; pairs whose prediction contains an ``unk``
    token are left out of all scores.

    Returns:
        An 8-tuple ``(google_bleu, rouge, meteor, aspect_precision_list,
        aspect_recall_list, spec_meteor, spec_gleu, spec_rouge)``. The first
        three are the result dicts of the corresponding ``evaluate`` metrics,
        the two lists hold one aspect precision / recall value per kept pair,
        and each ``spec_*`` value is the real score minus the mean score
        obtained over 20 random re-pairings of references and predictions
        (ROUGE uses the ``rougeL`` entry).
    """
    pred_clean, true_clean = [], []
    aspect_precision_list, aspect_recall_list = [], []

    for raw_true, raw_pred in tqdm(zip(true_captions, predicted_captions)):
        cleaned_pred, has_unk = preprocessing_remove_unk(raw_pred)
        if has_unk:
            continue

        cleaned_true, _ = preprocessing_remove_unk(raw_true)
        pred_clean.append(cleaned_pred)
        true_clean.append(cleaned_true)

        pair_precision, pair_recall = compute_aspects_metric(cleaned_true, cleaned_pred)
        aspect_precision_list.append(pair_precision)
        aspect_recall_list.append(pair_recall)

    total_google_bleu = google_bleu.compute(predictions=pred_clean, references=true_clean)
    total_rouge = rouge.compute(predictions=pred_clean, references=true_clean)
    total_meteor = meteor.compute(predictions=pred_clean, references=true_clean)

    # Baseline for the "spec" scores: average each metric over random
    # permutations of the references.
    n_shuffles = 20
    weight = 1./n_shuffles
    shuffled_gleu_score = shuffled_meteor_score = shuffled_rouge_score = 0
    for _ in tqdm(range(n_shuffles)):
        true_shuffled = sorted(true_clean, key=lambda _ref: random.random())
        shuffled_gleu_score += weight * google_bleu.compute(
            predictions=pred_clean, references=true_shuffled)['google_bleu']
        shuffled_meteor_score += weight * meteor.compute(
            predictions=pred_clean, references=true_shuffled)['meteor']
        shuffled_rouge_score += weight * rouge.compute(
            predictions=pred_clean, references=true_shuffled)['rougeL']

    spec_meteor = total_meteor["meteor"] - shuffled_meteor_score
    spec_gleu = total_google_bleu["google_bleu"] - shuffled_gleu_score
    spec_rouge = total_rouge["rougeL"] - shuffled_rouge_score

    return (total_google_bleu, total_rouge, total_meteor,
            aspect_precision_list, aspect_recall_list,
            spec_meteor, spec_gleu, spec_rouge)


In [None]:
def print_metrics(data_paths,methods):
    """Print all scores from ``compute_metrics`` for each predictions file.

    ``data_paths`` and ``methods`` are paired by position; each printed line
    starts with the method name. Corpus-level scores are rounded to three
    decimals, and the aspect scores are reported as mean +/- standard
    deviation over the per-caption values.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        (google_bleu_score, rouge_score, meteor_score, precision, recall,
         spec_meteor, spec_gleu, spec_rouge) = compute_metrics(true_captions, predicted_captions)

        corpus_scores = (
            ("test Google BLEU score:", google_bleu_score["google_bleu"]),
            ("test ROUGE score:", rouge_score["rougeL"]),
            ("test METEOR score:", meteor_score["meteor"]),
            ("test spec - Google BLEU score:", spec_gleu),
            ("test spec - ROUGE score:", spec_rouge),
            ("test spec - METEOR score:", spec_meteor),
        )
        for label, value in corpus_scores:
            print(method, label, str(np.round(value, 3)))

        per_caption_scores = (
            ("test aspect precision score:", np.array(precision)),
            ("test aspect recall score:", np.array(recall)),
        )
        for label, values in per_caption_scores:
            print(method, label, str(np.round(np.mean(values), 3)), '+/-', str(np.round(np.std(values), 3)))

In [None]:
# Prediction files of the non-summarized runs and their display names;
# print_metrics pairs the two lists by position.
data_paths = [
    "outputs/preds_gpt2_enc_noaug.json",
    "outputs/preds_gpt2_enc_chataug.json",
    "outputs/preds_lstm_noattn_noaug.json",
    "outputs/preds_lstm_attn_noaug.json",
]

methods = [
    "GPT-2 fine-tuned encoder no aug",
    "GPT-2 fine-tuned encoder with ChatAug",
    "LSTM no attention no aug",
    "LSTM attention no aug",
]



In [None]:
# Score and print every run listed in data_paths / methods.
print_metrics(data_paths,methods)

# Compute metrics for the summarized dataset (noaug + chataug)

In [None]:
def compute_metrics_summaries_gpt2(true_captions,predicted_captions):
    """Score predictions whose references may be lists of captions.

    Every pair is cleaned with ``preprocessing_remove_unk``; pairs whose
    prediction contains an ``unk`` token are left out of all scores. A
    reference may be a single string or a list of strings: lists are passed
    to the ``evaluate`` metrics as multiple references and, for the aspect
    metric, joined into one text.

    Returns:
        An 8-tuple ``(google_bleu, rouge, meteor, aspect_precision_list,
        aspect_recall_list, spec_meteor, spec_gleu, spec_rouge)``. The first
        three are the metric result dicts, the two lists hold one aspect
        precision / recall value per kept pair, and each ``spec_*`` value is
        the real score minus the mean score over 20 random re-pairings of
        references and predictions (ROUGE uses the ``rougeL`` entry).
    """
    true_clean = []
    pred_clean = []
    aspect_precision_list = []
    aspect_recall_list = []

    for true, pred in tqdm(zip(true_captions, predicted_captions)):
        # preprocess captions and predictions to remove punctuations and <unk> tokens
        pred, pred_unk_flag = preprocessing_remove_unk(pred)
        if pred_unk_flag:
            continue

        pred_clean.append(pred)
        if isinstance(true, list):
            true = [preprocessing_remove_unk(caption)[0] for caption in true]
            # Join with a space: plain concatenation fuses the last word of
            # one reference with the first word of the next, hiding both from
            # the whole-word aspect search. This also works for any number of
            # references, not just three.
            concatenated_true = ' '.join(true)
        else:
            true = preprocessing_remove_unk(true)[0]
            # A single reference is its own aspect text; without this the
            # name is undefined or stale from the previous pair.
            concatenated_true = true
        true_clean.append(true)

        # compute aspect metrics:
        precision, recall = compute_aspects_metric(concatenated_true, pred)
        aspect_precision_list.append(precision)
        aspect_recall_list.append(recall)

    print(len(pred_clean))
    print(len(true_clean))
    total_google_bleu = google_bleu.compute(predictions=pred_clean, references=true_clean)
    total_rouge = rouge.compute(predictions=pred_clean, references=true_clean)
    total_meteor = meteor.compute(predictions=pred_clean, references=true_clean)

    # compute spec metrics: real score minus the average score obtained when
    # the references are randomly permuted against the predictions
    n_shuffles = 20
    shuffled_gleu_score, shuffled_meteor_score, shuffled_rouge_score = 0, 0, 0
    for _ in tqdm(range(n_shuffles)):
        true_shuffled = sorted(true_clean, key=lambda k: random.random())
        shuffled_gleu_score += 1./n_shuffles * google_bleu.compute(predictions=pred_clean, references=true_shuffled)['google_bleu']
        shuffled_meteor_score += 1./n_shuffles * meteor.compute(predictions=pred_clean, references=true_shuffled)['meteor']
        shuffled_rouge_score += 1./n_shuffles * rouge.compute(predictions=pred_clean, references=true_shuffled)['rougeL']
    spec_meteor = total_meteor["meteor"] - shuffled_meteor_score
    spec_gleu = total_google_bleu["google_bleu"] - shuffled_gleu_score
    spec_rouge = total_rouge["rougeL"] - shuffled_rouge_score

    return total_google_bleu, total_rouge, total_meteor, aspect_precision_list, aspect_recall_list, spec_meteor, spec_gleu, spec_rouge


In [None]:
# Prediction file and display name of the GPT-2 run on the summarized
# dataset; the two lists are paired by position.
data_paths_summarized_gpt2 = [
    "outputs/preds_gpt2_enc_summarized.json",
]

methods_summarized_gpt2 = [
    "GPT-2 fine-tuned encoder and summarized dataset",
]

In [None]:
def print_metrics2(data_paths,methods):
    """Print all scores from ``compute_metrics_summaries_gpt2`` per file.

    ``data_paths`` and ``methods`` are paired by position; each printed line
    starts with the method name. Corpus-level scores are rounded to three
    decimals, and the aspect scores are reported as mean +/- standard
    deviation over the per-caption values.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        (google_bleu_score, rouge_score, meteor_score, precision, recall,
         spec_meteor, spec_gleu, spec_rouge) = compute_metrics_summaries_gpt2(true_captions, predicted_captions)

        corpus_scores = (
            ("test GLEU score:", google_bleu_score["google_bleu"]),
            ("test google ROUGE score:", rouge_score["rougeL"]),
            ("test google METEOR score:", meteor_score["meteor"]),
            ("test spec - Google BLEU score:", spec_gleu),
            ("test spec - ROUGE score:", spec_rouge),
            ("test spec - METEOR score:", spec_meteor),
        )
        for label, value in corpus_scores:
            print(method, label, str(np.round(value, 3)))

        per_caption_scores = (
            ("test aspect precision score:", precision),
            ("test aspect recall score:", recall),
        )
        for label, values in per_caption_scores:
            print(method, label, str(np.round(np.mean(values), 3)), '+/-', str(np.round(np.std(values), 3)))

In [None]:
# Score and print the GPT-2 run trained on the summarized dataset.
print_metrics2(data_paths_summarized_gpt2,methods_summarized_gpt2)

In [None]:
def compute_metrics_summaries_lstm(true_captions,predicted_captions):
    """Score LSTM predictions stored as flat (reference, prediction) triples.

    The inputs are flat lists in which, per the original author's note, each
    prediction is repeated three times, once per reference caption. Pairs
    whose prediction contains an ``unk`` token are skipped; the remaining
    references are regrouped three at a time and one prediction is kept per
    group.

    Returns:
        An 8-tuple ``(google_bleu, rouge, meteor, aspect_precision_list,
        aspect_recall_list, spec_meteor, spec_gleu, spec_rouge)``. The first
        three are the metric result dicts computed with three references per
        prediction, the two lists hold one aspect precision / recall value
        per prediction, and each ``spec_*`` value is the real score minus the
        mean score over 20 random re-pairings of reference groups and
        predictions (ROUGE uses the ``rougeL`` entry).

    Raises:
        ValueError: If the kept references cannot be grouped three per kept
            prediction (the input does not have the expected layout).
    """
    refs_per_prediction = 3
    true_clean = []
    pred_clean = []
    aspect_recall_list = []
    aspect_precision_list = []

    for i, (true, pred) in tqdm(enumerate(zip(true_captions, predicted_captions))):
        # preprocess captions and predictions to remove punctuations and <unk> tokens
        pred, pred_unk_flag = preprocessing_remove_unk(pred)
        if pred_unk_flag:
            continue
        true_clean.append(preprocessing_remove_unk(true)[0])
        # the model outputs 3 identical predictions so we only keep the first one
        if i % refs_per_prediction == 0:
            pred_clean.append(pred)

    # Fail with a clear message instead of an IndexError (or a silent
    # misalignment) when the triple layout does not hold.
    if len(true_clean) != refs_per_prediction * len(pred_clean):
        raise ValueError(
            "expected {} references per prediction, got {} references for {} predictions".format(
                refs_per_prediction, len(true_clean), len(pred_clean)))

    nested_summarized_true_captions = [
        true_clean[start:start + refs_per_prediction]
        for start in range(0, len(true_clean), refs_per_prediction)
    ]

    # compute aspect metrics:
    for true, pred in tqdm(zip(nested_summarized_true_captions, pred_clean)):
        # Join with a space: plain concatenation fuses the last word of one
        # reference with the first word of the next, hiding both from the
        # whole-word aspect search.
        concat_true = ' '.join(true)
        precision, recall = compute_aspects_metric(concat_true, pred)
        aspect_precision_list.append(precision)
        aspect_recall_list.append(recall)

    print(len(pred_clean))
    print(len(nested_summarized_true_captions))
    total_google_bleu = google_bleu.compute(predictions=pred_clean, references=nested_summarized_true_captions)
    total_rouge = rouge.compute(predictions=pred_clean, references=nested_summarized_true_captions)
    total_meteor = meteor.compute(predictions=pred_clean, references=nested_summarized_true_captions)

    # compute spec metrics: real score minus the average score obtained when
    # the reference groups are randomly permuted against the predictions
    n_shuffles = 20
    shuffled_gleu_score, shuffled_meteor_score, shuffled_rouge_score = 0, 0, 0
    for _ in tqdm(range(n_shuffles)):
        true_shuffled = sorted(nested_summarized_true_captions, key=lambda k: random.random())
        shuffled_gleu_score += 1./n_shuffles * google_bleu.compute(predictions=pred_clean, references=true_shuffled)['google_bleu']
        shuffled_meteor_score += 1./n_shuffles * meteor.compute(predictions=pred_clean, references=true_shuffled)['meteor']
        shuffled_rouge_score += 1./n_shuffles * rouge.compute(predictions=pred_clean, references=true_shuffled)['rougeL']
    spec_meteor = total_meteor["meteor"] - shuffled_meteor_score
    spec_gleu = total_google_bleu["google_bleu"] - shuffled_gleu_score
    spec_rouge = total_rouge["rougeL"] - shuffled_rouge_score

    return total_google_bleu, total_rouge, total_meteor, aspect_precision_list, aspect_recall_list, spec_meteor, spec_gleu, spec_rouge

In [None]:
# Prediction files and display names of the LSTM runs on the summarized
# dataset; the two lists are paired by position.
data_paths_summarized_lstm = [
    "outputs/preds_lstm_noattn_summaries.json",
    "outputs/preds_lstm_attn_summaries.json",
]

methods_summarized_lstm = [
    "LSTM no attention and summarized dataset",
    "LSTM with attention and summarized dataset",
]

In [None]:
def print_metrics3(data_paths,methods):
    """Print all scores from ``compute_metrics_summaries_lstm`` per file.

    ``data_paths`` and ``methods`` are paired by position; each printed line
    starts with the method name. Corpus-level scores are rounded to three
    decimals, and the aspect scores are reported as mean +/- standard
    deviation over the per-prediction values.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        (google_bleu_score, rouge_score, meteor_score, precision, recall,
         spec_meteor, spec_gleu, spec_rouge) = compute_metrics_summaries_lstm(true_captions, predicted_captions)

        corpus_scores = (
            ("test GLEU score:", google_bleu_score["google_bleu"]),
            ("test google ROUGE score:", rouge_score["rougeL"]),
            ("test google METEOR score:", meteor_score["meteor"]),
            ("test spec - Google BLEU score:", spec_gleu),
            ("test spec - ROUGE score:", spec_rouge),
            ("test spec - METEOR score:", spec_meteor),
        )
        for label, value in corpus_scores:
            print(method, label, str(np.round(value, 3)))

        per_prediction_scores = (
            ("test aspect precision score:", np.array(precision)),
            ("test aspect recall score:", np.array(recall)),
        )
        for label, values in per_prediction_scores:
            print(method, label, str(np.round(np.mean(values), 3)), '+/-', str(np.round(np.std(values), 3)))

In [None]:
# Score and print the LSTM runs trained on the summarized dataset.
print_metrics3(data_paths_summarized_lstm,methods_summarized_lstm)