# Analysis of model predictions

In [3]:
!pip install datasets
!pip install evaluate



In [55]:
!pip install rouge_score



In [1]:
import json
import evaluate
from collections import defaultdict, Counter
from tqdm import tqdm as tqdm
from musiccaps import load_musiccaps
import string
import numpy as np
import random
import pandas as pd
import re
import string
import itertools

  from .autonotebook import tqdm as notebook_tqdm


## Helper functions

In [2]:
# Load the three text-generation metrics once, in the order
# METEOR, Google BLEU (GLEU), ROUGE; they are used as module-level globals.
meteor, google_bleu, rouge = (
    evaluate.load(metric_name)
    for metric_name in ('meteor', 'google_bleu', 'rouge')
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/corinacaraconcea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/corinacaraconcea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/corinacaraconcea/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:


# Load the MusicCaps dataset from ./music_data.
# NOTE(review): return_without_audio=True presumably skips loading audio;
# confirm against musiccaps.load_musiccaps.
ds = load_musiccaps(
    "./music_data",
    return_without_audio=True,
    sampling_rate=16000,
    num_proc=8,
    writer_batch_size=1000,
    limit=None,
)

def clean_text_for_aspect_metrics(caption):
    """Normalize a caption (or aspect phrase) for aspect matching.

    Hyphens are turned into spaces, the text is split on whitespace, each
    token is lowercased and stripped of every ``string.punctuation``
    character, and the tokens are re-joined with single spaces. Tokens that
    consist solely of punctuation become empty strings and are kept, so they
    show up as an extra space in the result.

    Args:
        caption: Raw text to normalize.

    Returns:
        The normalized text as a single string.
    """
    # str.replace returns a new string, so the result has to be re-bound;
    # otherwise "hip-hop" collapses to "hiphop" when punctuation is stripped.
    caption = caption.replace("-", " ")
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = [word.lower().translate(strip_punctuation) for word in caption.split()]
    return ' '.join(tokens)

def preprocessing_remove_unk(text_input):
    """Lowercase a caption, drop ``<unk>`` placeholders and strip punctuation.

    The text is lowercased and split on single space characters. Every token
    that equals ``unk`` once surrounding punctuation is ignored (``<unk>``,
    ``unk``, ``<unk>,`` ...) is dropped; all occurrences are removed, not just
    the first one. Every ``string.punctuation`` character is then deleted from
    the remaining tokens, which are re-joined with single spaces.

    Because the split is on ``' '`` only, consecutive spaces produce empty
    tokens that survive as extra spaces in the result.

    Args:
        text_input: Raw caption or prediction string.

    Returns:
        The cleaned caption as a single string.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned_words = []
    for word in text_input.lower().split(' '):
        # Compare before deleting punctuation so that only real placeholder
        # tokens are dropped, including ones with attached punctuation.
        if word.strip(string.punctuation) == "unk":
            continue
        cleaned_words.append(word.translate(strip_punctuation))

    return ' '.join(cleaned_words)

# Collect the vocabulary of music-related aspect phrases used for evaluation.
# Each row's "aspect_list" is a string; the list syntax characters [ ] " ' are
# deleted and the remainder is split on ", ".
_list_syntax = str.maketrans('', '', "[]\"'")
raw_aspects = set()
for row in ds:
    raw_aspects.update(row["aspect_list"].translate(_list_syntax).split(", "))
# Drop raw phrases of length <= 2, then normalize the rest.
aspects = {
    clean_text_for_aspect_metrics(phrase)
    for phrase in raw_aspects
    if len(phrase) > 2
}
    
def wrap_in_space(s):
    """Return ``s`` with one space prepended and one space appended.

    Padding both the phrase and the text it is searched in lets a plain
    substring test match whole words only.
    """
    return ' '.join(('', s, ''))
    
# Keep only aspects that occur, as space-delimited phrases, more than 10 times
# in the cleaned concatenation of all captions; then drop the word 'the'.
all_captions = clean_text_for_aspect_metrics(
    ' '.join(ds[idx]['caption'] for idx in range(len(ds)))
)
aspect_counts = {
    phrase: all_captions.count(wrap_in_space(phrase)) for phrase in aspects
}
aspects = {phrase for phrase, n_hits in aspect_counts.items() if n_hits > 10}
aspects.discard('the')

def compute_aspects_metric(true, pred):
    """Precision and recall of the aspect phrases mentioned in a prediction.

    Both texts are normalized with ``clean_text_for_aspect_metrics`` and padded
    with spaces; an aspect from the module-level ``aspects`` set counts as
    mentioned when its space-padded form is a substring of the padded text.

    Args:
        true: Reference caption.
        pred: Predicted caption.

    Returns:
        ``(precision, recall)``. Each denominator is clamped to at least 1, so
        a text that mentions no aspects yields 0 rather than a division error.
    """
    def _aspects_mentioned(text):
        padded = wrap_in_space(clean_text_for_aspect_metrics(text))
        return {a for a in aspects if wrap_in_space(a) in padded}

    aspects_in_true = _aspects_mentioned(true)
    aspects_in_pred = _aspects_mentioned(pred)
    n_shared = len(aspects_in_true & aspects_in_pred)

    precision = n_shared / np.maximum(len(aspects_in_pred), 1)
    recall = n_shared / np.maximum(len(aspects_in_true), 1)

    return precision, recall

Found cached dataset csv (/Users/corinacaraconcea/.cache/huggingface/datasets/google___csv/google--MusicCaps-bedc2a0fd7888f2f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [4]:
def import_outputs(data_path):
    """Load reference and predicted captions from a JSON predictions file.

    Args:
        data_path: Path to a JSON file whose top-level object has the keys
            ``'eval_true_captions'`` and ``'eval_pred_captions'``.

    Returns:
        ``(true_captions, predicted_captions)`` exactly as stored in the file.

    Raises:
        KeyError: If either key is missing from the file.
    """
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's default encoding.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)

    # may hold multiple references for one caption
    true_captions = data['eval_true_captions']
    # single prediction per entry
    predicted_captions = data['eval_pred_captions']

    return true_captions, predicted_captions

# Compute metrics for the non-summarized dataset (noaug + chataug)

In [5]:
def compute_metrics(true_captions, predicted_captions):
    """Score predictions against references with GLEU, ROUGE and METEOR.

    Every reference and prediction is first cleaned with
    ``preprocessing_remove_unk``. The two inputs are paired with ``zip``, so
    iteration stops at the shorter one.

    Args:
        true_captions: Iterable of reference caption strings.
        predicted_captions: Iterable of predicted caption strings.

    Returns:
        ``(google_bleu_result, rouge_result, meteor_result)`` as returned by
        the ``compute`` method of the corresponding module-level metrics.
    """
    # Clean each (reference, prediction) pair, showing progress with tqdm.
    cleaned_pairs = [
        (preprocessing_remove_unk(reference), preprocessing_remove_unk(prediction))
        for reference, prediction in tqdm(zip(true_captions, predicted_captions))
    ]
    true_clean = [reference for reference, _ in cleaned_pairs]
    pred_clean = [prediction for _, prediction in cleaned_pairs]

    metric_inputs = dict(predictions=pred_clean, references=true_clean)
    return (
        google_bleu.compute(**metric_inputs),
        rouge.compute(**metric_inputs),
        meteor.compute(**metric_inputs),
    )


In [6]:
def print_metrics(data_paths, methods):
    """Print GLEU, ROUGE-1 and METEOR, rounded to 3 decimals, for each run.

    Args:
        data_paths: Paths of prediction JSON files readable by ``import_outputs``.
        methods: Display labels, paired position-wise with ``data_paths``.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        gleu_result, rouge_result, meteor_result = compute_metrics(
            true_captions, predicted_captions
        )

        # (printed label, metric result, key of the value to report)
        # NOTE(review): the ROUGE and METEOR labels say "google" although only
        # the GLEU value comes from the google_bleu metric.
        report_rows = (
            ("test GLEU score:", gleu_result, "google_bleu"),
            ("test google ROUGE score:", rouge_result, "rouge1"),
            ("test google METEOR score:", meteor_result, "meteor"),
        )
        for label, result, key in report_rows:
            print(method, label, str(np.round(result[key], 3)))

In [10]:
# Each run pairs a predictions file with the label printed next to its scores.
_plain_runs = [
    ("outputs/preds_gpt2_enc_noaug.json", "GPT-2 fine-tuned encoder no aug"),
    ("outputs/preds_gpt2_enc_chataug.json", "GPT-2 fine-tuned encoder with ChatAug"),
    ("outputs/preds_lstm_noattn_noaug.json", "LSTM no attention no aug"),
    ("outputs/preds_lstm_attn_noaug.json", "LSTM attention no aug"),
]

data_paths = [path for path, _ in _plain_runs]
methods = [label for _, label in _plain_runs]



In [11]:
# Report scores for the runs on the non-summarized dataset.
print_metrics(data_paths=data_paths, methods=methods)

549it [00:00, 15937.11it/s]


GPT-2 fine-tuned encoder no aug test GLEU score: 0.091
GPT-2 fine-tuned encoder no aug test google ROUGE score: 0.287
GPT-2 fine-tuned encoder no aug test google METEOR score: 0.208


549it [00:00, 20248.62it/s]


GPT-2 fine-tuned encoder with ChatAug test GLEU score: 0.093
GPT-2 fine-tuned encoder with ChatAug test google ROUGE score: 0.302
GPT-2 fine-tuned encoder with ChatAug test google METEOR score: 0.213


549it [00:00, 24083.26it/s]


LSTM no attention no aug test GLEU score: 0.084
LSTM no attention no aug test google ROUGE score: 0.276
LSTM no attention no aug test google METEOR score: 0.185


549it [00:00, 24687.72it/s]


LSTM attention no aug test GLEU score: 0.083
LSTM attention no aug test google ROUGE score: 0.276
LSTM attention no aug test google METEOR score: 0.184


# Compute metrics for the summarized dataset (noaug + chataug)

In [107]:
def compute_metrics_summaries_gpt2(true_captions, predicted_captions, show_examples=2):
    """Score predictions against one or several references per item.

    Each entry of ``true_captions`` may be a single string or a list of
    reference strings; every string is cleaned with
    ``preprocessing_remove_unk``. The two inputs are paired with ``zip``, so
    iteration stops at the shorter one.

    Args:
        true_captions: Iterable of reference strings or lists of reference
            strings.
        predicted_captions: Iterable of predicted caption strings.
        show_examples: Number of cleaned predictions and references to print
            as a sanity check before scoring. Defaults to 2; pass 0 to print
            nothing.

    Returns:
        ``(google_bleu_result, rouge_result, meteor_result)`` as returned by
        the ``compute`` method of the corresponding module-level metrics.
    """
    true_clean = []
    pred_clean = []
    for true, pred in tqdm(zip(true_captions, predicted_captions)):
        # preprocess captions and predictions to remove punctuation and <unk> tokens
        if isinstance(true, list):
            true = [preprocessing_remove_unk(caption) for caption in true]
        else:
            true = preprocessing_remove_unk(true)
        true_clean.append(true)
        pred_clean.append(preprocessing_remove_unk(pred))

    # Optional preview of the cleaned inputs; a non-positive count disables it.
    if show_examples and show_examples > 0:
        print(pred_clean[:show_examples])
        print(true_clean[:show_examples])

    total_google_bleu = google_bleu.compute(predictions=pred_clean, references=true_clean)
    total_rouge = rouge.compute(predictions=pred_clean, references=true_clean)
    total_meteor = meteor.compute(predictions=pred_clean, references=true_clean)

    return total_google_bleu, total_rouge, total_meteor


In [114]:
# Predictions file and display label for the GPT-2 run on the summarized dataset.
data_paths_summarized_gpt2 = [
    "outputs/preds_gpt2_enc_summarized.json",
]

methods_summarized_gpt2 = [
    "GPT-2 fine-tuned encoder and summarized dataset",
]

In [115]:
def print_metrics2(data_paths, methods):
    """Print GLEU, ROUGE-1 and METEOR for runs scored with
    ``compute_metrics_summaries_gpt2``.

    Args:
        data_paths: Paths of prediction JSON files readable by ``import_outputs``.
        methods: Display labels, paired position-wise with ``data_paths``.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        gleu_result, rouge_result, meteor_result = compute_metrics_summaries_gpt2(
            true_captions, predicted_captions
        )

        # (printed label, metric result, key of the value to report)
        report_rows = (
            ("test GLEU score:", gleu_result, "google_bleu"),
            ("test google ROUGE score:", rouge_result, "rouge1"),
            ("test google METEOR score:", meteor_result, "meteor"),
        )
        for label, result, key in report_rows:
            print(method, label, str(np.round(result[key], 3)))

In [116]:
# Report scores for the GPT-2 run on the summarized dataset.
print_metrics2(
    data_paths=data_paths_summarized_gpt2,
    methods=methods_summarized_gpt2,
)

549it [00:00, 13365.18it/s]


['the music has a mellow electric guitar melody bass guitar shimmering cymbals and punchy kick and snare hits the recording quality is low but the overall vibe is relaxed and easygoing', 'the recording has a distorted electric guitar melody with background noise it is in mono and sounds like it was recorded with a phone']
[['the song has an electric guitar as the main instrument playing a descending run arpeggiated chord double stop hammer and descending slide followed by a chord run the percussion uses rim shots and plays a simple beat in common time the bass plays only one note on the first count of each bar while the piano provides backing chords there are no vocals and the mood is relaxing', 'this is a laidback instrumental piece featuring an electric guitar percussion bass and piano the guitar opens with a descending run and goes on to play arpeggiated chords with hammerons and slides while the percussion keeps a simple beat the bass plays just one note at the beginning of each ba

In [102]:
def compute_metrics_summaries_lstm(true_captions, predicted_captions, refs_per_prediction=3):
    """Score predictions stored once per reference against grouped references.

    The inputs are flat, position-aligned sequences in which every block of
    ``refs_per_prediction`` consecutive entries belongs to the same item. Only
    the first prediction of each block is kept, and the block's references are
    grouped into one list so the metrics see several references per prediction.
    All strings are cleaned with ``preprocessing_remove_unk``.

    Args:
        true_captions: Flat iterable of reference caption strings.
        predicted_captions: Flat iterable of predicted caption strings, aligned
            with ``true_captions``.
        refs_per_prediction: Number of consecutive entries that belong to one
            item. Defaults to 3.

    Returns:
        ``(google_bleu_result, rouge_result, meteor_result)`` as returned by
        the ``compute`` method of the corresponding module-level metrics.

    Raises:
        ValueError: If ``refs_per_prediction`` is smaller than 1.
    """
    if refs_per_prediction < 1:
        raise ValueError("refs_per_prediction must be a positive integer")

    true_clean = []
    pred_clean = []
    for i, (true, pred) in tqdm(enumerate(zip(true_captions, predicted_captions))):
        # preprocess captions and predictions to remove punctuation and <unk> tokens
        true_clean.append(preprocessing_remove_unk(true))
        # per the original author's note, the model output is repeated for each
        # reference, so only the first prediction of every block is kept
        if i % refs_per_prediction == 0:
            pred_clean.append(preprocessing_remove_unk(pred))

    # Slice instead of indexing i+1, i+2: a shorter final block is kept as a
    # smaller reference group (matching the prediction kept for it above)
    # rather than raising IndexError.
    nested_summarized_true_captions = [
        true_clean[start:start + refs_per_prediction]
        for start in range(0, len(true_clean), refs_per_prediction)
    ]

    total_google_bleu = google_bleu.compute(predictions=pred_clean, references=nested_summarized_true_captions)
    total_rouge = rouge.compute(predictions=pred_clean, references=nested_summarized_true_captions)
    total_meteor = meteor.compute(predictions=pred_clean, references=nested_summarized_true_captions)

    return total_google_bleu, total_rouge, total_meteor

In [103]:
# Each LSTM run on the summarized dataset: predictions file and display label.
_summarized_lstm_runs = [
    ("outputs/preds_lstm_noattn_summaries.json", "LSTM no attention and summarized dataset"),
    ("outputs/preds_lstm_attn_summaries.json", "LSTM with attention and summarized dataset"),
]

data_paths_summarized_lstm = [path for path, _ in _summarized_lstm_runs]
methods_summarized_lstm = [label for _, label in _summarized_lstm_runs]

In [104]:
def print_metrics3(data_paths, methods):
    """Print GLEU, ROUGE-1 and METEOR for runs scored with
    ``compute_metrics_summaries_lstm``.

    Args:
        data_paths: Paths of prediction JSON files readable by ``import_outputs``.
        methods: Display labels, paired position-wise with ``data_paths``.
    """
    for data_path, method in zip(data_paths, methods):
        true_captions, predicted_captions = import_outputs(data_path)
        gleu_result, rouge_result, meteor_result = compute_metrics_summaries_lstm(
            true_captions, predicted_captions
        )

        # (printed label, metric result, key of the value to report)
        report_rows = (
            ("test GLEU score:", gleu_result, "google_bleu"),
            ("test google ROUGE score:", rouge_result, "rouge1"),
            ("test google METEOR score:", meteor_result, "meteor"),
        )
        for label, result, key in report_rows:
            print(method, label, str(np.round(result[key], 3)))

In [105]:
# Report scores for the LSTM runs on the summarized dataset.
print_metrics3(
    data_paths=data_paths_summarized_lstm,
    methods=methods_summarized_lstm,
)

1647it [00:00, 37369.53it/s]


LSTM no attention and summarized dataset test GLEU score: 0.111
LSTM no attention and summarized dataset test google ROUGE score: 0.354
LSTM no attention and summarized dataset test google METEOR score: 0.246


1647it [00:00, 44781.08it/s]


LSTM with attention and summarized dataset test GLEU score: 0.107
LSTM with attention and summarized dataset test google ROUGE score: 0.346
LSTM with attention and summarized dataset test google METEOR score: 0.243


In [106]:
# Sanity check of the grouping used for the LSTM summaries: consecutive
# items are chunked into lists of three.
true_clean = [1, 2, 3, 4, 5, 6]
[true_clean[start:start + 3] for start in range(0, len(true_clean), 3)]

[[1, 2, 3], [4, 5, 6]]