In [None]:
!pip install pysbd
!pip install sentence-transformers
!pip install datasets
!pip install evaluate
!pip install bert_score

In [None]:
import numpy as np
import os
import pandas as pd
import regex as re
import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm
from difflib import SequenceMatcher
from collections import Counter
from functools import partial

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import TensorBoard

import pysbd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AdamWeightDecay
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers.keras_callbacks import KerasMetricCallback
import evaluate
import bert_score

# Register `pandas.DataFrame.progress_apply` and `pandas.Series.progress_map` with `tqdm`
tqdm.pandas()

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ameya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
file_path = '/content/drive/MyDrive/682-project/data/metadata.tsv'

# Import Pandas
import pandas as pd

# Read the TSV file as a DataFrame
metadata_training = pd.read_csv(file_path, sep='\t')
print("Columns: ", metadata_training.columns)
print("Shape: ", metadata_training.shape)

In [None]:
metadata_training.head()

In [None]:
print("Episode Duration Stats:\n"
      f"{metadata_training['duration'].describe()}")
metadata_training['duration'].hist(bins=1000, figsize=(10,5), log=True)
plt.show()

In [None]:
# Map every show prefix to the list of its (episode prefix, episode description) pairs.
# NOTE: `show_episodes` is rebuilt later from the filtered brass set; this first version
# is only used for the statistics below.
show_episodes = metadata_training.groupby(['show_filename_prefix']).apply(lambda x: list(zip(x['episode_filename_prefix'], x['episode_description']))).to_dict()
# Number of episodes available for each show.
show_n_episodes = {k: len(v) for k, v in show_episodes.items()}
print("Statistics about number of episodes per show:\n"
      f"{pd.Series(show_n_episodes.values()).describe()}")
# Log-scaled histogram of the episodes-per-show counts.
pd.Series(show_n_episodes.values()).hist(bins=1000, figsize=(10,5), log=True)
plt.show()

#### Removing NAN values from the episode_description and show_description columns.

In [None]:
print("Before dropping NaN values: \n", metadata_training.isna().any())
metadata_training.dropna(subset=['episode_description', 'show_description'], inplace=True)
print("\nAfter dropping NaN values:\n", metadata_training.isna().any())

#### Gold dataset of 150 episodes, each with 6 candidate summaries (900 document-summary-grade triplets), graded on the Bad/Fair/Good/Excellent scale (0-3; the code below re-encodes the grades as 1-4).

We merge this gold dataset with the dataset we are going to clean, and the best summary of each episode will be considered.

In [None]:
metadata_gold = pd.read_csv('/content/drive/MyDrive/682-project/data/150gold.tsv', sep='\t')
metadata_gold.head()

In [None]:
# Numeric encoding of the letter grades: Bad = 1, Fair = 2, Good = 3, Excellent = 4
quality = {
    'B': 1,
    'F': 2,
    'G': 3,
    'E': 4
}

# convert the EGFB grade columns to a numeric quality score.
# `Series.map` is used column by column (works on every pandas version, unlike
# `applymap`/`DataFrame.map`); grades missing from `quality` become NaN and are
# ignored by the comparisons below.
egfb_columns = ['EGFB', 'EGFB.1', 'EGFB.2', 'EGFB.3', 'EGFB.4', 'EGFB.5']
egfb_to_quality = metadata_gold[egfb_columns].apply(lambda col: col.map(quality))

# keep only the episodes that have at least one summary graded better than "Bad"
egfb_to_quality = egfb_to_quality[(egfb_to_quality > 1).any(axis=1)]

# select the best-graded summary for each episode (first column wins on ties)
best_egfb = egfb_to_quality.apply(lambda row: row.idxmax(), axis=1)

# each grade column sits immediately to the right of the summary it grades, so the
# summary text is one column to the left of the winning grade column.
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0, and the
# lookup is label-based so it does not rely on the index being 0..n-1.
best_summary = [
    metadata_gold.at[idx, metadata_gold.columns[metadata_gold.columns.get_loc(egfb) - 1]]
    for idx, egfb in best_egfb.items()
]

# copy to avoid writing into a view of the original frame
metadata_gold = metadata_gold.loc[best_egfb.index].copy()
metadata_gold['best_summary'] = best_summary

# create a dictionary of the best summary for each episode
gold_summaries = dict(zip(metadata_gold['episode id'], metadata_gold['best_summary']))

In [None]:
metadata_gold.head()

In [None]:
# For the episodes that belong to the gold set, replace the original description
# with the best gold summary (looked up by episode URI).
in_gold_set = metadata_training['episode_uri'].isin(list(gold_summaries))
metadata_training.loc[in_gold_set, 'episode_description'] = (
    metadata_training.loc[in_gold_set, 'episode_uri'].map(gold_summaries)
)

In [None]:
metadata_training['episode_description'][1]

#### We clean and improve these episode descriptions by:

1.   removing the content after "---" that usually is a sponsorship or a boilerplate (e.g., “--- This episode is sponsored by ...” “--- Send in a voice message”)
2.   remove sentences that contain URLs, @mentions and email addresses in the episode descriptions
3. remove tokens corresponding to emoji
4. identify sentences that do not carry useful content and remove them from the descriptions by computing a salience score for each sentence. The score is the mean of the IDF scores of the sentence's content words (the code averages rather than sums them). We then remove the sentences whose salience score is lower than a threshold.

In [None]:
def compute_document_frequencies(episode_descriptions):
    """
    Count, for every lowercase token, in how many descriptions it appears

    Parameters
    ----------
    episode_descriptions : iterable of str
        The descriptions of the episodes

    Returns
    -------
    A pandas.Series indexed by token whose values are the document frequencies
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)

    # every description contributes each of its distinct tokens exactly once
    tokens_per_description = []
    for description in tqdm(episode_descriptions, desc="Computing word frequencies"):
        distinct_tokens = {
            token.lower()
            for sentence in segmenter.segment(description)
            for token in word_tokenize(sentence)
        }
        tokens_per_description += list(distinct_tokens)

    # counting the flattened list therefore yields document (not term) frequencies
    return pd.Series(Counter(tokens_per_description))

# compute the document frequencies that will be used to compute the sentence salience score
document_frequencies = compute_document_frequencies(metadata_training['episode_description'])
print(document_frequencies)

In [None]:
# store the old dataframe to make comparisons
metadata_train_old = metadata_training.copy()

In [None]:
def remove_boilerplate(description):
    """
    Remove boilerplate from the episode description

    Everything from the first "---" marker up to the end of the description is
    dropped, including text that continues on following lines.

    Parameters
    ----------
    description : str
        The episode description

    Returns
    -------
    A description without boilerplate (str)
    """
    # DOTALL lets `.` match line breaks, so boilerplate that spans several lines
    # after the marker is removed as well (without it the match stops at the
    # first newline).
    boilerplate_re = re.compile(r"---.*", re.DOTALL)
    return boilerplate_re.sub("", description)

def remove_link_or_sponsors(description):
    """
    Remove sentences containing links and sponsors or username and hashtag from the episode description

    @usernames and #hashtags are replaced by a space first; afterwards every
    sentence that mentions a link or one of the sponsor / social-media keywords
    is dropped entirely.

    Parameters
    ----------
    description : str
        The episode description

    Returns
    -------
    A description without links and sponsors (str)
    """
    username_and_hashtag_re = re.compile(r"(\B@\w+|\B#\w+)")
    # `IG` is matched as a whole word only: as a bare substring it also fires on
    # upper-case words such as "BIG" or "SIGN" and removes legitimate sentences.
    # `http` alone also covers `https`.
    links_or_sponsors_re = re.compile(
        r"(http|[pP]atreon|[eE]mail|[dD]onate|\bIG\b|[iI]nstagram|[fF]acebook|[yY]outube|[tT]witter|[dD]iscord|[fF]ollow|[sS]potify)"
    )

    # remove username and hashtag
    description = username_and_hashtag_re.sub(" ", description)

    # remove sentences containing links and sponsors
    seg = pysbd.Segmenter(language="en", clean=False)
    kept_sentences = [
        sentence
        for sentence in seg.segment(description)
        if not links_or_sponsors_re.search(sentence)
    ]
    return " ".join(kept_sentences)

def remove_emojii(description):
    """
    Remove emojii from the episode description

    Note that every run of non-ASCII characters is replaced by a single space,
    so accented letters and other non-ASCII symbols are stripped too.

    Parameters
    ----------
    description : str
        The episode description

    Returns
    -------
    A description without emojii (str)
    """
    return re.sub(r"[^\x00-\x7F]+", " ", description)

print("\nRemoving boilerplate from the episode descriptions:")
metadata_training['episode_description'] = metadata_training['episode_description'].progress_map(remove_boilerplate)

print("Removing links and sponsors from the episode descriptions:")
metadata_training['episode_description'] = metadata_training['episode_description'].progress_map(remove_link_or_sponsors)

print("Removing emojii from the episode descriptions:")
metadata_training['episode_description'] = metadata_training['episode_description'].progress_map(remove_emojii)

In [None]:
# see a few examples of comparisons between the old and new descriptions
samples = [137, 172]
print("\nExamples of comparisons before and after removing sponsors and links:")
for i in samples:
        print("BEFORE:"
                f"\n\t- {metadata_train_old['episode_description'].iloc[i]}")
        print("AFTER:"
                f"\n\t- {metadata_training['episode_description'].iloc[i]}")
        print("\n")

In [None]:
nltk.download('stopwords')

# Cache for the NLTK stop-word list, filled on first use.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def sentence_salience_score(sentence, num_descriptions, document_frequencies):
    """
    Compute the salience score of a sentence as the mean IDF score of its words.
    Only alphabetic words that are longer than one character and are neither stop words nor words like 'episode' or 'podcast'
    are considered when computing sentence salience scores.

    Parameters
    ----------
    sentence : str
        The sentence to compute the salience score for
    num_descriptions : int
        The number of descriptions in the dataset
    document_frequencies : pandas.Series
        The document frequencies in the whole dataset descriptions

    Returns
    -------
    The salience score of the sentence (float); 0.0 when no word qualifies
    """
    # loading the stop-word list once (instead of once per token) and using a
    # set makes the membership test O(1)
    stop_words = _english_stopwords()
    ignored_words = ('episode', 'podcast')

    idf_scores = []
    for word in word_tokenize(sentence):
        lower_word = word.lower()
        # consider only alphabetic words, and remove stop words, single characters
        # and words whose document frequency is unknown
        is_candidate = (
            len(lower_word) > 1
            and lower_word.isalpha()
            and lower_word not in stop_words
            and lower_word not in ignored_words
            and lower_word in document_frequencies.keys()
        )
        if is_candidate:
            # idf = log(N / df)
            idf_scores.append(np.log(num_descriptions / document_frequencies[lower_word]))

    if not idf_scores:
        return 0.0
    return np.array(idf_scores).mean()

def remove_unuseful_sentences(description, num_descriptions, word_frequencies, threshold=3.6):
    """
    Drop the sentences of a description whose salience score is not above a threshold

    Parameters
    ----------
    description : str
        The episode description
    num_descriptions : int
        The number of descriptions in the dataset
    word_frequencies : pandas.Series
        The word frequencies in the whole dataset descriptions
    threshold : double
        A sentence is kept only if its salience score is strictly greater than this value

    Returns
    -------
    A description without unuseful sentences (str)
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)

    useful_sentences = []
    for sentence in segmenter.segment(description):
        # keep the sentence only when it is salient enough
        score = sentence_salience_score(sentence, num_descriptions, word_frequencies)
        if score > threshold:
            useful_sentences.append(sentence)

    return " ".join(useful_sentences)

metadata_training['episode_description'] = metadata_training['episode_description'].progress_map(lambda x: remove_unuseful_sentences(x, metadata_training.shape[0], document_frequencies))

In [None]:
# see a few examples of comparisons between the old and new descriptions
samples = [137, 172]
print("\nExamples of comparisons before and after removing unuseful sentences:")
for i in samples:
        print("BEFORE:"
                f"\n\t- {metadata_train_old['episode_description'].iloc[i]}")
        print("AFTER:"
                f"\n\t- {metadata_training['episode_description'].iloc[i]}")
        print("\n")

In [None]:
def check_length_brass(episode, upper_bound=750, lower_bound=20):
    """
    Check that the episode description length (in characters) lies within
    [lower_bound, upper_bound], both bounds included

    Parameters
    ----------
    episode : pandas.Series
        A row from the metadata file
    upper_bound : int
        The upper bound of the episode description length
    lower_bound : int
        The lower bound of the episode description length

    Returns
    -------
    Boolean indicating if the episode description has an acceptable length
    """
    description_length = len(episode['episode_description'])
    if description_length > upper_bound:
        return False
    return description_length >= lower_bound

def description_similarity(a, b):
    """
    Measure the overlapping between two descriptions

    Parameters
    ----------
    a : str
        The first description
    b : str
        The second description

    Returns
    -------
    The `difflib.SequenceMatcher` ratio of the two descriptions (float in [0, 1])
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def check_show_description_overlap_brass(episode, thresh=0.5):
    """
    Check that the episode description does not overlap too much with the show description

    Parameters
    ----------
    episode : pandas.Series
        A row from the metadata file
    thresh : float
        The overlap (similarity ratio) that the episode description must stay strictly below

    Returns
    -------
    Boolean indicating if the episode description is different enough from the show description
    """
    overlap = description_similarity(
        episode['show_description'],
        episode['episode_description'],
    )
    return overlap < thresh

def check_other_description_overlap_brass(episode, show_episodes, thresh=0.6):
    """
    Check if the episode descriptions overlapping with the other description in the same show is not too high (< 0.6)

    An episode is rejected only when another episode of the same show has a
    strictly longer description whose similarity with it exceeds `thresh`; of
    two near-duplicate descriptions the longer one is therefore kept.

    Parameters
    ----------
    episode : pandas.Series
        A row from the metadata file
    show_episodes : dict
        Maps each show prefix to the list of (episode prefix, episode description) of its episodes
    thresh : float
        The threshold of the overlap between the episode description and the other description

    Returns
    -------
    Boolean indicating if the episode description is different enough from the other description
    """
    episode_prefix = episode['episode_filename_prefix']
    episode_description = episode['episode_description']

    for other_prefix, other_description in show_episodes[episode['show_filename_prefix']]:
        # the cheap tests come first so that the expensive similarity is only
        # computed for candidates that could actually reject the episode
        if other_prefix == episode_prefix:
            continue
        if len(episode_description) >= len(other_description):
            continue
        if description_similarity(episode_description, other_description) > thresh:
            return False
    return True


# Filter 1: keep the episodes whose description length is within the bounds of `check_length_brass`.
brass_set_lenght = metadata_training[metadata_training.progress_apply(check_length_brass, axis=1)]
print(f"Removed {len(metadata_training) - len(brass_set_lenght)} episodes ({(100-(len(brass_set_lenght)/len(metadata_training)*100)):.2f}%) because of too long or too short descriptions")

# Filter 2: drop the episodes whose description is too similar to the description of their show.
brass_set_show_overlap = brass_set_lenght[brass_set_lenght.progress_apply(check_show_description_overlap_brass, axis=1)]
print(f"Removed {len(brass_set_lenght) - len(brass_set_show_overlap)} episodes ({(100-(len(brass_set_show_overlap)/len(brass_set_lenght)*100)):.2f}%) because of too high overlap with the show description")

# Filter 3: drop the episodes whose description is too similar to a longer description of the same show.
# `show_episodes` is rebuilt here so that it only contains the episodes that survived filters 1 and 2.
show_episodes = brass_set_show_overlap.groupby(['show_filename_prefix']).apply(lambda x: list(zip(x['episode_filename_prefix'], x['episode_description']))).to_dict()
brass_set = brass_set_show_overlap[brass_set_show_overlap.progress_apply(lambda x: check_other_description_overlap_brass(x, show_episodes), axis=1)]
print(f"Removed {len(brass_set_show_overlap) - len(brass_set)} episodes ({(100-(len(brass_set)/len(brass_set_show_overlap)*100)):.2f}%) because of too high overlap with other descriptions in the same show")

In [None]:
# look at the episode descriptions removed because of the overlap with the show description.
# The removed rows are identified through their index labels: comparing row contents with
# concat + drop_duplicates(keep=False) is slow and misreports rows that are exact duplicates.
removed_by_show_overlap = ~brass_set_lenght.index.isin(brass_set_show_overlap.index)
removed_episodes_show_overlap = brass_set_lenght.loc[removed_by_show_overlap, ['show_description', 'episode_description']].copy()
removed_episodes_show_overlap['overlapping'] = [
    description_similarity(show_description, episode_description)
    for show_description, episode_description in zip(
        removed_episodes_show_overlap['show_description'],
        removed_episodes_show_overlap['episode_description'],
    )
]

num_to_visualize = 3

# never ask for more rows than are available (sampling an empty frame raises)
n_examples = min(num_to_visualize, len(removed_episodes_show_overlap))
for _, row in removed_episodes_show_overlap.sample(n_examples).iterrows():
    print(f"Episode description: \n\t{row['episode_description']}")
    print(f"Show description: \n\t{row['show_description']}")
    print(f"Overlapping score: \n\t{row['overlapping']}")
    print("\n")

In [None]:
# look at the episode descriptions removed because of the overlap with the other episode descriptions in the same show
# Rows present in only one of the two frames, i.e. the episodes dropped by the last filter.
removed_episodes_other_overlap = pd.concat([brass_set, brass_set_show_overlap]).drop_duplicates(keep=False)[['show_filename_prefix', 'episode_filename_prefix', 'episode_description']]
# Restrict the inspection to shows with exactly two episodes, where the "other" episode is unambiguous.
two_episodes_show  = {str(show_filename_prefix): show_episodes[show_filename_prefix] for show_filename_prefix in removed_episodes_other_overlap['show_filename_prefix'] if len(show_episodes[show_filename_prefix]) == 2 }
removed_episodes_other_overlap = removed_episodes_other_overlap[removed_episodes_other_overlap['show_filename_prefix'].isin(two_episodes_show.keys())]
# For every removed episode, look up the description of the other episode of its show.
other_episode_show = {}
for i, row in removed_episodes_other_overlap.iterrows():
    if row['show_filename_prefix'] in two_episodes_show:
        # each entry is an (episode prefix, description) tuple, so `in` tests membership in that tuple
        if row['episode_filename_prefix'] in two_episodes_show[row['show_filename_prefix']][0]:
            other_episode_show[row['show_filename_prefix']] = two_episodes_show[row['show_filename_prefix']][1][1]
        else:
            other_episode_show[row['show_filename_prefix']] = two_episodes_show[row['show_filename_prefix']][0][1]
removed_episodes_other_overlap['other_episode_description'] = removed_episodes_other_overlap.apply(lambda row: other_episode_show[row['show_filename_prefix']], axis=1)
removed_episodes_other_overlap['overlapping'] = removed_episodes_other_overlap.apply(lambda row: description_similarity(row['episode_description'], row['other_episode_description']), axis=1)

num_to_visualize = 3

# Print a few random examples (each `sample()` call draws one row independently).
for _ in range(num_to_visualize):
    row = removed_episodes_other_overlap.sample()
    print(f"Episode description: \n\t{row['episode_description'].values[0]}")
    print(f"Other episode description: \n\t{row['other_episode_description'].values[0]}")
    print(f"Overlapping score: \n\t{row['overlapping'].values[0]}")
    print("\n")

#### Remove non-English descriptions

In [None]:
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')
wordset = set(words.words())

def is_english(text, threshold = 0.3):
    """
    Check if the text is written in english

    The text is considered English when the share of its alphabetic tokens
    found in the NLTK `words` corpus (`wordset`) is above the threshold.

    Parameters
    ----------
    text : str
        The text to check
    threshold : float
        The threshold of the ratio of english words in the text

    Returns
    -------
    Boolean indicating if the text is written in english
    (False when the text contains no alphabetic token at all)
    """
    alpha_tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]

    # a text made only of digits/punctuation (or an empty one) cannot be
    # classified; treat it as non-English instead of dividing by zero
    if not alpha_tokens:
        return False

    dictionary_score = sum(token in wordset for token in alpha_tokens) / len(alpha_tokens)
    return dictionary_score > threshold

# remove episodes with non english description
len_old_brass_set = len(brass_set)
brass_set = brass_set[brass_set.progress_apply(lambda x: is_english(x['episode_description']), axis=1)]
print(f"Removed {len_old_brass_set - len(brass_set)} episodes ({(100-(len(brass_set)/len_old_brass_set*100)):.2f}%) because of non english description")

### BRASS SET IS READY

In [None]:
# store brass set in the data folder, i.e. the same location the following cells read it from
# (os.path.dirname() of the data folder would strip the trailing "data" component
# and write the file one level above)
brass_set.to_csv(os.path.join('/content/drive/MyDrive/682-project/data', "brass_set.tsv"), index=False, sep='\t')

In [None]:
brass_set_df = pd.read_csv('/content/drive/MyDrive/682-project/data/brass_set.tsv', delimiter='\t')
brass_set_df.head()

### Transcript filtering

In [None]:
# load brass set
brass_set = pd.read_csv(('/content/drive/MyDrive/682-project/data/brass_set.tsv'), sep='\t')
brass_set.head()

### Chunk classification

In [None]:
# !pip install sentence-transformers
!pip install rouge

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from rouge import Rouge

In [None]:
# loading the dataset from the csv file
cleaned_gold_file_path = '/content/drive/MyDrive/682-project/data/gold_set_cleaned.tsv'

dataset = pd.read_csv(cleaned_gold_file_path, sep='\t')
dataset.head()

### Chunk Selection

In [None]:
def isChunkUseful(chunk, summary, metric, threshold, verbose=False):
    """
    Decide whether a chunk of the transcript is useful with respect to the summary

    Parameters:
        - chunk: part of the transcript
        - summary: summary of a transcript
        - metric: function of arity 2 (chunk, summary) used to score the chunk against the summary
        - threshold: minimum score for the chunk to be considered useful
        - verbose: if True, print the chunk, the summary and the score
    Returns:
        - False if the score is below the threshold, True otherwise
    """
    score = metric(chunk, summary)

    if verbose:
        print(f"\tChunck: {chunk}\n\tSummary: {summary}\n\tScore: {score}")

    return False if score < threshold else True

### ROUGE-L f1-score

In [None]:
def rouge_score(candidate, reference, type='rouge-l', metric='f'):
    """
    ROUGE score of a candidate text against a reference text
    Parameters:
        candidate: candidate text
        reference: reference text
        type: type of ROUGE, it can be rouge-1, rouge-2, rouge-l (default)
        metric: precision (p), recall (r) or f-score (f) (default)
    Returns:
        the requested score of the (candidate, reference) pair
    """
    all_scores = Rouge().get_scores(candidate, reference)
    # a single pair was scored, so the result list has one entry
    pair_scores = all_scores[0]
    return pair_scores[type][metric]

In [None]:
threshold = 0.20
metric = rouge_score
verbose = False

# creation of the dataset for chunk classification:
# one feature vector and one binary target per transcript chunk.
# NOTE: `semantic_segmentation` and `extract_features` are defined in a later cell of
# this notebook; that cell must have been executed before this one.

features = []
targets = []

# initalize the model for the sentence transformer
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

for i in tqdm(range(len(dataset)), desc="Extracting features and targets"):
    if verbose: print(f"Episode: {i}")
    transcript = dataset.transcript[i]
    description = dataset.best_summary[i]

    # skip episodes without a usable transcript or summary (e.g. NaN cells read from the TSV)
    if not isinstance(transcript, str) or not isinstance(description, str):
        continue

    # the segmentation yields nothing (None) for an empty transcript
    chunks = semantic_segmentation(transcript, sentence_encoder) or []

    num_chunks = len(chunks)
    if verbose: print(f"Num chunks: {num_chunks}")

    for j, chunk in enumerate(chunks):
        if verbose: print(f"\tChunk {j}")
        features.append(extract_features(chunk, sentence_encoder))
        # a chunk is a positive example when it is similar enough to the gold summary
        is_useful = isChunkUseful(' '.join(chunk), description, metric, threshold, verbose)
        targets.append(1 if is_useful else 0)

# targets as a column vector so that they can be stacked next to the features
y = np.array(targets).reshape(-1, 1)
X = np.array(features)

In [None]:
print('y = ', y)
print('y shape = ', len(y))
print('X = ', X)
print('X shape = ', len(X))

In [None]:
# show the percentage of useful and unuseful chunks
# `y` holds one 0/1 label per chunk, so counting the ones gives the positives.
positive = y[y==1].shape[0]
negative = y.shape[0] - positive
print(f"Percentage of useful chunks: {positive/(positive+negative)*100}%")
print(f"Percentage of unuseful chunks: {negative/(positive+negative)*100}%")

# store chunk classification dataset
# Features and target are stacked side by side: the target is the last column.
chunk_classification_dataset = np.hstack((X, y))
df_chunk = pd.DataFrame(chunk_classification_dataset)
# The file is written without a header row and without the index column.
df_chunk.to_csv(('/content/drive/MyDrive/682-project/data/chunk_classification_dataset.csv'), header=False, index=False)

In [None]:
# The file was written with header=False: without header=None pandas would consume the
# first sample as column names and silently drop it from the data.
chunk_classification_dataset = pd.read_csv('/content/drive/MyDrive/682-project/data/chunk_classification_dataset.csv', header=None)
chunk_classification_dataset.head()

In [None]:
# The dataset contains 384 features and 1 target
# The target is the last column; every other column is a feature.
y = chunk_classification_dataset.iloc[:,-1]
X = chunk_classification_dataset.drop(chunk_classification_dataset.columns[[-1]], axis=1)

# Hold out 33% of the chunks for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified, so the class ratio may differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Balance the training set by undersampling: keep every positive example and
# only the first N negative examples, N being the number of positives.
X_train_positive = X_train[y_train>0]
X_train_negative = X_train[y_train==0][:X_train_positive.shape[0]]
y_train_positive = y_train[y_train>0]
y_train_negative = y_train[y_train==0][:X_train_positive.shape[0]]

# The result is ordered (all positives first, then the negatives) and overwrites
# `X_train` / `y_train`, so this cell is not idempotent when run twice.
X_train = np.vstack((X_train_positive,X_train_negative))
y_train = np.hstack((y_train_positive, y_train_negative))

In [None]:
# Neural Network for chunk classification

# Validation data is carved out of the training data with a stratified, shuffled split:
#  - the test set must not be used for validation / early stopping (it would leak into
#    model selection and make the final scores optimistic);
#  - `validation_split` is ignored by Keras whenever `validation_data` is given, and it
#    takes the LAST samples before shuffling, which here would all be negatives because
#    the balanced training set is ordered positives-first.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.15, stratify=y_train, random_state=42
)

# one 384-dimensional chunk embedding per sample (the shape must be a tuple)
inputs = keras.Input(shape=(384,))
x = keras.layers.Dense(512, activation='relu')(inputs)
x = keras.layers.Dense(256, activation='relu')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(256, activation='relu', kernel_regularizer='l2')(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Dense(128, activation='relu', kernel_regularizer='l2')(x)
# single sigmoid unit: probability that the chunk is useful
output = keras.layers.Dense(1, activation='sigmoid', kernel_regularizer='l2')(x)
model = keras.Model(inputs, output)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

history = model.fit(
    X_fit,
    y_fit,
    batch_size=16,
    epochs=15,
    validation_data=(X_val, y_val),
    # stop when the held-out loss stops improving (the training loss cannot reveal overfitting)
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)

# model.save("modelChunkNN")
model.save("/content/drive/MyDrive/682-project/modelChunkNN")

# final evaluation on the untouched test set; probabilities above 0.5 count as "useful"
y_prob = model.predict(X_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_prob]
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_score(y_test, y_pred, average=None)}")
print(f"Recall: {recall_score(y_test, y_pred, average=None)}")

In [None]:
import json
import re

def get_path(episode, transcript_path):
    """
    Get the path of the episode json file

    The path is built as
    `<transcript_path>/<second character after "show_", upper-cased>/<show prefix>/<episode prefix>.json`.
    The first character after "show_" is not used here, so `transcript_path` is
    expected to already point inside the matching first-level folder.

    Parameters
    ----------
    episode : pandas.Series
        A row from the metadata file
    transcript_path : str
        The absolute path of the folder containing the transcripts

    Returns
    -------
    path : str
        The absolute path of the episode json file

    Raises
    ------
    FileNotFoundError
        If the show prefix does not look like `show_<digit><character>...`
    """
    show_filename = episode['show_filename_prefix']
    episode_filename = episode['episode_filename_prefix'] + ".json"

    # extract the 2 reference number/letter to access the episode transcript
    match = re.match(r'show_(\d)(\w).*', show_filename)
    if match is None:
        # no transcript can exist for such a prefix; callers already treat a
        # missing file as "no transcript"
        raise FileNotFoundError(
            f"The show_filename '{show_filename}' does not match the expected pattern"
        )

    second_level_dir = match.group(2).upper()
    return os.path.join(transcript_path, second_level_dir, show_filename, episode_filename)

def get_transcription(episode, transcript_path='E:/682-project/data/spotify-podcasts-2020/podcasts-transcripts/0'):
    """
    Extract the transcript from the episode json file

    Parameters
    ----------
    episode : pandas.Series
        A row from the metadata file
    transcript_path : str
        The folder containing the transcripts, passed on to `get_path`
        (defaults to the location used so far in this notebook)

    Returns
    -------
    transcript : str
        The transcript of the episode, or an empty string when the transcript
        file does not exist or has no 'results' entry
    """
    try:
        with open(get_path(episode, transcript_path), 'r', encoding='utf-8') as f:
            episode_json = json.load(f)
    except FileNotFoundError:
        # Handle the case where the file is not found by returning an empty string
        return ""

    # If 'results' key is not found, return an empty string
    if 'results' not in episode_json:
        return ""

    # the last result is always dropped: it is treated as a repetition of earlier content
    results = episode_json['results'][:-1]

    transcripts = []
    for result in results:
        # results without any alternative, or whose first alternative has no
        # transcript, contribute an empty string
        alternatives = result.get("alternatives") or [{}]
        transcripts.append(alternatives[0].get('transcript', ""))
    return " ".join(transcripts)

def look_ahead_chuck(sentences, lower_chunk_size):
    """
    Look-ahead function to determine the next chunk

    Parameters
    ----------
    sentences : list of str
        The sentences that follow the one being processed
    lower_chunk_size : int
        The minimum size (in characters) of a chunk

    Returns
    -------
    The shortest prefix of `sentences` whose total length reaches
    `lower_chunk_size`, or all the sentences when they are too short to reach it
    """
    # a running total avoids re-summing the prefix for every candidate end
    total_length = 0
    for i, sentence in enumerate(sentences):
        total_length += len(sentence)
        if total_length >= lower_chunk_size:
            # next chunk size should be at least the lower bound
            return sentences[:i+1]

    # the remaining sentences are smaller than the lower bound: return them all
    return sentences


def semantic_segmentation(text, model, lower_chunk_size=300, upper_chunk_size=2000):
    """
    Algorithm proposed by Moro et. al. (2022) to semantically segment long inputs into GPU memory-adaptable chunks.
    https://www.aaai.org/AAAI22Papers/AAAI-3882.MoroG.pdf

    Sentences are appended to the current chunk until it reaches `lower_chunk_size`
    characters; a new chunk is started when adding a sentence would exceed
    `upper_chunk_size`. In between, a sentence joins the current chunk or opens the
    next one depending on which of the two it is more similar to (mean cosine
    similarity of the sentence embeddings).

    Parameters
    -------------
    text: str
        The text to be segmented
    model: SentenceTransformer
        The model to be used for the sentence embeddings
    lower_chunk_size: int
        The lower bound of the chunk size
    upper_chunk_size: int
        The upper bound of the chunk size
    Return
    -------
    List of chunks, each chunk being a list of sentences
    (an empty list when the text is empty or is not a string)
    """
    # nothing to segment: always return a list so that callers can iterate / len() it
    if not isinstance(text, str) or len(text) < 1:
        return []

    # segment the text into sentences
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    if not sentences:
        return []

    chunks = []
    current_chunk = [sentences[0]]
    last_idx = len(sentences) - 1

    # Iterate over the sentences in the text
    for idx in range(1, len(sentences)):
        sentence = sentences[idx]

        if idx == last_idx:
            # The last sentence always joins the last chunk. It is recognised by its
            # position: comparing its text would also match earlier sentences with the
            # same content (e.g. "Yeah.") and emit the same chunk more than once.
            current_chunk.append(sentence)
            break

        current_size = sum(len(s) for s in current_chunk)

        if current_size + len(sentence) < lower_chunk_size:
            # standardize each chunk to a minimum size to best leverage the capability of Transformers
            current_chunk.append(sentence)
        elif current_size + len(sentence) > upper_chunk_size:
            # if the chunk is too big, we add it to the list of chunks and start a new one
            chunks.append(current_chunk)
            current_chunk = [sentence]
        else:
            next_chunk = look_ahead_chuck(sentences[idx+1:], lower_chunk_size)

            # get the embedding of the previous chunk and the next chunk
            current_embedding = model.encode(current_chunk)
            next_embedding = model.encode(next_chunk)
            sentence_embedding = model.encode([sentence])

            # get the cosine similarity between the sentence and the two chunks
            score_current_chunk = util.cos_sim(sentence_embedding, current_embedding).numpy().mean()
            score_next_chunk = util.cos_sim(sentence_embedding, next_embedding).numpy().mean()

            # the sentence stays in the current chunk when it is closer to it, or when
            # the current chunk has not reached the minimum size yet
            if score_current_chunk > score_next_chunk or current_size < lower_chunk_size:
                current_chunk.append(sentence)
            else:
                chunks.append(current_chunk)
                current_chunk = [sentence]

    # flush the chunk under construction (this also covers single-sentence texts)
    chunks.append(current_chunk)
    return chunks


def extract_features(text, model):
    """
    Compute a single dense feature vector for a chunk.

    Every item obtained by iterating over `text` is passed on its own to
    `model.encode`; the chunk feature is the mean of those encodings along axis 0.

    Parameters:
        - text: iterable of sentences making up the chunk (each item is encoded separately)
        - model: object exposing an `encode` method (a sentence transformer model in this notebook)
    Returns:
        - element-wise mean of the per-sentence encodings
    """
    per_sentence = [model.encode(item) for item in text]
    return np.mean(per_sentence, axis=0)

In [None]:
brass_set = pd.read_csv(('E:/682-project/data/brass_set.tsv'), sep='\t')
brass_set.head()

In [None]:
brass_set_minimized = brass_set.copy()
brass_set_minimized.head()

In [None]:
# Row predicate: keep a row when the two characters that follow the first five
# characters of its show prefix are '00', '01', '02', '03' or '04'
def check_for_folder_conditions(row):
    """
    Tell whether a row belongs to the reduced subset of shows.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series)
        Must provide the key 'show_filename_prefix'.

    Returns
    -------
    bool
        True when the character at index 5 of the prefix is '0' and the character
        at index 6 is one of '0'..'4'. False otherwise, including when the prefix
        is not a string or is shorter than 7 characters.
    """
    prefix = row['show_filename_prefix']

    # A missing value (e.g. NaN) or a too-short prefix cannot match; report False
    # instead of raising so that one malformed row does not abort the whole filter
    if not isinstance(prefix, str) or len(prefix) < 7:
        return False

    # Indices 0-4 presumably hold the literal 'show_' (TODO confirm against the data);
    # indices 5 and 6 are therefore the first two characters that follow it
    first_char = prefix[5]
    second_char = prefix[6]

    return first_char == '0' and second_char in ('0', '1', '2', '3', '4')

# Keep only the rows for which check_for_folder_conditions returns True
# (the predicate is evaluated row by row because of axis=1)
brass_set_minimized = brass_set_minimized[brass_set_minimized.apply(check_for_folder_conditions, axis=1)]

In [None]:
brass_set_minimized.info()

In [None]:
num_rows = brass_set_minimized.shape[0]
print("Number of rows:", num_rows)

In [None]:
brass_set_minimized.head()

In [None]:
def transcript_filtering(episode, chunk_classifier, sentence_encoder, tokenizer, test_set=False):
    """
    Extract the most salient chunks inside the transcript of an episode

    The transcript is split into chunks, every chunk is scored by the classifier,
    and the best-scoring chunks are kept as long as their cumulated token count
    stays within 1024 tokens. The kept chunks are returned in their original order.

    Parameters
    ----------
    episode : pandas.Series
        The episode to extract the chunks from
    chunk_classifier : tf.Model
        The classifier used to score the chunks (only `predict` is called)
    sentence_encoder : SentenceTransformer
        The encoder used to segment the transcript and to encode the sentences
    tokenizer : AutoTokenizer
        The tokenizer used to count the tokens of each chunk
    test_set : Boolean
        Documented as: if True, the transcriptions will be searched in the test set
        directory, otherwise in the training set directory (default: False).
        NOTE(review): the flag is accepted but never forwarded to `get_transcription`,
        so it currently has no effect here - confirm against that helper.

    Return
    ------
    str or None
        Transcript after the selection of the most salient chunks, or None when
        the segmentation step returns None
    """

    # extraction of chunks from the episode
    chunks = semantic_segmentation(get_transcription(episode), sentence_encoder)
    if chunks is None:
        return None

    # extraction of features for each chunk
    features = np.array([extract_features(chunk, sentence_encoder) for chunk in chunks])

    # prediction of the classifier
    y = chunk_classifier.predict(features)

    # score for each chunk, sorted by decreasing probability to be relevant
    scores = [{'idx': i, 'relevance': y[i]} for i in range(len(chunks))]
    scores.sort(key=lambda e: e['relevance'], reverse=True)

    # greedily keep the best chunks while the token budget is respected
    max_tokens = 1024
    count = 0
    selected_idx = []
    for entry in scores:
        count += len(tokenizer(' '.join(chunks[entry['idx']]))['input_ids'])
        if count > max_tokens:
            # this chunk overflows the budget: stop, even if it is the last one
            break
        selected_idx.append(entry['idx'])

    # a single chunk larger than the budget would otherwise produce an empty
    # transcript; keep the most relevant chunk in that case
    if not selected_idx and scores:
        selected_idx.append(scores[0]['idx'])

    # reorder chunks in the original order and return the new transcript
    relevant_chunks = [' '.join(chunks[idx]) for idx in sorted(selected_idx)]
    return ' '.join(relevant_chunks)


# Load the chunk classifier, the BART tokenizer (passed to transcript_filtering to
# count tokens) and the sentence encoder
chunk_classifier = tf.keras.models.load_model("E:/682-project/model/modelChunkNN")
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Filter the transcript of every episode of the reduced set, with a progress bar;
# transcript_filtering may return None for an episode
brass_set_minimized['filtered_transcript'] = brass_set_minimized.progress_apply(lambda x: transcript_filtering(x, chunk_classifier, sentence_encoder, tokenizer), axis=1)
# Variant that processes the full set instead of the reduced one (disabled):
# transcripts = brass_set.progress_apply(lambda x: transcript_filtering(x, chunk_classifier, sentence_encoder, tokenizer), axis=1)

# Persist only the columns needed for fine-tuning
brass_set_minimized[['episode_uri','filtered_transcript', 'episode_description']].to_csv(("E:/682-project/data/filtered_set_minimized.csv"), index=False)
print("Filtering done!")

In [None]:
# Count the missing values per column of the filtered set.
# The CSV is read in this cell so that it runs on a fresh kernel without relying
# on a variable defined by a cell further down.
filtered_set_minimized = pd.read_csv("E:/682-project/data/filtered_set_minimized.csv")
null_counts = filtered_set_minimized.isna().sum()
print(null_counts)

In [None]:
filtered_set_minimized = pd.read_csv("E:/682-project/data/filtered_set_minimized.csv")
filtered_set_minimized.head()

Unnamed: 0,episode_uri,filtered_transcript,episode_description
0,spotify:episode:007I0vUfkdTg8FvE5WLFIl,This is the Premier League preview podcast fol...,It's Matchweek 25. This week we preview the ...
1,spotify:episode:01X3QMRerbUz0YJ47atAAc,You date me you like you get to know me. You ...,"On this weeks Taste of Taylor, Taylor intervie..."
2,spotify:episode:01rlULldcOvtmXKi70zcco,"My name is Giovanni, Georgia. But everybody c...","In this episode, Giorgio Moroder joins David W..."
3,spotify:episode:02cmc8Kj2L214GakIX6s0w,Just wanted to take a brief moment to give you...,Your Host Dustin Nichols sits down with an old...
4,spotify:episode:039r0XO3svhB04rQmDKwa8,Hello and welcome to the volunteer firefighter...,This Week we celebrate St Valentine's day as C...


### Training

In [None]:
dataset = load_dataset('csv', data_files=("E:/682-project/data/filtered_set_minimized.csv"))

# 90/10 split with a fixed seed so that the same examples end up in each split on
# every run; the splits are read by name instead of relying on the order of .values()
split = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_set, validation_set = split['train'], split['test']
print(f"Training set size: {train_set.num_rows}")
print(f"Validation set size: {validation_set.num_rows}")

Training set size: 693
Validation set size: 77


In [None]:
# Check train_set
# print(type(train_set))
for i, sample in enumerate(train_set):
    if i>5: break
    print(sample)

In [None]:
from evaluate import load

metric = load("rouge")
# !pip install rouge_score

### Try model

In [None]:
from transformers import BartTokenizer, TFBartForConditionalGeneration

# Initialize the BART model and tokenizer
pretrained_model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(pretrained_model_name)
model = TFBartForConditionalGeneration.from_pretrained(pretrained_model_name)

# Prepare the data
input_texts = [item['filtered_transcript'] for item in train_set]
target_texts = [item['episode_description'] for item in train_set]

# Tokenize and encode the input and target sequences
inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="tf")
targets = tokenizer(target_texts, padding=True, truncation=True, return_tensors="tf")

# Create TensorFlow datasets
dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), dict(targets)))
train_dataset = dataset.shuffle(len(train_set)).batch(2)

# Define optimizer (the loss is computed by the model itself from `labels`)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step, (source_batch, target_batch) in enumerate(train_dataset):
        target_ids = target_batch['input_ids']
        # Padding positions of the target are set to -100 so that they are
        # ignored by the loss
        labels = tf.where(
            target_batch['attention_mask'] == 1,
            target_ids,
            tf.cast(-100, target_ids.dtype),
        )

        with tf.GradientTape() as tape:
            # Passing `labels` (and no decoder_input_ids) lets the model build the
            # right-shifted decoder inputs itself; feeding the labels unshifted as
            # decoder inputs would let the decoder see the token it has to predict
            outputs = model(
                input_ids=source_batch['input_ids'],
                attention_mask=source_batch['attention_mask'],
                labels=labels,
                training=True,
                return_dict=True,
            )
            loss = tf.reduce_mean(outputs.loss)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if step % 100 == 0:
            print(f"Step {step}/{len(train_set)//2}, Loss: {float(loss):.4f}")

print("Training complete.")

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Epoch 1/3
Step 0/346, Loss: 10.5663


### Preprocessing data

In [None]:
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def batch_tokenize_preprocess(dataset, text_column, summary_column, tokenizer, max_input_length, max_target_length):
    """
    Tokenize the transcripts and the summaries of a batch.

    Parameters
    ----------
    dataset : Dataset
        The batch to preprocess; indexed by column name
    text_column : str
        Name of the column holding the transcripts
    summary_column : str
        Name of the column holding the summaries
    tokenizer : AutoTokenizer
        The tokenizer applied to both columns
    max_input_length : int
        Truncation length for the transcripts
    max_target_length : int
        Truncation length for the summaries

    Returns
    -------
    The tokenizer output for the transcripts, extended with a "labels" entry
    holding the input ids of the tokenized summaries
    """
    transcripts = dataset[text_column]
    summaries = dataset[summary_column]

    encoded = tokenizer(transcripts, max_length=max_input_length, truncation=True)

    # the summaries are tokenized with the tokenizer switched to target mode
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(summaries, max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
max_input_length = 1024
max_target_length = 256

# Both splits go through the same preprocessing: transcripts are the inputs,
# episode descriptions are the targets
tokenize_batch = partial(
    batch_tokenize_preprocess,
    text_column="filtered_transcript",
    summary_column="episode_description",
    tokenizer=tokenizer,
    max_input_length=max_input_length,
    max_target_length=max_target_length,
)

# The original columns are removed so that only the tokenizer output remains
train_set_tokenized = train_set.map(
    tokenize_batch,
    batched=True,
    remove_columns=train_set.column_names,
    desc="Running tokenizer on train dataset"
)

validation_set_tokenized = validation_set.map(
    tokenize_batch,
    batched=True,
    remove_columns=validation_set.column_names,
    desc="Running tokenizer on validation dataset"
)

In [None]:
# for i, sample in enumerate(train_set_tokenized):
#     if i>5: break
#     print(sample)

train_set_tokenized_df = train_set_tokenized.to_pandas()

# Display the first few rows of the DataFrame
train_set_tokenized_df.head()

In [None]:
train_set_tokenized_df.isna().any()

In [None]:
train_set_tokenized_df.info()

In [None]:
validation_set_tokenized_df = validation_set_tokenized.to_pandas()

# Display the first few rows of the DataFrame
validation_set_tokenized_df.head()

In [None]:
validation_set_tokenized_df.isna().any()

In [None]:
validation_set_tokenized_df.info()

### Fine-tuning the model

In [None]:
# prepare the model
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# parameters for the training
batch_size = 2
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 3

In [None]:
# Training pipeline: shuffled batches, assembled by the seq2seq data collator
train_dataset = train_set_tokenized.to_tf_dataset(
    batch_size=batch_size,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
# Validation pipeline: same columns and collator, without shuffling
validation_dataset = validation_set_tokenized.to_tf_dataset(
    batch_size=batch_size,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)
# Subset handed to the metric callback: at most 200 validation examples, picked
# after a shuffle that is called without a seed
generation_dataset = (
    validation_set_tokenized
    .shuffle()
    .select(range(min(200, len(validation_set_tokenized))))
    .to_tf_dataset(
        batch_size=batch_size,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [None]:
for i, sample in enumerate(train_dataset):
    if i>5: break
    print(sample)

In [None]:
# Plain Adam is used; the AdamWeightDecay alternative below is disabled, so the
# `weight_decay` value defined with the training parameters is not applied
# optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-6)
# No loss is passed to compile() - presumably the model falls back to its own
# internal loss; TODO confirm for the installed transformers version
model.compile(optimizer=optimizer)

In [None]:
def metric_fn(eval_predictions):
    """
    Compute ROUGE F-measures (as percentages) and the mean generated length.

    Parameters
    ----------
    eval_predictions : tuple
        Pair (predictions, labels) of token id sequences

    Returns
    -------
    dict
        One entry per ROUGE key returned by the metric (F-measure * 100) plus
        "gen_len", the mean number of non-padding tokens per prediction
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace masked label tokens by the padding id; done on copies so that the
    # arrays received from the caller are left untouched
    labels = [
        np.where(np.asarray(label) < 0, tokenizer.pad_token_id, label) for label in labels
    ]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Rouge expects a newline after each sentence
    decoded_predictions = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_predictions
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]
    result = metric.compute(
        predictions=decoded_predictions, references=decoded_labels, use_stemmer=True
    )
    # Depending on the library version the metric yields either aggregate score
    # objects (exposing `.mid.fmeasure`) or plain float F-measures; accept both
    result = {
        key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
        for key, value in result.items()
    }
    # Add mean generated length
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result


# Base path of the fine-tuned model artefacts; the TensorBoard logs are written below it
model_path = "E:/682-project/model/bart-large-finetuned/filtered-spotify-podcast-summ"

# One log directory per run, named after the current timestamp
log_dir = model_path + "/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir)

# Evaluates metric_fn on generation_dataset, using generated sequences
# (predict_with_generate=True)
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback, tensorboard_callback]

In [None]:
# Sanity check: every input token id of the tokenized training set must be a
# valid index with respect to the tokenizer's vocabulary size

# Get token IDs
token_ids = train_set_tokenized["input_ids"]

# Largest and smallest token id over all sequences
max_token_id = max([max(seq) for seq in token_ids])
min_token_id = min([min(seq) for seq in token_ids])

# Get the vocabulary size of the tokenizer
vocab_size = tokenizer.vocab_size

# Report the observed range next to the vocabulary size
print(f"Maximum Token ID: {max_token_id}")
print(f"Minimum Token ID: {min_token_id}")
print(f"Tokenizer Vocabulary Size: {vocab_size}")

# Valid ids lie in [0, vocab_size)
if min_token_id >= 0 and max_token_id < vocab_size:
    print("Token IDs fall within the tokenizer's vocabulary range.")
else:
    print("Token IDs fall outside the tokenizer's vocabulary range.")

In [None]:
# fine-tune the model
history = model.fit(
    train_dataset, validation_data=validation_dataset, epochs=num_train_epochs, callbacks=callbacks
)
history = history.history

In [None]:
hub_model_id = "bart-large-finetuned-filtered-spotify-podcast-summ"
model.push_to_hub(hub_model_id)
tokenizer.push_to_hub(hub_model_id)

### History of the fine-tuning

In [None]:
# convert the history.history dict to a pandas DataFrame and store it as CSV
path_model_history = os.path.join(model_path, 'history')
os.makedirs(path_model_history, exist_ok=True)

df_history = pd.DataFrame(history)
# The path is handed to pandas directly: writing through a text-mode handle opened
# without newline='' produces blank lines between the rows on Windows
df_history.to_csv(os.path.join(path_model_history, "history.csv"))

In [None]:
# Restore history
cols = ['loss','val_loss','rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'gen_len']
path_model_history = os.path.join(model_path, 'history')
history = pd.read_csv(os.path.join(path_model_history, "history.csv"), usecols=cols)

In [None]:
def plot_history(models_history, keys, model_names=None, labels=("epochs", "metrics"), y_scale="linear", figsize=(10,5), cmap='rainbow'):
    """
    Plot the history of the metrics in the history dictionary for each model.
        :param models_history: array of dictionary of the metric history for each model
        :param keys: list of keys of the metrics to plot
        :param model_names: list of names of the models; a model without a name is
            labelled "model <index>"
        :param labels: list of labels of the axes
        :param y_scale: scale of the y axis, passed to plt.yscale
        :param figsize: size of the figure
        :param cmap: color map used for the plot
    """
    if model_names is None:
        model_names = []

    # maps each metric key to a distinct color of the color map
    colors = plt.get_cmap(cmap, len(keys))

    plt.figure(figsize=figsize)

    # for each model trained
    for i, history in enumerate(models_history):
        name = model_names[i] if i < len(model_names) else f"model {i}"
        # one curve per requested metric
        for j, key in enumerate(keys):
            plt.plot(history[key], label=f"{name} {key}", linestyle="solid", color=colors(j))

    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.yscale(y_scale)

    # Adding legend
    plt.legend(
          title ="Legend",
          loc ="best",
          bbox_to_anchor=(1, 0.5))
    plt.title("Training history")
    plt.grid(linestyle='--', linewidth=1)
    plt.show()

model_history = [history]
model_names = ["BART fine-tuning"]
plot_history(model_history, keys=['loss', 'val_loss'], model_names=model_names, labels=("epochs", "loss"), figsize=(6,3))
plot_history(model_history, keys=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], model_names=model_names, labels=("epochs", "rouge"), figsize=(6,3), cmap="cool")

### Evaluation

In [None]:
metadata_path_test = os.path.join(dataset_path, "spotify-podcasts-2020", "metadata-summarization-testset.tsv")
metadata_test = pd.read_csv(metadata_path_test, sep='\t')
print("Columns: ", metadata_test.columns)
print("Shape: ", metadata_test.shape)

In [None]:
# Drop (in place) the rows with a missing episode description or show description,
# then discard the episodes whose description is empty once whitespace is stripped
metadata_test.dropna(subset=['episode_description', 'show_description'], inplace=True)
metadata_test = metadata_test[[len(desc.strip()) > 0 for desc in metadata_test['episode_description']]]
print("Test set size after dropping NaN values: \n", metadata_test.shape)

In [None]:
# load the tokenizer from the hub
model_finetuned_checkpoint = "gmurro/bart-large-finetuned-filtered-spotify-podcast-summ"
tokenizer = AutoTokenizer.from_pretrained(model_finetuned_checkpoint)

### Transcript filtering for the test set

In [None]:
chunk_classifier = keras.models.load_model("modelChunkNN")
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

metadata_test['filtered_transcript'] = metadata_test.progress_apply(lambda x: transcript_filtering(x, chunk_classifier, sentence_encoder, tokenizer, test_set=True), axis=1)

metadata_test[['episode_uri','filtered_transcript', 'episode_description']].to_csv(os.path.join(dataset_path, "filtered_testset.csv"), index=False)
print("Filtering done!")

### Evaluating the model

In [None]:
# load the filtered test set
test_set = load_dataset('csv', data_files=os.path.join(dataset_path, "filtered_testset.csv"))['train']
print(f"Test set size: {test_set.num_rows}")

In [None]:
def dataset_to_tf(dataset, tokenizer, model, max_input_length = 1024, max_target_length = 256, eval_batch_size = 2):
    """
    Tokenize a dataset and expose it as a batched tensorflow dataset.

    Parameters
    ----------
    dataset : Dataset
        The dataset to convert; its "filtered_transcript" and
        "episode_description" columns are tokenized.
    tokenizer : AutoTokenizer
        The tokenizer to use.
    model : TFAutoModelForSeq2SeqLM
        The model handed to the data collator.
    max_input_length : int
        The maximum length of the input sequences. Default: 1024.
    max_target_length : int
        The maximum length of the target sequences. Default: 256.
    eval_batch_size : int
        The batch size used for evaluation. Default: 2.

    Returns
    -------
    The tensorflow dataset, not shuffled, with the columns "input_ids",
    "attention_mask" and "labels"
    """

    def _tokenize(batch):
        # transcripts are the inputs, episode descriptions the targets
        return batch_tokenize_preprocess(
            batch, "filtered_transcript", "episode_description", tokenizer, max_input_length, max_target_length
        )

    # tokenize the set, dropping the original columns
    tokenized = dataset.map(
        _tokenize,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on the given dataset"
    )

    collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

    return tokenized.to_tf_dataset(
        batch_size=eval_batch_size,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=collator,
    )


def postprocess_text(preds, labels):
    """
    Strip every prediction and label and put each of their sentences on its own line.

    Returns the pair (preds, labels) as two new lists of strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)

def predict(test_dataset, model, tokenizer, gen_kwargs, eval_batch_size):
    """
    Generate predictions for the test set

    Parameters
    ----------
    test_dataset : datasets.Dataset
        Test set
    model : tf.keras.Model
        Model to use for generation
    tokenizer : transformers.AutoTokenizer
        Tokenizer to use for generation
    gen_kwargs : dict
        Keyword arguments for the generation
    eval_batch_size : int
        Batch size for evaluation

    Returns
    -------
    Pair corresponding to the list of predictions and the list of labels
    """
    predictions = []
    references  = []

    # convert the dataset to a tensorflow dataset prebatched
    testset_tf = dataset_to_tf(test_dataset, tokenizer, model, max_input_length = 1024, max_target_length = 256, eval_batch_size = eval_batch_size)

    # number of batches, counting a final partial batch (ceiling division) so that
    # the progress bar total matches the number of iterations
    n_batches = (len(test_dataset) + eval_batch_size - 1) // eval_batch_size

    # generate the predicted summaries
    for batch in tqdm(testset_tf, desc="Generating summaries", total=n_batches):
        labels = batch.pop("labels")
        batch.update(gen_kwargs)
        generated_tokens = model.generate(**batch)
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # masked label positions (-100) are mapped back to the padding id before decoding
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        predictions.extend(decoded_preds)
        references.extend(decoded_labels)
    return predictions, references

In [None]:
model_checkpoint_finetuned = "gmurro/bart-large-finetuned-filtered-spotify-podcast-summ"
tokenizer_finetuned = AutoTokenizer.from_pretrained(model_checkpoint_finetuned)
model_finetuned = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_finetuned)

model_checkpoint_pretrained = "facebook/bart-large-cnn"
tokenizer_pretrained = AutoTokenizer.from_pretrained(model_checkpoint_pretrained)
model_pretrained = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_pretrained)

In [None]:
# bart generation parameters, shared by both models
gen_kwargs = {
    "length_penalty": 2.0,
    "num_beams": 4,
    "no_repeat_ngram_size": 3,
    "min_length": 39,
    "max_length": 250
    }

# predict on the test set with the finetuned model
predictions_ft, references_ft = predict(test_set, model_finetuned, tokenizer_finetuned, gen_kwargs, eval_batch_size=2)

# predict on the test set with the pretrained model (note the different batch size)
predictions_pt, references_pt = predict(test_set, model_pretrained, tokenizer_pretrained, gen_kwargs, eval_batch_size=1)
print("Predictions done!")

In [None]:
references_ft[10].replace("\n", " ")

In [None]:
num_to_visualize = 3

for _ in range(num_to_visualize):
    i = np.random.randint(0, len(references_ft))

    ref = references_ft[i].replace('\n', ' ')
    ft = predictions_ft[i].replace('\n', ' ')
    pt = predictions_pt[i].replace('\n', ' ')
    print(f"Creator-provided description: \n\t{ref}")
    print(f"Fine-tuned model prediction: \n\t{ft}")
    print(f"Pre-trained model prediction: \n\t{pt}")
    print("\n")

In [None]:
# read predictions and references (if stored)
df_pred = pd.read_csv(os.path.join(dataset_path, "predictions_ft.csv"))
predictions_ft = df_pred['predictions'].tolist()
references_ft = df_pred['references'].tolist()

df_pred = pd.read_csv(os.path.join(dataset_path, "predictions_pt.csv"))
predictions_pt = df_pred['predictions'].tolist()
references_pt = df_pred['references'].tolist()

In [None]:
def rouge_evaluation(predictions, references):
    """
    Evaluate the ROUGE score for the given predictions and references

    Parameters
    ----------
    predictions : list
        List of predictions
    references : list
        List of references

    Returns
    -------
    pandas.DataFrame
        One row per ROUGE key with the columns 'precision', 'recall' and 'f1',
        rounded to 4 decimals. When the metric only reports a plain float
        F-measure per key, 'precision' and 'recall' are NaN.
    """
    rouge = evaluate.load('rouge')
    results_rouge = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

    rows = {}
    for key, value in results_rouge.items():
        if hasattr(value, "mid"):
            # aggregate score object: precision, recall and F-measure are available
            rows[key] = (value.mid.precision, value.mid.recall, value.mid.fmeasure)
        else:
            # plain float: only the F-measure is reported
            rows[key] = (float("nan"), float("nan"), value)

    df_rouge = pd.DataFrame({'precision': [round(precision, 4) for precision, _, _ in rows.values()],
                             'recall': [round(recall, 4) for _, recall, _ in rows.values()],
                             'f1': [round(fmeasure, 4) for _, _, fmeasure in rows.values()]},
                             index=rows.keys())

    return df_rouge

def bertscore_evaluation(predictions, references, idf_weighting=True):
    """
    Evaluate the BERTScore for the given predictions and references

    Parameters
    ----------
    predictions : list
        List of predictions
    references : list
        List of references
    idf_weighting : bool
        Whether to use idf weighting

    Returns
    -------
    pandas.DataFrame
        Rows "mean" and "std", columns 'precision', 'recall' and 'f1',
        rounded to 4 decimals
    """
    scores = bert_score.score(cands=predictions, refs=references, lang="en", model_type="microsoft/deberta-xlarge-mnli", num_layers=40, idf=idf_weighting)
    precision, recall, fmeasure = scores

    def _mean_and_std(values):
        # summary of one score vector: [mean, standard deviation]
        return [round(values.mean().item(), 4), round(values.std().item(), 4)]

    summary = {
        'precision': _mean_and_std(precision),
        'recall': _mean_and_std(recall),
        'f1': _mean_and_std(fmeasure),
    }
    return pd.DataFrame(summary, index=["mean", "std"])

### Evaluation of the fine-tuned model bart-large-finetuned-filtered-spotify-podcast-summ

In [None]:
# compute the ROUGE score on fine-tuned model
rouge_ft = rouge_evaluation(predictions_ft, references_ft)
rouge_ft

In [None]:
# compute the BERT score with IDF on fine-tuned model
bertscore_ft = bertscore_evaluation(predictions_ft, references_ft, idf_weighting=True)
bertscore_ft

In [None]:
# compute the BERT score without IDF on fine-tuned model
bertscore_ft = bertscore_evaluation(predictions_ft, references_ft, idf_weighting=False)
bertscore_ft