# Scoring and Citations Testbed
---

The objectives of this notebook are to:
1. Work out how to perform scoring as described in the [paper](papers/TRLM_2412.02626.pdf)
2. Experiment with linear search for citation attribution

To explore further:
1. Experiment with binary and exclusion search
2. Experiment with retrieval

## Import Libraries

In [2]:
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize



In [3]:
# Notebook-wide pandas display settings: allow up to 200 characters per cell
# and format floats with four decimal places.
pd.options.display.max_colwidth = 200
pd.options.display.float_format = '{:.4f}'.format

## Define Util Functions

In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if t.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# TODO: may be helpful to move this over to a utils.py later, or define the models as separate classes?

# Load models
def load_models():
    """
    Load the forward and backward language models together with their tokenizers.

    Both models are moved to the module-level `device`; the tokenizers stay
    as returned by `AutoTokenizer.from_pretrained`.

    Returns:
        tuple: (fo_model, fo_tokenizer, ba_model, ba_tokenizer)
    """
    # Forward model: pinned to a fixed revision, cached per revision
    forward_repo = "EleutherAI/pythia-160m-deduped"
    forward_options = {
        "revision": "step143000",
        "cache_dir": "./.cache/pythia-160m-deduped/step143000",
    }
    fo_model = GPTNeoXForCausalLM.from_pretrained(forward_repo, **forward_options).to(device)
    fo_tokenizer = AutoTokenizer.from_pretrained(forward_repo, **forward_options)

    # Backward model: no revision pin, only a cache location
    backward_repo = "afterless/reverse-pythia-160m"
    backward_options = {"cache_dir": "./.cache/reverse-pythia-160m"}
    ba_model = GPTNeoXForCausalLM.from_pretrained(backward_repo, **backward_options).to(device)
    ba_tokenizer = AutoTokenizer.from_pretrained(backward_repo, **backward_options)

    return fo_model, fo_tokenizer, ba_model, ba_tokenizer

In [6]:
# TODO: same as above

# Load dataset
def load_cnn_dataset(num_samples=10):
    """
    Load the first `num_samples` training examples of CNN/DailyMail ("3.0.0").

    If anything goes wrong while loading, the error is printed and a tiny
    two-example synthetic dataset with the same columns ('article',
    'highlights', 'id') is returned instead, so the rest of the notebook can
    still be exercised.

    Args:
        num_samples (int): Maximum number of training examples to keep.

    Returns:
        A `Dataset` on both the success path and the fallback path.
    """
    try:
        # Try with a specific cache directory
        dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir=".cache")
        print("Dataset loaded successfully")

        # Splits are dictionary keys of the loaded object, not attributes, so
        # the split has to be looked up with ['train'].
        train_split = dataset['train']

        # Verify the structure - this helps debug
        if num_samples > 0:
            print("Example dataset item:", train_split[0])

        # Take only a small sample for testing; `select` keeps the result a
        # Dataset, matching what the synthetic fallback below returns.
        sample_count = min(num_samples, len(train_split))
        return train_split.select(range(sample_count))

    except Exception as e:
        # Deliberate best-effort: any failure falls back to synthetic data.
        print(f"Error loading full dataset: {e}")

        # Create a tiny synthetic dataset for testing
        print("Creating synthetic test dataset instead...")

        sample_data = {
            'article': [
                "John likes to play basketball. He goes to the court every evening. His friends join him on weekends.",
                "The company announced record profits. Investors were pleased. The stock price increased by 10%."
            ],
            'highlights': [
                "John plays basketball regularly with friends.",
                "Company profits lead to stock price increase."
            ],
            'id': ['test1', 'test2']  # Added ID field
        }

        return Dataset.from_dict(sample_data)

In [7]:
def calculate_baseline_score(query, answer, model, tokenizer, task='citation', backward=False, debug=False):
    """
    Score `answer` conditioned on `query` followed by a task-specific linking phrase.

    The context is `query + conditioning_prompt` and the scored target is
    `answer`; the returned log probabilities cover the target tokens only.

    Args:
        query (str): Conditioning text, placed before the linking phrase.
        answer (str): Text whose tokens are scored.
        model: Causal language model. It is called as `model(input_ids)`, must
            return an object with `.logits`, and must expose `.device`.
        tokenizer: Tokenizer matching `model` (`encode` and `decode` are used).
        task (str): 'citation' selects the summary phrasing; any other value
            selects the question-answering phrasing.
        backward (bool): If True, use the backward linking phrase and flip the
            token order of the context and of the target (each separately)
            before concatenating them.
        debug (bool): If True, print the context, the target and the full text.

    Returns:
        dict: 'token_log_probs' (list of dicts with 'token', 'token_id' and
        'log_prob', one per target token), 'sequence_log_prob' (sum),
        'normalized_log_prob' (per-token mean) and 'perplexity'.

    Raises:
        ValueError: If the context or the target tokenizes to zero tokens.
    """

    # The paper describes "Score" as conditional distribution (Section 4) which means the Log Probability, and therefore
    # this reimplementation uses Log Probability.

    # First, prepare the texts
    if not backward:
        # Forward, assumes this is a simple reversal to (P Answer|Query)
        conditioning_prompt = ' is summarized by ' if task == 'citation' else ' is answered by '
    else:
        # Backward
        conditioning_prompt = ' is a summary of ' if task == 'citation' else ' has an answer to '

    if debug:
        print(f"Context: {query + conditioning_prompt}")
        print(f"Target: {answer}")

    context_ids = tokenizer.encode(query + conditioning_prompt, return_tensors="pt")
    target_ids = tokenizer.encode(answer, return_tensors="pt")

    # Lengths are needed later to pick the target positions out of the joint sequence
    target_len = target_ids.shape[1]
    context_len = context_ids.shape[1]

    # Without at least one context and one target token there is nothing to
    # score; fail early with a clear message instead of dividing by zero later.
    if context_len == 0 or target_len == 0:
        raise ValueError("context and answer must each tokenize to at least one token")

    if backward:
        # We need to reverse the tokens in backward
        target_ids = t.flip(target_ids, (1,))
        context_ids = t.flip(context_ids, (1,))

    input_ids = t.cat((context_ids, target_ids), dim=1).to(model.device)

    if debug:
        print(query + conditioning_prompt + answer)

    # Get model output
    with t.no_grad():
        logits = model(input_ids).logits

    # logits[0, i] predicts the token at position i + 1, so the target tokens at
    # positions [context_len, context_len + target_len) are predicted by the
    # logits at [context_len - 1, context_len + target_len - 1).
    target_logits = logits[0, context_len - 1:context_len + target_len - 1, :].float()
    target_token_ids = input_ids[0, context_len:context_len + target_len]

    # log_softmax stays finite where log(softmax(x)) would underflow to -inf.
    all_log_probs = F.log_softmax(target_logits, dim=-1)
    target_log_probs = all_log_probs.gather(1, target_token_ids.unsqueeze(1)).squeeze(1)

    token_probs = [
        {
            'token': tokenizer.decode([token_id]),
            'token_id': token_id,
            'log_prob': log_prob
        }
        for token_id, log_prob in zip(target_token_ids.tolist(), target_log_probs.tolist())
    ]

    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Perplexity is the exponential of the negative mean log probability
    perplexity = np.exp(-normalized_log_prob)

    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }

In [8]:
def calculate_llm_score(query, answer, model, tokenizer, task='citation', backward=False, debug=False):
    """
    Score `query` conditioned on `answer` followed by a task-specific linking phrase.

    The context is `answer + conditioning_prompt` and the scored target is
    `query`, i.e. P(Query|Answer) in the notation used for comparison with the
    paper. The returned log probabilities cover the target tokens only.

    Args:
        query (str): Text whose tokens are scored.
        answer (str): Conditioning text, placed before the linking phrase.
        model: Causal language model. It is called as `model(input_ids)`, must
            return an object with `.logits`, and must expose `.device`.
        tokenizer: Tokenizer matching `model` (`encode` and `decode` are used).
        task (str): 'citation' selects the summary phrasing; any other value
            selects the question-answering phrasing.
        backward (bool): If True, use the backward linking phrase and flip the
            token order of the context and of the target (each separately)
            before concatenating them.
        debug (bool): If True, print the context and the target.

    Returns:
        dict: 'token_log_probs' (list of dicts with 'token', 'token_id' and
        'log_prob', one per target token), 'sequence_log_prob' (sum),
        'normalized_log_prob' (per-token mean) and 'perplexity'.

    Raises:
        ValueError: If the context or the target tokenizes to zero tokens.
    """

    # The paper describes "Score" as conditional distribution (Section 4) which means the Log Probability, and therefore
    # this reimplementation uses Log Probability.

    # First, prepare the texts
    if not backward:
        # Forward
        conditioning_prompt = ' is a summary of ' if task == 'citation' else ' has an answer to '
    else:
        # Backward
        conditioning_prompt = ' is summarized by ' if task == 'citation' else ' is answered by '

    if debug:
        print(f"Context: {answer + conditioning_prompt}")
        print(f"Target: {query}")

    target_ids = tokenizer.encode(query, return_tensors="pt")
    context_ids = tokenizer.encode(answer + conditioning_prompt, return_tensors="pt")

    # Lengths are needed later to pick the target positions out of the joint sequence
    target_len = target_ids.shape[1]
    context_len = context_ids.shape[1]

    # Without at least one context and one target token there is nothing to
    # score; fail early with a clear message instead of dividing by zero later.
    if context_len == 0 or target_len == 0:
        raise ValueError("context and query must each tokenize to at least one token")

    if backward:
        # We need to reverse the tokens in backward
        target_ids = t.flip(target_ids, (1,))
        context_ids = t.flip(context_ids, (1,))

    input_ids = t.cat((context_ids, target_ids), dim=1).to(model.device)

    # Get model output
    with t.no_grad():
        logits = model(input_ids).logits

    # logits[0, i] predicts the token at position i + 1, so the target tokens at
    # positions [context_len, context_len + target_len) are predicted by the
    # logits at [context_len - 1, context_len + target_len - 1).
    target_logits = logits[0, context_len - 1:context_len + target_len - 1, :].float()
    target_token_ids = input_ids[0, context_len:context_len + target_len]

    # log_softmax stays finite where log(softmax(x)) would underflow to -inf.
    all_log_probs = F.log_softmax(target_logits, dim=-1)
    target_log_probs = all_log_probs.gather(1, target_token_ids.unsqueeze(1)).squeeze(1)

    token_probs = [
        {
            'token': tokenizer.decode([token_id]),
            'token_id': token_id,
            'log_prob': log_prob
        }
        for token_id, log_prob in zip(target_token_ids.tolist(), target_log_probs.tolist())
    ]

    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Perplexity is the exponential of the negative mean log probability
    perplexity = np.exp(-normalized_log_prob)

    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf_score(highlight, sentences, citation):
    """
    Return the highest TF-IDF similarity between `highlight` and any of `sentences`.

    The vectorizer is fitted on `[highlight, citation, *sentences]`, so
    `citation` contributes to the vocabulary and IDF weights, but its own
    similarity to the highlight (index 1) is skipped by the `[2:]` slice.
    NOTE(review): if the intent is to score the highlight against `citation`
    itself, index 1 is the value that would be needed - confirm with the caller.

    Args:
        highlight (str): Reference text (row 0 of the TF-IDF matrix).
        sentences (list[str]): Candidate sentences (rows 2 onwards).
        citation (str): Extra document included in the fit (row 1).

    Returns:
        The maximum dot product between the highlight's TF-IDF vector and the
        sentence vectors (a cosine similarity under TfidfVectorizer's default
        L2 normalisation), or 0.0 when `sentences` is empty.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([highlight, citation] + list(sentences))
    # `@` is a matrix product for both scipy sparse matrices and sparse arrays,
    # whereas `*` is element-wise on sparse arrays.
    highlight_similarities = (tfidf_matrix @ tfidf_matrix.T).toarray()[0][2:]
    return max(highlight_similarities) if highlight_similarities.size > 0 else 0.0

In [10]:
# Smoke test: score one article sentence against its own highlight and
# against an unrelated ("adverse") highlight, with both models.

fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()

# Example text
sentence = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
highlight = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_highlight = "Daniel Craig is recasted as James Bond again"

# calculate_llm_score takes (query, answer): here the article sentence is
# passed as the query and the highlight as the answer.
ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True, debug=True)
fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)

adv_ba_score = calculate_llm_score(sentence, adverse_highlight, ba_model, ba_tokenizer, backward=True)
adv_fo_score = calculate_llm_score(sentence, adverse_highlight, fo_model, fo_tokenizer)

# (model type, highlight kind, score dict) in display order
labelled_scores = [
    ('Backward', 'Correct', ba_score),
    ('Forward', 'Correct', fo_score),
    ('Backward', 'Adverse', adv_ba_score),
    ('Forward', 'Adverse', adv_fo_score),
]

scores_data = {
    'Model Type': [model_type for model_type, _, _ in labelled_scores],
    'Highlight': [highlight_kind for _, highlight_kind, _ in labelled_scores],
    'Sequence Log Prob': [score['sequence_log_prob'] for _, _, score in labelled_scores],
    'Normalized Log Prob': [score['normalized_log_prob'] for _, _, score in labelled_scores],
    'Perplexity': [score['perplexity'] for _, _, score in labelled_scores],
}

# Display the comparison table
pd.DataFrame(scores_data)

Context: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday is summarized by 
Target: Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.


Unnamed: 0,Model Type,Highlight,Sequence Log Prob,Normalized Log Prob,Perplexity
0,Backward,Correct,-113.2715,-2.7627,15.8429
1,Forward,Correct,-113.7015,-2.7732,16.0099
2,Backward,Adverse,-137.511,-3.3539,28.6148
3,Forward,Adverse,-151.0687,-3.6846,39.8293


## Citation: Linear Search

In [10]:
# Load up to 50 training examples (load_cnn_dataset falls back to a small
# synthetic dataset if loading fails).
dataset = load_cnn_dataset(num_samples=50)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

In [None]:
# Preview the first five rows of the loaded examples as a DataFrame
pd.DataFrame(dataset).head(5)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell...",Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have be...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,"Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a ja...","Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...",ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. ""The whole bridge from one side of the Mississippi to the other just ...","NEW: ""I thought I was going to die,"" driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: ""I probably had a 30-, 35-foot free fall""\nMinnesota bridge collaps...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,"WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and ""none appeared worrisome,"" a White House spokesman said. The polyps were removed and sent to the ...","Five small polyps found during procedure; ""none worrisome,"" spokesman says .\nPresident reclaims powers transferred to vice president .\nBush undergoes routine colonoscopy at Camp David .",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,"(CNN) -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appea...","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .\nNFL suspends Falcons quarterback indefinitely without pay .\nVick admits funding dogfighting operation but says he did n...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman shirt, 5-year-old Youssif held his sister's hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nea...","Parents beam with pride, can't stop from smiling from outpouring of support .\nMom: ""I was so happy I didn't know what to do""\nBurn center in U.S. has offered to provide treatment for reconstructi...",a1ebb8bb4d370a1fdf28769206d572be60642d70
6,"BAGHDAD, Iraq (CNN) -- The women are too afraid and ashamed to show their faces or have their real names used. They have been driven to sell their bodies to put food on the table for their childre...","Aid workers: Violence, increased cost of living drive women to prostitution .\nGroup is working to raise awareness of the problem with Iraq's political leaders .\nTwo Iraqi mothers tell CNN they t...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264
7,"BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military ...","Tomas Medina Caracas was a fugitive from a U.S. drug trafficking indictment .\n""El Negro Acacio"" allegedly helped manage extensive cocaine network .\nU.S. Justice Department indicted him in 2002 ....",f0d73bdab711763e745cdc75850861c9018f235d
8,"WASHINGTON (CNN) -- White House press secretary Tony Snow, who is undergoing treatment for cancer, will step down from his post September 14 and be replaced by deputy press secretary Dana Perino, ...","President Bush says Tony Snow ""will battle cancer and win"" Job of press secretary ""has been a dream for me,"" Snow says Snow leaving on September 14, will be succeeded by Dana Perino .",5e22bbfc7232418b8d2dd646b952e404df5bd048
9,"(CNN) -- Police and FBI agents are investigating the discovery of an empty rocket launcher tube on the front lawn of a Jersey City, New Jersey, home, FBI spokesman Sean Quinn said. Niranjan Desai ...","Empty anti-tank weapon turns up in front of New Jersey home .\nDevice handed over to Army ordnance disposal unit .\nWeapon not capable of being reloaded, experts say .",613d6311ec2c1985bd44707d1796d275452fe156


In [22]:
def linear_attribution_search(dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer):
    """
    Perform linear attribution search for citations as described in TRLM paper.

    For the first highlight sentence of every example, score each article
    sentence with the backward and the forward model and keep, per model, the
    sentence with the highest normalized log probability.

    Note: a later cell defines `linear_attribution_search` again with an
    additional baseline scorer; once that cell has run, it replaces this one.

    Args:
        dataset: Examples with 'article', 'highlights' and 'id' fields. A
            pandas DataFrame is used as is; anything else is first passed to
            `pd.DataFrame(...)`.
        ba_model, ba_tokenizer: Backward model and tokenizer.
        fo_model, fo_tokenizer: Forward model and tokenizer.

    Returns:
        list[dict]: One entry per example that has at least one highlight
        sentence, with keys 'id', 'highlight', and '<ba|fo>_citation',
        '<ba|fo>_score', '<ba|fo>_perplexity'. A citation is None (score -inf)
        when no article sentence had at least three words.
    """
    # The loop relies on `iterrows`, so tabulate inputs that do not provide it
    # (e.g. what load_cnn_dataset returns).
    examples = dataset if hasattr(dataset, 'iterrows') else pd.DataFrame(dataset)

    results = []

    for _, example in tqdm(examples.iterrows(), total=len(examples)):
        # Split article and highlights into sentences
        article_sentences = sent_tokenize(example['article'])
        highlight_sentences = sent_tokenize(example['highlights'])

        # Only the first highlight sentence is attributed
        if not highlight_sentences:
            continue

        highlight = highlight_sentences[0]

        # Store best attribution for each model
        best_ba_sentence = None
        best_ba_score = float('-inf')
        best_fo_sentence = None
        best_fo_score = float('-inf')

        # Linear search through all article sentences
        for sentence in article_sentences:
            # Skip very short sentences
            if len(sentence.split()) < 3:
                continue

            # Calculate scores using both models
            ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
            fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)

            # Track best scores
            if ba_score['normalized_log_prob'] > best_ba_score:
                best_ba_score = ba_score['normalized_log_prob']
                best_ba_sentence = sentence

            if fo_score['normalized_log_prob'] > best_fo_score:
                best_fo_score = fo_score['normalized_log_prob']
                best_fo_sentence = sentence

        # Perplexity is derived from the best normalized log probability
        results.append({
            'id': example['id'],
            'highlight': highlight,
            'ba_citation': best_ba_sentence,
            'ba_score': best_ba_score,
            'ba_perplexity': np.exp(-best_ba_score),
            'fo_citation': best_fo_sentence,
            'fo_score': best_fo_score,
            'fo_perplexity': np.exp(-best_fo_score)
        })

    return results


In [23]:
def linear_attribution_search(dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer):
    """
    Perform linear attribution search for citations as described in TRLM paper.

    For the first highlight sentence of every example, score each article
    sentence with three scorers and keep, per scorer, the sentence with the
    highest normalized log probability:
      - 'base': calculate_baseline_score with the forward model
      - 'ba':   calculate_llm_score with the backward model (backward=True)
      - 'fo':   calculate_llm_score with the forward model

    Args:
        dataset: Examples with 'article', 'highlights' and 'id' fields. A
            pandas DataFrame is used as is; anything else is first passed to
            `pd.DataFrame(...)`.
        ba_model, ba_tokenizer: Backward model and tokenizer.
        fo_model, fo_tokenizer: Forward model and tokenizer.

    Returns:
        list[dict]: One entry per example that has at least one highlight
        sentence, with keys 'id', 'highlight', and '<name>_citation',
        '<name>_score', '<name>_perplexity' for name in base, ba, fo. A
        citation is None (score -inf) when no article sentence had at least
        three words.
    """
    # The loop relies on `iterrows`, so tabulate inputs that do not provide it
    # (e.g. what load_cnn_dataset returns).
    examples = dataset if hasattr(dataset, 'iterrows') else pd.DataFrame(dataset)

    scorer_names = ('base', 'ba', 'fo')
    results = []

    for _, example in tqdm(examples.iterrows(), total=len(examples)):
        # Split article and highlights into sentences
        article_sentences = sent_tokenize(example['article'])
        highlight_sentences = sent_tokenize(example['highlights'])

        # Only the first highlight sentence is attributed
        if not highlight_sentences:
            continue

        highlight = highlight_sentences[0]

        # Best (sentence, score) seen so far for each scorer
        best = {name: (None, float('-inf')) for name in scorer_names}

        # Linear search through all article sentences
        for sentence in article_sentences:
            # Skip very short sentences
            if len(sentence.split()) < 3:
                continue

            scored = {
                'base': calculate_baseline_score(sentence, highlight, fo_model, fo_tokenizer),
                'ba': calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True),
                'fo': calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer),
            }

            # Strict '>' keeps the earliest sentence on ties
            for name in scorer_names:
                candidate_score = scored[name]['normalized_log_prob']
                if candidate_score > best[name][1]:
                    best[name] = (sentence, candidate_score)

        # Perplexity is derived from the best normalized log probability
        row = {'id': example['id'], 'highlight': highlight}
        for name in scorer_names:
            best_sentence, best_score = best[name]
            row[f'{name}_citation'] = best_sentence
            row[f'{name}_score'] = best_score
            row[f'{name}_perplexity'] = np.exp(-best_score)
        results.append(row)

    return results


In [None]:
def binary_search_citation(article, highlight, model, tokenizer, backward=False, max_iterations=30, score_fn=None):
    """
    Narrow an article down to a citation for a highlight by repeated halving.

    The article is split into sentences. At every step the current sentence
    range is split in two, both halves are scored against the first sentence
    of `highlight`, and the half with the higher normalized log probability is
    kept (the second half on ties). The search stops when one sentence is left
    or after `max_iterations` halvings; the remaining text is scored once more
    and returned.

    Args:
        article (str): Full article text.
        highlight (str): Highlight text; only its first sentence is used.
        model: Language model passed through to the scorer.
        tokenizer: Tokenizer passed through to the scorer.
        backward (bool): Forwarded to the scorer as `backward=`.
        max_iterations (int): Upper bound on the number of halvings.
        score_fn: Optional scorer called as
            `score_fn(text, highlight, model, tokenizer, backward=...)` and
            returning a dict with 'normalized_log_prob' and 'perplexity'.
            Defaults to `calculate_llm_score`; `calculate_baseline_score`
            has the same call shape.

    Returns:
        dict: 'citation', 'score' (normalized log probability) and
        'perplexity'. When the article or highlight has no sentences, or the
        remaining text is empty, the result is
        {'citation': '', 'score': -inf, 'perplexity': inf}.
    """
    if score_fn is None:
        score_fn = calculate_llm_score

    def not_found():
        return {'citation': '', 'score': float('-inf'), 'perplexity': float('inf')}

    # Split the article into individual sentences using NLTK's sentence tokenizer
    sentences = sent_tokenize(article)
    if not sentences:
        return not_found()

    # Split the highlight into sentences and use only the first sentence
    highlight_sentences = sent_tokenize(highlight)
    if not highlight_sentences:
        return not_found()
    highlight = highlight_sentences[0]

    def span_text(first, last):
        # Join sentences first..last (inclusive) into one string
        return ' '.join(sentences[first:last + 1])

    def score(text):
        return score_fn(text, highlight, model, tokenizer, backward=backward)

    # Inclusive bounds of the current candidate range. Named low/high rather
    # than s/t so the module-level torch alias `t` is not shadowed.
    low, high = 0, len(sentences) - 1
    iteration = 0
    while high > low and iteration < max_iterations:
        mid = low + (high - low) // 2
        first_half = score(span_text(low, mid))
        second_half = score(span_text(mid + 1, high))

        if first_half['normalized_log_prob'] > second_half['normalized_log_prob']:
            high = mid
        else:
            low = mid + 1
        iteration += 1

    citation = span_text(low, high)
    # Nothing to score if the remaining text is empty
    if not citation:
        return not_found()

    final = score(citation)
    return {
        'citation': citation,
        'score': final['normalized_log_prob'],
        'perplexity': final['perplexity']
    }

In [40]:
# Not the best way to do this: it duplicates binary_search_citation, with calculate_baseline_score as the scorer

def baseline_binary_search_citation(article, highlight, model, tokenizer, backward=False, max_iterations=30):
    """
    Binary-search an article for a citation using `calculate_baseline_score`.

    The sentence range is halved repeatedly: both halves are scored against
    the first sentence of `highlight` and the half with the higher normalized
    log probability is kept (the second half on ties). The search ends when a
    single sentence remains or after `max_iterations` halvings, and the
    remaining text is scored once more.

    Args:
        article (str): Full article text.
        highlight (str): Highlight text; only its first sentence is used.
        model: Language model passed to `calculate_baseline_score`.
        tokenizer: Tokenizer passed to `calculate_baseline_score`.
        backward (bool): Forwarded as `backward=` to the scorer.
        max_iterations (int): Upper bound on the number of halvings.

    Returns:
        dict: 'citation', 'score' (normalized log probability) and
        'perplexity'; {'citation': '', 'score': -inf, 'perplexity': inf} when
        nothing could be selected.
    """
    def _not_found():
        return {'citation': '', 'score': float('-inf'), 'perplexity': float('inf')}

    # Sentence-split the article; nothing to search without sentences
    sentences = sent_tokenize(article)
    if len(sentences) == 0:
        return _not_found()

    # Only the first highlight sentence takes part in the search
    highlight_sentences = sent_tokenize(highlight)
    if len(highlight_sentences) == 0:
        return _not_found()
    highlight = highlight_sentences[0]

    # Inclusive bounds of the range still under consideration
    low, high = 0, len(sentences) - 1
    depth = 0
    while high - low > 0 and depth < max_iterations:
        middle = low + (high - low) // 2
        left_text = ' '.join(sentences[low:middle + 1])
        right_text = ' '.join(sentences[middle + 1:high + 1])

        left_result = calculate_baseline_score(left_text, highlight, model, tokenizer, backward=backward)
        right_result = calculate_baseline_score(right_text, highlight, model, tokenizer, backward=backward)

        if left_result['normalized_log_prob'] > right_result['normalized_log_prob']:
            high = middle
        else:
            low = middle + 1
        depth += 1

    if high < low:
        # Invalid range: fall through to the "not found" result below
        citation, score, perplexity = '', float('-inf'), float('inf')
    else:
        # Score whatever range is left (one sentence unless the iteration cap hit)
        citation = ' '.join(sentences[low:high + 1])
        final_result = calculate_baseline_score(citation, highlight, model, tokenizer, backward=backward)
        score = final_result['normalized_log_prob']
        perplexity = final_result['perplexity']

    if not citation:
        return _not_found()

    return {
        'citation': citation,
        'score': score,
        'perplexity': perplexity
    }

In [41]:
# new version
def exclusion_search_citation(article, highlight, model, tokenizer, backward=False, verbose=True):
    """Attribute a highlight to one article sentence by leave-one-out (exclusion) search.

    Each article sentence is removed in turn and the first sentence of
    ``highlight`` is scored against the remaining text with
    ``calculate_llm_score(..., task='citation')``. The sentence whose removal
    yields the lowest ``normalized_log_prob`` is chosen as the citation and is
    then scored on its own against the highlight.

    Args:
        article: Article text; split into sentences with ``sent_tokenize``.
        highlight: Highlight text; only its first sentence is used.
        model: Forwarded unchanged to ``calculate_llm_score``.
        tokenizer: Forwarded unchanged to ``calculate_llm_score``.
        backward: Forwarded unchanged to ``calculate_llm_score``.
        verbose: When True (the default, matching the previous behaviour),
            print the leave-one-out scores and the selected sentence.

    Returns:
        dict with the keys 'citation', 'score', 'perplexity',
        'individual_score' and 'individual_perplexity'. 'score'/'perplexity'
        are the selected sentence's own score (identical to the
        'individual_*' entries). When the article or the highlight contains
        no sentence, the citation is '' with score -inf and perplexity inf.
    """
    def _result(citation, score, perplexity):
        # Single place that defines the return schema, so the early-exit and
        # success paths can no longer drift apart.
        return {
            'citation': citation,
            'score': score,
            'perplexity': perplexity,
            'individual_score': score,
            'individual_perplexity': perplexity,
        }

    # 1. Split the article into individual sentences
    sentences = sent_tokenize(article)
    if not sentences:
        return _result('', float('-inf'), float('inf'))

    # Split the highlight into sentences and use only the first sentence
    highlight_sentences = sent_tokenize(highlight)
    if not highlight_sentences:
        return _result('', float('-inf'), float('inf'))
    highlight = highlight_sentences[0]

    # 2. Score the highlight against the article with sentence i left out
    all_scores = []
    for i in range(len(sentences)):
        remaining = sentences[:i] + sentences[i + 1:]
        if remaining:
            # Re-join the remaining sentences so the model still sees context
            result = calculate_llm_score(" ".join(remaining), highlight, model, tokenizer,
                                         task='citation', backward=backward)
            score = result['normalized_log_prob']
            perplexity = result['perplexity']
        else:
            # Single-sentence article: nothing is left after the exclusion
            score = float('-inf')
            perplexity = float('inf')
        all_scores.append((score, perplexity, i))

    # 3. The sentence whose removal hurts the score the most is the citation
    #    (all_scores is never empty here because sentences is non-empty)
    worst_score, worst_perplexity, worst_idx = min(all_scores, key=lambda x: x[0])
    worst_citation = sentences[worst_idx]

    # 4. Score the selected sentence on its own
    individual_result = calculate_llm_score(worst_citation, highlight, model, tokenizer,
                                            task='citation', backward=backward)
    individual_score = individual_result['normalized_log_prob']
    individual_perplexity = individual_result['perplexity']

    if verbose:
        print(f"All scores (excluding each sentence): {[score for score, _, _ in all_scores]}")
        print(f"Worst score: {worst_score}, Perplexity: {worst_perplexity}, Sentence index: {worst_idx}")
        print(f"Individual score for selected citation: {individual_score}, Individual perplexity: {individual_perplexity}")

    return _result(worst_citation, individual_score, individual_perplexity)

In [42]:
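# NOTE: near-copy of exclusion_search_citation that scores with calculate_baseline_score instead of calculate_llm_score; ideally both would share one implementation parameterized by the scoring function.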

def baseline_exclusion_search_citation(article, highlight, model, tokenizer, backward=False, verbose=True):
    """Leave-one-out (exclusion) citation search scored with the baseline scorer.

    Each article sentence is removed in turn and the first sentence of
    ``highlight`` is scored against the remaining text with
    ``calculate_baseline_score(..., task='citation')``. The sentence whose
    removal yields the lowest ``normalized_log_prob`` is chosen as the
    citation and is then scored on its own against the highlight.

    Args:
        article: Article text; split into sentences with ``sent_tokenize``.
        highlight: Highlight text; only its first sentence is used.
        model: Forwarded unchanged to ``calculate_baseline_score``.
        tokenizer: Forwarded unchanged to ``calculate_baseline_score``.
        backward: Forwarded unchanged to ``calculate_baseline_score``.
        verbose: When True (the default, matching the previous behaviour),
            print the leave-one-out scores and the selected sentence.

    Returns:
        dict with the keys 'citation', 'score', 'perplexity',
        'individual_score' and 'individual_perplexity'. 'score'/'perplexity'
        are the selected sentence's own score (identical to the
        'individual_*' entries). When the article or the highlight contains
        no sentence, the citation is '' with score -inf and perplexity inf.
    """
    def _result(citation, score, perplexity):
        # Single place that defines the return schema, so the early-exit and
        # success paths can no longer drift apart.
        return {
            'citation': citation,
            'score': score,
            'perplexity': perplexity,
            'individual_score': score,
            'individual_perplexity': perplexity,
        }

    # 1. Split the article into individual sentences
    sentences = sent_tokenize(article)
    if not sentences:
        return _result('', float('-inf'), float('inf'))

    # Split the highlight into sentences and use only the first sentence
    highlight_sentences = sent_tokenize(highlight)
    if not highlight_sentences:
        return _result('', float('-inf'), float('inf'))
    highlight = highlight_sentences[0]

    # 2. Score the highlight against the article with sentence i left out
    all_scores = []
    for i in range(len(sentences)):
        remaining = sentences[:i] + sentences[i + 1:]
        if remaining:
            # Re-join the remaining sentences so the scorer still sees context
            result = calculate_baseline_score(" ".join(remaining), highlight, model, tokenizer,
                                              task='citation', backward=backward)
            score = result['normalized_log_prob']
            perplexity = result['perplexity']
        else:
            # Single-sentence article: nothing is left after the exclusion
            score = float('-inf')
            perplexity = float('inf')
        all_scores.append((score, perplexity, i))

    # 3. The sentence whose removal hurts the score the most is the citation
    #    (all_scores is never empty here because sentences is non-empty)
    worst_score, worst_perplexity, worst_idx = min(all_scores, key=lambda x: x[0])
    worst_citation = sentences[worst_idx]

    # 4. Score the selected sentence on its own
    individual_result = calculate_baseline_score(worst_citation, highlight, model, tokenizer,
                                                 task='citation', backward=backward)
    individual_score = individual_result['normalized_log_prob']
    individual_perplexity = individual_result['perplexity']

    if verbose:
        print(f"All scores (excluding each sentence): {[score for score, _, _ in all_scores]}")
        print(f"Worst score: {worst_score}, Perplexity: {worst_perplexity}, Sentence index: {worst_idx}")
        print(f"Individual score for selected citation: {individual_score}, Individual perplexity: {individual_perplexity}")

    return _result(worst_citation, individual_score, individual_perplexity)

In [50]:
# Version of the evaluation that also includes the baseline scorer
def _citation_quality_metrics(first_highlight, highlight_emb, article_sentences, citation,
                              sentence_transformer, scorer):
    """Compare one citation with the highlight: embedding cosine, ROUGE-L F-measure and TF-IDF score."""
    citation_emb = sentence_transformer.encode(citation)
    return {
        'emb_similarity': util.cos_sim([highlight_emb], [citation_emb])[0][0].item(),
        'rougeL_fmeasure': scorer.score(first_highlight, citation)['rougeL'].fmeasure,
        'tfidf': calculate_tfidf_score(first_highlight, article_sentences, citation),
    }


def evaluate_citations_with_linear_binary_exclusion(dataset, num_samples=10):
    """Run linear, binary and exclusion citation search for every model and collect metrics.

    For each of the first ``num_samples`` rows, the first sentence of the
    row's 'highlights' is attributed to the row's 'article' with nine
    model/search combinations (backward, forward and baseline scoring crossed
    with linear, binary and exclusion search). Every selected citation is then
    compared with the highlight via sentence-embedding cosine similarity,
    ROUGE-L F-measure and ``calculate_tfidf_score``.

    Args:
        dataset: Anything ``pd.DataFrame`` accepts whose rows provide the
            'id', 'article' and 'highlights' fields.
        num_samples: Number of leading rows to evaluate.

    Returns:
        list of dicts, one per row with a non-empty highlight. Besides 'id'
        and 'highlight', each dict holds
        ``{ba,fo,baseline}_{linear,binary,exclusion}_{citation,score,perplexity,emb_similarity,rougeL_fmeasure,tfidf}``.
        'baseline_exclusion_citation' now reports the baseline search's own
        citation (it previously repeated the forward model's citation).
    """
    # TODO This must be refactored for speed, e.g. caching breaking down article to sentences
    # use threading, cache models, or something
    # otherwise evaluation will take really long.
    fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()
    sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # pd.DataFrame handles both lists of records and dataset objects the same way
    df_dataset = pd.DataFrame(dataset).iloc[:num_samples]

    linear_results = linear_attribution_search(df_dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer)

    search_types = ('linear', 'binary', 'exclusion')
    fields = ('citation', 'score', 'perplexity', 'emb_similarity', 'rougeL_fmeasure', 'tfidf')
    # The linear search reports the baseline under the 'base_' key prefix
    linear_prefixes = {'ba': 'ba', 'fo': 'fo', 'baseline': 'base'}

    results = []
    dataset_list = df_dataset.to_dict('records')

    for i in tqdm(range(min(num_samples, len(dataset_list)))):
        article = dataset_list[i]['article']
        highlight = dataset_list[i]['highlights']
        highlight_sentences = sent_tokenize(highlight)
        if not highlight_sentences:
            continue
        first_highlight = highlight_sentences[0]

        # assumes linear_attribution_search returns one entry per row, in row order — TODO confirm
        linear_result = linear_results[i]

        ba_binary = binary_search_citation(article, first_highlight, ba_model, ba_tokenizer, backward=True)
        fo_binary = binary_search_citation(article, first_highlight, fo_model, fo_tokenizer)

        ba_exclusion = exclusion_search_citation(article, first_highlight, ba_model, ba_tokenizer, backward=True)
        fo_exclusion = exclusion_search_citation(article, first_highlight, fo_model, fo_tokenizer)

        # Baseline Score
        # baseline linear is using different output method, so that has to be conformed later TODO.
        baseline_binary = baseline_binary_search_citation(article, first_highlight, fo_model, fo_tokenizer)
        baseline_exclusion = baseline_exclusion_search_citation(article, first_highlight, fo_model, fo_tokenizer)

        # Normalise every method to the same {'citation', 'score', 'perplexity'} shape
        found_by_method = {
            (model_type, 'linear'): {
                'citation': linear_result[f'{prefix}_citation'],
                'score': linear_result[f'{prefix}_score'],
                'perplexity': linear_result[f'{prefix}_perplexity'],
            }
            for model_type, prefix in linear_prefixes.items()
        }
        found_by_method.update({
            ('ba', 'binary'): ba_binary,
            ('fo', 'binary'): fo_binary,
            ('baseline', 'binary'): baseline_binary,
            ('ba', 'exclusion'): ba_exclusion,
            ('fo', 'exclusion'): fo_exclusion,
            ('baseline', 'exclusion'): baseline_exclusion,
        })

        highlight_emb = sentence_transformer.encode(first_highlight)
        sentences = sent_tokenize(article)

        # Each method's citation, score and perplexity come from the same
        # search result, so the reported columns cannot be mixed up.
        measured = {}
        for method, found in found_by_method.items():
            measured[method] = {
                'citation': found['citation'],
                'score': found['score'],
                'perplexity': found['perplexity'],
                **_citation_quality_metrics(first_highlight, highlight_emb, sentences,
                                            found['citation'], sentence_transformer, scorer),
            }

        # Key order matches the previous hand-written dict: ba/fo interleaved
        # per field for each search type, followed by the baseline groups.
        result = {
            'id': dataset_list[i]['id'],
            'highlight': first_highlight,
        }
        for search_type in search_types:
            for field in fields:
                for model_type in ('ba', 'fo'):
                    result[f'{model_type}_{search_type}_{field}'] = measured[(model_type, search_type)][field]
        for search_type in search_types:
            for field in fields:
                result[f'baseline_{search_type}_{field}'] = measured[('baseline', search_type)][field]
        results.append(result)

    return results

In [51]:
def display_comparison_results(results):
    """Turn per-sample evaluation records into a summary table and a detail table.

    Args:
        results: Records accepted by ``pd.DataFrame``; must contain 'highlight'
            and, for every model type (ba, fo, baseline) and search type
            (linear, binary, exclusion), the columns
            ``<model>_<search>_{citation,score,perplexity,emb_similarity,rougeL_fmeasure,tfidf}``.

    Returns:
        (comparison_df, display_df): ``comparison_df`` has one column per
        model/search combination (e.g. 'BA Linear') and one row per metric,
        holding the column mean over all samples; ``display_df`` is the
        per-sample table restricted to 'highlight' plus the citation and
        metric columns, grouped by search type and then by model type.

    Side effects:
        Prints the column names of the full results frame and sets the pandas
        options 'display.max_columns' (None) and 'display.max_colwidth' (50).
    """
    frame = pd.DataFrame(results)

    # results_df's column name (debugging)
    print("Columns in results_df:", frame.columns.tolist())

    model_types = ('ba', 'fo', 'baseline')
    search_types = ('linear', 'binary', 'exclusion')
    metrics = ('score', 'perplexity', 'emb_similarity', 'rougeL_fmeasure', 'tfidf')

    # Mean of every metric, one entry per model/search combination
    averages = {}
    for model_type in model_types:
        for search_type in search_types:
            label = f'{model_type.upper()} {search_type.capitalize()}'
            averages[label] = {
                metric: frame[f'{model_type}_{search_type}_{metric}'].mean()
                for metric in metrics
            }
    comparison_df = pd.DataFrame(averages)

    # Detail view: highlight first, then citation + metrics for each
    # combination, ordered by search type and, within it, by model type
    detail_columns = ['highlight']
    for search_type in search_types:
        for model_type in model_types:
            stem = f'{model_type}_{search_type}_'
            detail_columns.append(stem + 'citation')
            detail_columns.extend(stem + metric for metric in metrics)
    display_df = frame[detail_columns]

    # pandas
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 50)

    return comparison_df, display_df



In [52]:
# Number of dataset rows to load and to evaluate.
num_samples = 50

# Load the data, run every model/search combination on it, then build the
# summary (comparison_df) and per-sample (display_df) tables.
dataset = load_cnn_dataset(num_samples=num_samples)
comparison_results = evaluate_citations_with_linear_binary_exclusion(dataset, num_samples=num_samples)
comparison_df, display_df = display_comparison_results(comparison_results)

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Binary Search (Backward=True): s=0, t=23, Mid=11, s1=-2.977705282264682, s2=-3.194227808243272, p1=19.64269045846984, p2=24.39133163316726
Binary Search (Backward=True): s=0, t=11, Mid=5, s1=-3.005053265259954, s2=-3.1527275183372483, p1=20.187291348671813, p2=23.399801006628188
Binary Search (Backward=True): s=0, t=5, Mid=2, s1=-2.856931128816732, s2=-3.658465028547803, p1=17.4080219017706, p2=38.80173757734645
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-2.9076363216575167, s2=-3.2597458041810436, p1=18.31346023891901, p2=26.04291630062178
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-2.670148073404129, s2=-3.2587536024251884, p1=14.442107526508048, p2=26.017089288263033
Binary Search (Backward=False): s=0, t=23, Mid=11, s1=-2.9589545213599053, s2=-3.2560818978961974, p1=19.277806681149233, p2=25.947672085399745
Binary Search (Backward=False): s=0, t=11, Mid=5, s1=-3.0057152874467, s2=-3.2581350867401944, p1=20.200660208187475, p2=26.001002286004816
Binary Search (Backwa

In [56]:
# Copy the per-sample detail table to the system clipboard (e.g. to paste into a spreadsheet).
# NOTE(review): needs a clipboard backend on the kernel host, so this is likely to fail on a headless server.
display_df.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
# Mean metrics rounded to 4 decimals, transposed: one row per model/search combination.
comparison_df.round(4).T

Unnamed: 0,score,perplexity,emb_similarity,rougeL_fmeasure,tfidf
BA Linear,-2.2648,10.976,0.3901,0.1789,0.3905
BA Binary,-2.6032,15.6472,0.407,0.1649,0.39
BA Exclusion,-2.573,16.47,0.3865,0.1458,0.3909
FO Linear,-2.6623,15.9545,0.3705,0.1728,0.3918
FO Binary,-2.9222,20.9684,0.4089,0.1622,0.3903
FO Exclusion,-3.0276,25.7542,0.3955,0.1604,0.3911
BASELINE Linear,-4.3003,127.1135,0.6259,0.3012,0.39
BASELINE Binary,-4.3055,125.9294,0.6345,0.3438,0.3895
BASELINE Exclusion,-4.3767,140.499,0.5983,0.333,0.3906


In [55]:
# Mean metrics rounded to 4 decimals: one row per metric, one column per model/search combination.
comparison_df.round(4)

Unnamed: 0,BA Linear,BA Binary,BA Exclusion,FO Linear,FO Binary,FO Exclusion,BASELINE Linear,BASELINE Binary,BASELINE Exclusion
score,-2.2648,-2.6032,-2.573,-2.6623,-2.9222,-3.0276,-4.3003,-4.3055,-4.3767
perplexity,10.976,15.6472,16.47,15.9545,20.9684,25.7542,127.1135,125.9294,140.499
emb_similarity,0.3901,0.407,0.3865,0.3705,0.4089,0.3955,0.6259,0.6345,0.5983
rougeL_fmeasure,0.1789,0.1649,0.1458,0.1728,0.1622,0.1604,0.3012,0.3438,0.333
tfidf,0.3905,0.39,0.3909,0.3918,0.3903,0.3911,0.39,0.3895,0.3906
