# Scoring and Citations Testbed
---

The objectives of this notebook are to:
1. Experiment and figure out how to perform Scoring as described in the [paper](papers/TRLM_2412.02626.pdf)
2. Experiment with linear search for citation attribution

To explore further:
1. Experiment with binary and exclusion search
2. Experiment with retrieval

## Import Libraries

In [1]:
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook display settings: show up to 200 characters per text cell and
# render floats with four decimal places.
pd.options.display.max_colwidth = 200
pd.options.display.float_format = '{:.4f}'.format

## Define Util Functions

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if t.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# TODO: it may be helpful to move this to a utils.py later, or to define the models as separate classes.

# Load models
def load_models():
    """
    Load the forward and backward language models together with their tokenizers.

    The forward model is the ``step143000`` revision of
    ``EleutherAI/pythia-160m-deduped``; the backward model is
    ``afterless/reverse-pythia-160m``. Both models are moved to the
    module-level ``device``; downloads are cached under ``./.cache``.

    Returns:
        tuple: ``(fo_model, fo_tokenizer, ba_model, ba_tokenizer)``.
    """
    forward_repo = "EleutherAI/pythia-160m-deduped"
    forward_options = {
        "revision": "step143000",
        "cache_dir": "./.cache/pythia-160m-deduped/step143000",
    }
    backward_repo = "afterless/reverse-pythia-160m"
    backward_options = {"cache_dir": "./.cache/reverse-pythia-160m"}

    # Forward model and tokenizer share the same repository, revision and cache.
    fo_model = GPTNeoXForCausalLM.from_pretrained(forward_repo, **forward_options).to(device)
    fo_tokenizer = AutoTokenizer.from_pretrained(forward_repo, **forward_options)

    # Backward model and tokenizer (no revision is pinned for this repository).
    ba_model = GPTNeoXForCausalLM.from_pretrained(backward_repo, **backward_options).to(device)
    ba_tokenizer = AutoTokenizer.from_pretrained(backward_repo, **backward_options)

    return fo_model, fo_tokenizer, ba_model, ba_tokenizer

In [5]:
# TODO: same as above

# Load dataset
def load_cnn_dataset(num_samples=10):
    """
    Load a small sample of the CNN/DailyMail (3.0.0) training split.

    If loading fails for any reason, a two-item synthetic dataset with the
    same columns (``article``, ``highlights``, ``id``) is returned instead so
    the rest of the notebook can still be exercised.

    Args:
        num_samples (int): Maximum number of training examples to return.
            Ignored by the synthetic fallback.

    Returns:
        datasets.Dataset: The first ``num_samples`` training examples, or the
        synthetic fallback dataset.
    """
    try:
        # Try with a specific cache directory
        dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir=".cache")
        print("Dataset loaded successfully")

        # Splits are looked up by key; an attribute check such as
        # hasattr(dataset, 'train') never succeeds for a dict-like container,
        # which previously made the `.select` path unreachable.
        train_split = dataset['train']

        # Verify the structure - this helps debug
        if num_samples > 0:
            print("Example dataset item:", train_split[0])

        # Take only a small sample for testing; `.select` keeps the result a
        # Dataset, matching what the fallback below returns.
        sample_size = min(num_samples, len(train_split))
        return train_split.select(range(sample_size))

    except Exception as e:
        # Deliberate best-effort: any failure (network, cache, schema) falls
        # back to synthetic data rather than aborting the notebook.
        print(f"Error loading full dataset: {e}")

        # Create a tiny synthetic dataset for testing
        print("Creating synthetic test dataset instead...")

        sample_data = {
            'article': [
                "John likes to play basketball. He goes to the court every evening. His friends join him on weekends.",
                "The company announced record profits. Investors were pleased. The stock price increased by 10%."
            ],
            'highlights': [
                "John plays basketball regularly with friends.",
                "Company profits lead to stock price increase."
            ],
            'id': ['test1', 'test2']  # Added ID field
        }

        return Dataset.from_dict(sample_data)

In [6]:
def calculate_baseline_score(query, answer, model, tokenizer, task='citation', backward=False, debug=False):
    """
    Calculate the log probability of ``answer`` given ``query``.

    The model input is the tokens of ``query + conditioning_prompt`` followed
    by the tokens of ``answer``; only the answer tokens are scored.

    Args:
        query (str): Conditioning text placed before the prompt.
        answer (str): Text whose tokens are scored.
        model: Causal language model; called as ``model(input_ids)`` and
            expected to expose ``.device`` and an output with ``.logits``.
        tokenizer: Tokenizer providing ``encode(text, return_tensors="pt")``
            and ``decode(ids)``.
        task (str): ``'citation'`` selects the summary-style prompt; any other
            value selects the question-answering-style prompt.
        backward (bool): If True, use the reversed prompt wording and flip the
            context and target token sequences (each on its own) before they
            are concatenated.
        debug (bool): If True, print the context, target and full text.

    Returns:
        dict: ``token_log_probs`` (list of ``{'token', 'token_id',
        'log_prob'}`` per target token), ``sequence_log_prob`` (sum),
        ``normalized_log_prob`` (per-token mean) and ``perplexity``.
        When ``answer`` tokenizes to nothing, the list is empty, both log
        probabilities are ``-inf`` and the perplexity is ``inf``.
    """

    # The paper describes "Score" as conditional distribution (Section 4) which means the Log Probability, and therefore
    # this reimplementation uses Log Probability.

    # First, prepare the texts
    if not backward:
        # Forward, assumes this is a simple reversal to (P Answer|Query)
        conditioning_prompt = ' is summarized by ' if task == 'citation' else ' is answered by '
    else:
        # Backward
        conditioning_prompt = ' is a summary of ' if task == 'citation' else ' has an answer to '

    if debug:
        print(f"Context: {query + conditioning_prompt}")
        print(f"Target: {answer}")

    context_ids = tokenizer.encode(query + conditioning_prompt, return_tensors="pt")
    target_ids = tokenizer.encode(answer, return_tensors="pt")

    # store length to "divide" the texts later
    target_len = target_ids.shape[1]
    context_len = context_ids.shape[1]

    if target_len == 0:
        # Nothing to score: return the "no score" sentinels used by the search
        # helpers in this file instead of dividing by zero below.
        return {
            'token_log_probs': [],
            'sequence_log_prob': float('-inf'),
            'normalized_log_prob': float('-inf'),
            'perplexity': float('inf')
        }

    if backward:
        # We need to reverse the tokens in backward
        target_ids = t.flip(target_ids, (1,))
        context_ids = t.flip(context_ids, (1,))

    input_ids = t.cat((context_ids, target_ids), dim=1).to(model.device)

    if debug:
        print(query + conditioning_prompt + answer)

    # Get model output
    with t.no_grad():
        logits = model(input_ids).logits

    # The logits at position i predict the token at position i + 1, so the
    # target tokens are predicted by positions context_len - 1 onwards.
    target_logits = logits[0, context_len - 1:context_len + target_len - 1, :]
    target_token_ids = input_ids[0, context_len:context_len + target_len]

    # log_softmax stays finite where log(softmax(x)) would underflow to -inf.
    log_probs = F.log_softmax(target_logits.float(), dim=-1)
    target_log_probs = log_probs.gather(1, target_token_ids.unsqueeze(1)).squeeze(1)

    token_probs = [
        {
            'token': tokenizer.decode([token_id]),
            'token_id': token_id,
            'log_prob': log_prob
        }
        for token_id, log_prob in zip(target_token_ids.tolist(), target_log_probs.tolist())
    ]

    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Perplexity is the exponential of the negative mean log probability
    perplexity = np.exp(-normalized_log_prob)

    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }

In [7]:
def calculate_llm_score(query, answer, model, tokenizer, task='citation', backward=False, debug=False):
    """
    Calculate the log probability of ``query`` given ``answer``.

    The model input is the tokens of ``answer + conditioning_prompt`` followed
    by the tokens of ``query``; only the query tokens are scored. The notation
    P(Query|Answer) is used to make comparison with the paper easier.

    Args:
        query (str): Text whose tokens are scored.
        answer (str): Conditioning text placed before the prompt.
        model: Causal language model; called as ``model(input_ids)`` and
            expected to expose ``.device`` and an output with ``.logits``.
        tokenizer: Tokenizer providing ``encode(text, return_tensors="pt")``
            and ``decode(ids)``.
        task (str): ``'citation'`` selects the summary-style prompt; any other
            value selects the question-answering-style prompt.
        backward (bool): If True, use the reversed prompt wording and flip the
            context and target token sequences (each on its own) before they
            are concatenated.
        debug (bool): If True, print the context and target text.

    Returns:
        dict: ``token_log_probs`` (list of ``{'token', 'token_id',
        'log_prob'}`` per target token), ``sequence_log_prob`` (sum),
        ``normalized_log_prob`` (per-token mean) and ``perplexity``.
        When ``query`` tokenizes to nothing, the list is empty, both log
        probabilities are ``-inf`` and the perplexity is ``inf``.
    """

    # The paper describes "Score" as conditional distribution (Section 4) which means the Log Probability, and therefore
    # this reimplementation uses Log Probability.

    # First, prepare the texts
    if not backward:
        # Forward
        conditioning_prompt = ' is a summary of ' if task == 'citation' else ' has an answer to '
    else:
        # Backward
        conditioning_prompt = ' is summarized by ' if task == 'citation' else ' is answered by '

    if debug:
        print(f"Context: {answer + conditioning_prompt}")
        print(f"Target: {query}")

    # The model is auto-regressive (predicts left -> right), so the context
    # is placed first and the target tokens are scored after it.
    # NOTE(review): in backward mode the context and target are flipped
    # separately rather than as one sequence - confirm this matches the
    # intended scoring setup.
    target_ids = tokenizer.encode(query, return_tensors="pt")
    context_ids = tokenizer.encode(answer + conditioning_prompt, return_tensors="pt")

    # store length to "divide" the texts later
    target_len = target_ids.shape[1]
    context_len = context_ids.shape[1]

    if target_len == 0:
        # Nothing to score: return the "no score" sentinels used by the search
        # helpers in this file instead of dividing by zero below.
        return {
            'token_log_probs': [],
            'sequence_log_prob': float('-inf'),
            'normalized_log_prob': float('-inf'),
            'perplexity': float('inf')
        }

    if backward:
        # We need to reverse the tokens in backward
        target_ids = t.flip(target_ids, (1,))
        context_ids = t.flip(context_ids, (1,))

    input_ids = t.cat((context_ids, target_ids), dim=1).to(model.device)

    # Get model output
    with t.no_grad():
        logits = model(input_ids).logits

    # The logits at position i predict the token at position i + 1, so the
    # target tokens are predicted by positions context_len - 1 onwards.
    target_logits = logits[0, context_len - 1:context_len + target_len - 1, :]
    target_token_ids = input_ids[0, context_len:context_len + target_len]

    # log_softmax stays finite where log(softmax(x)) would underflow to -inf.
    log_probs = F.log_softmax(target_logits.float(), dim=-1)
    target_log_probs = log_probs.gather(1, target_token_ids.unsqueeze(1)).squeeze(1)

    token_probs = [
        {
            'token': tokenizer.decode([token_id]),
            'token_id': token_id,
            'log_prob': log_prob
        }
        for token_id, log_prob in zip(target_token_ids.tolist(), target_log_probs.tolist())
    ]

    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Perplexity is the exponential of the negative mean log probability
    perplexity = np.exp(-normalized_log_prob)

    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tfidf_score(highlight, sentences, citation):
    """
    Return the highest TF-IDF similarity between the highlight and any of the
    provided sentences.

    The vectorizer is fitted on the highlight, the citation and all sentences,
    so ``citation`` contributes to the fitted vocabulary and weights but is
    not itself compared with the highlight.
    NOTE(review): the parameter name suggests the citation was meant to take
    part in the comparison - confirm the intended behaviour with the caller.

    Args:
        highlight (str): Reference text (row 0 of the TF-IDF matrix).
        sentences (list[str]): Candidate sentences compared with the highlight.
        citation (str): Extra document included only when fitting.

    Returns:
        float: Maximum similarity over ``sentences``, or 0.0 when
        ``sentences`` is empty.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([highlight] + [citation] + sentences)
    # Only the highlight row against the sentence rows is needed, so avoid
    # building the full pairwise matrix. `@` is an explicit matrix product
    # for every scipy.sparse container type.
    similarity = (tfidf_matrix[0] @ tfidf_matrix[2:].T).toarray()[0]
    return max(similarity) if similarity.size > 0 else 0.0

In [9]:
# Smoke test: score one matching and one unrelated highlight against the same
# article sentence with both the backward and the forward model.

fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()

# Example text
sentence = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
highlight = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_highlight = "Daniel Craig is recasted as James Bond again"

# The article sentence is passed as `query` and the highlight as `answer`.

# Matching highlight
ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)

# Unrelated (adverse) highlight
adv_ba_score = calculate_llm_score(sentence, adverse_highlight, ba_model, ba_tokenizer, backward=True)
adv_fo_score = calculate_llm_score(sentence, adverse_highlight, fo_model, fo_tokenizer)

# One table row per scoring run, in the order the runs were made.
scored_runs = (ba_score, fo_score, adv_ba_score, adv_fo_score)

scores_data = {
    'Model Type': ['Backward', 'Forward', 'Backward', 'Forward'],
    'Highlight': ['Correct', 'Correct', 'Adverse', 'Adverse'],
    'Sequence Log Prob': [run['sequence_log_prob'] for run in scored_runs],
    'Normalized Log Prob': [run['normalized_log_prob'] for run in scored_runs],
    'Perplexity': [run['perplexity'] for run in scored_runs]
}

# Display the comparison table
pd.DataFrame(scores_data)

Unnamed: 0,Model Type,Highlight,Sequence Log Prob,Normalized Log Prob,Perplexity
0,Backward,Correct,-113.2715,-2.7627,15.8429
1,Forward,Correct,-113.7015,-2.7732,16.0099
2,Backward,Adverse,-137.511,-3.3539,28.6148
3,Forward,Adverse,-151.0687,-3.6846,39.8293


##  Citation, Linear Search

In [10]:
# Load up to 50 samples with load_cnn_dataset; if loading fails, that helper
# returns its two-item synthetic dataset instead.
dataset = load_cnn_dataset(num_samples=50)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

In [11]:
# Display the loaded samples as a table (columns: article, highlights, id)
pd.DataFrame(dataset)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell...",Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have be...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,"Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a ja...","Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...",ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. ""The whole bridge from one side of the Mississippi to the other just ...","NEW: ""I thought I was going to die,"" driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: ""I probably had a 30-, 35-foot free fall""\nMinnesota bridge collaps...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,"WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and ""none appeared worrisome,"" a White House spokesman said. The polyps were removed and sent to the ...","Five small polyps found during procedure; ""none worrisome,"" spokesman says .\nPresident reclaims powers transferred to vice president .\nBush undergoes routine colonoscopy at Camp David .",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,"(CNN) -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appea...","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .\nNFL suspends Falcons quarterback indefinitely without pay .\nVick admits funding dogfighting operation but says he did n...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman shirt, 5-year-old Youssif held his sister's hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nea...","Parents beam with pride, can't stop from smiling from outpouring of support .\nMom: ""I was so happy I didn't know what to do""\nBurn center in U.S. has offered to provide treatment for reconstructi...",a1ebb8bb4d370a1fdf28769206d572be60642d70
6,"BAGHDAD, Iraq (CNN) -- The women are too afraid and ashamed to show their faces or have their real names used. They have been driven to sell their bodies to put food on the table for their childre...","Aid workers: Violence, increased cost of living drive women to prostitution .\nGroup is working to raise awareness of the problem with Iraq's political leaders .\nTwo Iraqi mothers tell CNN they t...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264
7,"BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military ...","Tomas Medina Caracas was a fugitive from a U.S. drug trafficking indictment .\n""El Negro Acacio"" allegedly helped manage extensive cocaine network .\nU.S. Justice Department indicted him in 2002 ....",f0d73bdab711763e745cdc75850861c9018f235d
8,"WASHINGTON (CNN) -- White House press secretary Tony Snow, who is undergoing treatment for cancer, will step down from his post September 14 and be replaced by deputy press secretary Dana Perino, ...","President Bush says Tony Snow ""will battle cancer and win"" Job of press secretary ""has been a dream for me,"" Snow says Snow leaving on September 14, will be succeeded by Dana Perino .",5e22bbfc7232418b8d2dd646b952e404df5bd048
9,"(CNN) -- Police and FBI agents are investigating the discovery of an empty rocket launcher tube on the front lawn of a Jersey City, New Jersey, home, FBI spokesman Sean Quinn said. Niranjan Desai ...","Empty anti-tank weapon turns up in front of New Jersey home .\nDevice handed over to Army ordnance disposal unit .\nWeapon not capable of being reloaded, experts say .",613d6311ec2c1985bd44707d1796d275452fe156


In [12]:
def linear_attribution_search(dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer):
    """
    Perform linear attribution search for citations as described in TRLM paper.

    For the first highlight sentence of each example, every article sentence
    with at least three whitespace-separated words is scored with both models,
    and the sentence with the highest normalized log probability is kept per
    model.

    Args:
        dataset: Either a pandas DataFrame or any sized iterable of row
            mappings; each row needs ``article``, ``highlights`` and ``id``.
        ba_model, ba_tokenizer: Backward model and tokenizer (scored with
            ``backward=True``).
        fo_model, fo_tokenizer: Forward model and tokenizer.

    Returns:
        list[dict]: One entry per processed example with the keys ``id``,
        ``highlight``, ``ba_citation``, ``ba_score``, ``ba_perplexity``,
        ``fo_citation``, ``fo_score`` and ``fo_perplexity``. Examples without
        any highlight sentence are skipped, so the list can be shorter than
        ``dataset``. When no article sentence qualifies, the citation is
        ``None`` and the score is ``-inf``.
    """
    results = []

    # DataFrames expose their rows through iterrows(); other inputs (a list of
    # dicts, a Hugging Face Dataset) already yield one row mapping per item.
    if hasattr(dataset, 'iterrows'):
        examples = (row for _, row in dataset.iterrows())
    else:
        examples = iter(dataset)

    for example in tqdm(examples, total=len(dataset)):
        # Split article and highlights into sentences
        article_sentences = sent_tokenize(example['article'])
        highlight_sentences = sent_tokenize(example['highlights'])

        # For demonstration, process just the first highlight sentence
        if not highlight_sentences:
            continue

        highlight = highlight_sentences[0]

        # Store best attribution for each model
        best_ba_sentence = None
        best_ba_score = float('-inf')
        best_fo_sentence = None
        best_fo_score = float('-inf')

        # Linear search through all article sentences
        for sentence in article_sentences:
            # Skip very short sentences
            if len(sentence.split()) < 3:
                continue

            # Calculate scores using both models
            ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
            fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)

            # Track best scores (strict comparison keeps the earliest sentence on ties)
            if ba_score['normalized_log_prob'] > best_ba_score:
                best_ba_score = ba_score['normalized_log_prob']
                best_ba_sentence = sentence

            if fo_score['normalized_log_prob'] > best_fo_score:
                best_fo_score = fo_score['normalized_log_prob']
                best_fo_sentence = sentence

        # Perplexity is recomputed from the best normalized log probability
        results.append({
            'id': example['id'],
            'highlight': highlight,
            'ba_citation': best_ba_sentence,
            'ba_score': best_ba_score,
            'ba_perplexity': np.exp(-best_ba_score),
            'fo_citation': best_fo_sentence,
            'fo_score': best_fo_score,
            'fo_perplexity': np.exp(-best_fo_score)
        })

    return results


In [None]:
def binary_search_citation(article, highlight, model, tokenizer, backward=False, max_iterations=30):
    """
    Narrow an article down to a citation for a highlight by repeated halving.

    The article is split into sentences. At each step the current sentence
    range is split in two, both halves are scored against the first sentence
    of ``highlight`` with ``calculate_llm_score``, and the half with the
    strictly higher normalized log probability is kept (the second half wins
    ties). The search stops when one sentence remains or after
    ``max_iterations`` halvings; the remaining range is then scored once more
    and returned.

    Args:
        article (str): Full article text.
        highlight (str): Highlight text; only its first sentence is used.
        model: Language model forwarded to ``calculate_llm_score``.
        tokenizer: Tokenizer forwarded to ``calculate_llm_score``.
        backward (bool): Forwarded to ``calculate_llm_score``.
        max_iterations (int): Upper bound on the number of halving steps.

    Returns:
        dict: ``citation`` (str), ``score`` (normalized log probability) and
        ``perplexity``. When nothing can be selected, the citation is empty,
        the score is ``-inf`` and the perplexity is ``inf``.
    """
    not_found = {'citation': '', 'score': float('-inf'), 'perplexity': float('inf')}

    # Sentence-split the article; without sentences there is nothing to search
    sentences = sent_tokenize(article)
    if not sentences:
        return not_found

    # Only the first sentence of the highlight is used for scoring
    highlight_sentences = sent_tokenize(highlight)
    if not highlight_sentences:
        return not_found
    highlight = highlight_sentences[0]

    def score_span(first, last):
        # Join sentences[first..last] (inclusive) and score them against the highlight
        text = ' '.join(sentences[first:last + 1])
        scored = calculate_llm_score(text, highlight, model, tokenizer, backward=backward)
        return text, scored['normalized_log_prob'], scored['perplexity']

    # Inclusive bounds of the sentence range still under consideration
    lo, hi = 0, len(sentences) - 1
    halvings = 0
    while hi > lo and halvings < max_iterations:
        mid = lo + (hi - lo) // 2
        _, left_score, left_ppl = score_span(lo, mid)
        _, right_score, right_ppl = score_span(mid + 1, hi)

        # Debugging
        print(f"Binary Search (Backward={backward}): s={lo}, t={hi}, Mid={mid}, s1={left_score}, s2={right_score}, p1={left_ppl}, p2={right_ppl}")

        if left_score > right_score:
            hi = mid
        else:
            lo = mid + 1
        halvings += 1

    if hi < lo:
        # Invalid range: nothing to cite
        citation, score, perplexity = '', float('-inf'), float('inf')
    else:
        citation, score, perplexity = score_span(lo, hi)

    # If no citation is found, return default values
    if not citation:
        return not_found

    # Print the final result of the binary search
    print(f"Binary Search (Backward={backward}) Final: Score={score}, Citation={citation[:50]}...")
    return {
        'citation': citation,
        'score': score,
        'perplexity': perplexity
    }

In [None]:
# new version
def exclusion_search_citation(article, highlight, model, tokenizer, backward=False):
    """
    Pick the article sentence whose removal lowers the highlight score most.

    Each sentence is left out in turn and the remaining sentences, joined by
    spaces, are scored against the first sentence of ``highlight`` with
    ``calculate_llm_score``. The sentence whose exclusion gives the lowest
    normalized log probability is selected (the earliest one on ties) and is
    then scored on its own.

    Args:
        article (str): Full article text.
        highlight (str): Highlight text; only its first sentence is used.
        model: Language model forwarded to ``calculate_llm_score``.
        tokenizer: Tokenizer forwarded to ``calculate_llm_score``.
        backward (bool): Forwarded to ``calculate_llm_score``.

    Returns:
        dict: ``citation`` (the selected sentence), ``score`` and
        ``perplexity`` (the selected sentence scored on its own), plus
        ``individual_score`` and ``individual_perplexity`` carrying the same
        two values so every return path has the same keys. When the article
        or the highlight has no sentences, the citation is empty, the scores
        are ``-inf`` and the perplexities are ``inf``.
    """
    empty_result = {'citation': '', 'score': float('-inf'), 'perplexity': float('inf'),
                    'individual_score': float('-inf'), 'individual_perplexity': float('inf')}

    # 1. Split the article into individual sentences
    sentences = sent_tokenize(article)
    if not sentences:
        return empty_result

    # Split the highlight into sentences and use only the first sentence
    highlight_sentences = sent_tokenize(highlight)
    if not highlight_sentences:
        return empty_result
    highlight = highlight_sentences[0]  # Use only the first sentence

    # 2. Score the article with each sentence removed in turn
    all_scores = []
    for i, _ in enumerate(sentences):
        # Create a sentence group excluding the sentence at index i
        excluded_sentences = sentences[:i] + sentences[i+1:]
        if not excluded_sentences:
            score = float('-inf')  # Minimum score for an empty set
            perplexity = float('inf')
        else:
            # Combine the remaining sentences into one (maintain context)
            combined_text = " ".join(excluded_sentences)
            result = calculate_llm_score(combined_text, highlight, model, tokenizer, task='citation', backward=backward)
            score = result['normalized_log_prob']
            perplexity = result['perplexity']
        all_scores.append((score, perplexity, i))

    # 3. Select the sentence that results in the lowest relevance when removed.
    # `sentences` is non-empty here, so `all_scores` has at least one entry.
    worst_score, worst_perplexity, worst_idx = min(all_scores, key=lambda x: x[0])
    worst_citation = sentences[worst_idx]  # Sentence with the lowest relevance when removed

    # 4. Calculate the individual score for the selected sentence
    individual_result = calculate_llm_score(worst_citation, highlight, model, tokenizer, task='citation', backward=backward)
    individual_score = individual_result['normalized_log_prob']
    individual_perplexity = individual_result['perplexity']

    print(f"All scores (excluding each sentence): {[score for score, _, _ in all_scores]}")
    print(f"Worst score: {worst_score}, Perplexity: {worst_perplexity}, Sentence index: {worst_idx}")
    print(f"Individual score for selected citation: {individual_score}, Individual perplexity: {individual_perplexity}")

    return {
        'citation': worst_citation,
        'score': individual_score,
        'perplexity': individual_perplexity,
        'individual_score': individual_score,
        'individual_perplexity': individual_perplexity
    }

In [None]:
# baseline added ver
def evaluate_citations_with_linear_binary_exclusion(dataset, num_samples=10):
    """Compare linear, binary and exclusion citation search on a dataset sample.

    For each of the first ``num_samples`` records, the first sentence of the
    record's ``highlights`` is attributed to the ``article`` by each search
    strategy, once with the backward model (``backward=True``) and once with
    the forward model. Every selected citation is then compared with the
    highlight using sentence-embedding cosine similarity, ROUGE-L F-measure
    and ``calculate_tfidf_score``.

    The ``baseline_*`` columns pair the forward model's citation (and its
    text-similarity metrics) with the result of ``calculate_baseline_score``
    on the whole article. That call takes no search-specific argument, so it
    is made once per record and its values are repeated under all three
    ``baseline_<search>_*`` prefixes.

    Args:
        dataset: Anything ``pd.DataFrame`` accepts (e.g. a list of dicts)
            whose records provide ``id``, ``article`` and ``highlights``.
        num_samples: Maximum number of leading records to evaluate.

    Returns:
        A list with one flat dict per evaluated record. Keys are ``id``,
        ``highlight`` and ``<model>_<search>_<field>`` where model is
        ``ba``/``fo``/``baseline``, search is ``linear``/``binary``/
        ``exclusion`` and field is ``citation``/``score``/``perplexity``/
        ``emb_similarity``/``rougeL_fmeasure``/``tfidf``. Records whose
        highlight yields no sentences are skipped.
    """
    fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()
    sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    df_dataset = pd.DataFrame(dataset).iloc[:num_samples]

    linear_results = linear_attribution_search(df_dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer)
    dataset_list = df_dataset.to_dict('records')

    search_types = ('linear', 'binary', 'exclusion')
    searched_models = ('ba', 'fo')
    fields = ('citation', 'score', 'perplexity', 'emb_similarity', 'rougeL_fmeasure', 'tfidf')

    def _text_metrics(highlight_text, highlight_emb, article_sentences, citation):
        # Text-similarity metrics between the highlight and one candidate citation.
        citation_emb = sentence_transformer.encode(citation)
        return {
            'emb_similarity': util.cos_sim([highlight_emb], [citation_emb])[0][0].item(),
            'rougeL_fmeasure': scorer.score(highlight_text, citation)['rougeL'].fmeasure,
            'tfidf': calculate_tfidf_score(highlight_text, article_sentences, citation),
        }

    results = []
    for i in tqdm(range(min(num_samples, len(dataset_list)))):
        record = dataset_list[i]
        article = record['article']
        highlight_sentences = sent_tokenize(record['highlights'])
        if not highlight_sentences:
            continue
        first_highlight = highlight_sentences[0]

        # NOTE(review): assumes linear_attribution_search returns one entry per
        # row of df_dataset, in row order — confirm against its implementation.
        linear_result = linear_results[i]

        ba_binary = binary_search_citation(article, first_highlight, ba_model, ba_tokenizer, backward=True)
        fo_binary = binary_search_citation(article, first_highlight, fo_model, fo_tokenizer)

        ba_exclusion = exclusion_search_citation(article, first_highlight, ba_model, ba_tokenizer, backward=True)
        fo_exclusion = exclusion_search_citation(article, first_highlight, fo_model, fo_tokenizer)

        # Baseline score: same arguments for every search type, so call it once.
        baseline = calculate_baseline_score(article, first_highlight, fo_model, fo_tokenizer)

        highlight_emb = sentence_transformer.encode(first_highlight)
        sentences = sent_tokenize(article)

        # Normalise every search result to the same citation/score/perplexity shape.
        picks = {
            'linear': {
                model: {
                    'citation': linear_result[f'{model}_citation'],
                    'score': linear_result[f'{model}_score'],
                    'perplexity': linear_result[f'{model}_perplexity'],
                }
                for model in searched_models
            },
            'binary': {'ba': ba_binary, 'fo': fo_binary},
            'exclusion': {'ba': ba_exclusion, 'fo': fo_exclusion},
        }

        row = {'id': record['id'], 'highlight': first_highlight}
        fo_by_search = {}
        for search in search_types:
            merged = {}
            for model in searched_models:
                pick = picks[search][model]
                merged[model] = {
                    'citation': pick['citation'],
                    'score': pick['score'],
                    'perplexity': pick['perplexity'],
                    **_text_metrics(first_highlight, highlight_emb, sentences, pick['citation']),
                }
            # Field-major, model-minor insertion keeps the column order stable:
            # ba_<search>_citation, fo_<search>_citation, ba_<search>_score, ...
            for field in fields:
                for model in searched_models:
                    row[f'{model}_{search}_{field}'] = merged[model][field]
            fo_by_search[search] = merged['fo']

        # Baseline model added: reuses the FO citation and its text metrics,
        # only score and perplexity come from the baseline scorer.
        for search in search_types:
            fo_entry = fo_by_search[search]
            row[f'baseline_{search}_citation'] = fo_entry['citation']
            row[f'baseline_{search}_score'] = baseline['normalized_log_prob']
            row[f'baseline_{search}_perplexity'] = baseline['perplexity']
            row[f'baseline_{search}_emb_similarity'] = fo_entry['emb_similarity']
            row[f'baseline_{search}_rougeL_fmeasure'] = fo_entry['rougeL_fmeasure']
            row[f'baseline_{search}_tfidf'] = fo_entry['tfidf']

        results.append(row)

    return results

In [16]:
def display_comparison_results(results):
    """Summarise per-record evaluation results into two DataFrames.

    Args:
        results: Records accepted by ``pd.DataFrame``; each must contain
            ``highlight`` plus, for every model (ba/fo/baseline) and search
            type (linear/binary/exclusion), the columns
            ``<model>_<search>_citation`` and the five numeric metrics.

    Returns:
        ``(comparison_df, display_df)``: the first holds the mean of each
        metric (rows) per "MODEL Search" combination (columns); the second is
        the per-record table restricted to ``highlight`` and the metric
        columns, grouped by search type and then by model.

    Side effects: prints the column names of the results frame and changes
    the global pandas ``display.max_columns`` / ``display.max_colwidth``
    options.
    """
    model_types = ('ba', 'fo', 'baseline')
    search_types = ('linear', 'binary', 'exclusion')
    numeric_fields = ('score', 'perplexity', 'emb_similarity', 'rougeL_fmeasure', 'tfidf')

    frame = pd.DataFrame(results)

    # Column names of the results frame (debugging aid)
    print("Columns in results_df:", frame.columns.tolist())

    # Average every numeric metric for each model/search combination.
    averages = {}
    for model in model_types:
        for search in search_types:
            label = f'{model.upper()} {search.capitalize()}'
            stem = f'{model}_{search}_'
            averages[label] = {
                field: frame[f'{stem}{field}'].mean()
                for field in numeric_fields
            }
    comparison_df = pd.DataFrame(averages)

    # Detailed view: highlight first, then one six-column group per
    # (search, model) pair, search-major so BA/FO/Baseline sit side by side.
    ordered_columns = ['highlight']
    for search in search_types:
        for model in model_types:
            stem = f'{model}_{search}_'
            ordered_columns.append(f'{stem}citation')
            ordered_columns.extend(f'{stem}{field}' for field in numeric_fields)
    display_df = frame[ordered_columns]

    # Widen pandas output so the full table is shown
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 50)

    return comparison_df, display_df

# Execute the full comparison.
# Requests 50 samples from load_cnn_dataset, but the evaluation below is
# limited to the first 10 of them via its own num_samples argument.
dataset = load_cnn_dataset(num_samples=50)
comparison_results = evaluate_citations_with_linear_binary_exclusion(dataset, num_samples=10)
comparison_df, display_df = display_comparison_results(comparison_results)

# Mean of each metric per model/search combination, rounded for readability.
print("Comparison of Average Metrics:")
print(comparison_df.round(4))
print("\nDetailed Results:")
# Trailing bare expression: as the last statement of a notebook cell this
# renders the per-record table; it has no effect when run as a plain script.
display_df

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

100%|██████████| 10/10 [00:26<00:00,  2.63s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Binary Search (Backward=True): s=0, t=23, Mid=11, s1=-2.977705282264682, s2=-3.194227808243272
Binary Search (Backward=True): s=0, t=11, Mid=5, s1=-3.005053265259954, s2=-3.1527275183372483
Binary Search (Backward=True): s=0, t=5, Mid=2, s1=-2.856931128816732, s2=-3.658465028547803
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-2.9076363216575167, s2=-3.2597458041810436
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-2.670148073404129, s2=-3.2587536024251884
Binary Search (Backward=True) Final: Score=-2.670148073404129, Citation=LONDON, England (Reuters) -- Harry Potter star Dan...
Binary Search (Backward=False): s=0, t=23, Mid=11, s1=-2.9589545213599053, s2=-3.2560818978961974
Binary Search (Backward=False): s=0, t=11, Mid=5, s1=-3.0057152874467, s2=-3.2581350867401944
Binary Search (Backward=False): s=0, t=5, Mid=2, s1=-2.790441799686955, s2=-3.8938452826852963
Binary Search (Backward=False): s=0, t=2, Mid=1, s1=-2.780818273672728, s2=-3.468478783882734
Binary Search (Backwa

  a = torch.tensor(a)
 10%|█         | 1/10 [00:16<02:29, 16.62s/it]

Binary Search (Backward=True): s=0, t=44, Mid=22, s1=-2.9671204877814654, s2=-2.975111526736919
Binary Search (Backward=True): s=0, t=22, Mid=11, s1=-3.0037087234340594, s2=-3.08427234742647
Binary Search (Backward=True): s=0, t=11, Mid=5, s1=-2.908400140835885, s2=-3.2827093003202528
Binary Search (Backward=True): s=0, t=5, Mid=2, s1=-2.9966691036753654, s2=-3.005916555245996
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-3.230420726410803, s2=-2.6425296956716364
Binary Search (Backward=True) Final: Score=-2.6425296956716364, Citation=MIAMI, Florida (CNN) -- The ninth floor of the Mia...
Binary Search (Backward=False): s=0, t=44, Mid=22, s1=-3.044584807817026, s2=-2.9681757009098915
Binary Search (Backward=False): s=23, t=44, Mid=33, s1=-3.2127016893258706, s2=-2.979435335091113
Binary Search (Backward=False): s=34, t=44, Mid=39, s1=-3.086394142300387, s2=-3.1979930418953697
Binary Search (Backward=False): s=34, t=39, Mid=36, s1=-4.565123826863146, s2=-2.9951805598655086
Binary S

 20%|██        | 2/10 [00:57<04:05, 30.71s/it]

Binary Search (Backward=True): s=0, t=48, Mid=24, s1=-3.01729378573465, s2=-3.00626511472068
Binary Search (Backward=True): s=25, t=48, Mid=36, s1=-3.4040801081248135, s2=-2.934284960946225
Binary Search (Backward=True): s=37, t=48, Mid=42, s1=-2.8859658421256547, s2=-3.2367396668159136
Binary Search (Backward=True): s=37, t=42, Mid=39, s1=-3.049313955700673, s2=-3.0619891535139727
Binary Search (Backward=True): s=37, t=39, Mid=38, s1=-3.0428981741491925, s2=-5.015040054455315
Binary Search (Backward=True): s=37, t=38, Mid=37, s1=-3.561259907625085, s2=-3.3691779581691157
Binary Search (Backward=True) Final: Score=-3.3691779581691157, Citation="I had no idea there was a vehicle on my car," she...
Binary Search (Backward=False): s=0, t=48, Mid=24, s1=-3.056426712570785, s2=-3.084592302169313
Binary Search (Backward=False): s=0, t=24, Mid=12, s1=-3.1435165964722014, s2=-3.3252093764493096
Binary Search (Backward=False): s=0, t=12, Mid=6, s1=-3.1765230640089697, s2=-3.4757252258183335
Bin

 30%|███       | 3/10 [01:38<04:10, 35.76s/it]

Binary Search (Backward=True): s=0, t=23, Mid=11, s1=-2.9137138040644963, s2=-3.037734767064977
Binary Search (Backward=True): s=0, t=11, Mid=5, s1=-2.8379738153987, s2=-3.3214874388338034
Binary Search (Backward=True): s=0, t=5, Mid=2, s1=-2.445381627616076, s2=-3.939482203755474
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-2.4936735789862685, s2=-3.0125366941213825
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-2.5536090374220324, s2=-2.7161189403082373
Binary Search (Backward=True) Final: Score=-2.5536090374220324, Citation=WASHINGTON (CNN) -- Doctors removed five small pol...
Binary Search (Backward=False): s=0, t=23, Mid=11, s1=-2.9364513471476634, s2=-3.313540162716507
Binary Search (Backward=False): s=0, t=11, Mid=5, s1=-2.8676265547508204, s2=-3.4064257875220374
Binary Search (Backward=False): s=0, t=5, Mid=2, s1=-2.747067384721152, s2=-3.95527159681326
Binary Search (Backward=False): s=0, t=2, Mid=1, s1=-2.7921887598152573, s2=-4.4316741951198075
Binary Search (Bac

 40%|████      | 4/10 [01:53<02:43, 27.19s/it]

Binary Search (Backward=True): s=0, t=45, Mid=22, s1=-2.891959569805292, s2=-2.964761213340547
Binary Search (Backward=True): s=0, t=22, Mid=11, s1=-2.657708145957144, s2=-3.106754263386592
Binary Search (Backward=True): s=0, t=11, Mid=5, s1=-2.3781280702699434, s2=-3.0896664394279103
Binary Search (Backward=True): s=0, t=5, Mid=2, s1=-2.3441738662050904, s2=-2.642459305132955
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-2.3733873217769887, s2=-3.1268996188312244
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-2.465530031657023, s2=-2.4883462192315124
Binary Search (Backward=True) Final: Score=-2.465530031657023, Citation=(CNN)  -- The National Football League has indefin...
Binary Search (Backward=False): s=0, t=45, Mid=22, s1=-2.925811837767673, s2=-3.037706487611813
Binary Search (Backward=False): s=0, t=22, Mid=11, s1=-2.7614377883995647, s2=-3.163700514953601
Binary Search (Backward=False): s=0, t=11, Mid=5, s1=-2.703882420463007, s2=-3.1860963573930197
Binary Search (B

 50%|█████     | 5/10 [02:47<03:04, 36.95s/it]

Binary Search (Backward=True): s=0, t=39, Mid=19, s1=-2.923807924345239, s2=-3.4275217289134265
Binary Search (Backward=True): s=0, t=19, Mid=9, s1=-2.818586062465763, s2=-3.2332808181850874
Binary Search (Backward=True): s=0, t=9, Mid=4, s1=-3.2488098505142258, s2=-2.5158344374655055
Binary Search (Backward=True): s=5, t=9, Mid=7, s1=-2.5921416265915687, s2=-2.7517203905453154
Binary Search (Backward=True): s=5, t=7, Mid=6, s1=-2.58058241201018, s2=-2.8189801193542054
Binary Search (Backward=True): s=5, t=6, Mid=5, s1=-2.696183912605801, s2=-3.0991470612564274
Binary Search (Backward=True) Final: Score=-2.696183912605801, Citation="I was so happy I didn't know what to do with myse...
Binary Search (Backward=False): s=0, t=39, Mid=19, s1=-3.052446282957539, s2=-3.3989004522821227
Binary Search (Backward=False): s=0, t=19, Mid=9, s1=-2.992599722017947, s2=-3.430678824542376
Binary Search (Backward=False): s=0, t=9, Mid=4, s1=-3.46503336268747, s2=-2.6975346180343833
Binary Search (Backw

 60%|██████    | 6/10 [03:21<02:24, 36.00s/it]

Binary Search (Backward=True): s=0, t=52, Mid=26, s1=-3.1136766316950997, s2=-3.121011797255734
Binary Search (Backward=True): s=0, t=26, Mid=13, s1=-3.190051389330659, s2=-3.2599216472195205
Binary Search (Backward=True): s=0, t=13, Mid=6, s1=-3.238249364599208, s2=-3.399305550513923
Binary Search (Backward=True): s=0, t=6, Mid=3, s1=-3.053728492672908, s2=-3.651611932606123
Binary Search (Backward=True): s=0, t=3, Mid=1, s1=-2.880086037517533, s2=-3.80382831871904
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-3.3986264074894357, s2=-3.0622138784312907
Binary Search (Backward=True) Final: Score=-3.0622138784312907, Citation=They have been driven to sell their bodies to put ...
Binary Search (Backward=False): s=0, t=52, Mid=26, s1=-3.1345751694280968, s2=-3.094082090111405
Binary Search (Backward=False): s=27, t=52, Mid=39, s1=-3.166216303267796, s2=-3.1451937723908827
Binary Search (Backward=False): s=40, t=52, Mid=46, s1=-3.2548924105675123, s2=-3.2384950344686874
Binary Search

 70%|███████   | 7/10 [04:14<02:04, 41.52s/it]

Binary Search (Backward=True): s=0, t=13, Mid=6, s1=-2.3498920389105766, s2=-3.072667671222796
Binary Search (Backward=True): s=0, t=6, Mid=3, s1=-2.2992547498653177, s2=-2.6253501769523293
Binary Search (Backward=True): s=0, t=3, Mid=1, s1=-2.5190265268911056, s2=-2.2189464616391974
Binary Search (Backward=True): s=2, t=3, Mid=2, s1=-2.3624139514827425, s2=-2.3737139982801616
Binary Search (Backward=True) Final: Score=-2.3624139514827425, Citation=Tomas Medina Caracas, known popularly as "El Negro...
Binary Search (Backward=False): s=0, t=13, Mid=6, s1=-2.471976067379738, s2=-3.0937561600533403
Binary Search (Backward=False): s=0, t=6, Mid=3, s1=-2.49854833160305, s2=-2.6650493730425233
Binary Search (Backward=False): s=0, t=3, Mid=1, s1=-2.8342506771271196, s2=-2.4522183277418255
Binary Search (Backward=False): s=2, t=3, Mid=2, s1=-2.5568817510016077, s2=-2.629907670220384
Binary Search (Backward=False) Final: Score=-2.5568817510016077, Citation=Tomas Medina Caracas, known popularly 

 80%|████████  | 8/10 [04:22<01:01, 30.98s/it]

Binary Search (Backward=True): s=0, t=32, Mid=16, s1=-2.703443656561015, s2=-2.8507435641644983
Binary Search (Backward=True): s=0, t=16, Mid=8, s1=-2.379347749300833, s2=-3.1266311310297445
Binary Search (Backward=True): s=0, t=8, Mid=4, s1=-2.0952336488236907, s2=-3.0462891638784373
Binary Search (Backward=True): s=0, t=4, Mid=2, s1=-1.8583584046396189, s2=-2.8375439341637527
Binary Search (Backward=True): s=0, t=2, Mid=1, s1=-1.7544277502923344, s2=-2.6395366975000107
Binary Search (Backward=True): s=0, t=1, Mid=0, s1=-1.7866121077149346, s2=-2.10315935432421
Binary Search (Backward=True) Final: Score=-1.7866121077149346, Citation=WASHINGTON (CNN) -- White House press secretary To...
Binary Search (Backward=False): s=0, t=32, Mid=16, s1=-2.5825431540491244, s2=-2.9593954326975243
Binary Search (Backward=False): s=0, t=16, Mid=8, s1=-2.337524977169851, s2=-3.050067182664002
Binary Search (Backward=False): s=0, t=8, Mid=4, s1=-2.1187463717254342, s2=-3.0226919221498405
Binary Search (

 90%|█████████ | 9/10 [04:46<00:28, 28.64s/it]

Binary Search (Backward=True): s=0, t=15, Mid=7, s1=-2.6316915327754327, s2=-3.6764438629908307
Binary Search (Backward=True): s=0, t=7, Mid=3, s1=-2.6229269637019836, s2=-2.917093357197552
Binary Search (Backward=True): s=0, t=3, Mid=1, s1=-2.963574449640481, s2=-2.566943720648975
Binary Search (Backward=True): s=2, t=3, Mid=2, s1=-2.5236784226803888, s2=-2.87421368715628
Binary Search (Backward=True) Final: Score=-2.5236784226803888, Citation=The launcher has been turned over to U.S. Army off...
Binary Search (Backward=False): s=0, t=15, Mid=7, s1=-2.877929814036636, s2=-3.821008775461497
Binary Search (Backward=False): s=0, t=7, Mid=3, s1=-2.901142079992057, s2=-3.137883455941682
Binary Search (Backward=False): s=0, t=3, Mid=1, s1=-3.1680399311049996, s2=-2.954353657821384
Binary Search (Backward=False): s=2, t=3, Mid=2, s1=-3.2003646629309177, s2=-3.4045958602931856
Binary Search (Backward=False) Final: Score=-3.2003646629309177, Citation=The launcher has been turned over to U.S. A

100%|██████████| 10/10 [04:56<00:00, 29.60s/it]

Columns in results_df: ['id', 'highlight', 'ba_linear_citation', 'fo_linear_citation', 'ba_linear_score', 'fo_linear_score', 'ba_linear_perplexity', 'fo_linear_perplexity', 'ba_linear_emb_similarity', 'fo_linear_emb_similarity', 'ba_linear_rougeL_fmeasure', 'fo_linear_rougeL_fmeasure', 'ba_linear_tfidf', 'fo_linear_tfidf', 'ba_binary_citation', 'fo_binary_citation', 'ba_binary_score', 'fo_binary_score', 'ba_binary_perplexity', 'fo_binary_perplexity', 'ba_binary_emb_similarity', 'fo_binary_emb_similarity', 'ba_binary_rougeL_fmeasure', 'fo_binary_rougeL_fmeasure', 'ba_binary_tfidf', 'fo_binary_tfidf', 'ba_exclusion_citation', 'fo_exclusion_citation', 'ba_exclusion_score', 'fo_exclusion_score', 'ba_exclusion_perplexity', 'fo_exclusion_perplexity', 'ba_exclusion_emb_similarity', 'fo_exclusion_emb_similarity', 'ba_exclusion_rougeL_fmeasure', 'fo_exclusion_rougeL_fmeasure', 'ba_exclusion_tfidf', 'fo_exclusion_tfidf', 'baseline_linear_citation', 'baseline_linear_score', 'baseline_linear_perpl




Unnamed: 0,highlight,ba_linear_citation,ba_linear_score,ba_linear_perplexity,ba_linear_emb_similarity,ba_linear_rougeL_fmeasure,ba_linear_tfidf,fo_linear_citation,fo_linear_score,fo_linear_perplexity,fo_linear_emb_similarity,fo_linear_rougeL_fmeasure,fo_linear_tfidf,baseline_linear_citation,baseline_linear_score,baseline_linear_perplexity,baseline_linear_emb_similarity,baseline_linear_rougeL_fmeasure,baseline_linear_tfidf,ba_binary_citation,ba_binary_score,ba_binary_perplexity,ba_binary_emb_similarity,ba_binary_rougeL_fmeasure,ba_binary_tfidf,fo_binary_citation,fo_binary_score,fo_binary_perplexity,fo_binary_emb_similarity,fo_binary_rougeL_fmeasure,fo_binary_tfidf,baseline_binary_citation,baseline_binary_score,baseline_binary_perplexity,baseline_binary_emb_similarity,baseline_binary_rougeL_fmeasure,baseline_binary_tfidf,ba_exclusion_citation,ba_exclusion_score,ba_exclusion_perplexity,ba_exclusion_emb_similarity,ba_exclusion_rougeL_fmeasure,ba_exclusion_tfidf,fo_exclusion_citation,fo_exclusion_score,fo_exclusion_perplexity,fo_exclusion_emb_similarity,fo_exclusion_rougeL_fmeasure,fo_exclusion_tfidf,baseline_exclusion_citation,baseline_exclusion_score,baseline_exclusion_perplexity,baseline_exclusion_emb_similarity,baseline_exclusion_rougeL_fmeasure,baseline_exclusion_tfidf
0,Harry Potter star Daniel Radcliffe gets £20M f...,All rights reserved.This material may not be p...,-1.0776,2.9377,-0.0652,0.0,0.4307,All rights reserved.This material may not be p...,-1.823,6.1904,-0.0652,0.0,0.4307,All rights reserved.This material may not be p...,-3.7769,43.681,-0.0652,0.0,0.4307,"LONDON, England (Reuters) -- Harry Potter star...",-2.6701,14.4421,0.9102,0.44,0.4329,"LONDON, England (Reuters) -- Harry Potter star...",-2.6466,14.1064,0.9102,0.44,0.4329,"LONDON, England (Reuters) -- Harry Potter star...",-3.7769,43.681,0.9102,0.44,0.4329,All rights reserved.This material may not be p...,-1.0776,2.9377,-0.0652,0.0,0.4307,"LONDON, England (Reuters) -- Harry Potter star...",-2.6466,14.1064,0.9102,0.44,0.4329,"LONDON, England (Reuters) -- Harry Potter star...",-3.7769,43.681,0.9102,0.44,0.4329
1,Mentally ill inmates in Miami are housed on th...,"""I am the son of the president.",-2.6203,13.7398,0.3323,0.25,0.3938,"MIAMI, Florida (CNN) -- The ninth floor of the...",-2.4918,12.0827,0.5824,0.1791,0.4079,"MIAMI, Florida (CNN) -- The ninth floor of the...",-2.736,15.4249,0.5824,0.1791,0.4079,"MIAMI, Florida (CNN) -- The ninth floor of the...",-2.6425,14.0487,0.5824,0.1791,0.4079,Leifman says in 1955 there were more than half...,-3.1904,24.2988,0.4099,0.1176,0.4128,Leifman says in 1955 there were more than half...,-2.736,15.4249,0.4099,0.1176,0.4128,Leifman says about one-third of all people in ...,-2.7332,15.3815,0.6664,0.0923,0.4105,"You need to get me out of here!""",-4.8839,132.1492,0.1211,0.0351,0.4079,"You need to get me out of here!""",-2.736,15.4249,0.1211,0.0351,0.4079
2,"NEW: ""I thought I was going to die,"" driver sa...","""I knew the deck was going down, there was no ...",-2.3778,10.781,0.5736,0.4242,0.4346,"""I knew the deck was going down, there was no ...",-2.7926,16.3238,0.5736,0.4242,0.4346,"""I knew the deck was going down, there was no ...",-3.8858,48.7038,0.5736,0.4242,0.4346,"""I had no idea there was a vehicle on my car,""...",-3.3692,29.0546,0.4661,0.1667,0.4471,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...",-3.5209,33.8159,0.3882,0.0714,0.4476,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...",-3.8858,48.7038,0.3882,0.0714,0.4476,"""I realized there was a school bus right next ...",-3.1096,22.4131,0.3848,0.1538,0.4454,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...",-3.5209,33.8159,0.3882,0.0714,0.4476,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...",-3.8858,48.7038,0.3882,0.0714,0.4476
3,"Five small polyps found during procedure; ""non...",A colonoscopy is the most sensitive test for c...,-2.4613,11.7206,0.342,0.0541,0.3985,The procedure was supervised by Dr. Richard Tu...,-3.0601,21.329,0.3074,0.05,0.4105,The procedure was supervised by Dr. Richard Tu...,-4.9624,142.9355,0.3074,0.05,0.4105,WASHINGTON (CNN) -- Doctors removed five small...,-2.5536,12.8534,0.69,0.3636,0.3924,WASHINGTON (CNN) -- Doctors removed five small...,-3.0771,21.6956,0.69,0.3636,0.3924,WASHINGTON (CNN) -- Doctors removed five small...,-4.9624,142.9355,0.69,0.3636,0.3924,The polyps were removed and sent to the Nation...,-2.7161,15.1215,0.6356,0.1212,0.3979,The polyps were removed and sent to the Nation...,-3.3033,27.2027,0.6356,0.1212,0.3979,The polyps were removed and sent to the Nation...,-4.9624,142.9355,0.6356,0.1212,0.3979
4,"NEW: NFL chief, Atlanta Falcons owner critical...","Vick, 27, is scheduled to appear Monday in cou...",-2.1516,8.5987,0.4278,0.0645,0.2073,The charge is punishable by up to five years i...,-2.8822,17.8526,0.0371,0.0476,0.2037,The charge is punishable by up to five years i...,-5.0741,159.8212,0.0371,0.0476,0.2037,(CNN) -- The National Football League has ind...,-2.4655,11.7697,0.5499,0.2424,0.1936,(CNN) -- The National Football League has ind...,-3.0224,20.5398,0.5499,0.2424,0.1936,(CNN) -- The National Football League has ind...,-5.0741,159.8212,0.5499,0.2424,0.1936,The charge is punishable by up to five years i...,-2.1657,8.721,0.0371,0.0476,0.2037,"Your team, the NFL, and NFL fans have all been...",-2.952,19.1449,0.6781,0.1111,0.196,"Your team, the NFL, and NFL fans have all been...",-5.0741,159.8212,0.6781,0.1111,0.196
5,"Parents beam with pride, can't stop from smili...","""We just want to thank everyone who has come f...",-2.4802,11.9435,0.2002,0.0,0.2322,His father said he was on the roof of his hous...,-3.0645,21.4236,0.2225,0.2051,0.2355,His father said he was on the roof of his hous...,-5.2967,199.6713,0.2225,0.2051,0.2355,"""I was so happy I didn't know what to do with ...",-2.6962,14.8231,0.4127,0.1081,0.2325,"""I was so happy I didn't know what to do with ...",-3.0813,21.7877,0.4127,0.1081,0.2325,"""I was so happy I didn't know what to do with ...",-5.2967,199.6713,0.4127,0.1081,0.2325,"Shortly after Youssif's story aired Wednesday,...",-3.1283,22.8352,0.1686,0.0294,0.2327,"Shortly after Youssif's story aired Wednesday,...",-3.358,28.7327,0.1686,0.0294,0.2327,"Shortly after Youssif's story aired Wednesday,...",-5.2967,199.6713,0.1686,0.0294,0.2327
6,"Aid workers: Violence, increased cost of livin...","""At first I rejected it, but then I realized I...",-2.951,19.1248,0.015,0.08,0.5183,"She adds, ""There is a huge population of women...",-3.2221,25.0807,0.4129,0.1538,0.5164,"She adds, ""There is a huge population of women...",-4.0021,54.7128,0.4129,0.1538,0.5164,They have been driven to sell their bodies to ...,-3.0622,21.3748,0.347,0.0571,0.5183,"""It's not like we were born into this, nor was...",-3.4451,31.3459,0.0525,0.0,0.52,"""It's not like we were born into this, nor was...",-4.0021,54.7128,0.0525,0.0,0.52,I have to do anything that I can to preserve m...,-3.0969,22.1284,0.3275,0.1143,0.5183,"""At this point there is a population of women ...",-3.3598,28.7829,0.3701,0.125,0.5164,"""At this point there is a population of women ...",-4.0021,54.7128,0.3701,0.125,0.5164
7,Tomas Medina Caracas was a fugitive from a U.S...,"BOGOTA, Colombia (CNN) -- A key rebel commande...",-2.3433,10.4151,0.6413,0.383,0.3552,"BOGOTA, Colombia (CNN) -- A key rebel commande...",-2.5102,12.3074,0.6413,0.383,0.3552,"BOGOTA, Colombia (CNN) -- A key rebel commande...",-1.7811,5.9365,0.6413,0.383,0.3552,"Tomas Medina Caracas, known popularly as ""El N...",-2.3624,10.6165,0.6908,0.3019,0.3606,"Tomas Medina Caracas, known popularly as ""El N...",-2.5569,12.8955,0.6908,0.3019,0.3606,"Tomas Medina Caracas, known popularly as ""El N...",-1.7811,5.9365,0.6908,0.3019,0.3606,"Tomas Medina Caracas, known popularly as ""El N...",-2.3624,10.6165,0.6908,0.3019,0.3606,"Tomas Medina Caracas, known popularly as ""El N...",-2.5569,12.8955,0.6908,0.3019,0.3606,"Tomas Medina Caracas, known popularly as ""El N...",-1.7811,5.9365,0.6908,0.3019,0.3606
8,"President Bush says Tony Snow ""will battle can...",WASHINGTON (CNN) -- White House press secretar...,-1.7866,5.9692,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-2.0669,7.9006,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-3.621,37.3741,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-1.7866,5.9692,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-2.0669,7.9006,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-3.621,37.3741,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-1.7866,5.9692,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-2.0669,7.9006,0.7224,0.2609,0.3718,WASHINGTON (CNN) -- White House press secretar...,-3.621,37.3741,0.7224,0.2609,0.3718
9,Empty anti-tank weapon turns up in front of Ne...,The launcher has been turned over to U.S. Army...,-2.5237,12.4744,0.3719,0.1463,0.3223,Army officials said they could not determine i...,-3.188,24.24,0.2356,0.0,0.3313,Army officials said they could not determine i...,-4.4657,86.9858,0.2356,0.0,0.3313,The launcher has been turned over to U.S. Army...,-2.5237,12.4744,0.3719,0.1463,0.3223,The launcher has been turned over to U.S. Army...,-3.2004,24.5415,0.3719,0.1463,0.3223,The launcher has been turned over to U.S. Army...,-4.4657,86.9858,0.3719,0.1463,0.3223,(CNN) -- Police and FBI agents are investigati...,-2.9535,19.1728,0.4042,0.2791,0.3155,(CNN) -- Police and FBI agents are investigati...,-3.2542,25.8987,0.4042,0.2791,0.3155,(CNN) -- Police and FBI agents are investigati...,-4.4657,86.9858,0.4042,0.2791,0.3155
