# Scoring and Citations Testbed
---

The objective of this notebook is to:
1. Experiment and figure out how to perform Scoring as described in the [paper](papers/TRLM_2412.02626.pdf)
2. Experiment with linear search for citation attribution

To further explore: 
1. Experiment with binary and exclusion search
2. Experiment with retrieval

## Import Libraries

In [1]:
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize


In [2]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:.4f}'.format)

## Define Util Functions

In [3]:
device="cuda" if t.cuda.is_available() else "cpu"

In [4]:
# TODO: may be helpful to move this over to a utils.py later, or define the models as a separate classes?

# Load models
def load_models():
    # Forward model
    fo_model = GPTNeoXForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    ).to(device)
    
    fo_tokenizer = AutoTokenizer.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    )
    
    # Backward model
    ba_model = GPTNeoXForCausalLM.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    ).to(device)
    
    ba_tokenizer = AutoTokenizer.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    )
    
    return fo_model, fo_tokenizer, ba_model, ba_tokenizer

In [5]:
# TODO : same with above

# Load dataset
def load_cnn_dataset(num_samples=10):
    try:
        # Try with a specific cache directory
        dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir=".cache")
        print("Dataset loaded successfully")
        
        # Verify the structure - this helps debug
        if num_samples > 0:
            print("Example dataset item:", dataset['train'][0])
            
        # Take only a small sample for testing
        if hasattr(dataset, 'train'):
            return dataset['train'].select(range(min(num_samples, len(dataset['train']))))
        
        return dataset['train'][:num_samples]
        
    except Exception as e:
        print(f"Error loading full dataset: {e}")
        
        # Create a tiny synthetic dataset for testing
        print("Creating synthetic test dataset instead...")
        
        sample_data = {
            'article': [
                "John likes to play basketball. He goes to the court every evening. His friends join him on weekends.",
                "The company announced record profits. Investors were pleased. The stock price increased by 10%."
            ],
            'highlights': [
                "John plays basketball regularly with friends.",
                "Company profits lead to stock price increase."
            ],
            'id': ['test1', 'test2']  # Added ID field
        }
        
        return Dataset.from_dict(sample_data)

In [6]:
def calculate_llm_score(query, answer, model, tokenizer, task='citation', backward=False, debug=False):
    """
    Calculate log probability of response given prompt or vice versa.
    
    Args:
        query (str): The prompt text
        answer (str): The response text
        model: The language model
        tokenizer: The corresponding tokenizer
        direction (str): "forward" for P(response|prompt) or "backward" for P(prompt|response)
    
    Returns:
        dict: Contains token-wise and sequence log probabilities
    """
    
    # The paper describes "Score" as conditional distribution (Section 4) which means the Log Probability, and therefore
    # this reimplementation uses Log Probability.
    
    # The notation used here is P(Query|Answer) to make it easier to compare with the paper
    
    # First, prepare the texts
    if not backward: 
        #Forward
        conditioning_prompt = ' is a summary of ' if task =='citation' else ' has an answer to '
    else:
        #Backward
        conditioning_prompt = ' is summarized by ' if task =='citation' else ' is answered by '
    
    # DEBUG 
    if debug:
        print(f"Context: {answer + conditioning_prompt}")
        print(f"Target: {query}")
    
    # convert to tokens, but because the model is auto-regressive, it predicts left -> right 
    # (i.e. tokens at t, t+1, t+2 predicts the t+3, t+4)
    # so we may need to reverse the tokens? 
    # input_ids = tokenizer.encode(query + (conditioning_prompt + answer), return_tensors="pt").to(model.device)
    
    target_ids = tokenizer.encode(query, return_tensors="pt")
    context_ids = tokenizer.encode(answer + conditioning_prompt, return_tensors="pt")

    # store length to "divide" the texts later
    target_len = target_ids.shape[1]
    context_len = context_ids.shape[1]
    
    if backward:
        # We need to reverse the tokens in backward
        target_ids = t.flip(target_ids, (1,))
        context_ids = t.flip(context_ids, (1,))

    input_ids = t.cat((context_ids, target_ids), dim=1).to(model.device)

    # Get model output
    with t.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    
    # Extract token probabilities for the target text
    token_probs = []
    # Because the text is (query+answer), we just want to get the (answer) logits
    for i in range(context_len - 1, context_len + target_len - 1):
        
        # essentially, get the probability for the actual token at sequence, i.e. 
        # if "Harry Potter is the boy who survived" and i = 5
        # then we get the probabilities of the model output up until "who"
        # and find what's the probability of "survived"
        
        # get the logits [batch_size, sequence_length, vocabulary_size]
        next_token_logits = logits[0, i, :]  # no batch, sequence i, all vocab
        
        # get the actual token
        next_token_id = input_ids[0, i+1].item()
        
        # Convert logits to probabilities
        next_token_probs = F.softmax(next_token_logits, dim=0)
        prob = next_token_probs[next_token_id].item()
        log_prob = np.log(prob)
        
        token_text = tokenizer.decode([next_token_id])
        token_probs.append({
            'token': token_text,
            'token_id': next_token_id,
            'log_prob': log_prob
        })
    
    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Convert to perplexity if needed
    perplexity = np.exp(-sequence_log_prob / len(token_probs))
    
    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }



In [7]:
# Testing

fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()


# Example Text
sentence = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
highlight = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_highlight = "Daniel Craig is recasted as James Bond again"

# Defining sentence/highlight query/answer is still confusing tho sheesh


# Define prompts
ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)

adv_ba_score = calculate_llm_score(sentence, adverse_highlight, ba_model, ba_tokenizer, backward=True)
adv_fo_score = calculate_llm_score(sentence, adverse_highlight, fo_model, fo_tokenizer)


scores_data = {
    'Model Type': ['Backward', 'Forward', 'Backward', 'Forward'],
    'Highlight': ['Correct', 'Correct', 'Adverse', 'Adverse'],
    'Sequence Log Prob': [
        ba_score['sequence_log_prob'],
        fo_score['sequence_log_prob'],
        adv_ba_score['sequence_log_prob'],
        adv_fo_score['sequence_log_prob']
    ],
    'Normalized Log Prob': [
        ba_score['normalized_log_prob'],
        fo_score['normalized_log_prob'],
        adv_ba_score['normalized_log_prob'],
        adv_fo_score['normalized_log_prob']
    ],
    'Perplexity': [
        ba_score['perplexity'],
        fo_score['perplexity'],
        adv_ba_score['perplexity'],
        adv_fo_score['perplexity']
    ]
}

# Create DataFrame
pd.DataFrame(scores_data)

Unnamed: 0,Model Type,Highlight,Sequence Log Prob,Normalized Log Prob,Perplexity
0,Backward,Correct,-113.2715,-2.7627,15.8429
1,Forward,Correct,-113.7015,-2.7732,16.0099
2,Backward,Adverse,-137.511,-3.3539,28.6148
3,Forward,Adverse,-151.0687,-3.6846,39.8293


##  Citation, Linear Search

In [8]:
dataset = load_cnn_dataset(num_samples=50)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

In [9]:
# Show dataframe
pd.DataFrame(dataset)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell...",Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have be...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,"Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a ja...","Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...",ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. ""The whole bridge from one side of the Mississippi to the other just ...","NEW: ""I thought I was going to die,"" driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: ""I probably had a 30-, 35-foot free fall""\nMinnesota bridge collaps...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,"WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and ""none appeared worrisome,"" a White House spokesman said. The polyps were removed and sent to the ...","Five small polyps found during procedure; ""none worrisome,"" spokesman says .\nPresident reclaims powers transferred to vice president .\nBush undergoes routine colonoscopy at Camp David .",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,"(CNN) -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appea...","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .\nNFL suspends Falcons quarterback indefinitely without pay .\nVick admits funding dogfighting operation but says he did n...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman shirt, 5-year-old Youssif held his sister's hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nea...","Parents beam with pride, can't stop from smiling from outpouring of support .\nMom: ""I was so happy I didn't know what to do""\nBurn center in U.S. has offered to provide treatment for reconstructi...",a1ebb8bb4d370a1fdf28769206d572be60642d70
6,"BAGHDAD, Iraq (CNN) -- The women are too afraid and ashamed to show their faces or have their real names used. They have been driven to sell their bodies to put food on the table for their childre...","Aid workers: Violence, increased cost of living drive women to prostitution .\nGroup is working to raise awareness of the problem with Iraq's political leaders .\nTwo Iraqi mothers tell CNN they t...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264
7,"BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military ...","Tomas Medina Caracas was a fugitive from a U.S. drug trafficking indictment .\n""El Negro Acacio"" allegedly helped manage extensive cocaine network .\nU.S. Justice Department indicted him in 2002 ....",f0d73bdab711763e745cdc75850861c9018f235d
8,"WASHINGTON (CNN) -- White House press secretary Tony Snow, who is undergoing treatment for cancer, will step down from his post September 14 and be replaced by deputy press secretary Dana Perino, ...","President Bush says Tony Snow ""will battle cancer and win"" Job of press secretary ""has been a dream for me,"" Snow says Snow leaving on September 14, will be succeeded by Dana Perino .",5e22bbfc7232418b8d2dd646b952e404df5bd048
9,"(CNN) -- Police and FBI agents are investigating the discovery of an empty rocket launcher tube on the front lawn of a Jersey City, New Jersey, home, FBI spokesman Sean Quinn said. Niranjan Desai ...","Empty anti-tank weapon turns up in front of New Jersey home .\nDevice handed over to Army ordnance disposal unit .\nWeapon not capable of being reloaded, experts say .",613d6311ec2c1985bd44707d1796d275452fe156


In [10]:
def linear_attribution_search(dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer):
    """
    Perform linear attribution search for citations as described in TRLM paper.
    
    For each highlight (summary sentence), find the most likely article sentence
    that it was derived from by scoring all possible pairs.
    """
    results = []
    
    # Process only the first few examples for demonstration
    for idx, example in tqdm(dataset.iterrows(), total=len(dataset)):
        # Split article and highlights into sentences
        article_sentences = sent_tokenize(example['article'])
        highlight_sentences = sent_tokenize(example['highlights'])
        
        # For demonstration, process just the first highlight sentence
        if not highlight_sentences:
            continue
            
        highlight = highlight_sentences[0]
        
        # Store best attribution for each model
        best_ba_sentence = None
        best_ba_score = float('-inf')
        best_fo_sentence = None
        best_fo_score = float('-inf')
        
        # Linear search through all article sentences
        for sentence in article_sentences:
            # Skip very short sentences
            if len(sentence.split()) < 3:
                continue
                
            # Calculate scores using both models
            ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
            fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)
            
            # Track best scores
            if ba_score['normalized_log_prob'] > best_ba_score:
                best_ba_score = ba_score['normalized_log_prob']
                best_ba_sentence = sentence
                
            if fo_score['normalized_log_prob'] > best_fo_score:
                best_fo_score = fo_score['normalized_log_prob']
                best_fo_sentence = sentence
        
        # Add results to our list
        results.append({
            'id': example['id'],
            'highlight': highlight,
            'ba_citation': best_ba_sentence,
            'ba_score': best_ba_score,
            'ba_perplexity': np.exp(-best_ba_score),
            'fo_citation': best_fo_sentence,
            'fo_score': best_fo_score,
            'fo_perplexity': np.exp(-best_fo_score)
        })
    
    
    return results


In [11]:
results = linear_attribution_search(pd.DataFrame(dataset), ba_model, ba_tokenizer, fo_model, fo_tokenizer)
results_df = pd.DataFrame(results)

# Display results in a more readable format
display_df = results_df[['highlight', 'ba_citation', 'ba_score', 'ba_perplexity', 
                         'fo_citation', 'fo_score', 'fo_perplexity']]

  0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
display_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
highlight,Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .,"Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...","NEW: ""I thought I was going to die,"" driver says .","Five small polyps found during procedure; ""none worrisome,"" spokesman says .","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .","Parents beam with pride, can't stop from smiling from outpouring of support .","Aid workers: Violence, increased cost of living drive women to prostitution .",Tomas Medina Caracas was a fugitive from a U.S. drug trafficking indictment .,"President Bush says Tony Snow ""will battle cancer and win"" Job of press secretary ""has been a dream for me,"" Snow says Snow leaving on September 14, will be succeeded by Dana Perino .",Empty anti-tank weapon turns up in front of New Jersey home .,...,"NEW: Chadian president wants journalists, flight crew released .",Earth has warmed one degree in past 100 years .,Ethiopian soldier dragged after battle with Islamic insurgents killed 19 people .,NEW: Judge signs order to exhume the body of Drew Peterson's third wife .,Julia Vakulenko has reached her first final on the WTA Tour at Bell Challenge .,NEW: President Musharraf orders troops to take a television station's equipment .,Robert A.,NEW: Accused pedophile Chester Arthur Stiles gets additional charges .,South Africa lead New Zealand by 287 with 8 wickets standing in the 1st test .,Real Simple tips can add up to great summer .
ba_citation,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","""I am the son of the president.","""I knew the deck was going down, there was no question about it, and I thought I was going to die,"" he said.","A colonoscopy is the most sensitive test for colon cancer, rectal cancer and polyps, small clumps of cells that can become cancerous, according to the Mayo Clinic.","Vick, 27, is scheduled to appear Monday in court, where he is expected to plead guilty before a judge.","""We just want to thank everyone who has come forward,"" he said.","""At first I rejected it, but then I realized I have to do it.""","BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military ...","WASHINGTON (CNN) -- White House press secretary Tony Snow, who is undergoing treatment for cancer, will step down from his post September 14 and be replaced by deputy press secretary Dana Perino, ...","The launcher has been turned over to U.S. Army officials at the 754th Ordnance Company, an explosive ordnance disposal unit, at Fort Monmouth, New Jersey, Army officials said.",...,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","The amount of carbon dioxide in the atmosphere, for instance, has increased by 35 percent since the dawn of the industrial age, according to the United Nations' Intergovernmental Panel on Climate ...","MOGADISHU, Somalia (CNN) -- An enraged crowd dragged the body of an Ethiopian soldier through the streets of Somalia's capital Thursday after gun battles with Islamic insurgents killed 19 people, ...","""Our main thrust is to determine whether or not it was a homicide, and as we do that, we will see if there is any evidence that implicates anyone,"" he said.",Julia Vakulenko will seek her first victory on the WTA Tour at the Bell Challenge in Quebec.,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","If convicted on each count, he would face a sentence of up to 140 years in prison -- up to 20 years for the first count and up to 40 years for each additional count, prosecutors said.","(CNN) -- With his hands and feet shackled and his face obscured by his long hair, Chester Arthur Stiles made his initial court appearance in Las Vegas, Nevada, on Wednesday morning on charges ste...","JOHANNESBURG, South Africa -- South African fast bowler Dale Steyn took a career-best five for 34 as the Proteas took a tight grip on the first test against New Zealand in Johannesburg.",Lazing in a hammock is one of the best ways to spend a summer evening.
ba_score,-1.0776,-2.6203,-2.3778,-2.4613,-2.1516,-2.4802,-2.9510,-2.3433,-1.7866,-2.5237,...,-1.1231,-1.8550,-2.5347,-2.5770,-2.0097,-1.1008,-2.2705,-2.5134,-2.5865,-2.4903
ba_perplexity,2.9377,13.7398,10.7810,11.7206,8.5987,11.9435,19.1248,10.4151,5.9692,12.4744,...,3.0743,6.3915,12.6122,13.1572,7.4607,3.0065,9.6846,12.3463,13.2827,12.0651
fo_citation,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the ""forgotten floor.""","""I knew the deck was going down, there was no question about it, and I thought I was going to die,"" he said.","The procedure was supervised by Dr. Richard Tubb, Bush's physician, and conducted by a multidisciplinary team from the National Naval Medical Center in Bethesda, Maryland, the White House said.","The charge is punishable by up to five years in prison, a $250,000 fine, ""full restitution, a special assessment and 3 years of supervised release,"" the plea deal said.",His father said he was on the roof of his house when CNN called him with the news about the outpouring of support for his son.,"She adds, ""There is a huge population of women who were the victims of war who had to sell their bodies, their souls and they lost it all.","BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military ...","WASHINGTON (CNN) -- White House press secretary Tony Snow, who is undergoing treatment for cancer, will step down from his post September 14 and be replaced by deputy press secretary Dana Perino, ...","Army officials said they could not determine if the launcher had been fired, but indicated they should know once they find out where it came from.",...,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","The amount of carbon dioxide in the atmosphere, for instance, has increased by 35 percent since the dawn of the industrial age, according to the United Nations' Intergovernmental Panel on Climate ...","MOGADISHU, Somalia (CNN) -- An enraged crowd dragged the body of an Ethiopian soldier through the streets of Somalia's capital Thursday after gun battles with Islamic insurgents killed 19 people, ...","In another development, a judge signed an order to exhume the body of Drew Peterson's third wife, who was found drowned in a bathtub in 2004, said Will County State Attorney James Glasgow.",Julia Vakulenko will seek her first victory on the WTA Tour at the Bell Challenge in Quebec.,"All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.","If convicted on each count, he would face a sentence of up to 140 years in prison -- up to 20 years for the first count and up to 40 years for each additional count, prosecutors said.","Prosecutors added a couple more charges before Wednesday's hearing, bringing the total to 23 felony counts, including a charge of lewdness with a minor, sexual assault and the use of a child in th...","JOHANNESBURG, South Africa -- South African fast bowler Dale Steyn took a career-best five for 34 as the Proteas took a tight grip on the first test against New Zealand in Johannesburg.","• Position your backside toward the hammock's center and tilt back until you reach a 45-degree angle, with the hammock parallel to your rear."
fo_score,-1.8230,-2.4918,-2.7926,-3.0601,-2.8822,-3.0645,-3.2221,-2.5102,-2.0669,-3.1880,...,-1.7970,-2.0957,-2.7629,-2.9218,-2.4657,-1.7372,-2.5886,-2.7563,-2.9646,-3.4284
fo_perplexity,6.1904,12.0827,16.3238,21.3290,17.8526,21.4236,25.0807,12.3074,7.9006,24.2400,...,6.0316,8.1314,15.8449,18.5743,11.7715,5.6816,13.3113,15.7413,19.3860,30.8268


## Further Analysis

1. Analyze first sample citations is to check for a single article, what are the sentences' individual scores
2. Evaluate citations and display is to get the benchmark metrics
3. 

In [13]:
def analyze_first_sample_citations(dataset, ba_model, ba_tokenizer, fo_model, fo_tokenizer):
    """
    Analyze all possible citation sentences for the first highlight in the first sample.
    
    This function calculates scores for all article sentences against the first
    highlight sentence and returns them sorted by backward model score.
    """
    
    
    # Get the first sample
    first_sample = dataset.iloc[0]
    
    # Split article into sentences
    article_sentences = sent_tokenize(first_sample['article'])
    
    # Get the first highlight sentence
    highlight_sentences = sent_tokenize(first_sample['highlights'])
    if not highlight_sentences:
        print("No highlight sentences found!")
        return None
    
    highlight = highlight_sentences[0]
    print(f"Analyzing citations for highlight: \n'{highlight}'\n")
    
    # Calculate scores for all article sentences
    results = []
    
    for i, sentence in enumerate(article_sentences):
        # Skip very short sentences
        if len(sentence.split()) < 3:
            continue
            
        # Calculate scores using both models
        ba_score = calculate_llm_score(sentence, highlight, ba_model, ba_tokenizer, backward=True)
        fo_score = calculate_llm_score(sentence, highlight, fo_model, fo_tokenizer)
        
        # Add to results
        results.append({
            'sentence_idx': i,
            'article_sentence': sentence,
            'ba_score': ba_score['normalized_log_prob'],
            'ba_perplexity': ba_score['perplexity'],
            'fo_score': fo_score['normalized_log_prob'],
            'fo_perplexity': fo_score['perplexity']
        })
    
    # Create DataFrame and sort by backward model score (descending)
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('ba_score', ascending=False).reset_index(drop=True)
    
    # Set display options for better readability
    pd.set_option('display.max_colwidth', 70)
    pd.set_option('display.float_format', '{:.4f}'.format)
    
    return results_df

# Example usage:
citation_analysis = analyze_first_sample_citations(pd.DataFrame(dataset), 
                                                  ba_model, ba_tokenizer, 
                                                  fo_model, fo_tokenizer)
citation_analysis

Analyzing citations for highlight: 
'Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .'



Unnamed: 0,sentence_idx,article_sentence,ba_score,ba_perplexity,fo_score,fo_perplexity
0,23,"All rights reserved.This material may not be published, broadcast,...",-1.0776,2.9377,-1.823,6.1904
1,14,"His latest outing as the boy wizard in ""Harry Potter and the Order...",-2.4849,11.9997,-2.9522,19.1488
2,0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe ga...",-2.6701,14.4421,-2.6466,14.1064
3,8,"""I'll definitely have some sort of party,"" he said in an interview.",-3.0738,21.6246,-4.1418,62.9158
4,13,"""But I try very hard not to go that way because it would be too ea...",-3.1752,23.9311,-3.8298,46.0539
5,1,"Daniel Radcliffe as Harry Potter in ""Harry Potter and the Order of...",-3.2588,26.0171,-3.0884,21.941
6,2,"""I don't plan to be one of those people who, as soon as they turn ...",-3.2597,26.0429,-3.4685,32.0879
7,11,"Despite his growing fame and riches, the actor says he is keeping ...",-3.2964,27.015,-3.1094,22.4069
8,17,"The Londoner has filmed a TV movie called ""My Boy Jack,"" about aut...",-3.3624,28.8586,-3.8838,48.6093
9,20,"Meanwhile, he is braced for even closer media scrutiny now that he...",-3.5253,33.9653,-3.7427,42.2138


In [14]:
def evaluate_citations(results, embedding_model_name='all-MiniLM-L6-v2'):
    """
    Evaluate citation quality using multiple metrics (ROUGE, embeddings, TF-IDF).
    
    Args:
        results: List of dictionaries with citation results
        embedding_model_name: Name of the sentence-transformers model to use
    
    Returns:
        Enhanced results with additional evaluation metrics
    """
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Initialize sentence transformer for embeddings
    embedding_model = SentenceTransformer(embedding_model_name, cache_folder='.cache/')
    
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Process all highlights and citations to prepare TF-IDF matrix
    all_texts = []
    for result in results:
        all_texts.append(result['highlight'])
        all_texts.append(result['ba_citation'])
        all_texts.append(result['fo_citation'])
    
    # Fit the TF-IDF vectorizer
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)
    
    enhanced_results = []
    
    for i, result in enumerate(results):
        # Get text indices for the current result
        highlight_idx = i * 3
        ba_citation_idx = i * 3 + 1
        fo_citation_idx = i * 3 + 2
        
        # Get text for the current result
        highlight = result['highlight']
        ba_citation = result['ba_citation']
        fo_citation = result['fo_citation']
        
        # Calculate embeddings
        highlight_emb = embedding_model.encode([highlight])[0]
        ba_citation_emb = embedding_model.encode([ba_citation])[0]
        fo_citation_emb = embedding_model.encode([fo_citation])[0]
        
        # Calculate embedding similarity (cosine)
        ba_emb_similarity = cosine_similarity(
            highlight_emb.reshape(1, -1), 
            ba_citation_emb.reshape(1, -1)
        )[0][0]
        
        fo_emb_similarity = cosine_similarity(
            highlight_emb.reshape(1, -1), 
            fo_citation_emb.reshape(1, -1)
        )[0][0]
        
        # Calculate TF-IDF similarity
        ba_tfidf_similarity = cosine_similarity(
            tfidf_matrix[highlight_idx], 
            tfidf_matrix[ba_citation_idx]
        )[0][0]
        
        fo_tfidf_similarity = cosine_similarity(
            tfidf_matrix[highlight_idx], 
            tfidf_matrix[fo_citation_idx]
        )[0][0]
        
        # Calculate ROUGE scores
        ba_rouge = scorer.score(highlight, ba_citation)
        fo_rouge = scorer.score(highlight, fo_citation)
        
        # Create enhanced result with all metrics
        enhanced_result = result.copy()
        
        # Add backward model metrics
        enhanced_result.update({
            'ba_emb_similarity': ba_emb_similarity,
            'ba_tfidf_similarity': ba_tfidf_similarity,
            'ba_rouge1_precision': ba_rouge['rouge1'].precision,
            'ba_rouge1_recall': ba_rouge['rouge1'].recall,
            'ba_rouge1_fmeasure': ba_rouge['rouge1'].fmeasure,
            'ba_rouge2_fmeasure': ba_rouge['rouge2'].fmeasure,
            'ba_rougeL_fmeasure': ba_rouge['rougeL'].fmeasure,
        })
        
        # Add forward model metrics
        enhanced_result.update({
            'fo_emb_similarity': fo_emb_similarity,
            'fo_tfidf_similarity': fo_tfidf_similarity,
            'fo_rouge1_precision': fo_rouge['rouge1'].precision,
            'fo_rouge1_recall': fo_rouge['rouge1'].recall,
            'fo_rouge1_fmeasure': fo_rouge['rouge1'].fmeasure,
            'fo_rouge2_fmeasure': fo_rouge['rouge2'].fmeasure,
            'fo_rougeL_fmeasure': fo_rouge['rougeL'].fmeasure,
        })
        
        enhanced_results.append(enhanced_result)
    
    return enhanced_results

In [15]:
def display_evaluation_results(results, metrics_to_show=None):
    """
    Display evaluation results in a DataFrame.
    
    Args:
        results: List of dictionaries with enhanced evaluation metrics
        metrics_to_show: List of metric columns to display (if None, shows a default set)
    
    Returns:
        DataFrame with evaluation metrics
    """
    if metrics_to_show is None:
        # Default metrics to show
        metrics_to_show = [
            'highlight', 'ba_citation', 'fo_citation',
            'ba_perplexity', 'fo_perplexity',
            'ba_emb_similarity', 'fo_emb_similarity',
            'ba_rougeL_fmeasure', 'fo_rougeL_fmeasure'
        ]
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Select columns to display
    display_df = df[metrics_to_show]
    
    return display_df

# Example usage:
enhanced_results = evaluate_citations(results)
display_df = display_evaluation_results(enhanced_results)
display_df

Unnamed: 0,highlight,ba_citation,fo_citation,ba_perplexity,fo_perplexity,ba_emb_similarity,fo_emb_similarity,ba_rougeL_fmeasure,fo_rougeL_fmeasure
0,Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 1...,"All rights reserved.This material may not be published, broadcast,...","All rights reserved.This material may not be published, broadcast,...",2.9377,6.1904,-0.0652,-0.0652,0.0,0.0
1,"Mentally ill inmates in Miami are housed on the ""forgotten floor""\...","""I am the son of the president.","MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial...",13.7398,12.0827,0.3323,0.5824,0.25,0.1791
2,"NEW: ""I thought I was going to die,"" driver says .","""I knew the deck was going down, there was no question about it, a...","""I knew the deck was going down, there was no question about it, a...",10.781,16.3238,0.5736,0.5736,0.4242,0.4242
3,"Five small polyps found during procedure; ""none worrisome,"" spokes...","A colonoscopy is the most sensitive test for colon cancer, rectal ...","The procedure was supervised by Dr. Richard Tubb, Bush's physician...",11.7206,21.329,0.342,0.3074,0.0541,0.05
4,"NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's c...","Vick, 27, is scheduled to appear Monday in court, where he is expe...","The charge is punishable by up to five years in prison, a $250,000...",8.5987,17.8526,0.4278,0.0371,0.0645,0.0476
5,"Parents beam with pride, can't stop from smiling from outpouring o...","""We just want to thank everyone who has come forward,"" he said.",His father said he was on the roof of his house when CNN called hi...,11.9435,21.4236,0.2002,0.2225,0.0,0.2051
6,"Aid workers: Violence, increased cost of living drive women to pro...","""At first I rejected it, but then I realized I have to do it.""","She adds, ""There is a huge population of women who were the victim...",19.1248,25.0807,0.015,0.4129,0.08,0.1538
7,Tomas Medina Caracas was a fugitive from a U.S. drug trafficking i...,"BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from ...","BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from ...",10.4151,12.3074,0.6413,0.6413,0.383,0.383
8,"President Bush says Tony Snow ""will battle cancer and win"" Job of...","WASHINGTON (CNN) -- White House press secretary Tony Snow, who is ...","WASHINGTON (CNN) -- White House press secretary Tony Snow, who is ...",5.9692,7.9006,0.7224,0.7224,0.2609,0.2609
9,Empty anti-tank weapon turns up in front of New Jersey home .,The launcher has been turned over to U.S. Army officials at the 75...,Army officials said they could not determine if the launcher had b...,12.4744,24.24,0.3719,0.2356,0.1463,0.0


In [16]:
# After you have your enhanced_results
df = pd.DataFrame(enhanced_results)

# Create two separate dataframes - one for each model type
ba_metrics = {col.replace('ba_', ''): df[col].mean() for col in df.columns 
              if col.startswith('ba_') and col != 'ba_citation'}
              
fo_metrics = {col.replace('fo_', ''): df[col].mean() for col in df.columns 
              if col.startswith('fo_') and col != 'fo_citation'}

# Combine into single comparison dataframe
comparison = pd.DataFrame({
    'Backward Model': ba_metrics,
    'Forward Model': fo_metrics
})

# Display the properly formatted comparison
comparison.round(4)

Unnamed: 0,Backward Model,Forward Model
score,-2.2648,-2.6623
perplexity,10.976,15.9545
emb_similarity,0.3901,0.3705
tfidf_similarity,0.1951,0.1962
rouge1_precision,0.1788,0.1628
rouge1_recall,0.3255,0.3211
rouge1_fmeasure,0.2061,0.1998
rouge2_fmeasure,0.0968,0.0968
rougeL_fmeasure,0.1789,0.1728
