# Scoring and Citations Testbed
---

## Import Libraries

In [1]:
import sys 
import os 
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from src.model import load_fo_model, load_ba_model
from src.data import load_cnn_dataset, prepare_cnn_dataset
from src.utils import *
from src.search import *

Using device: mps


In [4]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:.4f}'.format)

In [5]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /Users/ivw/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
DEVICE

'mps'

In [7]:
CACHE_DIR

'.cache/'

## Quick Test

In [8]:
# Load the models
fo_model, fo_tokenizer = load_fo_model()
ba_model, ba_tokenizer = load_ba_model()

In [9]:
summaries, articles, adverse_summaries = example_texts()

In [10]:
%%time
debug = False

batch = 16

samp_summaries=summaries[:batch]
samp_articles=articles[:batch]
samp_advsummaries=adverse_summaries[:batch]

# Evaluate with batch (normal direction) → S->A (only for Fo model)
base_scores = calculate_score_batch(samp_summaries, samp_articles, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)

# Evaluate with batch (reverse direction) → A->S
fo_scores = calculate_score_batch(samp_summaries, samp_articles, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)
ba_scores = calculate_score_batch(samp_summaries, samp_articles, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)

# Adverse summaries
adv_base_scores = calculate_score_batch(samp_advsummaries, samp_articles, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)
adv_fo_scores = calculate_score_batch(samp_advsummaries, samp_articles, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)
adv_ba_scores = calculate_score_batch(samp_advsummaries, samp_articles, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)


def extract_metric(scores, key):
    return [s[key] for s in scores]

df = pd.DataFrame({
    'Article': samp_articles,
    'Summary': samp_summaries,
    'Adverse Summary': samp_advsummaries,
    
    'Base_NLL': extract_metric(base_scores, 'normalized_log_prob'),
    'Fo_NLL': extract_metric(fo_scores, 'normalized_log_prob'),
    'Ba_NLL': extract_metric(ba_scores, 'normalized_log_prob'),
    
    'AdvBase_NLL': extract_metric(adv_base_scores, 'normalized_log_prob'),
    'AdvFo_NLL': extract_metric(adv_fo_scores, 'normalized_log_prob'),
    'AdvBa_NLL': extract_metric(adv_ba_scores, 'normalized_log_prob'),
})
df

CPU times: user 496 ms, sys: 266 ms, total: 762 ms
Wall time: 2.21 s


Unnamed: 0,Article,Summary,Adverse Summary,Base_NLL,Fo_NLL,Ba_NLL,AdvBase_NLL,AdvFo_NLL,AdvBa_NLL
0,Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday,"Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.",Daniel Craig is recasted as James Bond again,-3.0221,-2.7748,-2.877,-5.799,-7.1044,-6.339
1,Apple launches new iPhone with better camera and chip,"In a widely anticipated event on Tuesday, Apple unveiled its newest iPhone model. The device features a significantly improved camera system with low-light enhancements, a faster A17 chip promisin...",Samsung releases folding phone with improved battery,-4.9806,-3.7895,-3.5116,-5.2396,-5.0973,-4.6742
2,NASA rover lands on Mars for ancient life mission,NASA’s Perseverance rover successfully touched down on the surface of Mars after a seven-month journey through space. The rover is equipped with a suite of scientific instruments aimed at detectin...,NASA delays rover mission due to mechanical failure,-5.6071,-3.0059,-2.5611,-5.8888,-5.0428,-4.8878
3,Tech stocks boost market as inflation concerns drop,"The U.S. stock market closed higher on Wednesday, buoyed by a rally in technology stocks and signs that inflation may be cooling. The Federal Reserve hinted at a potential pause in interest rate h...",Market crashes amid tech layoffs and inflation fears,-7.3556,-2.6911,-2.5011,-7.6544,-6.4484,-5.484
4,Simone Biles wins all-around gold in comeback,"Simone Biles returned to competition after a two-year break and delivered a stunning performance at the national championships, winning gold in the all-around category. Her comeback is being haile...",Olympic committee bans Simone Biles from competition,-4.5687,-3.1953,-2.612,-4.9171,-6.8431,-5.1172
5,CDC recommends new COVID booster for seniors,"The CDC has recommended a new round of COVID-19 booster shots for people over 60 and those with compromised immune systems, citing the recent emergence of new subvariants. Pharmacies across the co...",FDA pulls COVID boosters citing lack of demand,-8.7931,-3.3092,-3.4108,-8.4099,-6.3571,-5.4663
6,EU passes bill to regulate AI transparency and risk,The European Parliament passed a preliminary AI regulation bill requiring transparency for generative models and stricter risk assessment for high-impact applications. The law aims to protect user...,AI startup fined for violating European data laws,-6.4797,-4.3156,-3.9798,-7.8661,-7.4039,-5.5238
7,UN warns global temps may exceed 1.5°C soon,A new report from the United Nations warns that global temperatures could surpass the 1.5°C threshold within the next decade if greenhouse gas emissions aren’t drastically reduced. The report urge...,New study denies impact of CO2 on global warming,-4.5019,-2.6341,-2.4452,-5.3818,-5.5674,-4.5546
8,'Oppenheimer' earns $80M on strong opening weekend,"Christopher Nolan's new historical thriller 'Oppenheimer' opened to strong box office numbers this weekend, bringing in over $80 million globally. Critics have praised the film's storytelling and ...",Nolan's 'Oppenheimer' flops on opening weekend,-3.9063,-3.4476,-2.9741,-4.0294,-3.8317,-3.6694
9,Ford to invest $2B to boost EV production,"Ford announced a $2 billion investment to expand electric vehicle production in the Midwest, including new battery facilities. The move is part of Ford’s effort to compete with Tesla and meet risi...",Ford cuts EV funding amid low consumer interest,-4.2487,-2.7613,-2.6306,-4.5132,-6.3631,-5.9599


In [11]:
%%time

debug = False

summary = summaries[0]
article = articles[0]
adverse_summary = adverse_summaries[0]

# In normal, query is sentence/article, and answer is summary/highlight (S->A direction)
base = calculate_score(summary, article, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)
# In Fo, query is summary/highlight , and answer is sentence/article(A->S direction)
fo = calculate_score(summary, article, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)
# In Ba, query is summary/highlight , and answer is sentence/article(A->S direction)
ba = calculate_score(summary, article, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)


adv_base = calculate_score(adverse_summary, article, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)
adv_fo = calculate_score(adverse_summary, article, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)
adv_ba = calculate_score(adverse_summary, article, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)


scores_data = {
    'Model Type': ['Base', 'Forward', 'Backward', 'Adv Base', 'Adv Forward', 'Adv Backward'],
    
    'Query Direction': ['S->A', 'A->S', 'A->S', 'S->A', 'A->S', 'A->S'],
    
    'Sequence Log Prob': [
        base['sequence_log_prob'],
        fo['sequence_log_prob'],
        ba['sequence_log_prob'],
        
        adv_base['sequence_log_prob'],
        adv_fo['sequence_log_prob'],
        adv_ba['sequence_log_prob'],
    ],
    'Normalized Log Prob': [
        base['normalized_log_prob'],
        fo['normalized_log_prob'],
        ba['normalized_log_prob'],
        
        adv_base['normalized_log_prob'],
        adv_fo['normalized_log_prob'],
        adv_ba['normalized_log_prob'],
    ],
    'Perplexity': [
        base['perplexity'],
        fo['perplexity'],
        ba['perplexity'],
        
        adv_base['perplexity'],
        adv_fo['perplexity'],
        adv_ba['perplexity'],
    ]
}

CPU times: user 286 ms, sys: 44.3 ms, total: 330 ms
Wall time: 437 ms


In [12]:
result_df = pd.DataFrame(scores_data)
result_df

Unnamed: 0,Model Type,Query Direction,Sequence Log Prob,Normalized Log Prob,Perplexity
0,Base,S->A,-48.3538,-3.0221,20.5346
1,Forward,A->S,-113.7698,-2.7749,16.0366
2,Backward,A->S,-117.9553,-2.877,17.7602
3,Adv Base,S->A,-92.7872,-5.7992,330.0361
4,Adv Forward,A->S,-63.9409,-7.1045,1217.4864
5,Adv Backward,A->S,-57.0505,-6.3389,566.2004


## Linear Search

In [17]:
dataset = load_cnn_dataset(num_samples=50)
dataset = pd.DataFrame(dataset)

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

In [18]:
dataset.head(5)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell...",Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have be...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,"Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a ja...","Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...",ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. ""The whole bridge from one side of the Mississippi to the other just ...","NEW: ""I thought I was going to die,"" driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: ""I probably had a 30-, 35-foot free fall""\nMinnesota bridge collaps...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,"WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and ""none appeared worrisome,"" a White House spokesman said. The polyps were removed and sent to the ...","Five small polyps found during procedure; ""none worrisome,"" spokesman says .\nPresident reclaims powers transferred to vice president .\nBush undergoes routine colonoscopy at Camp David .",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,"(CNN) -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appea...","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .\nNFL suspends Falcons quarterback indefinitely without pay .\nVick admits funding dogfighting operation but says he did n...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


In [19]:
all_pairs, meta = prepare_cnn_dataset(dataset)

In [20]:
linear_results = linear_attribution_search_batched(all_pairs, meta, fo_model, fo_tokenizer, ba_model, ba_tokenizer)

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
linear_results = linear_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer)

  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
binary_results = binary_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer)

  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
exclusion_results = exclusion_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer)

  0%|          | 0/100 [00:00<?, ?it/s]

In [28]:
def calculate_embedding_similarity(highlight, citation):
    """Calculate cosine similarity between sentence embeddings."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Generate embeddings
    highlight_embedding = model.encode([highlight])[0]
    citation_embedding = model.encode([citation])[0]
    
    # Calculate cosine similarity (1 - cosine distance)
    similarity = 1 - cosine(highlight_embedding, citation_embedding)
    return similarity

def calculate_rouge_score(highlight, citation):
    """Calculate ROUGE-L F-measure score."""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(highlight, citation)
    return scores['rougeL'].fmeasure

def calculate_tfidf_score(highlight, citation):
    """Calculate TF-IDF similarity score."""
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([highlight, citation])
    
    # Convert sparse matrix to dense array for cosine similarity calculation
    dense_matrix = tfidf_matrix.toarray()
    
    # Calculate cosine similarity
    similarity = 1 - cosine(dense_matrix[0], dense_matrix[1])
    return similarity

def process_data(data):
    """Process the data and reformat to show scores by citation type in columns."""
    results = []
    
    for item in data:
        highlight = item['highlight']
        result_entry = {'id': item['id'], 'highlight': highlight}
        
        # Process each citation type
        for citation_type in ['base_citation', 'ba_citation', 'fo_citation']:
            prefix = citation_type.split('_')[0]  # Extract 'base', 'ba', or 'fo'
            
            if citation_type in item and item[citation_type]:
                citation = item[citation_type]
                
                # Calculate scores
                emb_similarity = calculate_embedding_similarity(highlight, citation)
                rouge_score = calculate_rouge_score(highlight, citation)
                tfidf_score = calculate_tfidf_score(highlight, citation)
                
                # Add scores as columns with prefix
                result_entry[f'{prefix}_emb_similarity'] = emb_similarity
                result_entry[f'{prefix}_rouge_score'] = rouge_score
                result_entry[f'{prefix}_tfidf_score'] = tfidf_score
                
            else:
                # Set default values if citation doesn't exist
                result_entry[f'{prefix}_emb_similarity'] = None
                result_entry[f'{prefix}_rouge_score'] = None
                result_entry[f'{prefix}_tfidf_score'] = None
                
        results.append(result_entry)
    
    return results

# Process the data
linear_final_results = process_data(linear_results)
binary_final_results = process_data(binary_results)
exclusion_final_results = process_data(exclusion_results)


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [29]:
r = pd.DataFrame(linear_final_results) 
r.drop(['id', 'highlight'], axis=1).mean()

base_emb_similarity   0.4425
base_rouge_score      0.1945
base_tfidf_score      0.1664
ba_emb_similarity     0.6453
ba_rouge_score        0.3099
ba_tfidf_score        0.2708
fo_emb_similarity     0.6380
fo_rouge_score        0.3206
fo_tfidf_score        0.2855
dtype: float64

In [30]:
r = pd.DataFrame(binary_final_results) 
r.drop(['id', 'highlight'], axis=1).mean()

base_emb_similarity   0.4260
base_rouge_score      0.1673
base_tfidf_score      0.1448
ba_emb_similarity     0.6403
ba_rouge_score        0.3137
ba_tfidf_score        0.2745
fo_emb_similarity     0.6378
fo_rouge_score        0.3302
fo_tfidf_score        0.2923
dtype: float64

In [31]:
r = pd.DataFrame(exclusion_final_results) 
r.drop(['id', 'highlight'], axis=1).mean()

base_emb_similarity   0.4007
base_rouge_score      0.1688
base_tfidf_score      0.1433
ba_emb_similarity     0.6219
ba_rouge_score        0.3161
ba_tfidf_score        0.2895
fo_emb_similarity     0.6041
fo_rouge_score        0.3204
fo_tfidf_score        0.2921
dtype: float64

In [32]:
df_linear = pd.DataFrame(linear_final_results)
df_binary = pd.DataFrame(binary_final_results)
df_exclusion = pd.DataFrame(exclusion_final_results)

df_merged = pd.merge(df_linear, df_binary, on=['id', 'highlight'],suffixes=('_linear', '_binary'))

df_exclusion = df_exclusion.add_suffix('_exclusion')
df_exclusion = df_exclusion.rename(columns={'id_exclusion': 'id', 'highlight_exclusion': 'highlight'})

df_merged = pd.merge(df_merged, df_exclusion, on=['id', 'highlight'])

mean_series = df_merged.drop(['id', 'highlight'], axis=1).mean()

data = {
    'Base_linear': [
        mean_series['base_emb_similarity_linear'],
        mean_series['base_rouge_score_linear'],
        mean_series['base_tfidf_score_linear']
    ],
    'Fo_linear': [
        mean_series['fo_emb_similarity_linear'],
        mean_series['fo_rouge_score_linear'],
        mean_series['fo_tfidf_score_linear']
    ],
    'Ba_linear': [
        mean_series['ba_emb_similarity_linear'],
        mean_series['ba_rouge_score_linear'],
        mean_series['ba_tfidf_score_linear']
    ],
    'Base_binary': [
        mean_series['base_emb_similarity_binary'],
        mean_series['base_rouge_score_binary'],
        mean_series['base_tfidf_score_binary']
    ],
    'Fo_binary': [
        mean_series['fo_emb_similarity_binary'],
        mean_series['fo_rouge_score_binary'],
        mean_series['fo_tfidf_score_binary']
    ],
    'Ba_binary': [
        mean_series['ba_emb_similarity_binary'],
        mean_series['ba_rouge_score_binary'],
        mean_series['ba_tfidf_score_binary']
    ]
}

table_df = pd.DataFrame(data, index=['Embedding', 'Rouge', 'Tfidf'])
table_df

Unnamed: 0,Base_linear,Fo_linear,Ba_linear,Base_binary,Fo_binary,Ba_binary
Embedding,0.4425,0.638,0.6453,0.426,0.6378,0.6403
Rouge,0.1945,0.3206,0.3099,0.1673,0.3302,0.3137
Tfidf,0.1664,0.2855,0.2708,0.1448,0.2923,0.2745
