# Scoring and Citations Testbed
---

## Import Libraries

In [None]:
import sys 
import os 


def ensure_on_sys_path(path):
    """Append ``path`` to ``sys.path`` unless it is already listed.

    Re-running this cell would otherwise add the same entries again on
    every execution.
    """
    if path not in sys.path:
        sys.path.append(path)


# Parent of the current working directory; presumably the project root that
# holds the `src` package imported further down -- confirm.
ensure_on_sys_path(os.path.abspath(os.path.join(os.getcwd(), '..')))
# NOTE(review): absolute, user-specific checkout path. It only resolves on the
# original author's cluster account; consider making it configurable.
ensure_on_sys_path('/ocean/projects/cis250068p/jhwang4/idl-project')

In [2]:
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize

from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import re
from scipy.spatial.distance import cosine


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.model import load_fo_model, load_ba_model, DEVICE
from src.data import load_cnn_dataset
#from src.utils import *
from src.utils_batch_v2 import *
#from src.search import *
from src.search_batch_v2 import *

Using device: cuda


In [4]:
# Notebook-wide pandas display settings: show up to 200 characters per cell
# and render floats with four decimal places.
pd.options.display.max_colwidth = 200
pd.options.display.float_format = '{:.4f}'.format

In [5]:
# Fetch the NLTK "punkt_tab" resource (a no-op when it is already present).
# Presumably required by `sent_tokenize` imported above -- confirm.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /jet/home/jhwang4/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Display the device string exported by src.model (sanity check).
DEVICE

'cuda'

## Quick Test

In [7]:
# Load the models
# `fo` / `ba` presumably stand for the forward and backward language models
# (the ba model is scored with backward=True in the commented-out quick test
# below) -- confirm.
fo_model, fo_tokenizer = load_fo_model()
ba_model, ba_tokenizer = load_ba_model()
# Move both models onto the configured device.
fo_model = fo_model.to(DEVICE)
ba_model = ba_model.to(DEVICE)
# Cast to half precision; the return value is discarded, so this relies on
# .half() converting the module in place (true for torch.nn.Module -- assumes
# the loaders return nn.Module instances).
fo_model.half()
ba_model.half()
# With more than one visible GPU, print the GPU count and wrap both models in
# DataParallel.
if t.cuda.device_count() > 1:
    print(t.cuda.device_count())
    fo_model = t.nn.DataParallel(fo_model)
    ba_model = t.nn.DataParallel(ba_model)

# Compile last, so the (possibly DataParallel-wrapped) module is what gets
# compiled. NOTE(review): confirm torch.compile on a DataParallel wrapper is
# supported/beneficial in the installed torch version.
fo_model = t.compile(fo_model)
ba_model = t.compile(ba_model)


# Example Text
# article = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
# summary = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
# adverse_summary = "Daniel Craig is recasted as James Bond again"

4


In [None]:
# debug = True

# # In normal, query is sentence/article, and answer is summary/highlight (S->A direction)
# base = calculate_score(summary, article, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)

# # In Fo, query is summary/highlight, and answer is sentence/article (A->S direction)
# fo = calculate_score(summary, article, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)

# # In Ba, query is summary/highlight, and answer is sentence/article (A->S direction)
# ba = calculate_score(summary, article, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)


# adv_base = calculate_score(adverse_summary, article, fo_model, fo_tokenizer, backward=False, query_direction="normal", debug=debug)

# adv_fo = calculate_score(adverse_summary, article, fo_model, fo_tokenizer, backward=False, query_direction="reverse", debug=debug)

# adv_ba = calculate_score(adverse_summary, article, ba_model, ba_tokenizer, backward=True, query_direction="reverse", debug=debug)


# scores_data = {
#     'Model Type': ['Base', 'Forward', 'Backward', 'Adv Base', 'Adv Forward', 'Adv Backward'],
    
#     'Query Direction': ['S->A', 'A->S', 'A->S', 'S->A', 'A->S', 'A->S'],
    
#     'Sequence Log Prob': [
#         base['sequence_log_prob'],
#         fo['sequence_log_prob'],
#         ba['sequence_log_prob'],
        
#         adv_base['sequence_log_prob'],
#         adv_fo['sequence_log_prob'],
#         adv_ba['sequence_log_prob'],
#     ],
#     'Normalized Log Prob': [
#         base['normalized_log_prob'],
#         fo['normalized_log_prob'],
#         ba['normalized_log_prob'],
        
#         adv_base['normalized_log_prob'],
#         adv_fo['normalized_log_prob'],
#         adv_ba['normalized_log_prob'],
#     ],
#     'Perplexity': [
#         base['perplexity'],
#         fo['perplexity'],
#         ba['perplexity'],
        
#         adv_base['perplexity'],
#         adv_fo['perplexity'],
#         adv_ba['perplexity'],
#     ]
# }

In [None]:
# result_df = pd.DataFrame(scores_data)
# result_df

## Attribution Search (Linear, Binary, Exclusion)

In [8]:
# Load a 500-sample slice of the CNN dataset and wrap it in a DataFrame.
# (Call load_cnn_dataset() without arguments instead of passing num_samples
# to use the loader's default.)
dataset = pd.DataFrame(load_cnn_dataset(num_samples=500))

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

In [9]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell...",Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have be...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,"Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a ja...","Mentally ill inmates in Miami are housed on the ""forgotten floor""\nJudge Steven Leifman says most are there as a result of ""avoidable felonies""\nWhile CNN tours facility, patient shouts: ""I am the...",ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. ""The whole bridge from one side of the Mississippi to the other just ...","NEW: ""I thought I was going to die,"" driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: ""I probably had a 30-, 35-foot free fall""\nMinnesota bridge collaps...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,"WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and ""none appeared worrisome,"" a White House spokesman said. The polyps were removed and sent to the ...","Five small polyps found during procedure; ""none worrisome,"" spokesman says .\nPresident reclaims powers transferred to vice president .\nBush undergoes routine colonoscopy at Camp David .",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,"(CNN) -- The National Football League has indefinitely suspended Atlanta Falcons quarterback Michael Vick without pay, officials with the league said Friday. NFL star Michael Vick is set to appea...","NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .\nNFL suspends Falcons quarterback indefinitely without pay .\nVick admits funding dogfighting operation but says he did n...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


In [10]:
# Number of rows in the loaded dataset.
print(len(dataset))

500


In [None]:
# Run the linear attribution search over the whole dataset with both models.
# The trailing figures in the comments are presumably wall-clock timings from
# earlier runs at different dataset sizes -- confirm.
#linear_results = linear_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer) # 50 - 59.4s
linear_results = linear_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer, sentence_batch_size=500) # 52.4s, 2m30.9s, 6h18m49s

In [None]:
# Run the binary-search attribution search over the whole dataset with both
# models, capped at 30 iterations. The trailing figures in the comments are
# presumably wall-clock timings from earlier runs -- confirm.
#binary_results = binary_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer) # 50 - 25.7s
binary_results = binary_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer, max_iterations=30, sentence_batch_size=1000) # 35.7s, 7m 47.6s, 5m 15.6s, 5m0.8s

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before starting the next search.
t.cuda.empty_cache()

In [11]:
# Run the exclusion-search attribution search over the whole dataset with both
# models. The trailing figures in the comments are presumably wall-clock
# timings from earlier runs -- confirm.
#exclusion_results = exclusion_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer) # 50 - 4m54.5s
exclusion_results = exclusion_search_attribution_search(dataset, fo_model, fo_tokenizer, ba_model, ba_tokenizer, sentence_batch_size=256) # 3m30.4s, 21m

  with t.no_grad(), autocast():
W0424 07:56:07.916000 93137 torch/_logging/_internal.py:1081] [0/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
  with t.no_grad(), autocast():
  with t.no_grad(), autocast():
  with t.no_grad(), autocast():
  with t.no_grad(), autocast():
  with t.no_grad(), autocast():
  with t.no_grad(), autocast():
  6%|█████▎                                                                                   | 30/500 [02:04<32:25,  4.14s/it]


KeyboardInterrupt: 

In [None]:
# def calculate_embedding_similarity(highlight, citation):
#     """Calculate cosine similarity between sentence embeddings."""
#     model = SentenceTransformer('all-MiniLM-L6-v2')
    
#     # Generate embeddings
#     highlight_embedding = model.encode([highlight])[0]
#     citation_embedding = model.encode([citation])[0]
    
#     # Calculate cosine similarity (1 - cosine distance)
#     similarity = 1 - cosine(highlight_embedding, citation_embedding)
#     return similarity

# def calculate_rouge_score(highlight, citation):
#     """Calculate ROUGE-L F-measure score."""
#     scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
#     scores = scorer.score(highlight, citation)
#     return scores['rougeL'].fmeasure

# def calculate_tfidf_score(highlight, citation):
#     """Calculate TF-IDF similarity score."""
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform([highlight, citation])
    
#     # Convert sparse matrix to dense array for cosine similarity calculation
#     dense_matrix = tfidf_matrix.toarray()
    
#     # Calculate cosine similarity
#     similarity = 1 - cosine(dense_matrix[0], dense_matrix[1])
#     return similarity

# def process_data(data):
#     """Process the data and reformat to show scores by citation type in columns."""
#     results = []
    
#     for item in data:
#         highlight = item['highlight']
#         result_entry = {'id': item['id'], 'highlight': highlight}
        
#         # Process each citation type
#         for citation_type in ['base_citation', 'ba_citation', 'fo_citation']:
#             prefix = citation_type.split('_')[0]  # Extract 'base', 'ba', or 'fo'
            
#             if citation_type in item and item[citation_type]:
#                 citation = item[citation_type]
                
#                 # Calculate scores
#                 emb_similarity = calculate_embedding_similarity(highlight, citation)
#                 rouge_score = calculate_rouge_score(highlight, citation)
#                 tfidf_score = calculate_tfidf_score(highlight, citation)
                
#                 # Add scores as columns with prefix
#                 result_entry[f'{prefix}_emb_similarity'] = emb_similarity
#                 result_entry[f'{prefix}_rouge_score'] = rouge_score
#                 result_entry[f'{prefix}_tfidf_score'] = tfidf_score
                
#             else:
#                 # Set default values if citation doesn't exist
#                 result_entry[f'{prefix}_emb_similarity'] = None
#                 result_entry[f'{prefix}_rouge_score'] = None
#                 result_entry[f'{prefix}_tfidf_score'] = None
                
#         results.append(result_entry)
    
#     return results

# # Process the data
# linear_final_results = process_data(linear_results)
# binary_final_results = process_data(binary_results)
# #exclusion_final_results = process_data(exclusion_results)


In [None]:
# Shared scorer objects, created once here and reused by the calculate_*
# helpers defined below instead of being rebuilt on every call.

# Sentence-embedding model loaded from the 'all-MiniLM-L6-v2' checkpoint.
EMB_MODEL    = SentenceTransformer('all-MiniLM-L6-v2')
# ROUGE-L scorer with stemming enabled.
ROUGE_SCORER = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
# TF-IDF vectorizer. The token pattern accepts single-character word tokens
# (scikit-learn's default pattern requires at least two characters), and no
# stop-word list is applied.
TFIDF_VEC    = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",  
    stop_words=None               
)

def calculate_embedding_similarity(highlight, citation):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level ``EMB_MODEL`` in a single
    batched ``encode`` call, so each pair costs one model invocation rather
    than two.

    Args:
        highlight: Reference text.
        citation: Candidate citation text to compare against ``highlight``.

    Returns:
        ``1 - cosine_distance`` between the two embedding vectors.
    """
    # encode() returns one row per input text, in input order.
    highlight_vec, citation_vec = EMB_MODEL.encode([highlight, citation])
    return 1 - cosine(highlight_vec, citation_vec)

def calculate_rouge_score(highlight, citation):
    """Return the ROUGE-L F-measure of ``citation`` scored against ``highlight``.

    ``highlight`` is passed as the first argument and ``citation`` as the
    second argument of the shared ``ROUGE_SCORER.score`` call.
    """
    all_scores = ROUGE_SCORER.score(highlight, citation)
    rouge_l = all_scores['rougeL']
    return rouge_l.fmeasure

def calculate_tfidf_score(highlight, citation):
    """Return the TF-IDF cosine similarity between two texts.

    The shared ``TFIDF_VEC`` is re-fitted on just these two texts on every
    call, so the vocabulary and IDF weights are specific to the pair.

    Args:
        highlight: Reference text.
        citation: Candidate citation text to compare against ``highlight``.

    Returns:
        ``1 - cosine_distance`` between the two TF-IDF rows, or ``0.0`` when
        the similarity is undefined: either the vectorizer raises
        ``ValueError`` (e.g. neither text yields a token) or one of the texts
        produces an all-zero row.
    """
    # Only the vectorizer call is guarded, so unrelated errors are not masked.
    try:
        mat = TFIDF_VEC.fit_transform([highlight, citation]).toarray()
    except ValueError:
        return 0.0
    # A text with no tokens gives an all-zero row; cosine() would then divide
    # by zero and return NaN, which silently drops out of later .mean() calls.
    if not mat[0].any() or not mat[1].any():
        return 0.0
    return 1 - cosine(mat[0], mat[1])

def process_data(data):
    """Score every citation of every record against its highlight.

    Args:
        data: Iterable of dict-like records with ``'id'`` and ``'highlight'``
            keys and optional ``'base_citation'``, ``'fo_citation'`` and
            ``'ba_citation'`` keys.

    Returns:
        A list with one dict per record holding ``id``, ``highlight`` and,
        for each prefix in ``base``/``fo``/``ba``, the keys
        ``<prefix>_emb_similarity``, ``<prefix>_rouge_score`` and
        ``<prefix>_tfidf_score``. The three scores are ``None`` when the
        citation is missing or falsy (e.g. an empty string).
    """
    score_names = ('emb_similarity', 'rouge_score', 'tfidf_score')
    rows = []
    for entry in data:
        highlight = entry['highlight']
        row = {'id': entry['id'], 'highlight': highlight}
        for citation_key in ('base_citation', 'fo_citation', 'ba_citation'):
            tag = citation_key.split('_')[0]
            citation = entry.get(citation_key, None)
            if citation:
                scores = {
                    'emb_similarity': calculate_embedding_similarity(highlight, citation),
                    'rouge_score': calculate_rouge_score(highlight, citation),
                    'tfidf_score': calculate_tfidf_score(highlight, citation),
                }
            else:
                # No citation to score: keep the columns but leave them empty.
                scores = dict.fromkeys(score_names)
            for name in score_names:
                row[f'{tag}_{name}'] = scores[name]
        rows.append(row)
    return rows

# Score the citations found by each search strategy. All three result lists
# are needed: the summary cells that follow read linear_final_results and
# binary_final_results as well as exclusion_final_results, so leaving any of
# them commented out breaks a fresh Restart & Run All with a NameError.
linear_final_results    = process_data(linear_results)
binary_final_results    = process_data(binary_results)
exclusion_final_results = process_data(exclusion_results)

In [None]:
# Persist the per-highlight linear-search scores, then show the column means.
r = pd.DataFrame(linear_final_results)
# NOTE(review): absolute, user-specific output path.
r.to_csv("/ocean/projects/cis250068p/jhwang4/idl-project/notebooks/linear_result_500.csv", index=True)
# The mean must be the cell's last expression to be displayed; it used to sit
# before to_csv(), where its result was computed and silently discarded.
r.drop(['id', 'highlight'], axis=1).mean()

In [None]:
# Column means of the binary-search scores (the id/highlight columns are not
# scores, so they are dropped first).
r = pd.DataFrame(binary_final_results)
r.drop(columns=['id', 'highlight']).mean()
#r.to_csv("binary_500_results.csv", index=True)

In [None]:
# Column means of the exclusion-search scores (the id/highlight columns are
# not scores, so they are dropped first).
r = pd.DataFrame(exclusion_final_results)
r.drop(columns=['id', 'highlight']).mean()

In [None]:
# One score frame per search strategy, one row per (id, highlight) pair.
df_linear = pd.DataFrame(linear_final_results)
df_binary = pd.DataFrame(binary_final_results)
df_exclusion = pd.DataFrame(exclusion_final_results)

# Linear and binary frames share every score-column name, so the merge
# suffixes tell them apart.
df_merged = df_linear.merge(df_binary, on=['id', 'highlight'], suffixes=('_linear', '_binary'))

# After the first merge nothing would collide with the exclusion columns, so
# they are tagged explicitly; the join keys keep their plain names.
df_exclusion = df_exclusion.rename(
    columns=lambda col: col if col in ('id', 'highlight') else f'{col}_exclusion'
)

df_merged = df_merged.merge(df_exclusion, on=['id', 'highlight'])

mean_series = df_merged.drop(columns=['id', 'highlight']).mean()

# Columns are "<Model>_<method>" (methods outermost, models base/fo/ba within
# each); rows follow the metric order Embedding, Rouge, Tfidf.
data = {
    f'{model.capitalize()}_{method}': [
        mean_series[f'{model}_{metric}_{method}']
        for metric in ('emb_similarity', 'rouge_score', 'tfidf_score')
    ]
    for method in ('linear', 'binary', 'exclusion')
    for model in ('base', 'fo', 'ba')
}

table_df = pd.DataFrame(data, index=['Embedding', 'Rouge', 'Tfidf'])
table_df

In [None]:
# Linear-search-only summary: mean of each score column, reshaped into a
# metric-by-model table and written to CSV.
df_linear = pd.DataFrame(linear_final_results)
mean_series = df_linear.drop(columns=['id', 'highlight']).mean()

# Columns are "<Model>_linear" for base/fo/ba; rows follow the metric order
# Embedding, Rouge, Tfidf.
data = {
    f'{model.capitalize()}_linear': [
        mean_series[f'{model}_{metric}']
        for metric in ('emb_similarity', 'rouge_score', 'tfidf_score')
    ]
    for model in ('base', 'fo', 'ba')
}

table_df = pd.DataFrame(data, index=['Embedding', 'Rouge', 'Tfidf'])
table_df.to_csv("/ocean/projects/cis250068p/jhwang4/idl-project/notebooks/linear_result.csv", index=True)

In [None]:
# Display the most recently built summary table.
table_df

In [None]:
# NOTE(review): writes table_df to the same path as the to_csv() call at the
# end of the linear-table cell above; this looks redundant -- confirm whether
# both writes are needed.
table_df.to_csv("/ocean/projects/cis250068p/jhwang4/idl-project/notebooks/linear_result.csv", index=True)