In [2]:
import torch as t
import numpy as np
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm

from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [3]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/ivw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/ivw/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "afterless/reverse-pythia-160m"
)
model = GPTNeoXForCausalLM.from_pretrained(
    "afterless/reverse-pythia-160m",
    cache_dir="./.cache/reverse-pythia-160m",
)

inputs = tokenizer(
    "What time is it?",
    return_token_type_ids=False,
    return_tensors="pt"
)

inputs['input_ids'] = t.flip(inputs.input_ids, (1,))
tokens = t.flip(model.generate(**inputs), (1,))
tokenizer.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


' it?" I asked.\n\n"I think about it all the time," he said. "What time is it?'

In [5]:
# Load models
def load_models(device='cpu'):
    # Forward model
    fo_model = GPTNeoXForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    ).to(device)
    
    fo_tokenizer = AutoTokenizer.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    )
    
    # Backward model
    ba_model = GPTNeoXForCausalLM.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    ).to(device)
    
    ba_tokenizer = AutoTokenizer.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    )
    
    return fo_model, fo_tokenizer, ba_model, ba_tokenizer

In [None]:
import torch
import numpy as np
from torch.nn import functional as F

def calculate_llm_score(prompt, response, model, tokenizer, direction="forward"):
    """
    Calculate log probability of response given prompt or vice versa.
    
    Args:
        prompt (str): The prompt text
        response (str): The response text
        model: The language model
        tokenizer: The corresponding tokenizer
        direction (str): "forward" for P(response|prompt) or "backward" for P(prompt|response)
    
    Returns:
        dict: Contains token-wise and sequence log probabilities
    """
    # Prepare text based on direction
    if direction == "forward":
        # Forward: calculate P(Response|Prompt)
        full_text = prompt + response
        input_ids = tokenizer.encode(full_text, return_tensors="pt").to(model.device)
        prompt_len = len(tokenizer.encode(prompt)) - 1  # -1 because we don't count the first token in scoring
        target_len = len(tokenizer.encode(response))
    else:
        # Backward: calculate P(Prompt|Response)
        # For backward model, we reverse the text and calculate from the end
        full_text = response + prompt
        input_ids = tokenizer.encode(full_text, return_tensors="pt").to(model.device)
        prompt_len = len(tokenizer.encode(response)) - 1
        target_len = len(tokenizer.encode(prompt))
    
    # Get model output
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits
    
    # Extract token probabilities for the target sequence
    token_probs = []
    for i in range(prompt_len, prompt_len + target_len - 1):
        next_token_logits = logits[0, i, :]
        next_token_id = input_ids[0, i+1].item()
        
        # Convert logits to probabilities
        next_token_probs = F.softmax(next_token_logits, dim=0)
        prob = next_token_probs[next_token_id].item()
        log_prob = np.log(prob)
        
        token_text = tokenizer.decode([next_token_id])
        token_probs.append({
            'token': token_text,
            'token_id': next_token_id,
            'log_prob': log_prob
        })
    
    # Calculate sequence probability
    sequence_log_prob = sum(tp['log_prob'] for tp in token_probs)
    # Normalize by length to get per-token average
    normalized_log_prob = sequence_log_prob / len(token_probs)
    # Convert to perplexity if needed
    perplexity = np.exp(-sequence_log_prob / len(token_probs))
    
    return {
        'token_log_probs': token_probs,
        'sequence_log_prob': sequence_log_prob,
        'normalized_log_prob': normalized_log_prob,
        'perplexity': perplexity
    }

# Example usage
def score_text_pair(prompt, response, fo_model, fo_tokenizer, ba_model, ba_tokenizer):
    # Forward score: P(Response|Prompt)
    forward_score = calculate_llm_score(
        prompt, response, 
        fo_model, fo_tokenizer, 
        direction="forward"
    )
    
    # Backward score: P(Prompt|Response)
    backward_score = calculate_llm_score(
        response, prompt,
        ba_model, ba_tokenizer,
        direction="backward"
    )
    
    return {
        'forward': forward_score,
        'backward': backward_score
    }

In [8]:
# Load the models
fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models(device="cuda" if torch.cuda.is_available() else "cpu")

# Example text
prompt = "ABCDE in reverse"
response = " is simply EDCBA"

# Get scores
scores = score_text_pair(prompt, response, fo_model, fo_tokenizer, ba_model, ba_tokenizer)

# Print results
print(f"Forward (P(Response|Prompt)) - Log probability: {scores['forward']['sequence_log_prob']:.4f}, Perplexity: {scores['forward']['perplexity']:.4f}")
print(f"Backward (P(Prompt|Response)) - Log probability: {scores['backward']['sequence_log_prob']:.4f}, Perplexity: {scores['backward']['perplexity']:.4f}")

Forward (P(Response|Prompt)) - Log probability: -27.7338, Perplexity: 1026.0319
Backward (P(Prompt|Response)) - Log probability: -36.1751, Perplexity: 8465.7544


In [9]:
# Your example texts
sentence = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him."
highlight = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_highlight = "Daniel Craig is recasted as James Bond again"

# Define prompts
SP = None  # Scoring Prompt
CP = "is summarized by"  # Conditioning Prompt
Q = sentence  # Query
A = highlight  # Response

def score_summary(original_text, summary_text, condition_prompt, score_prompt=None):
    """
    Score how well a summary represents the original text
    """
    # Create the inputs as specified
    input_text = f"{score_prompt or ''}{original_text}"
    output_text = f"{condition_prompt}{summary_text}"
    
    # Score using forward model (P(summary|original))
    forward_score = calculate_llm_score(
        input_text, output_text,
        fo_model, fo_tokenizer,
        direction="forward"
    )
    
    # Score using backward model (P(original|summary))
    backward_score = calculate_llm_score(
        output_text, input_text,
        ba_model, ba_tokenizer,
        direction="backward"
    )
    
    return {
        'forward_score': forward_score['normalized_log_prob'],
        'backward_score': backward_score['normalized_log_prob'],
        'combined_score': forward_score['normalized_log_prob'] + backward_score['normalized_log_prob']
    }

# Score the correct highlight
correct_scores = score_summary(Q, A, CP, SP)
print("Correct highlight scores:")
print(f"Forward score: {correct_scores['forward_score']:.4f}")
print(f"Backward score: {correct_scores['backward_score']:.4f}")
print(f"Combined score: {correct_scores['combined_score']:.4f}")

# Score the adverse highlight for comparison
adverse_scores = score_summary(Q, adverse_highlight, CP, SP)
print("\nAdverse highlight scores:")
print(f"Forward score: {adverse_scores['forward_score']:.4f}")
print(f"Backward score: {adverse_scores['backward_score']:.4f}")
print(f"Combined score: {adverse_scores['combined_score']:.4f}")

Correct highlight scores:
Forward score: -3.4735
Backward score: -4.8827
Combined score: -8.3562

Adverse highlight scores:
Forward score: -7.0841
Backward score: -10.2840
Combined score: -17.3682


In [None]:
fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()

trlm_ba_scores = []

sentence = "Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him." 
highlight = "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday"
adverse_highlight = "Daniel Craig is recasted as James Bond again"

# Define prompts
SP = None  # Scoring Prompt
CP = "is summarized by"   # Conditioning Prompt
Q = sentence       # Query
A = highlight      # Response

# Create the components according to the algorithm
SP_plus_Q = f"{SP}{Q}"
CP_plus_A = f"{CP}{A}"

# Tokenize each part
tokens_SP_plus_Q = ba_tokenizer(SP_plus_Q, return_tensors="pt").input_ids
tokens_CP_plus_A = ba_tokenizer(CP_plus_A, return_tensors="pt").input_ids

# Reverse the tokens
reversed_SP_plus_Q = t.flip(tokens_SP_plus_Q, dims=[1])
reversed_CP_plus_A = t.flip(tokens_CP_plus_A, dims=[1])

# Calculate conditional probability
# We use reversed_CP_plus_A as input and reversed_SP_plus_Q as target
with t.no_grad():
    outputs = ba_model(input_ids=reversed_CP_plus_A, labels=reversed_SP_plus_Q)
    print(outputs)
    trlm_ba_score = outputs.loss.item()

print(f"TRLM-Ba Score: {trlm_ba_score}")









# prompt = f"{sentence} is summarized by: {highlight}"
# tokens = ba_tokenizer(prompt, return_tensors="pt").input_ids
# print(tokens)
# reversed_tokens = t.flip(tokens, dims=[1])
# print(reversed_tokens)

# with t.no_grad():
#     outputs = ba_model(reversed_tokens, labels=reversed_tokens)
#     trlm_ba_scores.append(outputs.loss.item())
    
    

# # Reverse strings
# reverse_sp_q = scoring_prompt + query
# reverse_cp_a = conditioning_prompt + response

# # Tokenize inputs
# inputs = ba_tokenizer(reverse_cp_a, return_tensors="pt")
# with ba_tokenizer.as_target_tokenizer():
#     labels = ba_tokenizer(reverse_sp_q, return_tensors="pt").input_ids

# # Calculate log probability
# with torch.no_grad():
#     outputs = model(**inputs, labels=labels)
#     print(-outputs.loss.item())  # Negative loss approximates log probability

# trlm_ba_scores

CausalLMOutputWithPast(loss=None, logits=tensor([[[-2.5526, -2.8882,  7.4647,  ..., -2.8904, -2.8837, -2.8828],
         [-1.0366, -1.3005, 10.2495,  ..., -1.2970, -1.2920, -1.2922],
         [-2.0659, -1.9825,  6.2695,  ..., -1.9774, -1.9772, -1.9792],
         ...,
         [-1.3120, -1.6416,  9.7668,  ..., -1.6346, -1.6288, -1.6395],
         [ 0.6956,  0.2210,  9.2045,  ...,  0.2218,  0.2219,  0.2160],
         [-1.4803, -1.6478,  8.9454,  ..., -1.6395, -1.6371, -1.6486]]]), past_key_values=DynamicCache(), hidden_states=None, attentions=None)


AttributeError: 'NoneType' object has no attribute 'item'

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


' has happened to me, and I don’t know how to get out of it.\n\nis summarized by the sentence Daniel Radcliffe gets £20M fortune as he turns 18'

In [21]:
# Load dataset
def load_cnn_dataset(num_samples=10):
    try:
        # Try with a specific cache directory
        dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir=".cache")
        print("Dataset loaded successfully")
        
        # Verify the structure - this helps debug
        if num_samples > 0:
            print("Example dataset item:", dataset['train'][0])
            
        # Take only a small sample for testing
        if hasattr(dataset, 'train'):
            return dataset['train'].select(range(min(num_samples, len(dataset['train']))))
        
        return dataset['train'][:num_samples]
        
    except Exception as e:
        print(f"Error loading full dataset: {e}")
        
        # Create a tiny synthetic dataset for testing
        print("Creating synthetic test dataset instead...")
        
        sample_data = {
            'article': [
                "John likes to play basketball. He goes to the court every evening. His friends join him on weekends.",
                "The company announced record profits. Investors were pleased. The stock price increased by 10%."
            ],
            'highlights': [
                "John plays basketball regularly with friends.",
                "Company profits lead to stock price increase."
            ],
            'id': ['test1', 'test2']  # Added ID field
        }
        
        return Dataset.from_dict(sample_data)

In [22]:
def preprocess_text(example):
    """
    Process the CNN dataset example to extract article and highlight sentences
    """
    # Handle the article - could be a string or a list
    if isinstance(example['article'], list):
        # Already in list format
        article_sentences = example['article']
    else:
        # Need to tokenize
        article_sentences = nltk.sent_tokenize(example['article'])
    
    # Handle the highlights - could be a string or a list
    if isinstance(example['highlights'], list):
        # Already in list format
        highlight_sentences = example['highlights']
    else:
        # Need to tokenize
        highlight_sentences = nltk.sent_tokenize(example['highlights'])
    
    return {
        'article_sentences': article_sentences,
        'highlight_sentences': highlight_sentences
    }

In [23]:
# def forward_baseline_score(model, tokenizer, article_sentence, highlight, device='cpu'):
#     """Compute P(highlight|article_sentence)"""
#     # Format: [article_sentence] is summarized by: [highlight]
#     input_text = f"{article_sentence} is summarized by: "
    
#     # Tokenize the input and target separately
#     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
#     target_ids = tokenizer(highlight, return_tensors="pt").input_ids.to(device)
    
#     # Concatenate for full sequence, but remember where the split is
#     full_ids = t.cat([input_ids, target_ids[:, 1:]], dim=1)  # Skip BOS token for target
    
#     # Setup for loss calculation - we only want loss on highlight tokens
#     labels = t.full_like(full_ids, -100)  # -100 is ignored in loss calculation
#     labels[:, input_ids.shape[1]:] = target_ids[:, 1:]  # Only compute loss on highlight
    
#     # Calculate loss
#     outputs = model(full_ids, labels=labels)
#     return outputs.loss.item()

# def trlm_fo_score(model, tokenizer, article_sentence, highlight, device='cpu'):
#     """Compute P(article_sentence|highlight)"""
#     # Format: [article_sentence] is a summary of: [highlight]
#     input_text = f"{highlight} is a summary of: "
    
#     # Tokenize the input and target separately
#     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
#     target_ids = tokenizer(article_sentence, return_tensors="pt").input_ids.to(device)
    
#     # Concatenate for full sequence
#     full_ids = t.cat([input_ids, target_ids[:, 1:]], dim=1)
    
#     # Setup for loss calculation - we only want loss on article sentence tokens
#     labels = t.full_like(full_ids, -100)
#     labels[:, input_ids.shape[1]:] = target_ids[:, 1:]
    
#     # Calculate loss
#     outputs = model(full_ids, labels=labels)
#     return outputs.loss.item()

# def trlm_ba_score(backward_model, backward_tokenizer, article_sentence, highlight, device='cpu'):
#     """Compute P(article_sentence|highlight) using backward model"""
#     # Format according to paper: [highlight] is summarized by: [article_sentence]
    
#     # We need to tokenize each part separately to know token boundaries
#     highlight_tokens = backward_tokenizer(highlight, return_tensors="pt").input_ids.to(device)
#     connector_tokens = backward_tokenizer(" is summarized by: ", return_tensors="pt").input_ids.to(device)
#     article_tokens = backward_tokenizer(article_sentence, return_tensors="pt").input_ids.to(device)
    
#     # Combine all tokens (removing extra BOS tokens if needed)
#     # Keep only first BOS token, remove others
#     if connector_tokens.size(1) > 1:
#         connector_tokens = connector_tokens[:, 1:]
#     if article_tokens.size(1) > 1:
#         article_tokens = article_tokens[:, 1:]
    
#     combined_tokens = t.cat([highlight_tokens, connector_tokens, article_tokens], dim=1)
    
#     # Now reverse the combined tokens
#     reversed_tokens = t.flip(combined_tokens, dims=[1])
    
#     # Create labels tensor - start with all -100 (ignored positions)
#     labels = t.full_like(reversed_tokens, -100)
    
#     # In the reversed sequence, the article tokens appear at the beginning
#     # The length of article_tokens tells us how many tokens to score
#     article_length = article_tokens.size(1)
    
#     # Set the labels for article tokens (now at the beginning of reversed sequence)
#     labels[:, :article_length] = reversed_tokens[:, :article_length]
    
#     # Calculate loss only on the article tokens
#     outputs = backward_model(reversed_tokens, labels=labels)
#     return outputs.loss.item()

In [24]:
def calculate_metrics(predicted_sentence, gold_sentences):
    """
    Calculate similarity metrics between predicted sentence and gold references
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    
    # Handle empty inputs
    if not predicted_sentence or not gold_sentences:
        return {
            'tfidf_similarity': 0,
            'embedding_similarity': 0,
            'rouge1': 0,
            'rougeL': 0,
            'bleu': 0
        }
    
    # Load a sentence embedding model
    sentence_model = SentenceTransformer('all-mpnet-base-v2')
    
    # TF-IDF similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([predicted_sentence] + gold_sentences)
        tfidf_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
        tfidf_score = max(tfidf_similarities) if len(tfidf_similarities) > 0 else 0
    except:
        tfidf_score = 0
    
    # Embedding similarity
    try:
        pred_embedding = sentence_model.encode(predicted_sentence)
        gold_embeddings = sentence_model.encode(gold_sentences)
        embedding_similarities = cosine_similarity([pred_embedding], gold_embeddings)[0]
        embedding_score = max(embedding_similarities) if len(embedding_similarities) > 0 else 0
    except:
        embedding_score = 0
    
    # ROUGE score
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(predicted_sentence, gold_sent) for gold_sent in gold_sentences]
        rouge1_score = max([score['rouge1'].fmeasure for score in rouge_scores]) if rouge_scores else 0
        rougeL_score = max([score['rougeL'].fmeasure for score in rouge_scores]) if rouge_scores else 0
    except:
        rouge1_score = 0
        rougeL_score = 0
    
    # BLEU score with smoothing to avoid zero scores
    try:
        # Tokenize for BLEU calculation
        predicted_tokens = nltk.word_tokenize(predicted_sentence.lower())
        gold_tokens = [nltk.word_tokenize(gold.lower()) for gold in gold_sentences]
        
        # Use smoothing function to mitigate "0 counts of n-gram overlaps" warnings
        smoother = SmoothingFunction().method1
        
        # Calculate BLEU score - take the best score against any reference
        bleu_scores = [
            sentence_bleu([gold_tok], predicted_tokens, 
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=smoother)
            for gold_tok in gold_tokens
        ]
        bleu_score = max(bleu_scores) if bleu_scores else 0
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        bleu_score = 0
    
    return {
        'tfidf_similarity': tfidf_score,
        'embedding_similarity': embedding_score,
        'rouge1': rouge1_score,
        'rougeL': rougeL_score,
        'bleu': bleu_score
    }

In [25]:

# def calculate_model_scores(fo_model, fo_tokenizer, ba_model, ba_tokenizer, 
#                           article_sentences, highlight, device='cpu'):
#     """
#     Calculate scores for both forward and backward models properly.
#     Returns scores and selected sentences.
#     """
#     # Forward baseline scoring: P(highlight|sentence)
#     fw_scores = []
#     for sentence in article_sentences:
#         # Condition = sentence, Target = highlight
#         condition = f"{sentence} is summarized by: "
        
#         # Tokenize condition and target separately
#         condition_ids = fo_tokenizer(condition, return_tensors="pt").input_ids.to(device)
#         target_ids = fo_tokenizer(highlight, return_tensors="pt").input_ids.to(device)
        
#         # Combine them (skipping BOS token in target)
#         input_ids = t.cat([condition_ids, target_ids[:, 1:] if target_ids.size(1) > 1 else target_ids], dim=1)
        
#         # Create label mask that only calculates loss on target tokens
#         labels = t.full_like(input_ids, -100)
#         labels[:, condition_ids.size(1):] = input_ids[:, condition_ids.size(1):]
        
#         # Calculate loss
#         with t.no_grad():
#             outputs = fo_model(input_ids.to(device), labels=labels.to(device))
#             fw_scores.append(outputs.loss.item())
    
#     # TRLM-Fo scoring: P(sentence|highlight)
#     trlm_fo_scores = []
#     for sentence in article_sentences:
#         # Condition = highlight, Target = sentence
#         condition = f"{highlight} is a summary of: "
        
#         # Tokenize condition and target separately
#         condition_ids = fo_tokenizer(condition, return_tensors="pt").input_ids.to(device)
#         target_ids = fo_tokenizer(sentence, return_tensors="pt").input_ids.to(device)
        
#         # Combine them
#         input_ids = t.cat([condition_ids, target_ids[:, 1:] if target_ids.size(1) > 1 else target_ids], dim=1)
        
#         # Create label mask
#         labels = t.full_like(input_ids, -100)
#         labels[:, condition_ids.size(1):] = input_ids[:, condition_ids.size(1):]
        
#         # Calculate loss
#         with t.no_grad():
#             outputs = fo_model(input_ids.to(device), labels=labels.to(device))
#             trlm_fo_scores.append(outputs.loss.item())
    
#     # TRLM-Ba scoring (using backward model)
#     trlm_ba_scores = []
#     for sentence in article_sentences:
#         # Format text according to TRLM-Ba
#         text = f"{highlight} is summarized by: {sentence}"
        
#         # Tokenize and reverse
#         tokens = ba_tokenizer(text, return_tensors="pt").input_ids.to(device)
#         reversed_tokens = t.flip(tokens, dims=[1])
        
#         # We would need a more sophisticated approach to properly mask the labels
#         # For simplicity, we'll use the full sequence loss here
#         with t.no_grad():
#             outputs = ba_model(reversed_tokens, labels=reversed_tokens)
#             trlm_ba_scores.append(outputs.loss.item())
    
#     # Find best match for each model
#     fw_best_idx = np.argmin(fw_scores)
#     trlm_fo_best_idx = np.argmin(trlm_fo_scores)
#     trlm_ba_best_idx = np.argmin(trlm_ba_scores)
    
#     return {
#         'forward': {
#             'scores': fw_scores,
#             'best_sentence': article_sentences[fw_best_idx],
#             'best_score': fw_scores[fw_best_idx]
#         },
#         'trlm_fo': {
#             'scores': trlm_fo_scores,
#             'best_sentence': article_sentences[trlm_fo_best_idx],
#             'best_score': trlm_fo_scores[trlm_fo_best_idx]
#         },
#         'trlm_ba': {
#             'scores': trlm_ba_scores,
#             'best_sentence': article_sentences[trlm_ba_best_idx],
#             'best_score': trlm_ba_scores[trlm_ba_best_idx]
#         }
#     }



In [26]:
def calculate_model_scores(fo_model, fo_tokenizer, ba_model, ba_tokenizer, 
                          article_sentences, highlight, device='cpu'):
    """
    Calculate scores for all models and return the best matching sentences.
    """
    # Calculate scores for each model
    fw_scores = [forward_baseline_score(fo_model, fo_tokenizer, sentence, highlight, device) 
                for sentence in article_sentences]
    
    trlm_fo_scores = [trlm_fo_score(fo_model, fo_tokenizer, sentence, highlight, device) 
                     for sentence in article_sentences]
    
    trlm_ba_scores = [trlm_ba_score(ba_model, ba_tokenizer, sentence, highlight, device) 
                     for sentence in article_sentences]
    
    # Find best match for each model
    fw_best_idx = np.argmin(fw_scores)
    trlm_fo_best_idx = np.argmin(trlm_fo_scores)
    trlm_ba_best_idx = np.argmin(trlm_ba_scores)
    
    return {
        'forward': {
            'scores': fw_scores,
            'best_sentence': article_sentences[fw_best_idx],
            'best_score': fw_scores[fw_best_idx]
        },
        'trlm_fo': {
            'scores': trlm_fo_scores,
            'best_sentence': article_sentences[trlm_fo_best_idx],
            'best_score': trlm_fo_scores[trlm_fo_best_idx]
        },
        'trlm_ba': {
            'scores': trlm_ba_scores,
            'best_sentence': article_sentences[trlm_ba_best_idx],
            'best_score': trlm_ba_scores[trlm_ba_best_idx]
        }
    }

In [27]:
def calculate_model_scores(fo_model, fo_tokenizer, ba_model, ba_tokenizer, 
                          article_sentences, highlight, device='cpu'):
    """
    A simplified, more direct implementation to debug the issue.
    """
    # Forward baseline scoring (query → response)
    fw_scores = []
    for sentence in article_sentences:
        # Standard way: how likely is the highlight given the sentence
        prompt = f"{sentence} is summarized by: {highlight}"
        inputs = fo_tokenizer(prompt, return_tensors="pt").to(device)
        
        with t.no_grad():
            outputs = fo_model(**inputs, labels=inputs["input_ids"])
            fw_scores.append(outputs.loss.item())
    
    # TRLM-Fo scoring (response → query using forward model)
    trlm_fo_scores = []
    for sentence in article_sentences:
        # Reverse direction: how likely is the sentence given the highlight
        prompt = f"{highlight} is a summary of: {sentence}"
        inputs = fo_tokenizer(prompt, return_tensors="pt").to(device)
        
        with t.no_grad():
            outputs = fo_model(**inputs, labels=inputs["input_ids"])
            trlm_fo_scores.append(outputs.loss.item())
    
    # TRLM-Ba scoring (response → query using backward model)
    trlm_ba_scores = []
    for sentence in article_sentences:
        # For backward model: reverse the entire prompt
        prompt = f"{sentence} is summarized by: {highlight}"
        tokens = ba_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        reversed_tokens = t.flip(tokens, dims=[1])
        
        with t.no_grad():
            outputs = ba_model(reversed_tokens, labels=reversed_tokens)
            trlm_ba_scores.append(outputs.loss.item())
    
    # Find best match for each model
    fw_best_idx = np.argmin(fw_scores)
    trlm_fo_best_idx = np.argmin(trlm_fo_scores)
    trlm_ba_best_idx = np.argmin(trlm_ba_scores)
    
    # Print some debug info
    print("\nDEBUG INFO:")
    print(f"Forward scores (min: {min(fw_scores):.4f}, idx: {fw_best_idx})")
    print(f"TRLM-Fo scores (min: {min(trlm_fo_scores):.4f}, idx: {trlm_fo_best_idx})")
    print(f"TRLM-Ba scores (min: {min(trlm_ba_scores):.4f}, idx: {trlm_ba_best_idx})")
    
    return {
        'forward': {
            'scores': fw_scores,
            'best_sentence': article_sentences[fw_best_idx],
            'best_score': fw_scores[fw_best_idx]
        },
        'trlm_fo': {
            'scores': trlm_fo_scores,
            'best_sentence': article_sentences[trlm_fo_best_idx],
            'best_score': trlm_fo_scores[trlm_fo_best_idx]
        },
        'trlm_ba': {
            'scores': trlm_ba_scores,
            'best_sentence': article_sentences[trlm_ba_best_idx],
            'best_score': trlm_ba_scores[trlm_ba_best_idx]
        }
    }

In [None]:
import torch
import numpy as np

def calculate_model_scores(fo_model, fo_tokenizer, ba_model, ba_tokenizer,
                          article_sentences, highlight, device='cpu'):
    """
    A simplified implementation that sticks to the core functionality.
    
    Args:
        fo_model: Forward language model
        fo_tokenizer: Tokenizer for forward model
        ba_model: Backward language model (trained on reversed tokens)
        ba_tokenizer: Tokenizer for backward model
        article_sentences: List of candidate sentences from the article
        highlight: The highlight/summary to match with article sentences
        device: Device to run models on ('cpu' or 'cuda')
    """
    # Initialize scores
    fw_scores = []
    trlm_fo_scores = []
    trlm_ba_scores = []
    
    # Ensure padding token exists for both tokenizers
    if fo_tokenizer.pad_token is None:
        fo_tokenizer.pad_token = fo_tokenizer.eos_token
    if ba_tokenizer.pad_token is None:
        ba_tokenizer.pad_token = ba_tokenizer.eos_token
    
    # Process each sentence individually (no batching)
    for sentence in article_sentences:
        # 1. FORWARD MODEL SCORING (query → response)
        try:
            # Create prompt
            prompt = f"{sentence} is summarized by: {highlight}"
            
            # Tokenize
            inputs = fo_tokenizer(prompt, return_tensors="pt").to(device)
            
            # Calculate perplexity
            with torch.no_grad():
                outputs = fo_model(**inputs)
                
                # Get logits
                logits = outputs.logits
                
                # Calculate loss directly
                labels = inputs.input_ids.clone()
                
                # Shift for causal language modeling
                shift_logits = logits[:, :-1, :]
                shift_labels = labels[:, 1:]
                
                # Simple loss calculation
                loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
                loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), 
                              shift_labels.view(-1))
                
                # Normalize by sequence length
                seq_length = shift_labels.numel()
                fw_scores.append(loss.item() / seq_length)
        except Exception as e:
            print(f"Forward scoring error: {e}")
            fw_scores.append(float('inf'))
        
        # 2. TRLM-FO SCORING (response → query using forward model)
        try:
            # Create prompt
            prompt = f"{highlight} is a summary of: {sentence}"
            
            # Tokenize
            inputs = fo_tokenizer(prompt, return_tensors="pt").to(device)
            
            # Calculate perplexity
            with torch.no_grad():
                outputs = fo_model(**inputs)
                
                # Get logits
                logits = outputs.logits
                
                # Calculate loss directly
                labels = inputs.input_ids.clone()
                
                # Shift for causal language modeling
                shift_logits = logits[:, :-1, :]
                shift_labels = labels[:, 1:]
                
                # Simple loss calculation
                loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
                loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), 
                              shift_labels.view(-1))
                
                # Normalize by sequence length
                seq_length = shift_labels.numel()
                trlm_fo_scores.append(loss.item() / seq_length)
        except Exception as e:
            print(f"TRLM-Fo scoring error: {e}")
            trlm_fo_scores.append(float('inf'))
        
        # 3. TRLM-BA SCORING (backward model)
        try:
            # For backward model, simply reverse the tokens
            prompt = f"{sentence} is summarized by: {highlight}"
            tokens = ba_tokenizer.encode(prompt, add_special_tokens=True)
            reversed_tokens = tokens[::-1]
            
            # Convert to tensor
            input_ids = torch.tensor([reversed_tokens]).to(device)
            
            # Calculate perplexity
            with torch.no_grad():
                outputs = ba_model(input_ids)
                
                # Get logits
                logits = outputs.logits
                
                # Calculate loss directly
                labels = input_ids.clone()
                
                # Shift for causal language modeling
                shift_logits = logits[:, :-1, :]
                shift_labels = labels[:, 1:]
                
                # Simple loss calculation
                loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
                loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), 
                              shift_labels.view(-1))
                
                # Normalize by sequence length
                seq_length = shift_labels.numel()
                trlm_ba_scores.append(loss.item() / seq_length)
        except Exception as e:
            print(f"TRLM-Ba scoring error: {e}")
            trlm_ba_scores.append(float('inf'))
    
    # Find best match for each approach
    fw_best_idx = np.argmin(fw_scores) if fw_scores else -1
    trlm_fo_best_idx = np.argmin(trlm_fo_scores) if trlm_fo_scores else -1
    trlm_ba_best_idx = np.argmin(trlm_ba_scores) if trlm_ba_scores else -1
    
    # Print debug information
    print("\nDEBUG INFO:")
    if fw_scores:
        print(f"Forward scores (min: {min(fw_scores):.4f}, idx: {fw_best_idx})")
    if trlm_fo_scores:
        print(f"TRLM-Fo scores (min: {min(trlm_fo_scores):.4f}, idx: {trlm_fo_best_idx})")
    if trlm_ba_scores:
        print(f"TRLM-Ba scores (min: {min(trlm_ba_scores):.4f}, idx: {trlm_ba_best_idx})")
    
    # Return results
    return {
        'forward': {
            'scores': fw_scores,
            'best_sentence': article_sentences[fw_best_idx] if fw_best_idx >= 0 else None,
            'best_score': fw_scores[fw_best_idx] if fw_best_idx >= 0 else float('inf')
        },
        'trlm_fo': {
            'scores': trlm_fo_scores,
            'best_sentence': article_sentences[trlm_fo_best_idx] if trlm_fo_best_idx >= 0 else None,
            'best_score': trlm_fo_scores[trlm_fo_best_idx] if trlm_fo_best_idx >= 0 else float('inf')
        },
        'trlm_ba': {
            'scores': trlm_ba_scores,
            'best_sentence': article_sentences[trlm_ba_best_idx] if trlm_ba_best_idx >= 0 else None,
            'best_score': trlm_ba_scores[trlm_ba_best_idx] if trlm_ba_best_idx >= 0 else float('inf')
        }
    }

In [53]:

def run_evaluation(num_samples=10):
    # Setup part remains the same
    if t.cuda.is_available(): 
        device = 'cuda' 
    elif t.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'
    print(f"Using device: {device}")
    
    fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models(device)
    dataset = load_cnn_dataset(num_samples)
    
    print(f"Dataset type: {type(dataset)}")
    print(f"Dataset keys: {dataset.keys()}")
    
    # Results list to store all evaluation results
    results = []
    
    # Process each example
    num_examples = len(dataset['article']) if 'article' in dataset else 0
    print(f"Processing {num_examples} examples")
    
    for idx in tqdm(range(num_examples)):
        try:
            # Prepare the example
            example = {
                'article': dataset['article'][idx],
                'highlights': dataset['highlights'][idx],
                'id': dataset['id'][idx] if 'id' in dataset else f"example_{idx}"
            }
            
            # Print info
            print(f"\n--- Example {idx+1} ---")
            print(f"Article (truncated): {example['article'][:150]}...")
            print(f"Highlight: {example['highlights']}")
            
            # Preprocess
            processed = preprocess_text(example)
            article_sentences = processed['article_sentences']
            
            # Skip if article is too short
            if len(article_sentences) < 3:
                print("Article too short, skipping")
                continue
                
            # Evaluate on first highlight sentence (if available)
            if processed['highlight_sentences']:
                highlight = processed['highlight_sentences'][0]
                
                # Get scores for all models
                model_results = calculate_model_scores(
                    fo_model, fo_tokenizer, ba_model, ba_tokenizer,
                    article_sentences, highlight, device
                )
                
                # Calculate metrics
                fw_metrics = calculate_metrics(
                    model_results['forward']['best_sentence'], 
                    processed['highlight_sentences']
                )
                
                trlm_fo_metrics = calculate_metrics(
                    model_results['trlm_fo']['best_sentence'], 
                    processed['highlight_sentences']
                )
                
                trlm_ba_metrics = calculate_metrics(
                    model_results['trlm_ba']['best_sentence'], 
                    processed['highlight_sentences']
                )
                
                # Store results
                result = {
                    'example_id': example['id'],
                    'highlight': highlight,
                    'forward_sentence': model_results['forward']['best_sentence'],
                    'trlm_fo_sentence': model_results['trlm_fo']['best_sentence'],
                    'trlm_ba_sentence': model_results['trlm_ba']['best_sentence'],
                    'forward_score': model_results['forward']['best_score'],
                    'trlm_fo_score': model_results['trlm_fo']['best_score'],
                    'trlm_ba_score': model_results['trlm_ba']['best_score']
                }
                
                # Add all metrics
                for metric, value in fw_metrics.items():
                    result[f'forward_{metric}'] = value
                
                for metric, value in trlm_fo_metrics.items():
                    result[f'trlm_fo_{metric}'] = value
                    
                for metric, value in trlm_ba_metrics.items():
                    result[f'trlm_ba_{metric}'] = value
                
                results.append(result)
                
                # Print metrics
                print(f"Forward: {model_results['forward']['best_sentence']}")
                print(f"TRLM-Fo: {model_results['trlm_fo']['best_sentence']}")
                print(f"TRLM-Ba: {model_results['trlm_ba']['best_sentence']}")
                
        except Exception as e:
            print(f"Error processing example {idx}: {e}")
            continue
    
    return results



In [54]:
def create_detailed_dataframe(results):
    """
    Create a simplified DataFrame focusing only on the loss/score comparisons.
    """
    if not results:
        return pd.DataFrame()
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    
    # Create a new DataFrame with multi-index columns
    example_cols = ['example_id', 'highlight']
    model_names = ['forward', 'trlm_fo', 'trlm_ba']
    
    # Create multi-index column list
    columns = []
    for col in example_cols:
        columns.append((col, ''))
    
    # Add sentences and scores columns
    for model in model_names:
        # Add sentence column
        columns.append((model, 'sentence'))
        # Add score column
        columns.append((model, 'score'))
    
    # Create multi-index DataFrame
    multi_df = pd.DataFrame(index=df.index)
    
    # Add example columns
    for col in example_cols:
        multi_df[(col, '')] = df[col]
    
    # Add sentence and score columns
    for model in model_names:
        # Add sentence
        multi_df[(model, 'sentence')] = df[f'{model}_sentence']
        # Add score
        multi_df[(model, 'score')] = df[f'{model}_score']
    
    # Set column multi-index
    multi_df.columns = pd.MultiIndex.from_tuples(columns)
    
    return multi_df

In [55]:
# %%
def create_summary_dataframe(results):
    """
    Create a summary DataFrame with models as rows and metrics as columns.
    """
    if not results:
        return pd.DataFrame()
    
    # Convert to DataFrame first
    df = pd.DataFrame(results)
    
    # Models we're comparing
    models = ['forward', 'trlm_fo', 'trlm_ba']
    
    # Metrics we want to summarize
    metrics = [
        'score',
        'perplexity',
        'tfidf_similarity',
        'embedding_similarity', 
        'rouge1',
        'rougeL',
        'bleu'
    ]
    
    # Create the summary DataFrame
    summary_data = {}
    
    for metric in metrics:
        for model in models:
            col_name = f'{model}_{metric}'
            if col_name in df.columns:
                if metric not in summary_data:
                    summary_data[metric] = {}
                summary_data[metric][model] = df[col_name].mean()
    
    # Convert to DataFrame
    summary_df = pd.DataFrame(summary_data)
    
    return summary_df

In [56]:
results = run_evaluation(num_samples=5)   
 
# Create detailed DataFrame
detailed_df = create_detailed_dataframe(results)

# Create summary DataFrame
summary_df = create_summary_dataframe(results)

Using device: mps
Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," curre

  0%|          | 0/5 [00:00<?, ?it/s]


--- Example 1 ---
Article (truncated): LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monda...
Highlight: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba scoring error: Could not infer dtype of NoneType
TRLM-Ba

KeyboardInterrupt: 

In [50]:
detailed_df

Unnamed: 0_level_0,example_id,highlight,forward,forward,trlm_fo,trlm_fo,trlm_ba,trlm_ba
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sentence,score,sentence,score,sentence,score
0,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4,Harry Potter star Daniel Radcliffe gets £20M f...,"LONDON, England (Reuters) -- Harry Potter star...",3.312327,"LONDON, England (Reuters) -- Harry Potter star...",3.438889,"LONDON, England (Reuters) -- Harry Potter star...",3.671095
1,ee8871b15c50d0db17b0179a6d2beab35065f1e9,Mentally ill inmates in Miami are housed on th...,"Starting in 2008, many inmates who would other...",3.835283,Leifman says in 1955 there were more than half...,3.854144,Leifman says in 1955 there were more than half...,3.835589
2,06352019a19ae31e527f37f7571c6dd7f0c5da37,"NEW: ""I thought I was going to die,"" driver sa...","""I knew the deck was going down, there was no ...",3.347004,"""I knew the deck was going down, there was no ...",3.083808,"""I knew the deck was going down, there was no ...",3.357843
3,24521a2abb2e1f5e34e6824e0f9e56904a2b0e88,"Five small polyps found during procedure; ""non...",A colonoscopy is the most sensitive test for c...,4.082535,The procedure was supervised by Dr. Richard Tu...,3.942272,The procedure was supervised by Dr. Richard Tu...,3.843853
4,7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a,"NEW: NFL chief, Atlanta Falcons owner critical...","""Such costs may include, but are not limited t...",3.635967,The charge is punishable by up to five years i...,3.538003,The charge is punishable by up to five years i...,3.589679


In [51]:
summary_df

Unnamed: 0,score,tfidf_similarity,embedding_similarity,rouge1,rougeL,bleu
forward,3.642623,0.231524,0.529002,0.255832,0.232182,0.107367
trlm_fo,3.571423,0.214766,0.511483,0.231062,0.216945,0.094781
trlm_ba,3.659612,0.214766,0.511483,0.231062,0.216945,0.094781
