In [None]:
import torch as t
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from pprint import pprint

In [None]:
fo_model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-160m-deduped",
  revision="step143000",
  cache_dir="./.cache/pythia-160m-deduped/step143000",
)

fo_tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-160m-deduped",
  revision="step143000",
  cache_dir="./.cache/pythia-160m-deduped/step143000",
)

inputs = fo_tokenizer("To be or not to be, that is the", return_tensors="pt")
tokens = fo_model.generate(**inputs)
fo_tokenizer.decode(tokens[0])


In [None]:
ba_tokenizer = AutoTokenizer.from_pretrained(
    "afterless/reverse-pythia-160m",
    # cache_dir="./.cache/reverse-pythia-160m",
)

ba_model = GPTNeoXForCausalLM.from_pretrained(
    "afterless/reverse-pythia-160m",
    # cache_dir="./.cache/reverse-pythia-160m",
)

inputs = ba_tokenizer(
    "the cheese was the best",
    return_token_type_ids=False,
    return_tensors="pt"
)

inputs['input_ids'] = t.flip(inputs.input_ids, (1,))
tokens = t.flip(ba_model.generate(**inputs), (1,))
ba_tokenizer.decode(tokens[0])


In [1]:
import torch as t
import numpy as np
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd
import rouge 
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/ivw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/ivw/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load models
def load_models():
    # Forward model
    fo_model = GPTNeoXForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    )
    
    fo_tokenizer = AutoTokenizer.from_pretrained(
        "EleutherAI/pythia-160m-deduped",
        revision="step143000",
        cache_dir="./.cache/pythia-160m-deduped/step143000",
    )
    
    # Backward model
    ba_model = GPTNeoXForCausalLM.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    )
    
    ba_tokenizer = AutoTokenizer.from_pretrained(
        "afterless/reverse-pythia-160m",
        cache_dir="./.cache/reverse-pythia-160m",
    )
    
    return fo_model, fo_tokenizer, ba_model, ba_tokenizer


In [3]:
# Load dataset
def load_cnn_dataset(num_samples=10):
    try:
        # Try with a specific cache directory
        dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir="./dataset_cache")
        print("Dataset loaded successfully")
        
        # Verify the structure - this helps debug
        if num_samples > 0:
            print("Example dataset item:", dataset['train'][0])
            
        # Take only a small sample for testing
        if hasattr(dataset, 'train'):
            return dataset['train'].select(range(min(num_samples, len(dataset['train']))))
        
        return dataset['train'][:num_samples]
        
    except Exception as e:
        print(f"Error loading full dataset: {e}")
        
        # Create a tiny synthetic dataset for testing
        print("Creating synthetic test dataset instead...")
        from datasets import Dataset
        
        sample_data = {
            'article': [
                "John likes to play basketball. He goes to the court every evening. His friends join him on weekends.",
                "The company announced record profits. Investors were pleased. The stock price increased by 10%."
            ],
            'highlights': [
                "John plays basketball regularly with friends.",
                "Company profits lead to stock price increase."
            ],
            'id': ['test1', 'test2']  # Added ID field
        }
        
        return Dataset.from_dict(sample_data)

In [4]:
def preprocess_text(example):
    """
    Process the CNN dataset example to extract article and highlight sentences
    """
    # Handle the article - could be a string or a list
    if isinstance(example['article'], list):
        # Already in list format
        article_sentences = example['article']
    else:
        # Need to tokenize
        article_sentences = nltk.sent_tokenize(example['article'])
    
    # Handle the highlights - could be a string or a list
    if isinstance(example['highlights'], list):
        # Already in list format
        highlight_sentences = example['highlights']
    else:
        # Need to tokenize
        highlight_sentences = nltk.sent_tokenize(example['highlights'])
    
    return {
        'article_sentences': article_sentences,
        'highlight_sentences': highlight_sentences
    }

In [None]:
# Forward model citation evaluation
def evaluate_forward_model(model, tokenizer, article_sentences, highlight_sentence, device='cpu'):
    model.to(device)
    model.eval()
    
    scores = []
    with t.no_grad():
        for sentence in article_sentences:
            # Create a prompt for citation task
            prompt = f"{sentence} is summarized by {highlight_sentence}"
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            
            # Calculate loss/perplexity
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            scores.append(loss.item())
    
    # Return the best match
    best_match_idx = np.argmin(scores)
    return article_sentences[best_match_idx], scores[best_match_idx]


In [None]:
# Backward model citation evaluation
def evaluate_backward_model(model, tokenizer, article_sentences, highlight_sentence, device='cpu'):
    model.to(device)
    model.eval()
    
    scores = []
    with t.no_grad():
        for sentence in article_sentences:
            # For backward model, we reverse the direction: "highlight is derived from article"
            prompt = f"{highlight_sentence} is derived from {sentence}"
            inputs = tokenizer(prompt, return_token_type_ids=False, return_tensors="pt")
            
            # Flip the input tokens for backward model
            inputs['input_ids'] = t.flip(inputs.input_ids, (1,))
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Calculate loss
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            scores.append(loss.item())
    
    # Return the best match
    best_match_idx = np.argmin(scores)
    return article_sentences[best_match_idx], scores[best_match_idx]


In [7]:
# Evaluation metrics
def calculate_metrics(predicted_sentence, gold_sentences):
    # Load a sentence embedding model
    sentence_model = SentenceTransformer('all-mpnet-base-v2')
    
    # TF-IDF similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([predicted_sentence] + gold_sentences)
        tfidf_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
        tfidf_score = max(tfidf_similarities)
    except:
        tfidf_score = 0
    
    # Embedding similarity
    pred_embedding = sentence_model.encode(predicted_sentence)
    gold_embeddings = sentence_model.encode(gold_sentences)
    embedding_similarities = cosine_similarity([pred_embedding], gold_embeddings)[0]
    embedding_score = max(embedding_similarities)
    
    # ROUGE score
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(predicted_sentence, gold_sent) for gold_sent in gold_sentences]
    rouge1_score = max([score['rouge1'].fmeasure for score in rouge_scores])
    rougeL_score = max([score['rougeL'].fmeasure for score in rouge_scores])
    
    return {
        'tfidf_similarity': tfidf_score,
        'embedding_similarity': embedding_score,
        'rouge1': rouge1_score,
        'rougeL': rougeL_score
    }


In [13]:
def calculate_metrics(predicted_sentence, gold_sentences):

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    
    # Handle empty inputs
    if not predicted_sentence or not gold_sentences:
        return {
            'tfidf_similarity': 0,
            'embedding_similarity': 0,
            'rouge1': 0,
            'rougeL': 0,
            'bleu': 0
        }
    
    # Load a sentence embedding model
    sentence_model = SentenceTransformer('all-mpnet-base-v2')
    
    # TF-IDF similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([predicted_sentence] + gold_sentences)
        tfidf_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
        tfidf_score = max(tfidf_similarities) if len(tfidf_similarities) > 0 else 0
    except:
        tfidf_score = 0
    
    # Embedding similarity
    try:
        pred_embedding = sentence_model.encode(predicted_sentence)
        gold_embeddings = sentence_model.encode(gold_sentences)
        embedding_similarities = cosine_similarity([pred_embedding], gold_embeddings)[0]
        embedding_score = max(embedding_similarities) if len(embedding_similarities) > 0 else 0
    except:
        embedding_score = 0
    
    # ROUGE score
    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge_scores = [scorer.score(predicted_sentence, gold_sent) for gold_sent in gold_sentences]
        rouge1_score = max([score['rouge1'].fmeasure for score in rouge_scores]) if rouge_scores else 0
        rougeL_score = max([score['rougeL'].fmeasure for score in rouge_scores]) if rouge_scores else 0
    except:
        rouge1_score = 0
        rougeL_score = 0
    
    # BLEU score
    try:
        # Tokenize for BLEU calculation
        predicted_tokens = nltk.word_tokenize(predicted_sentence.lower())
        gold_tokens = [nltk.word_tokenize(gold.lower()) for gold in gold_sentences]
        
        # Calculate BLEU score - take the best score against any reference
        bleu_scores = [
            sentence_bleu([gold_tok], predicted_tokens, weights=(0.25, 0.25, 0.25, 0.25))
            for gold_tok in gold_tokens
        ]
        bleu_score = max(bleu_scores) if bleu_scores else 0
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        bleu_score = 0
    
    return {
        'tfidf_similarity': tfidf_score,
        'embedding_similarity': embedding_score,
        'rouge1': rouge1_score,
        'rougeL': rougeL_score,
        'bleu': bleu_score
    }

In [14]:
dataset = load_cnn_dataset(3)
dataset

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

{'article': ['LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office cha

In [15]:
def calculate_perplexity(model, tokenizer, text, device):
    """Calculate the perplexity of a text using a language model"""
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt").to(device)
    
    # Calculate perplexity
    with t.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    
    # Perplexity is the exponential of the average negative log-likelihood
    return t.exp(loss).item()

In [20]:
def run_evaluation(num_samples=10):
    # Load models and dataset
    fo_model, fo_tokenizer, ba_model, ba_tokenizer = load_models()
    dataset = load_cnn_dataset(num_samples)
    
    print(f"Dataset type: {type(dataset)}")
    print(f"Dataset keys: {dataset.keys()}")
    
    # Determine device
    if t.cuda.is_available(): 
        device = 'cuda' 
    elif t.backends.mps.is_available():
        device = 'mps'
    else:
        device = 'cpu'
    print(f"Using device: {device}")
    
    # Results
    results = []  # Store all results for tabular display
    
    # Correctly iterate through the dataset by index
    num_examples = len(dataset['article']) if 'article' in dataset else 0
    print(f"Processing {num_examples} examples")
    
    for idx in tqdm(range(num_examples)):
        try:
            # Properly construct each example from the dataset
            example = {
                'article': dataset['article'][idx],
                'highlights': dataset['highlights'][idx],
                'id': dataset['id'][idx] if 'id' in dataset else f"example_{idx}"
            }
            
            # Print article and highlight for visibility (truncated for readability)
            print(f"\n--- Example {idx+1} ---")
            print(f"Article (truncated): {example['article'][:150]}...")
            print(f"Highlight: {example['highlights']}")
            
            # Preprocess
            processed = preprocess_text(example)
            article_sentences = processed['article_sentences']
            
            # Skip if article is too short
            if len(article_sentences) < 3:
                print("Article too short, skipping")
                continue
                
            # Evaluate on first highlight sentence (if available)
            if processed['highlight_sentences']:
                highlight = processed['highlight_sentences'][0]
                
                # Forward model evaluation
                fw_sentence, fw_score = evaluate_forward_model(
                    fo_model, fo_tokenizer, article_sentences, highlight, device
                )
                fw_metrics = calculate_metrics(fw_sentence, processed['highlight_sentences'])
                fw_perplexity = calculate_perplexity(fo_model, fo_tokenizer, fw_sentence, device)
                fw_metrics['perplexity'] = fw_perplexity
                
                print(f"Forward model found: {fw_sentence}")
                
                # Backward model evaluation
                bw_sentence, bw_score = evaluate_backward_model(
                    ba_model, ba_tokenizer, article_sentences, highlight, device
                )
                bw_metrics = calculate_metrics(bw_sentence, processed['highlight_sentences'])
                bw_perplexity = calculate_perplexity(ba_model, ba_tokenizer, bw_sentence, device)
                bw_metrics['perplexity'] = bw_perplexity
                
                print(f"Backward model found: {bw_sentence}")
                
                # Store results for this example
                results.append({
                    'example_id': example['id'],
                    'highlight': highlight,
                    'forward_sentence': fw_sentence,
                    'backward_sentence': bw_sentence,
                    'forward_tfidf': fw_metrics['tfidf_similarity'],
                    'forward_embedding': fw_metrics['embedding_similarity'],
                    'forward_rouge1': fw_metrics['rouge1'],
                    'forward_rougeL': fw_metrics['rougeL'],
                    'forward_bleu': fw_metrics['bleu'],  # Add this line
                    'forward_perplexity': fw_perplexity,
                    'backward_tfidf': bw_metrics['tfidf_similarity'],
                    'backward_embedding': bw_metrics['embedding_similarity'],
                    'backward_rouge1': bw_metrics['rouge1'],
                    'backward_rougeL': bw_metrics['rougeL'],
                    'backward_bleu': bw_metrics['bleu'],  # Add this line
                    'backward_perplexity': bw_perplexity
                })
                
                # Print metrics for this example
                print("Forward Model Metrics:")
                for metric, value in fw_metrics.items():
                    print(f"  {metric}: {value:.4f}")
                
                print("Backward Model Metrics:")
                for metric, value in bw_metrics.items():
                    print(f"  {metric}: {value:.4f}")
        
        except Exception as e:
            print(f"Error processing example {idx}: {e}")
            continue
    
    # Skip table if no results
    if not results:
        print("No valid results found!")
        return [], []
    
    df = pd.DataFrame(results)
    
    # Add a row with average metrics
    avg_row = {
        'example_id': 'AVERAGE',
        'highlight': '',
        'forward_sentence': '',
        'backward_sentence': '',
    }
    
    for col in df.columns:
        if col not in ['example_id', 'highlight', 'forward_sentence', 'backward_sentence']:
            avg_row[col] = df[col].mean()
    
    # Append average row
    df = df._append(avg_row, ignore_index=True)
    
    # Display the full table
    print("\n--- Detailed Results ---")
    # Select just the metric columns for a cleaner display
    metrics_df = df[['example_id', 'forward_tfidf', 'forward_embedding', 'forward_rouge1', 'forward_rougeL', 'forward_perplexity',
                    'backward_tfidf', 'backward_embedding', 'backward_rouge1', 'backward_rougeL', 'backward_perplexity']]
    
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(metrics_df)
    
    # Create a summary table comparing forward vs backward
    print("\n--- Summary: Forward vs Backward ---")
    # In the summary_df creation part of run_evaluation:
    summary_df = pd.DataFrame({
        'Metric': ['TF-IDF Similarity', 'Embedding Similarity', 'ROUGE-1', 'ROUGE-L', 'BLEU', 'Perplexity'],
        'Forward Model': [
            df['forward_tfidf'].mean(),
            df['forward_embedding'].mean(),
            df['forward_rouge1'].mean(),
            df['forward_rougeL'].mean(),
            df['forward_bleu'].mean(),
            df['forward_perplexity'].mean()
        ],
        'Backward Model': [
            df['backward_tfidf'].mean(),
            df['backward_embedding'].mean(),
            df['backward_rouge1'].mean(),
            df['backward_rougeL'].mean(),
            df['backward_bleu'].mean(),
            df['backward_perplexity'].mean()
        ],
        'Difference': [
            df['backward_tfidf'].mean() - df['forward_tfidf'].mean(),
            df['backward_embedding'].mean() - df['forward_embedding'].mean(),
            df['backward_rouge1'].mean() - df['forward_rouge1'].mean(),
            df['backward_rougeL'].mean() - df['forward_rougeL'].mean(),
            df['backward_bleu'].mean() - df['forward_bleu'].mean(),
            df['backward_perplexity'].mean() - df['forward_perplexity'].mean()
        ]
    })

    # Add return statement at the end
    return summary_df, results
    

In [21]:
# Run the evaluation
if __name__ == "__main__":
    summary_df, results = run_evaluation(num_samples=3)  # Start with a small number for testing

Dataset loaded successfully
Example dataset item: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places be

  0%|          | 0/3 [00:00<?, ?it/s]


--- Example 1 ---
Article (truncated): LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monda...
Highlight: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Forward model found: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
 33%|███▎      | 1/3 [00:49<01:38, 49.43s/it]

Backward model found: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.
Forward Model Metrics:
  tfidf_similarity: 0.4075
  embedding_similarity: 0.8524
  rouge1: 0.4400
  rougeL: 0.4400
  bleu: 0.1593
  perplexity: 26.3242
Backward Model Metrics:
  tfidf_similarity: 0.4075
  embedding_similarity: 0.8524
  rouge1: 0.4400
  rougeL: 0.4400
  bleu: 0.1593
  perplexity: 6018.7441

--- Example 2 ---
Article (truncated): Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events....
Highlight: Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's f

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Forward model found: Leifman says in 1955 there were more than half a million people in state mental hospitals, and today that number has been reduced 90 percent, and 40,000 to 50,000 people are in mental hospitals.


 67%|██████▋   | 2/3 [00:55<00:24, 24.12s/it]

Backward model found: MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor."
Forward Model Metrics:
  tfidf_similarity: 0.1216
  embedding_similarity: 0.4310
  rouge1: 0.1882
  rougeL: 0.1176
  bleu: 0.0000
  perplexity: 25.2208
Backward Model Metrics:
  tfidf_similarity: 0.3690
  embedding_similarity: 0.6910
  rouge1: 0.2985
  rougeL: 0.1791
  bleu: 0.0351
  perplexity: 11079.6162

--- Example 3 ---
Article (truncated): MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. "The whole bridge from ...
Highlight: NEW: "I thought I was going to die," driver says .
Man says pickup truck was folded in half; he just has cut on face .
Driver: "I probably had a 30-, 35-foot free fall"
Minnesota bridge collapsed during rush hour Wednesday .


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Forward model found: "I knew the deck was going down, there was no question about it, and I thought I was going to die," he said.


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 3/3 [01:01<00:00, 20.50s/it]

Backward model found: "I knew the deck was going down, there was no question about it, and I thought I was going to die," he said.
Forward Model Metrics:
  tfidf_similarity: 0.4311
  embedding_similarity: 0.5442
  rouge1: 0.4242
  rougeL: 0.4242
  bleu: 0.2922
  perplexity: 12.7295
Backward Model Metrics:
  tfidf_similarity: 0.4311
  embedding_similarity: 0.5442
  rouge1: 0.4242
  rougeL: 0.4242
  bleu: 0.2922
  perplexity: 776.8411

--- Detailed Results ---
                                 example_id  forward_tfidf  forward_embedding  forward_rouge1  forward_rougeL  forward_perplexity  backward_tfidf  backward_embedding  backward_rouge1  backward_rougeL  backward_perplexity
0  42c027e4ff9730fbb3de84c1af0d2c506e41c3e4       0.407464           0.852448        0.440000        0.440000           26.324179        0.407464            0.852448         0.440000         0.440000          6018.744141
1  ee8871b15c50d0db17b0179a6d2beab35065f1e9       0.121585           0.431025        0.188235  




In [None]:
backward_results

In [None]:
!jupyter nbconvert --to html experiment.ipynb