# Graph-Based Reranking for Fact Verification

Exploring graph reranking in 4 stages:
1. **Basic Graph Reranker**: Simple semantic similarity graph
2. **AMR + AMR-BERT**: Abstract Meaning Representation with specialized embeddings
3. **Full GRAG**: AMR + Document reranker with RoBERTa/BERT
4. **GRAG + Custom Weights**: Plugin architecture for weight functions

Each implementation is encapsulated for easy integration into the CoRAG pipeline.

In [64]:
import sys
!{sys.executable} -m pip install spacy

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: D:\Jigoku\Python\python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
import networkx as nx
from typing import List, Dict, Tuple, Optional, Callable
from dataclasses import dataclass
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt
from collections import defaultdict

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import amrlib
import penman

from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config import INDEX_DIR, QRELS_PATH, CLAIMS_PATH
from pyserini.search.lucene import LuceneSearcher
from datasets import load_dataset
import json
import os
from collections import defaultdict
from datasets import load_dataset

In [3]:
import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
INDEX_DIR = PROJECT_ROOT / "data" / "wiki" / "index"
QRELS_PATH = PROJECT_ROOT / "data" / "fever-qrel.json"
print(f"Project root: {PROJECT_ROOT}")
print(f"Index directory: {INDEX_DIR}")

Project root: D:\Celestia\Projects\646_Project
Index directory: D:\Celestia\Projects\646_Project\data\wiki\index


## Preparing qrels and fever dataset using BM25 to test reranker on

In [21]:
# Qrel generation
if not QRELS_PATH.exists():
    print("GENERATING QRELS FROM LABELLED_DEV SPLIT")
    ds = load_dataset("fever", "v1.0", split="labelled_dev", trust_remote_code=True)
    print(f"Loaded {len(ds)} claims from labelled_dev split")
    
    qrels = defaultdict(lambda: defaultdict(lambda: 0))
    claims = []
    added_claims = set()
    
    # Process each example
    for ex in ds:
        cid = str(ex["id"])
        label = ex["label"]
        
        # Only use SUPPORTS/REFUTES (skip NEI)
        if label not in ("SUPPORTS", "REFUTES"):
            continue
        
        # Get evidence
        page = ex.get("evidence_wiki_url")
        sent_id = ex.get("evidence_sentence_id")
        claim_text = ex.get("claim")
        
        # Add to qrels if we have evidence
        if page and sent_id is not None and claim_text:
            qrels[cid][page] = 1
            
            if cid not in added_claims:
                claims.append({"id": cid, "input": claim_text})
                added_claims.add(cid)
    
    print(f"Processed {len(ds)} examples")
    print(f"Generated qrels for {len(qrels)} claims")
    print(f"Claims with evidence: {len(claims)}")
    
    # Save files
    QRELS_PATH.parent.mkdir(parents=True, exist_ok=True)
    
    with open(QRELS_PATH, "w", encoding="utf8") as f:
        json.dump(qrels, f, indent=2)
    
    with open(CLAIMS_PATH, "w", encoding="utf8") as f:
        json.dump(claims, f, indent=2)
    
    print(f"\n     Saved qrels to {QRELS_PATH}")
    print(f"Saved claims to {CLAIMS_PATH}")
else:
    print(f"Qrels already exist at {QRELS_PATH}")

Qrels already exist at D:\Celestia\Projects\646_Project\data\fever-qrel.json

Qrels loaded: 13332 claims
  Sample IDs: ['137334', '111897', '89891', '181634', '219028']


In [32]:
@dataclass
class Document:
    """Represents a retrieved document."""
    doc_id: str
    text: str
    score: float
    metadata: Optional[Dict] = None

@dataclass
class RerankedResult:
    """Result after reranking."""
    doc_id: str
    original_rank: int
    new_rank: int
    original_score: float
    reranked_score: float
    explanation: Optional[str] = None

In [33]:
class FEVERDataLoader:
    """Load real FEVER data and BM25 results for testing."""
    
    def __init__(self):
        print("Initializing FEVER data loader...")
        
        if not INDEX_DIR.exists():
            raise FileNotFoundError(f"Index not found at {INDEX_DIR}")
        
        self.searcher = LuceneSearcher(str(INDEX_DIR))
        self.searcher.set_bm25(1.2, 0.75)
        
        print("Loading FEVER dataset...")
        self.dataset = load_dataset("fever", "v1.0", trust_remote_code=True)

        full_dev_split = self.dataset["labelled_dev"]
        filtered_indices = [
            i for i, ex in enumerate(full_dev_split) 
            if ex['label'] in ('SUPPORTS', 'REFUTES')
        ]
        self.dev_split = full_dev_split.select(filtered_indices)
        
        self.qrels = None
        if QRELS_PATH.exists():
            with open(QRELS_PATH, 'r') as f:
                self.qrels = json.load(f)
            print(f"Loaded qrels with {len(self.qrels)} claims")
        else:
            print("No qrels found - retrieval metrics will be limited")
            print(" You can still evaluate ranking changes, but not P@3/R@3/MAP")
        
        print(f" Loaded {len(self.dev_split)} FEVER claims")
    
    def get_claim(self, index: int) -> dict:
        """Get a single claim from FEVER."""
        claim_data = self.dev_split[index]
        return {
            'id': claim_data['id'],
            'claim': claim_data['claim'],
            'label': claim_data['label'],
        }
    
    def get_bm25_results(self, claim: str, k: int = 10) -> List[Document]:
        """Get BM25 results for a claim."""
        hits = self.searcher.search(claim, k=k)
        
        documents = []
        for hit in hits:
            doc = self.searcher.doc(hit.docid)
            
            #  FIXED: Robust text extraction
            text = ""
            try:
                # Try raw() + JSON parse (most common)
                import json
                raw_doc = doc.raw()
                doc_dict = json.loads(raw_doc)
                text = doc_dict.get('contents', '')
            except:
                # Fallback: try contents() method
                try:
                    text = doc.contents()
                except:
                    # Last resort: stringify
                    text = str(doc)
            
            if not text:
                print(f" Warning: Empty text for doc {hit.docid}")
            
            documents.append(Document(
                doc_id=hit.docid,
                text=text,
                score=hit.score,
                metadata={'rank': len(documents) + 1}
            ))
        
        return documents
    
    def get_test_batch(self, start: int = 0, n: int = 10) -> List[dict]:
        """Get a batch of test claims with BM25 results."""
        batch = []
        for i in range(start, min(start + n, len(self.dev_split))):
            claim_data = self.get_claim(i)
            bm25_docs = self.get_bm25_results(claim_data['claim'], k=10)
            
            batch.append({
                'claim_data': claim_data,
                'bm25_docs': bm25_docs,
            })
        
        return batch


fever_loader = FEVERDataLoader()

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'fever' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.

2025-11-14 05:45:21,116 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'fever' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Initializing FEVER data loader...
Loading FEVER dataset...


Using the latest cached version of the dataset since fever couldn't be found on the Hugging Face Hub

Found the latest cached dataset configuration 'v1.0' at C:\Users\Shreya\.cache\huggingface\datasets\fever\v1.0\1.0.0\7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e (last modified on Mon Oct 20 18:04:46 2025).



Loaded qrels with 13332 claims
 Loaded 28625 FEVER claims


# Stage 1: Basic Graph Reranker

**Approach**: 
- Build a similarity graph using sentence embeddings (BERT Model)
- Nodes = documents, edges = semantic similarity
- Rerank using a combination of BM25 score + graph centrality

In [34]:
class GraphReranker:
    """
    Basic graph reranker using semantic similarity.

    1. Compute document embeddings
    2. Build a graph with edges weighted by cosine similarity
    3. Compute PageRank centrality
    4. Combine BM25 + centrality for final score
    """
    
    def __init__(self, 
                 model_name: str = 'all-MiniLM-L6-v2',
                 similarity_threshold: float = 0.3,
                 alpha: float = 0.5): 
        """
        Args:
            model_name: SentenceTransformer model for embeddings
            similarity_threshold: Minimum similarity to create an edge
            alpha: Balance between BM25 (alpha) and centrality (1-alpha)
        """
        self.encoder = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.alpha = alpha
        
    def _build_similarity_graph(self, 
                                documents: List[Document],
                                claim: str) -> nx.Graph:
        """
        Prepares NetworkX graph with 'weight' on edges and 'bm25_score' on nodes
        """
        # Encode all texts
        texts = [claim] + [doc.text for doc in documents]
        embeddings = self.encoder.encode(texts)
        
        claim_emb = embeddings[0]
        doc_embs = embeddings[1:]
        
        # Compute pairwise similarities
        sim_matrix = cosine_similarity(doc_embs)
        claim_sims = cosine_similarity([claim_emb], doc_embs)[0]
        
        # Build graph
        G = nx.Graph()
        
        # Add nodes with attributes
        for i, doc in enumerate(documents):
            G.add_node(doc.doc_id, 
                      bm25_score=doc.score,
                      claim_similarity=claim_sims[i],
                      text=doc.text)
        
        # Add edges based on similarity
        for i in range(len(documents)):
            for j in range(i+1, len(documents)):
                similarity = sim_matrix[i][j]
                if similarity >= self.similarity_threshold:
                    G.add_edge(documents[i].doc_id, 
                             documents[j].doc_id, 
                             weight=similarity)
        
        return G
    
    def rerank(self, 
               claim: str, 
               documents: List[Document],
               top_k: int = 3) -> List[RerankedResult]:
        """
        Rerank documents using graph centrality.
        Args:
            claim: The claim to verify
            documents: BM25 retrieved documents
            top_k: Number of documents to return
        """
        if not documents:
            return []
        
        # Build graph
        G = self._build_similarity_graph(documents, claim)
        
        # Compute PageRank centrality or use degree if the graph is disconnected
        try:
            centrality = nx.pagerank(G, weight='weight')
        except:
            centrality = nx.degree_centrality(G)
        
        # Normalize BM25 scores
        max_bm25 = max(doc.score for doc in documents)
        min_bm25 = min(doc.score for doc in documents)
        bm25_range = max_bm25 - min_bm25 if max_bm25 != min_bm25 else 1
        
        # Combine scores
        results = []
        for i, doc in enumerate(documents):
            norm_bm25 = (doc.score - min_bm25) / bm25_range
            cent_score = centrality.get(doc.doc_id, 0)
            
            final_score = self.alpha * norm_bm25 + (1 - self.alpha) * cent_score
            
            results.append(RerankedResult(
                doc_id=doc.doc_id,
                original_rank=i+1,
                new_rank=0,  # Will be set after sorting
                original_score=doc.score,
                reranked_score=final_score,
                explanation=f"BM25: {norm_bm25:.3f}, Centrality: {cent_score:.3f}"
            ))
        
        # Sort and assign new ranks
        results.sort(key=lambda x: x.reranked_score, reverse=True)
        for i, result in enumerate(results):
            result.new_rank = i + 1
        
        return results[:top_k]

In [35]:
# def visualize_graph(self, claim: str, documents: List[Document]):
#     G = self._build_similarity_graph(documents, claim)
#     plt.figure(figsize=(12, 8))
#     pos = nx.spring_layout(G)
#     # Draw nodes
#     node_sizes = [G.nodes[node]['bm25_score'] * 100 for node in G.nodes()]
#     nx.draw_networkx_nodes(G, pos, node_size=node_sizes, 
#                           node_color='lightblue', alpha=0.7)
#     # Draw edges
#     edges = G.edges()
#     weights = [G[u][v]['weight'] for u, v in edges]
#     nx.draw_networkx_edges(G, pos, width=[w*3 for w in weights], 
#                           alpha=0.5)
#     # Labels
#     nx.draw_networkx_labels(G, pos, font_size=10)
#     plt.title(f"Document Similarity Graph\nClaim: {claim}")
#     plt.axis('off')
#     plt.tight_layout()
#     plt.show()

In [36]:
basic_reranker_s1 = GraphReranker(alpha=0.6)

num_test_claims = 50
test_batch = fever_loader.get_test_batch(start=0, n=num_test_claims)

rankings_changed = 0
total_tested = 0
precision_at_3 = []
recall_at_3 = []
average_precisions = []

print(f"Testing on {num_test_claims} claims...\n")

for idx, item in enumerate(test_batch):
    claim = item['claim_data']['claim']
    claim_id = str(item['claim_data']['id'])
    bm25_docs = item['bm25_docs']
    
    if len(bm25_docs) < 3:
        continue
    
    reranked = basic_reranker_s1.rerank(claim, bm25_docs, top_k=10)
    
    original_top3 = [doc.doc_id for doc in bm25_docs[:3]]
    reranked_top3 = [r.doc_id for r in reranked[:3]]
    
    changed = original_top3 != reranked_top3
    if changed:
        rankings_changed += 1
    
    # Evaluate precision, recall, MAP
    if fever_loader.qrels and claim_id in fever_loader.qrels:
        relevant_docs = set(fever_loader.qrels[claim_id].keys())
        
        reranked_ids = [r.doc_id for r in reranked]
        
        top3_relevant = len(set(reranked_ids[:3]) & relevant_docs)
        p_at_3 = top3_relevant / 3
        precision_at_3.append(p_at_3)
        
        r_at_3 = top3_relevant / len(relevant_docs) if relevant_docs else 0
        recall_at_3.append(r_at_3)
        
        num_relevant_seen = 0
        precisions_at_relevant = []
        for i, doc_id in enumerate(reranked_ids, 1):
            if doc_id in relevant_docs:
                num_relevant_seen += 1
                precisions_at_relevant.append(num_relevant_seen / i)
        
        ap = np.mean(precisions_at_relevant) if precisions_at_relevant else 0
        average_precisions.append(ap)
    
    total_tested += 1


Testing on 50 claims...



In [37]:
print("Basic Graph Reranker Summary")
print(f"Claims tested: {total_tested}")
print(f"Rankings changed: {rankings_changed} ({rankings_changed/total_tested*100:.1f}%)")

Basic Graph Reranker Summary
Claims tested: 50
Rankings changed: 9 (18.0%)


In [38]:
if precision_at_3:
    print(f"Retrieval Metrics:")
    print(f"Precision@3: {np.mean(precision_at_3):.3f} ± {np.std(precision_at_3):.3f}")
    print(f"Recall@3: {np.mean(recall_at_3):.3f} ± {np.std(recall_at_3):.3f}")
    print(f"MAP: {np.mean(average_precisions):.3f} ± {np.std(average_precisions):.3f}")

Retrieval Metrics:
Precision@3: 0.113 ± 0.158
Recall@3: 0.300 ± 0.436
MAP: 0.220 ± 0.334


# Stage 2: AMR Graph + AMR-BERT Reranker
**Approach**:
- Parse documents into Abstract Meaning Representation (AMR) graphs
- Use AMR-BERT for specialized semantic embeddings
- Build graph over AMR structures instead of raw text

**Key Papers**:
- AMR parsing: https://github.com/bjascob/amrlib
- AMR-BERT: Specialized transformer for AMR graphs

dowloaded AMR model from: https://github.com/bjascob/amrlib-models/releases/

In [55]:
import amrlib
import penman
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
import re
from collections import Counter
        
class AMRGraphReranker:
    """
    Reranker using AMR graphs + AMR-BERT embeddings.
    
    Pipeline:
    1. Parse text to AMR graph (amrlib)
    2. Encode AMR to embeddings (AMR-BERT / RoBERTa)
    3. Build similarity graph
    4. Rerank using AMR structure + embeddings
    """
    
    def __init__(self,
                 model_name: str = 'all-MiniLM-L6-v2',
                 similarity_threshold: float = 0.2,
                 alpha: float = 0.5,
                 use_real_amr: bool = True):
        """
        Args:
            model_name: SentenceTransformer for text embeddings
            similarity_threshold: Min similarity for edges
            alpha: Balance BM25 vs AMR features
            use_real_amr: If True, use amrlib; if False, use regex fallback
        """
        self.encoder = SentenceTransformer(model_name)
        self.similarity_threshold = similarity_threshold
        self.alpha = alpha
        self.use_real_amr = use_real_amr
        self.amr_parser = None
        
        # Load AMR-BERT encoder (Using RoBERTa as alternative)
        print("Loading AMR-BERT encoder...")
        self.amr_bert_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
        self.amr_bert_model = AutoModel.from_pretrained('roberta-base')
        self.amr_bert_model.eval()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.amr_bert_model.to(self.device)
        print(f"AMR-BERT encoder loaded (RoBERTa-base on {self.device})")
        
        if use_real_amr:
            try:
                print("Loading AMR parser from local model...")
                amr_model_path = Path.cwd() / "model" / "amr" / "model_parse_xfm_bart_large-v0_1_0"
                if not amr_model_path.exists():
                    raise FileNotFoundError(f"AMR model not found at {amr_model_path}")
                
                print(f"Model path: {amr_model_path}")
                self.amr_parser = amrlib.load_stog_model(model_dir=str(amr_model_path))
                
                print("AMR parser loaded successfully!")
                print("  Model: STOG (String-to-Graph) - BART Large")
            except Exception as e:
                print(f"Failed to load AMR parser: {e}")
                print(f"   Error type: {type(e).__name__}")
                print("Falling back to simplified feature extraction")
                self.use_real_amr = False
    
    def _encode_amr_with_bert(self, amr_string: str) -> np.ndarray:
        """
        Encode AMR graph using AMR-BERT (RoBERTa).
        
        Takes the linearized AMR string and encodes it with RoBERTa,
        treating it as structured text.
        
        Args: amr_string: Penman notation AMR string
            
        Returns: 768-dimensional embedding vector
        """
        try:
            # Tokenize AMR string
            inputs = self.amr_bert_tokenizer(
                amr_string,
                return_tensors='pt',
                truncation=True,
                max_length=512,
                padding=True
            ).to(self.device)
            
            # Get embeddings
            with torch.no_grad():
                outputs = self.amr_bert_model(**inputs)
                # Use [CLS] token embedding (first token)
                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
            
            return embedding
            
        except Exception as e:
            print(f"AMR-BERT encoding error: {e}")
            return np.zeros(768) # Fallback: return zero vector
    
    def _parse_to_amr_real(self, text: str) -> Dict:
        """
        Parse text to AMR graph using amrlib.
        
        Returns: Dict with AMR concepts, relations, edges, graph, and AMR-BERT embedding
        """
        try:
            # Limit text length for speed 
            text = text[:500]
            
            # Parse to AMR
            amr_strings = self.amr_parser.parse_sents([text])
            
            if not amr_strings or len(amr_strings) == 0:
                return self._empty_amr(text)
            
            # Get first AMR string
            amr_string = amr_strings[0]
            
            # Parse with penman
            graph = penman.decode(amr_string)
            
            # Extract concepts
            concepts = []
            for instance in graph.instances():
                concepts.append(instance.target)
            
            # Extract relations and edges
            relations = []
            edges = []
            for edge in graph.edges():
                source, role, target = edge.source, edge.role, edge.target
                relations.append(role)
                edges.append((source, role, target))
            
            # Extract attributes
            attributes = {}
            for attr in graph.attributes():
                source, role, value = attr.source, attr.role, attr.target
                if role not in attributes:
                    attributes[role] = []
                attributes[role].append(value)
            
            # Encode AMR with AMR-BERT
            amr_bert_embedding = self._encode_amr_with_bert(amr_string)
            
            return {
                'concepts': concepts,
                'relations': relations,
                'edges': edges,
                'attributes': attributes,
                'graph': graph,
                'amr_string': amr_string,
                'amr_bert_embedding': amr_bert_embedding,  # ← NEW
                'text': text
            }
            
        except Exception as e:
            print(f"AMR parse error for text: {text[:50]}...")
            print(f"   Error: {e}")
            return self._empty_amr(text)
    
    def _parse_to_amr_simple(self, text: str) -> Dict:
        """Fallback: regex-based feature extraction. Only if use_real_amr == FALSE"""
       
        entities = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        verbs = re.findall(
            r'\b(?:is|was|are|were|be|been|being|has|have|had|'
            r'do|does|did|can|could|will|would|shall|should|may|might|must|'
            r'\w+ed|\w+ing)\b',
            text.lower()
        )
        
        stop_words = {'the', 'a', 'an', 'is', 'was', 'are', 'were', 'in', 'on', 
                      'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as'}
        words = re.findall(r'\b[a-z]+\b', text.lower())
        concepts = [w for w in words if w not in stop_words and len(w) > 3]
        
        # For simple mode, encode text directly with SentenceTransformer
        amr_bert_embedding = self.encoder.encode(text)
        
        return {
            'entities': entities,
            'concepts': concepts,
            'relations': verbs,
            'concept_freq': Counter(concepts),
            'entity_freq': Counter(entities),
            'relation_freq': Counter(verbs),
            'amr_bert_embedding': amr_bert_embedding,  
            'text': text
        }
    
    def _empty_amr(self, text: str) -> Dict:
        """Return empty AMR structure."""
        return {
            'concepts': [],
            'relations': [],
            'edges': [],
            'attributes': {},
            'graph': None,
            'amr_bert_embedding': np.zeros(768 if self.use_real_amr else 384),
            'text': text
        }
    
    def _parse_to_amr(self, text: str) -> Dict:
        """Route to real or simple AMR parsing."""
        if self.use_real_amr and self.amr_parser is not None:
            return self._parse_to_amr_real(text)
        else:
            return self._parse_to_amr_simple(text)
    
    def _compute_amr_similarity(self, amr1: Dict, amr2: Dict) -> float:
        """
        Compute AMR graph similarity using BOTH:
        1. Structural similarity (concepts, relations, edges)
        2. AMR-BERT embedding similarity
        """
        # 1. Structural similarity
        concepts1 = set(amr1.get('concepts', []))
        concepts2 = set(amr2.get('concepts', []))
        
        concept_sim = 0.0
        if concepts1 or concepts2:
            intersection = len(concepts1 & concepts2)
            union = len(concepts1 | concepts2)
            concept_sim = intersection / union if union > 0 else 0.0
        
        # Relation overlap
        relations1 = set(amr1.get('relations', []))
        relations2 = set(amr2.get('relations', []))
        
        relation_sim = 0.0
        if relations1 or relations2:
            intersection = len(relations1 & relations2)
            union = len(relations1 | relations2)
            relation_sim = intersection / union if union > 0 else 0.0
        
        # Edge overlap
        edge_sim = 0.0
        if self.use_real_amr and 'edges' in amr1 and 'edges' in amr2:
            edges1 = set(amr1['edges'])
            edges2 = set(amr2['edges'])
            if edges1 or edges2:
                intersection = len(edges1 & edges2)
                union = len(edges1 | edges2)
                edge_sim = intersection / union if union > 0 else 0.0
        
        # Structural similarity score
        if self.use_real_amr:
            structural_sim = 0.4 * concept_sim + 0.3 * relation_sim + 0.3 * edge_sim
        else:
            structural_sim = 0.6 * concept_sim + 0.4 * relation_sim
        
        # 2. AMR-BERT embedding similarity
        emb1 = amr1.get('amr_bert_embedding')
        emb2 = amr2.get('amr_bert_embedding')
        
        amr_bert_sim = 0.0
        if emb1 is not None and emb2 is not None:
            amr_bert_sim = cosine_similarity([emb1], [emb2])[0][0]
        
        # Combine structural + embedding similarity
        # 60% AMR-BERT (dense, semantic) + 40% structural (explicit)
        combined_similarity = 0.6 * amr_bert_sim + 0.4 * structural_sim
        
        return combined_similarity
    
    def _build_amr_graph(self, documents: List[Document], claim: str) -> nx.Graph:
        """Build graph using AMR features + AMR-BERT embeddings."""
        
        # Parse claim
        print(f"  Parsing claim to AMR...")
        claim_amr = self._parse_to_amr(claim)
        
        # Parse documents
        print(f"  Parsing {len(documents)} documents...")
        doc_amrs = []
        for i, doc in enumerate(documents):
            if self.use_real_amr and (i % 3 == 0 or i == len(documents)-1):
                print(f"  Progress: {i+1}/{len(documents)}")
            doc_amrs.append(self._parse_to_amr(doc.text))
        
        # Also compute text embeddings (for comparison)
        claim_text_emb = self.encoder.encode(claim)
        doc_text_embs = self.encoder.encode([doc.text[:500] for doc in documents])
        
        # Build graph
        G = nx.Graph()
        
        # Add nodes
        for i, doc in enumerate(documents):
            # AMR similarity (structural + AMR-BERT)
            amr_sim = self._compute_amr_similarity(claim_amr, doc_amrs[i])
            
            # Text embedding similarity (baseline)
            text_sim = cosine_similarity([claim_text_emb], [doc_text_embs[i]])[0][0]
            
            G.add_node(doc.doc_id,
                      bm25_score=doc.score,
                      amr_similarity=amr_sim,  # Combined structural + AMR-BERT
                      text_similarity=text_sim,
                      amr_features=doc_amrs[i])
        
        # Add edges
        for i in range(len(documents)):
            for j in range(i+1, len(documents)):
                # AMR similarity (includes AMR-BERT)
                amr_sim = self._compute_amr_similarity(doc_amrs[i], doc_amrs[j])
                
                # Text similarity
                text_sim = cosine_similarity([doc_text_embs[i]], [doc_text_embs[j]])[0][0]
                
                # Edge weight: prioritize AMR similarity
                edge_weight = 0.7 * amr_sim + 0.3 * text_sim
                
                if edge_weight >= self.similarity_threshold:
                    G.add_edge(documents[i].doc_id, documents[j].doc_id, 
                             weight=edge_weight)
        
        return G
    
    def rerank(self, claim: str, documents: List[Document], 
               top_k: int = 3) -> List[RerankedResult]:
        """Rerank using AMR graph + AMR-BERT analysis."""
        
        if not documents:
            return []
        
        # Build graph
        G = self._build_amr_graph(documents, claim)
        
        # Compute centrality
        try:
            centrality = nx.pagerank(G, weight='weight')
        except:
            centrality = nx.degree_centrality(G)
        
        # Normalize BM25
        max_bm25 = max(doc.score for doc in documents)
        min_bm25 = min(doc.score for doc in documents)
        bm25_range = max_bm25 - min_bm25 if max_bm25 != min_bm25 else 1
        
        # Score documents
        results = []
        for i, doc in enumerate(documents):
            norm_bm25 = (doc.score - min_bm25) / bm25_range
            amr_sim = G.nodes[doc.doc_id]['amr_similarity']  # Includes AMR-BERT
            text_sim = G.nodes[doc.doc_id]['text_similarity']
            cent = centrality.get(doc.doc_id, 0)
            
            # Final score: weighted combination
            final_score = (
                self.alpha * norm_bm25 +
                (1 - self.alpha) * (
                    0.5 * amr_sim +      # AMR (structural + AMR-BERT)
                    0.2 * text_sim +     # Text embedding
                    0.3 * cent           # Graph centrality
                )
            )
            
            results.append(RerankedResult(
                doc_id=doc.doc_id,
                original_rank=i+1,
                new_rank=0,
                original_score=doc.score,
                reranked_score=final_score,
                explanation=f"AMR: {amr_sim:.3f}, Text: {text_sim:.3f}, Cent: {cent:.3f}"
            ))
        
        results.sort(key=lambda x: x.reranked_score, reverse=True)
        for i, result in enumerate(results):
            result.new_rank = i + 1
        
        return results[:top_k]

In [56]:
# def visualize_amr(self, text: str):
#     """Visualize AMR parse for debugging."""
#     amr = self._parse_to_amr(text)
    
#     print(f"\n{'='*70}")
#     print(f"Text: {text[:100]}...")
#     print(f"\nAMR Features:")
#     print(f"  Concepts: {amr.get('concepts', [])[:15]}")
#     print(f"  Relations: {amr.get('relations', [])[:15]}")
#     print(f"  Edges (first 5): {amr.get('edges', [])[:5]}")
    
#     if 'amr_bert_embedding' in amr:
#         emb = amr['amr_bert_embedding']
#         print(f"\nAMR-BERT Embedding: {emb.shape} - [{emb[:5]}...]")
    
#     if 'amr_string' in amr and amr['amr_string']:
#         print(f"\nFull AMR Graph:")
#         print(amr['amr_string'])
#     print('='*70)

In [62]:
amr_reranker_s2 = AMRGraphReranker(use_real_amr=True, alpha=0.6)  # ← Added use_real_amr=True

num_test_claims = 25
test_batch = fever_loader.get_test_batch(start=0, n=num_test_claims)

rankings_changed = 0
total_tested = 0
amr_concepts_found = 0
amr_bert_embeddings_found = 0 
precision_at_3 = []
recall_at_3 = []
average_precisions = []

print(f"\nTesting on {num_test_claims} claims...")

Loading AMR-BERT encoder...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AMR-BERT encoder loaded (RoBERTa-base on cpu)
Loading AMR parser from local model...
Model path: D:\Celestia\Projects\646_Project\src\model\amr\model_parse_xfm_bart_large-v0_1_0
AMR parser loaded successfully!
  Model: STOG (String-to-Graph) - BART Large

Testing on 25 claims...


In [63]:
import time
total_time = 0

for idx, item in enumerate(test_batch):
    claim = item['claim_data']['claim']
    claim_id = str(item['claim_data']['id'])
    bm25_docs = item['bm25_docs']
    
    if len(bm25_docs) < 3:
        continue
    
    start_time = time.time()
    
    # Rerank
    print(f"\n[{idx+1}/{num_test_claims}] Processing claim...")
    reranked = amr_reranker_s2.rerank(claim, bm25_docs, top_k=10)
    
    elapsed = time.time() - start_time
    total_time += elapsed
    
    # Compare rankings
    original_top3 = [doc.doc_id for doc in bm25_docs[:3]]
    reranked_top3 = [r.doc_id for r in reranked[:3]]
    
    changed = original_top3 != reranked_top3
    if changed:
        rankings_changed += 1
    
    # Check AMR features
    claim_amr = amr_reranker_s2._parse_to_amr(claim)
    if len(claim_amr.get('concepts', [])) > 0:
        amr_concepts_found += 1
    
    # Check if AMR-BERT embedding exists
    if 'amr_bert_embedding' in claim_amr and claim_amr['amr_bert_embedding'] is not None:
        amr_bert_embeddings_found += 1
    
    # Compute retrieval metrics
    if fever_loader.qrels and claim_id in fever_loader.qrels:
        relevant_docs = set(fever_loader.qrels[claim_id].keys())
        reranked_ids = [r.doc_id for r in reranked]
        
        # Precision@3
        top3_relevant = len(set(reranked_ids[:3]) & relevant_docs)
        p_at_3 = top3_relevant / 3
        precision_at_3.append(p_at_3)
        
        # Recall@3
        r_at_3 = top3_relevant / len(relevant_docs) if relevant_docs else 0
        recall_at_3.append(r_at_3)
        
        # Average Precision
        num_relevant_seen = 0
        precisions_at_relevant = []
        for i, doc_id in enumerate(reranked_ids, 1):
            if doc_id in relevant_docs:
                num_relevant_seen += 1
                precisions_at_relevant.append(num_relevant_seen / i)
        
        ap = np.mean(precisions_at_relevant) if precisions_at_relevant else 0
        average_precisions.append(ap)
    
    total_tested += 1
    
    if idx < 3:
        print(f"\n{'─'*70}")
        print(f"Claim: {claim[:70]}...")
        print(f"  AMR concepts: {claim_amr.get('concepts', [])[:5]}")
        print(f"  AMR relations: {claim_amr.get('relations', [])[:3]}")
        
        if 'amr_bert_embedding' in claim_amr:
            emb = claim_amr['amr_bert_embedding']
            print(f"  AMR-BERT embedding: {emb.shape if hasattr(emb, 'shape') else len(emb)}-dim")
        
        print(f"  BM25 top-3:     {original_top3}")
        print(f"  Reranked top-3: {reranked_top3}")
        print(f"  Changed: {'  YES' if changed else '✗ NO'}")
        
        if fever_loader.qrels and claim_id in fever_loader.qrels:
            print(f"  Metrics: P@3={p_at_3:.3f}, R@3={r_at_3:.3f}, AP={ap:.3f}")
        
        print(f"  Time: {elapsed:.1f}s")



[1/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10

──────────────────────────────────────────────────────────────────────
Claim: Fox 2000 Pictures released the film Soul Food....
  AMR concepts: ['release-01', 'company', 'name', 'film', 'name']
  AMR relations: [':ARG0', ':name', ':ARG1']
  AMR-BERT embedding: (768,)-dim
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Reranked top-3: ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Changed: ✗ NO
  Metrics: P@3=0.333, R@3=1.000, AP=1.000
  Time: 883.5s

[2/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10

──────────────────────────────────────────────────────────────────────
Claim: Fox 2000 Pictures released the film Soul Food....
  AMR concepts: ['release-01', 'company', 'name', 'film', 'name']
  AMR relations: [':ARG0', ':name', ':ARG1']
  AMR-BERT embedding: (768,)-dim
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Reranked top-3: ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Changed: ✗ NO
  Metrics: P@3=0.333, R@3=1.000, AP=1.000
  Time: 985.7s

[3/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10

──────────────────────────────────────────────────────────────────────
Claim: Fox 2000 Pictures released the film Soul Food....
  AMR concepts: ['release-01', 'company', 'name', 'film', 'name']
  AMR relations: [':ARG0', ':name', ':ARG1']
  AMR-BERT embedding: (768,)-dim
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Reranked top-3: ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  Changed: ✗ NO
  Metrics: P@3=0.333, R@3=1.000, AP=1.000
  Time: 786.7s

[4/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10

[5/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10

[6/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10






[7/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10






[8/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10






[9/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10






[10/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10






[11/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10






  Progress: 10/10

[12/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10
  Progress: 7/10






  Progress: 10/10

[13/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10

[14/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10

[15/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






[16/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10







[17/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10







[18/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10







[19/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10







[20/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10







[21/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10

[22/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10

[23/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10

[24/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10

[25/25] Processing claim...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10


In [65]:
print(f"Claims tested: {total_tested}")
print(f"Rankings changed: {rankings_changed} ({rankings_changed/total_tested*100:.1f}%)")
print(f"AMR concepts detected: {amr_concepts_found}/{total_tested} ({amr_concepts_found/total_tested*100:.1f}%)")
print(f"AMR-BERT embeddings: {amr_bert_embeddings_found}/{total_tested} ({amr_bert_embeddings_found/total_tested*100:.1f}%)")  # ← NEW
print(f"Total time: {total_time:.1f}s (avg: {total_time/total_tested:.1f}s per claim)")

Claims tested: 25
Rankings changed: 2 (8.0%)
AMR concepts detected: 25/25 (100.0%)
AMR-BERT embeddings: 25/25 (100.0%)
Total time: 13477.7s (avg: 539.1s per claim)


In [66]:
if precision_at_3:
    print(f"\nRetrieval Metrics:")
    print(f"  Precision@3: {np.mean(precision_at_3):.3f} ± {np.std(precision_at_3):.3f}")
    print(f"  Recall@3:    {np.mean(recall_at_3):.3f} ± {np.std(recall_at_3):.3f}")
    print(f"  MAP:         {np.mean(average_precisions):.3f} ± {np.std(average_precisions):.3f}")


Retrieval Metrics:
  Precision@3: 0.187 ± 0.165
  Recall@3:    0.480 ± 0.458
  MAP:         0.356 ± 0.370


# Stage 3: Full GRAG Implementation

**Components** (from GRAG paper):
1. **AMR Graph Construction**: Parse documents to AMR
2. **Document Encoder**: RoBERTa for document embeddings
3. **Fact Encoder**: BERT for fact triplet embeddings
4. **Cross-Document Reasoning**: Build fact-level graph across documents
5. **Multi-stage Reranking**: AMR → Document → Fact

**Key Innovation**: Extract factual triplets (subject-relation-object) and reason over them

Uses AMR Component from above

In [67]:
import spacy
from collections import defaultdict

@dataclass
class FactTriplet:
    """Represents a fact as (subject, relation, object) triplet."""
    subject: str
    relation: str
    obj: str
    doc_id: str
    confidence: float = 1.0
    source_text: str = ""
    
    def __str__(self):
        return f"({self.subject}, {self.relation}, {self.obj})"
    
    def __hash__(self):
        return hash((self.subject.lower(), self.relation.lower(), self.obj.lower()))
    
    def __eq__(self, other):
        return (self.subject.lower() == other.subject.lower() and
                self.relation.lower() == other.relation.lower() and
                self.obj.lower() == other.obj.lower())

In [70]:
import sys
!{sys.executable} -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 327.7 kB/s eta 0:00:39
      --------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
     --- ------------------------------------ 1.0/12.8 MB 4.4 MB/s eta 0:00:03
     --------- ------------------------------ 2.9/12.8 MB 10.4 MB/s eta 0:00:01
     -------------------- ------------------- 6.4/12.8 MB 19.6 MB/s eta 0:00:01
     ------------------------------- ------- 10.3/12.8 MB 40.9 MB/s eta 0:00:01
     ------------------------------------ --


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: D:\Jigoku\Python\python.exe -m pip install --upgrade pip


In [73]:
class GRAGReranker:
    """
    Full GRAG implementation with multi-stage graph reasoning.
    """
    
    def __init__(self,
                 doc_encoder: str = 'roberta-base',
                 fact_encoder: str = 'bert-base-uncased',
                 use_real_amr: bool = True,
                 device: str = 'cpu'):
        """
        Args:
            doc_encoder: Model for document embeddings (RoBERTa)
            fact_encoder: Model for fact embeddings (BERT)
            use_real_amr: Use real AMR parser or simplified version
            device: 'cpu' or 'cuda'
        """
        self.device = device
        self.use_real_amr = use_real_amr
        
        # Load document encoder (RoBERTa)
        print(f"Loading document encoder: {doc_encoder}...")
        self.doc_tokenizer = AutoTokenizer.from_pretrained(doc_encoder)
        self.doc_model = AutoModel.from_pretrained(doc_encoder).to(device)
        self.doc_model.eval()
        
        # Load fact encoder (BERT)
        print(f"Loading fact encoder: {fact_encoder}...")
        self.fact_tokenizer = AutoTokenizer.from_pretrained(fact_encoder)
        self.fact_model = AutoModel.from_pretrained(fact_encoder).to(device)
        self.fact_model.eval()
        
        # AMR component (reuse from Stage 2)
        print("Loading AMR parser...")
        self.amr_reranker = AMRGraphReranker(use_real_amr=use_real_amr)
        
        # Load spaCy for fact extraction
        try:
            print("Loading spaCy for fact extraction...")
            self.nlp = spacy.load("en_core_web_sm")
        except:
            print("  spaCy model not found. Installing...")
            import os
            os.system("")
            self.nlp = spacy.load("en_core_web_sm")
        
        print("GRAG Reranker initialized")
    
    def _encode_text(self, text: str, encoder: str = 'doc') -> np.ndarray:
        """
        Encode text using specified encoder.
        
        Args:
            text: Input text
            encoder: 'doc' (RoBERTa) or 'fact' (BERT)
            
        Returns:
            Embedding vector
        """
        if encoder == 'doc':
            tokenizer = self.doc_tokenizer
            model = self.doc_model
        else:
            tokenizer = self.fact_tokenizer
            model = self.fact_model
        
        # Tokenize
        inputs = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Encode
        with torch.no_grad():
            outputs = model(**inputs)
            # Use [CLS] token embedding
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
        
        return embedding
    
    def _extract_facts_spacy(self, text: str, doc_id: str) -> List[FactTriplet]:
        """
        Extract fact triplets using spaCy dependency parsing.
        
        Extracts (subject, relation/verb, object) patterns:
        - Subject-Verb-Object patterns from dependency tree
        """
        doc = self.nlp(text[:1000])  # Limit length for speed
        facts = []
        
        for sent in doc.sents:
            # Find verb as relation
            for token in sent:
                if token.pos_ == "VERB":
                    # Find subject
                    subjects = [child for child in token.children 
                               if child.dep_ in ("nsubj", "nsubjpass")]
                    
                    # Find object
                    objects = [child for child in token.children 
                              if child.dep_ in ("dobj", "pobj", "attr")]
                    
                    # Create facts
                    for subj in subjects:
                        for obj in objects:
                            # Get full noun phrases
                            subj_text = " ".join([t.text for t in subj.subtree])
                            obj_text = " ".join([t.text for t in obj.subtree])
                            
                            if len(subj_text) > 2 and len(obj_text) > 2:
                                facts.append(FactTriplet(
                                    subject=subj_text.strip(),
                                    relation=token.lemma_,
                                    obj=obj_text.strip(),
                                    doc_id=doc_id,
                                    source_text=sent.text
                                ))
        
        return facts
    
    def _extract_facts_from_amr(self, amr_data: Dict, doc_id: str) -> List[FactTriplet]:
        """
        Extract facts from AMR parse.
        AMR edges are already (source, relation, target) triplets
        """
        facts = []
        
        if 'edges' in amr_data and amr_data['edges']:
            for edge in amr_data['edges']:
                if len(edge) == 3:
                    source, relation, target = edge
                    facts.append(FactTriplet(
                        subject=str(source),
                        relation=str(relation),
                        obj=str(target),
                        doc_id=doc_id,
                        source_text=amr_data.get('text', '')
                    ))
        
        return facts
    
    def _extract_facts(self, text: str, doc_id: str) -> List[FactTriplet]:
        """
        Extract facts using both spaCy and AMR.
        
        Combines:
        1. spaCy dependency parsing (surface-level facts)
        2. AMR graph edges (deep semantic facts)
        """
        facts = []
        
        # Method 1: spaCy extraction
        spacy_facts = self._extract_facts_spacy(text, doc_id)
        facts.extend(spacy_facts)
        
        # Method 2: AMR extraction (if available)
        if self.use_real_amr and self.amr_reranker.use_real_amr:
            amr_data = self.amr_reranker._parse_to_amr(text)
            amr_facts = self._extract_facts_from_amr(amr_data, doc_id)
            facts.extend(amr_facts)
        
        # Deduplicate
        unique_facts = list(set(facts))
        
        return unique_facts
    
    def _compute_fact_similarity(self, fact1: FactTriplet, fact2: FactTriplet) -> float:
        """
        Compute similarity between two facts.
        
        Uses:
        1. Entity overlap (subject/object matching)
        2. Relation similarity
        3. Embedding similarity
        """
        if not hasattr(fact1, 'embedding'):
            print("Fact1 not pre-encoded, encoding now")
            fact1.embedding = self._encode_text(...)
        
        if not hasattr(fact2, 'embedding'):
            print("Fact2 not pre-encoded, encoding now")
            fact2.embedding = self._encode_text(...)
    
        # Entity overlap
        entities1 = {fact1.subject.lower(), fact1.obj.lower()}
        entities2 = {fact2.subject.lower(), fact2.obj.lower()}
        entity_overlap = len(entities1 & entities2) / len(entities1 | entities2) if entities1 | entities2 else 0
        
        # Relation match
        relation_match = 1.0 if fact1.relation.lower() == fact2.relation.lower() else 0.0
        
        # Embedding similarity
        fact1_text = f"{fact1.subject} {fact1.relation} {fact1.obj}"
        fact2_text = f"{fact2.subject} {fact2.relation} {fact2.obj}"
        
        emb1 = self._encode_text(fact1_text, encoder='fact')
        emb2 = self._encode_text(fact2_text, encoder='fact')
        emb_sim = cosine_similarity([emb1], [emb2])[0][0]
        
        # Weighted combination
        similarity = 0.3 * entity_overlap + 0.2 * relation_match + 0.5 * emb_sim
        
        return similarity

    def _build_fact_graph(self, 
                     all_facts: List[FactTriplet],
                     claim_facts: List[FactTriplet]) -> nx.Graph:
        """
        Build cross-document fact graph.
        
        Nodes: Individual facts from all documents
        Edges: Connect similar/related facts across documents
        """
        G = nx.Graph()
        print(f"Pre-encoding {len(all_facts)} document facts...")
        for fact in all_facts:
            fact_text = f"{fact.subject} {fact.relation} {fact.obj}"
            fact.embedding = self._encode_text(fact_text, encoder='fact')
        
        print(f"Pre-encoding {len(claim_facts)} claim facts...")
        for claim_fact in claim_facts:
            claim_fact_text = f"{claim_fact.subject} {claim_fact.relation} {claim_fact.obj}"
            claim_fact.embedding = self._encode_text(claim_fact_text, encoder='fact')
        
        print(f"All facts encoded, building graph...")
        
        # Add nodes (facts)
        for i, fact in enumerate(all_facts):
            fact_emb = fact.embedding 

            # Compute relevance to claim
            claim_relevance = 0.0
            if claim_facts:
                claim_sims = []
                for claim_fact in claim_facts:
                    sim = self._compute_fact_similarity(fact, claim_fact)
                    claim_sims.append(sim)
                claim_relevance = max(claim_sims) if claim_sims else 0.0
            
            G.add_node(i,
                      fact=fact,
                      embedding=fact_emb,  
                      doc_id=fact.doc_id,
                      claim_relevance=claim_relevance)

        # Add edges (fact-to-fact similarity)
        for i in range(len(all_facts)):
            for j in range(i+1, len(all_facts)):
                fact_i = all_facts[i]
                fact_j = all_facts[j]
                
                # Skip same document
                if fact_i.doc_id == fact_j.doc_id:
                    continue
                
                # Compute similarity 
                similarity = self._compute_fact_similarity(fact_i, fact_j)
                
                # Add edge if similar enough
                if similarity > 0.4:
                    G.add_edge(i, j, weight=similarity)
        
        return G
    
    def _build_multilevel_graph(self,
                               documents: List[Document],
                               claim: str) -> Dict[str, nx.Graph]:
        """
        Build multi-level graph structure (GRAG key innovation).
        
        Returns:
            Dict with 'amr', 'document', 'fact' graphs
        """
        print("Building multi-level graph...")
        
        # Level 1: AMR Graph
        print("Level 1: AMR graph...")
        amr_graph = self.amr_reranker._build_amr_graph(documents, claim)
        
        # Level 2: Document Graph
        print("Level 2: Document graph...")
        doc_graph = nx.Graph()
        claim_emb = self._encode_text(claim, encoder='doc')
        
        doc_embeddings = {}
        for doc in documents:
            doc_emb = self._encode_text(doc.text[:500], encoder='doc')
            doc_embeddings[doc.doc_id] = doc_emb
            
            claim_sim = cosine_similarity([claim_emb], [doc_emb])[0][0]
            doc_graph.add_node(doc.doc_id, embedding=doc_emb, claim_sim=claim_sim)
        
        # Add doc-doc edges
        for i, doc_i in enumerate(documents):
            for doc_j in documents[i+1:]:
                sim = cosine_similarity(
                    [doc_embeddings[doc_i.doc_id]],
                    [doc_embeddings[doc_j.doc_id]]
                )[0][0]
                if sim > 0.3:
                    doc_graph.add_edge(doc_i.doc_id, doc_j.doc_id, weight=sim)
        
        # Level 3: Fact Graph
        print("Level 3: Fact graph (cross-document reasoning)...")
        claim_facts = self._extract_facts(claim, "claim")
        all_facts = []
        for doc in documents:
            facts = self._extract_facts(doc.text[:500], doc.doc_id)
            all_facts.extend(facts)
        
        fact_graph = self._build_fact_graph(all_facts, claim_facts) if all_facts else nx.Graph()
        
        return {
            'amr': amr_graph,
            'document': doc_graph,
            'fact': fact_graph,
            'all_facts': all_facts
        }
    
    def rerank(self,
               claim: str,
               documents: List[Document],
               top_k: int = 3) -> List[RerankedResult]:
        """
        Multi-stage GRAG reranking with cross-document reasoning.
        
        Follows GRAG paper architecture
        """
        if not documents:
            return []
        
        print("  GRAG Multi-Stage Reranking:")
        
        # Build multi-level graph
        graphs = self._build_multilevel_graph(documents, claim)
        amr_graph = graphs['amr']
        doc_graph = graphs['document']
        fact_graph = graphs['fact']
        all_facts = graphs['all_facts']
        
        # Compute centrality scores
        print("    Computing graph centrality...")
        try:
            amr_centrality = nx.pagerank(amr_graph, weight='weight') if amr_graph.number_of_edges() > 0 else {}
            doc_centrality = nx.pagerank(doc_graph, weight='weight') if doc_graph.number_of_edges() > 0 else {}
            fact_centrality = nx.pagerank(fact_graph, weight='weight') if fact_graph.number_of_edges() > 0 else {}
        except:
            amr_centrality = {}
            doc_centrality = {}
            fact_centrality = {}
        
        # Aggregate fact scores per document
        print("    Aggregating scores...")
        fact_scores = defaultdict(float)
        if all_facts and fact_centrality:
            for node_id, cent in fact_centrality.items():
                if node_id in fact_graph.nodes:
                    doc_id = fact_graph.nodes[node_id]['doc_id']
                    claim_rel = fact_graph.nodes[node_id].get('claim_relevance', 0)
                    # Weight by both centrality and claim relevance
                    fact_scores[doc_id] += cent * (1 + claim_rel)
        
        # Normalize BM25 scores
        max_bm25 = max(doc.score for doc in documents)
        min_bm25 = min(doc.score for doc in documents)
        bm25_range = max_bm25 - min_bm25 if max_bm25 != min_bm25 else 1
        
        # Final scoring
        results = []
        for i, doc in enumerate(documents):
            # Get all scores
            norm_bm25 = (doc.score - min_bm25) / bm25_range
            
            amr_score = amr_centrality.get(doc.doc_id, 0) if doc.doc_id in amr_graph.nodes else 0
            if doc.doc_id in amr_graph.nodes:
                amr_score += amr_graph.nodes[doc.doc_id].get('amr_similarity', 0)
            amr_score = min(amr_score, 1.0)
            
            doc_score = doc_centrality.get(doc.doc_id, 0) if doc.doc_id in doc_graph.nodes else 0
            if doc.doc_id in doc_graph.nodes:
                doc_score += doc_graph.nodes[doc.doc_id].get('claim_sim', 0)
            doc_score = min(doc_score, 1.0)
            
            fact_score = fact_scores.get(doc.doc_id, 0)
            # Normalize fact score
            if fact_scores:
                max_fact = max(fact_scores.values())
                fact_score = fact_score / max_fact if max_fact > 0 else 0
            
            # Weighted combination (GRAG paper weights)
            final_score = (
                0.15 * norm_bm25 +      # BM25 baseline
                0.25 * amr_score +      # AMR structural match
                0.30 * doc_score +      # Document semantic + centrality
                0.30 * fact_score       # Fact reasoning + centrality
            )
            
            results.append(RerankedResult(
                doc_id=doc.doc_id,
                original_rank=i+1,
                new_rank=0,
                original_score=doc.score,
                reranked_score=final_score,
                explanation=(
                    f"BM25:{norm_bm25:.2f} AMR:{amr_score:.2f} "
                    f"Doc:{doc_score:.2f} Fact:{fact_score:.2f}"
                )
            ))
        
        # Sort and assign ranks
        results.sort(key=lambda x: x.reranked_score, reverse=True)
        for i, result in enumerate(results):
            result.new_rank = i + 1
        
        return results[:top_k]



In [74]:
# Initialize GRAG reranker
grag_reranker_s3 = GRAGReranker(
    doc_encoder='roberta-base',
    fact_encoder='bert-base-uncased',
    use_real_amr=True,
    device='cpu'
)

# Test configuration
num_test_claims = 25 
test_batch = fever_loader.get_test_batch(start=0, n=num_test_claims)

# Metrics storage
rankings_changed = 0
total_tested = 0
facts_extracted = 0
total_facts = 0
precision_at_3 = []
recall_at_3 = []
average_precisions = []

stage3_improvements = [] 

print(f"\nTesting on {num_test_claims} claims...")

Loading document encoder: roberta-base...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading fact encoder: bert-base-uncased...
Loading AMR parser...
Loading AMR-BERT encoder...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AMR-BERT encoder loaded (RoBERTa-base on cpu)
Loading AMR parser from local model...
Model path: D:\Celestia\Projects\646_Project\src\model\amr\model_parse_xfm_bart_large-v0_1_0
AMR parser loaded successfully!
  Model: STOG (String-to-Graph) - BART Large
Loading spaCy for fact extraction...
GRAG Reranker initialized

Testing on 25 claims...


In [75]:
import time
total_time = 0

for idx, item in enumerate(test_batch):
    claim = item['claim_data']['claim']
    claim_id = str(item['claim_data']['id'])
    bm25_docs = item['bm25_docs']
    
    if len(bm25_docs) < 3:
        continue
    
    print(f"\n{'='*80}")
    print(f"[{idx+1}/{num_test_claims}] Claim: {claim[:70]}...")
    print('='*80)
    
    # Track time
    start_time = time.time()
    
    # Rerank with GRAG
    reranked = grag_reranker_s3.rerank(claim, bm25_docs, top_k=10)
    
    elapsed = time.time() - start_time
    total_time += elapsed
    
    # Compare rankings
    original_top3 = [doc.doc_id for doc in bm25_docs[:3]]
    reranked_top3 = [r.doc_id for r in reranked[:3]]
    
    changed = original_top3 != reranked_top3
    if changed:
        rankings_changed += 1
    
    # Count facts extracted
    try:
        claim_facts = grag_reranker_s3._extract_facts(claim, "claim")
        doc_facts = []
        for doc in bm25_docs[:5]:  # Check first 5 docs
            facts = grag_reranker_s3._extract_facts(doc.text[:500], doc.doc_id)
            doc_facts.extend(facts)
        
        if claim_facts or doc_facts:
            facts_extracted += 1
        total_facts += len(doc_facts)
    except:
        pass
    
    # Compute retrieval metrics
    if fever_loader.qrels and claim_id in fever_loader.qrels:
        relevant_docs = set(fever_loader.qrels[claim_id].keys())
        reranked_ids = [r.doc_id for r in reranked]
        
        # Precision@3
        top3_relevant = len(set(reranked_ids[:3]) & relevant_docs)
        p_at_3 = top3_relevant / 3
        precision_at_3.append(p_at_3)
        
        # Recall@3
        r_at_3 = top3_relevant / len(relevant_docs) if relevant_docs else 0
        recall_at_3.append(r_at_3)
        
        # Average Precision
        num_relevant_seen = 0
        precisions_at_relevant = []
        for i, doc_id in enumerate(reranked_ids, 1):
            if doc_id in relevant_docs:
                num_relevant_seen += 1
                precisions_at_relevant.append(num_relevant_seen / i)
        
        ap = np.mean(precisions_at_relevant) if precisions_at_relevant else 0
        average_precisions.append(ap)
        
        # Track improvement
        original_top3_relevant = len(set(original_top3) & relevant_docs)
        improvement = top3_relevant - original_top3_relevant
        stage3_improvements.append(improvement)
    
    total_tested += 1
    
    # Print results for first 3 examples
    if idx < 3:
        print(f"\n{'─'*70}")
        print(f"Results:")
        print(f"  BM25 top-3:     {original_top3}")
        print(f"  GRAG top-3:     {reranked_top3}")
        print(f"  Changed:        {'✓ YES' if changed else '✗ NO'}")
        
        if fever_loader.qrels and claim_id in fever_loader.qrels:
            print(f"\n  Metrics:")
            print(f"\tPrecision@3: {p_at_3:.3f}")
            print(f"\tRecall@3:    {r_at_3:.3f}")
            print(f"\tAP:          {ap:.3f}")
            print(f"\tImprovement: {'+' if improvement > 0 else ''}{improvement} relevant docs in top-3")
        
        print(f"\n  Facts Extracted:")
        if claim_facts:
            print(f"Claim facts: {len(claim_facts)}")
            for f in claim_facts[:3]:
                print(f"      - {f}")
        if doc_facts:
            print(f"Document facts: {len(doc_facts)} total")
            print(f"(First 3: {doc_facts[:3]})")
        
        print(f"\n  Top-3 Scores & Explanations:")
        for r in reranked[:3]:
            print(f"{r.new_rank}. {r.doc_id}")
            print(f" Score: {r.reranked_score:.4f}")
            print(f"{r.explanation}")
        
        print(f"\n  Processing time: {elapsed:.1f}s")
    else:
        print(f" Processed in {elapsed:.1f}s")


[1/25] Claim: Fox 2000 Pictures released the film Soul Food....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 405 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






──────────────────────────────────────────────────────────────────────
Results:
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  GRAG top-3:     ['Soul_Food_-LRB-film-RRB-', 'Ways_to_Live_Forever_-LRB-film-RRB-', 'Bootmen']
  Changed:        ✓ YES

  Metrics:
	Precision@3: 0.333
	Recall@3:    1.000
	AP:          1.000
	Improvement: 0 relevant docs in top-3

  Facts Extracted:
Claim facts: 5
      - (r, :ARG0, c)
      - (f, :name, n2)
      - (c, :name, n)
Document facts: 206 total
(First 3: [FactTriplet(subject='f', relation=':name', obj='n', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall




  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 405 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






──────────────────────────────────────────────────────────────────────
Results:
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  GRAG top-3:     ['Soul_Food_-LRB-film-RRB-', 'Ways_to_Live_Forever_-LRB-film-RRB-', 'Bootmen']
  Changed:        ✓ YES

  Metrics:
	Precision@3: 0.333
	Recall@3:    1.000
	AP:          1.000
	Improvement: 0 relevant docs in top-3

  Facts Extracted:
Claim facts: 5
      - (r, :ARG0, c)
      - (f, :name, n2)
      - (c, :name, n)
Document facts: 206 total
(First 3: [FactTriplet(subject='f', relation=':name', obj='n', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall




  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 405 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






──────────────────────────────────────────────────────────────────────
Results:
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  GRAG top-3:     ['Soul_Food_-LRB-film-RRB-', 'Ways_to_Live_Forever_-LRB-film-RRB-', 'Bootmen']
  Changed:        ✓ YES

  Metrics:
	Precision@3: 0.333
	Recall@3:    1.000
	AP:          1.000
	Improvement: 0 relevant docs in top-3

  Facts Extracted:
Claim facts: 5
      - (r, :ARG0, c)
      - (f, :name, n2)
      - (c, :name, n)
Document facts: 206 total
(First 3: [FactTriplet(subject='f', relation=':name', obj='n', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall




  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 405 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 4044.4s

[5/25] Claim: Fox 2000 Pictures released the film Soul Food....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 405 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3453.8s

[6/25] Claim: Telemundo is a English-language television network....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 381 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3617.3s

[7/25] Claim: Telemundo is a English-language television network....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 381 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3881.4s

[8/25] Claim: Telemundo is a English-language television network....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 381 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3643.0s

[9/25] Claim: Telemundo is a English-language television network....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 381 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 4074.2s

[10/25] Claim: Telemundo is a English-language television network....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10





  Progress: 7/10






  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 381 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3968.6s

[11/25] Claim: Damon Albarn's debut album was released in 2011....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10






  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 439 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 4566.8s

[12/25] Claim: There is a capital called Mogadishu....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10
  Progress: 7/10






  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 286 document facts...
Pre-encoding 3 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 2027.3s

[13/25] Claim: Savages was exclusively a German film....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...







Pre-encoding 268 document facts...
Pre-encoding 4 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 1931.7s

[14/25] Claim: Savages was exclusively a German film....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...







Pre-encoding 268 document facts...
Pre-encoding 4 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 1898.3s

[15/25] Claim: Andrew Kevin Walker is only Chinese....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10





Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...








Pre-encoding 396 document facts...
Pre-encoding 5 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 3906.4s

[16/25] Claim: The Cretaceous ended....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...









Pre-encoding 353 document facts...
Pre-encoding 2 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 3036.9s

[17/25] Claim: The Cretaceous ended....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...









Pre-encoding 353 document facts...
Pre-encoding 2 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 3075.8s

[18/25] Claim: The Cretaceous ended....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...









Pre-encoding 353 document facts...
Pre-encoding 2 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 2995.4s

[19/25] Claim: The Cretaceous ended....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...









Pre-encoding 353 document facts...
Pre-encoding 2 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 2947.1s

[20/25] Claim: The Cretaceous ended....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10






  Progress: 4/10





  Progress: 7/10
  Progress: 10/10






Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...









Pre-encoding 353 document facts...
Pre-encoding 2 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...







 Processed in 2887.2s

[21/25] Claim: Murda Beatz's real name is Marshall Mathers....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10





  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...







Pre-encoding 380 document facts...
Pre-encoding 4 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...





 Processed in 3325.1s

[22/25] Claim: Nicholas Brody is a character on Homeland....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 370 document facts...
Pre-encoding 3 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 3032.2s

[23/25] Claim: Nicholas Brody is a character on Homeland....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 370 document facts...
Pre-encoding 3 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 4440.2s

[24/25] Claim: Nicholas Brody is a character on Homeland....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 370 document facts...
Pre-encoding 3 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 3114.6s

[25/25] Claim: Nicholas Brody is a character on Homeland....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10
  Progress: 4/10






  Progress: 7/10
  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






Pre-encoding 370 document facts...
Pre-encoding 3 claim facts...
All facts encoded, building graph...
    Computing graph centrality...
    Aggregating scores...






 Processed in 2893.0s


In [None]:
# ## Left for reference 

# Results:
#   BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
#   GRAG top-3:     ['Soul_Food_-LRB-film-RRB-', 'Ways_to_Live_Forever_-LRB-film-RRB-', 'Bootmen']
#   Changed:        ✓ YES

#   Metrics:
# 	Precision@3: 0.333
# 	Recall@3:    1.000
# 	AP:          1.000
# 	Improvement: 0 relevant docs in top-3

#   Facts Extracted:
# Claim facts: 5
#       - (r, :ARG0, c)
#       - (f, :name, n2)
#       - (c, :name, n)
# Document facts: 206 total
# (First 3: [FactTriplet(subject='f', relation=':name', obj='n', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall , Gina Ravera and Brandon Hammond . Written and directed by George Tillman , Jr. -- in his major studio debut -- the film centers on the trials of an extended African-American family ,"), FactTriplet(subject='f2', relation=':mod', obj='c2', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall , Gina Ravera and Brandon Hammond . Written and directed by George Tillman , Jr. -- in his major studio debut -- the film centers on the trials of an extended African-American family ,"), FactTriplet(subject='p12', relation=':name', obj='n14', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall , Gina Ravera and Brandon Hammond . Written and directed by George Tillman , Jr. -- in his major studio debut -- the film centers on the trials of an extended African-American family ,")])

#   Top-3 Scores & Explanations:
# 1. Soul_Food_-LRB-film-RRB-
#  Score: 0.9370
# BM25:1.00 AMR:0.75 Doc:1.00 Fact:1.00
# 2. Ways_to_Live_Forever_-LRB-film-RRB-
#  Score: 0.7679
# BM25:0.00 AMR:0.74 Doc:1.00 Fact:0.95
# 3. Bootmen
#  Score: 0.7513
# BM25:0.03 AMR:0.74 Doc:1.00 Fact:0.87

#   Processing time: 14709.2s



[1/25] Claim: Fox 2000 Pictures released the film Soul Food....
  GRAG Multi-Stage Reranking:
Building multi-level graph...
Level 1: AMR graph...
  Parsing claim to AMR...
  Parsing 10 documents...
  Progress: 1/10





  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






    Computing graph centrality...
    Aggregating scores...






──────────────────────────────────────────────────────────────────────
Results:
  BM25 top-3:     ['Soul_Food_-LRB-film-RRB-', 'List_of_20th_Century_Fox_films_-LRB-1935–99-RRB-', 'Maxine_Chadway']
  GRAG top-3:     ['Soul_Food_-LRB-film-RRB-', 'Ways_to_Live_Forever_-LRB-film-RRB-', 'Bootmen']
  Changed:        ✓ YES

  Metrics:
	Precision@3: 0.333
	Recall@3:    1.000
	AP:          1.000
	Improvement: 0 relevant docs in top-3

  Facts Extracted:
Claim facts: 5
      - (r, :ARG0, c)
      - (f, :name, n2)
      - (c, :name, n)
Document facts: 206 total
(First 3: [FactTriplet(subject='f', relation=':name', obj='n', doc_id='Soul_Food_-LRB-film-RRB-', confidence=1.0, source_text="Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures . Featuring an ensemble cast , the film stars Vanessa L. Williams , Vivica A. Fox , Nia Long , Michael Beach , Mekhi Phifer , Jeffrey D. Sams , Irma P. Hall




  Progress: 4/10
  Progress: 7/10





  Progress: 10/10
Level 2: Document graph...
Level 3: Fact graph (cross-document reasoning)...






In [78]:
print(f"\nBasic Statistics:")
print(f"  Claims tested: {total_tested}")
print(f"  Rankings changed: {rankings_changed} ({rankings_changed/total_tested*100:.1f}%)")
print(f"  Facts extracted: {facts_extracted}/{total_tested} claims ({facts_extracted/total_tested*100:.1f}%)")
print(f"  Total facts: {total_facts} (avg: {total_facts/total_tested:.1f} per claim)")
print(f"  Total time: {total_time:.1f}s")
print(f"  Avg time per claim: {total_time/total_tested:.1f}s")


Basic Statistics:
  Claims tested: 25
  Rankings changed: 25 (100.0%)
  Facts extracted: 25/25 claims (100.0%)
  Total facts: 4445 (avg: 177.8 per claim)
  Total time: 83975.5s
  Avg time per claim: 3359.0s


In [77]:
if precision_at_3:
    print(f"\nRetrieval Metrics:")
    print(f"  Precision@3: {np.mean(precision_at_3):.3f} ± {np.std(precision_at_3):.3f}")
    print(f"  Recall@3: {np.mean(recall_at_3):.3f} ± {np.std(recall_at_3):.3f}")
    print(f"  MAP: {np.mean(average_precisions):.3f} ± {np.std(average_precisions):.3f}")
    
    if stage3_improvements:
        avg_improvement = np.mean(stage3_improvements)
        print(f"\nImprovement over BM25:")
        print(f"  Avg change in relevant docs (top-3): {avg_improvement:+.2f}")
        improved = sum(1 for x in stage3_improvements if x > 0)
        unchanged = sum(1 for x in stage3_improvements if x == 0)
        degraded = sum(1 for x in stage3_improvements if x < 0)
        print(f"  Improved:   {improved}/{len(stage3_improvements)} ({improved/len(stage3_improvements)*100:.1f}%)")
        print(f"  Unchanged:  {unchanged}/{len(stage3_improvements)} ({unchanged/len(stage3_improvements)*100:.1f}%)")
        print(f"  Degraded:   {degraded}/{len(stage3_improvements)} ({degraded/len(stage3_improvements)*100:.1f}%)")


Retrieval Metrics:
  Precision@3: 0.200 ± 0.163
  Recall@3: 0.520 ± 0.458
  MAP: 0.479 ± 0.431

Improvement over BM25:
  Avg change in relevant docs (top-3): +0.04
  Improved:   1/25 (4.0%)
  Unchanged:  24/25 (96.0%)
  Degraded:   0/25 (0.0%)


# Stage 4: GRAG + Custom Weight Functions

**Extension**: Plugin architecture for custom knowledge graph weighting.

**Weight Functions Defined**:
1. **Temporal weighting**: Prioritize recent facts
2. **Coherence**: Reward mutually supporting facts and penalizes contradictory facts
3. **Source reliability**: If metadata available, weight by source

In [None]:
class TemporalWeightMetric:
    """Relative recency within document set.
    
    Normalizes dates relative to the retrieved documents:
    - Newest doc in set: 1.5x
    - Oldest doc in set: 0.7x
    - Linear interpolation between."""
    
    

In [None]:
class ConsensusWeightMetric:
    """Consensus detection with contradiction penalty.
    
    - Facts supported by many similar facts: boosted
    - Facts that contradict neighbours: penalised (but not removed)
    - Isolated facts: neutral
    - Threshold to define similarity between the encodings
    """

In [None]:
# Based on the source of the document
## Only possible if we go with duckduckgo api extension
### Will not work on further beyond the following framework as that extension is not added yet

class SourceCredibilityWeight:
    """
    Rules on what makes a website credible (stanford) + LLM ambiguity
    
    Combines:
    - Fast rule-based filtering (whitelist/blacklist)
    - LLM evaluation for ambiguous cases
    - Confidence weighting between the two
    
    Use case: Best of both worlds - fast + nuanced
    """

In [None]:
class GRAGWithWeights(GRAGReranker):
   

In [2]:
# Initialise weights


# Final evaluation 
Testing successful approaches with the generation aspect of QwenB as well

1 : basic_reranker_s1
2 : amr_reranker_s2
3 : grag_reranker_s3

In [None]:
# Full pipeline with Qwen-8B
from src.model_clients import LlamaCppClient
from src.ragar_corag import RagarCorag

# Setup Qwen-8B
mc = LlamaCppClient(user_prompts_dir, sys_prompts_dir)

# Setup CoRAG with reranker
corag = RagarCorag(
    mc=mc,
    use_reranker=True,
    reranker_stage='grag'
)

# Run full pipeline
predictions = []
labels = []

for item in test_batch:
    claim = item['claim_data']['claim']
    true_label = item['claim_data']['label']
    
    # Full pipeline: BM25 → Rerank → Qwen-8B → Verdict
    result = corag.run(claim)
    verdict = result['verdict']  # 0=REFUTES, 1=SUPPORTS, 2=NEI
    
    fever_labels = ["REFUTES", "SUPPORTS", "NOT ENOUGH INFO"]
    pred_label = fever_labels[verdict] if verdict is not None else "NEI"
    
    predictions.append(pred_label)
    labels.append(true_label)

# Compute classification metrics
accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(labels)
print(f"End-to-end Accuracy: {accuracy:.3f}")