# In [1]:
# This is the final code used in the paper.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import pipeline, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import re
import json
import time
from typing import List, Dict, Tuple, Optional, Union
from dataclasses import dataclass
from collections import defaultdict
import logging
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Configure logging: set up the root logger at INFO level (a module-level
# side effect that happens on import) and create this module's named logger,
# which the classes below use for progress and warning messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class EpistemicScore:
    """Structured representation of epistemic scoring components.

    Instances are built by ``EnhancedEpistemicFilter.compute_epistemic_score``.
    """
    # Highest cosine similarity between the text and any trusted statement.
    similarity: float
    # 1.0 minus the highest cosine similarity to any known-false statement.
    anti_similarity: float
    # Product of the contradiction, semantic and fallacy sub-scores.
    logical_coherence: float
    # Semantic-consistency sub-score on its own.
    semantic_consistency: float
    # Grammatical-coherence heuristic score.
    structural_validity: float
    # Confidence in the final score, derived from component agreement.
    confidence: float
    # Weighted sum of the component scores.
    final_score: float
    # All named component values that fed into ``final_score``.
    components: Dict[str, float]

class EpistemicKnowledgeBase:
    """Enhanced knowledge base with hierarchical organization."""

    def __init__(self):
        """Build the trusted and false statement collections.

        ``knowledge_graph`` and ``false_knowledge`` are nested mappings whose
        leaves are lists of statements. Both are flattened into
        ``trusted_statements`` and ``false_statements`` respectively so they
        can be embedded as plain lists of strings.
        """
        # Trusted statements, grouped by domain and (for most domains) subtopic.
        self.knowledge_graph = {
            'physics': {
                'fundamental': [
                    "The Earth revolves around the Sun in an elliptical orbit.",
                    "Light travels at approximately 299,792,458 meters per second in vacuum.",
                    "Gravity is a fundamental force that attracts objects with mass.",
                    "Energy cannot be created or destroyed, only transformed.",
                    "The speed of light is constant in vacuum regardless of observer motion.",
                    "Time and space are interconnected in spacetime.",
                    "Mass and energy are equivalent according to E=mc²."
                ],
                'thermodynamics': [
                    "Water boils at 100°C (212°F) at standard atmospheric pressure.",
                    "Water freezes at 0°C (32°F) at standard atmospheric pressure.",
                    "Heat flows from hot objects to cold objects.",
                    "Entropy of an isolated system always increases.",
                    "Absolute zero is -273.15°C or 0 Kelvin."
                ],
                'mechanics': [
                    "Objects in motion tend to stay in motion unless acted upon by force.",
                    "For every action, there is an equal and opposite reaction.",
                    "Force equals mass times acceleration (F = ma)."
                ]
            },
            'chemistry': {
                'molecular': [
                    "Water is composed of two hydrogen atoms and one oxygen atom (H₂O).",
                    "Oxygen is necessary for combustion reactions.",
                    "Carbon dioxide is produced during cellular respiration.",
                    "The periodic table organizes elements by atomic number.",
                    "Chemical reactions involve the breaking and forming of bonds."
                ],
                'states': [
                    "Matter exists in solid, liquid, gas, and plasma states.",
                    "Phase changes occur at specific temperatures and pressures.",
                    "Sublimation is the transition from solid directly to gas."
                ]
            },
            'biology': {
                'genetics': [
                    "DNA contains genetic information in the form of nucleotide sequences.",
                    "Genes are passed from parents to offspring.",
                    "Mutations can occur in DNA during replication.",
                    "RNA serves as a messenger between DNA and proteins."
                ],
                'physiology': [
                    "The human heart pumps blood through the circulatory system.",
                    "Plants produce oxygen through photosynthesis.",
                    "Cellular respiration converts glucose and oxygen into ATP.",
                    "The nervous system transmits electrical signals."
                ],
                'evolution': [
                    "Species evolve through natural selection.",
                    "Organisms adapt to their environment over time.",
                    "Common ancestry explains similarities between species."
                ]
            },
            'mathematics': {
                'arithmetic': [
                    "2 + 2 = 4", "1 + 1 = 2", "10 - 5 = 5", "3 × 4 = 12", "8 ÷ 2 = 4"
                ],
                'geometry': [
                    "A circle has 360 degrees.",
                    "The sum of angles in a triangle is 180 degrees.",
                    "The Pythagorean theorem relates the sides of right triangles.",
                    "Parallel lines never intersect."
                ],
                'logic': [
                    "Prime numbers are divisible only by 1 and themselves.",
                    "The square root of 4 is 2.",
                    "Zero is neither positive nor negative."
                ]
            },
            'geography': {
                'capitals': [
                    "Paris is the capital of France.",
                    "London is the capital of the United Kingdom.",
                    "Tokyo is the capital of Japan.",
                    "Washington D.C. is the capital of the United States."
                ],
                'physical': [
                    "The Earth is approximately spherical.", # Added for similarity search
                    "Mountains are elevated landforms.",
                    "Oceans contain salt water.",
                    "Rivers flow from higher to lower elevations."
                ]
            },
            'astronomy': [
                "The Moon is Earth's natural satellite.",
                "The Sun is a star at the center of our solar system.",
                "Earth completes one rotation every 24 hours.",
                "The Moon causes tides through gravitational pull.",
                "Stars are distant suns that produce their own light."
            ],
            'technology': [
                "Python is a high-level programming language.",
                "Computers process information using binary code.",
                "The Internet connects computers worldwide.",
                "Artificial intelligence involves machine learning algorithms."
            ]
        }

        # Known false statements with categories - Expanded and made more challenging
        self.false_knowledge = {
            'physics_false': [
                "The Sun revolves around the Earth.",
                "Objects fall at different rates in vacuum based on weight.",
                "Light travels instantly across any distance.",
                "Energy can be created from nothing.",
                "Gravity pushes objects apart.", # More specific false statement
                "The Earth is flat." # Common misconception
            ],
            'chemistry_false': [
                "Water boils at -10°C at standard pressure.",
                "Fire can burn without oxygen.",
                "Ice is hotter than liquid water.",
                "Water is composed only of hydrogen.",
                "Metals are poor conductors of electricity.",
                "Rusting is a physical change." # Plausible but incorrect
            ],
            'biology_false': [
                "Humans can breathe underwater without any equipment.",
                "Plants survive without sunlight or artificial light.",
                "All birds can fly including penguins in air.",
                "DNA is made of proteins only.",
                "The human brain uses 100% of its capacity at all times.", # Common false belief
                "Vaccines cause autism." # Misinformation example
            ],
            'astronomy_false': [
                "The Moon is made of cheese.",
                "Stars are holes in the sky.",
                "The Earth is flat.", # Redundant but kept for emphasis in different categories
                "Gravity doesn't exist in space.",
                "Mars is closer to the Sun than Venus.", # Specific false fact
                "The Earth is the center of the universe." # Historical false belief
            ],
            'mathematics_false': [
                "2 + 2 = 5",
                "1 + 1 = 3",
                "0 divided by 0 equals 1",
                "All numbers are positive.",
                "Pi equals exactly 3.0.",
                "The square root of 16 is 5.", # Specific false math
                "Every even number is also an odd number." # Logically false
            ],
            'nonsensical': [
                "Purple dreams sing mathematics loudly.",
                "The colorless green ideas sleep furiously.",
                "Tuesday tastes like seventeen elephants.",
                "Silence has a bright blue temperature.",
                "Mathematics can be eaten with a spoon.",
                "The concept of time wears a hat made of rainbows and whispers secrets to the wind.", # More complex nonsensical
                "Invisible butterflies compose symphonies with their thoughts in a vacuum chamber."
            ]
        }

        # Flatten knowledge for embedding. Duplicates are not removed, so
        # "The Earth is flat." appears twice in false_statements.
        self.trusted_statements = self._flatten_knowledge(self.knowledge_graph)
        self.false_statements = self._flatten_knowledge(self.false_knowledge)

    def _flatten_knowledge(self, knowledge_dict: Dict) -> List[str]:
        """Recursively flatten nested knowledge dictionary."""
        statements = []
        for key, value in knowledge_dict.items():
            if isinstance(value, dict):
                statements.extend(self._flatten_knowledge(value))
            elif isinstance(value, list):
                statements.extend(value)
        return statements

class LogicalCoherenceAnalyzer:
    """Advanced logical coherence analysis using multiple approaches."""

    def __init__(self):
        self.contradiction_patterns = [
            # Temperature contradictions
            (r'(?:ice|frozen|cold)', r'(?:hot|boiling|warm)'),
            (r'(?:fire|flame|burning)', r'(?:cold|freezing|frozen)'),

            # Physical impossibilities
            (r'(?:elephant|whale|heavy)', r'(?:fly|flying|soar).*(?:naturally|without)'),
            (r'(?:breathe|breathing)', r'(?:underwater|submerged).*(?:without|no)'),
            (r'(?:travels?|moves?).*instantly', r'(?:across|any).*distance'), # Light travels instantly across any distance.

            # Mathematical contradictions
            (r'(?:2\s*\+\s*2|two\s+plus\s+two)', r'(?:5|five|6|six)'),
            (r'(?:1\s*\+\s*1|one\s+plus\s+one)', r'(?:3|three|4|four)'),
            (r'(?:square\s+root\s+of\s+16)', r'(?:5|five|6|six)'), # Specific math contradiction
            (r'(?:pi|π)', r'(?:equals?|is).*exactly\s+3(?:\.0)?'), # Pi exactly 3

            # Astronomical contradictions
            (r'(?:sun|solar)', r'(?:revolves?|orbits?).*(?:earth|planet)'),
            (r'(?:earth|planet)', r'(?:flat|disc|plane)'),
            (r'(?:moon|lunar).*made.*cheese', r''), # Moon made of cheese

            # Chemical contradictions
            (r'(?:water|H2O)', r'(?:boils?|boiling).*(?:-|\bminus\b|\bnegative\b)'),
            (r'(?:fire|combustion)', r'(?:without|no).*(?:oxygen|air)'),
        ]

        self.semantic_categories = {
            'temporal': ['yesterday', 'today', 'tomorrow', 'always', 'never'],
            'spatial': ['here', 'there', 'up', 'down', 'left', 'right'],
            'causal': ['because', 'therefore', 'thus', 'hence', 'due to'],
            'modal': ['can', 'cannot', 'must', 'might', 'should', 'would'],
            'quantitative': ['all', 'some', 'none', 'many', 'few', 'most']
        }

    def analyze_logical_structure(self, text: str) -> Dict[str, float]:
        """Analyze logical structure of text."""
        text_lower = text.lower()

        # Check for contradictions
        contradiction_score = self._check_contradictions(text_lower)

        # Check semantic consistency
        semantic_score = self._check_semantic_consistency(text_lower)

        # Check argument structure
        argument_score = self._check_argument_structure(text)

        # Check for logical fallacies
        fallacy_score = self._check_logical_fallacies(text_lower)

        return {
            'contradiction': contradiction_score,
            'semantic': semantic_score,
            'argument': argument_score,
            'fallacy': fallacy_score
        }

    def _check_contradictions(self, text: str) -> float:
        """Check for explicit contradictions. Returns a lower score for more contradictions."""
        contradiction_count = 0
        for pattern1, pattern2 in self.contradiction_patterns:
            if re.search(pattern1, text) and (pattern2 == '' or re.search(pattern2, text)):
                contradiction_count += 1

        # A higher contradiction count should result in a lower score
        # Using an exponential decay to strongly penalize contradictions
        return np.exp(-contradiction_count * 0.7) # Adjust multiplier for sensitivity

    def _check_semantic_consistency(self, text: str) -> float:
        """Check semantic consistency within categories. Lower score for more inconsistencies."""
        inconsistencies = 0

        # Check for conflicting temporal markers (e.g., "always" and "never")
        temporal_words_found = [word for word in self.semantic_categories['temporal'] if word in text]
        if 'always' in temporal_words_found and 'never' in temporal_words_found:
            inconsistencies += 1
        elif 'always' in temporal_words_found and ('sometimes' in temporal_words_found or 'rarely' in temporal_words_found):
            inconsistencies += 0.5 # Partial inconsistency

        # Check for conflicting modal statements (e.g., "can" and "cannot")
        modal_words_found = [word for word in self.semantic_categories['modal'] if word in text]
        if 'can' in modal_words_found and 'cannot' in modal_words_found:
            inconsistencies += 1

        # Check for conflicting quantitative statements (e.g., "all" and "none")
        quantitative_words_found = [word for word in self.semantic_categories['quantitative'] if word in text]
        if 'all' in quantitative_words_found and 'none' in quantitative_words_found:
            inconsistencies += 1

        # A higher inconsistency count should result in a lower score
        return np.exp(-inconsistencies * 0.5) # Adjust multiplier for sensitivity

    def _check_argument_structure(self, text: str) -> float:
        """Check for basic argument structure."""
        has_premise = any(word in text.lower() for word in ['because', 'since', 'given that', 'as a result of'])
        has_conclusion = any(word in text.lower() for word in ['therefore', 'thus', 'hence', 'consequently'])
        # A statement should generally have more than a few words to be considered structured
        has_sufficient_length = len(text.split()) > 5

        structure_score = 0.2  # Base score
        if has_premise: structure_score += 0.3
        if has_conclusion: structure_score += 0.3
        if has_sufficient_length: structure_score += 0.2

        return min(structure_score, 1.0)

    def _check_logical_fallacies(self, text: str) -> float:
        """Check for common logical fallacies. Returns lower score if fallacies are detected."""
        fallacy_patterns = [
            r'all .* are .* and .* are not .*', # Contradiction in universal statement
            r'if .* then .*(?:and not|but not) .*', # Affirming the consequent / Denying the antecedent indicators
            r'.* because .*(?:is true|is false)', # Circular reasoning indicators
            r'(?:everyone|no one|nobody).*believes', # Appeal to popularity/bandwagon
            r'(?:expert|authority).*says.*therefore.*it is true', # Appeal to authority (simplified)
            r'either .* or .*(?:not both)', # False dilemma
            r'(?:first|then).*caused.*', # Post hoc ergo propter hoc
        ]

        fallacy_count = 0
        text_lower = text.lower()
        for pattern in fallacy_patterns:
            if re.search(pattern, text_lower):
                fallacy_count += 1

        # Penalize more for more detected fallacies
        return max(0.0, 1.0 - fallacy_count * 0.25)

class EnhancedEpistemicFilter:
    """Enhanced epistemic filtering system with neural components."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Create the filter and load everything it needs.

        Args:
            model_name: Name passed to ``SentenceTransformer`` for the
                sentence embedder.
        """
        cuda_ready = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_ready else 'cpu')
        logger.info(f"Device set to use {self.device}")

        # Pure-Python helpers first, then the sentence embedder.
        self.knowledge_base = EpistemicKnowledgeBase()
        self.logic_analyzer = LogicalCoherenceAnalyzer()
        self.embedder = SentenceTransformer(model_name)

        # Classification pipelines (each falls back to None on load failure).
        self._initialize_models()

        # Embeddings of the trusted and false statements, computed once.
        self._precompute_embeddings()

        # Neural scorer over the 7 score components. It is only constructed
        # here; nothing in this method trains or calls it.
        self.neural_scorer = self._build_neural_scorer(input_dim=7)

    def _initialize_models(self):
        """Initialize various NLP models.

        Sets ``self.fact_checker`` to a zero-shot-classification pipeline and
        ``self.sentiment_analyzer`` to a sentiment-analysis pipeline. Each
        load is wrapped in its own ``try`` block: on any exception a warning
        is logged and the corresponding attribute is set to ``None``, so one
        failing model does not prevent the other from loading.
        """
        try:
            # device=0 selects the first GPU, -1 selects the CPU.
            self.fact_checker = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("✅ Loaded BART-MNLI for zero-shot classification")
        except Exception as e:
            logger.warning(f"❌ Could not load BART-MNLI: {e}. Fallback to pattern matching.")
            self.fact_checker = None

        try:
            # Suppress specific transformers warnings about unused weights if they are expected
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.sentiment_analyzer = pipeline(
                    "sentiment-analysis",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                    device=0 if torch.cuda.is_available() else -1
                )
            logger.info("✅ Loaded sentiment analyzer")
        except Exception as e:
            logger.warning(f"❌ Could not load sentiment analyzer: {e}. Fallback to default sentiment confidence.")
            self.sentiment_analyzer = None

    def _precompute_embeddings(self):
        """Embed the knowledge base once and cache the tensors.

        Stores the results on ``self.trusted_embeddings`` and
        ``self.false_embeddings`` and logs how many statements were encoded.
        """
        logger.info("Precomputing embeddings...")

        kb = self.knowledge_base
        encode_options = {'convert_to_tensor': True, 'device': self.device}

        self.trusted_embeddings = self.embedder.encode(kb.trusted_statements, **encode_options)
        self.false_embeddings = self.embedder.encode(kb.false_statements, **encode_options)

        logger.info(f"Precomputed {len(self.knowledge_base.trusted_statements)} trusted embeddings")
        logger.info(f"Precomputed {len(self.knowledge_base.false_statements)} false embeddings")

    def _build_neural_scorer(self, input_dim: int) -> nn.Module: # input_dim now correctly reflects number of components
        """Build neural network for epistemic scoring. For POC, acts as a fixed combination layer."""
        class EpistemicScorer(nn.Module):
            def __init__(self, input_dim: int = 7, hidden_dim: int = 256): # Adjusted default input_dim
                super().__init__()
                self.layers = nn.Sequential(
                    nn.Linear(input_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(0.1),
                    nn.Linear(hidden_dim, hidden_dim // 2),
                    nn.ReLU(),
                    nn.Dropout(0.1),
                    nn.Linear(hidden_dim // 2, 1),
                    nn.Sigmoid()
                )

            def forward(self, x):
                return self.layers(x)

        return EpistemicScorer(input_dim=input_dim).to(self.device)

    def compute_similarity_scores(self, text: str) -> Dict[str, float]:
        """Compare ``text`` with the trusted and false statement embeddings.

        Returns:
            Dict with ``'max_trusted'`` (highest cosine similarity to a
            trusted statement), ``'max_false'`` (highest cosine similarity to
            a false statement) and ``'anti_similarity'``
            (``1.0 - max_false``; higher means less like the false set).
        """
        query = self.embedder.encode(text, convert_to_tensor=True, device=self.device)

        def peak_similarity(reference_embeddings) -> float:
            # Best cosine match between the query and a set of embeddings.
            return float(util.pytorch_cos_sim(query, reference_embeddings).max())

        best_trusted = peak_similarity(self.trusted_embeddings)
        best_false = peak_similarity(self.false_embeddings)

        scores = {'max_trusted': best_trusted, 'max_false': best_false}
        scores['anti_similarity'] = 1.0 - best_false
        return scores

    def analyze_linguistic_features(self, text: str) -> Dict[str, float]:
        """Analyze linguistic features of text."""
        words = text.split()

        # Basic linguistic features
        features = {
            'length': min(len(words) / 20.0, 1.0), # Normalize length
            'complexity': len(set(words)) / max(len(words), 1), # Lexical diversity
            'coherence': self._check_grammatical_coherence(text),
            'specificity': self._check_specificity(text),
            'confidence_markers': self._check_confidence_markers(text)
        }

        # Sentiment analysis (using confidence as a feature)
        if self.sentiment_analyzer:
            try:
                sentiment = self.sentiment_analyzer(text)[0]
                # Positive sentiment confidence can imply factual statements often have neutral/positive framing
                # Negative sentiment confidence might indicate strong opinions/bias
                features['sentiment_confidence'] = sentiment['score'] if sentiment['label'] == 'LABEL_2' else (1 - sentiment['score'])
            except Exception as e:
                logger.warning(f"Sentiment analysis failed for '{text[:50]}...': {e}")
                features['sentiment_confidence'] = 0.5 # Default if error
        else:
            features['sentiment_confidence'] = 0.5 # Default if no analyzer

        return features

    def _check_grammatical_coherence(self, text: str) -> float:
        """Enhanced grammatical coherence check. Penalizes lack of common grammatical structures."""
        if len(text.strip()) < 3:
            return 0.0

        words = text.split()

        # Check for presence of common parts of speech (simplified)
        has_verb = any(len(re.findall(r'\b(?:is|are|was|were|have|has|do|does|can|will|should|must)\b', word.lower())) > 0 for word in words)
        # Fix: Removed 'any()' wrapper, as the expression already returns a boolean. Added 'w and' for safety.
        has_noun = (len(re.findall(r'\b(?:the|a|an)\s+[a-z]+', text.lower())) > 0 or \
                    len([w for w in words if w and w[0].isupper() and w.lower() not in ['i', 'a']]) > 0)
        starts_with_capital = text[0].isupper() if text else False
        proper_punctuation = text.endswith('.') or text.endswith('!') or text.endswith('?')

        coherence_score = 0.1
        if has_verb: coherence_score += 0.3
        if has_noun: coherence_score += 0.2
        if starts_with_capital: coherence_score += 0.2
        if proper_punctuation: coherence_score += 0.1
        if len(words) >= 4: coherence_score += 0.1 # Bonus for reasonable length

        return min(coherence_score, 1.0)

    def _check_specificity(self, text: str) -> float:
        """Check specificity of claims."""
        specific_markers = [
            r'\d+(?:\.\d+)?',  # Numbers
            r'(?:approximately|exactly|about|around)',  # Precision markers
            r'(?:celsius|fahrenheit|kelvin|meters|seconds|years|miles|kilometers|percent)',  # Units
            r'(?:according to|research shows|studies indicate|data suggests|evidence points to)',  # Source markers
        ]

        specificity_score = 0.0
        for pattern in specific_markers:
            if re.search(pattern, text.lower()):
                specificity_score += 0.25 # Each marker adds to specificity

        return min(specificity_score, 1.0)

    def _check_confidence_markers(self, text: str) -> float:
        """Check for confidence/uncertainty markers. Higher score for explicit confidence, lower for uncertainty."""
        high_confidence = ['definitely', 'certainly', 'absolutely', 'always', 'never', 'proven', 'fact']
        low_confidence = ['might', 'maybe', 'perhaps', 'possibly', 'probably', 'it is said', 'some believe']

        text_lower = text.lower()

        high_conf_count = sum(1 for marker in high_confidence if marker in text_lower)
        low_conf_count = sum(1 for marker in low_confidence if marker in text_lower)

        # Normalize score between 0 and 1
        net_confidence = high_conf_count - low_conf_count
        # Map net_confidence to a 0-1 scale. Max possible high_conf is 6, max low_conf is 6. Range -6 to 6.
        return (net_confidence + len(high_confidence)) / (len(high_confidence) + len(low_confidence)) # Scale to 0-1

    def compute_epistemic_score(self, text: str) -> EpistemicScore:
        """Compute the comprehensive epistemic score for ``text``.

        Combines embedding similarity, linguistic features and rule-based
        logical analysis into seven named components, then forms a weighted
        sum as the final score and derives a confidence from how much the
        components agree.

        Args:
            text: The statement to score.

        Returns:
            An ``EpistemicScore`` holding the individual components, the
            weighted ``final_score`` and the agreement-based ``confidence``.
        """
        # Component scores
        similarity_scores = self.compute_similarity_scores(text)
        linguistic_features = self.analyze_linguistic_features(text)
        logical_analysis = self.logic_analyzer.analyze_logical_structure(text)

        # NOTE: the zero-shot logic score (self._compute_logic_score) used to
        # be computed here but was never used in any component or in the
        # final score. The call is dropped to avoid a full model inference
        # whose result was discarded; the method itself remains available.

        # Component weights (adjusted based on empirical correlation from previous run)
        # Increased similarity/anti-similarity, reduced negatively correlated logic/semantic components
        weights = {
            'similarity': 0.35, # Increased due to high positive correlation
            'anti_similarity': 0.25, # Increased due to positive correlation
            'logical_coherence': 0.05, # Reduced due to negative correlation
            'semantic_consistency': 0.05, # Reduced due to negative correlation
            'structural_validity': 0.10, # Kept same, moderate positive correlation
            'linguistic_quality': 0.10, # Keep as is (average of linguistic features)
            'confidence': 0.10 # Keep as is (from confidence markers)
        }

        # Compute average for linguistic_quality from its sub-features
        avg_linguistic_quality = np.mean([
            linguistic_features['length'],
            linguistic_features['complexity'],
            linguistic_features['coherence'],
            linguistic_features['specificity'],
            linguistic_features['sentiment_confidence'] # Include sentiment confidence here
        ])

        raw_components = {
            'similarity': similarity_scores['max_trusted'],
            'anti_similarity': similarity_scores['anti_similarity'],
            'logical_coherence': logical_analysis['contradiction'] * logical_analysis['semantic'] * logical_analysis['fallacy'], # Combined logic aspects
            'semantic_consistency': logical_analysis['semantic'], # Retain individual semantic consistency
            'structural_validity': linguistic_features['coherence'],
            'linguistic_quality': avg_linguistic_quality,
            'confidence': linguistic_features['confidence_markers']
        }

        # Coerce every component to a plain float and replace NaN with 0.0.
        # The previous loop tested np.isnan on the stale pre-replacement
        # value and left NumPy scalars in place despite its comment.
        components = {}
        for name, value in raw_components.items():
            value = float(value)
            components[name] = 0.0 if np.isnan(value) else value

        # Calculate final score using the weighted sum
        final_score = sum(weights[name] * components[name] for name in weights)

        # Confidence estimation based on component agreement
        confidence = self._compute_confidence(components)

        return EpistemicScore(
            similarity=components['similarity'],
            anti_similarity=components['anti_similarity'],
            logical_coherence=components['logical_coherence'],
            semantic_consistency=components['semantic_consistency'],
            structural_validity=components['structural_validity'],
            confidence=confidence,
            final_score=final_score,
            components=components
        )

    def _compute_logic_score(self, text: str) -> float:
        """Compute logical score using multiple approaches."""
        if not self.fact_checker:
            return self._fallback_logic_score(text)

        try:
            candidate_labels = [
                "factually accurate statement",
                "factually incorrect statement",
                "nonsensical statement",
                "ambiguous statement"
            ]

            result = self.fact_checker(text, candidate_labels)
            label = result['labels'][0]
            score = result['scores'][0]

            if label == 'factually accurate statement':
                return score
            elif label == 'factually incorrect statement':
                return 1.0 - score # Invert score for 'incorrect' to represent low truth-likelihood
            elif label == 'nonsensical statement':
                return 0.05 # Very low score for nonsensical
            else:  # ambiguous
                return 0.5 # Neutral score for ambiguous
        except Exception as e:
            logger.warning(f"Logic scoring error with BART-MNLI: {e}. Falling back to pattern matching.")
            return self._fallback_logic_score(text)

    def _fallback_logic_score(self, text: str) -> float:
        """Fallback logic scoring using pattern matching."""
        text_lower = text.lower()

        # More comprehensive positive patterns (from KB)
        positive_patterns = [
            r'earth revolves around the sun',
            r'light travels at approximately 299,792,458 meters per second',
            r'water boils at 100(?:°c| degrees celsius)',
            r'gravity is a fundamental force',
            r'energy cannot be created or destroyed',
            r'dna contains genetic information',
            r'plants produce oxygen through photosynthesis',
            r'human heart pumps blood through the circulatory system',
            r'cellular respiration converts glucose and oxygen into atp',
            r'2 \+ 2 = 4',
            r'square root of 16 is 4',
            r'prime numbers are divisible only by 1 and themselves',
            r'sum of angles in a triangle is 180 degrees',
            r'earth is approximately spherical',
        ]

        # More comprehensive negative patterns (from false_knowledge)
        negative_patterns = [
            r'sun revolves around the earth',
            r'water boils at -10 degrees celsius',
            r'objects fall at different rates in vacuum based on weight',
            r'light travels instantly across any distance',
            r'humans can breathe underwater without any equipment',
            r'plants can survive indefinitely without sunlight',
            r'human brain uses 100% of its capacity at all times',
            r'2 \+ 2 = 5',
            r'square root of 16 is 5',
            r'all numbers are positive integers',
            r'pi equals exactly 3(?:\.0)?',
            r'moon is made of cheese',
            r'earth is flat',
            r'gravity doesn\'t exist in space',
            r'vaccines cause autism',
            r'metals are poor conductors of electricity',
            r'rusting is a physical change'
        ]

        # Check patterns
        for pattern in positive_patterns:
            if re.search(pattern, text_lower):
                return 0.9 # High score for strong matches

        for pattern in negative_patterns:
            if re.search(pattern, text_lower):
                return 0.1 # Low score for strong false matches

        # Check for nonsensical patterns
        nonsensical_keywords = ['purple dreams', 'colorless green ideas', 'tastes like seventeen elephants', 'silence has a blue temperature', 'mathematics can be eaten']
        if any(keyword in text_lower for keyword in nonsensical_keywords):
            return 0.05 # Very low score for detected nonsensicality

        return 0.5 # Neutral if no strong patterns are found

    def _compute_confidence(self, components: Dict[str, float]) -> float:
        """Compute confidence in the epistemic score based on component agreement."""
        component_values = np.array(list(components.values()))

        # Filter out NaN/inf if any (should ideally not happen if components are handled)
        component_values = component_values[np.isfinite(component_values)]

        if len(component_values) < 2: # Need at least two values to compute std dev
            return 0.5 # Default confidence

        std_dev = np.std(component_values)
        mean_score = np.mean(component_values)

        # Confidence is higher when components agree (low std_dev) and when mean is extreme (close to 0 or 1)
        # Normalize std_dev to be between 0 and 1 (max possible std for 0-1 values is 0.5)
        normalized_std = min(std_dev / 0.5, 1.0)

        # Inverse relationship with normalized_std
        confidence_from_agreement = 1.0 - normalized_std

        # Boost confidence for extreme mean scores (closer to 0 or 1)
        extremity_boost = 1.0 + (0.5 - abs(mean_score - 0.5)) * 2 # Max boost at 0 or 1, min at 0.5

        final_confidence = confidence_from_agreement * extremity_boost
        return min(max(final_confidence, 0.0), 1.0)


    def filter_statements(self, statements: List[str], threshold: float = 0.5) -> Dict:
        """Filter statements based on epistemic scores."""
        results = {
            'accepted': [],
            'rejected': [],
            'scores': {},
            'summary': {}
        }

        for statement in statements:
            score_obj = self.compute_epistemic_score(statement)
            results['scores'][statement] = score_obj

            if score_obj.final_score >= threshold:
                results['accepted'].append(statement)
            else:
                results['rejected'].append(statement)

        # Generate summary statistics
        all_scores = [s.final_score for s in results['scores'].values()]
        results['summary'] = {
            'total_statements': len(statements),
            'accepted_count': len(results['accepted']),
            'rejected_count': len(results['rejected']),
            'acceptance_rate': len(statements) / len(statements) if len(statements) > 0 else 0.0, # Fixed typo (acceptance_rate)
            'mean_score': np.mean(all_scores) if len(all_scores) > 0 else 0.0,
            'std_score': np.std(all_scores) if len(all_scores) > 0 else 0.0,
            'threshold': threshold
        }

        return results

# Enhanced evaluation function
def evaluate_epistemic_filter():
    """Run the built-in labelled benchmark through the filter and print a report.

    A fresh ``EnhancedEpistemicFilter`` scores a fixed set of statements
    grouped by category.  Categories whose name contains ``'true'`` are
    labelled 1; every other category, including ``'nonsensical'``, is
    labelled 0.  A statement is predicted 1 when its ``final_score`` is at
    least the decision threshold (0.5).

    The report printed to stdout covers overall accuracy / precision /
    recall / F1, the confusion matrix, per-category accuracy and score
    range, the five highest- and five lowest-scoring statements, and the
    correlation of each score component with the ground-truth labels.

    Returns:
        Tuple ``(results, accuracy, precision, recall, f1)`` where
        ``results`` is the dictionary returned by ``filter_statements``.
    """
    # One decision threshold for the filter call and for every
    # score -> label conversion below, so they cannot drift apart.
    threshold = 0.5

    # Initialize filter
    filter_system = EnhancedEpistemicFilter()

    # Enhanced test dataset (expanded for more robust testing)
    test_data = {
        'physics_true': [
            "The Earth revolves around the Sun in an elliptical orbit.",
            "Light travels at approximately 299,792,458 meters per second in vacuum.",
            "Water boils at 100 degrees Celsius at standard atmospheric pressure.",
            "Gravity is a fundamental force that attracts objects with mass.",
            "Energy cannot be created or destroyed, only transformed.",
            "The speed of light is constant in vacuum regardless of observer motion.",
            "Mass and energy are equivalent according to E=mc².",
        ],
        'biology_true': [
            "DNA contains genetic information in the form of nucleotide sequences.",
            "Plants produce oxygen through photosynthesis using sunlight.",
            "The human heart pumps blood through the circulatory system.",
            "Cellular respiration converts glucose and oxygen into ATP energy.",
            "Species evolve through natural selection.",
            "The nervous system transmits electrical signals throughout the body.",
        ],
        'math_true': [
            "Two plus two equals four.",
            "The square root of 16 is 4.",
            "Prime numbers are divisible only by 1 and themselves.",
            "The sum of angles in a triangle is 180 degrees.",
            "Zero is neither positive nor negative.",
            "The area of a circle is pi times the radius squared.",
        ],
        'physics_false': [
            "The Sun revolves around the Earth.",
            "Water boils at -10 degrees Celsius at standard pressure.",
            "Objects fall at different rates in vacuum based on their weight.",
            "Light travels instantly across any distance.",
            "The Earth is flat.",
            "Energy can be created from nothing.",
        ],
        'biology_false': [
            "Humans can breathe underwater without any equipment.",
            "Plants can survive indefinitely without sunlight or artificial light.",
            "The human brain uses 100% of its capacity at all times.",
            "All birds can fly, including penguins in the air.",
            "DNA is made of proteins only.",
            "Vaccines cause autism.",
        ],
        'math_false': [
            "Two plus two equals five.",
            "The square root of 16 is 5.",
            "All numbers are positive integers.",
            "Pi equals exactly 3.0.",
            "Every even number is also an odd number.",
            "Division by zero equals infinity.",
        ],
        'nonsensical': [
            "Purple dreams sing mathematics loudly in the quantum vacuum.",
            "The colorless green ideas sleep furiously on Tuesday mornings.",
            "Elephants can naturally fly through chocolate-flavored dimensions.",
            "Time tastes like seventeen blue thoughts dancing backwards.",
            "Invisible libraries contain books written in unspoken languages.",
            "The square circle meditates silently on the color of sound.",
        ]
    }

    # Flatten test data; only categories named '*true*' are positives.
    all_statements = []
    ground_truth = []

    for category, statements in test_data.items():
        all_statements.extend(statements)
        label = 1 if 'true' in category else 0
        ground_truth.extend([label] * len(statements))

    # Run filtering
    print("🧠 ENHANCED EPISTEMIC FILTERING EVALUATION")
    print("=" * 70)

    results = filter_system.filter_statements(all_statements, threshold=threshold)

    # Convert results to binary predictions
    predictions = [
        1 if results['scores'][statement].final_score >= threshold else 0
        for statement in all_statements
    ]

    # Calculate metrics
    accuracy = accuracy_score(ground_truth, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        ground_truth, predictions, average='binary', zero_division=0
    )

    # Passing labels=[0, 1] pins the matrix to 2x2 even when only one class
    # shows up in the labels/predictions, so ravel() always unpacks cleanly.
    cm = confusion_matrix(ground_truth, predictions, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print("\n📊 PERFORMANCE METRICS")
    print("-" * 40)
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print("\nConfusion Matrix:")
    print(f"TN: {tn}, FP: {fp}")
    print(f"FN: {fn}, TP: {tp}")

    # Detailed analysis by category
    print("\n🔍 DETAILED ANALYSIS BY CATEGORY")
    print("-" * 50)

    for category, statements in test_data.items():
        is_true_category = 'true' in category
        category_scores = [results['scores'][stmt].final_score for stmt in statements]
        category_predictions = [1 if score >= threshold else 0 for score in category_scores]
        category_ground_truth = [1 if is_true_category else 0] * len(statements)

        # Guard against an empty category so the metric helpers are never
        # called on zero samples.
        if category_ground_truth:
            category_accuracy = accuracy_score(category_ground_truth, category_predictions)
            cat_precision, cat_recall, cat_f1, _ = precision_recall_fscore_support(
                category_ground_truth, category_predictions, average='binary', zero_division=0
            )
        else:
            category_accuracy = 0.0
            cat_precision, cat_recall, cat_f1 = 0.0, 0.0, 0.0

        print(f"{category.upper()}: {category_accuracy:.3f} accuracy")
        if is_true_category:  # P/R/F1 only make sense where the positive class exists
            print(f"  Precision: {cat_precision:.3f}, Recall: {cat_recall:.3f}, F1: {cat_f1:.3f}")
        print(f"  Mean Score: {np.mean(category_scores):.3f}")
        print(f"  Score Range: {np.min(category_scores):.3f} - {np.max(category_scores):.3f}")

    # Rank every statement by final score (highest first); the head of the
    # ranking is shown as "best", the tail as "challenging".
    print("\n✅ BEST PREDICTIONS (High Score / High Confidence)")
    print("-" * 50)

    scored_statements_by_score = sorted(
        ((stmt, results['scores'][stmt]) for stmt in all_statements),
        key=lambda pair: pair[1].final_score,
        reverse=True,
    )

    for stmt, score_obj in scored_statements_by_score[:5]:
        print(f"Score: {score_obj.final_score:.3f} | Confidence: {score_obj.confidence:.3f}")
        print(f"  {stmt[:80]}...")  # Truncate for display
        print()

    print("\n❌ CHALLENGING CASES (Low Score / Low Confidence)")
    print("-" * 50)

    for stmt, score_obj in scored_statements_by_score[-5:]:
        print(f"Score: {score_obj.final_score:.3f} | Confidence: {score_obj.confidence:.3f}")
        print(f"  {stmt[:80]}...")  # Truncate for display
        print()

    # Component analysis
    print("\n⚙️ COMPONENT CONTRIBUTION ANALYSIS")
    print("-" * 50)

    component_names = ['similarity', 'anti_similarity', 'logical_coherence',
                       'semantic_consistency', 'structural_validity',
                       'linguistic_quality', 'confidence']

    for component in component_names:
        # Walk all_statements (not the scores dict) so each component value
        # lines up index-for-index with ground_truth.  Missing components
        # default to 0.0.
        component_scores = [
            results['scores'][stmt].components.get(component, 0.0)
            for stmt in all_statements
        ]
        # Correlation is undefined without variance on both sides.
        if (len(component_scores) > 1 and len(ground_truth) > 1
                and np.std(component_scores) != 0 and np.std(ground_truth) != 0):
            correlation = np.corrcoef(component_scores, ground_truth)[0, 1]
            print(f"{component.replace('_', ' ').title()}: {correlation:.3f} correlation with ground truth")
        else:
            print(f"{component.replace('_', ' ').title()}: Cannot compute correlation (insufficient variance or data)")

    return results, accuracy, precision, recall, f1


# Advanced epistemic uncertainty quantification
class EpistemicUncertaintyQuantifier:
    """Estimate how stable and how internally consistent a statement's score is.

    The quantifier wraps a filter object and only relies on its
    ``compute_epistemic_score(statement)`` method, whose result must expose
    ``final_score``, ``confidence`` and a ``components`` mapping.
    """

    def __init__(self, filter_system: EnhancedEpistemicFilter):
        self.filter_system = filter_system
        # Human-readable glossary; not consulted by the methods of this class.
        self.uncertainty_types = {
            'aleatory': 'Inherent randomness in the data',
            'epistemic': 'Uncertainty due to lack of knowledge',
            'model': 'Uncertainty in model predictions',
            'measurement': 'Uncertainty in measurement or observation'
        }

    def quantify_uncertainty(self, statement: str, n_samples: int = 10) -> Dict[str, Union[float, Tuple[float, float]]]:
        """Quantify different types of uncertainty for ``statement``.

        Args:
            statement: Text to analyse.
            n_samples: Number of randomly perturbed variants to score.  When
                it is zero or negative the unperturbed score is used as the
                only sample, so the spread is 0 and the interval collapses
                to that score.

        Returns:
            Dictionary with:

            * ``'total_uncertainty'``: standard deviation of the final
              scores of the perturbed variants.
            * ``'epistemic_uncertainty'``: coefficient of variation
              (std / mean) of the finite component values of the
              unperturbed score; ``1.0`` when fewer than two finite
              components exist or their mean is not positive.
            * ``'model_uncertainty'``: ``1 - confidence`` of the
              unperturbed score.
            * ``'confidence_interval'``: ``(2.5th, 97.5th)`` percentile of
              the perturbed scores.
            * ``'score_mean_from_perturbations'``: mean perturbed score.
        """
        base_score_obj = self.filter_system.compute_epistemic_score(statement)

        # Score randomly perturbed variants to see how much the result moves.
        perturbed_scores = [
            self.filter_system.compute_epistemic_score(
                self._perturb_statement(statement)
            ).final_score
            for _ in range(n_samples)
        ]
        if not perturbed_scores:
            # np.std / np.percentile are undefined on an empty sample.
            perturbed_scores = [base_score_obj.final_score]

        # Disagreement between score components, ignoring NaN / inf entries.
        component_values = np.array(list(base_score_obj.components.values()), dtype=float)
        component_values = component_values[np.isfinite(component_values)]

        # Check the size first so the mean is never taken of an empty array.
        if component_values.size > 1 and component_values.mean() > 0:
            epistemic_uncertainty = float(component_values.std() / component_values.mean())
        else:
            epistemic_uncertainty = 1.0  # Max uncertainty if mean is zero or only one component

        # Lower confidence means higher model uncertainty.
        model_uncertainty = 1.0 - base_score_obj.confidence

        # Empirical 95% interval of the perturbed scores.
        ci_lower = float(np.percentile(perturbed_scores, 2.5))
        ci_upper = float(np.percentile(perturbed_scores, 97.5))

        return {
            'total_uncertainty': float(np.std(perturbed_scores)),
            'epistemic_uncertainty': epistemic_uncertainty,
            'model_uncertainty': model_uncertainty,
            'confidence_interval': (ci_lower, ci_upper),
            'score_mean_from_perturbations': float(np.mean(perturbed_scores))  # Useful for CI context
        }

    def _perturb_statement(self, statement: str) -> str:
        """Return ``statement`` after one randomly chosen surface-level edit.

        Statements with fewer than three whitespace-separated words are
        returned unchanged.  Randomness comes from NumPy's global
        ``np.random`` state, so seed that for reproducible output.
        """
        if len(statement.split()) < 3:
            return statement

        perturbations_funcs = [
            lambda s: s.replace('.', '').replace(',', ''),  # Remove punctuation
            lambda s: s.lower(),  # Change case
            lambda s: s.capitalize(),  # Upper-case first char, lower-case the rest
            lambda s: s + ' in theory',  # Add qualifier
            lambda s: s.replace('the ', 'a '),  # Change articles (minor semantic shift)
            # Swap the first be-verb for a random one.  ``count`` is passed
            # by keyword: the positional form is deprecated.
            lambda s: re.sub(
                r'\b(is|are|was|were)\b',
                str(np.random.choice(['is', 'are', 'was', 'were'])),
                s,
                count=1,
            ),
        ]
        # Draw an index rather than asking NumPy to wrap the callables in an
        # object array.
        chosen = perturbations_funcs[np.random.choice(len(perturbations_funcs))]
        return chosen(statement)


# Epistemic memory and learning system
class EpistemicMemorySystem:
    """Keep scored statements in labelled buckets and search them by similarity.

    Every stored entry is a dict holding the statement text, its final
    score and confidence (from ``filter_system.compute_epistemic_score``),
    the caller-supplied source reliability, an insertion timestamp, and the
    optional ground-truth label.  Entries live in one of the
    ``memory_bank`` buckets: ``'verified_true'``, ``'verified_false'``,
    ``'uncertain'`` or ``'contradictory'`` (the last is never written to by
    this class).
    """

    def __init__(self, filter_system: EnhancedEpistemicFilter):
        self.filter_system = filter_system
        self.memory_bank = {
            'verified_true': [],
            'verified_false': [],
            'uncertain': [],
            'contradictory': []
        }
        self.learning_rate = 0.1  # Conceptually for future learning; unused here
        self.confidence_threshold = 0.8  # Threshold for automatic verification

    def add_statement(self, statement: str, ground_truth: Optional[bool] = None,
                     source_reliability: float = 1.0):
        """Score ``statement`` and file it in the appropriate memory bucket.

        Args:
            statement: Text to store.
            ground_truth: Known label.  When given it alone decides between
                ``'verified_true'`` and ``'verified_false'``.
            source_reliability: Stored with the entry; it does not affect
                which bucket is chosen.

        Without a ground truth the entry is auto-verified only when the
        confidence exceeds ``confidence_threshold`` and the final score is
        above 0.7 (true) or below 0.3 (false); everything else is filed
        under ``'uncertain'``.
        """
        score_obj = self.filter_system.compute_epistemic_score(statement)

        entry = {
            'statement': statement,
            'score': score_obj.final_score,
            'confidence': score_obj.confidence,
            'source_reliability': source_reliability,
            'timestamp': time.time(),
            'ground_truth': ground_truth
        }

        if ground_truth is not None:
            bucket = 'verified_true' if ground_truth else 'verified_false'
        elif score_obj.confidence > self.confidence_threshold and score_obj.final_score > 0.7:
            bucket = 'verified_true'  # High confidence, likely true
        elif score_obj.confidence > self.confidence_threshold and score_obj.final_score < 0.3:
            bucket = 'verified_false'  # High confidence, likely false
        else:
            # Low confidence, or confident but with an ambiguous score.
            bucket = 'uncertain'

        self.memory_bank[bucket].append(entry)

    def get_memory_stats(self) -> Dict[str, int]:
        """Return the number of stored entries per bucket."""
        return {
            category: len(statements)
            for category, statements in self.memory_bank.items()
        }

    def find_similar_statements(self, query_statement: str, threshold: float = 0.6) -> List[Dict]:
        """Return stored statements whose cosine similarity to the query is >= ``threshold``.

        Args:
            query_statement: Text to compare against every stored statement.
            threshold: Minimum cosine similarity (inclusive) for a match.

        Returns:
            A list of dicts (``statement``, ``similarity``, ``category``,
            ``score``, ``confidence``, ``ground_truth``) sorted by
            descending similarity.  An empty memory yields ``[]`` without
            invoking the embedder.
        """
        # Remember each entry's bucket while flattening, so matches never
        # need to be searched for again afterwards.
        tagged_entries = []
        for category, entries in self.memory_bank.items():
            for entry in entries:
                tagged_entries.append((category, entry))

        if not tagged_entries:
            return []

        embedder = self.filter_system.embedder
        device = self.filter_system.device

        query_embedding = embedder.encode(query_statement, convert_to_tensor=True, device=device)

        # Encode all memory statements in a batch for efficiency
        memory_embeddings = embedder.encode(
            [entry['statement'] for _, entry in tagged_entries],
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False  # Suppress progress bar during search
        )

        # Row 0 holds the query's similarity to every stored statement.
        similarities = util.pytorch_cos_sim(query_embedding, memory_embeddings)[0]

        similar_statements = []
        for (category, entry), sim_score in zip(tagged_entries, similarities):
            similarity = float(sim_score)
            if similarity >= threshold:
                similar_statements.append({
                    'statement': entry['statement'],
                    'similarity': similarity,
                    'category': category,
                    'score': entry['score'],
                    'confidence': entry['confidence'],
                    'ground_truth': entry['ground_truth']  # Include ground truth if known
                })

        return sorted(similar_statements, key=lambda x: x['similarity'], reverse=True)


# Advanced evaluation and benchmarking
def run_comprehensive_evaluation():
    """Benchmark the filter on several labelled suites and demo the helper systems.

    For every suite the statements are scored with a 0.5 threshold,
    accuracy / precision / recall / F1 are printed, and each statement is
    stored in the memory system together with its ground-truth label.
    Pooled metrics across all suites follow, then an uncertainty report for
    a handful of sample statements, the memory bucket counts, and a
    similarity search against the populated memory.

    Returns:
        Tuple ``(overall_results, filter_system, memory_system,
        uncertainty_quantifier)`` where ``overall_results`` maps each suite
        name to its metrics and statement count.
    """
    epistemic_filter = EnhancedEpistemicFilter()
    quantifier = EpistemicUncertaintyQuantifier(epistemic_filter)
    memory = EpistemicMemorySystem(epistemic_filter)

    # Labelled suites: the inner key 'true' marks positives, any other
    # inner key marks negatives.
    suites = {
        'scientific_facts': {
            'true': [
                "Water has a molecular formula of H2O consisting of two hydrogen atoms and one oxygen atom.",
                "The speed of light in vacuum is approximately 299,792,458 meters per second.",
                "DNA stores genetic information in the sequence of four nucleotide bases: A, T, G, and C.",
                "The Earth's atmosphere is approximately 78% nitrogen and 21% oxygen.",
                "Photosynthesis converts carbon dioxide and water into glucose using sunlight energy.",
                "The human body has 206 bones in the adult skeleton.",
                "Gravity accelerates objects at 9.8 meters per second squared on Earth's surface.",
            ],
            'false': [
                "The Sun is made primarily of liquid hydrogen and helium.",
                "Humans use 100% of their brain capacity at all times.",
                "Lightning never strikes the same place twice.",
                "Water always boils at exactly 100°C regardless of pressure.",
                "The Great Wall of China is visible from space with the naked eye.",
                "Diamonds are the hardest substance known to exist.",
            ]
        },
        'mathematical_statements': {
            'true': [
                "The sum of the angles in any triangle is always 180 degrees.",
                "Prime numbers greater than 2 are always odd numbers.",
                "The square root of 144 is 12.",
                "Zero is neither positive nor negative.",
                "The area of a circle is π times the radius squared.",
                "In base 10, the number 1000 has exactly 4 digits.",
            ],
            'false': [
                "The sum of any two even numbers is always odd.",
                "All prime numbers are odd numbers.",
                "The square root of 169 is 14.",
                "Negative numbers are larger than positive numbers.",
                "The circumference of a circle is 2 times π times the diameter squared.",
                "Division by zero equals infinity.",
            ]
        },
        'logical_reasoning': {
            'true': [
                "If all cats are mammals and all mammals are animals, then all cats are animals.",
                "If it is raining, then the ground will be wet (assuming no shelter).",
                "If A equals B and B equals C, then A equals C.",
                "Something cannot be both completely true and completely false simultaneously.",
            ],
            'false': [
                "If some birds can fly, then all animals can fly.",
                "If it is sunny, then it cannot be raining anywhere in the world.",
                "If A is greater than B, then B is greater than A.",
                "All statements are either true or false, with no middle ground.",
            ]
        },
        'nonsensical_statements': {
            'false': [
                "The color purple tastes like Wednesday morning's forgotten mathematics.",
                "Triangular emotions sing hexagonal melodies in the quantum soup.",
                "Yesterday's tomorrow is dancing with next week's blue silence.",
                "The number seven is heavier than the concept of circular time.",
                "Invisible libraries contain books written in unspoken languages.",
                "The square circle meditates silently on the color of sound.",
            ]
        }
    }

    print("🚀 COMPREHENSIVE EPISTEMIC FILTERING EVALUATION")
    print("=" * 80)

    per_suite_metrics = {}
    pooled_predictions = []
    pooled_labels = []

    for suite_name, labelled_groups in suites.items():
        print(f"\n📚 DATASET: {suite_name.upper()}")
        print("-" * 60)

        # Flatten the suite into parallel text / label lists.
        texts = []
        labels = []
        for group_label, group in labelled_groups.items():
            texts += group
            labels += [int(group_label == 'true')] * len(group)

        # Score the suite and turn final scores into 0/1 predictions.
        outcome = epistemic_filter.filter_statements(texts, threshold=0.5)
        guesses = [int(outcome['scores'][text].final_score >= 0.5) for text in texts]

        suite_accuracy = accuracy_score(labels, guesses)
        suite_precision, suite_recall, suite_f1, _support = precision_recall_fscore_support(
            labels, guesses, average='binary', zero_division=0
        )

        per_suite_metrics[suite_name] = dict(
            accuracy=suite_accuracy,
            precision=suite_precision,
            recall=suite_recall,
            f1=suite_f1,
            total_statements=len(texts),
        )

        print(
            f"Accuracy: {suite_accuracy:.3f} | Precision: {suite_precision:.3f} | "
            f"Recall: {suite_recall:.3f} | F1: {suite_f1:.3f}"
        )

        # Pool for the overall metrics computed after the loop.
        pooled_predictions += guesses
        pooled_labels += labels

        # Store every statement with its known label in the memory system.
        for text, label in zip(texts, labels):
            memory.add_statement(text, ground_truth=bool(label))

    # Metrics over all suites combined.
    print("\n🎯 OVERALL PERFORMANCE")
    print("-" * 40)

    pooled_accuracy = accuracy_score(pooled_labels, pooled_predictions)
    pooled_precision, pooled_recall, pooled_f1, _support = precision_recall_fscore_support(
        pooled_labels, pooled_predictions, average='binary', zero_division=0
    )

    print(f"Overall Accuracy:  {pooled_accuracy:.3f}")
    print(f"Overall Precision: {pooled_precision:.3f}")
    print(f"Overall Recall:    {pooled_recall:.3f}")
    print(f"Overall F1 Score:  {pooled_f1:.3f}")

    # Uncertainty report for a few hand-picked statements.
    print("\n🔮 UNCERTAINTY ANALYSIS")
    print("-" * 40)

    probes = (
        "The Earth revolves around the Sun.",
        "Water boils at 100 degrees Celsius.",
        "The Moon is made of cheese.",
        "Mathematics can be eaten with a spoon.",
        "Quantum mechanics describes the behavior of particles at atomic scales.",
    )

    for probe in probes:
        report = quantifier.quantify_uncertainty(probe)
        # Score of the unperturbed statement (not the perturbation mean).
        unperturbed = epistemic_filter.compute_epistemic_score(probe)
        ci_low, ci_high = report['confidence_interval']

        print(f"\nStatement: {probe}")
        print(f"  Score: {unperturbed.final_score:.3f}")
        print(f"  Total Uncertainty: {report['total_uncertainty']:.3f}")
        print(f"  Epistemic Uncertainty: {report['epistemic_uncertainty']:.3f}")
        print(f"  Model Uncertainty: {report['model_uncertainty']:.3f}")
        print(f"  95% CI: [{ci_low:.3f}, {ci_high:.3f}]")

    # Bucket sizes of the memory populated above.
    print("\n🧠 MEMORY SYSTEM STATISTICS")
    print("-" * 40)

    for bucket, size in memory.get_memory_stats().items():
        print(f"{bucket.replace('_', ' ').title()}: {size} statements")

    # The search runs over the statements stored in memory by the loop above.
    print("\n🔍 SIMILARITY SEARCH TEST")
    print("-" * 40)

    test_query = "The Earth is spherical in shape."
    matches = memory.find_similar_statements(test_query, threshold=0.7)

    print(f"Query: {test_query}")
    print(f"Found {len(matches)} similar statements:")

    if not matches:
        print("  No similar statements found above threshold.")
    else:
        for match in matches[:3]:  # Show top 3
            print(f"  Similarity: {match['similarity']:.3f} | Category: {match['category']} | Ground Truth: {match['ground_truth']}")
            print(f"  Statement: {match['statement']}")
            print()

    return per_suite_metrics, epistemic_filter, memory, quantifier


# Main execution
if __name__ == "__main__":
    # Stage 1: basic benchmark.  Warnings are silenced while it runs so the
    # model-loading messages do not clutter the report.
    print("Starting basic evaluation...")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        basic_results = evaluate_epistemic_filter()

    print(f"\n{'=' * 80}")
    print("Starting comprehensive evaluation...")

    # Stage 2: multi-suite benchmark, again with warnings silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        (comprehensive_results,
         filter_sys,
         memory_sys,
         uncertainty_sys) = run_comprehensive_evaluation()

    # Stage 3: per-suite recap of the comprehensive run.
    print("\n📋 FINAL SUMMARY")
    print("=" * 50)
    print("Enhanced Epistemic Filter Performance:")

    for dataset, metrics in comprehensive_results.items():
        print(
            f"\n{dataset.replace('_', ' ').title()}:\n"
            f"  Accuracy: {metrics['accuracy']:.3f}\n"
            f"  F1 Score: {metrics['f1']:.3f}\n"
            f"  Statements: {metrics['total_statements']}"
        )

Starting basic evaluation...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Device set to use cpu


🧠 ENHANCED EPISTEMIC FILTERING EVALUATION

📊 PERFORMANCE METRICS
----------------------------------------
Accuracy:  0.721
Precision: 0.613
Recall:    1.000
F1 Score:  0.760

Confusion Matrix:
TN: 12, FP: 12
FN: 0, TP: 19

🔍 DETAILED ANALYSIS BY CATEGORY
--------------------------------------------------
PHYSICS_TRUE: 1.000 accuracy
  Precision: 1.000, Recall: 1.000, F1: 1.000
  Mean Score: 0.709
  Score Range: 0.622 - 0.802
BIOLOGY_TRUE: 1.000 accuracy
  Precision: 1.000, Recall: 1.000, F1: 1.000
  Mean Score: 0.759
  Score Range: 0.692 - 0.805
MATH_TRUE: 1.000 accuracy
  Precision: 1.000, Recall: 1.000, F1: 1.000
  Mean Score: 0.701
  Score Range: 0.586 - 0.845
PHYSICS_FALSE: 0.333 accuracy
  Mean Score: 0.519
  Score Range: 0.449 - 0.590
BIOLOGY_FALSE: 0.667 accuracy
  Mean Score: 0.418
  Score Range: 0.346 - 0.510
MATH_FALSE: 0.333 accuracy
  Mean Score: 0.481
  Score Range: 0.353 - 0.532
NONSENSICAL: 0.667 accuracy
  Mean Score: 0.467
  Score Range: 0.385 - 0.543

✅ BEST PREDICTIO

Device set to use cpu
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


🚀 COMPREHENSIVE EPISTEMIC FILTERING EVALUATION

📚 DATASET: SCIENTIFIC_FACTS
------------------------------------------------------------
Accuracy: 0.615 | Precision: 0.583 | Recall: 1.000 | F1: 0.737

📚 DATASET: MATHEMATICAL_STATEMENTS
------------------------------------------------------------
Accuracy: 0.583 | Precision: 0.545 | Recall: 1.000 | F1: 0.706

📚 DATASET: LOGICAL_REASONING
------------------------------------------------------------
Accuracy: 0.625 | Precision: 0.571 | Recall: 1.000 | F1: 0.727

📚 DATASET: NONSENSICAL_STATEMENTS
------------------------------------------------------------
Accuracy: 0.667 | Precision: 0.000 | Recall: 0.000 | F1: 0.000

🎯 OVERALL PERFORMANCE
----------------------------------------
Overall Accuracy:  0.615
Overall Precision: 0.531
Overall Recall:    1.000
Overall F1 Score:  0.694

🔮 UNCERTAINTY ANALYSIS
----------------------------------------

Statement: The Earth revolves around the Sun.
  Score: 0.560
  Total Uncertainty: 0.009
  Epistem