# Hallucination and Reliability in Large Language Models

## Overview

Hallucination in LLMs refers to the generation of content that appears plausible but is factually incorrect, unsupported by training data, or inconsistent with provided context. This notebook covers:

- **Hallucination Types**: Factual, contextual, and logical inconsistencies
- **Detection Methods**: Pattern-based and ML-based detection systems
- **Faithfulness Evaluation**: Measuring adherence to source information
- **Reliability Assessment**: Comprehensive frameworks for trustworthiness

Let's start by implementing practical hallucination detection systems.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any, Optional
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK data.
# Older NLTK releases load the sentence tokenizer from the 'punkt' resource,
# while newer releases load it from 'punkt_tab'. Checking for both keeps
# nltk.sent_tokenize() usable regardless of the installed NLTK version.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

print("Libraries imported successfully!")

## 1. Factual Hallucination Detection

Let's implement a system to detect factual inconsistencies in generated text:

In [None]:
class FactualHallucinationDetector:
    """Detect factual hallucinations in generated text.

    Sentences that look like factual statements are extracted as claims and
    checked against a small, hard-coded demonstration knowledge base. When a
    context string is supplied, each claim is additionally checked for word
    overlap with that context.
    """

    # Parses "<city> is the capital of <country>" statements.
    _CAPITAL_FACT = re.compile(r'(.+) is the capital of (.+)', re.IGNORECASE)

    def __init__(self):
        # Simple knowledge base for demonstration
        self.knowledge_base = {
            'facts': {
                'Paris is the capital of France': True,
                'London is the capital of England': True,
                'Tokyo is the capital of Japan': True,
                'Berlin is the capital of Germany': True,
                'Madrid is the capital of Spain': True,
                'Rome is the capital of Italy': True,
                'The Earth is flat': False,
                'The moon is made of cheese': False,
                'Water boils at 100°C at sea level': True,
                'Humans have 10 fingers': True,
                'There are 8 planets in our solar system': True,
                'Shakespeare wrote Romeo and Juliet': True
            },
            'entities': {
                'Paris': {'type': 'city', 'country': 'France'},
                'London': {'type': 'city', 'country': 'England'},
                'Tokyo': {'type': 'city', 'country': 'Japan'},
                'Shakespeare': {'type': 'person', 'profession': 'playwright'},
                'Earth': {'type': 'planet', 'system': 'Solar System'}
            }
        }

        # Factual claim patterns
        self.claim_patterns = [
            r'(.+) is the capital of (.+)',
            r'(.+) wrote (.+)',
            r'(.+) is (?:a|an) (.+)',
            r'There are (\d+) (.+)',
            r'(.+) (?:boils|melts|freezes) at (.+)'
        ]

    def detect_factual_errors(self, generated_text, context=None):
        """Detect factual inconsistencies in generated text.

        Args:
            generated_text: Text whose sentences are screened for claims.
            context: Optional reference text; claims sharing 50% or fewer
                of their words with it are reported as unsupported.

        Returns:
            A dict with 'total_claims', 'factual_errors' (a list of error
            dicts, each carrying 'claim', 'type', 'severity', 'confidence'
            and 'explanation'), 'error_rate' and 'reliability_score'.
        """
        claims = self.extract_claims(generated_text)
        errors = []
        flagged_claims = 0

        for claim in claims:
            claim_errors = []

            # Check against knowledge base
            kb_result = self.verify_against_kb(claim)
            if kb_result['status'] == 'FALSE':
                claim_errors.append({
                    'claim': claim,
                    'type': 'factual_contradiction',
                    'severity': 'high',
                    'confidence': kb_result['confidence'],
                    'explanation': kb_result['explanation']
                })

            # Cross-reference with context if provided
            if context and not self.is_supported_by_context(claim, context):
                claim_errors.append({
                    'claim': claim,
                    'type': 'context_unsupported',
                    'severity': 'high',
                    # The less the claim overlaps the context, the more
                    # confident we are that it is unsupported.
                    'confidence': 1.0 - self._context_overlap_ratio(claim, context),
                    'explanation': 'Claim not supported by provided context'
                })

            if claim_errors:
                flagged_claims += 1
                errors.extend(claim_errors)

        # A single claim can raise two errors (contradiction + unsupported).
        # The rate is computed per claim so it stays within [0, 1] and the
        # reliability score can never become negative.
        error_rate = flagged_claims / len(claims) if claims else 0

        return {
            'total_claims': len(claims),
            'factual_errors': errors,
            'error_rate': error_rate,
            'reliability_score': 1 - error_rate if claims else 1.0
        }

    def extract_claims(self, text):
        """Extract verifiable claims from text.

        Returns one dict per sentence that contains a factual claim, with
        the stripped sentence ('text'), its entities and its claim type.
        """
        return [
            {
                'text': sentence.strip(),
                'entities': self.extract_entities(sentence),
                'claim_type': self.classify_claim_type(sentence)
            }
            for sentence in nltk.sent_tokenize(text)
            if self.contains_factual_claim(sentence)
        ]

    def contains_factual_claim(self, sentence):
        """Return True if the sentence contains a definitive verb form."""
        # Look for definitive statements
        definitive_patterns = [
            r'\bis\b', r'\bare\b', r'\bwas\b', r'\bwere\b',
            r'\bhas\b', r'\bhave\b', r'\bwrote\b', r'\binvented\b',
            r'\bdiscovered\b', r'\bfounded\b'
        ]

        sentence_lower = sentence.lower()
        return any(re.search(pattern, sentence_lower) for pattern in definitive_patterns)

    def extract_entities(self, sentence):
        """Extract entities from sentence (simplified).

        Every title-cased word longer than one character is treated as an
        entity; 'info' holds the knowledge-base record when one exists and
        None otherwise.
        """
        entities = []
        known_entities = self.knowledge_base['entities']

        # Simple entity extraction based on capitalization
        for word in sentence.split():
            clean_word = re.sub(r'[^\w]', '', word)
            if clean_word.istitle() and len(clean_word) > 1:
                entities.append({
                    'text': clean_word,
                    'info': known_entities.get(clean_word)
                })

        return entities

    def classify_claim_type(self, sentence):
        """Classify the claim as geographical, attribution, definitional or general."""
        sentence_lower = sentence.lower()

        # Whole-word matching: a plain substring test would classify any
        # sentence containing "this" or "history" as definitional.
        if re.search(r'\bcapitals?\b', sentence_lower):
            return 'geographical'
        if re.search(r'\b(?:wrote|invented|discovered)\b', sentence_lower):
            return 'attribution'
        if re.search(r'\b(?:is|are|was|were)\b', sentence_lower):
            return 'definitional'
        return 'general'

    def verify_against_kb(self, claim):
        """Verify a claim dict against the knowledge base.

        Returns a dict with 'status' ('TRUE', 'FALSE' or 'UNKNOWN'),
        'confidence' and 'explanation'.
        """
        claim_text = claim['text'].strip().strip('.').strip()
        facts = self.knowledge_base['facts']

        # Direct lookup
        if claim_text in facts:
            is_true = facts[claim_text]
            return {
                'status': 'TRUE' if is_true else 'FALSE',
                'confidence': 0.95,
                'explanation': f"Direct match in knowledge base: {is_true}"
            }

        # Pattern-based verification
        for pattern in self.claim_patterns:
            match = re.search(pattern, claim_text, re.IGNORECASE)
            if match:
                return self.verify_pattern_match(pattern, match, claim_text)

        # Unknown claim
        return {
            'status': 'UNKNOWN',
            'confidence': 0.0,
            'explanation': 'Claim not found in knowledge base'
        }

    def verify_pattern_match(self, pattern, match, claim_text):
        """Verify a claim that matched one of the claim patterns.

        Only capital-city claims are actually verified. A claim is TRUE
        when it agrees with a capital recorded as true in the knowledge
        base, and FALSE when the knowledge base records a different
        capital for the stated country or a different country for the
        stated city. Every other match is reported as UNKNOWN.
        """
        if 'capital' in pattern:
            city, country = (part.strip() for part in match.groups())
            facts = self.knowledge_base['facts']

            # Check if this exact capital relationship is recorded
            stated_fact = f"{city} is the capital of {country}"
            if stated_fact in facts:
                is_correct = facts[stated_fact]
                return {
                    'status': 'TRUE' if is_correct else 'FALSE',
                    'confidence': 0.9,
                    'explanation': f"Capital relationship verified: {is_correct}"
                }

            # The stated sentence is not recorded verbatim, so compare it
            # with the capitals the knowledge base does know about.
            city_key = city.casefold()
            country_key = country.casefold()
            conflict = None
            for known_city, known_country in self._known_capitals():
                same_city = self._ends_with_phrase(city_key, known_city.casefold())
                same_country = self._starts_with_phrase(country_key, known_country.casefold())
                if same_city and same_country:
                    return {
                        'status': 'TRUE',
                        'confidence': 0.9,
                        'explanation': "Capital relationship verified: True"
                    }
                # Remember the first disagreement, but keep looking in
                # case a later record fully agrees with the claim.
                if conflict is None and (same_city or same_country):
                    conflict = (known_city, known_country)

            if conflict is not None:
                return {
                    'status': 'FALSE',
                    'confidence': 0.9,
                    'explanation': (
                        "Contradicts knowledge base: "
                        f"{conflict[0]} is the capital of {conflict[1]}"
                    )
                }

        return {
            'status': 'UNKNOWN',
            'confidence': 0.3,
            'explanation': 'Pattern matched but not verified'
        }

    def is_supported_by_context(self, claim, context):
        """Return True when more than 50% of the claim's words occur in context."""
        return self._context_overlap_ratio(claim, context) > 0.5

    def _known_capitals(self):
        """Return (city, country) pairs the knowledge base marks as true."""
        pairs = []
        for fact, is_true in self.knowledge_base['facts'].items():
            parsed = self._CAPITAL_FACT.fullmatch(fact)
            if is_true and parsed:
                pairs.append((parsed.group(1).strip(), parsed.group(2).strip()))
        return pairs

    @staticmethod
    def _ends_with_phrase(text, phrase):
        """Return True if text ends with phrase on a word boundary."""
        return re.search(r'(?<!\w)' + re.escape(phrase) + r'$', text) is not None

    @staticmethod
    def _starts_with_phrase(text, phrase):
        """Return True if text starts with phrase on a word boundary."""
        return re.match(re.escape(phrase) + r'(?!\w)', text) is not None

    def _context_overlap_ratio(self, claim, context):
        """Return the fraction of the claim's distinct words found in context."""
        claim_words = set(re.findall(r'\w+', claim['text'].lower()))
        if not claim_words:
            return 0
        context_words = set(re.findall(r'\w+', context.lower()))
        return len(claim_words & context_words) / len(claim_words)

# Initialize the detector.
# This module-level instance is the one the factual test cell calls
# detect_factual_errors() on.
hallucination_detector = FactualHallucinationDetector()
print("Factual hallucination detector initialized!")

### Testing Factual Hallucination Detection

Let's test our detector with various examples containing both correct and incorrect facts:

In [None]:
# Test cases with factual and hallucinated content.
# Each case holds the text to analyse, a human-readable description and the
# number of errors the detector is expected to report for that text.
test_texts = [
    {
        'text': 'Paris is the capital of France. London is the capital of England. Shakespeare wrote Romeo and Juliet.',
        'description': 'All correct facts',
        'expected_errors': 0
    },
    {
        'text': 'Paris is the capital of Germany. Tokyo is the capital of China. The Earth is flat.',
        'description': 'Multiple factual errors',
        'expected_errors': 3
    },
    {
        'text': 'Berlin is the capital of Germany. Water boils at 100°C at sea level. The moon is made of cheese.',
        'description': 'Mixed correct and incorrect facts',
        'expected_errors': 1
    },
    {
        'text': 'I think machine learning is interesting. This might be a good approach. Perhaps we should consider this.',
        'description': 'Opinions and uncertain statements',
        'expected_errors': 0
    }
]

print("Testing factual hallucination detection...\n")

# One summary row per test case; aggregated after the loop.
results_summary = []

for i, test_case in enumerate(test_texts, 1):
    print(f"Test {i}: {test_case['description']}")
    print(f"Text: {test_case['text']}")
    
    # Run detection.
    # No context is passed, so only knowledge-base checks can produce errors.
    result = hallucination_detector.detect_factual_errors(test_case['text'])
    
    print(f"\nResults:")
    print(f"  Total claims extracted: {result['total_claims']}")
    print(f"  Factual errors detected: {len(result['factual_errors'])}")
    print(f"  Error rate: {result['error_rate']:.2%}")
    print(f"  Reliability score: {result['reliability_score']:.3f}")
    
    # Show detected errors
    if result['factual_errors']:
        print(f"\n  Detected errors:")
        for j, error in enumerate(result['factual_errors'], 1):
            print(f"    {j}. {error['claim']['text']}")
            print(f"       Type: {error['type']}")
            # NOTE(review): this assumes every error dict has a 'confidence'
            # key - confirm before passing a context to the detector here.
            print(f"       Confidence: {error['confidence']:.2f}")
            print(f"       Explanation: {error['explanation']}")
    
    # Check accuracy: a case passes only when the error count matches exactly
    detected_errors = len(result['factual_errors'])
    expected_errors = test_case['expected_errors']
    accuracy = "✓" if detected_errors == expected_errors else "✗"
    
    print(f"\n  Expected errors: {expected_errors}")
    print(f"  Detection accuracy: {accuracy}")
    
    results_summary.append({
        'test': i,
        'detected': detected_errors,
        'expected': expected_errors,
        'correct': detected_errors == expected_errors,
        'reliability_score': result['reliability_score']
    })
    
    print("-" * 70)

# Summary statistics: share of cases with a matching error count and the
# mean reliability score across all cases.
correct_detections = sum(1 for r in results_summary if r['correct'])
total_tests = len(results_summary)
avg_reliability = np.mean([r['reliability_score'] for r in results_summary])

print(f"\n=== DETECTION SUMMARY ===")
print(f"Correct detections: {correct_detections}/{total_tests} ({correct_detections/total_tests:.1%})")
print(f"Average reliability score: {avg_reliability:.3f}")

## 2. Contextual Consistency Checker

Let's implement a system to check if generated text is consistent with provided context:

In [None]:
class ContextualConsistencyChecker:
    """Check contextual consistency of generated text.

    Each generated sentence is scored by its TF-IDF cosine similarity to the
    context minus a heuristic contradiction penalty based on negation and
    contrast keywords.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

        # Contradiction indicators
        self.contradiction_patterns = [
            r'\b(?:not|never|no|none|neither)\b',
            r'\b(?:but|however|although|despite)\b',
            r'\b(?:opposite|contrary|different)\b',
            r'\b(?:wrong|incorrect|false)\b'
        ]

    def check_contextual_consistency(self, context, generated_text):
        """Check if generated text is consistent with context.

        Args:
            context: Reference text the generation should agree with.
            generated_text: Text to evaluate, scored sentence by sentence.

        Returns:
            A dict with 'inconsistencies' (sentences scoring below 0.3),
            'overall_consistency' (mean sentence score, 0 when there are
            no sentences), 'consistency_distribution' (counts of high,
            medium and low sentences) and 'total_sentences'.
        """
        # Split generated text into sentences
        gen_sentences = nltk.sent_tokenize(generated_text)

        inconsistencies = []
        consistency_scores = []

        for sentence in gen_sentences:
            # Calculate semantic similarity
            similarity_score = self.calculate_semantic_similarity(context, sentence)

            # Check for explicit contradictions
            contradiction_score = self.detect_contradictions(context, sentence)

            # Overall consistency score (can be negative when the penalty
            # exceeds the similarity)
            consistency_score = similarity_score - contradiction_score
            consistency_scores.append(consistency_score)

            # Flag inconsistencies
            if consistency_score < 0.3:  # Low consistency threshold
                inconsistencies.append({
                    'sentence': sentence,
                    'type': 'low_consistency',
                    'similarity_score': similarity_score,
                    'contradiction_score': contradiction_score,
                    'consistency_score': consistency_score
                })

        return {
            'inconsistencies': inconsistencies,
            'overall_consistency': np.mean(consistency_scores) if consistency_scores else 0,
            'consistency_distribution': {
                'high': sum(1 for s in consistency_scores if s > 0.7),
                'medium': sum(1 for s in consistency_scores if 0.3 <= s <= 0.7),
                'low': sum(1 for s in consistency_scores if s < 0.3)
            },
            'total_sentences': len(gen_sentences)
        }

    def calculate_semantic_similarity(self, context, sentence):
        """Calculate similarity of two texts using TF-IDF cosine similarity.

        Falls back to the share of the sentence's words that occur in the
        context when the vectorizer cannot build a vocabulary (for example
        when both texts consist only of stop words).
        """
        try:
            # Fit vectorizer on both texts
            tfidf_matrix = self.vectorizer.fit_transform([context, sentence])
        except ValueError:
            # Only the vectorizer's "empty vocabulary" failure is handled;
            # any other error is a real bug and must not be hidden.
            return self._word_overlap(context, sentence)

        # Calculate cosine similarity
        return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    @staticmethod
    def _word_overlap(context, sentence):
        """Return the fraction of the sentence's distinct words found in context."""
        sentence_words = set(re.findall(r'\w+', sentence.lower()))
        if not sentence_words:
            return 0
        context_words = set(re.findall(r'\w+', context.lower()))
        return len(context_words & sentence_words) / len(sentence_words)

    def detect_contradictions(self, context, sentence):
        """Score explicit contradiction signals in a sentence.

        Adds 0.2 for every contradiction pattern found in the sentence and
        0.3 for every distinct context word that appears directly after
        "not" in the sentence. The result is capped at 1.0.
        """
        contradiction_score = 0

        # Check for contradiction patterns in sentence
        sentence_lower = sentence.lower()
        for pattern in self.contradiction_patterns:
            if re.search(pattern, sentence_lower):
                contradiction_score += 0.2

        # Check for negation of context content. Collecting the words that
        # follow "not" and intersecting them with the context vocabulary is
        # deterministic (no dependence on set iteration order), covers the
        # whole context and matches whole words only.
        context_words = set(re.findall(r'\w+', context.lower()))
        negated_words = set(re.findall(r'\bnot (\w+)', sentence_lower))

        for _ in negated_words & context_words:
            contradiction_score += 0.3

        return min(contradiction_score, 1.0)  # Cap at 1.0

    def analyze_information_flow(self, context, generated_text):
        """Analyze how information flows from context to generated text.

        For every context sentence the most similar generated sentence is
        located; a context sentence counts as used when that similarity
        exceeds 0.3.

        Returns:
            A dict with 'context_usage' (one record per context sentence),
            'context_coverage' (fraction of context sentences used) and
            'unused_context_sentences'.
        """
        context_sentences = nltk.sent_tokenize(context)
        gen_sentences = nltk.sent_tokenize(generated_text)

        # Track which context sentences are referenced
        context_usage = []

        for i, ctx_sent in enumerate(context_sentences):
            max_similarity = 0
            best_match = None

            for j, gen_sent in enumerate(gen_sentences):
                similarity = self.calculate_semantic_similarity(ctx_sent, gen_sent)
                if similarity > max_similarity:
                    max_similarity = similarity
                    best_match = j

            context_usage.append({
                'context_sentence_id': i,
                'context_sentence': ctx_sent,
                'best_match_id': best_match,
                'best_match_sentence': gen_sentences[best_match] if best_match is not None else None,
                'similarity_score': max_similarity,
                'is_used': max_similarity > 0.3
            })

        used_context = sum(1 for usage in context_usage if usage['is_used'])
        context_coverage = used_context / len(context_sentences) if context_sentences else 0

        return {
            'context_usage': context_usage,
            'context_coverage': context_coverage,
            'unused_context_sentences': len(context_sentences) - used_context
        }

# Initialize the consistency checker.
# This module-level instance is the one the consistency test cell uses for
# check_contextual_consistency() and analyze_information_flow().
consistency_checker = ContextualConsistencyChecker()
print("Contextual consistency checker initialized!")

### Testing Contextual Consistency

Let's test the consistency checker with various context-generation pairs:

In [None]:
# Test cases for contextual consistency.
# Each case pairs a context with a generated text and names the consistency
# level ('high', 'medium' or 'low') the checker is expected to assign.
consistency_tests = [
    {
        'context': 'Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.',
        'generated': 'Machine learning allows computers to learn from data and identify patterns. This technology is used for making predictions and is part of artificial intelligence.',
        'description': 'Consistent with context',
        'expected_consistency': 'high'
    },
    {
        'context': 'Python is a high-level programming language known for its simplicity and readability. It was created by Guido van Rossum in 1991.',
        'generated': 'Python is a low-level programming language that is very complex to use. It was invented by John Smith in 2005.',
        'description': 'Contradicts context facts',
        'expected_consistency': 'low'
    },
    {
        'context': 'Climate change refers to long-term shifts in global temperatures and weather patterns. Human activities are the primary driver of recent climate change.',
        'generated': 'Weather can vary from day to day. Some regions experience more rainfall than others. Temperature fluctuations are normal.',
        'description': 'Related but not directly addressing context',
        'expected_consistency': 'medium'
    },
    {
        'context': 'The Renaissance was a period of cultural rebirth in Europe from the 14th to 17th centuries. It marked a transition from medieval to modern times.',
        'generated': 'Quantum physics deals with the behavior of matter at the atomic level. Particles can exist in multiple states simultaneously.',
        'description': 'Completely unrelated topic',
        'expected_consistency': 'low'
    }
]

print("Testing contextual consistency...\n")

for i, test_case in enumerate(consistency_tests, 1):
    print(f"Test {i}: {test_case['description']}")
    # Only the first 100 characters of each text are echoed
    print(f"Context: {test_case['context'][:100]}...")
    print(f"Generated: {test_case['generated'][:100]}...")
    
    # Run consistency check
    result = consistency_checker.check_contextual_consistency(
        test_case['context'], 
        test_case['generated']
    )
    
    print(f"\nConsistency Analysis:")
    print(f"  Overall consistency: {result['overall_consistency']:.3f}")
    print(f"  Total sentences: {result['total_sentences']}")
    print(f"  Inconsistencies found: {len(result['inconsistencies'])}")
    
    # Show consistency distribution
    dist = result['consistency_distribution']
    print(f"  Consistency distribution:")
    print(f"    High: {dist['high']} sentences")
    print(f"    Medium: {dist['medium']} sentences")
    print(f"    Low: {dist['low']} sentences")
    
    # Show inconsistencies if any
    if result['inconsistencies']:
        print(f"\n  Detected inconsistencies:")
        for j, inconsistency in enumerate(result['inconsistencies'], 1):
            print(f"    {j}. {inconsistency['sentence'][:80]}...")
            print(f"       Consistency score: {inconsistency['consistency_score']:.3f}")
    
    # Analyze information flow (how much of the context the generation reuses)
    flow_analysis = consistency_checker.analyze_information_flow(
        test_case['context'], 
        test_case['generated']
    )
    
    print(f"\n  Information Flow:")
    print(f"    Context coverage: {flow_analysis['context_coverage']:.2%}")
    print(f"    Unused context sentences: {flow_analysis['unused_context_sentences']}")
    
    # Determine consistency level from the mean sentence score:
    # above 0.7 is high, above 0.3 is medium, anything else is low
    if result['overall_consistency'] > 0.7:
        consistency_level = 'high'
    elif result['overall_consistency'] > 0.3:
        consistency_level = 'medium'
    else:
        consistency_level = 'low'
    
    # Compare the detected level with the expectation for this case
    expected = test_case['expected_consistency']
    accuracy = "✓" if consistency_level == expected else "✗"
    
    print(f"\n  Expected: {expected}, Detected: {consistency_level} {accuracy}")
    print("-" * 80)

## 3. Comprehensive Reliability Assessment

Let's create a comprehensive framework that combines multiple reliability measures:

In [None]:
class ReliabilityAssessmentFramework:
    """Comprehensive reliability assessment for LLM outputs.

    Combines factual checking, contextual consistency, logical coherence
    and uncertainty estimation into one weighted reliability score.
    """

    def __init__(self):
        self.hallucination_detector = FactualHallucinationDetector()
        self.consistency_checker = ContextualConsistencyChecker()
        self.uncertainty_estimator = UncertaintyEstimator()

        # Reliability weights
        self.weights = {
            'factual_accuracy': 0.35,
            'contextual_consistency': 0.25,
            'logical_coherence': 0.20,
            'uncertainty_calibration': 0.20
        }

    def assess_reliability(self, generated_text, context=None, sources=None):
        """Run every analysis on generated_text and combine the results.

        Args:
            generated_text: Text to assess.
            context: Optional reference text for the factual and
                consistency checks.
            sources: Accepted for interface compatibility; not used.

        Returns:
            A dict holding the four analysis results, the
            'overall_reliability' score, its 'reliability_grade' and a
            list of 'recommendations'.
        """
        assessment = {
            'timestamp': datetime.now().isoformat(),
            'text_length': len(generated_text),
            'factual_analysis': {},
            'consistency_analysis': {},
            'coherence_analysis': {},
            'uncertainty_analysis': {},
            'overall_reliability': 0.0,
            'reliability_grade': 'F',
            'recommendations': []
        }

        # 1. Factual Analysis
        assessment['factual_analysis'] = self.hallucination_detector.detect_factual_errors(
            generated_text, context
        )

        # 2. Contextual Consistency (if context provided)
        if context:
            assessment['consistency_analysis'] = (
                self.consistency_checker.check_contextual_consistency(context, generated_text)
            )
        else:
            # Neutral score. The flag lets generate_recommendations() tell
            # this placeholder apart from a measured value.
            assessment['consistency_analysis'] = {
                'overall_consistency': 0.5,
                'context_provided': False
            }

        # 3. Logical Coherence
        assessment['coherence_analysis'] = self.analyze_logical_coherence(generated_text)

        # 4. Uncertainty Analysis
        assessment['uncertainty_analysis'] = self.uncertainty_estimator.estimate_uncertainty(
            generated_text
        )

        # 5. Calculate Overall Reliability
        assessment['overall_reliability'] = self.calculate_overall_reliability(assessment)
        assessment['reliability_grade'] = self.get_reliability_grade(
            assessment['overall_reliability']
        )

        # 6. Generate Recommendations
        assessment['recommendations'] = self.generate_recommendations(assessment)

        return assessment

    def analyze_logical_coherence(self, text):
        """Analyze logical coherence between adjacent sentences of the text.

        Returns:
            A dict with 'average_coherence' (0.5 when the text has fewer
            than two sentences), the per-pair 'coherence_scores', the
            adjacent pairs flagged as 'contradictions', the
            'flow_indicators' counts and a letter 'coherence_grade'.
        """
        sentences = nltk.sent_tokenize(text)

        coherence_scores = []
        contradictions = []

        # Check sentence-to-sentence coherence
        for current_sent, next_sent in zip(sentences, sentences[1:]):
            # Calculate semantic similarity between adjacent sentences
            coherence_scores.append(
                self.consistency_checker.calculate_semantic_similarity(current_sent, next_sent)
            )

            # Check for contradictions
            contradiction_score = self.consistency_checker.detect_contradictions(
                current_sent, next_sent
            )
            if contradiction_score > 0.5:
                contradictions.append({
                    'sentence1': current_sent,
                    'sentence2': next_sent,
                    'contradiction_score': contradiction_score
                })

        # Check for logical flow indicators
        flow_indicators = self.count_flow_indicators(text)

        avg_coherence = np.mean(coherence_scores) if coherence_scores else 0.5

        return {
            'average_coherence': avg_coherence,
            'coherence_scores': coherence_scores,
            'contradictions': contradictions,
            'flow_indicators': flow_indicators,
            'coherence_grade': self.grade_coherence(avg_coherence, len(contradictions))
        }

    def count_flow_indicators(self, text):
        """Count logical flow indicators in text, per category plus 'total'."""
        flow_patterns = {
            'sequence': [r'\bfirst\b', r'\bsecond\b', r'\bthen\b', r'\bfinally\b', r'\bnext\b'],
            'causation': [r'\bbecause\b', r'\btherefore\b', r'\bthus\b', r'\bso\b', r'\bas a result\b'],
            'contrast': [r'\bhowever\b', r'\bbut\b', r'\balthough\b', r'\bnevertheless\b'],
            'addition': [r'\bmoreover\b', r'\bfurthermore\b', r'\badditionally\b', r'\balso\b']
        }

        text_lower = text.lower()
        indicators = {
            category: sum(len(re.findall(pattern, text_lower)) for pattern in patterns)
            for category, patterns in flow_patterns.items()
        }

        indicators['total'] = sum(indicators.values())
        return indicators

    def grade_coherence(self, avg_coherence, contradiction_count):
        """Grade logical coherence from 'A' to 'F'.

        Each contradiction lowers the average coherence by 0.1 (floored at
        0) before the grade thresholds are applied.
        """
        # Penalize for contradictions
        penalty = contradiction_count * 0.1
        adjusted_score = max(0, avg_coherence - penalty)

        if adjusted_score > 0.8:
            return 'A'
        if adjusted_score > 0.6:
            return 'B'
        if adjusted_score > 0.4:
            return 'C'
        if adjusted_score > 0.2:
            return 'D'
        return 'F'

    def calculate_overall_reliability(self, assessment):
        """Calculate the weighted overall reliability score in [0, 1].

        Every component is clamped to [0, 1] before weighting. The
        consistency score is a similarity minus a penalty and can be
        negative; unclamped, it would drag down the other components
        instead of merely contributing nothing.
        """
        scores = {
            'factual_accuracy': assessment['factual_analysis']['reliability_score'],
            'contextual_consistency': assessment['consistency_analysis']['overall_consistency'],
            'logical_coherence': assessment['coherence_analysis']['average_coherence'],
            'uncertainty_calibration': 1.0 - assessment['uncertainty_analysis']['overall_uncertainty']
        }

        overall_score = sum(
            max(0.0, min(1.0, value)) * self.weights[component]
            for component, value in scores.items()
        )
        return max(0, min(1, overall_score))

    def get_reliability_grade(self, score):
        """Convert reliability score to letter grade."""
        if score >= 0.9:
            return 'A+'
        if score >= 0.8:
            return 'A'
        if score >= 0.7:
            return 'B'
        if score >= 0.6:
            return 'C'
        if score >= 0.5:
            return 'D'
        return 'F'

    def generate_recommendations(self, assessment):
        """Generate improvement recommendations from an assessment dict."""
        recommendations = []

        # Factual accuracy recommendations
        if assessment['factual_analysis']['reliability_score'] < 0.7:
            recommendations.append("Improve factual accuracy by verifying claims against reliable sources")

        # Consistency recommendations. Skipped when no context was supplied:
        # the neutral placeholder score says nothing about the text.
        consistency = assessment['consistency_analysis']
        if consistency.get('context_provided', True) and consistency['overall_consistency'] < 0.6:
            recommendations.append("Enhance contextual consistency by staying closer to provided information")

        # Coherence recommendations
        if assessment['coherence_analysis']['average_coherence'] < 0.5:
            recommendations.append("Improve logical flow with better transitions and structure")

        # Uncertainty recommendations
        if assessment['uncertainty_analysis']['overall_uncertainty'] > 0.7:
            recommendations.append("Reduce uncertainty by providing more confident and specific statements")

        if not recommendations:
            recommendations.append("Overall reliability is good - maintain current quality standards")

        return recommendations

class UncertaintyEstimator:
    """Score how hedged a text is by counting hedging cues per sentence."""

    def __init__(self):
        # Patterns are matched against lower-cased sentences, which is why
        # every alternative below is written in lowercase.
        self.uncertainty_indicators = [
            r'\b(?:maybe|perhaps|possibly|probably|likely|might|could|may)\b',
            r'\b(?:i think|i believe|i guess|it seems|appears to be)\b',
            r'\b(?:uncertain|unsure|unclear|ambiguous)\b',
            r'\?',  # Question marks
            r'\b(?:approximately|roughly|about|around)\b'
        ]

    def estimate_uncertainty(self, text):
        """Summarise uncertainty across every sentence of ``text``.

        Returns a dict holding the mean sentence score
        (``overall_uncertainty``), the individual sentence scores, their
        high/medium/low bucket counts, and ``confidence_level``, which is
        ``1.0 - overall_uncertainty``.
        """
        per_sentence = [
            self.calculate_sentence_uncertainty(sentence)
            for sentence in nltk.sent_tokenize(text)
        ]

        # With no sentences there is nothing to average; report zero.
        mean_uncertainty = np.mean(per_sentence) if per_sentence else 0

        return {
            'overall_uncertainty': mean_uncertainty,
            'sentence_uncertainties': per_sentence,
            'uncertainty_distribution': self.categorize_uncertainties(per_sentence),
            'confidence_level': 1.0 - mean_uncertainty
        }

    def calculate_sentence_uncertainty(self, sentence):
        """Return a score in [0, 1] for how hedged one sentence is.

        The score is the number of indicator matches divided by the
        whitespace-separated word count, multiplied by 5 and capped at 1.0.
        A sentence with no words scores 0.
        """
        lowered = sentence.lower()
        hits = sum(
            len(re.findall(pattern, lowered))
            for pattern in self.uncertainty_indicators
        )

        n_words = len(sentence.split())
        if n_words == 0:
            return 0

        # Scale the per-word hit rate so that one cue in five words saturates.
        return min(hits / n_words * 5, 1.0)

    def categorize_uncertainties(self, uncertainties):
        """Count how many scores fall in the high, medium and low bands.

        High is above 0.7, medium is 0.3 to 0.7 inclusive, low is below 0.3.
        """
        high = [u for u in uncertainties if u > 0.7]
        medium = [u for u in uncertainties if 0.3 <= u <= 0.7]
        low = [u for u in uncertainties if u < 0.3]
        return {
            'high': len(high),
            'medium': len(medium),
            'low': len(low)
        }

# Create the module-level ReliabilityAssessmentFramework instance; the test
# cell further down calls assess_reliability() on it for each sample text.
reliability_framework = ReliabilityAssessmentFramework()
print("Comprehensive reliability assessment framework initialized!")

### Testing Comprehensive Reliability Assessment

Let's test our comprehensive framework with different types of generated text:

In [None]:
# Sample inputs for the comprehensive reliability assessment.
# Each entry pairs a source 'context' with a 'generated' answer and a short
# 'description' of what the pair is meant to exercise. Long texts are wrapped
# with implicit string concatenation; the resulting values are unchanged.
reliability_tests = [
    {
        'context': (
            'Artificial intelligence is transforming healthcare through '
            'machine learning algorithms that can analyze medical images '
            'and assist in diagnosis.'
        ),
        'generated': (
            'AI is revolutionizing healthcare by using machine learning '
            'to analyze medical images. '
            'These algorithms help doctors make more accurate diagnoses. '
            'This technology improves patient outcomes and reduces '
            'diagnostic errors.'
        ),
        'description': 'High-quality, consistent response',
    },
    {
        'context': 'Python is a programming language created by Guido van Rossum in 1991.',
        'generated': (
            'Python was invented by John Smith in 2005. '
            'It is a very difficult language to learn. '
            'Maybe it is used for web development, but I am not sure. '
            'The syntax might be complex.'
        ),
        'description': 'Factually incorrect and uncertain',
    },
    {
        'context': 'Climate change is caused by greenhouse gas emissions from human activities.',
        'generated': (
            'Climate change is a natural phenomenon. '
            'However, human activities also contribute to it. '
            'But then again, it might be entirely natural. '
            'Scientists are probably wrong about greenhouse gases.'
        ),
        'description': 'Self-contradictory and inconsistent',
    },
    {
        'context': 'The Renaissance was a period of cultural and artistic rebirth in Europe.',
        'generated': (
            'The Renaissance marked a significant cultural revival in Europe. '
            'This period saw remarkable achievements in art, literature, and science. '
            'Artists like Leonardo da Vinci and Michelangelo created '
            'masterpieces that continue to inspire us today.'
        ),
        'description': 'Coherent and well-structured',
    },
]

print("Testing comprehensive reliability assessment...\n")

# One summary row per test case; consumed by the summary visualization below.
assessment_results = []
divider = "-" * 80

for i, test_case in enumerate(reliability_tests, 1):
    description = test_case['description']
    source_context = test_case['context']
    generated_text = test_case['generated']

    # Echo the case under test, showing only the first 80 characters of each text.
    print(
        f"Test {i}: {description}",
        f"Context: {source_context[:80]}...",
        f"Generated: {generated_text[:80]}...",
        sep="\n",
    )

    # Run comprehensive assessment
    assessment = reliability_framework.assess_reliability(generated_text, source_context)

    overall = assessment['overall_reliability']
    grade = assessment['reliability_grade']
    factual = assessment['factual_analysis']
    consistency = assessment['consistency_analysis']
    coherence = assessment['coherence_analysis']
    uncertainty = assessment['uncertainty_analysis']

    print(
        "\n=== RELIABILITY ASSESSMENT ===",
        f"Overall Reliability: {overall:.3f} (Grade: {grade})",
        sep="\n",
    )

    # Component scores
    print(
        "\nComponent Analysis:",
        f"  Factual Accuracy: {factual['reliability_score']:.3f}",
        f"  Contextual Consistency: {consistency['overall_consistency']:.3f}",
        f"  Logical Coherence: {coherence['average_coherence']:.3f} (Grade: {coherence['coherence_grade']})",
        f"  Confidence Level: {uncertainty['confidence_level']:.3f}",
        sep="\n",
    )

    # Issues found; the consistency analysis may not carry an
    # 'inconsistencies' entry, so that lookup defaults to an empty list.
    n_factual = len(factual['factual_errors'])
    n_inconsistent = len(consistency.get('inconsistencies', []))
    n_contradictions = len(coherence['contradictions'])

    print(
        "\nIssues Detected:",
        f"  Factual errors: {n_factual}",
        f"  Contextual inconsistencies: {n_inconsistent}",
        f"  Logical contradictions: {n_contradictions}",
        sep="\n",
    )

    # Recommendations, numbered from 1
    print("\nRecommendations:")
    for number, recommendation in enumerate(assessment['recommendations'], 1):
        print(f"  {number}. {recommendation}")

    assessment_results.append({
        'test': i,
        'description': description,
        'reliability_score': overall,
        'grade': grade
    })

    print(divider)

# Summary visualization: a bar chart of per-test reliability scores next to a
# pie chart of the grade distribution, followed by printed summary statistics.
df_results = pd.DataFrame(assessment_results)
scores = df_results['reliability_score']
positions = range(len(df_results))  # x positions of the bars

plt.figure(figsize=(12, 6))

# Left panel: reliability score per test case, coloured by score band
# (green above 0.7, orange above 0.5, red otherwise).
plt.subplot(1, 2, 1)
bar_colors = ['green' if score > 0.7 else 'orange' if score > 0.5 else 'red'
              for score in scores]
bars = plt.bar(positions, scores, color=bar_colors)
plt.xlabel('Test Case')
plt.ylabel('Reliability Score')
plt.title('Reliability Assessment Results')
# Label ticks with the recorded test numbers rather than re-deriving them
# from the bar positions.
plt.xticks(positions, [f"Test {number}" for number in df_results['test']])
plt.ylim(0, 1)

# Add score labels just above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
             f'{height:.3f}', ha='center', va='bottom')

# Right panel: share of test cases per letter grade
plt.subplot(1, 2, 2)
grade_counts = df_results['grade'].value_counts()
plt.pie(grade_counts.values, labels=grade_counts.index, autopct='%1.0f%%')
plt.title('Grade Distribution')

plt.tight_layout()
plt.show()

print("\n=== SUMMARY STATISTICS ===")
print(f"Average reliability score: {scores.mean():.3f}")
print(f"Best performing test: {df_results.loc[scores.idxmax(), 'description']}")
print(f"Lowest performing test: {df_results.loc[scores.idxmin(), 'description']}")