# Pretraining Data Engineering for LLMs

## Overview

Data engineering for LLM pretraining involves sophisticated techniques to ensure high-quality, diverse, and uncontaminated training datasets. This notebook covers:

- **Data Deduplication**: Exact and near-duplicate detection using MinHash and LSH
- **Data Filtering**: Quality-based and content-based filtering strategies
- **Contamination Detection**: Test set and benchmark contamination analysis
- **Dataset Balancing**: Domain, quality, and temporal balancing techniques

Let's implement practical data engineering pipelines for LLM training.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import re
import random
import string
from collections import defaultdict, Counter
from typing import Dict, List, Set, Tuple, Any
import json
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Download required NLTK tokenizer data.
# Newer NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource rather than the pickled 'punkt' models, so make sure both are
# available; otherwise nltk.sent_tokenize raises LookupError at call time.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

print("Libraries imported successfully!")

## 1. Data Deduplication System

Let's implement a comprehensive deduplication system using exact matching, MinHash, and LSH:

In [None]:
class MinHashDeduplicator:
    """MinHash-based deduplication for near-duplicate detection.

    Documents are reduced to sets of character shingles, summarised as
    MinHash signatures, and indexed with banded locality-sensitive hashing
    (LSH) so that candidates can be looked up without comparing every pair.

    Args:
        num_hashes: Length of each MinHash signature.
        shingle_size: Number of characters per shingle.
        bands: Requested number of LSH bands; clamped to ``[1, num_hashes]``
            so that every band covers at least one signature row.
        seed: Optional seed for the hash-function coefficients. With the
            default ``None`` the module-level ``random`` generator is used,
            so coefficients vary between runs unless the caller seeds it.
    """

    # All hash values are reduced into [0, 2**32).
    _HASH_MODULUS = 2**32

    def __init__(self, num_hashes=128, shingle_size=3, bands=16, seed=None):
        self.num_hashes = num_hashes
        self.shingle_size = shingle_size
        self._seed = seed
        self.hash_functions = self._generate_hash_functions()
        self.signatures = {}
        self.lsh_buckets = defaultdict(list)
        # More bands than hashes would give zero rows per band, which puts
        # every document into the same bucket; clamp instead.
        self.bands = max(1, min(bands, num_hashes))  # Number of LSH bands
        self.rows_per_band = num_hashes // self.bands

    def _generate_hash_functions(self):
        """Return ``num_hashes`` coefficient pairs ``(a, b)`` for ``a*x + b``."""
        rng = random if self._seed is None else random.Random(self._seed)
        upper = self._HASH_MODULUS - 1
        # Forcing ``a`` odd makes x -> (a*x + b) mod 2**32 a bijection; an
        # even ``a`` would map distinct shingle values onto the same result.
        return [
            (rng.randint(1, upper) | 1, rng.randint(0, upper))
            for _ in range(self.num_hashes)
        ]

    def _create_shingles(self, text):
        """Create the set of character-level shingles of *text*.

        The text is lower-cased, stripped, and runs of whitespace are
        collapsed to a single space first. Text shorter than
        ``shingle_size`` yields an empty set.
        """
        text = re.sub(r'\s+', ' ', text.lower().strip())
        last_start = len(text) - self.shingle_size + 1
        return {text[i:i + self.shingle_size] for i in range(last_start)}

    @classmethod
    def _shingle_to_int(cls, shingle):
        """Map a shingle to a stable integer in ``[0, 2**32)``.

        The built-in ``hash()`` is salted per interpreter process for
        strings, which would make signatures differ from run to run; a
        fixed digest keeps them reproducible.
        """
        digest = hashlib.blake2b(shingle.encode('utf-8'), digest_size=4).digest()
        return int.from_bytes(digest, 'big') % cls._HASH_MODULUS

    def _hash_shingle(self, shingle, a, b):
        """Hash a shingle with the linear function ``(a*x + b) mod 2**32``."""
        return (a * self._shingle_to_int(shingle) + b) % self._HASH_MODULUS

    def compute_minhash_signature(self, text, doc_id):
        """Compute, store and return the MinHash signature of a document.

        Returns:
            A list of ``num_hashes`` integers, also stored in
            ``self.signatures[doc_id]``; or ``None`` (nothing stored) when
            the text produces no shingles.
        """
        shingles = self._create_shingles(text)
        if not shingles:
            return None

        # The shingle -> int step does not depend on (a, b); do it once.
        shingle_values = [self._shingle_to_int(s) for s in shingles]
        modulus = self._HASH_MODULUS
        signature = [
            min((a * value + b) % modulus for value in shingle_values)
            for a, b in self.hash_functions
        ]

        self.signatures[doc_id] = signature
        return signature

    def estimate_jaccard_similarity(self, sig1, sig2):
        """Estimate Jaccard similarity as the fraction of matching positions.

        Returns ``0.0`` for empty signatures or signatures of different
        lengths.
        """
        if not sig1 or len(sig1) != len(sig2):
            return 0.0

        matches = sum(1 for h1, h2 in zip(sig1, sig2) if h1 == h2)
        return matches / len(sig1)

    def _band_keys(self, signature):
        """Yield one bucket key per LSH band for *signature*.

        The band index is part of the key so that identical row tuples in
        different bands do not land in the same bucket.
        """
        for band in range(self.bands):
            start_idx = band * self.rows_per_band
            end_idx = start_idx + self.rows_per_band
            yield band, tuple(signature[start_idx:end_idx])

    def add_to_lsh(self, doc_id, signature):
        """Add a document to the LSH buckets for later similarity search."""
        for bucket_key in self._band_keys(signature):
            self.lsh_buckets[bucket_key].append(doc_id)

    def find_similar_documents(self, doc_id, similarity_threshold=0.8):
        """Find indexed documents similar to an already-signed document.

        Args:
            doc_id: Id whose signature was stored by
                ``compute_minhash_signature``.
            similarity_threshold: Minimum estimated Jaccard similarity.

        Returns:
            ``(candidate_id, similarity)`` tuples sorted by descending
            similarity; an empty list when *doc_id* has no stored signature.
        """
        if doc_id not in self.signatures:
            return []

        signature = self.signatures[doc_id]

        # Candidates are all documents sharing at least one band bucket.
        candidate_docs = set()
        for bucket_key in self._band_keys(signature):
            if bucket_key in self.lsh_buckets:
                candidate_docs.update(self.lsh_buckets[bucket_key])
        candidate_docs.discard(doc_id)

        # Sharing a bucket is only a hint; confirm on the full signature.
        similar_docs = []
        for candidate_id in candidate_docs:
            if candidate_id not in self.signatures:
                continue
            similarity = self.estimate_jaccard_similarity(
                signature, self.signatures[candidate_id]
            )
            if similarity >= similarity_threshold:
                similar_docs.append((candidate_id, similarity))

        similar_docs.sort(key=lambda pair: pair[1], reverse=True)
        return similar_docs

class DataDeduplicationPipeline:
    """Two-stage deduplication pipeline: exact hashing, then MinHash/LSH.

    State is kept across calls to ``process_document``:

    * ``exact_hashes`` - MD5 hex digests of every distinct text seen so far.
    * ``minhash_deduplicator`` - signature store and LSH index.
    * ``deduplication_stats`` - running counters; every processed document
      is counted in exactly one of ``exact_duplicates``,
      ``near_duplicates`` or ``unique_documents``.
    * ``processed_documents`` - the documents that were kept, by id.
    """

    def __init__(self):
        self.exact_hashes = set()
        self.minhash_deduplicator = MinHashDeduplicator()
        self.deduplication_stats = {
            'total_documents': 0,
            'exact_duplicates': 0,
            'near_duplicates': 0,
            'unique_documents': 0
        }
        self.processed_documents = {}

    def process_document(self, doc_id, text, similarity_threshold=0.8):
        """Run one document through the pipeline and classify it.

        Args:
            doc_id: Identifier used for the signature store and the result.
            text: Document text.
            similarity_threshold: Minimum estimated Jaccard similarity for
                a document to count as a near duplicate.

        Returns:
            A dict with ``doc_id``, ``is_duplicate``, ``duplicate_type``
            (``'exact'``, ``'near'`` or ``'none'``), ``similarity`` and
            ``action`` (``'remove'`` or ``'keep'``). Near duplicates also
            carry ``similar_to``; documents too short to shingle carry
            ``note``.
        """
        stats = self.deduplication_stats
        stats['total_documents'] += 1

        # Step 1: exact deduplication on the raw text.
        exact_hash = hashlib.md5(text.encode('utf-8')).hexdigest()
        if exact_hash in self.exact_hashes:
            stats['exact_duplicates'] += 1
            return {
                'doc_id': doc_id,
                'is_duplicate': True,
                'duplicate_type': 'exact',
                'similarity': 1.0,
                'action': 'remove'
            }
        self.exact_hashes.add(exact_hash)

        # Step 2: near-duplicate detection using MinHash.
        signature = self.minhash_deduplicator.compute_minhash_signature(text, doc_id)

        if signature is None:
            # No shingles could be built, so near-duplicate detection is not
            # possible. The document is kept, so it must be counted and
            # recorded as kept; otherwise the counters no longer add up to
            # total_documents.
            stats['unique_documents'] += 1
            self.processed_documents[doc_id] = {
                'text': text,
                'signature': None,
                'exact_hash': exact_hash
            }
            return {
                'doc_id': doc_id,
                'is_duplicate': False,
                'duplicate_type': 'none',
                'similarity': 0.0,
                'action': 'keep',
                'note': 'empty_or_too_short'
            }

        # NOTE: the document is indexed before it is queried and is never
        # removed from the index, so a later document can be reported as a
        # near duplicate of one that was itself removed.
        self.minhash_deduplicator.add_to_lsh(doc_id, signature)

        similar_docs = self.minhash_deduplicator.find_similar_documents(
            doc_id, similarity_threshold
        )

        if similar_docs:
            stats['near_duplicates'] += 1
            best_id, best_similarity = similar_docs[0]
            return {
                'doc_id': doc_id,
                'is_duplicate': True,
                'duplicate_type': 'near',
                'similarity': best_similarity,
                'similar_to': best_id,
                'action': 'remove'
            }

        # Document is unique.
        stats['unique_documents'] += 1
        self.processed_documents[doc_id] = {
            'text': text,
            'signature': signature,
            'exact_hash': exact_hash
        }

        return {
            'doc_id': doc_id,
            'is_duplicate': False,
            'duplicate_type': 'none',
            'similarity': 0.0,
            'action': 'keep'
        }

    def get_deduplication_report(self):
        """Return a copy of the counters, plus rates once documents exist.

        The ``*_rate`` keys (``exact_duplicate_rate``,
        ``near_duplicate_rate``, ``unique_rate``, ``total_duplicate_rate``)
        are only present when at least one document has been processed.
        """
        stats = self.deduplication_stats.copy()
        total = stats['total_documents']

        if total > 0:
            stats['exact_duplicate_rate'] = stats['exact_duplicates'] / total
            stats['near_duplicate_rate'] = stats['near_duplicates'] / total
            stats['unique_rate'] = stats['unique_documents'] / total
            stats['total_duplicate_rate'] = (
                stats['exact_duplicates'] + stats['near_duplicates']
            ) / total

        return stats

# Initialize deduplication pipeline.
# The instance keeps its seen-hash set, MinHash/LSH index and counters across
# process_document() calls, so re-create it before re-running a test cell to
# start from a clean state.
dedup_pipeline = DataDeduplicationPipeline()
print("Data deduplication pipeline initialized!")

### Testing Data Deduplication

Let's test our deduplication system with sample documents:

In [None]:
# Generate test documents with various levels of similarity
def generate_test_documents():
    """Build the fixed deduplication test corpus.

    Returns:
        A list of eleven ``(doc_id, text)`` tuples: five base documents,
        two verbatim copies, two lightly edited copies, one paraphrase and
        one unrelated document, in that order.
    """
    ml_text = "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming."
    dl_text = "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
    nlp_text = "Natural language processing combines computational linguistics with machine learning to help computers understand human language."
    cv_text = "Computer vision is a field of AI that trains computers to interpret and understand visual information from the world."
    rl_text = "Reinforcement learning is a type of machine learning where agents learn to make decisions through trial and error."

    # doc_1 .. doc_5: the originals
    originals = (ml_text, dl_text, nlp_text, cv_text, rl_text)
    corpus = [(f"doc_{number}", body) for number, body in enumerate(originals, start=1)]

    # Verbatim copies of doc_1 and doc_2
    corpus += [
        ("doc_6_exact_dup", ml_text),
        ("doc_7_exact_dup", dl_text),
    ]

    # Lightly edited copies: abbreviations swapped in, and a sentence appended
    abbreviated = ml_text.replace("Machine learning", "ML")
    abbreviated = abbreviated.replace("artificial intelligence", "AI")
    extended = nlp_text + " This technology is widely used in various applications."
    corpus += [
        ("doc_8_near_dup", abbreviated),
        ("doc_9_near_dup", extended),
    ]

    # A paraphrase of the deep-learning text, then an unrelated document
    corpus.append((
        "doc_10_variation",
        "Deep neural networks with many layers are used in deep learning to model complex data patterns.",
    ))
    corpus.append((
        "doc_11_different",
        "The weather today is sunny with a temperature of 25 degrees Celsius. Perfect for outdoor activities.",
    ))

    return corpus

# Build the test corpus
test_documents = generate_test_documents()

print(f"Generated {len(test_documents)} test documents")
print("\nProcessing documents through deduplication pipeline...\n")

# Feed every document through the pipeline, echoing each verdict as we go
results = []
for doc_id, text in test_documents:
    result = dedup_pipeline.process_document(doc_id, text, similarity_threshold=0.7)
    results.append(result)

    if result['is_duplicate']:
        status = "DUPLICATE"
    else:
        status = "UNIQUE"

    print(f"{doc_id}: {status}")
    print(f"  Type: {result['duplicate_type']}")
    print(f"  Similarity: {result['similarity']:.3f}")
    print(f"  Action: {result['action']}")

    # Optional fields are only present for some verdicts
    for optional_key, optional_label in (('similar_to', 'Similar to'), ('note', 'Note')):
        if optional_key in result:
            print(f"  {optional_label}: {result[optional_key]}")
    print()

# Summarise the run; the rate keys are absent when nothing was processed,
# hence the .get(..., 0) fallbacks
report = dedup_pipeline.get_deduplication_report()
exact_rate = report.get('exact_duplicate_rate', 0)
near_rate = report.get('near_duplicate_rate', 0)
unique_rate = report.get('unique_rate', 0)
total_dup_rate = report.get('total_duplicate_rate', 0)

print("=== DEDUPLICATION REPORT ===")
print(f"Total documents processed: {report['total_documents']}")
print(f"Exact duplicates found: {report['exact_duplicates']} ({exact_rate:.1%})")
print(f"Near duplicates found: {report['near_duplicates']} ({near_rate:.1%})")
print(f"Unique documents: {report['unique_documents']} ({unique_rate:.1%})")
print(f"Total duplicate rate: {total_dup_rate:.1%}")

# Visualize results
df_results = pd.DataFrame(results)

# Colours are looked up by category rather than assigned by position:
# value_counts() orders by frequency, so positional colour lists end up on
# the wrong category as soon as the counts change.
colors_map = {'none': 'green', 'exact': 'red', 'near': 'orange'}
pie_colors_map = {'none': 'lightgreen', 'near': 'orange', 'exact': 'red'}
action_colors_map = {'keep': 'green', 'remove': 'red'}

plt.figure(figsize=(15, 10))

# Duplicate type distribution
plt.subplot(2, 3, 1)
duplicate_counts = df_results['duplicate_type'].value_counts()
plt.pie(duplicate_counts.values, labels=duplicate_counts.index, autopct='%1.1f%%',
        colors=[pie_colors_map.get(dtype, 'gray') for dtype in duplicate_counts.index])
plt.title('Duplicate Type Distribution')

# Similarity distribution
plt.subplot(2, 3, 2)
plt.hist(df_results['similarity'], bins=10, alpha=0.7, color='skyblue', edgecolor='black')
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.title('Similarity Score Distribution')

# Action distribution
plt.subplot(2, 3, 3)
action_counts = df_results['action'].value_counts()
plt.bar(action_counts.index, action_counts.values,
        color=[action_colors_map.get(action, 'gray') for action in action_counts.index])
plt.xlabel('Action')
plt.ylabel('Count')
plt.title('Action Distribution')

# Deduplication effectiveness
plt.subplot(2, 3, 4)
categories = ['Total', 'Exact Dup', 'Near Dup', 'Unique']
values = [report['total_documents'], report['exact_duplicates'],
          report['near_duplicates'], report['unique_documents']]
plt.bar(categories, values, color=['blue', 'red', 'orange', 'green'])
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('Deduplication Results')
plt.xticks(rotation=45)

# Similarity vs Document ID
# One scatter call per category, each with its own label, so the legend
# shows the colour that actually belongs to each category. A single scatter
# call with a list of labels would only ever produce one legend entry.
plt.subplot(2, 3, 5)
doc_indices = np.arange(len(df_results))
similarities = df_results['similarity'].to_numpy()
for dtype, legend_label in (('none', 'Unique'),
                            ('exact', 'Exact Duplicate'),
                            ('near', 'Near Duplicate')):
    mask = (df_results['duplicate_type'] == dtype).to_numpy()
    if mask.any():
        plt.scatter(doc_indices[mask], similarities[mask],
                    color=colors_map[dtype], alpha=0.7, s=60, label=legend_label)
plt.xlabel('Document Index')
plt.ylabel('Similarity Score')
plt.title('Similarity Scores by Document')
plt.legend()

# Data reduction effectiveness
plt.subplot(2, 3, 6)
original_size = report['total_documents']
final_size = report['unique_documents']
reduction = original_size - final_size
# Guard the ratio so an empty run reports 0% instead of raising
reduction_rate = reduction / original_size if original_size else 0.0

plt.bar(['Original', 'After Dedup'], [original_size, final_size],
        color=['lightblue', 'darkblue'])
plt.ylabel('Number of Documents')
plt.title(f'Data Reduction: {reduction} docs removed ({reduction_rate:.1%})')

# Add reduction annotation
plt.annotate(f'-{reduction}\n({reduction_rate:.1%})',
             xy=(0.5, (original_size + final_size)/2),
             ha='center', va='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# "Actual duplicates" are identified by the naming convention of the test
# corpus: ids containing 'dup' or 'variation'.
detected_duplicates = [r for r in results if r['is_duplicate']]
confirmed_duplicates = [
    r for r in detected_duplicates
    if 'dup' in r['doc_id'] or 'variation' in r['doc_id']
]

print("\n=== EFFICIENCY ANALYSIS ===")
print(f"Data reduction: {reduction} documents ({reduction_rate:.1%})")
print(f"Storage savings: ~{reduction_rate:.1%} (assuming uniform document sizes)")
print(f"Deduplication precision: {len(confirmed_duplicates)} / {len(detected_duplicates)} detected duplicates were actual duplicates")

## 2. Data Quality Filtering System

Let's implement a comprehensive data quality filtering system:

In [None]:
class DataQualityFilter:
    """Rule-based document quality filter.

    Five independent checks (length, language, content, structure,
    encoding) each return ``(score, passed, issues)``. Scores start at 1.0
    and are multiplied down for every rule violated. The overall score is
    the mean of the five scores, and a recommendation of ``'accept'``,
    ``'review'`` or ``'reject'`` is derived from it.

    Args:
        thresholds: Optional mapping of threshold names to values that
            overrides entries of ``DEFAULT_THRESHOLDS``.
    """

    DEFAULT_THRESHOLDS = {
        'min_length': 50,
        'max_length': 10000,
        'min_words': 10,
        'max_repetition_ratio': 0.3,
        'min_alpha_ratio': 0.6,
        'max_special_char_ratio': 0.1,
        'min_sentence_count': 2,
        'max_avg_word_length': 15
    }

    def __init__(self, thresholds=None):
        # Insertion order is the order in which the checks run and report.
        self.quality_metrics = {
            'length': self.check_length_quality,
            'language': self.check_language_quality,
            'content': self.check_content_quality,
            'structure': self.check_structure_quality,
            'encoding': self.check_encoding_quality
        }

        self.filter_stats = defaultdict(int)

        # Quality thresholds (per-instance copy so overrides stay local)
        self.thresholds = dict(self.DEFAULT_THRESHOLDS)
        if thresholds:
            self.thresholds.update(thresholds)

    def assess_document_quality(self, text, doc_id=None):
        """Run every check on *text* and return the combined assessment.

        Returns:
            A dict with ``doc_id``, ``overall_score`` (mean of the
            individual scores), ``individual_scores``, ``passed_filters``,
            ``failed_filters``, ``quality_issues`` and ``recommendation``.
            ``self.filter_stats`` is updated as a side effect.
        """
        quality_assessment = {
            'doc_id': doc_id,
            'overall_score': 0.0,
            'individual_scores': {},
            'passed_filters': [],
            'failed_filters': [],
            'quality_issues': [],
            'recommendation': 'unknown'
        }

        total_score = 0

        for metric_name, metric_func in self.quality_metrics.items():
            try:
                score, passed, issues = metric_func(text)
            except Exception as e:
                # Best effort: a crashing check counts as a failed filter
                # with score 0 instead of aborting the whole assessment.
                quality_assessment['individual_scores'][metric_name] = 0.0
                quality_assessment['failed_filters'].append(metric_name)
                quality_assessment['quality_issues'].append(f"{metric_name}_error: {str(e)}")
                continue

            quality_assessment['individual_scores'][metric_name] = score
            total_score += score

            if passed:
                quality_assessment['passed_filters'].append(metric_name)
            else:
                quality_assessment['failed_filters'].append(metric_name)
                quality_assessment['quality_issues'].extend(issues)

        quality_assessment['overall_score'] = total_score / len(self.quality_metrics)

        quality_assessment['recommendation'] = self.make_quality_recommendation(
            quality_assessment['overall_score'],
            quality_assessment['failed_filters']
        )

        # Update statistics
        self.filter_stats['total_documents'] += 1
        if quality_assessment['recommendation'] == 'accept':
            self.filter_stats['accepted_documents'] += 1
        elif quality_assessment['recommendation'] == 'reject':
            self.filter_stats['rejected_documents'] += 1
        else:
            self.filter_stats['review_documents'] += 1

        return quality_assessment

    def check_length_quality(self, text):
        """Check character and word counts against the length thresholds.

        Returns:
            ``(score, passed, issues)``.
        """
        char_count = len(text)
        word_count = len(text.split())

        issues = []
        score = 1.0

        if char_count < self.thresholds['min_length']:
            issues.append(f"Too short: {char_count} chars < {self.thresholds['min_length']}")
            score *= 0.3
        elif char_count > self.thresholds['max_length']:
            issues.append(f"Too long: {char_count} chars > {self.thresholds['max_length']}")
            score *= 0.7

        if word_count < self.thresholds['min_words']:
            issues.append(f"Too few words: {word_count} < {self.thresholds['min_words']}")
            score *= 0.5

        return score, not issues, issues

    def check_language_quality(self, text):
        """Check the character-class make-up of the text.

        Looks at the share of alphabetic characters, the share of
        "special" characters (neither letter, digit nor whitespace), and
        control characters other than tab/newline/carriage return. Despite
        the name, no language identification is performed.

        Returns:
            ``(score, passed, issues)``; ``(0.0, False, ['Empty text'])``
            for empty input.
        """
        total_chars = len(text)
        if total_chars == 0:
            return 0.0, False, ['Empty text']

        issues = []
        score = 1.0

        alpha_chars = sum(1 for c in text if c.isalpha())
        digit_chars = sum(1 for c in text if c.isdigit())
        space_chars = sum(1 for c in text if c.isspace())
        special_chars = total_chars - alpha_chars - digit_chars - space_chars

        alpha_ratio = alpha_chars / total_chars
        special_ratio = special_chars / total_chars

        if alpha_ratio < self.thresholds['min_alpha_ratio']:
            issues.append(f"Low alphabetic ratio: {alpha_ratio:.2f} < {self.thresholds['min_alpha_ratio']}")
            score *= 0.6

        if special_ratio > self.thresholds['max_special_char_ratio']:
            issues.append(f"High special char ratio: {special_ratio:.2f} > {self.thresholds['max_special_char_ratio']}")
            score *= 0.7

        non_printable = sum(1 for c in text if ord(c) < 32 and c not in '\t\n\r')
        if non_printable > 0:
            issues.append(f"Contains {non_printable} non-printable characters")
            score *= 0.8

        return score, not issues, issues

    def check_content_quality(self, text):
        """Check word repetition, average word length and capitalization.

        Returns:
            ``(score, passed, issues)``; ``(0.0, False, ['No words found'])``
            when the text contains no whitespace-separated words.
        """
        words = text.lower().split()
        if not words:
            return 0.0, False, ['No words found']

        issues = []
        score = 1.0

        # Share of the text taken up by the single most frequent word
        most_common_word, max_count = Counter(words).most_common(1)[0]
        repetition_ratio = max_count / len(words)
        if repetition_ratio > self.thresholds['max_repetition_ratio']:
            issues.append(f"High repetition: '{most_common_word}' appears {repetition_ratio:.2f} of the time")
            score *= 0.5

        avg_word_length = sum(len(word) for word in words) / len(words)
        if avg_word_length > self.thresholds['max_avg_word_length']:
            issues.append(f"Unusually long words: avg {avg_word_length:.1f} chars")
            score *= 0.8

        # len(text) > 0 here because at least one word was found
        caps_ratio = sum(1 for c in text if c.isupper()) / len(text)
        if caps_ratio > 0.3:
            issues.append(f"Excessive capitalization: {caps_ratio:.2f}")
            score *= 0.7

        return score, not issues, issues

    @staticmethod
    def _split_sentences(text):
        """Split *text* into sentences, preferring NLTK's tokenizer.

        Falls back to splitting after ``.``, ``!`` or ``?`` followed by
        whitespace when NLTK's tokenizer data is not installed, which NLTK
        signals with ``LookupError``.
        """
        try:
            return nltk.sent_tokenize(text)
        except LookupError:
            pieces = re.split(r'(?<=[.!?])\s+', text.strip())
            return [piece for piece in pieces if piece]

    def check_structure_quality(self, text):
        """Check sentence count, average sentence length and paragraphing.

        Returns:
            ``(score, passed, issues)``.
        """
        issues = []
        score = 1.0

        sentences = self._split_sentences(text)
        sentence_count = len(sentences)

        if sentence_count < self.thresholds['min_sentence_count']:
            issues.append(f"Too few sentences: {sentence_count} < {self.thresholds['min_sentence_count']}")
            score *= 0.6

        if sentences:
            sentence_lengths = [len(sent.split()) for sent in sentences]
            avg_sent_length = sum(sentence_lengths) / len(sentence_lengths)

            if avg_sent_length < 3:
                issues.append(f"Very short sentences: avg {avg_sent_length:.1f} words")
                score *= 0.7
            elif avg_sent_length > 50:
                issues.append(f"Very long sentences: avg {avg_sent_length:.1f} words")
                score *= 0.8

        # Simple heuristic: a long text with no blank line has no paragraphs
        paragraphs = text.split('\n\n')
        if len(paragraphs) == 1 and len(text) > 1000:
            issues.append("Long text without paragraph breaks")
            score *= 0.9

        return score, not issues, issues

    def check_encoding_quality(self, text):
        """Check for encoding artifacts and whitespace anomalies.

        Returns:
            ``(score, passed, issues)``.
        """
        issues = []
        score = 1.0

        # U+FFFD (the replacement character) and NUL. Each artifact is
        # listed once so that a single bad character is reported and
        # penalized once.
        encoding_artifacts = ('\ufffd', '\x00')
        for artifact in encoding_artifacts:
            if artifact in text:
                issues.append(f"Contains encoding artifact: {repr(artifact)}")
                score *= 0.5

        whitespace_ratio = sum(1 for c in text if c.isspace()) / len(text) if text else 0
        if whitespace_ratio > 0.4:
            issues.append(f"Excessive whitespace: {whitespace_ratio:.2f}")
            score *= 0.8

        if re.search(r'\s{10,}', text):
            issues.append("Contains long sequences of whitespace")
            score *= 0.9

        return score, not issues, issues

    def make_quality_recommendation(self, overall_score, failed_filters):
        """Turn a score and the failed filters into a recommendation.

        A failed ``length`` or ``encoding`` filter rejects outright.
        Otherwise a score of at least 0.8 is accepted, a score below 0.4 is
        rejected, and anything in between is flagged for review.
        """
        critical_filters = ('length', 'encoding')
        if any(name in failed_filters for name in critical_filters):
            return 'reject'

        if overall_score >= 0.8:
            return 'accept'
        if overall_score < 0.4:
            return 'reject'
        return 'review'

    def get_filter_statistics(self):
        """Return the counters as a plain dict, plus rates once available.

        The ``acceptance_rate``, ``rejection_rate`` and ``review_rate`` keys
        are only present after at least one document has been assessed.
        """
        stats = dict(self.filter_stats)

        total = stats.get('total_documents', 0)
        if total > 0:
            stats['acceptance_rate'] = stats.get('accepted_documents', 0) / total
            stats['rejection_rate'] = stats.get('rejected_documents', 0) / total
            stats['review_rate'] = stats.get('review_documents', 0) / total

        return stats

# Initialize quality filter.
# The instance accumulates accept/reject/review counters in filter_stats on
# every assess_document_quality() call, so re-create it to reset them.
quality_filter = DataQualityFilter()
print("Data quality filter initialized!")

### Testing Data Quality Filtering

Let's test our quality filtering system with various document types:

In [None]:
# Generate test documents with various quality levels
def generate_quality_test_documents():
    """Build the fixed corpus used to exercise the quality filter.

    Returns:
        A list of ten ``(doc_id, text)`` tuples, each text crafted to show
        one quality characteristic named by its id.
    """
    # Keyed by doc id; dict insertion order fixes the order of the result.
    samples = {
        # Clean, well-formed prose
        "high_quality": (
            "Machine learning has revolutionized many industries by enabling computers to learn from data. "
            "This technology uses algorithms to identify patterns and make predictions without explicit programming. "
            "Applications range from image recognition to natural language processing, transforming how we interact with technology."
        ),

        # Far below the minimum length
        "too_short": "ML is good.",

        # Every word repeated several times
        "repetitive": (
            "The the the the the machine machine machine learning learning learning is is is very very very good good good. "
            "Machine machine machine learning learning learning helps helps helps us us us solve solve solve problems problems problems."
        ),

        # Letters swapped for symbols and digits
        "special_chars": (
            "M@ch!n3 l3@rn!ng !$ @ $ub$3t 0f @rt!f!c!@l !nt3ll!g3nc3 th@t 3n@bl3$ c0mput3r$ t0 l3@rn fr0m d@t@. "
            "Th!$ t3chn0l0gy u$3$ @lg0r!thm$ t0 !d3nt!fy p@tt3rn$ @nd m@k3 pr3d!ct!0n$."
        ),

        # All upper case
        "excessive_caps": (
            "MACHINE LEARNING IS A SUBSET OF ARTIFICIAL INTELLIGENCE THAT ENABLES COMPUTERS TO LEARN FROM DATA. "
            "THIS TECHNOLOGY USES ALGORITHMS TO IDENTIFY PATTERNS AND MAKE PREDICTIONS WITHOUT EXPLICIT PROGRAMMING."
        ),

        # One single run-on sentence
        "long_sentences": (
            "Machine learning which is a subset of artificial intelligence that enables computers to learn from data "
            "without explicit programming and uses algorithms to identify patterns and make predictions has revolutionized "
            "many industries including healthcare finance transportation and technology by providing automated solutions "
            "that can process vast amounts of information and extract meaningful insights that would be impossible for "
            "humans to analyze manually in a reasonable timeframe."
        ),

        # Simulated decoding damage: a replacement character and a NUL byte
        "encoding_issues": (
            "Machine learning is a subset of artificial intelligence� that enables computers to learn from data. "
            "This technology uses algorithms\x00 to identify patterns and make predictions."
        ),

        # Acceptable but shallow
        "medium_quality": (
            "AI and machine learning are important. They help solve problems. Many companies use these technologies. "
            "The future looks promising for AI development and implementation in various sectors."
        ),

        # Nothing at all
        "empty": "",

        # Words separated by long runs of spaces
        "whitespace_issues": (
            "Machine    learning         is    a    subset         of    artificial         intelligence.    "
            "This         technology         uses         algorithms         to         identify         patterns."
        ),
    }

    return list(samples.items())

# Build the quality test corpus and assess each document in turn
quality_test_docs = generate_quality_test_documents()

print(f"Testing {len(quality_test_docs)} documents for quality...\n")

quality_results = []
for doc_id, text in quality_test_docs:
    assessment = quality_filter.assess_document_quality(text, doc_id)
    quality_results.append(assessment)

    # Empty filter lists are shown as the word 'None'
    if assessment['passed_filters']:
        passed_text = ', '.join(assessment['passed_filters'])
    else:
        passed_text = 'None'
    if assessment['failed_filters']:
        failed_text = ', '.join(assessment['failed_filters'])
    else:
        failed_text = 'None'

    print(f"Document: {doc_id}")
    print(f"  Overall Score: {assessment['overall_score']:.3f}")
    print(f"  Recommendation: {assessment['recommendation'].upper()}")
    print(f"  Passed Filters: {passed_text}")
    print(f"  Failed Filters: {failed_text}")

    if assessment['quality_issues']:
        print("  Issues:")
        # Only the first three issues are shown to keep the output short
        for issue in assessment['quality_issues'][:3]:
            print(f"    - {issue}")

    print("  Individual Scores:")
    for metric, score in assessment['individual_scores'].items():
        print(f"    {metric}: {score:.3f}")
    print()

# Aggregate counters; the rate keys may be absent, hence the .get fallbacks
filter_stats = quality_filter.get_filter_statistics()
accepted_total = filter_stats.get('accepted_documents', 0)
rejected_total = filter_stats.get('rejected_documents', 0)
review_total = filter_stats.get('review_documents', 0)

print("=== QUALITY FILTERING STATISTICS ===")
print(f"Total documents: {filter_stats['total_documents']}")
print(f"Accepted: {accepted_total} ({filter_stats.get('acceptance_rate', 0):.1%})")
print(f"Rejected: {rejected_total} ({filter_stats.get('rejection_rate', 0):.1%})")
print(f"Need Review: {review_total} ({filter_stats.get('review_rate', 0):.1%})")

# Visualize quality assessment results
df_quality = pd.DataFrame(quality_results)

plt.figure(figsize=(16, 12))

# Overall quality scores
plt.subplot(3, 3, 1)
doc_names = [result['doc_id'] for result in quality_results]
scores = [result['overall_score'] for result in quality_results]
# One colour per document. This list has its own name because it is needed
# again for the length scatter plot; reusing a generic `colors` variable let
# the pie chart's 3-element list overwrite it, and scatter() rejects a
# colour list whose length differs from the number of points.
score_colors = ['green' if score >= 0.8 else 'orange' if score >= 0.4 else 'red' for score in scores]
plt.bar(range(len(scores)), scores, color=score_colors)
plt.xlabel('Document')
plt.ylabel('Quality Score')
plt.title('Overall Quality Scores')
plt.xticks(range(len(doc_names)), doc_names, rotation=45, ha='right')
plt.ylim(0, 1)

# Recommendation distribution
plt.subplot(3, 3, 2)
recommendations = [result['recommendation'] for result in quality_results]
rec_counts = Counter(recommendations)
# Colours are looked up by recommendation; a positional list would follow
# the order in which recommendations first occur in the results.
recommendation_colors = {'accept': 'green', 'review': 'orange', 'reject': 'red'}
rec_labels = list(rec_counts.keys())
# Dict views are materialised as lists before being handed to matplotlib
plt.pie([rec_counts[label] for label in rec_labels], labels=rec_labels, autopct='%1.1f%%',
        colors=[recommendation_colors.get(label, 'gray') for label in rec_labels])
plt.title('Recommendation Distribution')

# Individual metric scores heatmap
plt.subplot(3, 3, 3)
metrics = ['length', 'language', 'content', 'structure', 'encoding']
score_matrix = np.array([
    [result['individual_scores'].get(metric, 0) for metric in metrics]
    for result in quality_results
])
im = plt.imshow(score_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
plt.colorbar(im)
plt.xlabel('Quality Metrics')
plt.ylabel('Documents')
plt.title('Individual Metric Scores')
plt.xticks(range(len(metrics)), metrics, rotation=45)
plt.yticks(range(len(doc_names)), doc_names)

# Failed filters analysis
plt.subplot(3, 3, 4)
all_failed_filters = []
for result in quality_results:
    all_failed_filters.extend(result['failed_filters'])
failed_counts = Counter(all_failed_filters)
if failed_counts:
    plt.bar(list(failed_counts.keys()), list(failed_counts.values()), color='red', alpha=0.7)
    plt.xlabel('Filter Type')
    plt.ylabel('Failure Count')
    plt.title('Most Common Filter Failures')
    plt.xticks(rotation=45)
else:
    plt.text(0.5, 0.5, 'No Filter Failures', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Filter Failures')

# Quality score distribution
plt.subplot(3, 3, 5)
plt.hist(scores, bins=10, alpha=0.7, color='skyblue', edgecolor='black')
plt.axvline(x=0.8, color='green', linestyle='--', label='Accept Threshold')
plt.axvline(x=0.4, color='red', linestyle='--', label='Reject Threshold')
plt.xlabel('Quality Score')
plt.ylabel('Frequency')
plt.title('Quality Score Distribution')
plt.legend()

# Quality vs Document Length
plt.subplot(3, 3, 6)
doc_lengths = [len(doc[1]) for doc in quality_test_docs]
plt.scatter(doc_lengths, scores, c=score_colors, alpha=0.7, s=60)
plt.xlabel('Document Length (characters)')
plt.ylabel('Quality Score')
plt.title('Quality vs Document Length')

# Filter pass rate by metric
plt.subplot(3, 3, 7)
pass_rates = {}
for metric in metrics:
    passed = sum(1 for result in quality_results if metric in result['passed_filters'])
    pass_rates[metric] = passed / len(quality_results)

plt.bar(list(pass_rates.keys()), list(pass_rates.values()), color='lightblue')
plt.xlabel('Quality Metric')
plt.ylabel('Pass Rate')
plt.title('Filter Pass Rates')
plt.xticks(rotation=45)
plt.ylim(0, 1)

# Quality improvement potential
plt.subplot(3, 3, 8)
improvement_potential = [1 - score for score in scores]
plt.bar(range(len(improvement_potential)), improvement_potential, color='orange', alpha=0.7)
plt.xlabel('Document')
plt.ylabel('Improvement Potential')
plt.title('Quality Improvement Potential')
plt.xticks(range(len(doc_names)), doc_names, rotation=45, ha='right')

# Data retention after filtering
plt.subplot(3, 3, 9)
accepted = sum(1 for r in recommendations if r == 'accept')
rejected = sum(1 for r in recommendations if r == 'reject')
review = sum(1 for r in recommendations if r == 'review')

categories = ['Original', 'After Filtering']
original_count = len(quality_results)
retained_count = accepted + review  # Assuming review items might be kept
retention_rate = retained_count / original_count if original_count else 0.0

plt.bar(categories, [original_count, retained_count], color=['lightblue', 'darkblue'])
plt.ylabel('Number of Documents')
plt.title(f'Data Retention: {retained_count}/{original_count} ({retention_rate:.1%})')

plt.tight_layout()
plt.show()

print("\n=== QUALITY ANALYSIS SUMMARY ===")
print(f"Average quality score: {np.mean(scores):.3f}")
print(f"Quality score std dev: {np.std(scores):.3f}")
print(f"Highest quality: {max(scores):.3f} ({doc_names[scores.index(max(scores))]})")
print(f"Lowest quality: {min(scores):.3f} ({doc_names[scores.index(min(scores))]})")
print(f"Data retention rate: {retention_rate:.1%}")
print(f"Most common quality issues: {dict(Counter(all_failed_filters).most_common(3))}")