In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Download necessary NLTK data.
# 'punkt' backs word_tokenize on older NLTK releases, while NLTK 3.9+ looks up
# 'punkt_tab' instead; requesting both keeps this cell runnable on either.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


# Input text
text = "This is an example sentence, showing off the text pre-processing pipeline with NLTK!"
print(f"Original text: {text}\n")

# Step 1: Tokenization
tokens = word_tokenize(text)
print(f"After Tokenization: {tokens}")
print("Explanation: Tokenization breaks the text into individual words or tokens. This is the first step in understanding the structure of the text.\n")

# Step 2: Lowercasing
tokens = [word.lower() for word in tokens]
print(f"After Lowercasing: {tokens}")
print("Explanation: Lowercasing converts all characters to lowercase. This ensures that words like 'The' and 'the' are treated as the same word, reducing the size of the vocabulary.\n")

# Step 3: Removing Punctuation
# A token is dropped only when every one of its characters is punctuation.
# The previous substring test (`word not in string.punctuation`) let
# multi-character punctuation tokens such as '...' or '--' slip through,
# because they are not contiguous substrings of string.punctuation.
punctuation_chars = set(string.punctuation)
tokens = [
    word for word in tokens
    if not all(char in punctuation_chars for char in word)
]
print(f"After Removing Punctuation: {tokens}")
print("Explanation: Removing punctuation removes symbols that do not contribute to the meaning of the words. This helps to focus on the actual content and reduces noise.\n")

# Step 4: Removing Stop Words
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
tokens = [word for word in tokens if word not in stop_words]
print(f"After Removing Stop Words: {tokens}")
print("Explanation: Removing stop words eliminates common words that do not carry significant meaning (e.g., 'this', 'is', 'an'). This helps to reduce the dimensionality of the data and focus on more important terms.\n")

# Step 5: Stemming
stemmer = PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]
print(f"After Stemming: {tokens}")
print("Explanation: Stemming reduces words to their root form (e.g., 'showing' becomes 'show'). This helps to group words with similar meanings together, further reducing the vocabulary size and improving the efficiency of machine learning models.\n")

print("\nImportance for building vocabulary for machine learning:")
print("Each of these pre-processing steps helps to clean and normalize the text data. By reducing variations in words (due to casing, punctuation, or morphology) and removing irrelevant terms, we create a more focused and smaller set of unique words, which forms the vocabulary for machine learning models. A smaller and cleaner vocabulary leads to more efficient and effective model training.")

Original text: This is an example sentence, showing off the text pre-processing pipeline with NLTK!

After Tokenization: ['This', 'is', 'an', 'example', 'sentence', ',', 'showing', 'off', 'the', 'text', 'pre-processing', 'pipeline', 'with', 'NLTK', '!']
Explanation: Tokenization breaks the text into individual words or tokens. This is the first step in understanding the structure of the text.

After Lowercasing: ['this', 'is', 'an', 'example', 'sentence', ',', 'showing', 'off', 'the', 'text', 'pre-processing', 'pipeline', 'with', 'nltk', '!']
Explanation: Lowercasing converts all characters to lowercase. This ensures that words like 'The' and 'the' are treated as the same word, reducing the size of the vocabulary.

After Removing Punctuation: ['this', 'is', 'an', 'example', 'sentence', 'showing', 'off', 'the', 'text', 'pre-processing', 'pipeline', 'with', 'nltk']
Explanation: Removing punctuation removes symbols that do not contribute to the meaning of the words. This helps to focus on

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from collections import Counter

# Download required NLTK data (run once).
# NLTK 3.9+ resolves word_tokenize and pos_tag through the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' resources, while older releases use the
# 'punkt' and 'averaged_perceptron_tagger' names. Both generations are
# requested so the cell works regardless of the installed NLTK version.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(resource)


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as returned by pos_tag) to a WordNet POS code."""
    # Adjective, verb and adverb tags start with J, V and R respectively;
    # everything else falls back to noun, the lemmatizer's own default.
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


def text_preprocessing_pipeline(text):
    """
    Complete text preprocessing pipeline using NLTK
    Shows each step and explains the changes made

    Steps: lowercasing, contraction expansion, removal of non-letters,
    whitespace normalization, tokenization, stopword removal, POS tagging,
    stemming (shown for comparison only) and POS-aware lemmatization.

    Args:
        text: Raw input string.

    Returns:
        List of lemmatized tokens (stopwords removed). The stemmed tokens are
        printed for comparison but are not part of the return value.
    """

    print("="*80)
    print("TEXT PREPROCESSING PIPELINE FOR MACHINE LEARNING")
    print("="*80)

    # Original text
    print(f"\n📝 ORIGINAL TEXT:")
    print(f"'{text}'")
    print(f"Character count: {len(text)}")
    print(f"Word count (rough): {len(text.split())}")

    # Step 1: Case normalization (Lowercasing)
    print(f"\n🔽 STEP 1: LOWERCASING")
    text_lower = text.lower()
    print(f"Output: '{text_lower}'")
    print(f"Changes made: All uppercase letters converted to lowercase")
    print(f"🎯 ML Importance: Ensures 'Apple', 'APPLE', and 'apple' are treated as the same word")
    print(f"   This prevents the model from creating separate features for the same concept.")

    # Step 2: Expanding contractions (basic approach)
    print(f"\n📖 STEP 2: EXPANDING CONTRACTIONS")
    # Order matters: whole-word entries ("don't", "won't", "can't") must be
    # replaced before the generic "n't" suffix rule gets a chance to fire.
    contractions = {
        "don't": "do not", "won't": "will not", "can't": "cannot",
        "n't": " not", "'re": " are", "'ve": " have", "'ll": " will",
        "'d": " would", "'m": " am", "it's": "it is", "that's": "that is"
    }
    # Typographic apostrophes (U+2019) would otherwise bypass every rule above
    # and be stripped in step 3, turning e.g. "don’t" into "dont".
    text_expanded = text_lower.replace("\u2019", "'")
    for contraction, expansion in contractions.items():
        text_expanded = text_expanded.replace(contraction, expansion)

    print(f"Output: '{text_expanded}'")
    print(f"Changes made: Contractions expanded to full forms")
    print(f"🎯 ML Importance: Standardizes language - 'don't' and 'do not' become the same feature")

    # Step 3: Remove special characters and digits (keeping spaces)
    print(f"\n🧹 STEP 3: REMOVING SPECIAL CHARACTERS & DIGITS")
    text_clean = re.sub(r'[^a-zA-Z\s]', '', text_expanded)
    print(f"Output: '{text_clean}'")
    print(f"Changes made: Removed punctuation, numbers, and special characters")
    print(f"🎯 ML Importance: Reduces noise and focuses on meaningful words")
    print(f"   Prevents creating separate features for 'word' vs 'word!' vs 'word?'")

    # Step 4: Remove extra whitespaces
    print(f"\n⚪ STEP 4: WHITESPACE NORMALIZATION")
    text_normalized = re.sub(r'\s+', ' ', text_clean).strip()
    print(f"Output: '{text_normalized}'")
    print(f"Changes made: Multiple spaces converted to single spaces, leading/trailing spaces removed")
    print(f"🎯 ML Importance: Prevents tokenization errors and ensures consistent formatting")

    # Step 5: Tokenization
    print(f"\n🔪 STEP 5: TOKENIZATION")
    tokens = word_tokenize(text_normalized)
    print(f"Output: {tokens}")
    print(f"Changes made: Text split into individual words (tokens)")
    print(f"Token count: {len(tokens)}")
    print(f"🎯 ML Importance: Converts text into discrete units that algorithms can process")
    print(f"   Each token becomes a potential feature in your model")

    # Step 6: Remove stopwords
    print(f"\n🚫 STEP 6: STOPWORD REMOVAL")
    stop_words = set(stopwords.words('english'))
    tokens_no_stop = [token for token in tokens if token not in stop_words]
    print(f"Stopwords found and removed: {[token for token in tokens if token in stop_words]}")
    print(f"Output: {tokens_no_stop}")
    print(f"🎯 ML Importance: Removes high-frequency, low-information words")
    print(f"   Reduces vocabulary size and focuses on meaningful content words")
    print(f"   Token count reduced from {len(tokens)} to {len(tokens_no_stop)}")

    # Step 7: Part-of-Speech Tagging (also feeds the lemmatizer in step 9)
    print(f"\n🏷️  STEP 7: PART-OF-SPEECH TAGGING")
    pos_tags = pos_tag(tokens_no_stop)
    print(f"Output: {pos_tags}")
    print(f"Changes made: Each word tagged with its grammatical role")
    print(f"🎯 ML Importance: Helps distinguish word meanings (e.g., 'run' as noun vs verb)")
    print(f"   Can be used for advanced preprocessing like keeping only nouns/verbs")

    # Step 8: Stemming
    print(f"\n🌱 STEP 8: STEMMING")
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens_no_stop]
    print(f"Output: {stemmed_tokens}")
    print(f"Changes made: Words reduced to their root forms")

    # Show stemming examples (only the tokens the stemmer actually changed)
    stemming_examples = {
        original: stemmed
        for original, stemmed in zip(tokens_no_stop, stemmed_tokens)
        if original != stemmed
    }

    if stemming_examples:
        print(f"Stemming examples: {stemming_examples}")

    print(f"🎯 ML Importance: Groups related words together (running, runs, ran → run)")
    print(f"   Reduces vocabulary size and improves generalization")

    # Step 9: Lemmatization
    print(f"\n🎯 STEP 9: LEMMATIZATION")
    lemmatizer = WordNetLemmatizer()
    # WordNetLemmatizer treats every word as a noun unless told otherwise, so
    # verbs and adjectives would pass through unchanged. Reusing the tags from
    # step 7 lets it reduce e.g. verb forms to their base form.
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, _wordnet_pos(tag))
        for token, tag in pos_tags
    ]
    print(f"Output: {lemmatized_tokens}")
    print(f"Changes made: Words converted to their dictionary base forms")

    # Show lemmatization examples (only the tokens the lemmatizer changed)
    lemma_examples = {
        original: lemmatized
        for original, lemmatized in zip(tokens_no_stop, lemmatized_tokens)
        if original != lemmatized
    }

    if lemma_examples:
        print(f"Lemmatization examples: {lemma_examples}")

    print(f"🎯 ML Importance: More accurate than stemming - preserves word meaning")
    print(f"   'better' becomes 'good', not 'bett' (as stemming would do)")

    # Final vocabulary analysis
    print(f"\n📊 FINAL VOCABULARY ANALYSIS")
    print(f"Original text length: {len(text)} characters")
    print(f"Final processed tokens: {lemmatized_tokens}")
    print(f"Unique tokens in final vocabulary: {len(set(lemmatized_tokens))}")
    print(f"Total tokens: {len(lemmatized_tokens)}")

    # Show vocabulary reduction (baseline = lowercased whitespace-split words)
    original_vocab = set(text.lower().split())
    final_vocab = set(lemmatized_tokens)
    print(f"\nVocabulary size reduction:")
    print(f"  Before preprocessing: {len(original_vocab)} unique words")
    print(f"  After preprocessing: {len(final_vocab)} unique words")
    print(f"  Reduction: {len(original_vocab) - len(final_vocab)} words removed")

    print(f"\n🚀 READY FOR MACHINE LEARNING!")
    print(f"Clean tokens: {lemmatized_tokens}")

    return lemmatized_tokens

# Demo input: a short review-style text mixing contractions, shouting
# capitals and heavy punctuation.
sample_text = (
    "I'm absolutely LOVING the new iPhone's camera!!! "
    "It's amazing and I can't believe how great the photos are. "
    "The developers have really improved the technology significantly."
)

# Walk the sample through every pipeline stage; the cleaned tokens come back.
final_tokens = text_preprocessing_pipeline(sample_text)

# Closing summary, framed by the same 80-character rule used by the pipeline.
banner_rule = "=" * 80
print("\n" + banner_rule)
print("WHY TEXT PREPROCESSING MATTERS FOR MACHINE LEARNING")
print(banner_rule)
print("""
🎯 VOCABULARY CONSISTENCY:
   - Without preprocessing: ['iPhone', 'iphone', 'IPHONE'] = 3 different features
   - With preprocessing: ['iphone'] = 1 consistent feature

🎯 NOISE REDUCTION:
   - Removes irrelevant punctuation, numbers, and special characters
   - Focuses model attention on meaningful content

🎯 DIMENSIONALITY REDUCTION:
   - Stemming/Lemmatization: ['running', 'runs', 'ran'] → ['run']
   - Stopword removal: Eliminates 'the', 'is', 'and', etc.
   - Reduces feature space size → faster training, less overfitting

🎯 IMPROVED GENERALIZATION:
   - Normalized text helps model recognize patterns across different writing styles
   - Model learns from clean, consistent data representation

🎯 COMPUTATIONAL EFFICIENCY:
   - Smaller vocabulary = fewer dimensions = faster processing
   - Less memory usage during training and inference

The final clean tokens become your MODEL'S VOCABULARY - the foundation for all
machine learning tasks like classification, sentiment analysis, or text generation!
""")

In [2]:
import spacy
import pandas as pd
import json
import time
import re
import string
from pathlib import Path
from typing import List, Dict, Any, Union, Optional
from collections import Counter, defaultdict
import numpy as np
from datetime import datetime
import logging
from dataclasses import dataclass, asdict
import os

# Module-level logger, with the root logger set to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

@dataclass
class PreprocessingMetrics:
    """Container for preprocessing performance metrics

    The field comments below describe the values as they are populated by
    ProductionTextPreprocessor.compute_performance_metrics. Field order is
    part of the constructor signature and must not be rearranged.
    """
    # Timing metrics
    total_processing_time: float  # sum of per-document processing times (seconds)
    avg_processing_time_per_doc: float  # total_processing_time / total_documents
    preprocessing_speed_tokens_per_sec: float  # output tokens per second; 0 when no time elapsed

    # Volume metrics
    total_documents: int  # number of processed documents
    total_tokens_before: int  # whitespace-split word count over the original texts
    total_tokens_after: int  # tokens kept after preprocessing
    token_reduction_rate: float  # (before - after) / before; 0 when before is 0

    # Vocabulary metrics
    unique_tokens_before: int  # distinct whitespace-split words in the original texts
    unique_tokens_after: int  # distinct tokens after preprocessing
    vocabulary_reduction_rate: float  # (unique before - unique after) / unique before

    # Quality metrics
    avg_tokens_per_document: float  # total_tokens_after / total_documents
    empty_documents_after_processing: int  # documents whose processed text is blank
    most_common_tokens: List[tuple]  # (token, count) pairs, at most the 10 most frequent

    # Linguistic metrics
    named_entities_found: int  # total named entities across all documents
    pos_tag_distribution: Dict[str, int]  # POS tag -> number of kept tokens with that tag
    sentence_count: int  # total sentences across all documents
    avg_sentence_length: float  # total_tokens_after / sentence_count; 0 when no sentences

    # Memory metrics
    memory_usage_mb: float  # rough estimate: processed-text character count / 1024**2

    def to_dict(self) -> Dict[str, Any]:
        """Return the metrics as a plain dict keyed by field name (via dataclasses.asdict)."""
        return asdict(self)

class ProductionTextPreprocessor:
    """
    Configurable spaCy-based text preprocessing pipeline.

    Covers single strings (``preprocess_text``), whole files
    (``process_file``), persistence of results (``save_results``) and
    aggregate reporting (``compute_performance_metrics`` and
    ``generate_performance_report``).
    """

    # Removes every character that is neither a word character nor whitespace.
    _NON_WORD_RE = re.compile(r'[^\w\s]')

    # Output formats understood by save_results.
    _OUTPUT_FORMATS = ('json', 'csv', 'jsonl')

    def __init__(self,
                 model_name: str = "en_core_web_sm",
                 custom_stopwords: Optional[List[str]] = None,
                 keep_pos: Optional[List[str]] = None,
                 min_token_length: int = 2,
                 max_token_length: int = 20,
                 remove_punct: bool = True,
                 remove_spaces: bool = True,
                 remove_numbers: bool = False,
                 lemmatize: bool = True,
                 lowercase: bool = True):
        """
        Initialize the preprocessing pipeline

        Args:
            model_name: spaCy model to use
            custom_stopwords: Additional stopwords to remove (compared against
                the lowercased token text)
            keep_pos: POS tags to keep (e.g., ['NOUN', 'VERB', 'ADJ']);
                None or an empty list keeps every tag
            min_token_length: Minimum token length to keep
            max_token_length: Maximum token length to keep
            remove_punct: Remove punctuation
            remove_spaces: Remove spaces
            remove_numbers: Remove numeric tokens
            lemmatize: Apply lemmatization
            lowercase: Convert to lowercase

        Raises:
            OSError: If the spaCy model is not installed (re-raised after
                logging an installation hint).
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            logger.error(f"Model {model_name} not found. Please install with: python -m spacy download {model_name}")
            raise

        # Configure pipeline settings
        self.custom_stopwords = set(custom_stopwords) if custom_stopwords else set()
        self.keep_pos = set(keep_pos) if keep_pos else None
        self.min_token_length = min_token_length
        self.max_token_length = max_token_length
        self.remove_punct = remove_punct
        self.remove_spaces = remove_spaces
        self.remove_numbers = remove_numbers
        self.lemmatize = lemmatize
        self.lowercase = lowercase

        # Performance tracking (kept as a public attribute; not written to by
        # any method of this class)
        self.processing_stats = defaultdict(list)

        logger.info(f"Initialized TextPreprocessor with model: {model_name}")

    def _should_skip(self, token) -> bool:
        """Return True when the configured filters exclude this spaCy token."""
        if self.remove_punct and token.is_punct:
            return True
        if self.remove_spaces and token.is_space:
            return True
        if self.remove_numbers and token.like_num:
            return True
        # Built-in stopwords are always removed; custom ones match on lowercase.
        if token.is_stop or token.text.lower() in self.custom_stopwords:
            return True
        # Length limits apply to the raw token text, not the lemma.
        if not self.min_token_length <= len(token.text) <= self.max_token_length:
            return True
        return bool(self.keep_pos) and token.pos_ not in self.keep_pos

    def preprocess_text(self, text: str) -> Dict[str, Any]:
        """
        Preprocess a single text document

        Args:
            text: Input text string. Non-string or blank input is accepted and
                yields an empty result instead of raising.

        Returns:
            Dictionary with keys 'original_text', 'processed_text', 'tokens'
            and 'metadata' (lengths, token count, processing time, entities,
            POS counts of the kept tokens, sentence count)
        """
        if not isinstance(text, str) or not text.strip():
            return {
                'original_text': text,
                'processed_text': '',
                'tokens': [],
                'metadata': {
                    # len() is only meaningful for strings; anything else
                    # (None, NaN, numbers) has no text length.
                    'original_length': len(text) if isinstance(text, str) else 0,
                    'processed_length': 0,
                    'token_count': 0,
                    'processing_time': 0,
                    'entities': [],
                    'pos_tags': {},
                    'sentences': 0
                }
            }

        start_time = time.time()

        # Process with spaCy
        doc = self.nlp(text)

        # Extract tokens based on configuration
        processed_tokens = []
        pos_counts = Counter()

        for token in doc:
            if self._should_skip(token):
                continue

            processed_token = token.lemma_ if self.lemmatize else token.text
            if self.lowercase:
                processed_token = processed_token.lower()

            # Additional cleaning: drop punctuation left inside the token
            processed_token = self._NON_WORD_RE.sub('', processed_token)

            # Cleaning can leave nothing behind; count only surviving tokens
            if processed_token.strip():
                processed_tokens.append(processed_token)
                pos_counts[token.pos_] += 1

        # Extract named entities (from the full document, not the kept tokens)
        entities = [
            {
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char
            }
            for ent in doc.ents
        ]

        # Create processed text
        processed_text = ' '.join(processed_tokens)

        processing_time = time.time() - start_time

        return {
            'original_text': text,
            'processed_text': processed_text,
            'tokens': processed_tokens,
            'metadata': {
                'original_length': len(text),
                'processed_length': len(processed_text),
                'token_count': len(processed_tokens),
                'processing_time': processing_time,
                'entities': entities,
                'pos_tags': dict(pos_counts),
                'sentences': len(list(doc.sents))
            }
        }

    @staticmethod
    def _coerce_text(value: Any) -> str:
        """Convert a raw cell/field to text, mapping missing values to ''."""
        if value is None:
            return ''
        # NaN is the only float that differs from itself; pandas uses it for
        # empty CSV cells, and str() would turn it into the word "nan".
        if isinstance(value, float) and value != value:
            return ''
        return str(value)

    @classmethod
    def _record_from_item(cls, item: Any, idx: int, text_column: str,
                          id_column: Optional[str]) -> Dict[str, Any]:
        """Build a {'text', 'id'} record from one decoded JSON/JSONL item."""
        default_id = f"doc_{idx}"
        if isinstance(item, dict):
            doc_id = item.get(id_column, default_id) if id_column else default_id
            return {'text': cls._coerce_text(item.get(text_column, '')), 'id': doc_id}
        return {'text': cls._coerce_text(item), 'id': default_id}

    def _read_documents(self, input_path: Path, text_column: str,
                        id_column: Optional[str]) -> List[Dict[str, Any]]:
        """Load {'text', 'id'} records from a .txt, .csv, .json or .jsonl file."""
        suffix = input_path.suffix.lower()

        if suffix == '.txt':
            # The whole file is one document named after the file
            return [{'text': input_path.read_text(encoding='utf-8'), 'id': input_path.stem}]

        if suffix == '.csv':
            df = pd.read_csv(input_path)
            has_id_column = bool(id_column) and id_column in df.columns
            return [
                {
                    'text': self._coerce_text(row[text_column]),
                    'id': row[id_column] if has_id_column else f"doc_{idx}",
                }
                for idx, row in df.iterrows()
            ]

        if suffix == '.json':
            with open(input_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                return [
                    self._record_from_item(item, idx, text_column, id_column)
                    for idx, item in enumerate(data)
                ]
            # Any non-list payload is treated as one document
            return [{'text': str(data), 'id': 'single_doc'}]

        if suffix == '.jsonl':
            records = []
            with open(input_path, 'r', encoding='utf-8') as f:
                for idx, line in enumerate(f):
                    line = line.strip()
                    if not line:
                        # Blank lines (e.g. a trailing newline) are not records
                        continue
                    records.append(
                        self._record_from_item(json.loads(line), idx, text_column, id_column)
                    )
            return records

        raise ValueError(f"Unsupported file format: {input_path.suffix}")

    def process_file(self,
                    input_path: Union[str, Path],
                    text_column: str = 'text',
                    id_column: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Process text from various file formats

        Args:
            input_path: Path to input file (.txt, .csv, .json, .jsonl)
            text_column: Column name containing text (for structured files)
            id_column: Column name for document IDs; documents without one are
                named "doc_<index>"

        Returns:
            List of processed documents (the preprocess_text result plus a
            'document_id' key)

        Raises:
            FileNotFoundError: If input_path does not exist.
            ValueError: If the file extension is not supported.
        """
        input_path = Path(input_path)

        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        logger.info(f"Processing file: {input_path}")

        # Process all texts
        processed_docs = []
        for doc in self._read_documents(input_path, text_column, id_column):
            result = self.preprocess_text(doc['text'])
            result['document_id'] = doc['id']
            processed_docs.append(result)

        return processed_docs

    def save_results(self,
                    processed_docs: List[Dict[str, Any]],
                    output_path: Union[str, Path],
                    format_type: str = 'json',
                    include_metadata: bool = True) -> None:
        """
        Save processed results to file

        Args:
            processed_docs: List of processed documents
            output_path: Output file path (parent directories are created)
            format_type: Output format ('json', 'csv', 'jsonl'), case-insensitive
            include_metadata: Whether to include processing metadata; applies
                to every output format

        Raises:
            ValueError: If format_type is not supported.
        """
        fmt = format_type.lower()
        if fmt not in self._OUTPUT_FORMATS:
            raise ValueError(f"Unsupported output format: {format_type}")

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if fmt == 'csv':
            # Flatten for CSV format
            csv_data = []
            for doc in processed_docs:
                row = {
                    'document_id': doc.get('document_id', ''),
                    'original_text': doc['original_text'],
                    'processed_text': doc['processed_text'],
                    'tokens': '|'.join(doc['tokens']),  # Join tokens with delimiter
                }

                if include_metadata:
                    metadata = doc['metadata']
                    row.update({
                        'original_length': metadata['original_length'],
                        'processed_length': metadata['processed_length'],
                        'token_count': metadata['token_count'],
                        'processing_time': metadata['processing_time'],
                        'entity_count': len(metadata['entities']),
                        'sentence_count': metadata['sentences']
                    })

                csv_data.append(row)

            pd.DataFrame(csv_data).to_csv(output_path, index=False)

        else:
            if include_metadata:
                payload = processed_docs
            else:
                # Shallow copies without the metadata entry; inputs stay intact
                payload = [
                    {key: value for key, value in doc.items() if key != 'metadata'}
                    for doc in processed_docs
                ]

            with open(output_path, 'w', encoding='utf-8') as f:
                if fmt == 'json':
                    json.dump(payload, f, indent=2, ensure_ascii=False)
                else:  # jsonl: one JSON document per line
                    f.writelines(
                        json.dumps(doc, ensure_ascii=False) + '\n' for doc in payload
                    )

        logger.info(f"Results saved to: {output_path}")

    def compute_performance_metrics(self, processed_docs: List[Dict[str, Any]]) -> PreprocessingMetrics:
        """
        Compute comprehensive performance metrics

        "Before" figures come from whitespace-splitting each original text;
        "after" figures come from the tokens the pipeline kept.

        Args:
            processed_docs: List of processed documents

        Returns:
            PreprocessingMetrics object with all performance metrics

        Raises:
            ValueError: If processed_docs is empty.
        """
        if not processed_docs:
            raise ValueError("No processed documents provided")

        total_docs = len(processed_docs)

        # Collect all tokens for vocabulary analysis
        all_original_tokens = []
        all_processed_tokens = []
        all_pos_tags = Counter()
        total_processing_time = 0
        total_tokens_after = 0
        total_entities = 0
        total_sentences = 0

        for doc in processed_docs:
            metadata = doc['metadata']
            original_text = doc['original_text']
            # preprocess_text passes non-string input through unchanged, so
            # the original may be None/NaN here and contributes no tokens.
            if isinstance(original_text, str):
                all_original_tokens.extend(original_text.split())
            all_processed_tokens.extend(doc['tokens'])
            all_pos_tags.update(metadata['pos_tags'])
            total_processing_time += metadata['processing_time']
            total_tokens_after += metadata['token_count']
            total_entities += len(metadata['entities'])
            total_sentences += metadata['sentences']

        total_tokens_before = len(all_original_tokens)

        # Performance metrics
        preprocessing_speed = total_tokens_after / total_processing_time if total_processing_time > 0 else 0
        token_reduction_rate = (total_tokens_before - total_tokens_after) / total_tokens_before if total_tokens_before > 0 else 0

        # Vocabulary metrics
        unique_tokens_before = len(set(all_original_tokens))
        unique_tokens_after = len(set(all_processed_tokens))
        vocab_reduction_rate = (unique_tokens_before - unique_tokens_after) / unique_tokens_before if unique_tokens_before > 0 else 0

        # Quality metrics
        empty_docs = sum(1 for doc in processed_docs if not doc['processed_text'].strip())
        most_common = Counter(all_processed_tokens).most_common(10)

        # Linguistic metrics
        avg_sentence_length = total_tokens_after / total_sentences if total_sentences > 0 else 0

        # Memory estimation (rough): characters of processed text, in MB
        memory_usage = sum(len(doc['processed_text']) for doc in processed_docs) / (1024 * 1024)

        return PreprocessingMetrics(
            total_processing_time=total_processing_time,
            avg_processing_time_per_doc=total_processing_time / total_docs,
            preprocessing_speed_tokens_per_sec=preprocessing_speed,
            total_documents=total_docs,
            total_tokens_before=total_tokens_before,
            total_tokens_after=total_tokens_after,
            token_reduction_rate=token_reduction_rate,
            unique_tokens_before=unique_tokens_before,
            unique_tokens_after=unique_tokens_after,
            vocabulary_reduction_rate=vocab_reduction_rate,
            avg_tokens_per_document=total_tokens_after / total_docs,
            empty_documents_after_processing=empty_docs,
            most_common_tokens=most_common,
            named_entities_found=total_entities,
            pos_tag_distribution=dict(all_pos_tags),
            sentence_count=total_sentences,
            avg_sentence_length=avg_sentence_length,
            memory_usage_mb=memory_usage
        )

    def generate_performance_report(self, metrics: PreprocessingMetrics, output_path: Optional[str] = None) -> str:
        """
        Generate a comprehensive performance report

        Args:
            metrics: PreprocessingMetrics object
            output_path: Optional path to save the report

        Returns:
            Report as string
        """
        rule = '=' * 80

        # Guard the ratios whose denominators can legitimately be zero.
        if metrics.total_documents:
            success_rate = (
                (metrics.total_documents - metrics.empty_documents_after_processing)
                / metrics.total_documents
            )
        else:
            success_rate = 0.0
        pos_total = sum(metrics.pos_tag_distribution.values()) or 1

        report = f"""
{rule}
TEXT PREPROCESSING PERFORMANCE REPORT
{rule}
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

📊 PROCESSING PERFORMANCE
├─ Total Documents Processed: {metrics.total_documents:,}
├─ Total Processing Time: {metrics.total_processing_time:.2f} seconds
├─ Average Time per Document: {metrics.avg_processing_time_per_doc:.4f} seconds
└─ Processing Speed: {metrics.preprocessing_speed_tokens_per_sec:.0f} tokens/second

🎯 TOKEN ANALYSIS
├─ Total Tokens Before: {metrics.total_tokens_before:,}
├─ Total Tokens After: {metrics.total_tokens_after:,}
├─ Token Reduction Rate: {metrics.token_reduction_rate:.1%}
└─ Average Tokens per Document: {metrics.avg_tokens_per_document:.1f}

📚 VOCABULARY ANALYSIS
├─ Unique Tokens Before: {metrics.unique_tokens_before:,}
├─ Unique Tokens After: {metrics.unique_tokens_after:,}
├─ Vocabulary Reduction Rate: {metrics.vocabulary_reduction_rate:.1%}
└─ Empty Documents After Processing: {metrics.empty_documents_after_processing}

🏷️  LINGUISTIC FEATURES
├─ Named Entities Found: {metrics.named_entities_found:,}
├─ Total Sentences: {metrics.sentence_count:,}
├─ Average Sentence Length: {metrics.avg_sentence_length:.1f} tokens
└─ POS Tag Distribution:
"""

        # Add POS tag distribution (ten most frequent tags, share of all tags)
        top_pos_tags = sorted(
            metrics.pos_tag_distribution.items(), key=lambda item: item[1], reverse=True
        )[:10]
        report += ''.join(
            f"   ├─ {tag}: {count:,} ({count / pos_total:.1%})\n"
            for tag, count in top_pos_tags
        )

        report += f"""
🔤 MOST COMMON TOKENS
"""
        report += ''.join(
            f"├─ '{token}': {count:,} occurrences\n"
            for token, count in metrics.most_common_tokens[:10]
        )

        report += f"""
💾 RESOURCE USAGE
├─ Estimated Memory Usage: {metrics.memory_usage_mb:.2f} MB
└─ Processing Efficiency: {metrics.total_tokens_after / (metrics.total_processing_time or 1):.0f} tokens/sec

🎯 QUALITY INDICATORS
├─ Token Retention Rate: {(1 - metrics.token_reduction_rate):.1%}
├─ Vocabulary Retention Rate: {(1 - metrics.vocabulary_reduction_rate):.1%}
├─ Document Success Rate: {success_rate:.1%}
└─ Average Document Reduction: {(metrics.token_reduction_rate):.1%}

🚀 READINESS FOR MACHINE LEARNING
✓ Standardized vocabulary
✓ Noise removed
✓ Consistent tokenization
✓ Linguistic features extracted
✓ Structured output format

{rule}
"""

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report)
            logger.info(f"Performance report saved to: {output_path}")

        return report

def main():
    """Run the preprocessing pipeline end to end on a small sample corpus.

    Writes the sample corpus to 'sample_input.csv', processes it, stores the
    results as JSON and CSV, then prints and saves the performance report and
    dumps the metrics to 'preprocessing_metrics.json'.
    """

    # Production-style configuration: content words only, lemmatized and
    # lowercased, numbers kept.
    pipeline = ProductionTextPreprocessor(
        model_name="en_core_web_sm",  # Download with: python -m spacy download en_core_web_sm
        keep_pos=['NOUN', 'VERB', 'ADJ', 'ADV'],
        min_token_length=2,
        lemmatize=True,
        lowercase=True,
        remove_numbers=False
    )

    # Small in-memory corpus used as demo input
    sample_data = [
        "I'm absolutely LOVING the new iPhone's camera!!! It's amazing and I can't believe how great the photos are.",
        "The developers have really improved the technology significantly. Machine learning models like GPT-4 and Claude are revolutionizing NLP.",
        "Data preprocessing is crucial for building robust machine learning pipelines in production environments.",
        "Natural Language Processing involves tokenization, lemmatization, and feature extraction for downstream tasks."
    ]

    # Persist the corpus as a two-column CSV ('text', 'id')
    document_ids = [f'doc_{i}' for i in range(len(sample_data))]
    pd.DataFrame({'text': sample_data, 'id': document_ids}).to_csv('sample_input.csv', index=False)

    # Run the file through the pipeline
    processed_docs = pipeline.process_file('sample_input.csv', text_column='text', id_column='id')

    # Store the results once per supported demo format
    for fmt in ('json', 'csv'):
        pipeline.save_results(processed_docs, f'processed_output.{fmt}', format_type=fmt)

    # Aggregate metrics, then render, save and show the report
    metrics = pipeline.compute_performance_metrics(processed_docs)
    print(pipeline.generate_performance_report(metrics, 'performance_report.txt'))

    # Machine-readable copy of the metrics
    with open('preprocessing_metrics.json', 'w') as metrics_file:
        json.dump(metrics.to_dict(), metrics_file, indent=2)

    closing_lines = (
        "✅ Processing complete! Check the output files:",
        "   - processed_output.json (structured results)",
        "   - processed_output.csv (tabular format)",
        "   - performance_report.txt (detailed metrics)",
        "   - preprocessing_metrics.json (metrics data)",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()


TEXT PREPROCESSING PERFORMANCE REPORT
Generated: 2025-08-03 12:41:39

📊 PROCESSING PERFORMANCE
├─ Total Documents Processed: 4
├─ Total Processing Time: 0.05 seconds
├─ Average Time per Document: 0.0136 seconds
└─ Processing Speed: 624 tokens/second

🎯 TOKEN ANALYSIS
├─ Total Tokens Before: 61
├─ Total Tokens After: 34
├─ Token Reduction Rate: 44.3%
└─ Average Tokens per Document: 8.5

📚 VOCABULARY ANALYSIS
├─ Unique Tokens Before: 55
├─ Unique Tokens After: 33
├─ Vocabulary Reduction Rate: 40.0%
└─ Empty Documents After Processing: 0

🏷️  LINGUISTIC FEATURES
├─ Named Entities Found: 4
├─ Total Sentences: 7
├─ Average Sentence Length: 4.9 tokens
└─ POS Tag Distribution:
   ├─ NOUN: 17 (50.0%)
   ├─ VERB: 9 (26.5%)
   ├─ ADJ: 6 (17.6%)
   ├─ ADV: 2 (5.9%)

🔤 MOST COMMON TOKENS
├─ 'machine': 2 occurrences
├─ 'absolutely': 1 occurrences
├─ 'love': 1 occurrences
├─ 'new': 1 occurrences
├─ 'camera': 1 occurrences
├─ 'amazing': 1 occurrences
├─ 'believe': 1 occurrences
├─ 'great': 1 occurre