In [None]:
!pip install spacy yake scikit-learn

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import yake
from typing import Dict, List, Tuple
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

class TextEnhancer:
    """Enrich raw text with entities, key phrases and term weights for LLM prompting.

    Three independent signals are combined:

    * named entities found by the spaCy ``en_core_web_sm`` pipeline,
    * key phrases (up to 3-grams, top 20) extracted with YAKE,
    * per-term relative frequency computed with ``CountVectorizer``.
    """

    def __init__(self):
        # Load SpaCy model for NER and dependency parsing
        self.nlp = spacy.load("en_core_web_sm")
        # Initialize YAKE for keyword extraction
        self.kw_extractor = yake.KeywordExtractor(
            lan="en",
            n=3,  # ngram size
            dedupLim=0.3,  # deduplication threshold
            top=20,  # number of keywords to extract
        )

    def enhance_text(self, text: str) -> Dict:
        """Enhance input text with NER, key phrases, and importance scoring.

        Args:
            text: Raw text (for example one chunk of a PDF).

        Returns:
            A dict with the keys ``original_text``, ``entities``,
            ``key_phrases``, ``term_importance`` and ``enhanced_prompt``.
        """
        # Process text with SpaCy
        doc = self.nlp(text)

        entities = self._extract_entities(doc)
        keywords = self._extract_keywords(text)
        term_importance = self._calculate_term_importance(text)

        return {
            'original_text': text,
            'entities': entities,
            'key_phrases': keywords,
            'term_importance': term_importance,
            'enhanced_prompt': self._generate_enhanced_prompt(text, entities, keywords),
        }

    def _extract_entities(self, doc) -> Dict[str, List[str]]:
        """Group the entities of ``doc`` by label, keeping each text once.

        Only the labels listed below are collected; every other label is
        ignored. Within a label, entities keep first-mention order and repeated
        mentions are collapsed so the generated prompt is not padded with
        duplicates.
        """
        entities = {
            'PERSON': [],
            'ORG': [],
            'DATE': [],
            'GPE': [],  # Geographical/Political Entities
            # NOTE(review): the stock en_core_web_sm label set presumably has no
            # CONCEPT label, so this bucket likely stays empty - confirm.
            'CONCEPT': []  # Technical terms and concepts
        }

        for ent in doc.ents:
            bucket = entities.get(ent.label_)
            if bucket is not None and ent.text not in bucket:
                bucket.append(ent.text)

        return entities

    def _extract_keywords(self, text: str) -> List[Tuple[str, float]]:
        """Return YAKE ``(phrase, score)`` pairs, best first.

        Blank input yields an empty list without calling the extractor.
        """
        if not text or not text.strip():
            return []
        keywords = self.kw_extractor.extract_keywords(text)
        # Sort by score (lower is better in YAKE)
        return sorted(keywords, key=lambda pair: pair[1])

    def _calculate_term_importance(self, text: str) -> Dict[str, float]:
        """Return each non-stop-word's share of all non-stop-word occurrences.

        The scores are plain Python floats that sum to 1.0. An empty dict is
        returned when the text has no countable terms (empty text, or only
        stop words), where ``CountVectorizer`` would otherwise raise
        ``ValueError``.
        """
        vectorizer = CountVectorizer(stop_words='english')
        try:
            counts = vectorizer.fit_transform([text])
        except ValueError:
            # Raised by scikit-learn for an empty vocabulary.
            return {}

        frequencies = counts.toarray()[0]
        total_words = int(frequencies.sum())
        if total_words == 0:
            return {}

        # float(...) converts NumPy scalars so the result is JSON-serialisable.
        return {
            word: float(freq) / total_words
            for word, freq in zip(vectorizer.get_feature_names_out(), frequencies)
        }

    def _generate_enhanced_prompt(self, text: str, entities: Dict, keywords: List) -> str:
        """Build the question-generation prompt sent to the LLM.

        Args:
            text: The original text, embedded verbatim.
            entities: Mapping of entity label to entity texts; empty labels are
                left out of the prompt.
            keywords: ``(phrase, score)`` pairs, best first; the first five
                phrases are listed as important concepts.

        Returns:
            The prompt text, without leading or trailing blank lines.
        """
        top_keywords = [kw[0] for kw in keywords[:5]]

        entity_context = [
            f"{entity_type}: {', '.join(values)}"
            for entity_type, values in entities.items()
            if values
        ]

        prompt_lines = [
            "Context:",
            f"Important concepts: {', '.join(top_keywords)}",
            "\n".join(entity_context),
            "",
            "Original text:",
            text,
            "",
            "Please generate questions focusing on the relationships between these key concepts and entities.",
            "Consider different types of questions:",
            "1. Factual questions about identified entities",
            "2. Conceptual questions about key terms",
            "3. Relationship questions between different concepts",
        ]
        return "\n".join(prompt_lines)

def process_pdf_chunk(chunk: str, enhancer: "TextEnhancer" = None) -> Dict:
    """Process a chunk of text from a PDF and prepare it for question generation.

    Args:
        chunk: Text of one PDF chunk.
        enhancer: Optional object exposing ``enhance_text(text)``. When it is
            omitted, a single ``TextEnhancer`` is created on first use and
            reused by later calls, so the spaCy model is not reloaded for
            every chunk.

    Returns:
        The dict produced by ``enhancer.enhance_text(chunk)``.
    """
    if enhancer is None:
        # Cache the default instance on the function object itself; re-running
        # the defining cell creates a new function and therefore a new cache.
        enhancer = getattr(process_pdf_chunk, "_default_enhancer", None)
        if enhancer is None:
            enhancer = TextEnhancer()
            process_pdf_chunk._default_enhancer = enhancer
    return enhancer.enhance_text(chunk)

# Example usage
# Demonstrates TextEnhancer on a short history paragraph and prints the prompt
# it generates. The guard is true when this code runs as the main module.
if __name__ == "__main__":
    # The triple-quoted literal keeps its leading newline and indentation; it is
    # passed to the enhancer unchanged.
    sample_text = """
    The Industrial Revolution began in Britain in the late 18th century.
    This period marked a major turning point in human history, fundamentally
    changing economic and social organization. The transition included going
    from manual production methods to machine manufacturing, new chemical
    manufacturing and iron production processes, improved efficiency of water
    power, the increasing use of steam power, and the development of machine tools.
    """

    # Process the text
    enhancer = TextEnhancer()
    result = enhancer.enhance_text(sample_text)

    # Example of how to use the enhanced data with an LLM
    print("Enhanced prompt for LLM:")
    print(result['enhanced_prompt'])

Enhanced prompt for LLM:
Context:
Important concepts: Industrial Revolution began, began in Britain, fundamentally changing economic, major turning point, century
DATE: the late 18th century
GPE: Britain

Original text:

    The Industrial Revolution began in Britain in the late 18th century. 
    This period marked a major turning point in human history, fundamentally 
    changing economic and social organization. The transition included going 
    from manual production methods to machine manufacturing, new chemical 
    manufacturing and iron production processes, improved efficiency of water 
    power, the increasing use of steam power, and the development of machine tools.
    

Please generate questions focusing on the relationships between these key concepts and entities.
Consider different types of questions:
1. Factual questions about identified entities
2. Conceptual questions about key terms
3. Relationship questions between different concepts


Possible next improvements:

- Add domain-specific entity recognition for your subject area.

- Implement concept mapping between related terms.

- Add readability scoring to adjust question difficulty.

- Implement custom keyword extraction for educational content.

In [None]:
# Install required packages
!pip install spacy
!pip install yake
!pip install networkx
!pip install nltk
!python -m spacy download en_core_web_sm
!pip install scikit-learn

# Import and download required NLTK data
import nltk
nltk.download('punkt')

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import spacy
from typing import List, Dict, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.tokenize import sent_tokenize
import networkx as nx
from collections import Counter

class TextOptimizer:
    """Condense text into topic-grouped chunks before sending it to an LLM.

    Pipeline (see ``optimize_for_llm``): sentence split -> group sentences into
    chunks with Louvain community detection on a sentence-similarity graph ->
    keep the highest-scoring TF-IDF sentences of each chunk -> drop sentences
    that mostly repeat noun phrases/entities already seen -> attach metadata.
    """

    # Number of top-scoring sentences kept per chunk by _extract_key_sentences.
    KEY_SENTENCE_COUNT = 3
    # A sentence is dropped once at least this fraction of its noun phrases and
    # entities has already been mentioned earlier in the chunk.
    REDUNDANCY_THRESHOLD = 0.7
    # Fixed seed so Louvain community detection gives the same chunks on re-run.
    LOUVAIN_SEED = 42

    def __init__(self, max_chunk_size: int = 1000):
        """Load the spaCy model and set the chunk size budget (in characters)."""
        self.nlp = spacy.load("en_core_web_sm")
        self.max_chunk_size = max_chunk_size
        self.tfidf = TfidfVectorizer(stop_words='english')

    def optimize_for_llm(self, text: str) -> List[Dict]:
        """Run the full pipeline on ``text``.

        Returns:
            One dict per chunk with the keys ``original_length``,
            ``optimized_length``, ``compression_ratio``, ``content`` and
            ``key_concepts``. Lengths are character counts. Empty input yields
            an empty list.
        """
        sentences = sent_tokenize(text)
        chunks = self._create_semantic_chunks(sentences)

        optimized_chunks = []
        for chunk in chunks:
            # An empty chunk carries no content and would divide by zero below.
            if not chunk.strip():
                continue

            # Get the most information-dense sentences
            dense_text = self._extract_key_sentences(chunk)

            # Remove redundant information
            cleaned_text = self._remove_redundancy(dense_text)

            optimized_chunks.append({
                'original_length': len(chunk),
                'optimized_length': len(cleaned_text),
                'compression_ratio': len(cleaned_text) / len(chunk),
                'content': cleaned_text,
                'key_concepts': self._extract_key_concepts(cleaned_text)
            })

        return optimized_chunks

    def _create_semantic_chunks(self, sentences: List[str]) -> List[str]:
        """Group sentences into chunks based on topic similarity.

        Each sentence is embedded with ``doc.vector``; the pairwise inner
        products become edge weights of a graph whose Louvain communities form
        the chunks. Inside a chunk the sentences keep document order, and the
        chunks are ordered by their earliest sentence. Chunks longer than
        ``max_chunk_size`` are split with ``_split_chunk``.
        """
        if not sentences:
            return []

        # Calculate sentence embeddings using SpaCy
        embeddings = np.array([self.nlp(sent).vector for sent in sentences])

        # NOTE(review): these are raw dot products, not cosine similarities, so
        # longer vectors weigh more - confirm whether normalisation is wanted.
        similarity_matrix = np.inner(embeddings, embeddings)
        # Modularity optimisation assumes non-negative edge weights.
        similarity_matrix = np.clip(similarity_matrix, 0.0, None)

        graph = nx.from_numpy_array(similarity_matrix)
        communities = nx.community.louvain_communities(graph, seed=self.LOUVAIN_SEED)

        chunks = []
        # Communities are unordered sets of sentence indices: sort both levels so
        # the output follows the order of the source text.
        for community in sorted(communities, key=min):
            chunk = ' '.join(sentences[i] for i in sorted(community))

            # Split if chunk is too large
            if len(chunk) > self.max_chunk_size:
                chunks.extend(self._split_chunk(chunk))
            else:
                chunks.append(chunk)

        return chunks

    def _split_chunk(self, chunk: str) -> List[str]:
        """Split an oversized chunk at sentence boundaries.

        Sentences are packed greedily until adding the next one (including the
        joining space) would exceed ``max_chunk_size``. A single sentence that
        is longer than the budget becomes a chunk of its own; no empty chunk is
        ever emitted.
        """
        pieces = []
        current = []
        current_length = 0

        for sent in sent_tokenize(chunk):
            added = len(sent) + (1 if current else 0)  # +1 for the joining space
            if current and current_length + added > self.max_chunk_size:
                pieces.append(' '.join(current))
                current = [sent]
                current_length = len(sent)
            else:
                current.append(sent)
                current_length += added

        if current:
            pieces.append(' '.join(current))

        return pieces

    def _extract_key_sentences(self, text: str) -> str:
        """Keep the ``KEY_SENTENCE_COUNT`` sentences with the highest TF-IDF mass.

        The text is returned unchanged when it has no more sentences than that,
        or when TF-IDF cannot be fitted because every token is a stop word.
        Selected sentences are returned in their original order.
        """
        sentences = sent_tokenize(text)
        if len(sentences) <= self.KEY_SENTENCE_COUNT:
            return text

        try:
            tfidf_matrix = self.tfidf.fit_transform(sentences)
        except ValueError:
            # scikit-learn raises this for an empty vocabulary.
            return text

        # Row sums of the sparse matrix: one score per sentence.
        sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

        top_indices = np.argsort(sentence_scores)[-self.KEY_SENTENCE_COUNT:]
        return ' '.join(sentences[i] for i in sorted(top_indices))

    def _remove_redundancy(self, text: str) -> str:
        """Drop sentences whose noun phrases/entities were mostly seen before.

        A sentence is kept when it has no noun phrases or entities at all
        (nothing to compare, so it cannot be judged redundant) or when fewer
        than ``REDUNDANCY_THRESHOLD`` of them were mentioned by an earlier kept
        sentence. Comparison is case-insensitive.
        """
        doc = self.nlp(text)

        mentioned = set()
        filtered_sents = []

        for sent in doc.sents:
            key_info = {ent.text.lower() for ent in sent.ents}
            key_info |= {phrase.text.lower() for phrase in sent.noun_chunks}

            overlap = len(key_info & mentioned)
            if not key_info or overlap < len(key_info) * self.REDUNDANCY_THRESHOLD:
                filtered_sents.append(sent.text)
                mentioned.update(key_info)

        return ' '.join(filtered_sents)

    def _extract_key_concepts(self, text: str) -> List[str]:
        """Return up to five lower-cased concepts, most frequent first.

        Every noun phrase counts once per occurrence and every named entity
        counts twice, so entities rank higher.
        """
        doc = self.nlp(text)

        concepts = Counter()
        for phrase in doc.noun_chunks:
            concepts[phrase.text.lower()] += 1
        for ent in doc.ents:
            concepts[ent.text.lower()] += 2  # Weight entities higher

        return [concept for concept, _ in concepts.most_common(5)]

# Example usage
# Runs TextOptimizer on a multi-paragraph photosynthesis text and prints, for
# each returned chunk, its compression ratio, key concepts and content.
if __name__ == "__main__":
    # 600 overrides the constructor's default max_chunk_size of 1000.
    optimizer = TextOptimizer(max_chunk_size=600)

    # The literal is passed through unchanged, including the surrounding
    # square brackets and leading spaces.
    sample_text = """  [Photosynthesis, a fundamental process in the biosphere, is the conversion of light energy into chemical energy, primarily in plants, algae, and certain bacteria. This intricate process involves the absorption of sunlight, primarily by pigments like chlorophyll, and its subsequent utilization to synthesize organic compounds, primarily glucose, from inorganic substances like carbon dioxide and water. The by-product of this reaction is oxygen, a vital gas for most aerobic organisms.
The photosynthetic process is broadly divided into two main stages: the light-dependent reactions and the light-independent reactions, or the Calvin cycle. In the light-dependent reactions, sunlight 1 is harnessed to excite electrons in chlorophyll molecules, initiating a chain of electron transport. This energy is used to split water molecules into hydrogen ions and oxygen, releasing the latter into the atmosphere. Additionally, ATP (adenosine triphosphate), an energy-rich molecule, and NADPH (nicotinamide adenine dinucleotide phosphate), a reducing agent, are synthesized.
The light-independent reactions, or the Calvin cycle, occur in the stroma of the chloroplast, independent of light. Here, the ATP and NADPH generated in the light-dependent reactions provide the energy and reducing power, respectively, to fix carbon dioxide from the atmosphere. This process, known as carbon fixation, involves a series of enzyme-catalyzed reactions that ultimately lead to the formation of glucose. The fixed carbon is then used to synthesize other organic compounds like amino acids, fatty acids, and nucleotides, which are essential for plant growth and development.

Photosynthesis is a crucial process for life on Earth. It provides the primary source of energy for most ecosystems, directly or indirectly. The oxygen produced during photosynthesis is essential for aerobic respiration, the process by which organisms obtain energy from organic molecules. Additionally, photosynthesis plays a significant role in the global carbon cycle, regulating the levels of carbon dioxide in the atmosphere and mitigating climate change.
In conclusion, photosynthesis is a complex and vital process that sustains life on Earth. By harnessing the energy of sunlight, plants and other photosynthetic organisms convert inorganic substances into organic matter, providing the foundation for the food chain and regulating the Earth's atmosphere. Understanding the intricacies of photosynthesis is essential for addressing global challenges like climate change and food security.
]  """

    optimized_chunks = optimizer.optimize_for_llm(sample_text)

    # Chunks are numbered from 1 for display.
    for i, chunk in enumerate(optimized_chunks, 1):
        print(f"\nChunk {i}:")
        print(f"Compression ratio: {chunk['compression_ratio']:.2f}")
        print(f"Key concepts: {', '.join(chunk['key_concepts'])}")
        print("Optimized content:", chunk['content'])


Chunk 1:
Compression ratio: 1.00
Key concepts: this intricate process, the absorption, sunlight, pigments, chlorophyll
Optimized content: This intricate process involves the absorption of sunlight, primarily by pigments like chlorophyll, and its subsequent utilization to synthesize organic compounds, primarily glucose, from inorganic substances like carbon dioxide and water. It provides the primary source of energy for most ecosystems, directly or indirectly.

Chunk 2:
Compression ratio: 0.85
Key concepts: photosynthesis, atp, nadph, two, calvin
Optimized content:   [Photosynthesis, a fundamental process in the biosphere, is the conversion of light energy into chemical energy, primarily in plants, algae, and certain bacteria. The photosynthetic process is broadly divided into two main stages: the light-dependent reactions and the light-independent reactions, or the Calvin cycle. Additionally, ATP (adenosine triphosphate), an energy-rich molecule, and NADPH (nicotinamide adenine dinucl

In [None]:
import spacy
from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.tokenize import sent_tokenize
import networkx as nx
from collections import Counter

class TextOptimizer:
    """Condense text into semantic chunks with configurable compression.

    Compared with a fixed pipeline, this variant exposes the similarity
    threshold, the share of sentences kept, the redundancy threshold and a
    compression target that a chunk must reach to be returned.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        min_chunk_size: int = 200,
        compression_target: float = 0.5,  # Target compression ratio (0.5 = 50% reduction)
        similarity_threshold: float = 0.3,  # Threshold for semantic similarity
        key_sentence_ratio: float = 0.3,   # Proportion of sentences to keep
        redundancy_threshold: float = 0.5   # Threshold for redundant information
    ):
        self.nlp = spacy.load("en_core_web_sm")
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.compression_target = compression_target
        self.similarity_threshold = similarity_threshold
        self.key_sentence_ratio = key_sentence_ratio
        self.redundancy_threshold = redundancy_threshold
        self.tfidf = TfidfVectorizer(stop_words='english')

    def optimize_for_llm(self, text: str) -> List[Dict]:
        """Optimized pipeline with better compression control.

        Chunks shorter than ``min_chunk_size`` characters are skipped, and a
        chunk is only returned when its optimized text is at most
        ``compression_target`` times the original length, so the result may be
        an empty list.

        Returns:
            One dict per accepted chunk with the keys ``original_length``,
            ``optimized_length``, ``compression_ratio``, ``content`` and
            ``key_concepts``.
        """
        # Initial sentence splitting
        sentences = sent_tokenize(text)

        # Create semantic chunks
        chunks = self._create_semantic_chunks(sentences)

        # Process each chunk
        optimized_chunks = []
        for chunk in chunks:
            # Skip chunks that are too small
            if len(chunk) < self.min_chunk_size:
                continue

            # Get the most important sentences
            dense_text = self._extract_key_sentences(chunk)

            # Remove redundant information
            cleaned_text = self._remove_redundancy(dense_text)

            # Only include chunk if it meets compression target
            if len(cleaned_text) <= len(chunk) * self.compression_target:
                optimized_chunks.append({
                    'original_length': len(chunk),
                    'optimized_length': len(cleaned_text),
                    'compression_ratio': len(cleaned_text) / len(chunk),
                    'content': cleaned_text,
                    'key_concepts': self._extract_key_concepts(cleaned_text)
                })

        return optimized_chunks

    def _create_semantic_chunks(self, sentences: List[str]) -> List[str]:
        """Create chunks based on semantic similarity with improved control.

        Three or fewer sentences are returned as a single chunk. Otherwise the
        Louvain communities of the thresholded similarity graph are packed, in
        the order Louvain returns them, into chunks of at most
        ``max_chunk_size`` characters; a single community larger than that
        still becomes one chunk.
        """
        if len(sentences) <= 3:
            return [' '.join(sentences)]

        # Calculate sentence embeddings
        docs = [self.nlp(sent) for sent in sentences]
        embeddings = np.array([doc.vector for doc in docs])

        # Create similarity matrix
        # NOTE(review): raw dot products, not cosine similarities - confirm that
        # similarity_threshold is meant to apply on this scale.
        similarity_matrix = np.inner(embeddings, embeddings)

        # Apply similarity threshold
        similarity_matrix[similarity_matrix < self.similarity_threshold] = 0

        # Create graph and find communities
        G = nx.from_numpy_array(similarity_matrix)
        communities = nx.community.louvain_communities(G)

        # Process communities into chunks
        chunks = []
        current_chunk = []
        current_length = 0

        for community in communities:
            community_sentences = [sentences[i] for i in sorted(community)]
            community_text = ' '.join(community_sentences)

            # Check if adding this community would exceed max chunk size
            if current_length + len(community_text) > self.max_chunk_size:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = community_sentences
                current_length = len(community_text)
            else:
                current_chunk.extend(community_sentences)
                current_length += len(community_text)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _extract_key_sentences(self, text: str) -> str:
        """Extract important sentences with configurable ratio.

        Texts of three or fewer sentences are returned unchanged. Otherwise the
        ``key_sentence_ratio`` share of sentences (at least two) with the
        highest summed TF-IDF weight is kept, in original order.
        """
        sentences = sent_tokenize(text)
        if len(sentences) <= 3:
            return text

        # Calculate sentence importance
        tfidf_matrix = self.tfidf.fit_transform(sentences)
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

        # Select top sentences based on key_sentence_ratio
        num_sentences = max(2, int(len(sentences) * self.key_sentence_ratio))
        top_indices = np.argsort(sentence_scores)[-num_sentences:]
        selected_sentences = [sentences[i] for i in sorted(top_indices)]

        return ' '.join(selected_sentences)

    def _remove_redundancy(self, text: str) -> str:
        """Remove redundant information with configurable threshold.

        A sentence is kept when it has no noun phrases or entities, or when the
        share of them already mentioned by earlier kept sentences is below
        ``redundancy_threshold``.
        """
        doc = self.nlp(text)
        mentioned = set()
        filtered_sents = []

        for sent in doc.sents:
            entities = {ent.text.lower() for ent in sent.ents}
            noun_phrases = {phrase.text.lower() for phrase in sent.noun_chunks}
            key_info = entities.union(noun_phrases)

            # Check redundancy against threshold
            if not key_info or len(key_info.intersection(mentioned)) / len(key_info) < self.redundancy_threshold:
                filtered_sents.append(sent.text)
                mentioned.update(key_info)

        return ' '.join(filtered_sents)

    def _extract_key_concepts(self, text: str) -> List[str]:
        """Return up to five lower-cased concepts, most frequent first.

        ``optimize_for_llm`` calls this method for every accepted chunk. Noun
        phrases count once per occurrence and named entities twice, so entities
        rank higher.
        """
        doc = self.nlp(text)

        concepts = Counter()
        for phrase in doc.noun_chunks:
            concepts[phrase.text.lower()] += 1
        for ent in doc.ents:
            concepts[ent.text.lower()] += 2  # Weight entities higher

        return [concept for concept, _ in concepts.most_common(5)]

# Example usage with different parameter combinations
def test_optimization_parameters(text: str):
    """Test different parameter combinations and show results."""
    parameter_sets = [
        # {
        #     'compression_target': 0.3,  # Aggressive compression
        #     'key_sentence_ratio': 0.2,
        #     'redundancy_threshold': 0.3
        # },
        {
            'compression_target': 0.5,  # Moderate compression
            'key_sentence_ratio': 0.3,
            'redundancy_threshold': 0.5
        },
        {
            'compression_target': 0.7,  # Light compression
            'key_sentence_ratio': 0.4,
            'redundancy_threshold': 0.7
        }
    ]

    for params in parameter_sets:
        print(f"\nTesting parameters: {params}")
        optimizer = TextOptimizer(**params)
        chunks = optimizer.optimize_for_llm(text)

        total_original = sum(chunk['original_length'] for chunk in chunks)
        total_optimized = sum(chunk['optimized_length'] for chunk in chunks)

        print(f"Number of chunks: {len(chunks)}")
        print(f"Overall compression ratio: {total_optimized/total_original:.2f}")
        print(f"Original total length: {total_original}")
        print(f"Optimized total length: {total_optimized}")

# Test with sample text
# Runs test_optimization_parameters on the photosynthesis sample text.
if __name__ == "__main__":
    # `sample_text` is a module-level name; a later cell passes it to
    # test_optimization_parameters again, so it must stay in the namespace.
    sample_text = """
    [Photosynthesis, a fundamental process in the biosphere, is the conversion of light energy into chemical energy, primarily in plants, algae, and certain bacteria. This intricate process involves the absorption of sunlight, primarily by pigments like chlorophyll, and its subsequent utilization to synthesize organic compounds, primarily glucose, from inorganic substances like carbon dioxide and water. The by-product of this reaction is oxygen, a vital gas for most aerobic organisms.
The photosynthetic process is broadly divided into two main stages: the light-dependent reactions and the light-independent reactions, or the Calvin cycle. In the light-dependent reactions, sunlight 1 is harnessed to excite electrons in chlorophyll molecules, initiating a chain of electron transport. This energy is used to split water molecules into hydrogen ions and oxygen, releasing the latter into the atmosphere. Additionally, ATP (adenosine triphosphate), an energy-rich molecule, and NADPH (nicotinamide adenine dinucleotide phosphate), a reducing agent, are synthesized.
The light-independent reactions, or the Calvin cycle, occur in the stroma of the chloroplast, independent of light. Here, the ATP and NADPH generated in the light-dependent reactions provide the energy and reducing power, respectively, to fix carbon dioxide from the atmosphere. This process, known as carbon fixation, involves a series of enzyme-catalyzed reactions that ultimately lead to the formation of glucose. The fixed carbon is then used to synthesize other organic compounds like amino acids, fatty acids, and nucleotides, which are essential for plant growth and development.

Photosynthesis is a crucial process for life on Earth. It provides the primary source of energy for most ecosystems, directly or indirectly. The oxygen produced during photosynthesis is essential for aerobic respiration, the process by which organisms obtain energy from organic molecules. Additionally, photosynthesis plays a significant role in the global carbon cycle, regulating the levels of carbon dioxide in the atmosphere and mitigating climate change.
In conclusion, photosynthesis is a complex and vital process that sustains life on Earth. By harnessing the energy of sunlight, plants and other photosynthetic organisms convert inorganic substances into organic matter, providing the foundation for the food chain and regulating the Earth's atmosphere. Understanding the intricacies of photosynthesis is essential for addressing global challenges like climate change and food security.
]
    """

    test_optimization_parameters(sample_text)


Testing parameters: {'compression_target': 0.5, 'key_sentence_ratio': 0.3, 'redundancy_threshold': 0.5}


AttributeError: 'TextOptimizer' object has no attribute '_extract_key_concepts'

In [None]:
class TextOptimizer:
    """Condense text into semantic chunks with configurable thresholds.

    Pipeline (see ``optimize_for_llm``): sentence split -> Louvain communities
    on a thresholded sentence-similarity graph, packed into chunks -> keep the
    top TF-IDF sentences -> drop redundant sentences -> attach key concepts.

    ``compression_target`` is stored for callers but is not applied by this
    class; chunks are filtered by ``min_chunk_size`` only.
    """

    # Fixed seed so Louvain community detection gives the same chunks on re-run.
    LOUVAIN_SEED = 42

    def __init__(
        self,
        max_chunk_size: int = 1000,
        min_chunk_size: int = 50,
        compression_target: float = 0.5,
        similarity_threshold: float = 0.3,
        key_sentence_ratio: float = 0.3,
        redundancy_threshold: float = 0.5
    ):
        self.nlp = spacy.load("en_core_web_sm")
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.compression_target = compression_target
        self.similarity_threshold = similarity_threshold
        self.key_sentence_ratio = key_sentence_ratio
        self.redundancy_threshold = redundancy_threshold
        self.tfidf = TfidfVectorizer(stop_words='english')

    def optimize_for_llm(self, text: str) -> List[Dict]:
        """Optimized pipeline with better compression control.

        Returns:
            One dict per chunk whose optimized text is longer than
            ``min_chunk_size`` characters, with the keys ``original_length``,
            ``optimized_length``, ``compression_ratio``, ``content`` and
            ``key_concepts``.
        """
        # Initial sentence splitting
        sentences = sent_tokenize(text)

        # Create semantic chunks
        chunks = self._create_semantic_chunks(sentences)

        # Process each chunk
        optimized_chunks = []
        for chunk in chunks:
            # Get the most important sentences
            dense_text = self._extract_key_sentences(chunk)

            # Remove redundant information
            cleaned_text = self._remove_redundancy(dense_text)

            # Keep the chunk only if its optimized text is not too small.
            # (compression_target is not checked here.)
            if len(cleaned_text) > self.min_chunk_size:
                optimized_chunks.append({
                    'original_length': len(chunk),
                    'optimized_length': len(cleaned_text),
                    'compression_ratio': len(cleaned_text) / len(chunk),
                    'content': cleaned_text,
                    'key_concepts': self._extract_key_concepts(cleaned_text)
                })

        return optimized_chunks

    def _create_semantic_chunks(self, sentences: List[str]) -> List[str]:
        """Create chunks based on semantic similarity with improved control.

        Three or fewer sentences are returned as one chunk. Otherwise Louvain
        communities of the thresholded similarity graph are packed into chunks
        of at most ``max_chunk_size`` characters (a single larger community
        still becomes one chunk). If community detection raises an ordinary
        exception, a warning is issued and the sentences are grouped five at a
        time instead; ``KeyboardInterrupt`` and ``SystemExit`` propagate.
        """
        if len(sentences) <= 3:
            return [' '.join(sentences)]

        # Calculate sentence embeddings
        docs = [self.nlp(sent) for sent in sentences]
        embeddings = np.array([doc.vector for doc in docs])

        # Create similarity matrix
        # NOTE(review): raw dot products, not cosine similarities - confirm that
        # similarity_threshold is meant to apply on this scale.
        similarity_matrix = np.inner(embeddings, embeddings)

        # Apply similarity threshold
        similarity_matrix[similarity_matrix < self.similarity_threshold] = 0

        # Create graph and find communities
        G = nx.from_numpy_array(similarity_matrix)

        try:
            communities = nx.community.louvain_communities(G, seed=self.LOUVAIN_SEED)
        except Exception as exc:
            # Best-effort fallback, but make the degradation visible.
            import warnings
            warnings.warn(
                f"Community detection failed ({exc!r}); using fixed 5-sentence chunks",
                RuntimeWarning,
            )
            return [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]

        # Process communities into chunks
        chunks = []
        current_chunk = []
        current_length = 0

        for community in communities:
            community_sentences = [sentences[i] for i in sorted(community)]
            community_text = ' '.join(community_sentences)

            if current_length + len(community_text) > self.max_chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = community_sentences
                current_length = len(community_text)
            else:
                current_chunk.extend(community_sentences)
                current_length += len(community_text)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        # If no chunks were created, return the original text as one chunk
        return chunks if chunks else [' '.join(sentences)]

    def _extract_key_sentences(self, text: str) -> str:
        """Extract important sentences with configurable ratio.

        Texts of three or fewer sentences are returned unchanged. Otherwise the
        ``key_sentence_ratio`` share of sentences (at least two) with the
        highest summed TF-IDF weight is kept, in original order.
        """
        sentences = sent_tokenize(text)
        if len(sentences) <= 3:
            return text

        # Calculate sentence importance
        tfidf_matrix = self.tfidf.fit_transform(sentences)
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

        # Select top sentences based on key_sentence_ratio
        num_sentences = max(2, int(len(sentences) * self.key_sentence_ratio))
        top_indices = np.argsort(sentence_scores)[-num_sentences:]
        selected_sentences = [sentences[i] for i in sorted(top_indices)]

        return ' '.join(selected_sentences)

    def _remove_redundancy(self, text: str) -> str:
        """Remove redundant information with configurable threshold.

        A sentence is kept when it has no noun phrases or entities, or when the
        share of them already mentioned by earlier kept sentences is below
        ``redundancy_threshold``.
        """
        doc = self.nlp(text)
        mentioned = set()
        filtered_sents = []

        for sent in doc.sents:
            entities = {ent.text.lower() for ent in sent.ents}
            noun_phrases = {phrase.text.lower() for phrase in sent.noun_chunks}
            key_info = entities.union(noun_phrases)

            # Check redundancy against threshold
            if not key_info or len(key_info.intersection(mentioned)) / len(key_info) < self.redundancy_threshold:
                filtered_sents.append(sent.text)
                mentioned.update(key_info)

        return ' '.join(filtered_sents)

    def _extract_key_concepts(self, text: str) -> List[str]:
        """Return up to five lower-cased concepts, most frequent first.

        Only noun phrases and entities of at most three whitespace-separated
        words are counted; noun phrases count once per occurrence and named
        entities twice.
        """
        doc = self.nlp(text)

        # Count noun phrases and entities
        concepts = Counter()

        # Add noun phrases
        for phrase in doc.noun_chunks:
            if len(phrase.text.split()) <= 3:  # Limit to phrases of 3 words or less
                concepts[phrase.text.lower()] += 1

        # Add named entities with higher weight
        for ent in doc.ents:
            if len(ent.text.split()) <= 3:
                concepts[ent.text.lower()] += 2

        # Get most common concepts
        return [concept for concept, _ in concepts.most_common(5)]


def test_optimization_parameters(text: str):
    """Test different parameter combinations and show results."""
    parameter_sets = [
        {
            'compression_target': 0.3,
            'key_sentence_ratio': 0.3,
            'redundancy_threshold': 0.4,
            'max_chunk_size': 800,
            'min_chunk_size': 50
        },
        {
            'compression_target': 0.5,
            'key_sentence_ratio': 0.4,
            'redundancy_threshold': 0.5,
            'max_chunk_size': 800,
            'min_chunk_size': 50
        },
        {
            'compression_target': 0.7,
            'key_sentence_ratio': 0.5,
            'redundancy_threshold': 0.6,
            'max_chunk_size': 800,
            'min_chunk_size': 50
        }
    ]

    for params in parameter_sets:
        print(f"\nTesting parameters: {params}")
        optimizer = TextOptimizer(**params)
        chunks = optimizer.optimize_for_llm(text)

        if not chunks:
            print("No chunks generated with these parameters - they may be too aggressive")
            continue

        total_original = sum(chunk['original_length'] for chunk in chunks)
        total_optimized = sum(chunk['optimized_length'] for chunk in chunks)

        print(f"Number of chunks: {len(chunks)}")
        print(f"Overall compression ratio: {total_optimized/total_original:.2f}")
        print(f"Original total length: {total_original}")
        print(f"Optimized total length: {total_optimized}")

        # Print first chunk as example
        if chunks:
            print("\nFirst chunk example:")
            print("Content:", chunks[0]['content'])
            print("Key concepts:", chunks[0]['key_concepts'])

# Run the parameter sweep on `sample_text` (defined outside this cell); the
# comparison report is printed by the function, which returns nothing.
test_optimization_parameters(sample_text)


Testing parameters: {'compression_target': 0.3, 'key_sentence_ratio': 0.3, 'redundancy_threshold': 0.4, 'max_chunk_size': 800, 'min_chunk_size': 50}
Number of chunks: 3
Overall compression ratio: 0.35
Original total length: 2042
Optimized total length: 716

First chunk example:
Content: The process occurs in two main stages: the light-dependent reactions and the light-independent reactions (Calvin cycle). This process, called carbon fixation, uses the ATP and NADPH produced in the light-dependent reactions.
Key concepts: ['the light-dependent reactions', 'two', 'atp', 'the process', 'two main stages']

Testing parameters: {'compression_target': 0.5, 'key_sentence_ratio': 0.4, 'redundancy_threshold': 0.5, 'max_chunk_size': 800, 'min_chunk_size': 50}
Number of chunks: 4
Overall compression ratio: 0.48
Original total length: 2041
Optimized total length: 987

First chunk example:
Content: Using the energy from sunlight, these materials are converted into glucose and oxygen is released as 

In [None]:
def moderate_compression_output(text: str) -> str:
    """Optimize ``text`` with the moderate preset and return the merged result.

    Builds a ``TextOptimizer`` with compression_target=0.5,
    key_sentence_ratio=0.4, redundancy_threshold=0.5 and chunk sizes 50-800,
    runs ``optimize_for_llm`` and joins the chunk contents with single spaces.
    Summary statistics are printed as a side effect.

    Args:
        text: Text to optimize.

    Returns:
        The space-joined chunk contents, or "" (after printing a notice) when
        the optimizer yields no chunks.
    """
    # Moderate compression preset
    params = {
        'compression_target': 0.5,
        'key_sentence_ratio': 0.4,
        'redundancy_threshold': 0.5,
        'max_chunk_size': 800,
        'min_chunk_size': 50
    }

    optimizer = TextOptimizer(**params)
    chunks = optimizer.optimize_for_llm(text)

    if not chunks:
        print("No optimized text generated. The parameters might be too restrictive.")
        return ""

    # Merge all optimized chunks into one text output
    optimized_text = " ".join(chunk['content'] for chunk in chunks)

    total_original = sum(chunk['original_length'] for chunk in chunks)
    total_optimized = sum(chunk['optimized_length'] for chunk in chunks)

    print("Moderate Compression Results:")
    print(f"Original total length: {total_original}")
    print(f"Optimized total length: {total_optimized}")
    # Guard the ratio so a zero original total cannot raise ZeroDivisionError
    # and discard the merged text that has already been computed.
    if total_original:
        print(f"Overall compression ratio: {total_optimized/total_original:.2f}")
    else:
        print("Overall compression ratio: n/a")

    return optimized_text


# Apply the moderate preset to `sample_text` (defined outside this cell) and
# show the merged text beneath the statistics the helper prints itself.
optimized_result = moderate_compression_output(sample_text)
print("\nOptimized Text Output:", optimized_result, sep="\n")


Moderate Compression Results:
Original total length: 2042
Optimized total length: 947
Overall compression ratio: 0.46

Optimized Text Output:
Understanding this process is crucial for addressing challenges like food security and climate change in the modern world. The process occurs in two main stages: the light-dependent reactions and the light-independent reactions (Calvin cycle). This process, called carbon fixation, uses the ATP and NADPH produced in the light-dependent reactions. This process takes place in the chloroplasts, specifically using chlorophyll, the green pigment involved in photosynthesis. The glucose produced provides food not only for plants but also for animals that eat plants, forming the basis of most food chains on Earth. Using the energy from sunlight, these materials are converted into glucose and oxygen is released as a byproduct. The hydrogen ions help create ATP, while electrons from the split water molecules are used to form NADPH. Scientists continue to st