In [None]:
!pip install nltk numpy networkx scikit-learn gensim



In [None]:
import nltk
import numpy as np
import networkx as nx
import re
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from gensim import corpora
from gensim.models import LdaModel
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Download necessary NLTK resources.
# 'punkt_tab' is only needed by NLTK >= 3.9, and those releases also load the
# POS tagger and NE chunker from renamed packages ('*_eng' / '*_tab'). Without
# them pos_tag/ne_chunk raise LookupError, which extract_entities swallows, so
# entity scoring would silently degrade. The legacy names are kept for older
# NLTK installations.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('maxent_ne_chunker_tab', quiet=True)
nltk.download('words', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [None]:
class EnhancedNewsArticleSummarizer:
    def __init__(self, compression_ratio=0.4):
        """
        Initialize the summarizer with enhanced capabilities

        Parameters:
        -----------
        compression_ratio : float
            The ratio of sentences to keep in the summary (0.0 to 1.0)
        """
        self.compression_ratio = compression_ratio
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer()


        # Load paraphrasing rules
        self.paraphrase_rules = self._load_paraphrase_rules()

        # Transition phrases for better coherence
        self.transition_phrases = {
            'addition': ['Additionally', 'Furthermore', 'Moreover', 'Also'],
            'contrast': ['However', 'Nevertheless', 'In contrast', 'On the other hand'],
            'cause': ['Therefore', 'As a result', 'Consequently', 'Thus'],
            'time': ['Meanwhile', 'Subsequently', 'Later', 'Previously'],
            'example': ['For instance', 'For example', 'Specifically'],
            'emphasis': ['Notably', 'Importantly', 'Significantly'],
            'conclusion': ['In conclusion', 'Ultimately', 'Finally']
        }

    def _load_paraphrase_rules(self):
        """Load rules for enhanced rule-based paraphrasing"""
        return {
            # Attribution and Reporting
            r'according to .+?,': '',
            r'as per the .+?,': '',
            r'as stated by .+?,': '',
            r'in a statement,': '',
            r'reportedly': '',
            r'it is reported that': '',
            r'it has been reported that': '',
            r'said that': 'stated',
            r'pointed out that': 'indicated',
            r'mentioned that': 'noted',
            r'emphasized that': 'stressed',
            r'highlighted that': 'emphasized',
            r'expressed that': 'stated',
            r'claimed that': 'stated',
            r'confirmed that': 'stated',
            r'argued that': 'stated',
            r'asserted that': 'stated',
            r'added that': 'stated',

            # Formal and Redundant Phrases
            r'it is important to note that': '',
            r'it should be noted that': '',
            r'it is worth mentioning that': '',
            r'it is to be noted that': '',
            r'it must be mentioned that': '',
            r'in order to': 'to',
            r'in terms of': 'regarding',
            r'with regard to': 'regarding',
            r'with respect to': 'regarding',
            r'in light of the fact that': 'because',
            r'due to the fact that': 'because',
            r'for the purpose of': 'for',
            r'at this point in time': 'now',
            r'at the present moment': 'now',
            r'in the near future': 'soon',
            r'in the coming days': 'soon',
            r'for all intents and purposes': '',
            r'for the most part': '',
            r'as a matter of fact': '',
            r'with the aim of': 'to',
            r'notwithstanding the fact that': 'despite',

            # Weak and Filler Words
            r'actually': '',
            r'basically': '',
            r'currently': '',
            r'generally': '',
            r'simply': '',
            r'that is to say': '',
            r'to put it simply': '',
            r'needless to say': '',
            r'to be honest': '',
            r'frankly speaking': '',
            r'in fact': '',
            r'in other words': '',

            # Transition and Linking Phrases
            r'on the other hand,': 'however,',
            r'in contrast,': 'however,',
            r'at the same time,': 'meanwhile,',
            r'after all,': '',
            r'to that end,': '',
            r'by the same token,': '',
            r'as a result of': 'due to',
            r'due to the fact that': 'because',
            r'as far as .*? is concerned,': '',
            r'if truth be told,': '',
            r'to sum up,': 'in conclusion,',
            r'in the final analysis,': 'ultimately,',
            r'in the meantime,': 'meanwhile,',
            r'what’s more,': 'moreover,',
            r'taking into account the fact that': 'considering',

            # Opinion and Value Statements
            r'it is believed that': 'experts believe',
            r'some people think that': 'some believe',
            r'it is widely accepted that': 'it is accepted that',
            r'there is no doubt that': 'clearly,',
            r'it is true that': '',
            r'it is possible that': 'perhaps',
            r'it would appear that': 'apparently',
            r'it is assumed that': 'it is believed that',

            # Common Press and News Jargon
            r'breaking news:': '',
            r'sources said that': 'sources stated',
            r'close sources stated that': 'sources stated',
            r'in an exclusive interview,': '',
            r'our correspondent reported that': '',
            r'officials have confirmed that': 'officials confirmed',
            r'recent developments suggest that': 'recent reports suggest',

            # Legal and Political Terms
            r'legally speaking,': '',
            r'politically speaking,': '',
            r'it remains to be seen whether': 'whether',
            r'to the best of my knowledge,': '',
            r'from a legal perspective,': '',
            r'from a political perspective,': '',

            # Common Numeric/Statistical Phrases
            r'approximately': 'about',
            r'around': 'about',
            r'in excess of': 'more than',
            r'not less than': 'at least',
            r'less than': 'under',
            r'over a period of': 'in',
            r'at a rate of': 'at',

            # Date and Time Phrases
            r'at this time': 'now',
            r'at present': 'now',
            r'in the not-so-distant future': 'soon',
            r'in recent days': 'recently',
            r'for the time being': 'currently',

            # Miscellaneous
            r'on a daily basis': 'daily',
            r'on a regular basis': 'regularly',
            r'to a certain extent': 'somewhat',
            r'to some degree': 'somewhat',
            r'from time to time': 'occasionally',
            r'in the event that': 'if',
            r'under the circumstances': 'considering',
            r'with the exception of': 'except for',
            r'in spite of the fact that': 'despite',
            r'in the vicinity of': 'near',
            r'if and when': 'if',
            r'in connection with': 'regarding',
        }


    def preprocess_text(self, text):
        """Clean and tokenize the text into sentences and words"""
        # Clean text
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = re.sub(r'\[[0-9]*\]', '', text)  # Remove citations

        # Tokenize into sentences
        sentences = sent_tokenize(text)

        # Process each sentence
        clean_sentences = []
        tokenized_sentences = []

        for sentence in sentences:
            # Remove punctuation
            clean_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
            clean_sentence = clean_sentence.lower()

            # Tokenize the sentence
            words = word_tokenize(clean_sentence)

            # Remove stopwords and stem
            filtered_words = []
            for word in words:
                if word not in self.stop_words:
                    filtered_words.append(self.ps.stem(word))

            clean_sentences.append(clean_sentence)
            tokenized_sentences.append(filtered_words)

        return sentences, clean_sentences, tokenized_sentences

    def extract_entities(self, text):
        """Extract named entities from text"""
        try:
            words = word_tokenize(text)
            pos_tags = nltk.pos_tag(words)
            named_entities = nltk.ne_chunk(pos_tags)

            entities = []
            for chunk in named_entities:
                if hasattr(chunk, 'label'):
                    entity = ' '.join(c[0] for c in chunk)
                    entities.append(entity)

            return entities
        except:
            # Fallback if NER fails
            return []

    def extract_time_references(self, text):
        """Extract time references from text for chronological ordering"""
        time_patterns = [
            r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b',
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b\d{4}-\d{2}-\d{2}\b',
            r'\byesterday\b',
            r'\btoday\b',
            r'\btomorrow\b',
            r'\blast (?:week|month|year)\b',
            r'\bnext (?:week|month|year)\b',
            r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b',
        ]

        time_references = []
        for pattern in time_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            time_references.extend(matches)

        return time_references

    def textrank_score(self, sentences, clean_sentences):
        """Score sentences using the TextRank algorithm"""
        if len(clean_sentences) < 2:
            return [(0, 1.0)] if clean_sentences else []

        # Create a similarity matrix
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(clean_sentences)

        # Calculate similarity between sentences
        similarity_matrix = tfidf_matrix * tfidf_matrix.T

        # Create a graph from the similarity matrix
        nx_graph = nx.from_scipy_sparse_array(similarity_matrix)

        # Apply PageRank algorithm
        scores = nx.pagerank(nx_graph)

        # Convert scores to a list
        ranked_sentences = [(i, scores[i]) for i in range(len(sentences))]

        return ranked_sentences

    def tfidf_score(self, sentences, clean_sentences):
        """Score sentences based on TF-IDF weights"""
        if len(clean_sentences) < 2:
            return [(0, 1.0)] if clean_sentences else []

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(clean_sentences)

        # Get feature names
        feature_names = vectorizer.get_feature_names_out()

        # Score sentences based on the sum of TF-IDF values
        scores = []
        for i, sentence in enumerate(clean_sentences):
            # Get the TF-IDF values for this sentence
            feature_index = tfidf_matrix[i, :].nonzero()[1]
            tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])

            # Calculate sentence score (sum of TF-IDF values)
            score = sum(val for _, val in tfidf_scores)
            scores.append((i, score))

        return scores

    def luhn_score(self, sentences, tokenized_sentences):
        """Score sentences using Luhn's algorithm based on significant words"""
        if not tokenized_sentences:
            return []

        # Calculate word frequency
        all_words = [word for sentence in tokenized_sentences for word in sentence]
        word_freq = Counter(all_words)

        # Find significant words (above threshold)
        threshold = sum(word_freq.values()) / len(word_freq) if word_freq else 0
        significant_words = [word for word, freq in word_freq.items() if freq > threshold]

        # Score sentences based on clusters of significant words
        scores = []
        for i, sentence in enumerate(tokenized_sentences):
            word_positions = [j for j, word in enumerate(sentence) if word in significant_words]

            if not word_positions:
                scores.append((i, 0))
                continue

            clusters = []
            if word_positions:
                current_cluster = [word_positions[0]]

                for pos in word_positions[1:]:
                    if pos - current_cluster[-1] < 3:  # Words less than 3 positions apart form a cluster
                        current_cluster.append(pos)
                    else:
                        clusters.append(current_cluster)
                        current_cluster = [pos]

                if current_cluster:
                    clusters.append(current_cluster)

            # Calculate score based on the square of significant words divided by total words
            max_cluster_size = max([len(cluster) for cluster in clusters]) if clusters else 0
            score = (max_cluster_size ** 2) / len(sentence) if len(sentence) > 0 else 0
            scores.append((i, score))

        return scores

    def lsa_score(self, clean_sentences):
        """Score sentences using Latent Semantic Analysis"""
        if len(clean_sentences) < 2:
            return [(0, 1.0)] if clean_sentences else []

        # Create TF-IDF matrix
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(clean_sentences)

        # Apply SVD
        n_components = min(len(clean_sentences) - 1, 5)  # Number of topics
        n_components = max(1, n_components)
        svd = TruncatedSVD(n_components=n_components)
        svd.fit(tfidf_matrix)

        # Get sentence scores based on the first singular vector
        u = svd.components_[0]
        sentence_scores = [(i, abs(u[i])) for i in range(len(clean_sentences))]

        return sentence_scores

    def entity_score(self, sentences, tokenized_sentences):
        """Score sentences based on named entity presence"""
        # Extract all entities from the full text
        full_text = ' '.join(sentences)
        all_entities = self.extract_entities(full_text)

        if not all_entities:
            # Fallback to important noun phrases if NER fails
            all_words = [word for sentence in tokenized_sentences for word in sentence]
            word_freq = Counter(all_words)
            important_words = [word for word, freq in word_freq.most_common(10)]

            scores = []
            for i, sentence in enumerate(tokenized_sentences):
                if not sentence:
                    scores.append((i, 0))
                    continue

                # Count important words in sentence
                count = sum(1 for word in sentence if word in important_words)
                score = count / len(sentence) if sentence else 0
                scores.append((i, score))

            return scores

        # Score sentences based on entity presence
        scores = []
        for i, sentence in enumerate(sentences):
            # Count entities in this sentence
            entity_count = sum(1 for entity in all_entities if entity.lower() in sentence.lower())

            # Calculate score based on entity density
            words = len(tokenized_sentences[i]) if i < len(tokenized_sentences) and tokenized_sentences[i] else 1
            score = entity_count / words if words > 0 else 0
            scores.append((i, score))

        return scores

    def numeric_score(self, sentences):
        """Score sentences based on presence of numbers/statistics"""
        scores = []

        # Regular expression for numbers
        number_pattern = re.compile(r'\b\d+(?:\.\d+)?%?\b')

        for i, sentence in enumerate(sentences):
            # Count numbers in sentence
            numbers = number_pattern.findall(sentence)

            # Score based on number presence
            score = min(1.0, len(numbers) * 0.25)  # Cap at 1.0
            scores.append((i, score))

        return scores

    def quote_score(self, sentences):
        """Score sentences containing quotes"""
        scores = []

        # Regular expression for quotes
        quote_pattern = re.compile(r'["\'](.*?)["\']')

        for i, sentence in enumerate(sentences):
            # Check if sentence contains a quote
            quotes = quote_pattern.findall(sentence)

            # Score based on quote presence
            score = 1.0 if quotes else 0.0
            scores.append((i, score))

        return scores

    def _calculate_position_score(self, i, n):
        """Calculate position score based on sentence position"""
        # Position-based scoring (modified for news articles)
        if i == 0:
            score = 1.0  # First sentence (headline)
        elif i == 1:
            score = 0.95  # Second sentence (often contains key info)
        elif i < n * 0.1:
            score = 0.9  # First 10% (lead paragraph)
        elif i < n * 0.2:
            score = 0.8  # First 20%
        elif i < n * 0.3:
            score = 0.7  # First 30%
        elif i > n * 0.8:
            score = 0.6  # Last 20% (conclusion, future implications)
        else:
            score = 0.5  # Middle sentences

        return score

    def position_score(self, sentences):
        """Score sentences based on their position in the document"""
        scores = []
        n = len(sentences)

        for i in range(n):
            scores.append((i, self._calculate_position_score(i, n)))

        return scores

    def title_similarity_score(self, title, clean_sentences, tokenized_sentences):
        """Score sentences based on similarity to the title"""
        if not title:
            return [(i, 0) for i in range(len(clean_sentences))]

        # Clean and tokenize the title
        clean_title = title.translate(str.maketrans('', '', string.punctuation)).lower()
        title_words = [self.ps.stem(word) for word in word_tokenize(clean_title)
                      if word not in self.stop_words]

        # Score sentences based on overlap with title words
        scores = []
        for i, sentence in enumerate(tokenized_sentences):
            if not sentence or not title_words:
                scores.append((i, 0))
                continue

            # Count words in both title and sentence
            overlap = sum(1 for word in sentence if word in title_words)

            # Calculate score as ratio of overlap to possible overlap
            score = overlap / len(title_words) if len(title_words) > 0 else 0
            scores.append((i, score))

        return scores

    def cohesion_score(self, sentences, tokenized_sentences):
        """Score sentences based on cohesion with neighboring sentences"""
        scores = []
        n = len(sentences)

        for i in range(n):
            # Get neighboring sentences
            prev_idx = max(0, i - 1)
            next_idx = min(n - 1, i + 1)

            # Calculate similarity with neighbors
            prev_sim = self._sentence_similarity_tokens(tokenized_sentences[i], tokenized_sentences[prev_idx]) if i > 0 else 0
            next_sim = self._sentence_similarity_tokens(tokenized_sentences[i], tokenized_sentences[next_idx]) if i < n - 1 else 0

            # Average similarity
            avg_sim = (prev_sim + next_sim) / 2 if i > 0 and i < n - 1 else (prev_sim + next_sim)
            scores.append((i, avg_sim))

        return scores

    def semantic_cluster_score(self, sentences, clean_sentences):
        """Score sentences based on semantic clustering"""
        if len(clean_sentences) < 2:
            return [(0, 1.0)] if clean_sentences else []

        # Create TF-IDF matrix for semantic representation
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(clean_sentences)

        # Determine optimal number of clusters (topics)
        n_clusters = min(max(2, len(sentences) // 5), 8)  # Between 2 and 8 clusters

        # Apply K-means clustering
        km = KMeans(n_clusters=n_clusters, random_state=42)
        km.fit(tfidf_matrix)
        clusters = km.labels_

        # Find sentences closest to cluster centroids
        centroids = km.cluster_centers_
        central_sentences = {}

        for cluster_id in range(n_clusters):
            # Get sentences in this cluster
            cluster_sentence_indices = [i for i, label in enumerate(clusters) if label == cluster_id]

            if not cluster_sentence_indices:
                continue

            # Find sentence closest to centroid
            closest_idx = None
            min_distance = float('inf')

            for idx in cluster_sentence_indices:
                distance = np.linalg.norm(tfidf_matrix[idx].toarray() - centroids[cluster_id])
                if distance < min_distance:
                    min_distance = distance
                    closest_idx = idx

            if closest_idx is not None:
                central_sentences[closest_idx] = 1.0

        # Convert to standard score format
        scores = [(i, central_sentences.get(i, 0.0)) for i in range(len(sentences))]
        return scores

    def topic_coverage_score(self, sentences, clean_sentences, tokenized_sentences):
        """Score sentences based on topic coverage using LDA"""
        if len(clean_sentences) < 5:  # Need enough sentences for topic modeling
            return [(i, 1.0) for i in range(len(clean_sentences))]

        # Create dictionary and corpus for LDA
        all_tokenized = [sentence for sentence in tokenized_sentences if sentence]
        dictionary = corpora.Dictionary(all_tokenized)
        corpus = [dictionary.doc2bow(sentence) for sentence in all_tokenized]

        # Determine optimal number of topics
        n_topics = min(max(2, len(sentences) // 10), 5)  # Between 2 and 5 topics

        # Train LDA model
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10, random_state=42)

        # Get topic distribution for each sentence
        sentence_topics = [lda[corpus[i]] for i in range(len(corpus))]

        # Find most representative sentence for each topic
        topic_representatives = {}
        for topic_id in range(n_topics):
            max_prob = -1
            max_sent_idx = -1

            for i, sent_topic in enumerate(sentence_topics):
                for t, prob in sent_topic:
                    if t == topic_id and prob > max_prob:
                        max_prob = prob
                        max_sent_idx = i

            if max_sent_idx != -1:
                # Map from tokenized sentence index to original sentence index
                original_idx = tokenized_sentences.index(all_tokenized[max_sent_idx]) if all_tokenized[max_sent_idx] in tokenized_sentences else -1
                if original_idx != -1:
                    topic_representatives[original_idx] = 1.0

        # Convert to standard score format
        scores = [(i, topic_representatives.get(i, 0.0)) for i in range(len(sentences))]
        return scores

    def _sentence_similarity_tokens(self, tokens1, tokens2):
        """Calculate Jaccard similarity between two token lists"""
        if not tokens1 or not tokens2:
            return 0

        set1 = set(tokens1)
        set2 = set(tokens2)

        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))

        return intersection / union if union > 0 else 0

    def optimized_ensemble_score(self, sentences, clean_sentences, tokenized_sentences, title=None):
        """Combine multiple scoring methods with optimized weights for news articles"""
        # Get individual scores
        scores_dict = {}

        scores_dict['textrank'] = dict(self.textrank_score(sentences, clean_sentences))
        scores_dict['tfidf'] = dict(self.tfidf_score(sentences, clean_sentences))
        scores_dict['luhn'] = dict(self.luhn_score(sentences, tokenized_sentences))
        scores_dict['lsa'] = dict(self.lsa_score(clean_sentences))
        scores_dict['entity'] = dict(self.entity_score(sentences, tokenized_sentences))
        scores_dict['numeric'] = dict(self.numeric_score(sentences))
        scores_dict['quote'] = dict(self.quote_score(sentences))
        scores_dict['position'] = dict(self.position_score(sentences))
        scores_dict['cohesion'] = dict(self.cohesion_score(sentences, tokenized_sentences))
        scores_dict['semantic'] = dict(self.semantic_cluster_score(sentences, clean_sentences))
        scores_dict['topic'] = dict(self.topic_coverage_score(sentences, clean_sentences, tokenized_sentences))

        # Title similarity (if title is provided)
        if title:
            scores_dict['title'] = dict(self.title_similarity_score(title, clean_sentences, tokenized_sentences))
        else:
            scores_dict['title'] = {i: 0 for i in range(len(sentences))}

        # Normalize scores between 0 and 1
        def normalize(scores):
            max_score = max(scores.values()) if scores.values() and max(scores.values()) > 0 else 1
            return {i: (score / max_score if max_score > 0 else 0) for i, score in scores.items()}

        # Normalize all score sets
        for key in scores_dict:
            scores_dict[key] = normalize(scores_dict[key])

        # Weights for different methods (optimized for news articles)
        weights = {
            'textrank': 0.15,    # TextRank captures overall importance
            'tfidf': 0.10,       # TF-IDF captures distinctive content
            'luhn': 0.05,        # Luhn captures clusters of important words
            'lsa': 0.05,         # LSA captures latent topics
            'entity': 0.20,      # Entity captures key people/organizations
            'numeric': 0.10,     # Numeric captures important statistics
            'quote': 0.10,       # Quote captures important statements
            'position': 0.20,    # Position captures structural importance
            'cohesion': 0.05,    # Cohesion captures narrative flow
            'title': 0.05,       # Title similarity captures central theme
            'semantic': 0.05,    # Semantic clustering for diversity
            'topic': 0.05        # Topic coverage for completeness
        }

        # Combine scores with weights
        ensemble_scores = []
        for i in range(len(sentences)):
            score = sum(weights[key] * scores_dict[key].get(i, 0) for key in weights)
            ensemble_scores.append((i, score))

        return ensemble_scores


    def select_balanced_sentences(self, scores, sentences, tokenized_sentences):
        """Select sentences that balance importance, diversity, and coverage"""
        # Sort by score (highest first)
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

        # Calculate how many sentences to include
        n_sentences = max(2, int(len(sentences) * self.compression_ratio))

        # Create variables to track selection
        selected_indices = []
        selected_sentences = []
        covered_words = set()
        covered_entities = set()

        # Always include the highest scoring sentence
        top_idx, _ = sorted_scores[0]
        selected_indices.append(top_idx)
        selected_sentences.append(sentences[top_idx])
        covered_words.update(tokenized_sentences[top_idx])

        # Extract entities from the selected sentence
        entities = self.extract_entities(sentences[top_idx])
        covered_entities.update([e.lower() for e in entities])

        # Create a candidate list (excluding the already selected sentence)
        candidates = sorted_scores[1:]

        # Select remaining sentences
        while len(selected_indices) < n_sentences and candidates:
            best_idx = None
            best_score = -1
            best_redundancy = 1  # Lower is better

            for idx, base_score in candidates:
                current_sentence = sentences[idx].lower()
                current_tokens = set(tokenized_sentences[idx])

                # Extract entities from this sentence
                current_entities = [e.lower() for e in self.extract_entities(sentences[idx])]

                # Skip if too similar to already selected sentences
                too_similar = False
                for selected_sentence in selected_sentences:
                    similarity = self._sentence_similarity(current_sentence, selected_sentence.lower())
                    if similarity > 0.7:  # Threshold for redundancy
                        too_similar = True
                        break

                if too_similar:
                    continue

                # Calculate coverage gain (how many new words this adds)
                new_words = current_tokens - covered_words
                coverage_gain = len(new_words) / len(current_tokens) if current_tokens else 0

                # Calculate entity coverage gain
                new_entities = [e for e in current_entities if e not in covered_entities]
                entity_gain = len(new_entities) / max(1, len(current_entities))

                # Calculate redundancy
                redundancy = 1 - ((coverage_gain + entity_gain) / 2)

                # Calculate combined score with multiple factors
                combined_score = (base_score * 0.6) + (coverage_gain * 0.2) + (entity_gain * 0.2)

                # Select if better than current best
                if combined_score > best_score or (combined_score == best_score and redundancy < best_redundancy):
                    best_score = combined_score
                    best_redundancy = redundancy
                    best_idx = idx

            if best_idx is not None:
                # Add best sentence to selection
                selected_indices.append(best_idx)
                selected_sentences.append(sentences[best_idx])
                covered_words.update(tokenized_sentences[best_idx])
                covered_entities.update([e.lower() for e in self.extract_entities(sentences[best_idx])])

                # Remove selected sentence from candidates
                candidates = [(i, s) for i, s in candidates if i != best_idx]
            else:
                # If no suitable sentence is found, exit loop
                break

        # Sort selected indices by their position in the original document
        selected_indices.sort()

        return selected_indices

    def _sentence_similarity(self, sentence1, sentence2):
        """Calculate cosine similarity between two sentences"""
        # Tokenize
        words1 = sentence1.split()
        words2 = sentence2.split()

        # Create word sets
        set1 = set(words1)
        set2 = set(words2)

        # Calculate Jaccard similarity
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))

        return intersection / union if union > 0 else 0

    def _get_transition_phrase(self, current_idx, prev_idx, sentences):
        """Determine appropriate transition phrase based on relationship between sentences"""
        # If first sentence, no transition needed
        if prev_idx is None:
            return ""

        current_sentence = sentences[current_idx]
        prev_sentence = sentences[prev_idx]

        # Look for time references
        current_time_refs = self.extract_time_references(current_sentence)
        prev_time_refs = self.extract_time_references(prev_sentence)

        # Check for contrast indicators
        contrast_words = ["but", "however", "although", "though", "yet", "despite", "nevertheless", "on the other hand", "in contrast"]
        has_contrast = any(word in current_sentence.lower() for word in contrast_words)

        # Check for cause-effect indicators
        cause_words = ["because", "since", "as a result", "therefore", "thus", "consequently", "hence"]
        has_cause = any(word in current_sentence.lower() for word in cause_words)

        # Check for additional information
        addition_words = ["also", "additionally", "furthermore", "moreover", "in addition", "besides"]
        has_addition = any(word in current_sentence.lower() for word in addition_words)

        # Check for examples
        example_words = ["for example", "for instance", "such as", "like"]
        has_example = any(word in current_sentence.lower() for word in example_words)

        # Determine appropriate transition
        if has_contrast:
            return self.transition_phrases["contrast"][0] + ", "
        elif has_cause:
            return self.transition_phrases["cause"][0] + ", "
        elif has_addition:
            return self.transition_phrases["addition"][0] + ", "
        elif has_example:
            return self.transition_phrases["example"][0] + ", "
        elif current_time_refs and prev_time_refs:
            return self.transition_phrases["time"][0] + ", "
        else:
            # No specific relationship detected
            return ""

    def _simple_paraphrase(self, sentence):
        """Apply rule-based paraphrasing to simplify sentences"""
        paraphrased = sentence

        for pattern, replacement in self.paraphrase_rules.items():
            paraphrased = re.sub(pattern, replacement, paraphrased)

        return paraphrased

    def _enhance_coherence(self, selected_sentences, original_indices, sentences):
        """Enhance coherence by adding transition phrases and light paraphrasing"""
        coherent_summary = []
        prev_idx = None

        for i, sentence_idx in enumerate(original_indices):
            sentence = sentences[sentence_idx]

            # Simplify sentence through light paraphrasing
            simplified = self._simple_paraphrase(sentence)

            # Add appropriate transition phrase
            transition = self._get_transition_phrase(sentence_idx, prev_idx, sentences)

            # For first sentence, no transition needed
            if i == 0:
                coherent_summary.append(simplified)
            else:
                # Add transition phrase or use the original sentence
                if transition and not any(simplified.startswith(t) for t in sum(self.transition_phrases.values(), [])):
                    coherent_summary.append(transition + simplified[0].lower() + simplified[1:])
                else:
                    coherent_summary.append(simplified)

            prev_idx = sentence_idx

        return coherent_summary

    def summarize(self, text, title=None):
        """
        Generate a summary of the given text

        Parameters:
        -----------
        text : str
            The text to summarize
        title : str, optional
            The title of the text

        Returns:
        --------
        str
            The generated summary
        """
        # Preprocess text
        sentences, clean_sentences, tokenized_sentences = self.preprocess_text(text)

        if not sentences:
            return ""

        # Score sentences
        scores = self.optimized_ensemble_score(sentences, clean_sentences, tokenized_sentences, title)

        # Select sentences
        selected_indices = self.select_balanced_sentences(scores, sentences, tokenized_sentences)

        # Enhance coherence
        coherent_summary = self._enhance_coherence(
            [sentences[idx] for idx in selected_indices],
            selected_indices,
            sentences
        )

        # Join sentences into summary
        summary = " ".join(coherent_summary)

        return summary

    def evaluate_summary(self, original_text, reference_summary, generated_summary):
        """Evaluate summary without external ROUGE packages"""

        def get_ngrams(text, n):
            """Get n-grams from text"""
            words = re.findall(r'\b\w+\b', text.lower())
            ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
            return set(ngrams)

        # Get words and n-grams
        ref_words = set(re.findall(r'\b\w+\b', reference_summary.lower()))
        gen_words = set(re.findall(r'\b\w+\b', generated_summary.lower()))

        ref_bigrams = get_ngrams(reference_summary, 2)
        gen_bigrams = get_ngrams(generated_summary, 2)

        # Calculate basic overlap metrics
        word_overlap = len(ref_words.intersection(gen_words))
        bigram_overlap = len(ref_bigrams.intersection(gen_bigrams))

        # Calculate precision, recall, and F1
        rouge_1 = 2 * word_overlap / (len(ref_words) + len(gen_words)) if (len(ref_words) + len(gen_words)) > 0 else 0
        rouge_2 = 2 * bigram_overlap / (len(ref_bigrams) + len(gen_bigrams)) if (len(ref_bigrams) + len(gen_bigrams)) > 0 else 0
        rouge_l = (rouge_1 + rouge_2) / 2  # Simple approximation

        # Compression ratio
        original_words = len(re.findall(r'\b\w+\b', original_text))
        summary_words = len(re.findall(r'\b\w+\b', generated_summary))
        compression_ratio = summary_words / original_words if original_words > 0 else 0

        # Return metrics
        metrics = {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'compression_ratio': compression_ratio,
            'summary_word_count': summary_words,
            'original_word_count': original_words
        }

        return metrics

In [None]:
# Demo: summarize one sample article, print it, and score the result against
# a human-written reference summary.
if __name__ == "__main__":
    # Sample news article (raw text, passed to the summarizer unchanged)
    article = """
    Karnataka Deputy Chief Minister DK Shivakumar on Saturday urged women to gear up for the upcoming Assembly and Lok Sabha elections, emphasizing the impact of the Women's Reservation Bill.
According to news agency ANI, he was speaking at an International Women's Day event, he said, "The Women's Reservation Bill is expected to take effect in 2028. Prepare to contest the elections—no one can stop you in a democratic system."
Highlighting the growing role of women in governance, he noted that women hold power from the panchayat level to Parliament. "While there is already 50% reservation for women in Panchayats, many men still control decision-making through female family members. This will change as women become fully capable of leading on their own," he added.
He noted that the previous UPA government at the Centre had tried to pass the Women's Reservation Bill but could not do so due to "certain reasons."
"The Congress government under the leadership of Sonia Gandhi and Manmohan Singh was planning to bring a women's reservation bill but could not due to certain reasons. The Bill has been passed, and it will take effect for the upcoming Assembly and Lok Sabha elections. We are not sure who will have to lose the seats because of this new reservation," he said.
He said that women play an important role in their families and society.
"There are umpteen examples in history which highlight the importance of women in our society. Basavanna rightly called them punya sthree," he said.
"Women are making waves in all fields. They have the ability to rule the country in the future. Indira Gandhi had already set a precedent. Women's reservation bill changes a lot of things in the days to come," he added.
Shivakumar said that four of five guarantees of the Congress government in the State were "directly empowering women."
"Women's day events must be fully organised and managed by women. Male officials should not be associated with such events in the future. I have come to this region to take the blessings of women of Kalyana Karnataka, though there was a major event in Bengaluru," he said.
"Four of five guarantees are directly empowering women. Our government is committed to empowering women economically," he added."""

    # Human-written reference summary; it is not used for training here, only
    # as the reference passed to evaluate_summary() below
    human_summary = """Karnataka Deputy CM DK Shivakumar, speaking at an International Women's Day event, urged women to prepare for the upcoming Assembly and Lok Sabha elections, highlighting the impact of the Women's Reservation Bill, which is expected to take effect in 2028. He emphasized the growing role of women in governance and noted that while there is already 50% reservation for women in Panchayats, decision-making is still influenced by men. He acknowledged the previous UPA government's attempt to pass the bill and praised women's increasing influence in society. He also mentioned that four of five Congress government guarantees in the state empower women directly. """


    # Create the summarizer; compression_ratio is the ratio of sentences to
    # keep in the summary
    summarizer = EnhancedNewsArticleSummarizer(compression_ratio=0.3)

    # Generate summary (the title is forwarded to summarize() as extra input)
    title = "Karnataka Women Empowerment"
    summary = summarizer.summarize(article, title=title)

    print("Original Article:\n", article)
    print("\nGenerated Summary:\n", summary)


    # Evaluate summary quality against the human reference and print every
    # metric returned by evaluate_summary()
    metrics = summarizer.evaluate_summary(article, human_summary, summary)
    print("\nEvaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

Original Article:
 
    Karnataka Deputy Chief Minister DK Shivakumar on Saturday urged women to gear up for the upcoming Assembly and Lok Sabha elections, emphasizing the impact of the Women's Reservation Bill.
According to news agency ANI, he was speaking at an International Women's Day event, he said, "The Women's Reservation Bill is expected to take effect in 2028. Prepare to contest the elections—no one can stop you in a democratic system."
Highlighting the growing role of women in governance, he noted that women hold power from the panchayat level to Parliament. "While there is already 50% reservation for women in Panchayats, many men still control decision-making through female family members. This will change as women become fully capable of leading on their own," he added.
He noted that the previous UPA government at the Centre had tried to pass the Women's Reservation Bill but could not do so due to "certain reasons."
"The Congress government under the leadership of Sonia Gan