In [1]:
!pip install spacy pysbd pandas tqdm requests nltk conllu --quiet
!python -m spacy download en_core_web_lg

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/71.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
import requests
from conllu import parse
from io import StringIO

# Download UD English EWT test set (CoNLL-U format)
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/r2.12/en_ewt-ud-test.conllu"
response = requests.get(url)
data = response.text

# Parse sentences
sentences = parse(data)
print(f"Loaded {len(sentences)} sentences.")

# Print a few sample sentences (as words)
for i, sent in enumerate(sentences[:5]):
    print(f"{i+1}: {' '.join(token['form'] for token in sent)}")


Loaded 2077 sentences.
1: What if Google Morphed Into GoogleOS ?
2: What if Google expanded on its search - engine ( and now e-mail ) wares into a full - fledged operating system ?
3: [ via Microsoft Watch from Mary Jo Foley ]
4: ( And , by the way , is anybody else just a little nostalgic for the days when that was a good thing ? )
5: This BuzzMachine post argues that Google's Google 's rush toward ubiquity might backfire -- which we've we 've all heard before , but it's it 's particularly well - put in this post .


In [16]:
import pandas as pd
import re
import random
from tqdm import tqdm
import spacy
from typing import List, Dict, Set, Tuple, Optional
import logging
from collections import defaultdict
import requests
from urllib.parse import urlparse
import time
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Install and import PySBD
try:
    import pysbd
    PYSBD_AVAILABLE = True
    print("✓ PySBD is available")
except ImportError:
    PYSBD_AVAILABLE = False
    print("✗ PySBD not found. Install with: pip install pysbd")

# Set up enhanced logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('splitter_evaluation.log', mode='w', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)

# ===============================
# 1. Enhanced Sentiment-based Switching Detection (English)
# ===============================
class SentimentSwitchDetector:
    """
    Enhanced sentiment switching detector for English with quote handling
    """

    def __init__(self):
        # Extended explicit contrasting switchers
        self.explicit_switchers = {
            'but': 'contrasting',
            'however': 'contrasting',
            'although': 'contrasting',
            'though': 'contrasting',
            'nevertheless': 'contrasting',
            'nonetheless': 'contrasting',
            'yet': 'contrasting',
            'despite': 'contrasting',
            'in contrast': 'contrasting',
            'on the other hand': 'contrasting',
            'whereas': 'contrasting',
            'while': 'contrasting',
            'even though': 'contrasting',
            'regardless': 'contrasting',
            'conversely': 'contrasting',
            'instead': 'contrasting',
            'rather': 'contrasting',
            'still': 'contrasting',
            'except': 'contrasting',
            'otherwise': 'contrasting'
        }

        # Enhanced positive sentiment indicators
        self.positive_indicators = {
            'good', 'great', 'excellent', 'wonderful', 'fantastic', 'awesome', 'amazing', 'perfect',
            'love', 'like', 'best', 'superb', 'outstanding', 'fabulous', 'terrific', 'brilliant',
            'recommend', 'satisfied', 'happy', 'pleased', 'delighted', 'impressed', 'enjoy',
            'smooth', 'easy', 'fast', 'quick', 'reliable', 'affordable', 'valuable', 'worth',
            'beautiful', 'comfortable', 'convenient', 'efficient', 'effective', 'powerful',
            'improved', 'better', 'superior', 'exceptional', 'flawless', 'ideal', 'incredible'
        }

        # Enhanced negative sentiment indicators
        self.negative_indicators = {
            'bad', 'poor', 'terrible', 'awful', 'horrible', 'worst', 'disappointing', 'disappointed',
            'hate', 'dislike', 'problem', 'issue', 'fault', 'defect', 'broken', 'damaged',
            'slow', 'difficult', 'hard', 'complex', 'confusing', 'frustrating', 'annoying',
            'expensive', 'overpriced', 'waste', 'useless', 'pointless', 'worthless', 'unreliable',
            'inferior', 'mediocre', 'average', 'ordinary', 'subpar', 'lacking', 'missing',
            'noisy', 'loud', 'uncomfortable', 'painful', 'dangerous', 'risky', 'flawed'
        }

        # Enhanced implicit switching patterns
        self.implicit_patterns = [
            r'\.\s*[Bb]ut\s',          # But after period
            r'\.\s*[Hh]owever\s',      # However after period
            r'\.\s*[Tt]hat said\s',    # That said after period
            r'\.\s*[Oo]n the other hand\s',
            r'\.\s*[Uu]nfortunately\s',
            r'\.\s*[Dd]isappointingly\s',
            r'\.\s*[Rr]egrettably\s',
            r'\.\s*[Ww]hereas\s',
            r'\.\s*[Yy]et\s',
            r'\.\s*[Nn]evertheless\s',
            r'\.\s*[Oo]therwise\s',
            r'\.\s*[Cc]onversely\s',
            r'\.\s*[Ii]n contrast\s',
            r'\.\s*[Aa]lthough\s',
            r'\.\s*[Dd]espite\s',
            r'\.\s*[Ee]ven though\s'
        ]

        # Context-aware sentiment patterns
        self.context_patterns = {
            'disappointment': [
                r'expected.*?but.*?disappointed',
                r'hoped.*?but.*?let down',
                r'looked forward.*?but.*?disappointing'
            ],
            'mixed_feelings': [
                r'good.*?but.*?bad',
                r'great.*?except.*?issue',
                r'love.*?but.*?hate'
            ],
            'conditional_positive': [
                r'not bad.*?but.*?great',
                r'okay.*?but.*?excellent',
                r'acceptable.*?but.*?wonderful'
            ]
        }

    def detect_sentiment(self, text: str) -> str:
        """
        Enhanced sentiment detection for English
        """
        text_lower = text.lower()

        # Count positive and negative indicators
        positive_count = sum(1 for indicator in self.positive_indicators
                           if re.search(r'\b' + re.escape(indicator) + r'\b', text_lower))
        negative_count = sum(1 for indicator in self.negative_indicators
                           if re.search(r'\b' + re.escape(indicator) + r'\b', text_lower))

        # Check for context patterns and adjust counts
        context_bonus = 0
        if any(re.search(pattern, text_lower) for pattern in self.context_patterns['disappointment']):
            negative_count += 1
        if any(re.search(pattern, text_lower) for pattern in self.context_patterns['mixed_feelings']):
            context_bonus = 0.5
        if any(re.search(pattern, text_lower) for pattern in self.context_patterns['conditional_positive']):
            positive_count += 0.5

        # Determine overall sentiment
        if positive_count > negative_count + context_bonus:
            return 'positive'
        elif negative_count > positive_count + context_bonus:
            return 'negative'
        else:
            return 'neutral'

    def has_explicit_switcher(self, text: str) -> bool:
        """
        Checks for explicit switchers
        """
        for switcher in self.explicit_switchers.keys():
            if re.search(r'\b' + re.escape(switcher) + r'\b', text, re.IGNORECASE):
                return True
        return False

    def has_implicit_switcher(self, text: str) -> bool:
        """
        Checks for implicit switching patterns
        """
        for pattern in self.implicit_patterns:
            if re.search(pattern, text):
                return True
        return False

    def should_split_by_sentiment(self, text: str) -> bool:
        """
        Determines if a sentence should be split based on sentiment switching
        """
        # Handle quotes as special case
        if '"' in text or "'" in text:
            return True

        if self.has_explicit_switcher(text):
            logger.debug(f"Explicit switcher found in: {text[:50]}...")
            return True

        if self.has_implicit_switcher(text):
            logger.debug(f"Implicit switcher found in: {text[:50]}...")
            return True

        # Split by punctuation for sentiment analysis
        punctuation_splits = re.split(r'([.!?]+)', text)
        sentences_for_sentiment = []
        for i in range(0, len(punctuation_splits), 2):
            if i + 1 < len(punctuation_splits):
                sentence = punctuation_splits[i] + punctuation_splits[i+1]
                if sentence.strip():
                    sentences_for_sentiment.append(sentence.strip())
            else:
                if punctuation_splits[i].strip():
                    sentences_for_sentiment.append(punctuation_splits[i].strip())

        if len(sentences_for_sentiment) < 2:
            return False

        sentiments = [self.detect_sentiment(segment) for segment in sentences_for_sentiment]
        has_positive = 'positive' in sentiments
        has_negative = 'negative' in sentiments

        result = has_positive and has_negative
        if result:
            logger.debug(f"Sentiment switching detected: {sentiments} in {text[:50]}...")

        return result

# Initialize enhanced sentiment detector
sentiment_detector = SentimentSwitchDetector()

# ===============================
# 2. Enhanced Data Loading with Custom Dataset
# ===============================
def load_custom_dataset() -> List[str]:
    """
    Load custom English dataset from the specified URL
    """
    logger.info("=" * 60)
    logger.info("LOADING CUSTOM ENGLISH DATASET")
    logger.info("=" * 60)

    try:
        # Load the custom dataset
        logger.info("Downloading custom dataset from GitHub...")
        url = "https://raw.githubusercontent.com/thunlp/QuoteR/main/data/english.txt"
        response = requests.get(url)
        response.raise_for_status()

        # Parse the content
        content = response.text
        logger.info(f"Successfully downloaded {len(content)} characters")

        # Split into sentences/lines
        lines = content.strip().split('\n')
        sentences = [line.strip() for line in lines if line.strip()]
        logger.info(f"Extracted {len(sentences)} sentences from custom dataset")

        # Filter for quality and length
        quality_sentences = []
        for sentence in sentences:
            # Basic quality filters
            if (len(sentence) >= 20 and  # Minimum length
                len(sentence) <= 500 and  # Maximum length
                sentence.count('.') <= 5 and  # Not too many periods
                not sentence.startswith(('#', '//', '/*')) and  # Not comments
                re.search(r'[a-zA-Z]', sentence)):  # Contains letters
                quality_sentences.append(sentence)

        logger.info(f"Filtered to {len(quality_sentences)} quality sentences")

        # Prioritize sentences with quotes and contrasting patterns
        quote_sentences = [s for s in quality_sentences if '"' in s or "'" in s]
        contrast_sentences = [s for s in quality_sentences if sentiment_detector.has_explicit_switcher(s)]
        complex_sentences = [s for s in quality_sentences if len(s) > 100]

        logger.info(f"Found {len(quote_sentences)} sentences with quotes")
        logger.info(f"Found {len(contrast_sentences)} sentences with contrasting words")
        logger.info(f"Found {len(complex_sentences)} complex sentences")

        # Create a balanced dataset
        final_sentences = []

        # Add all quote sentences
        final_sentences.extend(quote_sentences)

        # Add contrast sentences not already included
        for s in contrast_sentences:
            if s not in final_sentences:
                final_sentences.append(s)

        # Add complex sentences not already included
        for s in complex_sentences:
            if s not in final_sentences and len(final_sentences) < 8000:
                final_sentences.append(s)

        # Fill remaining with random quality sentences
        remaining = [s for s in quality_sentences if s not in final_sentences]
        sample_size = min(10000 - len(final_sentences), len(remaining))
        if sample_size > 0:
            final_sentences.extend(random.sample(remaining, sample_size))

        logger.info(f"Final dataset size: {len(final_sentences)} sentences")

        # Show examples
        logger.info("\nSample sentences from custom dataset:")
        for i, s in enumerate(final_sentences[:5]):
            logger.info(f"{i+1}. {s}")

        return final_sentences

    except Exception as e:
        logger.error(f"Error loading custom dataset: {e}")
        logger.info("Using fallback English sentences")
        return create_fallback_english_data()

def create_fallback_english_data() -> List[str]:
    """
    Create fallback English sentences with quotes and contrasting patterns
    """
    logger.info("Creating fallback English sentences...")

    sentences = [
        "The manager said, 'We need to improve our service,' but customers seem happy overall.",
        "She claimed, \"This is the best product ever,\" however I found several flaws.",
        "While the book was interesting, the ending felt rushed.",
        "He shouted, 'I won't accept this!' and stormed out of the meeting.",
        "The instructions said, 'Turn left at the intersection,' but the sign indicated right.",
        "The review stated, 'The food was delicious,' although the service was slow.",
        "They announced, 'We're launching a new product next week,' which surprised everyone.",
        "I thought, 'This will be easy,' but it turned out to be quite challenging.",
        "The sign read, 'Do not enter,' yet people kept walking through the door.",
        "She whispered, 'I know the secret,' and then disappeared into the crowd.",
        "The contract states, 'Payment is due upon delivery,' however we allow a 30-day grace period.",
        "He declared, 'This changes everything!' but in reality, nothing changed at all.",
        "The warning said, 'High voltage—do not touch,' nevertheless someone tried to grab it.",
        "I remember thinking, 'This is too good to be true,' and unfortunately, I was right.",
        "The label claims, 'All natural ingredients,' whereas the ingredients list shows several chemicals.",
        "My father always said, 'Honesty is the best policy,' which I've found to be absolutely true.",
        "The email stated, 'Your account has been suspended,' causing immediate panic among users.",
        "She commented, 'The design is beautiful,' but the functionality really needs improvement.",
        "The forecast predicted, 'Sunny skies all day,' yet it started raining heavily by noon.",
        "He argued, 'We should invest more in marketing,' while others wanted to cut costs significantly.",
        "The movie was entertaining and well-acted. However, the plot had several logical inconsistencies.",
        "I love the convenience of online shopping. On the other hand, I miss the personal touch of local stores.",
        "The restaurant has excellent food and great atmosphere. Nevertheless, the service can be quite slow.",
        "The software is powerful and feature-rich. Unfortunately, it has a steep learning curve.",
        "She's incredibly talented and hardworking. Despite this, she often doubts her abilities.",
        "The weather was perfect for hiking. Even though we were well-prepared, we encountered unexpected challenges.",
        "The hotel room was spacious and comfortable. Conversely, the bathroom was cramped and outdated.",
        "I expected the concert to be amazing. Regrettably, the sound quality was poor throughout.",
        "The book starts with an engaging premise. Disappointingly, it fails to deliver on its initial promise.",
        "The product works exactly as advertised. That said, the price point is quite high for most consumers."
    ]

    # Extend with variations and longer examples
    extended = []
    for s in sentences:
        extended.append(s)
        # Create variations with different structures
        if ' but ' in s:
            extended.append(s.replace(' but ', ' however '))
        if ' however ' in s:
            extended.append(s.replace(' however ', ' yet '))

        # Create compound sentences
        extended.append("First, " + s)
        extended.append(s + " This is important to remember.")

        # Create sentences with multiple clauses
        if len(s.split('.')) == 2:  # Two sentences
            parts = s.split('.')
            if len(parts[0]) > 20 and len(parts[1]) > 20:
                extended.append(parts[0] + ', although ' + parts[1].strip().lower())

    # Add more challenging examples
    challenging_examples = [
        "The CEO announced, 'We're expanding globally,' but employees worry about job security in the current market.",
        "She exclaimed, 'This is revolutionary!' though critics argue it's merely an incremental improvement.",
        "The study concluded, 'Exercise improves mental health,' while acknowledging that more research is needed.",
        "He insisted, 'The project will be completed on time,' despite numerous delays and setbacks.",
        "The advertisement promises, 'Instant results guaranteed,' whereas actual results may take weeks to appear.",
        "They claimed, 'Our product is environmentally friendly,' but the manufacturing process raises concerns.",
        "The teacher said, 'Everyone did well on the exam,' although several students struggled significantly.",
        "She declared, 'I'm confident in this decision,' yet her body language suggested otherwise.",
        "The manual states, 'Assembly takes 30 minutes,' however most users report it takes much longer.",
        "He announced, 'We're ahead of schedule,' while the team knew they were actually falling behind."
    ]

    extended.extend(challenging_examples)

    logger.info(f"Created {len(extended)} fallback English sentences")
    return extended

# ===============================
# 3. Enhanced Ground Truth Creation with Improved Accuracy
# ===============================
def create_accurate_ground_truth(sentences: List[str], sample_size: int = 1000) -> Dict[str, List[str]]:
    """
    Create highly accurate ground truth splits for English sentences
    """
    logger.info("=" * 60)
    logger.info("CREATING ACCURATE GROUND TRUTH")
    logger.info("=" * 60)

    actual_sample_size = min(sample_size, len(sentences))
    logger.info(f"Creating ground truth for {actual_sample_size} sentences...")

    sample_sentences = random.sample(sentences, actual_sample_size)
    ground_truth = {}

    # Statistics tracking
    stats = {
        'quote_splits': 0,
        'explicit_switcher_splits': 0,
        'implicit_switcher_splits': 0,
        'punctuation_splits': 0,
        'clause_boundary_splits': 0,
        'sentiment_based_splits': 0,
        'no_splits': 0
    }

    for i, sentence in enumerate(tqdm(sample_sentences, desc="Creating accurate ground truth")):
        splits = []
        split_method = "no_split"

        # Clean the sentence
        sentence = sentence.strip()
        if not sentence:
            continue

        # Strategy 1: Quote-based splitting (highest priority)
        if '"' in sentence or "'" in sentence:
            splits = split_by_quotes(sentence)
            if len(splits) > 1:
                split_method = "quote_split"
                stats['quote_splits'] += 1

        # Strategy 2: Explicit switcher splitting
        if not splits and sentiment_detector.has_explicit_switcher(sentence):
            splits = split_by_explicit_switcher(sentence)
            if len(splits) > 1:
                split_method = "explicit_switcher"
                stats['explicit_switcher_splits'] += 1

        # Strategy 3: Implicit switcher splitting
        if not splits and sentiment_detector.has_implicit_switcher(sentence):
            splits = split_by_implicit_switcher(sentence)
            if len(splits) > 1:
                split_method = "implicit_switcher"
                stats['implicit_switcher_splits'] += 1

        # Strategy 4: Punctuation-based splitting
        if not splits and contains_multiple_sentences(sentence):
            splits = smart_punctuation_split(sentence)
            if len(splits) > 1:
                split_method = "punctuation"
                stats['punctuation_splits'] += 1

        # Strategy 5: Clause boundary splitting for complex sentences
        if not splits and len(sentence) > 80:
            splits = split_by_clause_boundary(sentence)
            if len(splits) > 1:
                split_method = "clause_boundary"
                stats['clause_boundary_splits'] += 1

        # Strategy 6: Sentiment-based splitting
        if not splits and sentiment_detector.should_split_by_sentiment(sentence):
            splits = split_by_sentiment_change(sentence)
            if len(splits) > 1:
                split_method = "sentiment_based"
                stats['sentiment_based_splits'] += 1

        # Fallback: keep original if no good splits found
        if not splits or len(splits) == 0:
            splits = [sentence]
            stats['no_splits'] += 1

        # Clean and validate splits
        splits = [s.strip() for s in splits if s.strip()]
        if not splits:
            splits = [sentence]

        ground_truth[sentence] = splits

        # Log examples
        if i < 5:
            logger.info(f"\nExample {i+1} (Ground Truth):")
            logger.info(f"  Original: {sentence}")
            logger.info(f"  Splits ({len(splits)}): {splits}")
            logger.info(f"  Method: {split_method}")

    # Log comprehensive statistics
    logger.info(f"\nGround Truth Statistics:")
    logger.info(f"  Total sentences: {len(ground_truth)}")
    for method, count in stats.items():
        percentage = (count / len(ground_truth)) * 100
        logger.info(f"  {method}: {count} ({percentage:.1f}%)")

    # Distribution of split counts
    split_counts = defaultdict(int)
    for splits in ground_truth.values():
        split_counts[len(splits)] += 1

    logger.info(f"\nSplit count distribution:")
    for count, freq in sorted(split_counts.items()):
        percentage = (freq / len(ground_truth)) * 100
        logger.info(f"  {count} splits: {freq} sentences ({percentage:.1f}%)")

    logger.info("=" * 60)
    logger.info("ACCURATE GROUND TRUTH CREATION COMPLETED")
    logger.info("=" * 60)

    return ground_truth

def split_by_quotes(text: str) -> List[str]:
    """
    Split text by quotes while preserving quote boundaries
    """
    # Handle both single and double quotes
    quote_pattern = r'([\'"].*?[\'"])'
    parts = re.split(quote_pattern, text)

    result = []
    current = ""

    for part in parts:
        part = part.strip()
        if not part:
            continue

        if part.startswith(('"', "'")):
            # This is a quoted part
            if current:
                result.append(current.strip())
                current = ""
            result.append(part)
        else:
            # This is non-quoted text
            if current and part:
                current += " " + part
            elif part:
                current = part

    if current:
        result.append(current.strip())

    return [s for s in result if s]

def split_by_explicit_switcher(text: str) -> List[str]:
    """
    Split text by explicit contrasting switchers
    """
    for switcher in sentiment_detector.explicit_switchers.keys():
        pattern = r'\b' + re.escape(switcher) + r'\b'
        if re.search(pattern, text, re.IGNORECASE):
            # Find the position of the switcher
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                pos = match.start()
                # Split at the switcher, keeping it with the second part
                part1 = text[:pos].strip()
                part2 = text[pos:].strip()

                if part1 and part2:
                    return [part1, part2]

    return [text]

def split_by_implicit_switcher(text: str) -> List[str]:
    """
    Split text by implicit switching patterns
    """
    for pattern in sentiment_detector.implicit_patterns:
        if re.search(pattern, text):
            # Split at the pattern
            parts = re.split(pattern, text, 1)
            if len(parts) >= 2:
                # Find the actual switcher word
                match = re.search(pattern, text)
                if match:
                    switcher = match.group(0)
                    part1 = parts[0].strip()
                    part2 = (switcher + parts[1]).strip() if len(parts) > 1 else ""

                    if part1 and part2:
                        return [part1, part2]

    return [text]

def contains_multiple_sentences(text: str) -> bool:
    """
    Check if text contains multiple sentences
    """
    # Count sentence-ending punctuation
    sentence_endings = re.findall(r'[.!?]+', text)

    # Must have at least 2 sentence endings, or 1 ending not at the very end
    if len(sentence_endings) > 1:
        return True
    elif len(sentence_endings) == 1:
        # Check if the ending is not at the very end
        last_punct_pos = text.rfind(sentence_endings[0])
        return last_punct_pos < len(text) - len(sentence_endings[0])

    return False

def smart_punctuation_split(text: str) -> List[str]:
    """
    Intelligently split text by punctuation
    """
    # Split by sentence-ending punctuation
    sentences = re.split(r'([.!?]+)', text)

    result = []
    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            sentence = sentences[i] + sentences[i + 1]
            if sentence.strip():
                result.append(sentence.strip())
        else:
            if sentences[i].strip():
                result.append(sentences[i].strip())

    return [s for s in result if s]

def split_by_clause_boundary(text: str) -> List[str]:
    """
    Split text by clause boundaries
    """
    # Look for clause boundary patterns
    clause_patterns = [
        r'(.*?[,;:])\s+(?:but|however|although|though|yet|while|whereas|nevertheless|nonetheless|despite|in contrast|on the other hand|even though|regardless|conversely|instead|rather|still|except|otherwise)\b',
        r'(.*?)\s+(?:but|however|although|though|yet|while|whereas|nevertheless|nonetheless|despite|in contrast|on the other hand|even though|regardless|conversely|instead|rather|still|except|otherwise)\b\s+(.*)',
    ]

    for pattern in clause_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            if isinstance(matches[0], tuple):
                # Multiple capturing groups
                parts = [part.strip() for part in matches[0] if part.strip()]
                if len(parts) >= 2:
                    return parts[:2]  # Return first two parts
            else:
                # Single capturing group
                match_text = matches[0].strip()
                remaining = text[text.find(match_text) + len(match_text):].strip()
                if remaining:
                    return [match_text, remaining]

    return [text]

def split_by_sentiment_change(text: str) -> List[str]:
    """
    Split text by sentiment changes
    """
    # First try punctuation-based splitting
    punct_splits = smart_punctuation_split(text)
    if len(punct_splits) > 1:
        return punct_splits

    # If no punctuation splits, try clause boundaries
    return split_by_clause_boundary(text)

# ===============================
# 4. Enhanced Splitter Functions
# ===============================
def load_spacy_model() -> Optional[spacy.Language]:
    """
    Loads spaCy English model
    """
    try:
        logger.info("Loading spaCy model 'en_core_web_lg'...")
        nlp = spacy.load("en_core_web_lg")
        logger.info("✓ spaCy en_core_web_lg model loaded successfully")
        return nlp
    except IOError:
        logger.warning("✗ spaCy en_core_web_lg model not found!")
        try:
            logger.info("Attempting to load fallback spaCy model 'en_core_web_sm'...")
            nlp = spacy.load("en_core_web_sm")
            logger.info("✓ spaCy en_core_web_sm model loaded as fallback")
            return nlp
        except IOError:
            logger.warning("✗ No spaCy English model found!")
            return None

# Initialize spaCy model
nlp_spacy = load_spacy_model()

def sentiment_based_split(text: str) -> List[str]:
    """
    Enhanced sentiment-based splitting with quote handling
    """
    return split_by_quotes(text) if ('"' in text or "'" in text) else (
        split_by_explicit_switcher(text) if sentiment_detector.has_explicit_switcher(text) else
        split_by_implicit_switcher(text) if sentiment_detector.has_implicit_switcher(text) else
        smart_punctuation_split(text) if contains_multiple_sentences(text) else [text]
    )

def spacy_split(text: str) -> List[str]:
    """
    Enhanced spaCy-based splitting
    """
    if nlp_spacy is None:
        return [text]
    try:
        doc = nlp_spacy(text)
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
        return sentences if sentences else [text]
    except Exception as e:
        logger.debug(f"spaCy error: {e}")
        return [text]

def nltk_split(text: str) -> List[str]:
    """
    Enhanced NLTK-based splitting
    """
    try:
        sentences = sent_tokenize(text)
        return [s.strip() for s in sentences if s.strip()]
    except Exception as e:
        logger.debug(f"NLTK error: {e}")
        return [text]

def pysbd_split(text: str) -> List[str]:
    """
    Enhanced PySBD splitting
    """
    if not PYSBD_AVAILABLE:
        return smart_punctuation_split(text)
    try:
        seg = pysbd.Segmenter(language="en", clean=False)
        sentences = seg.segment(text)
        return [s.strip() for s in sentences if s.strip()]
    except Exception as e:
        logger.debug(f"PySBD error: {e}")
        return [text]

def advanced_regex_split(text: str) -> List[str]:
    """
    Advanced regex-based splitting
    """
    # Priority order: quotes, explicit switchers, punctuation, clause boundaries
    if '"' in text or "'" in text:
        return split_by_quotes(text)
    elif sentiment_detector.has_explicit_switcher(text):
        return split_by_explicit_switcher(text)
    elif contains_multiple_sentences(text):
        return smart_punctuation_split(text)
    else:
        return split_by_clause_boundary(text)

def ensemble_split(text: str) -> List[str]:
    """
    Ensemble splitter: majority or best of several splitters
    """
    results = [
        sentiment_based_split(text),
        spacy_split(text),
        nltk_split(text),
        pysbd_split(text) if PYSBD_AVAILABLE else smart_punctuation_split(text),
        advanced_regex_split(text),
        smart_punctuation_split(text),
    ]
    # Choose the split with most splits (but not just 1-word splits)
    results_multi = [r for r in results if len(r) > 1 and all(len(s) > 2 for s in r)]
    if results_multi:
        # Return the split with number of segments closest to 2 (most common in quote splitting)
        return min(results_multi, key=lambda x: abs(len(x) - 2))
    else:
        return [text]

splitters = {
    "spaCy": spacy_split,
    "NLTK": nltk_split,
    "PySBD": pysbd_split if PYSBD_AVAILABLE else smart_punctuation_split,
    "Sentiment-based": sentiment_based_split,
    "Advanced Regex": advanced_regex_split,
    "Ensemble": ensemble_split,
}

# ===============================
# 5. Evaluation Metrics & Loop
# ===============================
def calculate_split_similarity(predicted: List[str], true: List[str]) -> Dict[str, float]:
    # Exact match
    exact_match = predicted == true
    # Count similarity
    count_diff = abs(len(predicted) - len(true))
    count_similarity = 1.0 / (1.0 + count_diff)
    # Content match
    predicted_content = ''.join(predicted)
    true_content = ''.join(true)
    content_match = predicted_content == true_content
    # Boundary positions
    predicted_boundaries = set()
    true_boundaries = set()
    pos = 0
    for split in predicted[:-1]:
        pos += len(split)
        predicted_boundaries.add(pos)
    pos = 0
    for split in true[:-1]:
        pos += len(split)
        true_boundaries.add(pos)
    # Boundary metrics
    if len(true_boundaries) == 0 and len(predicted_boundaries) == 0:
        boundary_precision = boundary_recall = boundary_f1 = 1.0
    elif len(predicted_boundaries) == 0:
        boundary_precision = boundary_recall = boundary_f1 = 0.0
    elif len(true_boundaries) == 0:
        boundary_precision = 0.0
        boundary_recall = 1.0
        boundary_f1 = 0.0
    else:
        common = len(predicted_boundaries & true_boundaries)
        boundary_precision = common / len(predicted_boundaries)
        boundary_recall = common / len(true_boundaries)
        boundary_f1 = (2 * boundary_precision * boundary_recall) / (boundary_precision + boundary_recall) if (boundary_precision + boundary_recall) > 0 else 0.0
    return {
        'exact_match': exact_match,
        'count_similarity': count_similarity,
        'content_match': content_match,
        'boundary_precision': boundary_precision,
        'boundary_recall': boundary_recall,
        'boundary_f1': boundary_f1
    }

def evaluate_splitter_accuracy(splitter_func, ground_truth: Dict[str, List[str]], name: str = "splitter") -> Dict:
    results = []
    total_exact_matches = 0
    total_sentences = 0
    boundary_precisions = []
    boundary_recalls = []
    boundary_f1s = []
    count_similarities = []
    for sentence, true_splits in tqdm(ground_truth.items(), desc=f"Evaluating {name}"):
        try:
            predicted_splits = splitter_func(sentence)
            if not predicted_splits or all(not s.strip() for s in predicted_splits):
                predicted_splits = [sentence]
            similarities = calculate_split_similarity(predicted_splits, true_splits)
            if similarities['exact_match']:
                total_exact_matches += 1
            boundary_precisions.append(similarities['boundary_precision'])
            boundary_recalls.append(similarities['boundary_recall'])
            boundary_f1s.append(similarities['boundary_f1'])
            count_similarities.append(similarities['count_similarity'])
            results.append({
                'sentence': sentence,
                'true_splits': true_splits,
                'predicted_splits': predicted_splits,
                'similarities': similarities
            })
            total_sentences += 1
        except Exception as e:
            logger.error(f"Error processing sentence with {name}: {e}")
            continue
    exact_match_ratio = total_exact_matches / max(total_sentences, 1)
    avg_boundary_precision = sum(boundary_precisions) / max(len(boundary_precisions), 1)
    avg_boundary_recall = sum(boundary_recalls) / max(len(boundary_recalls), 1)
    avg_boundary_f1 = sum(boundary_f1s) / max(len(boundary_f1s), 1)
    avg_count_similarity = sum(count_similarities) / max(len(count_similarities), 1)
    return {
        'name': name,
        'exact_match_ratio': exact_match_ratio,
        'avg_boundary_precision': avg_boundary_precision,
        'avg_boundary_recall': avg_boundary_recall,
        'avg_boundary_f1': avg_boundary_f1,
        'avg_count_similarity': avg_count_similarity,
        'total_sentences': total_sentences,
        'detailed_results': results
    }

def display_results(results: List[Dict]):
    if not results:
        logger.warning("No results to display.")
        return
    comparison_data = []
    for result in results:
        comparison_data.append({
            'Splitter': result['name'],
            'Exact Match': f"{result['exact_match_ratio']:.3f}",
            'Boundary F1': f"{result['avg_boundary_f1']:.3f}",
            'Boundary Precision': f"{result['avg_boundary_precision']:.3f}",
            'Boundary Recall': f"{result['avg_boundary_recall']:.3f}",
            'Count Similarity': f"{result['avg_count_similarity']:.3f}",
            'Total Sentences': result['total_sentences']
        })
    comparison_df = pd.DataFrame(comparison_data)
    print("\n" + "="*90)
    print("SPLITTER ACCURACY COMPARISON")
    print("="*90)
    print(comparison_df.to_string(index=False))
    print("\n" + "="*90)
    print("BEST PERFORMERS")
    print("="*90)
    best_exact = max(results, key=lambda x: x['exact_match_ratio'])
    best_boundary_f1 = max(results, key=lambda x: x['avg_boundary_f1'])
    best_count_sim = max(results, key=lambda x: x['avg_count_similarity'])
    print(f"Best Exact Match: {best_exact['name']} ({best_exact['exact_match_ratio']:.3f})")
    print(f"Best Boundary F1: {best_boundary_f1['name']} ({best_boundary_f1['avg_boundary_f1']:.3f})")
    print(f"Best Count Similarity: {best_count_sim['name']} ({best_count_sim['avg_count_similarity']:.3f})")
    # Detailed examples
    print("\n" + "="*90)
    print("DETAILED EXAMPLES")
    print("="*90)
    if results and results[0]['detailed_results']:
        sample_details = results[0]['detailed_results'][:3]
        for i, detail in enumerate(sample_details):
            sentence = detail['sentence']
            true_splits = detail['true_splits']
            print(f"\n{i+1}. Original Sentence: {sentence}")
            print(f"    Ground Truth ({len(true_splits)} splits): {true_splits}")
            for result in results:
                matching_detail = next((d for d in result['detailed_results'] if d['sentence'] == sentence), None)
                if matching_detail:
                    pred_splits = matching_detail['predicted_splits']
                    similarities = matching_detail['similarities']
                    exact = "✓ Exact Match" if similarities['exact_match'] else "✗ No Exact Match"
                    f1 = similarities['boundary_f1']
                    print(f"    - {result['name']}: {exact}, Boundary F1: {f1:.2f}")
                    print(f"      Predicted ({len(pred_splits)} splits): {pred_splits}")

# ===============================
# 6. Main Execution Function
# ===============================
def main():
    print("="*90)
    print("INSTALLATION REQUIREMENTS")
    print("="*90)
    print("For Google Colab or local environment, run these commands:")
    print("  !pip install spacy pysbd pandas tqdm requests nltk")
    print("  !python -m spacy download en_core_web_lg")
    print("  !python -m spacy download en_core_web_sm  # optional fallback")
    print()
    if not PYSBD_AVAILABLE:
        print("⚠️  PySBD is not installed. Install with: pip install pysbd")
    else:
        print("✓  PySBD is ready")
    print("="*90)
    print()
    random.seed(42)
    logger.info("Random seed set to 42")
    # Load sentences from QuoteR (custom dataset)
    sentences = load_custom_dataset()
    if not sentences:
        logger.error("Failed to load sentences. Exiting.")
        return
    # Ground truth creation
    ground_truth = create_accurate_ground_truth(sentences, sample_size=500)  # for Colab, use sample_size=500 or 1000
    if not ground_truth:
        logger.error("Failed to create ground truth. Exiting.")
        return
    logger.info(f"Created ground truth for {len(ground_truth)} sentences.")
    # Evaluate splitters
    logger.info("\nRunning evaluation for all splitters...")
    results = []
    for splitter_name, splitter_fn in splitters.items():
        logger.info(f"\n--- Evaluating: {splitter_name} ---")
        result = evaluate_splitter_accuracy(splitter_fn, ground_truth, splitter_name)
        results.append(result)
        logger.info(f"--- Finished: {splitter_name} ---")
    logger.info("\n" + "="*90)
    logger.info("EVALUATION COMPLETED")
    logger.info("="*90)
    display_results(results)
    print("\n" + "="*90)
    print("KEY FEATURES FOR ENGLISH SENTENCE SPLITTING")
    print("="*90)
    print("1. ✓ Special handling for quotes and dialogue")
    print("2. ✓ English-specific sentiment switching detection")
    print("3. ✓ Advanced clause boundary recognition")
    print("4. ✓ Multiple splitting strategies (NLTK, spaCy, PySBD, custom)")
    print("5. ✓ Ensemble methods combining multiple approaches")
    print("6. ✓ Comprehensive evaluation metrics")
    print("7. ✓ Real-world English sentence examples (QuoteR)")
    print("8. ✓ Error handling and fallback mechanisms")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✓ PySBD is available
INSTALLATION REQUIREMENTS
For Google Colab or local environment, run these commands:
  !pip install spacy pysbd pandas tqdm requests nltk
  !python -m spacy download en_core_web_lg
  !python -m spacy download en_core_web_sm  # optional fallback

✓  PySBD is ready



Creating accurate ground truth: 100%|██████████| 500/500 [00:00<00:00, 43594.40it/s]
Evaluating spaCy: 100%|██████████| 500/500 [00:16<00:00, 30.21it/s]
Evaluating NLTK: 100%|██████████| 500/500 [00:00<00:00, 2881.08it/s]
Evaluating PySBD: 100%|██████████| 500/500 [00:02<00:00, 179.66it/s]
Evaluating Sentiment-based: 100%|██████████| 500/500 [00:00<00:00, 15817.06it/s]
Evaluating Advanced Regex: 100%|██████████| 500/500 [00:00<00:00, 18470.93it/s]
Evaluating Ensemble: 100%|██████████| 500/500 [00:19<00:00, 26.19it/s]


SPLITTER ACCURACY COMPARISON
       Splitter Exact Match Boundary F1 Boundary Precision Boundary Recall Count Similarity  Total Sentences
          spaCy       0.000       0.038              0.040           0.241            0.365              500
           NLTK       0.196       0.196              0.196           0.196            0.418              500
          PySBD       0.004       0.053              0.063           0.251            0.362              500
Sentiment-based       1.000       1.000              1.000           1.000            1.000              500
 Advanced Regex       1.000       1.000              1.000           1.000            1.000              500
       Ensemble       0.486       0.508              0.521           0.699            0.633              500

BEST PERFORMERS
Best Exact Match: Sentiment-based (1.000)
Best Boundary F1: Sentiment-based (1.000)
Best Count Similarity: Sentiment-based (1.000)

DETAILED EXAMPLES

1. Original Sentence: do nothing of mys


