In [1]:
# Notebook shell escape: installs NLTK, which provides the sent_tokenize
# function imported in the next cell.
# NOTE(review): sent_tokenize presumably also needs the Punkt tokenizer data
# (fetched via nltk.download) — confirm that is done elsewhere.
!pip install nltk



In [22]:
import re
import logging
from nltk.tokenize import sent_tokenize


# Section names that may act as an abstract or one of its components.
_SECTION_KEYWORDS = (
    'abstract',
    'background',
    'introduction',
    'methods',
    'results',
    'conclusion',
)

# Section names that mark the end of the abstract-like region.
_END_KEYWORDS = (
    'keywords',
    'references',
    'acknowledgements',
    'conflict of interest',
    'author contributions',
)

# Compiled once at import time; matching is case-insensitive on whole words.
_SECTION_REGEX = re.compile(
    '|'.join(r'\b' + keyword + r'\b' for keyword in _SECTION_KEYWORDS),
    re.IGNORECASE,
)
_END_REGEX = re.compile(
    '|'.join(r'\b' + keyword + r'\b' for keyword in _END_KEYWORDS),
    re.IGNORECASE,
)

# A keyword only counts as a *heading* (and is removed from the text) when it
# starts a line and is followed by ':' / '.' / a dash, by the end of the line,
# or by a capitalised word or digit.  Keywords used in running prose
# ("the results show ...", "Results show ...") are left untouched.
_HEADER_REGEX = re.compile(
    r'^[ \t]*(?i:' + '|'.join(_SECTION_KEYWORDS) + r')\b'
    r'(?:[ \t]*[:.\u2013\u2014]+[ \t]*|[ \t]*$\n?|[ \t]+(?=[A-Z0-9]))',
    re.MULTILINE,
)


def extract_abstract_or_equivalent(text, *, max_words=250, min_words=150):
    """Extract the abstract (or an equivalent leading section) from ``text``.

    The text is split into sentences. Collection starts at the first sentence
    that mentions a section keyword (abstract, background, introduction,
    methods, results, conclusion) and stops at the first later sentence that
    mentions an end marker (keywords, references, ...) or when the word budget
    would be exceeded. If nothing was collected, the first five sentences are
    used as a fallback, subject to the same budget.

    Args:
        text: Document text. ``None``, empty and whitespace-only input yield
            ``None`` without tokenizing.
        max_words: Upper bound on the number of words in the result; a
            sentence that would push the total past it ends the extraction.
        min_words: Results with fewer words than this are rejected.

    Returns:
        The collected sentences joined by single spaces, or ``None`` when
        nothing usable (at least ``min_words`` words) could be extracted.
    """
    if not text or not text.strip():
        logging.warning("Extracted text too short: %d words", 0)
        return None

    sentences = sent_tokenize(text)

    collected = []
    components = set()
    in_section = False
    word_count = 0

    for sentence in sentences:
        match = _SECTION_REGEX.search(sentence)
        if match:
            in_section = True
            component = match.group().lower()
            components.add(component)
            logging.info("Abstract component detected: %s", component)
            # Drop heading tokens only; keywords inside prose must survive.
            sentence = _HEADER_REGEX.sub('', sentence).strip()
        elif in_section and _END_REGEX.search(sentence):
            break

        if not in_section:
            continue

        n_words = len(sentence.split())
        if word_count + n_words > max_words:
            # Adding this sentence would exceed the budget; stop here.
            break
        if n_words:
            # Heading-only sentences become empty and are not worth keeping.
            collected.append(sentence)
            word_count += n_words

    # No section found (or nothing fit): fall back to the leading sentences.
    if not collected:
        for sentence in sentences[:5]:
            n_words = len(sentence.split())
            if word_count + n_words > max_words:
                break
            collected.append(sentence)
            word_count += n_words
        if collected:
            logging.warning(
                "No clear abstract section found. "
                "Using first few sentences as fallback."
            )

    if word_count < min_words:
        logging.warning("Extracted text too short: %d words", word_count)
        return None

    logging.info(
        "Extracted abstract with %d words. Components: %s",
        word_count,
        ', '.join(sorted(components)),
    )

    return ' '.join(collected) if collected else None