In [1]:
import nltk
import re
from nltk.tokenize import sent_tokenize

# Ensure you have downloaded the punkt tokenizer from nltk
nltk.download('punkt')

def parse_text_to_sentences(text):
    """
    Parses a chunk of text into sentences, preparing it for translation.
    This function handles common issues such as extra whitespace, bullet points, and page headers/footers.

    Args:
        text (str): The input text to be parsed.

    Returns:
        list: A list of cleaned sentences ready for translation.
    """
    # Remove any page headers/footers that may be present (commonly seen in UN PDFs)
    text = re.sub(r'\n\s*Page \d+\s*\n', ' ', text)

    # Remove bullet points or numbering that may interfere with sentence splitting
    text = re.sub(r'\n\s*[\u2022\-\*\d]+\s+', ' ', text)

    # Replace multiple newlines or extra spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Use NLTK's sentence tokenizer to split text into sentences
    sentences = sent_tokenize(text)

    # Clean and strip each sentence
    cleaned_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

    return cleaned_sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


• Introduction to climate action.
The UN is committed to taking steps towards sustainability.
Climate change affects us all.
Page 2 Goals for 2030 include reducing emissions.
Everyone must do their part.


In [2]:
# Example usage
if __name__ == "__main__":
    example_text = """
    Page 1
    • Introduction to climate action.
    The UN is committed to taking steps towards sustainability.
    Climate change affects us all. Page 2
    • Goals for 2030 include reducing emissions.
    Everyone must do their part.
    """
    parsed_sentences = parse_text_to_sentences(example_text)
    for sentence in parsed_sentences:
        print(sentence)


• Introduction to climate action.
The UN is committed to taking steps towards sustainability.
Climate change affects us all.
Page 2 Goals for 2030 include reducing emissions.
Everyone must do their part.
