In [1]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.




In [20]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string

# Download necessary resources from nltk
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Web Scraping
def webscrap(url):
    try:
        # Send an HTTP request to the URL
        a = requests.get(url)
        a.raise_for_status()
        
        # Parse the HTML content of the page using BeautifulSoup
        b = BeautifulSoup(a.text, 'html.parser')
        
        return b
    except Exception as e:
        print(f"Error while scraping: {e}")
        return None

# Step 2: Extract Headings and Sections
def extract(b):
    headings_and_sections = []

    # Find all headings (e.g., h1, h2, h3, etc.)
    headings = b.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    for heading in headings:
        section_title = heading.get_text(strip=True)
        section_content = []

        # Extract text content under the heading until the next heading
        sibling = heading.find_next()  # Move to the next element, not just sibling
        while sibling and sibling.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if sibling.name == 'p':
                section_content.append(sibling.get_text(strip=True))
            sibling = sibling.find_next()

        headings_and_sections.append({
            'title': section_title,
            'content': ' '.join(section_content).strip(),  # Clean and join content
        })

    return headings_and_sections

# Define nltk_summarize before using it
def nltk_summarize(text):
    if not text.strip():  # Check if the text is empty
        return "No content to summarize."

    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    
    # Create a frequency table for words
    freq_table = {}
    for word in words:
        word = word.lower()
        if word not in stop_words and word not in string.punctuation:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

    # Tokenize sentences
    sentences = sent_tokenize(text)
    if len(sentences) == 0:  # No sentences found
        return "No sentences to summarize."

    sentence_value = {}

    # Assign a score to each sentence based on word frequencies
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in freq_table:
                if sentence in sentence_value:
                    sentence_value[sentence] += freq_table[word]
                else:
                    sentence_value[sentence] = freq_table[word]

    # If no sentence is scored, return empty
    if len(sentence_value) == 0:
        return "No significant sentences to summarize."

    # Calculate the average score
    average_score = sum(sentence_value.values()) / len(sentence_value)

    # Return sentences that have a score higher than the average
    summary = ' '.join([sentence for sentence in sentences if sentence_value.get(sentence, 0) > average_score])

    return summary if summary else "No significant content to summarize."

# Step 3: Contextual Understanding
def contextual_understanding(section):
    """Identify the major theme or context of the section."""
    section_content = section['content']

    if not section_content:
        return "No content to summarize."

    # Use the nltk_summarize function from earlier to summarize the section content
    return nltk_summarize(section_content)

# Step 4: Key Points Extraction
def extract_key_points(section_content):
    """Extract key sentences that are most important."""
    if not section_content.strip():
        return "No content to summarize."

    stop_words = set(stopwords.words("english"))
    words = word_tokenize(section_content)
    
    # Create a frequency table for words
    freq_table = {}
    for word in words:
        word = word.lower()
        if word not in stop_words and word not in string.punctuation:
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

    # Tokenize sentences
    sentences = sent_tokenize(section_content)
    sentence_value = {}

    # Assign a score to each sentence based on word frequencies
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in freq_table:
                if sentence in sentence_value:
                    sentence_value[sentence] += freq_table[word]
                else:
                    sentence_value[sentence] = freq_table[word]

    # Return the highest-ranked sentences
    average_score = sum(sentence_value.values()) / len(sentence_value)
    key_points = [sentence for sentence in sentences if sentence_value.get(sentence, 0) > average_score]

    return key_points

# Step 5: Coherence Rewriting
def rewrite_for_coherence(key_points):
    """Rewrite the extracted key points to form a coherent summary."""
    if not key_points:
        return "No significant content to summarize."
    
    # Join key points together in a way that reads smoothly
    coherent_summary = ' '.join(key_points)
    
    return coherent_summary

# Step 6: Summarization Pipeline
def summary_nltk(headings_and_sections):
    summarized_sections = []

    for section in headings_and_sections:
        section_title = section['title']
        section_content = section['content']

        if not section_content:
            section_summary = "No content to summarize."
        else:
            # Step 1: Understand the context of the section
            theme = contextual_understanding(section)
            
            # Step 2: Extract key points from the section
            key_points = extract_key_points(section_content)
            
            # Step 3: Rewrite key points into a coherent summary
            section_summary = rewrite_for_coherence(key_points)

        summarized_sections.append({
            'title': section_title,
            'summary': section_summary,
        })

    return summarized_sections

# Step 7: Main Pipeline
def main(url):
    # Step 1: Web Scraping
    soup = webscrap(url)

    if soup is not None:
        # Step 2: Extract Headings and Sections
        headings_and_sections = extract(soup)

        if headings_and_sections:
            # Step 3: Summarization using NLTK with coherence improvement
            summarized_sections = summary_nltk(headings_and_sections)

            # Step 4: Save the summaries to a file
            with open('scraped_and_summarized.txt', 'w', encoding='utf-8') as file:
                for section in summarized_sections:
                    file.write(f"Section Title: {section['title']}\n")
                    file.write(f"Section Summary: {section['summary']}\n\n")
                print("Summarized content saved to scraped_and_summarized.txt")

# Provide the URL of the web page to be scraped and summarized
web_page_url = 'https://en.wikipedia.org/wiki/Alexander_the_Great'
main(web_page_url)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Summarized content saved to scraped_and_summarized.txt


In [23]:
with open("scraped_and_summarized.txt", "r", encoding="utf-8") as file:
    content = file.read()
print(content)    


Section Title: Contents
Section Summary: No content to summarize.

Section Title: Alexander the Great
Section Summary: Alexander III of Macedon(Ancient Greek:Ἀλέξανδρος,romanized:Alexandros; 20/21 July 356 BC – 10/11 June 323 BC), most commonly known asAlexander the Great,[c]was a king of theancient Greekkingdom ofMacedon. [d]He succeeded his fatherPhilip IIto the throne in 336 BC at the age of 20 and spent most of his ruling years conducting a lengthymilitary campaignthroughoutWestern Asia,Central Asia, parts ofSouth Asia, andEgypt. In 335 BC, shortly after his assumption of kingship over Macedon, hecampaigned in the Balkansand reasserted control overThraceand parts ofIllyriabefore marching on the city ofThebes, which wassubsequently destroyed in battle. Following his conquest ofAsia Minor, Alexander broke the power of Achaemenid Persia in a series of decisive battles, including those atIssusandGaugamela; he subsequently overthrewDarius IIIand conquered the Achaemenid Empire in its en

In [6]:
import requests
from bs4 import BeautifulSoup
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to get summaries (similar to the previous examples)
def get_wikipedia_original_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    original_text = []
    headings = soup.find_all(['h1', 'h2', 'h3'])
    
    for heading in headings:
        section_title = heading.get_text(strip=True)
        section_content = []
        sibling = heading.find_next_sibling()
        
        while sibling and sibling.name not in ['h1', 'h2', 'h3']:
            if sibling.name in ['p', 'li', 'ul', 'ol']:
                section_content.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling()
        
        content = ' '.join(section_content)
        original_text.append(f"{section_title}: {content}")

    return original_text

# Get the summary content
url = 'https://en.wikipedia.org/wiki/Alexander_the_Great'
original_text = get_wikipedia_original_text(url)

# Function to evaluate summaries
def evaluate_summary(summary):
    scores = []
    for section in summary:
        # Readability metrics
        readability_score = textstat.flesch_reading_ease(section)
        scores.append((section, readability_score))
    
    # Print scores for each section
    for section, score in scores:
        print(f"Section: {section[:50]}... | Readability Score: {score}")

# Evaluate the original text
evaluate_summary(original_text)


Section: Contents: ... | Readability Score: 36.62
Section: Alexander the Great: ... | Readability Score: 93.81
Section: Early life: ... | Readability Score: 77.91
Section: Lineage and childhood: ... | Readability Score: 59.97
Section: Education: ... | Readability Score: -132.59
Section: Heir of Philip II: ... | Readability Score: 118.18
Section: Regency and ascent of Macedon: ... | Readability Score: 49.48
Section: Exile and return: ... | Readability Score: 59.97
Section: King of Macedon: ... | Readability Score: 93.81
Section: Accession: ... | Readability Score: -47.99
Section: Consolidation of power: ... | Readability Score: -24.64
Section: Balkan campaign: ... | Readability Score: 77.91
Section: Destruction of Thebes: ... | Readability Score: 59.97
Section: Conquest of the Achaemenid Persian Empire: ... | Readability Score: 73.85
Section: Asia Minor: ... | Readability Score: 77.91
Section: The Levant and Syria: ... | Readability Score: 75.88
Section: Egypt: ... | Readability Score: 

In [27]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the pre-trained GPT-2 model and tokenizer from Hugging Face
model_name = "gpt2"  # You can use "gpt2-medium" or "gpt2-large" for larger models
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the content with sections and headings
content = """
Section: Early Life
Text: Alexander III was born in Pella, the capital of the Kingdom of Macedon, on the sixth day of
the ancient Greek month of Hekatombaion...
Section: Conquests
Text: Alexander's conquests extended his empire across three continents...
"""

# Initialize the conversation
conversation = []

# Split the content into sections
sections = content.strip().split("\n\n")
logging.info(f"Content split into {len(sections)} sections.")

# Generate text for each section using GPT-2
summaries = []
for section in sections:
    heading, text = section.strip().split("\n", 1)

    # Prepare the input text for GPT-2 (we will use a prompt)
    prompt = f"Summarize the following section: {text}"
    
    # Tokenize the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=1024, truncation=True)

    # Generate text with GPT-2
    summary_ids = model.generate(inputs, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, num_beams=5, early_stopping=True)

    # Decode the generated text and append it to the summaries
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    # Save the summary with the section heading
    summaries.append(f"Section: {heading}\nSummary: {summary}")

# Save the summaries to a file
with open("summarized_document.txt", "w", encoding="utf-8") as summarized_file:
    for summary in summaries:
        summarized_file.write(summary + "\n")
    logging.info("Summaries saved to summarized_document.txt.")


2024-10-04 11:32:05,119 - INFO - Content split into 1 sections.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2024-10-04 11:32:58,110 - INFO - Summaries saved to summarized_document.txt.


In [25]:
# Open the summarized document and print its contents
with open("summarized_document.txt", "r", encoding="utf-8") as summarized_file:
    content = summarized_file.read()

# Print the file contents
print(content)


Section Title: Contents
Section Summary: No content to summarize.

Section Title: Alexander the Great
Section Summary: Alexander III of Macedon(Ancient Greek:Ἀλέξανδρος,romanized:Alexandros; 20/21 July 356 BC – 10/11 June 323 BC), most commonly known asAlexander the Great,[c]was a king of theancient Greekkingdom ofMacedon. [d]He succeeded his fatherPhilip IIto the throne in 336 BC at the age of 20 and spent most of his ruling years conducting a lengthymilitary campaignthroughoutWestern Asia,Central Asia, parts ofSouth Asia, andEgypt. In 335 BC, shortly after his assumption of kingship over Macedon, hecampaigned in the Balkansand reasserted control overThraceand parts ofIllyriabefore marching on the city ofThebes, which wassubsequently destroyed in battle. Following his conquest ofAsia Minor, Alexander broke the power of Achaemenid Persia in a series of decisive battles, including those atIssusandGaugamela; he subsequently overthrewDarius IIIand conquered the Achaemenid Empire in its en