In [122]:
# Install dependencies

!pip install requests beautifulsoup4 sumy rake-nltk
import nltk

# These are the 3 essentials for your News Summarizer project:
nltk.download('punkt')      # For sentence splitting (Sumy)
nltk.download('punkt_tab')  # For newer NLTK compatibility (Sumy)
nltk.download('stopwords')  # For identifying key phrases (RAKE)

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [123]:
# Import packages

import requests
from bs4 import BeautifulSoup
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from rake_nltk import Rake

In [124]:
#  Data Extraction function 

def get_only_text(url):
    """Fetches the title and paragraph text from a URL."""
    try:
        # User-Agent header helps avoid being blocked by servers
        headers = {'User-Agent': 'Mozilla/5.0'}
        page = requests.get(url, headers=headers, timeout=10)
        
        if page.status_code != 200:
            return "Error", f"Could not access page. Status code: {page.status_code}"

        soup = BeautifulSoup(page.content, "lxml")

        # Extract Title safely
        title = ' '.join(soup.title.stripped_strings) if soup.title else "No Title Found"

        # Extract all paragraph text
        paragraphs = soup.find_all('p')
        text = ' '.join([p.get_text() for p in paragraphs])

        return title, text

    except Exception as e:
        return "Error", str(e)

In [125]:
# Scraping the Article

url = "https://en.wikinews.org/wiki/Global_markets_plunge"
title, article_text = get_only_text(url)

print(f"Title: {title}")
print(f"Word Count: {len(article_text.split())}")

Title: Global markets plunge - Wikinews, the free news source
Word Count: 1209


In [126]:
# Summarization using Sumy(LexRank)

def summarize_content(text, sentences_count=5):
    # Initialize the parser and tokenizer
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    
    # Initialize the LexRank summarizer
    summarizer = LexRankSummarizer()
    
    # Generate the summary
    summary = summarizer(parser.document, sentences_count)
    
    # Combine sentences into a single string
    return " ".join([str(sentence) for sentence in summary])

summary_result = summarize_content(article_text, sentences_count=3)
print("--- ARTICLE SUMMARY ---")
print(summary_result)

--- ARTICLE SUMMARY ---
The reality is that most investors have been spooked by the sheer pressure that the credit crunch is putting on the global economy.” The Japanese Nikkei 225 has recorded it's third biggest drop in history with a massive sell-off in the exchange that has resulted in USD 250 billion being knocked of the index's value. "Here's what the American people need to know: that the United States government is acting; we will continue to act to resolve this crisis and restore stability to our markets. “I think we quickly realised that we cannot solve the problems we have got as a result of the sub-prime market collapse simply by improving liquidity," he said speaking in Birmingham to business leaders earlier today.


In [127]:
# Keyword Extraction using Rake

def extract_keywords(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    # Get top 10 ranked keyword phrases
    return r.get_ranked_phrases()[:10]

keywords = extract_keywords(article_text)
print("--- TOP KEYWORDS ---")
for kw in keywords:
    print(f"- {kw}")

--- TOP KEYWORDS ---
- barclays wealth analyst henk potts commented
- oil related companies saw large drops
- dow jones industrial average falling
- dow jones industrial average fell
- ftse 100 index fell dramatically
- template {{ editprotected }}
- template {{ editprotected }}
- creative commons attribution 2
- business leaders earlier today
- yamato life insurance company
