<a href="https://colab.research.google.com/github/srilamaiti/ml_works/blob/main/news_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the current runtime, one package per command.
# NOTE(review): newspaper3k, langchain, langchain-openai, langchain[docarray], lxml and
# xxhash are installed without a version pin, unlike the other packages in this cell.
# NOTE(review): nltk and textblob are imported by a later cell but are not installed
# here — presumably they are preinstalled in the Colab image; verify.
!pip install newspaper3k
!pip install annotated-types==0.6.0
!pip install anyio==4.3.0
!pip install beautifulsoup4==4.12.3
!pip install certifi==2024.2.2
!pip install charset-normalizer==3.3.2
!pip install colorama==0.4.6
!pip install distro==1.9.0
!pip install feedparser==6.0.11
!pip install h11==0.14.0
!pip install httpcore==1.0.5
!pip install httpx==0.27.0
!pip install idna==3.7
!pip install openai==1.30.1
!pip install pydantic==2.7.1
!pip install pydantic_core==2.18.2
!pip install python-dotenv==1.0.1
!pip install requests==2.31.0
!pip install sgmllib3k==1.0.0
!pip install sniffio==1.3.1
!pip install soupsieve==2.5
!pip install tqdm==4.66.4
!pip install typing_extensions==4.11.0
!pip install urllib3==2.2.1
!pip install langchain
!pip install langchain-openai
!pip install langchain[docarray]
!pip install lxml
!pip install xxhash

Collecting xxhash
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xxhash
Successfully installed xxhash-3.4.1


In [None]:
import nltk
from textblob import TextBlob
import openai
import newspaper
from newspaper import Article
from newspaper import fulltext
import feedparser
import requests
import os
import concurrent.futures
from configparser import ConfigParser
import re
import shelve
import xxhash
import threading
from queue import Queue
from datetime import datetime, timedelta
import feedparser
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
import json
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
os.environ['GDRIVE_CONFIG_DIR'] = "/content/drive/MyDrive"

Mounted at /content/drive


In [None]:
# Download required NLTK data.
# 'punkt' is the NLTK tokenizer package; presumably it is what the article.nlp() and
# TextBlob calls in later cells rely on — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Start from an empty .env (remove any existing one, then recreate it) and append the
# contents of the key file stored on the mounted Google Drive.
# Assumes that file holds a line of the form OPENAI_API_KEY=<key>, since that is the
# variable name read back with os.getenv in a later cell — TODO confirm.
!rm -rf .env
!touch .env
!cat /content/drive/MyDrive/openai_api_key.txt >> .env

In [None]:
# Stop early when the .env file that should carry OPENAI_API_KEY is missing.
if not os.path.exists(".env"):
    print("Error: .env file not found. Please create a .env file and set the OPENAI_API_KEY environment variable.")
    # Raise SystemExit directly rather than calling exit(): `exit` is an interactive
    # helper injected by the `site` module (and overridden by IPython inside notebooks),
    # so it is not guaranteed to behave like a plain "stop with status 1" here.
    raise SystemExit(1)

In [None]:
# --- Feed sources -----------------------------------------------------------
aws_url = 'https://aws.amazon.com/about-aws/whats-new/recent/feed/'
azure_url = 'https://azurecomcdn.azureedge.net/en-us/updates/feed/'
financial_feed_url = 'https://www.ft.com/news-feed'

# --- Filtering --------------------------------------------------------------
# Only entries published within this many days are summarised.
num_days = 7

# --- OpenAI chat-completion settings (defaults of summarize_text) ------------
model = 'gpt-3.5-turbo'
temperature = 0.5
max_tokens = 64
top_p = 1

In [None]:
def fetch_rss_articles(urls):
    """Fetch the entries of every feed in ``urls`` and normalise them to plain dicts.

    :param urls: iterable of RSS/Atom feed URLs
    :return: list of dicts with the keys ``id`` (running index across all feeds),
             ``title``, ``link``, ``published``, ``updated``, ``content`` (passed through
             ``clean_html_content``) and ``image`` (result of ``find_the_first_image`` on
             the raw content)

    A feed flagged as ``bozo`` (not well-formed) is skipped only when no entries could be
    recovered from it; otherwise a warning is printed and the recovered entries are used.
    Missing ``title``/``link`` fields become empty strings instead of raising.
    """
    articles = []
    img_count = 0
    for url in urls:
        print(f"Fetching articles from {url}...")
        feed = feedparser.parse(url)
        if feed.bozo:
            if not feed.entries:
                print(f"Error fetching articles from {url}: {feed.bozo_exception}")
                continue
            # feedparser still returns whatever it managed to parse from a malformed
            # feed, so keep those entries rather than discarding the whole source.
            print(f"Warning: {url} is not well-formed ({feed.bozo_exception}); using the entries that were parsed.")

        for entry in feed.entries:
            # If 'content' is a list, merge all of its parts into a single string;
            # otherwise fall back to the entry description (empty when absent).
            content_items = entry.get("content")
            if isinstance(content_items, list):
                raw_content = "".join((item.get("value", "") or "") for item in content_items)
            else:
                raw_content = entry.get("description", "") or ""

            # The image must be looked up in the raw markup, before it is cleaned.
            image = find_the_first_image(raw_content)
            if image:
                img_count += 1

            articles.append({
                "id": len(articles),
                "title": entry.get("title", ""),
                "link": entry.get("link", ""),
                "published": entry.get("published", ""),
                "updated": entry.get("updated", ""),
                "content": clean_html_content(raw_content),
                "image": image,
            })
    print(f"Fetched {len(articles)} articles, {img_count} with images.")
    return articles

def summarize_text(text, model = model, temperature = temperature, max_tokens = max_tokens, top_p = top_p):
    """
    Summarize ``text`` in one sentence using the OpenAI chat completions API.

    :param text: the text which will be summarized
    :param model: chat model name (defaults to the notebook-level ``model``)
    :param temperature: sampling temperature forwarded to the API
    :param max_tokens: upper bound on the length of the generated summary
    :param top_p: nucleus-sampling parameter forwarded to the API
    :return: summarized text; an empty string when there is nothing to summarize
             or when the API returns no text content
    """
    # Nothing to summarize: avoid a pointless (and billable) API round-trip.
    if text is None or not str(text).strip():
        return ""

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Summarize content you are provided with for a second-grade student."
            },
            {
                "role": "user",
                "content": f"Summarize the following text in one sentence:\n\n{text}"
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p
    )
    # The message content is optional in the SDK's response model; normalise a missing
    # value to "" so callers can always treat the result as a string.
    return response.choices[0].message.content or ""

def process_entry(entry, one_week_ago):
    """
    Summarize a single feed entry if it is recent enough.

    :param entry: feed entry exposing ``title``, ``link``, ``summary`` and either
                  ``published_parsed`` (UTC time tuple) or ``published`` (RFC 822 string)
    :param one_week_ago: cutoff datetime; entries published before it are dropped
    :return: dict with ``title``, ``link``, ``published`` (``YYYY-MM-DD``) and
             ``summary``, or ``None`` when the entry is too old or has no usable date
    """
    # Function-scope imports keep this block self-contained (stdlib only).
    from datetime import timezone
    from email.utils import parsedate_to_datetime

    published_date = None
    parsed_struct = getattr(entry, 'published_parsed', None)
    if parsed_struct:
        # Prefer the already-parsed UTC time tuple when the entry provides one.
        published_date = datetime(*parsed_struct[:6], tzinfo=timezone.utc)
    else:
        raw_published = getattr(entry, 'published', None)
        if raw_published:
            try:
                # Accepts both "... GMT" and numeric offsets such as "+0000",
                # which a strptime pattern using %Z does not.
                published_date = parsedate_to_datetime(raw_published)
            except (TypeError, ValueError):
                published_date = None

    if published_date is None:
        print(f"Skipping entry without a parseable publication date: {getattr(entry, 'link', '')}")
        return None

    # Aware and naive datetimes cannot be compared; bring both to the same kind.
    cutoff = one_week_ago
    if published_date.tzinfo is None and cutoff.tzinfo is not None:
        published_date = published_date.replace(tzinfo=timezone.utc)
    elif published_date.tzinfo is not None and cutoff.tzinfo is None:
        # A naive cutoff (e.g. from datetime.now()) is local wall-clock time.
        cutoff = cutoff.astimezone()

    if published_date >= cutoff:
        summary = summarize_text(entry.summary)
        return {
            'title': entry.title,
            'link': entry.link,
            'published': published_date.strftime('%Y-%m-%d'),
            'summary': summary
        }
    return None


def fetch_parsed_feed(filtered_entries, aws_url = financial_feed_url, azure_url = azure_url, num_days = num_days):
    """
    Fetch one RSS feed and summarize its recent entries concurrently.

    :param filtered_entries: list that receives the processed entries; it is extended
                             in place and also returned
    :param aws_url: URL of the feed to fetch (note: the default is ``financial_feed_url``)
    :param azure_url: accepted for interface compatibility; not used by this function
    :param num_days: only entries published within this many days are kept
    :return: ``filtered_entries`` with one dict per recent entry appended
    :raises SystemExit: when the feed could not be parsed into any entries
    """
    rss_url = aws_url
    one_week_ago = datetime.now() - timedelta(days=num_days)

    feed = feedparser.parse(rss_url)

    if feed.bozo:
        if not feed.entries:
            print("Failed to parse the RSS feed.")
            # Raise SystemExit directly: exit() is an interactive helper that is not
            # reliable inside functions run from a notebook kernel.
            raise SystemExit(1)
        # A malformed feed can still yield usable entries; keep going with those.
        print(f"Warning: the RSS feed is not well-formed ({feed.bozo_exception}); using the entries that were parsed.")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Entries without a 'summary' have nothing to send to the summarizer.
        futures = [
            executor.submit(process_entry, entry, one_week_ago)
            for entry in feed.entries
            if 'summary' in entry
        ]

        # Results are collected in completion order, not feed order. An exception raised
        # while processing an entry is re-raised here by future.result().
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:
                filtered_entries.append(result)

    return filtered_entries

def create_report(entries, output_path="newsBoard.html"):
    """
    Create the HTML report: one collapsible panel per news entry.

    :param entries: iterable of dicts with the keys ``title``, ``link`` and ``summary``
    :param output_path: file the report is written to (defaults to ``newsBoard.html``
                        in the current working directory)
    :return: nothing, creates the actual html report file

    Titles, links and summaries come from external feeds and from model output, so they
    are HTML-escaped before being placed into the page.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from html import escape

    parts = ["""
<html>
<head>
    <meta charset="utf-8">
    <title>News</title>
    <style>
        body {
            font-family: Arial, sans-serif;
        }
        .panel {
            background-color: #ffffff;
            border: 1px solid #ccc;
            border-radius: 5px;
            margin: 10px 0;
            padding: 10px;
            cursor: pointer;
        }
        .panel-title {
            font-size: 18px;
            font-weight: bold;
        }
        .panel-content {
            display: none;
            margin-top: 10px;
        }
    </style>
    <script>
        function togglePanelContent(panel) {
            var content = panel.querySelector('.panel-content');
            if (content.style.display === 'none' || content.style.display === '') {
                content.style.display = 'block';
            } else {
                content.style.display = 'none';
            }
        }
    </script>
</head>
<body>
    <h1>AWS News</h1>
"""]
    for entry in entries:
        # escape() also encodes quotes, so the link is safe inside the href attribute.
        title = escape(str(entry['title']))
        link = escape(str(entry['link']))
        summary = escape(str(entry['summary']))
        parts.append(f"""
        <div class="panel" onclick="togglePanelContent(this)">
            <div class="panel-title">{title}</div>
            <div class="panel-content">
                <p><strong>Link:</strong> <a href="{link}">{link}</a></p>
                <p><strong>AI Summary:</strong> {summary}</p>
            </div>
        </div>
    """)
    parts.append("""
    </body>
    </html>
    """)

    # The file is written as UTF-8, matching the charset declared in <head>.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write("".join(parts))
    print("The news entries have been saved.")

In [None]:
# Parse the AWS feed and display its first entry to inspect the available fields.
feed = feedparser.parse(aws_url)
feed.entries[0]

{'id': 'c31649696350948f772b5efde84c415cf65a083c',
 'guidislink': False,
 'title': 'Amazon OpenSearch Service now supports OpenSearch version 2.13',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://aws.amazon.com/about-aws/whats-new/recent/feed/',
  'value': 'Amazon OpenSearch Service now supports OpenSearch version 2.13'},
 'summary': 'You can now run OpenSearch version 2.13 in Amazon OpenSearch Service. With OpenSearch 2.13, we have made several improvements to search performance and resiliency, OpenSearch Dashboards, and added new features to help you build AI-powered applications. We have introduced concurrent segment search that allows users to query index segments in parallel at the shard level. This offers improved latency for long-running requests that contain aggregations or large ranges. You can now index quantized vectors with FAISS-engine-based k-NN indexes, with potential to reduce memory footprint by as much as 50 percent with minimal impact 

In [None]:
# Make sure the variables from .env (including the OpenAI key) are in the process
# environment before the first OpenAI request; without this, a fresh top-to-bottom run
# reaches the summarization step before any cell has loaded the .env file.
load_dotenv()

# Fetch the feed, summarize the recent entries and write the HTML report.
filtered_entries = []
fetch_parsed_feed(filtered_entries)
create_report(filtered_entries)

The news entries have been saved.


In [None]:
# Build a newspaper source object for cnn.com and print the URL of every article
# it exposes through `.articles`.
# Note: the loop variable `article` is a notebook-level name and is reassigned by a
# later cell that builds a single Article from a fixed URL.
cnn_paper = newspaper.build('http://cnn.com')
for article in cnn_paper.articles:
    print(article.url)

CRITICAL:newspaper.network:[REQUEST FAILED] 404 Client Error: Not Found for url: http://www.cnn.com/feeds
CRITICAL:newspaper.network:[REQUEST FAILED] 404 Client Error: Not Found for url: http://www.cnn.com/feed


http://cnn.com/2024/05/21/politics/takeaways-from-donald-trumps-defense-in-the-hush-money-trial/index.html
http://cnn.com/2024/05/21/middleeast/european-hospital-gaza-medics-freed-intl-latam/index.html
http://cnn.com/2024/05/17/opinions/open-ai-chatgpt-4o-yang/index.html
http://cnn.com/2024/05/21/politics/rudy-giuliani-arizona-election-subversion/index.html
http://cnn.com/2024/05/21/politics/us-assesses-russia-launched-counter-space-weapon/index.html
http://cnn.com/2024/05/21/us/grizzly-bear-attack-wyoming/index.html
http://cnn.com/2024/05/21/us/deaths-falls-hiking-climbing-parks/index.html
http://cnn.com/2024/05/21/cars/vinfast-regulators-investigating-fatal-crash-ev/index.html
http://cnn.com/2024/05/21/uk/us-embassy-london-congestion-fees-scli-intl/index.html
http://cnn.com/2024/05/06/climate/video/refreeze-arctic-sea-ice-technology-ldn-digvid
http://cnn.com/2024/05/21/world/video/raisi-funeral-procession-tabriz-tehran-iran-ldn-digvid
http://cnn.com/2024/05/21/style/huia-feather-sold

In [None]:
# Load the variables defined in .env into the process environment, then read the key.
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible cells, and
# summarize_text() does not pass a key explicitly — presumably the OpenAI client picks
# it up from the environment; confirm.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# Print each category URL reported by the CNN source built earlier (cnn_paper).
for category in cnn_paper.category_urls():
    print(category)

http://cnn.com
https://edition.cnn.com
https://cnnespanol.cnn.com
http://cnn.com/follow
https://arabic.cnn.com
https://us.cnn.com
https://money.cnn.com
https://www.cnn.com


In [None]:
# Take the first article of the CNN source and run it through download -> parse -> nlp.
# Raises IndexError if the source exposed no articles.
# NOTE(review): cnn_article is not used again in the visible cells.
cnn_article = cnn_paper.articles[0]
cnn_article.download()
cnn_article.parse()
cnn_article.nlp()

In [None]:
# Fixed CNN article used for the summary / sentiment demo in the next cell.
url = "https://www.cnn.com/2024/05/21/americas/us-lawmakers-turks-caicos-detained-americans/index.html"
# Initialize an Article object with the specified URL
# (this rebinds `article`, which an earlier cell used as a loop variable)
article = Article(url)

# Download and parse the article's HTML
article.download()
article.parse()

# Use natural language processing to extract useful information from the article;
# the next cell reads article.summary after this call.
article.nlp()

In [None]:
# Print the article's title, author(s), publication date and summary
print(f'Title: {article.title}')
print('\n')
print(f'Authors: {article.authors}')
print('\n')
print(f'Publication Date: {article.publish_date}')
print('\n')
print(f'Summary: {article.summary}')

# Analyze the sentiment of the article using TextBlob
analysis = TextBlob(article.text)
print('\n')

# Print the polarity (i.e. sentiment score) of the article
print(f"Polarity : {analysis.polarity}")

# Print whether the sentiment of the article is positive, negative, or neutral
print(f'Sentiment: {"Positive" if analysis.polarity > 0 else "Negative" if analysis.polarity < 0 else "Neutral"}')

Title: Bipartisan group of lawmakers travel to Turks and Caicos to push for release of detained Americans


Authors: ['Lauren Mascarenhas']


Publication Date: 2024-05-21 00:00:00


Summary: CNN —A bipartisan group of US lawmakers traveled to Turks and Caicos to press for the release of five Americans detained on ammunition possession charges – but encountered resistance from officials on the island, the group announced Monday.
The lawmakers met with Turks and Caicos officials, including the governor, attorney general, minister of tourism and police leaders, the statement from Mullin’s office said.
“Unfortunately, despite our willingness to work with Turks and Caicos officials to get our constituents home, we were not able to find a path forward today,” Mullin said in the statement.
Bringing firearms or ammunition, including stray bullets, into Turks and Caicos without prior permission from police is “strictly forbidden,” according to a statement from its government.
Judges can lower t