In [1]:
pip install feedparser newspaper3k transformers torch requests


Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspap

In [2]:
"""
smart_news_curator.py
----------------------------------
An automation tool that:
- Fetches trending news across major sources
- Summarizes and simplifies content
- Formats ready-to-post social updates
"""

import feedparser
from newspaper import Article
from transformers import pipeline
from datetime import datetime
from collections import Counter
import re
import hashlib
import sqlite3
import time

# -------------------------------
# CONFIGURATION
# -------------------------------
RSS_FEEDS = [
    "https://news.google.com/rss?hl=en-IN&gl=IN&ceid=IN:en",
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"
]

DB_PATH = "news_summaries.db"
SUMMARY_MODEL = "facebook/bart-large-cnn"
MAX_ARTICLES_PER_RUN = 15

# -------------------------------
# DATABASE SETUP
# -------------------------------
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS news (
    id TEXT PRIMARY KEY,
    title TEXT,
    url TEXT,
    summary TEXT,
    source TEXT,
    published TEXT,
    created_at TEXT
)
''')
conn.commit()

# -------------------------------
# SUMMARIZER INITIALIZATION
# -------------------------------
print("⏳ Loading summarization model...")
summarizer = pipeline("summarization", model=SUMMARY_MODEL)
print("✅ Model loaded successfully!")

# -------------------------------
# UTILITIES
# -------------------------------
def normalize_title(title):
    """Normalize title for comparison."""
    return re.sub(r'\W+', '', title.lower())

def article_id(url):
    return hashlib.sha256(url.encode("utf-8")).hexdigest()

def already_stored(aid):
    c.execute("SELECT 1 FROM news WHERE id=?", (aid,))
    return c.fetchone() is not None

def fetch_article_text(url):
    """Download and parse article content."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text, article.title
    except Exception:
        return "", ""

def summarize_text(text):
    """Generate a simplified, clear summary."""
    if not text:
        return ""
    try:
        result = summarizer(text[:1024], max_length=80, min_length=30, do_sample=False)
        return result[0]['summary_text']
    except Exception:
        return text[:250] + "..."

def make_social_post(title, summary, url):
    """Format into a short, readable post."""
    return f"📰 {title}\n\n{summary}\n\nRead more: {url}\n#NewsUpdate #Breaking"

# -------------------------------
# MAIN LOGIC
# -------------------------------
def gather_all_entries():
    print("📡 Gathering news from RSS feeds...")
    all_entries = []
    for feed_url in RSS_FEEDS:
        try:
            feed = feedparser.parse(feed_url)
            for e in feed.entries:
                e["source"] = feed.feed.get("title", "Unknown Source")
                all_entries.append(e)
        except Exception as e:
            print("⚠️ Failed to parse feed:", feed_url, e)
    print(f"✅ Collected {len(all_entries)} raw entries.")
    return all_entries

def get_major_stories(entries):
    """Detect major stories that appear in multiple feeds."""
    counter = Counter(normalize_title(e["title"]) for e in entries)
    top_titles = [t for t, count in counter.items() if count > 1]
    return [e for e in entries if normalize_title(e["title"]) in top_titles]

def process_story(entry):
    url = entry.get("link") or entry.get("id")
    if not url:
        return

    aid = article_id(url)
    if already_stored(aid):
        return

    text, title = fetch_article_text(url)
    if not text:
        return

    summary = summarize_text(text)
    formatted_post = make_social_post(title, summary, url)

    c.execute('''
        INSERT INTO news (id, title, url, summary, source, published, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', (
        aid,
        title,
        url,
        summary,
        entry.get("source", ""),
        entry.get("published", ""),
        datetime.utcnow().isoformat()
    ))
    conn.commit()

    print("\n✅ New story added:")
    print(formatted_post)
    print("-" * 80)

def main():
    all_entries = gather_all_entries()
    major_stories = get_major_stories(all_entries)

    print(f"🔥 Found {len(major_stories)} major stories.")
    for entry in major_stories[:MAX_ARTICLES_PER_RUN]:
        process_story(entry)
        time.sleep(2)  # polite delay between requests

    print("\n🎉 Run complete! Database updated with new summaries.")

if __name__ == "__main__":
    main()


ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
Install lxml[html_clean] or lxml_html_clean directly.

In [3]:
pip install lxml[html_clean]

Collecting lxml_html_clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Downloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.3


In [4]:
"""
smart_news_curator.py
----------------------------------
An automation tool that:
- Fetches trending news across major sources
- Summarizes and simplifies content
- Formats ready-to-post social updates
"""

import feedparser
from newspaper import Article
from transformers import pipeline
from datetime import datetime
from collections import Counter
import re
import hashlib
import sqlite3
import time

# -------------------------------
# CONFIGURATION
# -------------------------------
RSS_FEEDS = [
    "https://news.google.com/rss?hl=en-IN&gl=IN&ceid=IN:en",
    "https://feeds.bbci.co.uk/news/world/rss.xml",
    "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
    "https://www.aljazeera.com/xml/rss/all.xml",
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"
]

DB_PATH = "news_summaries.db"
SUMMARY_MODEL = "facebook/bart-large-cnn"
MAX_ARTICLES_PER_RUN = 15

# -------------------------------
# DATABASE SETUP
# -------------------------------
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS news (
    id TEXT PRIMARY KEY,
    title TEXT,
    url TEXT,
    summary TEXT,
    source TEXT,
    published TEXT,
    created_at TEXT
)
''')
conn.commit()

# -------------------------------
# SUMMARIZER INITIALIZATION
# -------------------------------
print("⏳ Loading summarization model...")
summarizer = pipeline("summarization", model=SUMMARY_MODEL)
print("✅ Model loaded successfully!")

# -------------------------------
# UTILITIES
# -------------------------------
def normalize_title(title):
    """Normalize title for comparison."""
    return re.sub(r'\W+', '', title.lower())

def article_id(url):
    return hashlib.sha256(url.encode("utf-8")).hexdigest()

def already_stored(aid):
    c.execute("SELECT 1 FROM news WHERE id=?", (aid,))
    return c.fetchone() is not None

def fetch_article_text(url):
    """Download and parse article content."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text, article.title
    except Exception:
        return "", ""

def summarize_text(text):
    """Generate a simplified, clear summary."""
    if not text:
        return ""
    try:
        result = summarizer(text[:1024], max_length=80, min_length=30, do_sample=False)
        return result[0]['summary_text']
    except Exception:
        return text[:250] + "..."

def make_social_post(title, summary, url):
    """Format into a short, readable post."""
    return f"📰 {title}\n\n{summary}\n\nRead more: {url}\n#NewsUpdate #Breaking"

# -------------------------------
# MAIN LOGIC
# -------------------------------
def gather_all_entries():
    print("📡 Gathering news from RSS feeds...")
    all_entries = []
    for feed_url in RSS_FEEDS:
        try:
            feed = feedparser.parse(feed_url)
            for e in feed.entries:
                e["source"] = feed.feed.get("title", "Unknown Source")
                all_entries.append(e)
        except Exception as e:
            print("⚠️ Failed to parse feed:", feed_url, e)
    print(f"✅ Collected {len(all_entries)} raw entries.")
    return all_entries

def get_major_stories(entries):
    """Detect major stories that appear in multiple feeds."""
    counter = Counter(normalize_title(e["title"]) for e in entries)
    top_titles = [t for t, count in counter.items() if count > 1]
    return [e for e in entries if normalize_title(e["title"]) in top_titles]

def process_story(entry):
    url = entry.get("link") or entry.get("id")
    if not url:
        return

    aid = article_id(url)
    if already_stored(aid):
        return

    text, title = fetch_article_text(url)
    if not text:
        return

    summary = summarize_text(text)
    formatted_post = make_social_post(title, summary, url)

    c.execute('''
        INSERT INTO news (id, title, url, summary, source, published, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', (
        aid,
        title,
        url,
        summary,
        entry.get("source", ""),
        entry.get("published", ""),
        datetime.utcnow().isoformat()
    ))
    conn.commit()

    print("\n✅ New story added:")
    print(formatted_post)
    print("-" * 80)

def main():
    all_entries = gather_all_entries()
    major_stories = get_major_stories(all_entries)

    print(f"🔥 Found {len(major_stories)} major stories.")
    for entry in major_stories[:MAX_ARTICLES_PER_RUN]:
        process_story(entry)
        time.sleep(2)  # polite delay between requests

    print("\n🎉 Run complete! Database updated with new summaries.")

if __name__ == "__main__":
    main()



⏳ Loading summarization model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✅ Model loaded successfully!
📡 Gathering news from RSS feeds...
✅ Collected 191 raw entries.
🔥 Found 2 major stories.


  datetime.utcnow().isoformat()



✅ New story added:
📰 Netanyahu Says Israel and Hamas Are on the Brink of a Hostage Deal

Demonstrators gathered at “Hostages Square” in Tel Aviv on Saturday. Many said there was a rare sense of optimism that a deal could now be reached. But demonstrators also worried it could be torpedoed by either Israel or Hamas.

Read more: https://www.nytimes.com/live/2025/10/04/world/israel-hamas-gaza-trump/heres-the-latest
#NewsUpdate #Breaking
--------------------------------------------------------------------------------

✅ New story added:
📰 Middle East Updates: Trump Says Hamas Is Ready for Peace and Tells Israel to Stop Bombing

President Trump said he believed Hamas was “ready for a lasting PEACE’ and demanded Israel “immediately stop the bombing of Gaza” The militant group said on Friday that it was ready to release all of the Israeli hostages.

Read more: https://www.nytimes.com/live/2025/10/03/world/hamas-hostages-israel-gaza/hamas-hostage-release-gaza-trump
#NewsUpdate #Breaking
-----