In [1]:
!pip install feedparser pymongo
!pip install rapidfuzz
!pip install selenium

import feedparser
from pymongo import MongoClient
from datetime import datetime
from rapidfuzz import fuzz
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options





In [2]:
# List of RSS feed URLs
rss_feeds = {
    "BBC News": "http://feeds.bbci.co.uk/news/rss.xml",
    #"CNN": "http://rss.cnn.com/rss/edition.rss",
    #"Reuters": "http://feeds.reuters.com/reuters/topNews",
    #"The New York Times": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    #"The Guardian": "https://www.theguardian.com/uk/rss",
    #"Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml"
}

In [3]:
# MongoDB Setup
client = MongoClient('mongodb://localhost:27017/')  # Update with your MongoDB connection URI
db = client['news_database']
collection = db['news_articles']

In [4]:
def parse_rss_feed(feed_url):
    """Fetch and parse RSS feed"""
    feed = feedparser.parse(feed_url)
    articles = []
    
    for entry in feed.entries:
        # Extract the fields you need
        article = {
            'title': entry.title,
            'link': entry.link,
            'published': entry.published if 'published' in entry else None,
            'summary': entry.summary if 'summary' in entry else None,
            'description': fetch_full_article(entry.link),  # Fetch full content from the article link
            'source': feed.feed.title,
            'fetched_at': datetime.now(),
            'tag': assign_tag(entry)  # Add the tag field here
        }
        articles.append(article)
    
    return articles

In [5]:
def assign_tag(entry):
    """Assign a specific tag to an article based on custom logic"""
    if 'breaking' in entry.title.lower():
        return 1  # Tag for breaking news
    elif 'covid' in entry.title.lower():
        return 2  # Tag for COVID-related news
    elif 'election' in entry.title.lower():
        return 3  # Tag for election news
    else:
        return 0  # Default tag for other news

In [6]:
def is_duplicate_article(new_article, existing_articles):
    """Check if the new article is a duplicate based on title similarity using rapidfuzz."""
    for article in existing_articles:
        similarity = fuzz.token_set_ratio(new_article['title'], article['title'])
        if similarity > 85:  # Threshold for considering articles as duplicates
            print(f"Duplicate title found: {article['title']}")
            return True
    return False

In [7]:
def save_articles_to_mongo(articles):
    """Save articles to MongoDB, avoiding duplicates based on title similarity across sources."""
    # Fetch all existing articles from the database for comparison
    existing_articles = list(collection.find({}, {'title': 1}))  # Only fetch titles for comparison

    for article in articles:
        # Print the article details to the console before saving
        print(f"Title: {article['title']}")
        print(f"Link: {article['link']}")
        print(f"Published: {article['published']}")
        print(f"Summary: {article['summary']}")
        print(f"Description: {article['description']}")
        print(f"Source: {article['source']}")
        print(f"Tag: {article['tag']}")
        print(f"Fetched at: {article['fetched_at']}")
        print("=" * 40)  # Separator line

        # Check for duplicate articles based on title similarity
        if not is_duplicate_article(article, existing_articles):
            collection.insert_one(article)
            print(f"Saved article: {article['title']}")
        else:
            print(f"Duplicate article skipped: {article['title']}")


In [8]:
def fetch_full_article(url):
    """Fetch the full article content by scraping the article's page"""
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})  # Adding User-Agent for better compatibility
        soup = BeautifulSoup(response.content, 'html.parser')

        # Try extracting paragraphs from common HTML structures
        paragraphs = soup.find_all('p')  # Most news sites use <p> for paragraphs
        if not paragraphs:
            # Some websites use <div> with specific classes, adjust according to your target websites
            content_div = soup.find('div', {'class': 'article-content'})
            if content_div:
                paragraphs = content_div.find_all('p')
        
        # Join all paragraph texts to form the full article content
        full_text = ' '.join([p.get_text() for p in paragraphs])

        # Return the full content if paragraphs were found, otherwise return a default message
        return full_text if full_text else "Full content not available"
    
    except Exception as e:
        print(f"Failed to fetch full content from {url}: {e}")
        return "Failed to fetch full content"



In [9]:
def fetch_full_article(url):
    """Try to fetch article content using requests and BeautifulSoup, fall back to Selenium."""
    try:
        # First attempt using requests and BeautifulSoup
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(response.content, 'html.parser')

        # Attempt to extract paragraphs directly
        paragraphs = soup.find_all('p')
        full_text = ' '.join([p.get_text() for p in paragraphs])

        # If no content found, fall back to Selenium
        if not full_text:
            full_text = fetch_full_article_with_selenium(url)

        return full_text if full_text else "Full content not available"
    
    except Exception as e:
        print(f"Failed to fetch full content from {url}: {e}")
        return "Failed to fetch full content"



In [10]:
def fetch_full_article_with_selenium(url):
    """Fetch full article content using Selenium for dynamically loaded pages."""
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        service = Service('/path/to/chromedriver')  # Update the path to your ChromeDriver
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        paragraphs = driver.find_elements(By.TAG_NAME, 'p')
        full_text = ' '.join([p.text for p in paragraphs])

        driver.quit()
        return full_text if full_text else "Full content not available"
    
    except Exception as e:
        print(f"Failed to fetch full content from {url}: {e}")
        return "Failed to fetch full content"

In [11]:
def crawl_news():
    """Crawl all news outlets and save to MongoDB"""
    for source, url in rss_feeds.items():
        print(f"Crawling articles from: {source}")
        articles = parse_rss_feed(url)
        save_articles_to_mongo(articles)

In [12]:
if __name__ == "__main__":
    crawl_news()

Crawling articles from: BBC News
Failed to fetch full content from https://www.bbc.com/news/articles/cjd58p1g515o: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cm249ym44dvo: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cwyek8pgy7po: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cy0l19j9jgko: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cly2d829dgyo: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cgey0vvxxw7o: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/ce8v4467806o: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/articles/cm2yrpxxzmpo: name 'requests' is not defined
Failed to fetch full content from https://www.bbc.com/news/arti