In [None]:
!pip install feedparser




In [None]:
!pip install beautifulsoup4




In [None]:
import logging


In [None]:
!pip install scikit-learn pandas




In [None]:
import feedparser
import logging
from bs4 import BeautifulSoup

# RSS feed URLs to poll. fetch_and_classify_articles() iterates over this list
# in order and calls fetch_articles() once per URL.
RSS_FEEDS = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

# Category name -> keywords used by classify_article().
# Categories are scanned in insertion order and the first one with a matching
# keyword wins, so the order of this dict is significant. Keywords are compared
# against lower-cased article text, so keep them lower case.
CATEGORIES = {
    'Terrorism / protest / political unrest / riot': ['terrorism', 'protest', 'political unrest', 'riot'],
    'Positive/Uplifting': ['uplifting', 'inspiring', 'success', 'achievement'],
    'Natural Disasters': ['earthquake', 'flood', 'hurricane', 'wildfire', 'tsunami'],
    'Others': []  # Fallback category: no keywords, so it never matches by keyword
}

# Configure root logging at INFO so the per-feed progress messages are shown,
# and create the module-level logger used by the functions in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def fetch_articles(feed_url):
    """Fetch one RSS feed and return the entries that have extractable content.

    Args:
        feed_url: URL of the feed, passed straight to ``feedparser.parse``.

    Returns:
        A list of dicts with the keys ``title``, ``content``,
        ``publication_date`` and ``source_url``. Entries for which
        ``extract_content`` returns a falsy value are skipped.
    """
    logger.info(f"Fetching articles from {feed_url}")
    feed = feedparser.parse(feed_url)

    # feedparser does not raise on malformed or unreachable feeds; it sets the
    # ``bozo`` flag instead. Surface that so an empty result is explainable.
    if feed.get('bozo'):
        logger.warning(
            f"Feed {feed_url} was not parsed cleanly: {feed.get('bozo_exception')}"
        )

    articles = []

    for entry in feed.entries:
        content = extract_content(entry)
        if not content:
            continue

        # Use .get() with defaults for every field: attribute access on a
        # missing key raises, which would abort the rest of this feed in the
        # caller's try/except.
        articles.append({
            'title': entry.get('title', 'No title'),
            'content': content,
            'publication_date': entry.get('published', 'No publication date'),
            'source_url': entry.get('link', 'No source URL')
        })

    return articles

def extract_content(entry, num_paragraphs=5):
    """Return the leading paragraphs of an entry's HTML content as plain text.

    Args:
        entry: A feed entry; only its ``content`` field is consulted.
        num_paragraphs: How many ``<p>`` elements are required and returned.

    Returns:
        The text of the first ``num_paragraphs`` ``<p>`` elements joined by a
        blank line, or ``None`` when the entry has no ``content`` field or the
        HTML holds fewer than ``num_paragraphs`` paragraphs.
    """
    # Guard: entries without a 'content' field have nothing to extract.
    if 'content' not in entry:
        return None

    # The first content item carries the HTML body; an empty list yields ''.
    html = entry.content[0].value if entry.content else ''
    paragraph_tags = BeautifulSoup(html, 'html.parser').find_all('p')

    # Guard: too few paragraphs means the entry is rejected.
    if len(paragraph_tags) < num_paragraphs:
        return None

    texts = [tag.get_text() for tag in paragraph_tags[:num_paragraphs]]
    return '\n\n'.join(texts)

def classify_article(content, categories=None):
    """Classify article text by keyword and return the matching category name.

    Categories are scanned in dict order and the first category with a
    matching keyword wins. A keyword matches case-insensitively when it occurs
    at the start of a word, so ``'riot'`` matches "riots" and "rioting" but no
    longer matches inside unrelated words such as "patriot".

    Args:
        content: Article text. ``None`` or an empty string yields ``'Others'``.
        categories: Optional mapping of category name -> list of keywords.
            Defaults to the module-level ``CATEGORIES``.

    Returns:
        The name of the first matching category, or ``'Others'`` if none match.
    """
    import re  # local import: this file's import cell does not import ``re``

    if categories is None:
        categories = CATEGORIES

    if not content:
        return 'Others'

    content_lower = content.lower()

    for category, keywords in categories.items():
        for keyword in keywords:
            # A leading \b anchors the keyword to a word start; re.escape keeps
            # multi-word or punctuated keywords literal.
            if re.search(r'\b' + re.escape(keyword.lower()), content_lower):
                return category

    return 'Others'

def fetch_and_classify_articles():
    """Collect articles from every feed in RSS_FEEDS and tag each with a category.

    Returns:
        A list of article dicts as produced by ``fetch_articles``, each with an
        added ``category`` key. A feed that raises is logged and skipped;
        articles already collected before the error are kept.
    """
    results = []

    for url in RSS_FEEDS:
        try:
            for item in fetch_articles(url):
                # The category is derived from the article body text.
                item['category'] = classify_article(item['content'])
                results.append(item)
        except Exception as err:
            logger.error(f"Error processing feed {url}: {err}")

    return results

if __name__ == "__main__":
    # Run the full fetch-and-classify pipeline, then dump each article for
    # manual verification.
    articles = fetch_and_classify_articles()

    for article in articles:
        print(f"Title: {article['title']}\n")

        # Paragraphs were joined with a blank line; print them back one by one.
        for paragraph in article['content'].split('\n\n'):
            print(f"{paragraph}\n")

        # Metadata footer followed by an 80-character separator rule.
        print(
            f"Publication Date: {article['publication_date']}",
            f"Source URL: {article['source_url']}",
            f"Category: {article['category']}",
            "-" * 80,
            sep="\n",
        )


Title: Arizona begins in-person and absentee voting, here's what you need to know

Arizona began early voting Wednesday, marking yet another major swing state where voting is underway in the 2024 election.

With Arizona now in the mix, 41 states and Washington, D.C., have launched some form of early voting.

Here is everything you need to know to cast your ballot in the state.

NEXT PRESIDENTIAL ADMINISTRATION HAS TO 'GET SERIOUS' ABOUT IMMIGRATION, SAY VOTERS IN KEY BATTLEGROUND STATE

President Biden scored a crucial victory in Arizona in the last presidential election, flipping the state to the Democrats for the first time since 1996.

Publication Date: Wed, 09 Oct 2024 05:00:15 -0400
Source URL: https://www.foxnews.com/politics/arizona-begins-in-person-absentee-voting-heres-what-you-need-know
Category: Others
--------------------------------------------------------------------------------
Title: Biden-Harris admin ‘taking advice from foreign governments’ on policing speech, lawmake

In [None]:
import feedparser
import joblib
from datetime import datetime

# Load the SVM model and vectorizer used by classify_article() in this cell.
# Both paths are relative, so the files must be in the kernel's working directory.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
svm_model = joblib.load('svm_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# List of RSS feeds polled by fetch_articles() in this cell.
# NOTE(review): these are the same URLs as RSS_FEEDS in the earlier cell;
# the two lists must be kept in sync by hand.
rss_feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

def fetch_articles():
    """Fetch every feed in ``rss_feeds`` and return classified article records.

    Each entry's title is classified with ``classify_article`` and its summary
    is normalised with ``beautify_content``.

    Note: an earlier cell defines ``fetch_articles(feed_url)`` with a different
    signature. Running this cell rebinds the name, so code from that cell that
    calls it with an argument stops working until that cell is re-run.

    Returns:
        list[dict]: one dict per feed entry with the keys ``title``,
        ``content``, ``link``, ``published_date`` and ``category``.
    """
    articles = []

    for feed in rss_feeds:
        feed_data = feedparser.parse(feed)

        for entry in feed_data.entries:
            # Use .get() with defaults so one incomplete entry cannot abort the
            # whole run with an AttributeError.
            title = entry.get('title', 'No title available')
            link = entry.get('link', 'No link available')
            content = entry.summary if 'summary' in entry else 'No content available'

            # Start from the raw date string and upgrade it to a normalised
            # timestamp when a parsed time tuple is available. Previously the
            # raw string was always overwritten, so entries without a parsed
            # date lost their date entirely.
            published_date = entry.get('published', 'No date available')
            parsed_time = entry.get('published_parsed')
            if parsed_time:
                try:
                    published_date = datetime(*parsed_time[:6]).strftime('%Y-%m-%d %H:%M:%S')
                except (TypeError, ValueError):
                    # Malformed time tuple: keep the raw string instead of
                    # hiding the date.
                    pass

            # Classify the article based on the title
            category = classify_article(title)

            # Normalise the summary to a fixed number of paragraphs
            content = beautify_content(content)

            articles.append({
                'title': title,
                'content': content,
                'link': link,
                'published_date': published_date,
                'category': category
            })

    return articles

# Normalise content to exactly 3 blank-line-separated paragraphs
def beautify_content(content):
    """Return ``content`` cut or padded to exactly three paragraphs.

    Paragraphs are delimited by a blank line (``'\\n\\n'``). Only the first
    three are kept; when there are fewer, empty paragraphs are appended so the
    result always contains three paragraphs.
    """
    chunks = content.split('\n\n')[:3]
    # Pad with empty paragraphs up to three; the count is zero when already full.
    chunks.extend([''] * (3 - len(chunks)))
    return '\n\n'.join(chunks)

# Predict an article's category from its title with the loaded SVM model
def classify_article(title):
    """Return the category predicted for ``title``.

    The title is converted to features with the module-level ``vectorizer``
    and passed to the module-level ``svm_model``; the single prediction is
    returned.
    """
    features = vectorizer.transform([title])
    return svm_model.predict(features)[0]

# Example usage: fetch, classify and print every article
if __name__ == '__main__':
    articles = fetch_articles()
    for article in articles:
        # One field per line, followed by an 80-character separator rule.
        print(
            f"Title: {article['title']}",
            f"Content: {article['content']}",
            f"Link: {article['link']}",
            f"Published Date: {article['published_date']}",
            f"Category: {article['category']}",
            "-" * 80,
            sep="\n",
        )


Title: Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced
Content: No content available




Link: https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html
Published Date: 2023-04-19 12:44:51
Category: politics
--------------------------------------------------------------------------------
Title: Dominion still has pending lawsuits against election deniers such as Rudy Giuliani and Sidney Powell
Content: No content available




Link: https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/h_8d51e3ae2714edaa0dace837305d03b8
Published Date: No date available
Category: politics
--------------------------------------------------------------------------------
Title: Here are the 20 specific Fox broadcasts and tweets Dominion says were defamatory
Content: • Fox-Dominion trial delay 'is not unusual,' judge says
• Fox News' defamation battle isn't stopping Trump's election lies




L

In [None]:
!pip install mysql-connector-python


Collecting mysql-connector-python
  Downloading mysql_connector_python-9.0.0-cp310-cp310-manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading mysql_connector_python-9.0.0-cp310-cp310-manylinux_2_17_x86_64.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.0.0
