In [87]:
# Import necessary libraries
import logging
import os
import re

import feedparser
import nltk
from celery import Celery
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sqlalchemy import create_engine, Column, String, Integer, DateTime, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker


In [65]:
# Celery application named 'news_processing', pointed at the local AMQP broker
BROKER_URL = 'pyamqp://guest:guest@localhost//'
celery = Celery('news_processing', broker=BROKER_URL)

# Declarative base class that the ORM models below inherit from
Base = declarative_base()

  Base = declarative_base()


In [66]:
class NewsArticle(Base):
    """ORM model for one stored news article, mapped to the ``news_articles`` table.

    Rows are created by ``save_to_database``, which treats the pair
    (``title``, ``source_url``) as the duplicate-detection key.
    """
    __tablename__ = 'news_articles'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Article headline; part of the duplicate check together with source_url.
    title = Column(String)
    # Text that was fed to classify_category (the feed entry's summary/description).
    content = Column(String)
    # Publication timestamp; populated from the article's 'published' value.
    pub_date = Column(DateTime)
    # Link to the original article; part of the duplicate check together with title.
    source_url = Column(String)
    # Category label assigned by classify_category.
    category = Column(String)


In [67]:
# Initialize database engine and create tables.
# The connection URL is read from the DATABASE_URL environment variable so that
# credentials do not have to live in the notebook.
# NOTE(review): the fallback below still embeds a password for backward
# compatibility -- rotate it and remove the fallback once DATABASE_URL is set
# in every environment that runs this notebook.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql://postgres:prudhvi@localhost/Rss',
)
# echo=True makes SQLAlchemy log every SQL statement it emits.
engine = create_engine(DATABASE_URL, echo=True)
# Creates the tables registered on Base that do not exist yet.
Base.metadata.create_all(bind=engine)


2024-01-28 20:38:35,272 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-01-28 20:38:35,273 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-01-28 20:38:35,273 INFO sqlalchemy.engine.Engine select current_schema()
2024-01-28 20:38:35,275 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-01-28 20:38:35,276 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-01-28 20:38:35,276 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-01-28 20:38:35,277 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-01-28 20:38:35,279 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [68]:
# Make sure the NLTK stop-word corpus is available locally, then build the
# shared text-processing helpers.
nltk.download('stopwords')
ps = PorterStemmer()
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kprud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
# Ordered (category, pattern) pairs; the first pattern that matches wins, which
# preserves the priority of the original if/elif chain. Patterns are applied to
# lower-cased text.
# 'riot' is anchored at a word start so that words such as "patriot" or
# "Marriott" no longer count as riots, while "riots"/"rioting" still match.
# The other keywords stay plain substrings so compounds such as
# "counterterrorism" or "counterprotest" keep matching.
_CATEGORY_PATTERNS = (
    (
        'Terrorism/Protest/Political Unrest/Riot',
        re.compile(r'terrorism|protest|political\s+unrest|\briot'),
    ),
    ('Positive/Uplifting', re.compile(r'positive|uplifting')),
    ('Natural Disasters', re.compile(r'natural\s+disaster')),
)


def classify_category(content):
    """Assign a coarse news category to ``content`` using keyword matching.

    Args:
        content: Article text (str). ``None`` or an empty string is treated as
            having no keywords.

    Returns:
        One of 'Terrorism/Protest/Political Unrest/Riot', 'Positive/Uplifting',
        'Natural Disasters' or 'Others'. Categories are checked in that order
        and the first match is returned.
    """
    if not content:
        return 'Others'

    # Lower-case once instead of once per keyword.
    lowered = content.lower()
    for category, pattern in _CATEGORY_PATTERNS:
        if pattern.search(lowered):
            return category
    return 'Others'

In [70]:
def save_to_database(article, category):
    """Insert ``article`` into ``news_articles`` unless it is already stored.

    An article counts as a duplicate when a row with the same title and
    source URL already exists; duplicates are silently skipped.

    Args:
        article: Mapping with the keys 'title', 'content', 'published' and
            'link'.
        category: Category label to store alongside the article.

    Raises:
        KeyError: If ``article`` lacks one of the required keys.
        Exception: Any database error is re-raised after the transaction has
            been rolled back.
    """
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Check for duplicate articles
        existing = (
            session.query(NewsArticle)
            .filter_by(title=article['title'], source_url=article['link'])
            .first()
        )
        if existing is None:
            # Save new article to the database
            session.add(
                NewsArticle(
                    title=article['title'],
                    content=article['content'],
                    pub_date=article['published'],
                    source_url=article['link'],
                    category=category,
                )
            )
            session.commit()
    except Exception:
        # Leave the connection in a clean state before propagating the error.
        session.rollback()
        raise
    finally:
        # Always release the connection; previously one session leaked per call.
        session.close()

In [95]:
# Classify and store a single news article.
# NOTE: this function is called synchronously; it is not registered as a Celery task.
def process_news_article(article):
    """Classify ``article`` and persist it, logging (not raising) any failure.

    Args:
        article: Mapping with at least a 'content' key; it is forwarded
            unchanged to ``save_to_database``.

    Returns:
        None. Every exception is caught so that one bad article does not stop
        the caller's loop; the error is logged together with its traceback.
    """
    try:
        # Perform NLP processing to determine category
        category = classify_category(article['content'])
        print(category)
        # Store the article together with its assigned category
        save_to_database(article, category)
    except Exception as e:
        # logging.exception records the traceback, which logging.error dropped.
        logging.exception("Error processing article: %s", e)


In [96]:
# Feeds polled when parse_rss_feeds() is called without an explicit list.
DEFAULT_RSS_FEEDS = (
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml',
)


def parse_rss_feeds(rss_feeds=None):
    """Fetch each RSS feed and hand every entry to ``process_news_article``.

    Args:
        rss_feeds: Optional iterable of feed URLs. Defaults to
            ``DEFAULT_RSS_FEEDS`` when omitted or ``None``.

    Each entry is converted into a dict with the keys 'title', 'content',
    'published' and 'link'. 'published' falls back to the entry's 'updated'
    value and is ``None`` when neither exists; 'content' falls back from
    'summary' to 'description' and finally to an empty string. Entries without
    a title or link are skipped with a warning instead of aborting the run.
    """
    if rss_feeds is None:
        rss_feeds = DEFAULT_RSS_FEEDS

    for feed_url in rss_feeds:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            title = entry.get('title')
            link = entry.get('link')
            if title is None or link is None:
                # Both values are needed for storage and duplicate detection.
                logging.warning(
                    "Skipping entry without title or link in feed %s", feed_url
                )
                continue

            # Prefer 'published'; fall back to 'updated'; otherwise None.
            published = entry.get('published')
            if published is None:
                published = entry.get('updated')

            # Prefer 'summary'; fall back to 'description'; otherwise ''.
            content = entry.get('summary')
            if content is None:
                content = entry.get('description', '')

            # Processed synchronously, in feed order (no Celery dispatch here).
            process_news_article({
                'title': title,
                'content': content,
                'published': published,
                'link': link,
            })

In [97]:
if __name__ == '__main__':
    # Entry point: fetch the feeds and process every entry they contain
    parse_rss_feeds()


Others
2024-01-28 21:10:54,362 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-01-28 21:10:54,363 INFO sqlalchemy.engine.Engine SELECT news_articles.id AS news_articles_id, news_articles.title AS news_articles_title, news_articles.content AS news_articles_content, news_articles.pub_date AS news_articles_pub_date, news_articles.source_url AS news_articles_source_url, news_articles.category AS news_articles_category 
FROM news_articles 
WHERE news_articles.title = %(title_1)s AND news_articles.source_url = %(source_url_1)s 
 LIMIT %(param_1)s
2024-01-28 21:10:54,364 INFO sqlalchemy.engine.Engine [generated in 0.00048s] {'title_1': 'Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced', 'source_url_1': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html', 'param_1': 1}
2024-01-28 21:10:54,377 INFO sqlalchemy.engine.Engine INSERT INTO news_articles (title, content, pub_date, source_url, c