In [None]:
import feedparser
from datetime import datetime
from sqlalchemy import create_engine, Column, String, Text, DateTime, Integer
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Declarative base class that the ORM model below inherits from; its metadata
# is what `create_all` uses to emit the table DDL.
Base = declarative_base()

class Article(Base):
    """ORM model for a single news article, stored in the ``articles`` table."""
    __tablename__ = 'articles'
    # String primary key supplied by the ingestion code (not auto-generated).
    id = Column(String, primary_key=True)
    # Headline text taken from the feed entry.
    title = Column(String)
    # Body/summary text; this is the field the keyword classifier inspects.
    content = Column(Text)
    # Publication timestamp stored as a naive datetime.
    # NOTE(review): presumably UTC, since it is built from feedparser's parsed time tuple — confirm.
    pub_date = Column(DateTime)
    # Link back to the original article.
    source_url = Column(String)
    # Not set at ingest time; filled in later by the classification task.
    category = Column(String)

# Read the connection URL from the environment so real credentials never have
# to be written into the notebook. The literal is only a local-development
# fallback and keeps the previous behaviour when DATABASE_URL is unset.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql://user:password@localhost/news_db',
)

engine = create_engine(DATABASE_URL)
# Create the tables declared on Base if they do not exist yet.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Single module-level session shared by the ingestion and classification code.
session = Session()

# RSS/Atom feed URLs to ingest; each one is passed to parse_feed in list order.
# NOTE(review): most entries use plain http — confirm whether https endpoints
# are available for them.
feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

def parse_feed(feed_url):
    """Fetch one RSS/Atom feed and upsert its entries into the ``articles`` table.

    Entries are merged by primary key, so re-running against the same feed
    updates existing rows instead of inserting duplicates. All entries of a
    feed are committed together in one transaction.

    Args:
        feed_url: URL of the feed to download and parse.

    Raises:
        Exception: Any error raised while building or storing the articles is
            re-raised after the shared session has been rolled back, so the
            session stays usable for the next feed.
    """
    feed = feedparser.parse(feed_url)

    # feedparser does not raise on fetch/parse problems; it sets ``bozo`` and
    # usually returns no entries. Surface that instead of silently doing nothing.
    if feed.get('bozo') and not feed.entries:
        logging.warning(
            f'No entries retrieved from {feed_url}: {feed.get("bozo_exception")}'
        )
        return

    try:
        for entry in feed.entries:
            # Not every feed publishes a GUID; the link is the next-best stable key.
            entry_id = entry.get('id') or entry.get('link')
            if not entry_id:
                logging.warning(f'Skipping entry without id or link in {feed_url}')
                continue

            # Optional fields are read with .get() because attribute access on a
            # missing feedparser key raises AttributeError.
            published = entry.get('published_parsed')
            article = Article(
                id=entry_id,
                title=entry.get('title'),
                content=entry.get('description'),
                pub_date=datetime(*published[:6]) if published else None,
                source_url=entry.get('link'),
            )
            session.merge(article)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session in an aborted transaction;
        # roll back so later calls on the shared session can proceed.
        session.rollback()
        raise

# Ingest every configured feed, in list order.
for feed in feeds:
    parse_feed(feed)


In [None]:
from celery import Celery
from sqlalchemy import update
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus.
# NOTE(review): `stopwords` is imported but not used in any visible cell — confirm it is still needed.
nltk.download('stopwords')

# Celery application named 'tasks', pointed at an AMQP broker on localhost
# using the `guest` account.
app = Celery('tasks', broker='pyamqp://guest@localhost//')

@app.task
def process_article(article_id):
    """Classify one stored article by keyword and persist its category.

    The article body is scanned for a small set of keywords; the first matching
    rule wins and anything unmatched is filed under ``'Others'``.

    Args:
        article_id: Primary key of the ``Article`` row to classify.

    Returns:
        None. The result is written to the row's ``category`` column.

    Raises:
        Exception: Database errors during the update are re-raised after the
            shared session has been rolled back.
    """
    article = session.query(Article).get(article_id)
    if article is None:
        # The row can disappear between enqueueing and execution; there is
        # nothing to classify in that case.
        logging.warning(f'Article {article_id} not found; skipping classification')
        return

    # Content may be NULL, and keywords at the start of a sentence are
    # capitalised, so normalise before matching.
    text = (article.content or '').lower()

    # Simple keyword-based classification
    if 'terrorism' in text or 'protest' in text:
        category = 'Terrorism / protest / political unrest / riot'
    elif 'positive' in text or 'uplifting' in text:
        category = 'Positive/Uplifting'
    elif 'earthquake' in text or 'flood' in text:
        category = 'Natural Disasters'
    else:
        category = 'Others'

    try:
        stmt = update(Article).where(Article.id == article_id).values(category=category)
        session.execute(stmt)
        session.commit()
    except Exception:
        # Leave the shared session usable for the next task.
        session.rollback()
        raise

# Only the primary key is needed to enqueue a task, so select just that column
# instead of loading every full row (including the Text body) into memory.
for (article_id,) in session.query(Article.id).all():
    process_article.delay(article_id)


In [None]:
import logging

# Set up logging: INFO and above, each record prefixed with a timestamp and level.
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers — confirm this takes effect in the kernel where the notebook runs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

def parse_feed(feed_url):
    """Fetch one RSS/Atom feed and upsert its entries, logging the outcome.

    Entries are merged by primary key, so re-running against the same feed
    updates existing rows instead of inserting duplicates. Errors are logged
    rather than raised so that one bad feed does not stop the others from
    being processed.

    Args:
        feed_url: URL of the feed to download and parse.
    """
    try:
        feed = feedparser.parse(feed_url)

        # feedparser does not raise on fetch/parse problems; it sets ``bozo``
        # and usually returns no entries. Without this check a dead feed would
        # be reported as a success.
        if feed.get('bozo') and not feed.entries:
            logging.error(f'Error parsing feed {feed_url}: {feed.get("bozo_exception")}')
            return

        for entry in feed.entries:
            # Not every feed publishes a GUID; the link is the next-best stable key.
            entry_id = entry.get('id') or entry.get('link')
            if not entry_id:
                logging.warning(f'Skipping entry without id or link in {feed_url}')
                continue

            # Optional fields are read with .get() because attribute access on a
            # missing feedparser key raises AttributeError.
            published = entry.get('published_parsed')
            article = Article(
                id=entry_id,
                title=entry.get('title'),
                content=entry.get('description'),
                pub_date=datetime(*published[:6]) if published else None,
                source_url=entry.get('link'),
            )
            session.merge(article)
        session.commit()
        logging.info(f'Successfully parsed and stored articles from {feed_url}')
    except Exception as e:
        # logging.exception keeps the traceback alongside the original message.
        logging.exception(f'Error parsing feed {feed_url}: {e}')
        # A failed flush/commit leaves the session in an aborted transaction;
        # without a rollback every subsequent feed would fail as well.
        session.rollback()

# Re-run ingestion over every configured feed using the logging-aware parse_feed defined above.
for feed in feeds:
    parse_feed(feed)
