In [1]:
import feedparser
from datetime import datetime

# List of RSS Feeds
rss_feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]

# Dictionary to store articles
articles = {}

# Iterate through each RSS feed
for feed_url in rss_feeds:
    feed = feedparser.parse(feed_url)
    # Iterate through each entry in the feed
    for entry in feed.entries:
        # Extract relevant information
        title = entry.title
        content = entry.get("description", "")
        publication_date_str = entry.get("published", "")
        if publication_date_str:
            try:
                # Parse the publication date
                publication_date = datetime.strptime(publication_date_str, "%a, %d %b %Y %H:%M:%S %Z")
            except ValueError:
                # Try another format
                try:
                    publication_date = datetime.strptime(publication_date_str, "%a, %d %b %Y %H:%M:%S %z")
                except ValueError:
                    publication_date = None
        else:
            publication_date = None
        source_url = entry.link
        # Check for duplicate articles
        if title not in articles:
            articles[title] = {
                "title": title,
                "content": content,
                "publication_date": publication_date,
                "source_url": source_url
            }


In [2]:
print(articles)




In [3]:
for article in articles.values():
    print("Title:", article["title"])
    print("Content:", article["content"])
    print("Publication Date:", article["publication_date"])
    print("Source URL:", article["source_url"])
    print("-" * 50)

Title: Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced
Content: 
Publication Date: 2023-04-19 12:44:51
Source URL: https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html
--------------------------------------------------
Title: Dominion still has pending lawsuits against election deniers such as Rudy Giuliani and Sidney Powell
Content: 
Publication Date: None
Source URL: https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/h_8d51e3ae2714edaa0dace837305d03b8
--------------------------------------------------
Title: Here are the 20 specific Fox broadcasts and tweets Dominion says were defamatory
Content: • Fox-Dominion trial delay 'is not unusual,' judge says
• Fox News' defamation battle isn't stopping Trump's election lies
Publication Date: 2023-04-17 16:01:11
Source URL: https://www.cnn.com/2023/04/17/media/dominion-fox-news-allegations/index.html
--------------------

In [7]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError

# Define the database connection URL without specifying the database name
db_url = 'postgresql://rajsingh:[Raj];12@localhost:5432/rajsingh565'

# Create a temporary engine to connect to the default database
temp_engine = create_engine(db_url)

# Connect to the default database and create a new database
db_name = 'rss_feed_database'
with temp_engine.connect() as connection:
    try:
        connection.execution_options(isolation_level="AUTOCOMMIT").execute(text(f'CREATE DATABASE {db_name}'))
        print(f"Database '{db_name}' created successfully!")
    except OperationalError as e:
        print(f"Error creating database '{db_name}': {e}")

Database 'rss_feed_database' created successfully!


In [16]:
from sqlalchemy import create_engine, Column, Integer, String, Text, TIMESTAMP, UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from sqlalchemy.orm import declarative_base

# Define the database connection URL with the newly created database
db_url_with_db = 'postgresql://rajsingh:[Raj];12@localhost:5432/rss_feed_database'

# Create a new engine with the specified database name
engine_with_db = create_engine(db_url_with_db)

# Define the base class for ORM
Base = declarative_base()

# Define the Article model
class Article(Base):
    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    title = Column(String, nullable=False)
    content = Column(Text)
    publication_date = Column(TIMESTAMP)
    source_url = Column(String)
    
    # Ensure each article is unique based on title and source URL
    __table_args__ = (
        UniqueConstraint('title', 'source_url', name='uq_title_source_url'),
    )

# Create the table in the database if it doesn't exist
Base.metadata.create_all(engine_with_db)

# Function to insert articles into the database without duplicates
def insert_articles(articles):
    Session = sessionmaker(bind=engine_with_db)
    session = Session()
    
    for article in articles.values():
        # Check if the article already exists
        existing_article = session.query(Article).filter_by(title=article['title'], source_url=article['source_url']).first()
        if not existing_article:
            new_article = Article(title=article['title'], content=article['content'], 
                                  publication_date=article['publication_date'], source_url=article['source_url'])
            session.add(new_article)
    
    session.commit()
    session.close()

In [34]:
insert_articles(articles)


In [36]:
Session = sessionmaker(bind=engine_with_db)
session = Session()

# Query the database to retrieve all articles
all_articles = session.query(Article).all()

# Print the number of articles inserted
print(f"Total number of articles inserted: {len(all_articles)}")

# Optionally, you can print the titles of the inserted articles
for article in all_articles:
    print(article.title)

# Close the session
session.close()


Total number of articles inserted: 173
Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced
Dominion still has pending lawsuits against election deniers such as Rudy Giuliani and Sidney Powell
Here are the 20 specific Fox broadcasts and tweets Dominion says were defamatory
Judge in Fox News-Dominion defamation trial: 'The parties have resolved their case'
'Difficult to say with a straight face': Tapper reacts to Fox News' statement on settlement
Millions in the US could face massive consequences unless McCarthy can navigate out of a debt trap he set for Biden
White homeowner accused of shooting a Black teen who rang his doorbell turns himself in to face criminal charges
Newly released video shows scene of Jeremy Renner's snowplow accident
Jake Gyllenhaal and Jamie Lee Curtis spent the Covid-19 lockdown together
Toddler crawls through White House fence, prompts Secret Service response
Jamie Foxx remains hospitalized near

In [27]:
pip install celery

Note: you may need to restart the kernel to use updated packages.


In [45]:
from celery import Celery

# Configure Celery to use the solo (in-memory) message broker
app = Celery('tasks', broker='memory://')

In [60]:
from celery import Celery
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import models  # Update 'my_module' with the correct module name
from celeryconfig import broker

# Initialize NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define database connection
db_url = 'postgresql://rajsingh:[Raj];12@localhost:5432/rss_feed_database'
engine_with_db = create_engine(db_url)
Session = sessionmaker(bind=engine_with_db)

ImportError: cannot import name 'broker' from 'celeryconfig' (C:\Users\KIIT\OneDrive\Desktop\New\celeryconfig.py)

In [44]:
@app.task
def classify_and_update_database(article):
    # Perform category classification using NLTK or spaCy
    # Example NLTK-based classification
    categories = classify_with_nltk(article['content'])
    article['category'] = categories

    # Update the database with the assigned category
    update_database(article)

    return article


def classify_with_nltk(content):
    # Perform classification using NLTK
    # Example: Tokenization, stop word removal, lemmatization, and simple keyword matching
    tokens = word_tokenize(content)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in filtered_tokens]

    # Simple keyword matching to determine category
    categories = []
    for token in lemmatized_tokens:
        if token in ['terrorism', 'protest', 'political', 'unrest', 'riot']:
            categories.append('Terrorism / protest / political unrest / riot')
        elif token in ['positive', 'uplifting', 'happy', 'joyful']:
            categories.append('Positive/Uplifting')
        elif token in ['natural', 'disaster', 'earthquake', 'flood', 'hurricane']:
            categories.append('Natural Disasters')
    
    # If no category matches, assign to 'Others'
    if not categories:
        categories.append('Others')

    return categories

In [15]:
def update_database(article):
    # Create a new session
    session = Session()

    # Check if the article already exists in the database
    existing_article = session.query(Article).filter_by(title=article['title']).first()

    if existing_article:
        # Update the existing article's category
        existing_article.category = article['category']
    else:
        # Create a new article entry in the database
        new_article = Article(title=article['title'], content=article['content'], publication_date=article['publication_date'], source_url=article['source_url'], category=article['category'])
        session.add(new_article)

    # Commit the changes and close the session
    session.commit()
    session.close()

In [48]:
!pip install psycopg2



In [52]:
!pip install Article

Collecting Article
  Downloading article-0.1.1.tar.gz (724 bytes)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: Article
  Building wheel for Article (setup.py): started
  Building wheel for Article (setup.py): finished with status 'done'
  Created wheel for Article: filename=article-0.1.1-py3-none-any.whl size=1046 sha256=b791debcfb56fc69d58818d2a9405f09e63367926d372518211a1e344f206a41
  Stored in directory: c:\users\kiit\appdata\local\pip\cache\wheels\b3\e7\a3\e2180c3e4c13dd7314014a20ad99d0a1555144a6df876f712c
Successfully built Article
Installing collected packages: Article
Successfully installed Article-0.1.1


In [59]:
!pip install celeryconfig

ERROR: Could not find a version that satisfies the requirement celeryconfig (from versions: none)
ERROR: No matching distribution found for celeryconfig
