In [1]:
!pip install feedparser sqlalchemy psycopg2 spacy redis celery
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 6.7 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 6.5 MB/s eta 0:00:02
     ----------- ---------------------------- 3.7/12.8 MB 6.8 MB/s eta 0:00:02
     --------------- ------------------------ 5.0/12.8 MB 6.6 MB/s eta 0:00:02
     ------------------ --------------------- 6.0/12.8 MB 6.4 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 6.5 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 6.5 MB/s eta 0:00:01
     ------------------------------- -------- 10.2/12.8 MB 6.4 MB/s eta 0:00:01
     ------------------------------------ --- 11.5/12.8 MB 6.6 MB/s eta 0:00:01
     ----------------------------------

In [2]:
!pip install feedparser sqlalchemy psycopg2-binary celery nltk spacy




In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import spacy
!python -m spacy download en_core_web_sm


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\poona\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poona\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
def fetch_articles(feed_urls):
    articles = []
    for url in feed_urls:
        feed = feedparser.parse(url)
        for entry in feed.entries:
            # Use .get() method to safely access attributes
            articles.append({
                'title': entry.title,
                'content': getattr(entry, 'summary', ''),  # Default to empty string if not present
                'pub_date': getattr(entry, 'published', None),  # Default to None if not present
                'link': entry.link
            })
    return articles


In [5]:
# Define RSS feed URLs
rss_feeds = [
    'https://rss.cnn.com/rss/cnn_topstories.rss',  # CNN Top Stories
    'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',  # NY Times Home Page
    # Add more RSS feed URLs as needed
    
]

# Function to fetch articles (make sure this is implemented in your code)
def fetch_articles(rss_feeds):
    articles = []
    # Logic to fetch and process articles from RSS feeds
    return articles

# Fetch articles from the RSS feeds
articles = fetch_articles(rss_feeds)

# Print the first few articles to verify
for article in articles[:5]:
    print(article)


In [6]:
# Fetch articles from the RSS feeds
articles = fetch_articles(rss_feeds)

# Print the first few articles to verify
for article in articles[:5]:
    print(article)


In [7]:
unique_articles = []
seen_titles = set()

for article in articles:
    if article['title'] not in seen_titles:
        seen_titles.add(article['title'])
        unique_articles.append(article)


In [8]:
!pip install mysql-connector-python




In [9]:
import mysql.connector

# Establish the connection
connection = mysql.connector.connect(
    host='localhost',          # Your database host
    user='root',      # Your MySQL username
    password='root',  # Your MySQL password
    database='db'   # The name of the database you want to use
)

# Create a cursor object
cursor = connection.cursor()

# Check if the connection was successful
if connection.is_connected():
    print("Connected to the database")
else:
    print("Connection failed")


Connected to the database


In [10]:
create_table_query = """
CREATE TABLE IF NOT EXISTS articles (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title TEXT UNIQUE,
    content TEXT,
    pub_date DATETIME,
    link TEXT,
    category VARCHAR(50)
);
"""

cursor.execute(create_table_query)
print("Table created successfully")


Table created successfully


In [11]:
cursor.close()
connection.close()


In [13]:
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from dateutil import parser  # to parse date strings

# Use SQLite in-memory database or a file-based database
DATABASE_URL = 'sqlite:///articles.db'  # ':memory:' for in-memory
engine = create_engine(DATABASE_URL)
Base = declarative_base()

class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    title = Column(Text, unique=True)
    content = Column(Text)
    pub_date = Column(DateTime)
    link = Column(Text)
    category = Column(String(50))

# Create the database and tables
Base.metadata.create_all(engine)

# Create a new session
Session = sessionmaker(bind=engine)
session = Session()

# Convert pub_date to datetime if it's a string
for article in unique_articles:
    if isinstance(article.get('pub_date'), str):
        try:
            article['pub_date'] = parser.parse(article['pub_date'])  # Parse string to datetime
        except (ValueError, TypeError):
            article['pub_date'] = None  # Set to None if parsing fails

    new_article = Article(**article)
    session.add(new_article)

# Commit the changes
try:
    session.commit()
except Exception as e:
    session.rollback()  # Rollback in case of error
    print(f"Error occurred: {e}")
finally:
    session.close()


  Base = declarative_base()


In [14]:
from celery import Celery

app = Celery('tasks', broker='redis://localhost:6379/0')

@app.task
def classify_article(article):
    # Dummy classification for demonstration; replace with your NLP logic
    if 'earthquake' in article['content'].lower():
        category = 'Natural Disasters'
    elif 'protest' in article['content'].lower():
        category = 'Terrorism / protest / political unrest / riot'
    else:
        category = 'Others'

    # Update category in the database
    db_article = session.query(Article).filter_by(title=article['title']).first()
    db_article.category = category
    session.commit()


In [15]:
from sqlalchemy.exc import IntegrityError

def add_article(session, article):
    # Check if the article with the same title already exists
    existing_article = session.query(Article).filter_by(title=article['title']).first()
    
    if existing_article:
        logging.warning(f"Article with title '{article['title']}' already exists. Skipping insert.")
        return
    
    # Otherwise, add the new article
    try:
        new_article = Article(**article)
        session.add(new_article)
        session.commit()
        logging.info(f"Article '{article['title']}' added successfully.")
    except IntegrityError as e:
        session.rollback()  # Rollback in case of failure
        logging.error(f"IntegrityError occurred: {e}")


In [16]:
from sqlalchemy import text
from sqlalchemy.exc import IntegrityError

try:
    session.execute(
        text("INSERT OR IGNORE INTO articles (title, content, pub_date, link, category) VALUES (?, ?, ?, ?, ?)"),
        (
            article.get('title'), 
            article.get('content', ''),  # Default to an empty string if 'content' is missing
            article.get('pub_date'),
            article.get('link'),
            article.get('category', None)  # Default to None if 'category' is missing
        )
    )
    session.commit()
except IntegrityError as e:
    print(f"Integrity error occurred: {e}")
    session.rollback()  # Roll back the session to clear the error
except Exception as e:
    print(f"An error occurred: {e}")
    session.rollback()


An error occurred: name 'article' is not defined


In [17]:
try:
    for article in unique_articles:
        # Check if the article with the same title already exists
        existing_article = session.query(Article).filter_by(title=article['title']).first()

        if existing_article:
            # Update the existing article if it already exists
            existing_article.content = article.get('content', existing_article.content)
            existing_article.pub_date = article.get('pub_date', existing_article.pub_date)
            existing_article.link = article.get('link', existing_article.link)
            existing_article.category = article.get('category', existing_article.category)
        else:
            # Create a new article if it does not exist
            new_article = Article(**article)
            session.add(new_article)

    # Commit the changes once, after the loop to improve performance
    session.commit()

    # Queue classification for each unique article processed
    for article in unique_articles:
        classify_article.delay(article)  # Queue for classification

except Exception as e:
    # Rollback the session if there was an error
    session.rollback()
    print(f"An error occurred: {e}")

finally:
    # Close the session (optional, if you're done with the session)
    session.close()


In [18]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

CATEGORIES = {
    "terrorism": ["terrorism", "riot", "protest", "political unrest"],
    "positive": ["uplifting", "positive", "inspiring"],
    "natural_disasters": ["earthquake", "flood", "natural disaster", "hurricane"],
    "others": []
}

def classify_article(content):
    doc = nlp(content)
    for category, keywords in CATEGORIES.items():
        if any(keyword in doc.text.lower() for keyword in keywords):
            return category
    return "others"


In [19]:
def fetch_and_process_articles():
    articles = get_all_articles()
    for article in articles:
        process_article.delay(article)  # Add each article to the Celery queue


In [20]:
import logging

logging.basicConfig(filename='news_app.log', level=logging.INFO)

def safe_parse_rss_feed(feed_url):
    try:
        return parse_rss_feed(feed_url)
    except Exception as e:
        logging.error(f"Failed to parse feed {feed_url}: {e}")
        return []

# Implement network error handling in database operations
def safe_store_article(article, category):
    try:
        store_article(article, category)
    except Exception as e:
        logging.error(f"Failed to store article {article['title']}: {e}")


In [21]:
import logging

logging.basicConfig(level=logging.INFO)

def log_article_processing(article):
    logging.info(f"Processing article: {article['title']}")


In [22]:
try:
    # Article processing logic here
    pass  # Replace with actual logic
except Exception as e:
    logging.error(f"Error processing article: {e}")


In [23]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd

# Assuming Article is already imported and is your SQLAlchemy ORM model

# Define your database URL (e.g., for SQLite)
DATABASE_URL = "sqlite:///articles.db"  # Make sure this is the correct path

# Create an engine that connects to the database
engine = create_engine(DATABASE_URL)

# Create a configured "Session" class
Session = sessionmaker(bind=engine)

# Create a session instance
session = Session()

# Query the data from the Article table
df = pd.read_sql(session.query(Article).statement, session.bind)

# Save the data to a CSV file
df.to_csv('articles.csv', index=False)


In [24]:
import os

print(os.path.exists('articles.db'))  # Returns True if the file exists
print(os.path.exists('rss_feed_articles.db'))


True
True


In [25]:
df.to_csv('articles.csv', index=False, encoding='utf-8')


In [26]:
df.to_csv('articles.csv', index=False, encoding='utf-8-sig')  # Use utf-8-sig if there are BOM issues
