### Task: application that collects news articles from various RSS feeds

In [14]:
# Feeds
links = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]


In [15]:
# Feed Parser
'''
Parse each feed and extract relevant information from each news article,
including title, content, publication date, and source URL.
'''


'\nParse each feed and extract relevant information from each news article,\nincluding title, content, publication date, and source URL.\n'

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [17]:
# ETL

In [215]:
def item_parser(link):
    r = requests.get(link) 
    soup = BeautifulSoup(r.content, 'xml')
    items = soup.find_all('item')

    a = pd.DataFrame(columns=['title','link','pub_date','description','content'])
    for item in items:
        title = item.title.text
        link = item.link.text
        try:
            pub_date = item.pubDate.text
        except:
            pub_date = None
        try:
            description = item.description.text
        except:
            description = None
        try:
            content = item.description.text
        except:
            content = None

        a.loc[len(a)] = [title,link,pub_date,description,content]

    return a

In [216]:
def get_data(links):
    #df = pd.DataFrame(columns=['title','link','pub_date','description'])
    p = []
    for link in links:
        try:
            a = item_parser(link)
            p.append(a)
            #df=pd.concat([df,a],axis=0)
        except:
            print(link)
    return p

In [218]:
n= get_data(links)

http://feeds.reuters.com/reuters/businessNews


In [220]:
pd.concat(n,axis=0)

Unnamed: 0,title,link,pub_date,description,content
0,Some on-air claims about Dominion Voting Syste...,https://www.cnn.com/business/live-news/fox-new...,"Wed, 19 Apr 2023 12:44:51 GMT",,
1,Dominion still has pending lawsuits against el...,https://www.cnn.com/business/live-news/fox-new...,,,
2,Here are the 20 specific Fox broadcasts and tw...,https://www.cnn.com/2023/04/17/media/dominion-...,"Mon, 17 Apr 2023 16:01:11 GMT","• Fox-Dominion trial delay 'is not unusual,' j...","• Fox-Dominion trial delay 'is not unusual,' j..."
3,Judge in Fox News-Dominion defamation trial: '...,https://www.cnn.com/2023/04/18/media/fox-domin...,"Wed, 19 Apr 2023 08:28:17 GMT",The judge just announced in court that a settl...,The judge just announced in court that a settl...
4,'Difficult to say with a straight face': Tappe...,https://www.cnn.com/videos/politics/2023/04/18...,"Tue, 18 Apr 2023 21:17:44 GMT",A settlement has been reached in Dominion Voti...,A settlement has been reached in Dominion Voti...
...,...,...,...,...,...
15,Indian PM Modi inaugurates temple in Ayodhya,https://www.bbc.co.uk/news/world-asia-india-68...,"Mon, 22 Jan 2024 08:06:54 GMT",The opening of the grand temple comes ahead of...,The opening of the grand temple comes ahead of...
16,Plane gets jammed under bridge in India,https://www.bbc.co.uk/news/world-asia-india-67...,"Fri, 29 Dec 2023 23:17:00 GMT",The scrap fuselage was on its way to Assam fro...,The scrap fuselage was on its way to Assam fro...
17,Intruder jumps on table in Indian parliament,https://www.bbc.co.uk/news/world-asia-india-67...,"Wed, 13 Dec 2023 09:49:30 GMT",Lawmakers said two men jumped into the well of...,Lawmakers said two men jumped into the well of...
18,Moment worker is pulled from collapsed tunnel,https://www.bbc.co.uk/news/world-asia-india-67...,"Tue, 28 Nov 2023 15:13:21 GMT",Fourty-one Indian workers are finally rescued ...,Fourty-one Indian workers are finally rescued ...


In [25]:
df = get_data(links)

http://feeds.reuters.com/reuters/businessNews


In [27]:
df.dropna(inplace=True)

In [10]:
from sqlalchemy_utils import database_exists, create_database
from config import postgresql as settings
# Example using SQLAlchemy for database storage
from sqlalchemy import create_engine, Column, String, DateTime, MetaData
from sqlalchemy.orm import sessionmaker,declarative_base

In [11]:
def get_engine(user, passwd, host, port, db):
    url = f"postgresql://{user}:{passwd}@{host}:{port}/{db}"
    if not database_exists(url):
        create_database(url)
    engine = create_engine(url, pool_size=50, echo=False)
    return engine

In [12]:
Base = declarative_base()

class NewsArticle(Base):
    __tablename__ = 'news_articles'

    link = Column(String, primary_key=True)
    title = Column(String)
    description = Column(String)
    content = Column(String)
    pub_date = Column(DateTime)
    

engine = get_engine(settings['pguser'],
                    settings['pgpasswd'],
                    settings['pghost'],
                    settings['pgport'],
                    settings['pgdb'])

Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

def store_articles_in_database(articles: pd.DataFrame):
    for i,article in articles.iterrows():
        # Check for duplicates before adding to the database
        existing_article = session.query(NewsArticle).filter_by(title=article['link']).first()
        if not existing_article:
            db_article = NewsArticle(**article.to_dict())
            session.add(db_article)
    session.commit()


In [13]:
store_articles_in_database(df)

In [None]:
from celery import Celery
from nltk import classify_news_category  # Implement your own classification function

app = Celery('news_processing')
app.config_from_object('celery_config')

@app.task
def process_news_articles(articles):
    for article in articles:
        category = classify_news_category(article['content'])
        article['category'] = category
        update_database_with_category(article)

In [None]:
'''
Categories the news item should fall under are:
● Terrorism / protest / political unrest / riot
● Positive/Uplifting
● Natural Disasters
● Others
'''

In [29]:
import gensim.downloader as api
import spacy

In [4]:
wv = api.load('glove-twitter-50')



In [7]:
wv.save('vectors.kv')

In [30]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load('vectors.kv')

In [3]:
wv["floods"]

array([ 0.19816 ,  0.41016 , -0.11844 , -0.71497 , -0.74505 , -0.3282  ,
        0.023166, -0.64659 ,  0.19508 , -1.213   ,  0.78447 , -1.493   ,
       -2.393   ,  0.89226 ,  0.9995  , -0.12802 , -0.33537 , -0.12927 ,
        0.40244 , -1.1615  , -0.027187, -0.68055 , -0.59593 ,  0.1749  ,
        0.23277 ,  0.96935 , -0.80234 ,  1.6536  , -0.74963 ,  0.66573 ,
       -0.26484 , -0.042629,  0.96103 ,  0.71919 ,  0.2738  , -0.22882 ,
       -1.0678  ,  0.72851 ,  0.13782 ,  0.32018 ,  0.64071 , -0.62929 ,
       -0.23395 ,  0.044309, -0.76914 ,  0.60806 , -0.38999 ,  0.081423,
        0.17033 ,  1.3904  ], dtype=float32)

In [41]:
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [32]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'’m', 'become', 'several', 'her', "'d", 'thereupon', 'last', 'the', 'noone', 'two', 'ourselves', 'another', 'as', 'afterwards', 'please', 'one', 'against', 'these', '‘s', 'thence', 'because', 'forty', 'sometimes', 'perhaps', 'now', 'do', 'anywhere', 'we', 'how', 'ca', 'during', 'below', 'further', 'amongst', 'had', 'first', 'themselves', 'about', 'this', 'every', 'everything', 'whereas', 'must', 'across', 'have', 'beyond', 'by', 'made', 'therefore', 'empty', 'ten', 'those', 'once', 'whole', 'nowhere', 'out', 'third', "'ll", 'n‘t', "'ve", 'name', 'between', 'front', 'whence', 'ever', 'various', 'off', 'nothing', 'should', 'upon', 'done', 'whom', 'besides', 'doing', '‘m', 'if', 'your', 'least', 'whereupon', '’ve', 'formerly', 'give', 'it', 'serious', 'up', 'six', 'were', 'around', 'hence', 'seems', 'both', 'regarding', 'its', 'myself', 'eight', 'so', 'there', 'himself', 'for', 'am', 'still', 'call', 'can', 'yet', 'without', 'mostly', 'does', 'alone', 'being', 'side', 'thereafter', 'else

In [33]:
import string
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [34]:
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [113]:
import re
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [114]:
df['content'] = df['content'].str.lower()
df['content'] = df['content'].apply(remove_html)
df['tokens'] = df['content'].apply(spacy_tokenizer)

In [115]:
df['vec'] = df['tokens'].apply(sent_vec)

In [203]:
classes = [
    ["terrorism", "protest"],
    ["positive","uplifting"],
    ["earthquake", "floods"] , # natural disasters
]

In [204]:
class_names = ['Political_Unreset','Positive/Uplifting','Natural_Disaster']

In [205]:
vectors = np.array([sent_vec(i) for i in classes])
def get_class(vector):
    similarity = KeyedVectors.cosine_similarities(vector, vectors)
    if(max(similarity)) >= 0.65:
        return class_names[np.argmax(similarity)]
    return 'other'

In [206]:
df['class'] = df['vec'].apply(get_class)

In [207]:
df['class'].value_counts()

other                 92
Political_Unreset     61
Positive/Uplifting     1
Natural_Disaster       1
Name: class, dtype: int64