# Get News Articles from RSS Feeds

### RSS Feeds of Top News Channels

In [80]:
import json
import pandas as pd

In [41]:
# Load the feed list from rss_urls.csv (path is relative to the working directory)
df_feeds = pd.read_csv('rss_urls.csv')

# Serialise the table to a JSON string holding one object per CSV row
rss_feeds = df_feeds.to_json(orient='records')

# Decode that string back into Python objects: a list of dicts keyed by the
# CSV column headers, which the feed-parsing cell iterates over
json_feeds = json.loads(rss_feeds)

### Parse RSS Feeds

In [81]:
import feedparser
from bs4 import BeautifulSoup

In [72]:
# One dict per article, accumulated across every configured feed.
news_items = []

for feed in json_feeds:
    # Keys are column headers from the feeds table. 'Publiser Image' (sic) is
    # kept as spelled; presumably it matches the CSV header - confirm before
    # renaming.
    publisher_name = feed['Publisher Name']
    rss_feed_url = feed['RSS Feed Link']
    placeholder_imageUrl = feed['Publiser Image']

    # Download and parse the RSS/Atom document
    feed_data = feedparser.parse(rss_feed_url)

    for entry in feed_data.entries:
        # Feed elements are optional and attribute access on a missing one
        # raises AttributeError, which would abort the whole crawl. Read them
        # with .get() instead.
        title = entry.get("title")
        link = entry.get("link")
        # Some feeds only carry an "updated" timestamp.
        published = entry.get("published") or entry.get("updated")
        if not (title and link and published):
            # Later cells need a headline, a URL and a timestamp for every
            # article, so an entry lacking any of them is skipped.
            continue

        # Summaries may contain HTML markup; keep the visible text only.
        # A missing summary becomes an empty description.
        description = BeautifulSoup(entry.get("summary", ""), 'html.parser').get_text()

        # Use the first media attachment as thumbnail when it has a URL,
        # otherwise fall back to the publisher's placeholder image.
        media = entry.get("media_content") or []
        image_url = (media[0].get("url") if media else None) or placeholder_imageUrl

        news_items.append({
            "header": title,
            "sourceUrl": link,
            "publisher": publisher_name,
            "publishedOn": published,
            "description": description,
            "imageUrl": image_url,
        })



In [73]:
# Build a single DataFrame from the articles collected across all feeds
# (news_items is filled by the feed-parsing loop, one dict per article)
df = pd.DataFrame(news_items)

# Map News to Stocks via the Pinecone Index and Get Sentiment via OpenAI

#### The Pinecone index holds context about each stock, embedded from its earnings call transcripts

### Pinecone Details

In [82]:
import pinecone

import os

# Credentials are read from the environment so they are never saved inside the
# notebook. The empty-string defaults mirror the original placeholders.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment=os.environ.get("PINECONE_ENVIRONMENT", ""),
)

# Handle to the Pinecone index; the LangChain vector store is built on it.
index = pinecone.Index(os.environ.get("PINECONE_INDEX_NAME", ""))

  from tqdm.autonotebook import tqdm


### OpenAI Details

In [83]:
import openai

import os

# Set up your OpenAI API credentials. The key is read from the OPENAI_API_KEY
# environment variable so it is never stored in the notebook; the empty-string
# default mirrors the original placeholder.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

### Using the RetrievalQA Chain from LangChain

In [None]:

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt for the retrieval chain: the article text is substituted for
# {question} and the retrieved passages for {context}. The doubled braces are
# escapes, so the JSON example reaches the model with single braces.
prompt_template = """
Just output which companies will have an impact due to the news. 

News: {question}
=========
{context}
=========

Reply only with the below pattern of JSON. Do not add any text as reply. If there is no answer, reply with empty object. Reply only the JSON object and nothing else. 
[{{ "company":"company_name", "sentiment": "neutral"}}]

"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = {"prompt": PROMPT}

# Third positional argument of the vector store below (presumably the metadata
# key that holds each passage's text - confirm against the index schema)
text_field = "text"

# Embedding model used to turn a query into a vector for the Pinecone lookup
embed = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
    model="text-embedding-ada-002",
)
vectorstore = Pinecone(index, embed.embed_query, text_field)

# Chat model that answers from the retrieved passages (temperature 0.0)
llm = ChatOpenAI(
    temperature=0.0,
    model_name='gpt-3.5-turbo',
    openai_api_key=openai.api_key,
)

# Retrieval QA chain: retrieval is restricted by metadata filter to entries
# with country 'US' and quarter 'Q42022' or 'Q12023'; the "stuff" chain type
# is used with the custom prompt, and source documents are returned too.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_kwargs={
            'filter': {
                'country': 'US',
                'quarter': {'$in': ['Q42022', 'Q12023']},
            }
        }
    ),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
import time
from tqdm import tqdm


def apply_qa(row):
    """Ask the retrieval QA chain which companies one article affects.

    Args:
        row: A mapping-like article record (e.g. a DataFrame row) providing
            string values under 'header' and 'description'.

    Returns:
        The 'result' entry of the chain's response for a two-line query made
        of the headline and the summary.
    """
    headline_part = 'Headline: ' + row['header']
    summary_part = 'Summary: ' + row['description']
    answer = qa({"query": headline_part + '\n' + summary_part})
    return answer['result']

# Count the total number of rows so the progress bar knows its length
total_rows = len(df)

# Initialize a progress bar
progress_bar = tqdm(total=total_rows, desc="Processing")

try:
    # The loop variable is deliberately NOT called `index`: that name is bound
    # to the Pinecone index handle earlier in the notebook, and overwriting it
    # would break any re-run of the cell that builds the vector store.
    for row_label, row in df.iterrows():
        # Query the retrieval chain for this article and store its raw reply
        # in the 'companySentiment' column
        df.loc[row_label, 'companySentiment'] = apply_qa(row)

        # Update the progress bar
        progress_bar.update(1)

        # Pause one second between requests (presumably to stay under API
        # rate limits - adjust as needed)
        time.sleep(1)
finally:
    # Close the bar even when a request fails part-way through the loop
    progress_bar.close()

# Filter out Articles using OpenAI
##### This keeps the list of articles focused, based on OpenAI's interpretation of each news item's impact on markets

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

# Prompt asking the model to rate an article's capital-market impact from
# 0 to 10. The article text is substituted for {text}; the doubled braces are
# escapes, so the JSON example reaches the model with single braces.
prompt_template = """

You are a helpful assistant. You provide an impact score when I share a news article title and short summary based on how much impact the news can have on capital markets.You rate the scores 0 to 10 based on level of impact.


News 
=========
{text}
=========


Reply only with the below pattern of JSON. Do not add any text as reply. If there is no answer, reply with empty object. Reply only the JSON object and nothing else. 
{{"impact_score":""}}
"""
PROMPT = PromptTemplate(input_variables=["text"], template=prompt_template)

# Chat model used for scoring (temperature 0.0)
llm = ChatOpenAI(
    temperature=0.0,
    model="gpt-3.5-turbo",
    openai_api_key=openai.api_key,
)

# "stuff" summarize chain driven by the custom scoring prompt
chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)

# Smoke test: score one hard-coded sample article and show the raw reply
text = '\n'.join([
    'Headline: ' + 'Russia-Ukraine war live: Putin says those behind ‘armed rebellion’ will be punished;',
    'Summary: ' + 'Wagner chief says he’s in Rostov military HQ – latest updates - ',
])

result = chain.run([Document(page_content=text)])

print(result)

In [None]:
import time
from tqdm import tqdm


def apply_qa(row):
    """Run the impact-score chain on one article and return its raw reply.

    NOTE: this redefines the `apply_qa` from the company-sentiment cell; once
    this cell has run, the name refers to this version.

    Args:
        row: A mapping-like article record (e.g. a DataFrame row) providing
            string values under 'header' and 'description'.

    Returns:
        Whatever `chain.run` returns for a single Document whose content is
        the headline line followed by the summary line.
    """
    pieces = ('Headline: ' + row['header'], 'Summary: ' + row['description'])
    article = Document(page_content='\n'.join(pieces))
    return chain.run([article])

# Count the total number of rows so the progress bar knows its length
total_rows = len(df)

# Initialize a progress bar
progress_bar = tqdm(total=total_rows, desc="Processing")

try:
    # The loop variable is deliberately NOT called `index`: that name is bound
    # to the Pinecone index handle earlier in the notebook, and overwriting it
    # would break any re-run of the cell that builds the vector store.
    for row_label, row in df.iterrows():
        # Score this article and store the model's raw reply in the
        # 'impact_score' column
        df.loc[row_label, 'impact_score'] = apply_qa(row)

        # Update the progress bar
        progress_bar.update(1)
finally:
    # Close the bar even when a request fails part-way through the loop
    progress_bar.close()

In [None]:
# Articles scoring strictly above this value are flagged as important
IMPACT_THRESHOLD = 5


def _is_important(raw_reply, threshold=IMPACT_THRESHOLD):
    """Decide whether a raw impact-score reply marks an important article.

    The scoring prompt shows the score as a JSON string and allows an empty
    object as an answer, so the reply may hold "7", 7, "" or no score at all.
    Comparing such a value directly with an int raises, hence the explicit
    conversion.

    Args:
        raw_reply: The model's reply, expected to be JSON text shaped like
            {"impact_score": ...}.
        threshold: Score that must be exceeded.

    Returns:
        True when a numeric score above `threshold` could be read; False when
        the score is lower or the reply cannot be interpreted.
    """
    try:
        score = float(json.loads(raw_reply)["impact_score"])
    except (TypeError, ValueError, KeyError):
        # Not JSON, not an object, score missing, or score not numeric
        return False
    return score > threshold


df['isImportant'] = df['impact_score'].apply(_is_important)
num_important_rows = df['isImportant'].sum()

# Update MongoDB to serve the web-app


### MongoDB Details 

In [None]:
from pymongo import MongoClient

import os

# Connection settings are read from the environment so the connection string
# (which may embed credentials) is never saved inside the notebook. The
# original placeholders remain as defaults.
MONGO_URL = os.environ.get("MONGO_URL", "YOUR_MONGO_URL")
DB_NAME = os.environ.get("MONGO_DB_NAME", "YOUR_DB_NAME")
COLLECTION = os.environ.get("MONGO_COLLECTION", "YOUR_DB_COLLECTION")

# Connect to the MongoDB instance, then select the database and collection
client = MongoClient(MONGO_URL)
db = client[DB_NAME]
collection = db[COLLECTION]

### Date Formatting before updating the MongoDB

In [None]:
from dateutil import parser
import pytz

# Assuming your DataFrame is called 'df' and it contains a field 'publishedOn'

# Define timezones
us_eastern = pytz.timezone('US/Eastern')

# Offsets in seconds from UTC for the North American zone abbreviations that
# RFC 822 style feed dates may carry. Handed to dateutil so such dates are
# parsed as timezone-aware values.
_RFC822_TZ_OFFSETS = {
    'EST': -5 * 3600, 'EDT': -4 * 3600,
    'CST': -6 * 3600, 'CDT': -5 * 3600,
    'MST': -7 * 3600, 'MDT': -6 * 3600,
    'PST': -8 * 3600, 'PDT': -7 * 3600,
}


def convert_to_edt(time_str):
    """Convert a publication timestamp string to US Eastern local time.

    Args:
        time_str: Date/time text as found in a feed entry.

    Returns:
        The moment formatted as 'YYYY-MM-DD HH:MM:SS ZZZ' in US/Eastern, where
        ZZZ is the abbreviation in force on that date ('EDT' in summer, 'EST'
        in winter). The suffix used to be a fixed 'EDT', which mislabelled
        winter timestamps.
    """
    parsed_time = parser.parse(time_str, tzinfos=_RFC822_TZ_OFFSETS)
    if parsed_time.tzinfo is None:
        # No usable zone in the string: assume UTC, so the result does not
        # depend on the local timezone of the machine running the notebook.
        parsed_time = parsed_time.replace(tzinfo=pytz.utc)
    eastern_time = parsed_time.astimezone(us_eastern)
    return eastern_time.strftime('%Y-%m-%d %H:%M:%S %Z')


# Create the new field 'publishedOn_EDT'
df['publishedOn_EDT'] = df['publishedOn'].apply(convert_to_edt)

### Add df to a collection in MongoDB

In [None]:
# Upload every article as one MongoDB document.
# The loop variable is deliberately NOT called `index`: that name is bound to
# the Pinecone index handle earlier in the notebook.
for row_label, row in df.iterrows():
    # The model's reply is stored as text; decode it into Python objects.
    # A reply that is not valid JSON is stored as an empty list, so that one
    # malformed answer does not abort the upload part-way through.
    try:
        company_sentiments = json.loads(row['companySentiment'])
    except (TypeError, ValueError):
        company_sentiments = []

    # No cell in this notebook creates an 'associated_companies' column, so
    # use it when present and otherwise derive the names from the parsed
    # sentiment entries.
    if 'associated_companies' in row.index:
        company_names = row['associated_companies']
    elif isinstance(company_sentiments, list):
        company_names = [
            item['company']
            for item in company_sentiments
            if isinstance(item, dict) and 'company' in item
        ]
    else:
        company_names = []

    # Create the MongoDB document
    document = {
        "header": row['header'],
        "sourceUrl": row['sourceUrl'],
        "publisher": row['publisher'],
        "publishedOn": row['publishedOn_EDT'],
        "description": row['description'],
        # Defensive cast to a built-in bool, since pandas can hand back NumPy
        # scalars
        "isImportant": bool(row['isImportant']),
        "imageUrl": row['imageUrl'],
        "companyNames": company_names,
        "companySentiment": company_sentiments,
    }

    # Insert the document into MongoDB
    collection.insert_one(document)