# RAG-based Real-time News Sentiment and Summary for Stocks


This notebook reads a real-time news feed (in this case, a stock-news RSS feed).
- A RAG-based app built on a large language model (OpenAI `gpt-4o-mini`) lets users query the feed data.
- In addition, sentiment analysis of each headline is carried out using the same app.

Install Python packages

In [None]:
!pip install -qU langchain langchain-openai langchain_chroma langchain_community langchainhub langgraph langchain_text_splitters bs4

In [2]:
!pip install --upgrade --quiet  feedparser newspaper3k listparser lxml


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [29]:
import os
import uuid
from datetime import datetime
from typing import Sequence

import bs4
import feedparser
import pandas as pd
import pytz
from google.colab import userdata
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import PDFMinerLoader # import PDFMinerLoader
from langchain_community.document_loaders import RSSFeedLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict

# Function to process the real-time RSS feed

In [4]:

def process_rss(rss_path):
  """Collect today's headlines from an RSS feed as LangChain documents.

  Args:
    rss_path: URL or path of the feed; passed straight to ``feedparser.parse``.

  Returns:
    A list of ``Document`` objects, one per feed entry whose publication
    time, expressed in GMT, falls on today's GMT date. ``page_content`` is
    the entry title; ``metadata`` holds the entry link under ``"source"`` and
    the raw date string under ``"published"``. The list is empty when the
    feed has no entries or none were published today.
  """
  gmt = pytz.timezone('GMT')
  today = datetime.now(gmt).date()
  feed = feedparser.parse(rss_path)

  docs = []
  for entry in feed.entries:
    published_raw = entry.get("published", '')
    try:
      # Dates carrying a numeric UTC offset, e.g. "... +0000".
      published_date = datetime.strptime(published_raw, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
      try:
        # Dates carrying a literal "GMT" suffix, which %z does not accept.
        published_date = datetime.strptime(published_raw, "%a, %d %b %Y %H:%M:%S GMT")
      except ValueError:
        # Missing or unrecognised date: the entry cannot be placed on a day, so skip it.
        continue
      published_date = published_date.replace(tzinfo=gmt)

    # Convert to GMT before taking the calendar day so that an entry stamped
    # with another UTC offset is compared against `today` on the same clock.
    if published_date.astimezone(gmt).date() == today:
      docs.append(
        Document(page_content=entry.get("title", ''),
                 metadata={"source": entry.get("link", ''),
                           "published": published_raw})
      )
  return docs
#ref: https://medium.com/@paulo_marcos/save-precious-time-by-letting-ai-read-the-news-for-you-5c0c851e599a

# Set up the question-and-answer app

In [20]:
# Set OpenAI API key (read from the Colab secret store rather than hard-coded)
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Which source feeds the retriever: "rss" or "web".
DATA_SOURCE = "rss"
# NOTE(review): nothing in this cell reads this class attribute; the feed URL
# actually used comes from `rss_paths` below. Kept as in the original - confirm
# whether it can be dropped.
RSSFeedLoader.rss_url = "https://www.stocktitan.net/rss"

web_paths = ("https://bbc.com",)
rss_paths = ("https://www.stocktitan.net/rss",)
# CSS classes kept when scraping in "web" mode. An earlier
# ("entry-content", "entry-title") assignment was immediately overwritten, so
# "title" is the only value that was ever in effect; it is now a real
# one-element tuple instead of the bare string that ("title") evaluates to.
html_classes = ("title",)

# based on huggingface template
# Create retriever

if DATA_SOURCE == "web":
  loader = WebBaseLoader(
      web_paths=web_paths,
      bs_kwargs=dict(
          parse_only=bs4.SoupStrainer(
              class_=html_classes
          )
      ),
  )
  # The loader only describes the request; load() is what produces `docs`.
  docs = loader.load()
elif DATA_SOURCE == "rss":
  # `or []` keeps the splitter below working if nothing comes back from the feed.
  docs = process_rss(rss_paths[0]) or []
else:
  raise ValueError(f"Unsupported DATA_SOURCE {DATA_SOURCE!r}; expected 'web' or 'rss'")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)
retriever = vectorstore.as_retriever()

def _build_chat_prompt(system_text):
    """Wrap a system instruction with the chat-history slot and the user's input."""
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_text),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )


# Step 1 - rewrite the incoming question so it stands alone, then retrieve with it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and "
    "otherwise return it as is."
)
contextualize_q_prompt = _build_chat_prompt(contextualize_q_system_prompt)
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)


# Step 2 - answer from the retrieved documents, which fill the {context} slot.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Keep the answer concise."
    "\n\n{context}"
)
qa_prompt = _build_chat_prompt(system_prompt)
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Step 3 - retrieval followed by answering, as a single runnable.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

# The chat history is managed statefully
class State(TypedDict):
    """Schema of the graph state handed to, and updated by, the ``model`` node."""
    # The user's question for the current turn.
    input: str
    # Conversation messages. The `add_messages` annotation tells LangGraph how
    # to combine a node's returned messages with the stored ones (presumably by
    # appending - confirm against the LangGraph docs).
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    # Filled from the chain's "context" output by `call_model`.
    # NOTE(review): annotated as str, but the value's actual type is whatever
    # the retrieval chain returns - confirm.
    context: str
    # The chain's answer text for the current turn.
    answer: str


def call_model(state: State):
    """Run one RAG turn on ``state`` and return the resulting state update.

    The update carries this turn's question/answer pair for ``chat_history``
    plus the chain's ``context`` and ``answer`` outputs.
    """
    rag_output = rag_chain.invoke(state)
    reply = rag_output["answer"]
    turn_messages = [HumanMessage(state["input"]), AIMessage(reply)]
    return {
        "chat_history": turn_messages,
        "context": rag_output["context"],
        "answer": reply,
    }

# One-node graph: execution enters at START and goes straight to "model".
workflow = StateGraph(state_schema=State)
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Compile with an in-memory checkpointer so state is kept per thread_id.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

# Call the app for news summarisation

In [21]:
# The thread id selects which checkpointed conversation this call belongs to.
config = {"configurable": {"thread_id": "efg234"}}

summary_request = {"input": "summarise the text"}
result = app.invoke(summary_request, config=config)
print(result["answer"])

Markem-Imaje has launched a new range of print and apply consumables called Ultraply, designed to meet the demand for scannable 2D barcodes. StrikePoint has announced the results of its Annual General Meeting and provided a corporate update. 17 Education & Technology Group Inc. has released its unaudited financial results for the third quarter of 2024. OceanaGold has published a pre-feasibility study for the Waihi District, highlighting attractive economics and an initial reserve of 1.2 million ounces at Wharekirauponga.


# Sentiment analysis of each headline

In [35]:
# One row per headline; the single column is labelled 0.
df = pd.DataFrame([x.page_content for x in docs])

def get_sentiment(row):
  """Ask the RAG app for the sentiment of a single headline.

  Args:
    row: Headline text (one value from column 0 of the headline frame).

  Returns:
    The app's answer string for the sentiment prompt.
  """
  # Each headline runs on its own checkpointer thread. Reusing the shared
  # `config` thread would replay every earlier question and answer as chat
  # history, so later classifications would be influenced by earlier
  # headlines and each prompt would keep growing.
  headline_config = {"configurable": {"thread_id": f"sentiment-{uuid.uuid4()}"}}
  result = app.invoke(
      {"input": f"Give the sentiment of this headline: {row}"},
      config=headline_config,
  )
  return result["answer"]

# Only the first 10 headlines are scored; every headline triggers its own app call.
df_sentiment = df.head(10).copy()
df_sentiment['sentiment'] = df_sentiment[0].apply(get_sentiment)
display(df_sentiment)

Unnamed: 0,0,sentiment
0,Aduro Clean Technologies Announces Second Part...,Neutral
1,Pyxus Releases Fiscal Year 2024 Sustainability...,Neutral
2,Atrium Mortgage Investment Corporation Announc...,Positive
3,West Fraser Declares Dividend | WFG Stock News,Positive
4,Eldorado Gold Releases Updated Mineral Reserve...,Positive
5,Signal Gold Exercises Option and Upsizes Concu...,Positive
6,OceanaGold Releases Waihi District Pre-Feasibi...,Positive
7,Patriot Drills 31.2 m at 3.35% Li2O at CV13 in...,Positive
8,Veritiv Completes Acquisition of Orora Packagi...,Positive
9,Northern Trust Appointed by First Sentier Inve...,Positive
