# NewsRag Using Langchain framework

In [1]:
import os
import streamlit as st
from datetime import datetime, timedelta
from typing import List, Dict
import requests
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import dotenv
import re

# Load environment variables
dotenv.load_dotenv()

# Set environment variables
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["NEWSAPI_KEY"] = os.getenv("NEWS_API_KEY")
os.environ['GOOGLE_API_KEY'] = os.getenv("GOOGLE_API_KEY")
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class NewsCollector:

    def __init__(self):
        self.base_url = "https://newsapi.org/v2/top-headlines"
        self.api_key = os.getenv("NEWSAPI_KEY")
        
    def get_news(self, country: str, category: str) -> List[Dict]:
        params = {
            "country": country,
            "category": category,
            "apiKey": self.api_key,
            "pageSize": 10
        }
        response = requests.get(self.base_url, params=params)
        #print(response.json())
        return response.json()["articles"]
    
    def preprocess_news(self, articles: List[Dict], region: str, category: str) -> List[Document]:
        documents = []
        for article in articles:
            if article['title'] != "[Removed]" and article['description'] != "[Removed]" and article['content'] != "[Removed]":
                content = f"Title: {article['title']}\nDescription: {article['description']}\nContent: {article['content']}"
                summary_content = f"Title: {article['title']}\nDescription: {article['description']}"
                metadata = {
                    "published_date": article["publishedAt"],
                    "category": category,
                    "region": region,
                    "source": article["source"]["name"],
                    "summary_content": summary_content  # Store concise version in metadata

                }
                documents.append(Document(page_content=content, metadata=metadata))
        return documents


In [4]:
region ="us"
category = "business"
news_collector = NewsCollector()
articles = news_collector.get_news(region, category)
documents = news_collector.preprocess_news(articles,region,category)

In [7]:
print(articles[0])

{'source': {'id': None, 'name': 'Ambcrypto.com'}, 'author': 'Ishika Kumari', 'title': 'Tether CEO slams federal probe claims: ‘No knowledge of any such investigations’ - AMBCrypto News', 'description': "Tether's USDT recorded $6.47 billion in transaction volume, significantly outpacing USDC's $2.08 billion.", 'url': 'https://ambcrypto.com/tether-ceo-slams-federal-probe-claims-no-knowledge-of-any-such-investigations/', 'urlToImage': 'https://ambcrypto.com/wp-content/uploads/2024/10/Tether-faces-scrutiny-1000x600.webp', 'publishedAt': '2024-10-29T05:04:51Z', 'content': '<ul><li>Tether faces an investigation over alleged ties to illegal activities and sanctioned entities.</li><li>Despite the scrutiny, Tether explores opportunities for growth in the commodity sector.<… [+3334 chars]'}


In [8]:
print(documents[0])
print(documents[0].metadata)

page_content='Title: Tether CEO slams federal probe claims: ‘No knowledge of any such investigations’ - AMBCrypto News
Description: Tether's USDT recorded $6.47 billion in transaction volume, significantly outpacing USDC's $2.08 billion.
Content: <ul><li>Tether faces an investigation over alleged ties to illegal activities and sanctioned entities.</li><li>Despite the scrutiny, Tether explores opportunities for growth in the commodity sector.<… [+3334 chars]' metadata={'published_date': '2024-10-29T05:04:51Z', 'category': 'business', 'region': 'us', 'source': 'Ambcrypto.com', 'summary_content': "Title: Tether CEO slams federal probe claims: ‘No knowledge of any such investigations’ - AMBCrypto News\nDescription: Tether's USDT recorded $6.47 billion in transaction volume, significantly outpacing USDC's $2.08 billion."}
{'published_date': '2024-10-29T05:04:51Z', 'category': 'business', 'region': 'us', 'source': 'Ambcrypto.com', 'summary_content': "Title: Tether CEO slams federal probe cla

In [9]:
class VectorStore:
    def __init__(self):
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model='models/text-embedding-004'
        )
        self.vectorstore = None
    
    def create_vectorstore(self, documents: List[Document]):
        self.vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings
        )
        return self.vectorstore

class NewsSummarizer:
    def __init__(self):
        self.llm = ChatGroq(model="llama3-8b-8192")
        
    def summarize_news(self, documents: List[Document], current_date: str) -> str:
        template = """Current date: {current_date}
        Based on the following news articles and their published dates, provide a comprehensive summary of the latest news:
        
        {context}
        
        Prioritize more recent news while maintaining coherence in the summary.
        """
        
        prompt = ChatPromptTemplate.from_template(template)
        
        chain = (
            prompt 
            | self.llm 
            | StrOutputParser()
        )
        
        context = "\n\n".join([
            f"Article ({doc.metadata['published_date']}):\n{doc.metadata['summary_content']}"
            for doc in documents
        ])
        
        return chain.invoke({
            "context": context,
            "current_date": current_date
        })

In [10]:
# Create vector store

vector_store = VectorStore()
vectorstore = vector_store.create_vectorstore(documents)

news_summarizer = NewsSummarizer()
# Generate summary
current_date = datetime.now().strftime("%Y-%m-%d")
summary = news_summarizer.summarize_news(documents, current_date)
print(summary)

Here is a comprehensive summary of the latest news, prioritizing more recent news:

As of October 30, 2024, the latest news includes:

* Tether's USDT recorded $6.47 billion in transaction volume, significantly outpacing USDC's $2.08 billion, according to AMBCrypto News on October 29.
* HSBC's pre-tax profits rose to $8.5 billion in the three months to the end of September, beating analysts' expectations, as reported by BBC.com on October 29.
* XRP's price action and on-chain activity suggest building momentum, pointing toward a potential breakout, according to AMBCrypto News on October 29.
* McDonald's is set to report its earnings, with its shares having fallen 6% since its Quarter Pounder burgers were linked to a deadly E. coli outbreak, as per CNBC on October 29.
* Asia stocks were muted on October 29, with tech earnings in focus, while Japan extended its gains, according to Investing.com.
* Stock futures were little changed on October 29 as Wall Street braced for Big Tech earnings

In [11]:
class CustomNewsRetriever:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.llm = ChatGroq(model="llama3-8b-8192")
    
    def generate_questions(self, user_query: str) -> List[str]:
        template = """You are an AI language model assistant. Your task is to generate 3 different sub questions OR alternate versions of the given user question to retrieve relevant documents from a vector database.

        By generating multiple versions of the user question,
        your goal is to help the user overcome some of the limitations
        of distance-based similarity search.

        By generating sub questions, you can break down questions that refer to multiple concepts into distinct questions. This will help you get the relevant documents for constructing a final answer

        If multiple concepts are present in the question, you should break into sub questions, with one question for each concept

        Provide these alternative questions separated by newlines between XML tags. For example:

        <questions>
        - Question 1
        - Question 2
        - Question 3
        </questions>

        Original question: {question}"""
        
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm | StrOutputParser()
        
        result = chain.invoke({"question": user_query})
        result = re.search(r'<questions>(.*?)</questions>', result, re.DOTALL).group(1)
        return result
    
    def deduplicate_docs(self, docs: List[Document]) -> List[Document]:
        """
        Deduplicate documents based on their content and metadata
        """
        unique_docs = []
        seen_contents = set()
        
        for doc in docs:
            # Create a unique identifier using content and published date
            content_identifier = (
                doc.page_content,
                doc.metadata.get('published_date', '')
            )
            
            if content_identifier not in seen_contents:
                seen_contents.add(content_identifier)
                unique_docs.append(doc)
        
        return unique_docs
    
    def get_relevant_docs(self, questions: List[str]) -> List[Document]:
        all_docs = []
        for question in questions:
            docs = self.vectorstore.similarity_search(question,k=3)
            all_docs.extend(docs)
        
        # Deduplicate documents
        return self.deduplicate_docs(all_docs)
    
    def answer_query(self, user_query: str, docs: List[Document], current_date: str) -> str:
        template = """Current date: {current_date}
        Based on the following news articles and their published dates, answer this question: {question}
        
        Context:
        {context}
        
        Provide a clear and focused answer based on the most recent and relevant information.
        """
        
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm | StrOutputParser()
        
        # Use full content for detailed answers to specific queries
        context = "\n\n".join([
            f"Article ({doc.metadata['published_date']}):\n{doc.page_content}"
            for doc in docs
        ])
        
        return chain.invoke({
            "context": context,
            "question": user_query,
            "current_date": current_date
        })

In [14]:
current_date = datetime.now().strftime("%Y-%m-%d")

user_query = "What is the latest news on business in the US?"

custom_retriever = CustomNewsRetriever(vector_store.vectorstore)

questions = custom_retriever.generate_questions(user_query)
print("Similar questions:")
print(questions)

relevant_docs = custom_retriever.get_relevant_docs(questions)
print("Relevant documents:")
print(relevant_docs)

answer = custom_retriever.answer_query(user_query, relevant_docs, current_date)
print("Answer:")
print(answer)

Similar questions:

- What is the latest news on US economy?
- What is the latest news on US business trends?
- What is the latest news on US corporate sector?



GoogleGenerativeAIError: Error embedding content: 429 Quota exceeded for quota metric 'Batch Embed Content API requests' and limit 'Batch embed contents request limit per minute for a region' of service 'generativelanguage.googleapis.com' for consumer 'project_number:744316257755'. [reason: "RATE_LIMIT_EXCEEDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "quota_metric"
  value: "generativelanguage.googleapis.com/batch_embed_contents_requests"
}
metadata {
  key: "quota_location"
  value: "us-central2"
}
metadata {
  key: "quota_limit"
  value: "BatchEmbedContentsRequestsPerMinutePerProjectPerRegion"
}
metadata {
  key: "quota_limit_value"
  value: "150"
}
metadata {
  key: "consumer"
  value: "projects/744316257755"
}
, links {
  description: "Request a higher quota limit."
  url: "https://cloud.google.com/docs/quotas/help/request_increase"
}
]