In [1]:
import os
import streamlit as st
from datetime import datetime, timedelta
from typing import List, Dict
import requests
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import dotenv
import re

# Load environment variables
dotenv.load_dotenv()

# Set environment variables
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["NEWSAPI_KEY"] = os.getenv("NEWS_API_KEY")
os.environ['GOOGLE_API_KEY'] = os.getenv("GOOGLE_API_KEY")
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class NewsCollector:
    def __init__(self):
        self.base_url = "https://newsapi.org/v2/top-headlines"
        self.api_key = os.getenv("NEWSAPI_KEY")
        
    def get_news(self, country: str, category: str) -> List[Dict]:
        params = {
            "country": country,
            "category": category,
            "apiKey": self.api_key,
            "pageSize": 10
        }
        response = requests.get(self.base_url, params=params)
        #print(response.json())
        return response.json()["articles"]
    
    def preprocess_news(self, articles: List[Dict], region: str, category: str) -> List[Document]:
        documents = []
        for article in articles:
            if article['title'] != "[Removed]" and article['description'] != "[Removed]" and article['content'] != "[Removed]":
                content = f"Title: {article['title']}\nDescription: {article['description']}\nContent: {article['content']}"
                summary_content = f"Title: {article['title']}\nDescription: {article['description']}"
                metadata = {
                    "published_date": article["publishedAt"],
                    "category": category,
                    "region": region,
                    "source": article["source"]["name"],
                    "summary_content": summary_content  # Store concise version in metadata

                }
                documents.append(Document(page_content=content, metadata=metadata))
        return documents


In [3]:
region ="us"
category = "business"
news_collector = NewsCollector()
articles = news_collector.get_news(region, category)
documents = news_collector.preprocess_news(articles,region,category)

In [14]:
class VectorStore:
    def __init__(self):
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model='models/text-embedding-004'
        )
        self.vectorstore = None
    
    def create_vectorstore(self, documents: List[Document]):
        self.vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings
        )
        return self.vectorstore

class NewsSummarizer:
    def __init__(self):
        self.llm = ChatGroq(model="llama3-8b-8192")
        
    def summarize_news(self, documents: List[Document], current_date: str) -> str:
        template = """Current date: {current_date}
        Based on the following news articles and their published dates, provide a comprehensive summary of the latest news:
        
        {context}
        
        Prioritize more recent news while maintaining coherence in the summary.
        """
        
        prompt = ChatPromptTemplate.from_template(template)
        
        chain = (
            prompt 
            | self.llm 
            | StrOutputParser()
        )
        
        context = "\n\n".join([
            f"Article ({doc.metadata['published_date']}):\n{doc.metadata['summary_content']}"
            for doc in documents
        ])
        
        return chain.invoke({
            "context": context,
            "current_date": current_date
        })

In [15]:
# Create vector store

vector_store = VectorStore()
vectorstore = vector_store.create_vectorstore(documents)

news_summarizer = NewsSummarizer()
# Generate summary
current_date = datetime.now().strftime("%Y-%m-%d")
summary = news_summarizer.summarize_news(documents, current_date)
print(summary)

As of October 29, 2024, here is a summary of the latest news:

In the world of finance, Bitcoin has reached a new high of $71,000 for the first time since June, driven by investments in dedicated exchange-traded funds and speculation. Meanwhile, JPMorgan Chase has started suing customers who allegedly stole funds during a viral ATM glitch scam over the summer.

In the tech industry, Robinhood has entered the election betting market, offering contracts on the outcome of the upcoming election, with prices ranging from two cents to ninety-nine cents. Additionally, Boeing is considering selling its space business, including the Starliner, in early-stage discussions.

In the field of artificial intelligence, the Open Source Initiative has defined new guidelines for open-source AI, requiring developers to reveal the training data used to develop their models. This move has implications for Meta's Llama model, which does not meet the new definition.

In the world of biotechnology, Monte Rosa 

In [None]:
class CustomNewsRetriever:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.llm = ChatGroq(model="llama3-8b-8192")
    
    def generate_questions(self, user_query: str) -> List[str]:
        template = """You are an AI language model assistant. Your task is to generate 3 different sub questions OR alternate versions of the given user question to retrieve relevant documents from a vector database.

        By generating multiple versions of the user question,
        your goal is to help the user overcome some of the limitations
        of distance-based similarity search.

        By generating sub questions, you can break down questions that refer to multiple concepts into distinct questions. This will help you get the relevant documents for constructing a final answer

        If multiple concepts are present in the question, you should break into sub questions, with one question for each concept

        Provide these alternative questions separated by newlines between XML tags. For example:

        <questions>
        - Question 1
        - Question 2
        - Question 3
        </questions>

        Original question: {question}"""
        
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm | StrOutputParser()
        
        result = chain.invoke({"question": user_query})
        result = re.search(r'<questions>(.*?)</questions>', result, re.DOTALL).group(1)
        return result
    
    def deduplicate_docs(self, docs: List[Document]) -> List[Document]:
        """
        Deduplicate documents based on their content and metadata
        """
        unique_docs = []
        seen_contents = set()
        
        for doc in docs:
            # Create a unique identifier using content and published date
            content_identifier = (
                doc.page_content,
                doc.metadata.get('published_date', '')
            )
            
            if content_identifier not in seen_contents:
                seen_contents.add(content_identifier)
                unique_docs.append(doc)
        
        return unique_docs
    
    def get_relevant_docs(self, questions: List[str]) -> List[Document]:
        all_docs = []
        for question in questions:
            docs = self.vectorstore.similarity_search(question,k=3)
            all_docs.extend(docs)
        
        # Deduplicate documents
        return self.deduplicate_docs(all_docs)
    
    def answer_query(self, user_query: str, docs: List[Document], current_date: str) -> str:
        template = """Current date: {current_date}
        Based on the following news articles and their published dates, answer this question: {question}
        
        Context:
        {context}
        
        Provide a clear and focused answer based on the most recent and relevant information.
        """
        
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm | StrOutputParser()
        
        # Use full content for detailed answers to specific queries
        context = "\n\n".join([
            f"Article ({doc.metadata['published_date']}):\n{doc.page_content}"
            for doc in docs
        ])
        
        return chain.invoke({
            "context": context,
            "question": user_query,
            "current_date": current_date
        })

In [17]:
current_date = datetime.now().strftime("%Y-%m-%d")

user_query = "What is the latest news on business in the US?"

custom_retriever = CustomNewsRetriever(vector_store.vectorstore)

questions = custom_retriever.generate_questions(user_query)
print(questions)

relevant_docs = custom_retriever.get_relevant_docs(questions)
print(relevant_docs)

answer = custom_retriever.answer_query(user_query, relevant_docs, current_date)
print(answer)


What is the latest business news in the United States?
What news articles are available about US business?
What is the current state of the business industry in the US?
What are the latest trends and developments in US business?
What are the most recent updates on US business news and analysis?

The latest news on business in the US is the US Treasury Department's finalization of restrictions on investments in artificial intelligence and semiconductor sectors in China, citing national security concerns. Additionally, there is news about various companies' stock performances, including Monte Rosa Therapeutics' nearly doubling in value on a deal with Novartis, and Trump Media & Technology Group's stock surge to new highs.
