In [2]:
from datasets import load_dataset
from haystack import Document

import os

# Location of the news CSV. The original hard-coded path is kept as the
# default so existing runs behave the same; set the NEWS_CSV_PATH environment
# variable to point the notebook at the file on another machine.
news_csv_path = os.environ.get(
    "NEWS_CSV_PATH",
    "/Users/raksja/Downloads/news_data_dedup.csv",
)

# Load the csv as a dataset
dataset = load_dataset("csv", data_files=news_csv_path, split="train")

# Create one Document per csv row: the "description" column becomes the
# document content, while "published_at" and "url" are carried as metadata.
documents = [
    Document(
        content=row["description"],
        meta={
            "published_date": row["published_at"],
            "url": row["url"],
        },
    )
    for row in dataset
]

In [3]:
from haystack_integrations.components.embedders.ollama.document_embedder import OllamaDocumentEmbedder

# Embed every document with the "nomic-embed-text" model served by Ollama
document_embedder = OllamaDocumentEmbedder(
    model="nomic-embed-text",
    url="http://localhost:11434",
)
embedding_result = document_embedder.run(documents)
# The embedder's result is keyed by "documents"; keep that list for indexing.
documents_with_embeddings = embedding_result["documents"]


Calculating embeddings: 100%|█| 28/28 [00:13<00:00,  

In [4]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# In-memory document store configured with cosine similarity
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# Store the embedded documents, overwriting any document whose id already
# exists. The call is left as the cell's last expression so its return value
# is displayed.
document_store.write_documents(
    documents_with_embeddings,
    policy=DuplicatePolicy.OVERWRITE,
)

870

In [5]:
from haystack import Pipeline
from haystack_integrations.components.embedders.ollama.text_embedder import OllamaTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# Query-side components: the text embedder produces the query embedding and
# the in-memory retriever is configured with top_k=5.
embedder = OllamaTextEmbedder(
    model="nomic-embed-text",
    url="http://localhost:11434",
)
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)

# Retrieval-only pipeline: text_embedder.embedding -> retriever.query_embedding
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.connect(
    sender="text_embedder.embedding",
    receiver="retriever.query_embedding",
)

# Run the pipeline to get the top 5 relevant stored documents
query = "Election debate between Biden and Trump"
pipeline_inputs = {"text_embedder": {"text": query}}
retrieved_docs = query_pipeline.run(pipeline_inputs)

In [6]:
# Show the first document in the retriever's output
top_document = retrieved_docs["retriever"]["documents"][0]
print(top_document)

Document(id=533774e3ce14fcbc58155f677ece91de2d3cee520ed3f83ee650ebad9103025b, content: 'Trump claims the President isn’t up to the job. If Biden shuns debates, voters might conclude Trump ...', meta: {'published_date': '2024-04-26 21:39:00+00', 'url': 'https://www.wsj.com/articles/joe-biden-donald-trump-debate-2024-election-aeee86fb'}, score: 0.7608799738520099)


In [7]:
# Jinja-style prompt template: loops over `documents` (content, url and
# published date of each) and interpolates `question`.
# Written as adjacent string literals so the trailing spaces that are part of
# the prompt text stay visible; the resulting string is unchanged.
template = (
    "\n"
    "Using the recent news context information provided below, \n"
    "summarize the context and answer the question.\n"
    "\n"
    "Context: \n"
    "{% for document in documents %}\n"
    "- Content: {{ document.content }}\n"
    "  URL: {{ document.meta.url }}\n"
    "  Published Date: {{ document.meta.published_date }}\n"
    "{% endfor %}\n"
    "\n"
    "Question: {{ question }}\n"
    "\n"
    "Answer: Provide a summary based on the context above. \n"
    "Include references to the URLs and their exact published dates at the end of the response.\n"
)

In [14]:
from haystack import Pipeline
from haystack_integrations.components.embedders.ollama.text_embedder import OllamaTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator

# Initialize embedder and in-memory retriever with top_k as 3.
# Fresh instances are created here rather than reusing the ones wired into the
# earlier retrieval-only pipeline: Haystack does not allow one component
# instance to be added to more than one Pipeline.
embedder = OllamaTextEmbedder(
    model="nomic-embed-text",
    url="http://localhost:11434",
)
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=3)

# Prompt builder renders `template` from the retrieved documents and the
# question. Both variables are declared required so that a missing input
# raises an error instead of silently rendering an empty prompt section.
prompt_builder = PromptBuilder(
    template=template,
    required_variables=["documents", "question"],
)

# Generator backed by the local Ollama server (same base URL as the embedders)
generator = OllamaGenerator(
    model="llama3.2",
    url="http://localhost:11434",
    generation_kwargs={
        "num_predict": 500,   # Ollama option: maximum number of tokens to generate
        "temperature": 0.9,
    },
)

# RAG pipeline: text_embedder -> retriever -> prompt_builder -> llm
rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component("llm", generator)
rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")

# The same question is used twice: embedded for retrieval, and inserted into
# the prompt as the question to answer.
question = "Any news about election debate between Biden and Trump"
response = rag_pipeline.run({
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
})

In [15]:
# Show the first reply produced by the "llm" component
llm_replies = response["llm"]["replies"]
print(llm_replies[0])

Based on the provided context, it appears that there is ongoing speculation and discussion about whether Joe Biden will participate in an upcoming election debate with Donald Trump.

According to reports from The Wall Street Journal (https://www.wsj.com/articles/joe-biden-donald-trump-debate-2024-election-aeee86fb published April 26, 2024) and The New York Times (https://www.nytimes.com/2024/04/26/us/politics/biden-debate-trump.html published April 26, 2024), Biden has declined to say whether he will participate in a debate with Trump, citing the president's busy campaign schedule.

However, if Biden were to shun the debate, it could potentially benefit Trump, according to one article from The Times of India (https://timesofindia.indiatimes.com/world/us/joe-biden-goes-on-campaign-spree-as-donald-trump-faces-court-disruptions/articleshow/109640744.cms published April 27, 2024). This suggests that the debate is seen as a significant opportunity for Trump to gain an advantage over Biden i