In [None]:
from typing import List
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain_perplexity import ChatPerplexity
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader, DirectoryLoader

# Read key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the API key used by ChatPerplexity
# further down comes from — confirm.
load_dotenv()

True

### Set Up Data Loader

In [None]:
# Collect the *.txt files found in the data/ folder and read each one with
# TextLoader.
loader = DirectoryLoader("data", glob="*.txt", loader_cls=TextLoader)
documents = loader.load()

In [None]:
# Sanity check: list the file path recorded on each loaded document.
for doc in documents:
    print(doc.metadata['source'])

data\news1.txt
data\news2.txt
data\news3.txt
data\news4.txt
data\news5.txt


### Preprocess Text Data into Chunks

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking settings kept together so they are easy to tune in one place.
splitter_settings = {"chunk_size": 1200, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the loaded articles into overlapping chunks for embedding.
chunks = text_splitter.split_documents(documents)

### Set Up Embeddings

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed both the chunks and the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


### Set Up ChromaDB

In [None]:
from langchain_chroma import Chroma

# Build (or refresh) the persisted vector index for the article chunks.
#
# When no ids are supplied, Chroma gives every text a fresh random id, so each
# re-run of this cell against the same persist_directory would append another
# full copy of the chunks and the retriever would start returning duplicates.
# Stable "<source>::chunk-<n>" ids turn the insert into an upsert, which keeps
# this cell idempotent under Restart & Run All.
#
# NOTE: copies written by earlier runs (random ids), or left behind when a
# file now yields fewer chunks, are not removed — delete ./news_chroma once to
# start from a clean index.
chunk_ids = []
chunks_seen_per_source = {}
for chunk in chunks:
    source = chunk.metadata.get("source", "unknown")
    position = chunks_seen_per_source.get(source, 0)
    chunks_seen_per_source[source] = position + 1
    chunk_ids.append(f"{source}::chunk-{position}")

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./news_chroma"
)


In [None]:
# Fetch the three most similar chunks so an answer can draw on more than one
# article.
retriever = vectorstore.as_retriever(
    search_type="similarity", search_kwargs=dict(k=3)
)


VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x0000012C5443C250>, search_kwargs={'k': 3})

### Set Up Prompt Template

In [None]:
from langchain_core.prompts import PromptTemplate

# Grounded-QA instructions: the model must answer only from the supplied
# articles and finish with the source files it relied on.
NEWS_QA_TEMPLATE = """
You are answering questions using ONLY the news articles below.

Rules:
- Base your answer strictly on the provided articles.
- If the answer is not found, say "Not mentioned in the articles".
- After the answer, list the source file name(s).

Articles:
{context}

Question:
{question}

Answer format:
Answer: <your answer>
Sources: <comma-separated file names>
"""

prompt = PromptTemplate(
    template=NEWS_QA_TEMPLATE,
    input_variables=["context", "question"],
)


In [None]:
from langchain_core.documents import Document

def format_docs(docs: "list[Document]") -> str:
    """Render retrieved documents as a single context string for the prompt.

    Each document becomes a ``[Source: <source>]`` header line followed by its
    text, so the model can attribute its answer to a file.

    Args:
        docs: Retrieved documents; each must expose ``metadata`` (a mapping)
            and ``page_content``.

    Returns:
        The formatted blocks joined by blank lines, or ``""`` when ``docs`` is
        empty.
    """
    return "\n\n".join(
        # A document without a "source" entry is labelled "unknown" rather than
        # raising KeyError and aborting the whole chain.
        f"\n[Source: {doc.metadata.get('source', 'unknown')}]\n{doc.page_content}"
        for doc in docs
    )


In [None]:
from langchain_core.runnables import RunnablePassthrough

# Perplexity "sonar" chat model at temperature 0, with no timeout configured.
chat = ChatPerplexity(model="sonar", temperature=0, timeout=None)

# The incoming question feeds two prompt slots: it drives retrieval (hits are
# flattened into text by format_docs) and is also passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

rag_chain = chain_inputs | prompt | chat

In [None]:
# Example query run through the full retrieve -> prompt -> model pipeline.
question = "What memorandum circular was signed in 2025 regarding the observance of a simple Yuletide season, and when exactly was it signed?"
response = rag_chain.invoke(question)

In [None]:
# Display the text content of the chain's response.
response.content

'**Answer:** Memorandum Circular (MC) 110 was signed on December 15 regarding the observance of simple and meaningful Yuletide season celebrations by government agencies.[Source: data\\news4.txt]  \n**Sources:** data\\news4.txt'